diff --git a/400_xowa/src/gplx/xowa/parsers/htmls/Mwh_atr_parser.java b/400_xowa/src/gplx/xowa/parsers/htmls/Mwh_atr_parser.java index 088677a00..fdf14eca7 100644 --- a/400_xowa/src/gplx/xowa/parsers/htmls/Mwh_atr_parser.java +++ b/400_xowa/src/gplx/xowa/parsers/htmls/Mwh_atr_parser.java @@ -70,7 +70,13 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT } break; } + byte b = src[pos]; + int b_len = gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(b); + if (b == Byte_ascii.Null) throw Err_.new_wo_type("null byte is invalid in byte array; src=", "src", String_.new_u8(src, src_bgn, src_end)); + if (b_len > 1) { + b = Byte_ascii.Null; // NOTE: hacky, but if there is a Byte_ascii.Null, then it will have a b_len of 1 + } switch (area) { case Area__invalid: switch (b) { @@ -104,6 +110,7 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT case Byte_ascii.Ltr_p: case Byte_ascii.Ltr_q: case Byte_ascii.Ltr_r: case Byte_ascii.Ltr_s: case Byte_ascii.Ltr_t: case Byte_ascii.Ltr_u: case Byte_ascii.Ltr_v: case Byte_ascii.Ltr_w: case Byte_ascii.Ltr_x: case Byte_ascii.Ltr_y: case Byte_ascii.Ltr_z: case Byte_ascii.Colon: case Byte_ascii.Underline: + case Byte_ascii.Null: area = Area__key; if (atr_bgn == -1) atr_bgn = pos; // NOTE: atr_bgn == -1 needed b/c of spaces key_bgn = pos; @@ -141,6 +148,9 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT case Byte_ascii.Colon: case Byte_ascii.Underline: case Byte_ascii.Dash: case Byte_ascii.Dot: if (key_bfr_on) key_bfr.Add_byte(b); break; + case Byte_ascii.Null: + if (key_bfr_on) key_bfr.Add_mid(src, pos, pos + b_len); + break; // ws -> end key case Byte_ascii.Tab: case Byte_ascii.Nl: case Byte_ascii.Cr: case Byte_ascii.Space: area = Area__eql_limbo; @@ -191,6 +201,7 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT case Byte_ascii.Ltr_p: case Byte_ascii.Ltr_q: case Byte_ascii.Ltr_r: case Byte_ascii.Ltr_s: case Byte_ascii.Ltr_t: case Byte_ascii.Ltr_u: case Byte_ascii.Ltr_v: case Byte_ascii.Ltr_w: case Byte_ascii.Ltr_x: case Byte_ascii.Ltr_y: case Byte_ascii.Ltr_z: case Byte_ascii.Colon: case Byte_ascii.Underline: + case Byte_ascii.Null: Make(src, pos); area = Area__key; atr_bgn = key_bgn = pos; @@ -232,6 +243,7 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT case Byte_ascii.Question: case Byte_ascii.At: case Byte_ascii.Brack_bgn: case Byte_ascii.Brack_end: case Byte_ascii.Pow: case Byte_ascii.Underline: case Byte_ascii.Tick: case Byte_ascii.Curly_bgn: case Byte_ascii.Curly_end: case Byte_ascii.Pipe: case Byte_ascii.Tilde: + case Byte_ascii.Null: area = Area__val_naked; val_bgn = pos; break; @@ -262,7 +274,8 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT val_end = pos; } else { // quote is just char; EX: title="1 o'clock" or title='The "C" way' - prv_is_ws = false; if (val_bfr_on) val_bfr.Add_byte(b); // INLINE: add char + prv_is_ws = false; + if (val_bfr_on) val_bfr.Add_byte(b); } } break; @@ -276,7 +289,7 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT if (!val_bfr_on) {val_bfr.Add_mid(src, val_bgn, pos); val_bfr_on = true;} // INLINE: val_bfr.init if (prv_is_ws) {} // noop; only allow one ws at a time; EX: "a b" -> "a b"; "a\n\nb" -> "a b" else { - prv_is_ws = true; val_bfr.Add_byte(Byte_ascii.Space); + prv_is_ws = true; val_bfr.Add_byte(Byte_ascii.Space); } } break; @@ -285,7 +298,7 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT int gt_pos = Xnde_find_gt(src, pos, src_end); if (gt_pos == Bry_find_.Not_found) { // area = Area__invalid; // "<" inside quote is invalid; EX: c - if (val_bfr_on) val_bfr.Add_byte(b); // INLINE: add char + if (val_bfr_on) val_bfr.Add_byte(b); } else { if (qte_closed) {} @@ -301,7 +314,8 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT if (qte_closed) area = Area__invalid; else { - prv_is_ws = false; if (val_bfr_on) val_bfr.Add_byte(b); // INLINE: add char + prv_is_ws = false; + if (val_bfr_on) val_bfr.Add_mid(src, pos, pos + b_len); } break; } @@ -328,7 +342,10 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT case Byte_ascii.Question: case Byte_ascii.At: case Byte_ascii.Brack_bgn: case Byte_ascii.Brack_end: case Byte_ascii.Pow: case Byte_ascii.Underline: case Byte_ascii.Tick: case Byte_ascii.Curly_bgn: case Byte_ascii.Curly_end: case Byte_ascii.Pipe: case Byte_ascii.Tilde: - if (val_bfr_on) val_bfr.Add_byte(b); // INLINE: add char + if (val_bfr_on) val_bfr.Add_byte(b); + break; + case Byte_ascii.Null: + if (val_bfr_on) val_bfr.Add_mid(src, pos, pos + b_len); break; // case Byte_ascii.Angle_end: NOTE: valid in MW; making invalid now until finding counter-example // angle_bgn -> check for ; EX: a=bcd @@ -364,7 +381,7 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT } break; } - ++pos; + pos += b_len; } // iterate atrs and notify diff --git a/400_xowa/src/gplx/xowa/parsers/htmls/Mwh_atr_parser_tst.java b/400_xowa/src/gplx/xowa/parsers/htmls/Mwh_atr_parser_tst.java index 986202db9..fca4b9727 100644 --- a/400_xowa/src/gplx/xowa/parsers/htmls/Mwh_atr_parser_tst.java +++ b/400_xowa/src/gplx/xowa/parsers/htmls/Mwh_atr_parser_tst.java @@ -16,7 +16,7 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import org.junit.*; public class Mwh_atr_parser_tst { - private final Mwh_atr_parser_fxt fxt = new Mwh_atr_parser_fxt(); + private final Mwh_atr_parser_fxt fxt = new Mwh_atr_parser_fxt(); @Test public void Pair__quote__double() {fxt.Test_parse("a=\"b\"" , fxt.Make_pair("a" , "b"));} @Test public void Pair__quote__single() {fxt.Test_parse("a='b'" , fxt.Make_pair("a" , "b"));} @Test public void Pair__quote__none() {fxt.Test_parse("a=b" , fxt.Make_pair("a" , "b"));} @@ -64,6 +64,11 @@ public class Mwh_atr_parser_tst { @Test public void Val__as_int() {fxt.Test_val_as_int("-123" , -123);} + @Test public void Utf_8() { + fxt.Test_parse("a=𤭢 b=2", fxt.Make_pair("a", "𤭢"), fxt.Make_pair("b", "2")); + fxt.Test_parse("a=1 𤭢=2", fxt.Make_pair("a", "1"), fxt.Make_pair("𤭢", "2")); + } + // @Test public void Embedded() { // PURPOSE: handle html inside attrib; PAGE:en.w:Economy_of_Greece DATE:2015-10-15 // fxt.Test_parse("title='[1] c'" // , fxt.Make_fail(0, 11) // "title='