1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2024-10-27 20:34:16 +00:00

Parser: Handle UTF-8 characters in attribute keys / values [#457]

This commit is contained in:
gnosygnu 2019-05-11 10:55:37 -04:00
parent 3a748eea32
commit 31c6576b50
2 changed files with 29 additions and 7 deletions

View File

@ -70,7 +70,13 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT
}
break;
}
byte b = src[pos];
int b_len = gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(b);
if (b == Byte_ascii.Null) throw Err_.new_wo_type("null byte is invalid in byte array; src=", "src", String_.new_u8(src, src_bgn, src_end));
if (b_len > 1) {
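b = Byte_ascii.Null; // NOTE: hacky; reuses Byte_ascii.Null as a marker for "multi-byte UTF-8 char" so the switches below can match it with a single case; safe b/c a real null byte has a b_len of 1 and was already rejected by the check above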
}
switch (area) {
case Area__invalid:
switch (b) {
@ -104,6 +110,7 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT
case Byte_ascii.Ltr_p: case Byte_ascii.Ltr_q: case Byte_ascii.Ltr_r: case Byte_ascii.Ltr_s: case Byte_ascii.Ltr_t:
case Byte_ascii.Ltr_u: case Byte_ascii.Ltr_v: case Byte_ascii.Ltr_w: case Byte_ascii.Ltr_x: case Byte_ascii.Ltr_y: case Byte_ascii.Ltr_z:
case Byte_ascii.Colon: case Byte_ascii.Underline:
case Byte_ascii.Null:
area = Area__key;
if (atr_bgn == -1) atr_bgn = pos; // NOTE: atr_bgn == -1 needed b/c of spaces
key_bgn = pos;
@ -141,6 +148,9 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT
case Byte_ascii.Colon: case Byte_ascii.Underline: case Byte_ascii.Dash: case Byte_ascii.Dot:
if (key_bfr_on) key_bfr.Add_byte(b);
break;
case Byte_ascii.Null:
if (key_bfr_on) key_bfr.Add_mid(src, pos, pos + b_len);
break;
// ws -> end key
case Byte_ascii.Tab: case Byte_ascii.Nl: case Byte_ascii.Cr: case Byte_ascii.Space:
area = Area__eql_limbo;
@ -191,6 +201,7 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT
case Byte_ascii.Ltr_p: case Byte_ascii.Ltr_q: case Byte_ascii.Ltr_r: case Byte_ascii.Ltr_s: case Byte_ascii.Ltr_t:
case Byte_ascii.Ltr_u: case Byte_ascii.Ltr_v: case Byte_ascii.Ltr_w: case Byte_ascii.Ltr_x: case Byte_ascii.Ltr_y: case Byte_ascii.Ltr_z:
case Byte_ascii.Colon: case Byte_ascii.Underline:
case Byte_ascii.Null:
Make(src, pos);
area = Area__key;
atr_bgn = key_bgn = pos;
@ -232,6 +243,7 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT
case Byte_ascii.Question: case Byte_ascii.At:
case Byte_ascii.Brack_bgn: case Byte_ascii.Brack_end: case Byte_ascii.Pow: case Byte_ascii.Underline: case Byte_ascii.Tick:
case Byte_ascii.Curly_bgn: case Byte_ascii.Curly_end: case Byte_ascii.Pipe: case Byte_ascii.Tilde:
case Byte_ascii.Null:
area = Area__val_naked;
val_bgn = pos;
break;
@ -262,7 +274,8 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT
val_end = pos;
}
else { // quote is just char; EX: title="1 o'clock" or title='The "C" way'
prv_is_ws = false; if (val_bfr_on) val_bfr.Add_byte(b); // INLINE: add char
prv_is_ws = false;
if (val_bfr_on) val_bfr.Add_byte(b);
}
}
break;
@ -276,7 +289,7 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT
if (!val_bfr_on) {val_bfr.Add_mid(src, val_bgn, pos); val_bfr_on = true;} // INLINE: val_bfr.init
if (prv_is_ws) {} // noop; only allow one ws at a time; EX: "a b" -> "a b"; "a\n\nb" -> "a b"
else {
prv_is_ws = true; val_bfr.Add_byte(Byte_ascii.Space);
prv_is_ws = true; val_bfr.Add_byte(Byte_ascii.Space);
}
}
break;
@ -285,7 +298,7 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT
int gt_pos = Xnde_find_gt(src, pos, src_end);
if (gt_pos == Bry_find_.Not_found) {
// area = Area__invalid; // "<" inside quote is invalid; EX: <span title='a<b'>c</span>
if (val_bfr_on) val_bfr.Add_byte(b); // INLINE: add char
if (val_bfr_on) val_bfr.Add_byte(b);
}
else {
if (qte_closed) {}
@ -301,7 +314,8 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT
if (qte_closed)
area = Area__invalid;
else {
prv_is_ws = false; if (val_bfr_on) val_bfr.Add_byte(b); // INLINE: add char
prv_is_ws = false;
if (val_bfr_on) val_bfr.Add_mid(src, pos, pos + b_len);
}
break;
}
@ -328,7 +342,10 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT
case Byte_ascii.Question: case Byte_ascii.At:
case Byte_ascii.Brack_bgn: case Byte_ascii.Brack_end: case Byte_ascii.Pow: case Byte_ascii.Underline: case Byte_ascii.Tick:
case Byte_ascii.Curly_bgn: case Byte_ascii.Curly_end: case Byte_ascii.Pipe: case Byte_ascii.Tilde:
if (val_bfr_on) val_bfr.Add_byte(b); // INLINE: add char
if (val_bfr_on) val_bfr.Add_byte(b);
break;
case Byte_ascii.Null:
if (val_bfr_on) val_bfr.Add_mid(src, pos, pos + b_len);
break;
// case Byte_ascii.Angle_end: NOTE: valid in MW; making invalid now until finding counter-example
// angle_bgn -> check for <nowiki>; EX: a=b<nowiki>c</nowiki>d
@ -364,7 +381,7 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT
}
break;
}
++pos;
pos += b_len;
}
// iterate atrs and notify

View File

@ -16,7 +16,7 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
import org.junit.*;
public class Mwh_atr_parser_tst {
private final Mwh_atr_parser_fxt fxt = new Mwh_atr_parser_fxt();
private final Mwh_atr_parser_fxt fxt = new Mwh_atr_parser_fxt();
@Test public void Pair__quote__double() {fxt.Test_parse("a=\"b\"" , fxt.Make_pair("a" , "b"));}
@Test public void Pair__quote__single() {fxt.Test_parse("a='b'" , fxt.Make_pair("a" , "b"));}
@Test public void Pair__quote__none() {fxt.Test_parse("a=b" , fxt.Make_pair("a" , "b"));}
@ -64,6 +64,11 @@ public class Mwh_atr_parser_tst {
@Test public void Val__as_int() {fxt.Test_val_as_int("-123" , -123);}
@Test public void Utf_8() {
fxt.Test_parse("a=𤭢 b=2", fxt.Make_pair("a", "𤭢"), fxt.Make_pair("b", "2"));
fxt.Test_parse("a=1 𤭢=2", fxt.Make_pair("a", "1"), fxt.Make_pair("𤭢", "2"));
}
// @Test public void Embedded() { // PURPOSE: handle html inside attrib; PAGE:en.w:Economy_of_Greece DATE:2015-10-15
// fxt.Test_parse("title='<sup id='cite_ref-a_1-0' class='reference'><a href='#cite_note-a-1'>[1]</a></sup> c'"
// , fxt.Make_fail(0, 11) // "title='<sup" invalid b/c of "<"