1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2026-03-02 03:49:30 +00:00

v2.10.3.1

This commit is contained in:
gnosygnu
2015-10-18 22:17:57 -04:00
parent 8e18af05b6
commit 4f43f51b18
1935 changed files with 12500 additions and 12889 deletions

View File

@@ -34,6 +34,7 @@ public class Mwh_atr_itm {
public boolean Valid() {return valid;} private final boolean valid;
public boolean Key_exists() {return key_exists;} private final boolean key_exists;
public boolean Repeated() {return repeated;} private final boolean repeated;
public boolean Invalid() {return repeated || !valid;}
public int Atr_bgn() {return atr_bgn;} private int atr_bgn;
public int Atr_end() {return atr_end;} private int atr_end;
public int Key_bgn() {return key_bgn;} private final int key_bgn;
@@ -45,42 +46,21 @@ public class Mwh_atr_itm {
public byte[] Val_bry() {return val_bry;} private byte[] val_bry;
public int Eql_pos() {return eql_pos;} private final int eql_pos;
public int Qte_tid() {return qte_tid;} private final int qte_tid;
public byte Qte_byte() {
switch (qte_tid) {
case Mwh_atr_itm_.Qte_tid__none: return Byte_ascii.Null;
case Mwh_atr_itm_.Qte_tid__apos: return Byte_ascii.Apos;
case Mwh_atr_itm_.Qte_tid__qute: return Byte_ascii.Quote;
default: throw Err_.new_unhandled(qte_tid);
}
}
public Mwh_atr_itm Atr_rng(int bgn, int end) {this.atr_bgn = bgn; this.atr_end = end; return this;}
public void Key_bry_(byte[] v) {this.key_bry = v;}
public void Val_bry_(byte[] v) {this.val_bry = v;}
public String Val_as_str() {return String_.new_u8(Val_as_bry());}
public byte[] Val_as_bry() {if (val_bry == null) val_bry = Bry_.Mid(src, val_bgn, val_end); return val_bry;} // NOTE: val_bry is cached
public byte[] Val_as_bry__blank_to_null() {byte[] rv = Val_as_bry(); return Bry_.Len_eq_0(rv) ? null : rv;}
public int Val_as_int_or(int or) {return val_bry == null ? Bry_.To_int_or__lax(src, val_bgn, val_end, or) : Bry_.To_int_or(val_bry, or);}
public boolean Val_as_bool_by_int() {return Val_as_int_or(0) == 1;}
public boolean Val_as_bool() {return Bry_.Eq(Bry_.Lcase__all(Val_as_bry()), Bool_.True_bry);}
public static final Mwh_atr_itm[] Ary_empty = new Mwh_atr_itm[0];
public static final int Atr_tid__invalid = 1, Atr_tid__repeat = 2, Atr_tid__pair = 4, Atr_tid__name = 8; // NOTE: id order is important; see above;
public static final int Qte_tid__none = 0, Qte_tid__apos = 1, Qte_tid__qute = 2;
public static final int Mask__qte__none = 0, Mask__qte__apos = 1, Mask__qte_qute = 2;
public static final int
Mask__valid = 8
, Mask__repeated = 16
, Mask__key_exists = 32
, Mask__val_made = 64
;
public static final boolean Mask__valid__n = false, Mask__valid__y = true;
public static final boolean Mask__key_exists__n = false, Mask__key_exists__y = true;
public static final boolean Mask__repeated__n = false, Mask__repeated__y = true;
public static final boolean Mask__val_made__n = false, Mask__val_made__y = true;
public static int Calc_atr_utl(int qte_tid, boolean valid, boolean repeated, boolean key_exists, boolean val_made) {
int rv = qte_tid;
if (valid) rv |= Mwh_atr_itm.Mask__valid;
if (repeated) rv |= Mwh_atr_itm.Mask__repeated;
if (key_exists) rv |= Mwh_atr_itm.Mask__key_exists;
if (val_made) rv |= Mwh_atr_itm.Mask__val_made;
return rv;
}
public static int Calc_qte_tid(int val) {
return val & ((1 << 3) - 1);
}
public static byte Calc_qte_byte(int[] data_ary, int idx) {
int val = data_ary[idx + Mwh_atr_mgr.Idx_atr_utl];
int qte_tid = (val & ((1 << 3) - 1));
return qte_tid == Qte_tid__apos ? Byte_ascii.Apos : Byte_ascii.Quote;
}
// public static final byte Key_tid_generic = 0, Key_tid_id = 1, Key_tid_style = 2, Key_tid_role = 3;
}

View File

@@ -0,0 +1,51 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
// Static constants and bit-packing helpers for the "atr_utl" int, i.e. the value read from
// data_ary[idx + Mwh_atr_mgr.Idx_atr_utl] in Calc_qte_byte below.
// Layout implied by the code in this class: the low 3 bits carry the quote tid (Qte_tid__*);
// the bits 8, 16, 32 and 64 are independent boolean flags (Mask__valid .. Mask__val_made).
public class Mwh_atr_itm_ {
	// shared zero-length array
	public static final Mwh_atr_itm[] Ary_empty = new Mwh_atr_itm[0];

	// attribute type ids; each is a distinct power of two
	// NOTE: id order is important (original note says "see above", but the referenced explanation is not in this class)
	public static final int Atr_tid__invalid = 1;
	public static final int Atr_tid__repeat  = 2;
	public static final int Atr_tid__pair    = 4;
	public static final int Atr_tid__name    = 8;

	// quote type ids; these occupy the low 3 bits of atr_utl
	public static final int Qte_tid__none = 0;
	public static final int Qte_tid__apos = 1;
	public static final int Qte_tid__qute = 2;

	// same numeric values as Qte_tid__*, under Mask__ names
	public static final int Mask__qte__none = 0;
	public static final int Mask__qte__apos = 1;
	public static final int Mask__qte_qute  = 2;

	// boolean flag bits of atr_utl
	public static final int Mask__valid      =  8;
	public static final int Mask__repeated   = 16;
	public static final int Mask__key_exists = 32;
	public static final int Mask__val_made   = 64;

	// readable y/n aliases for the boolean args of Calc_atr_utl
	public static final boolean Mask__valid__n      = false;
	public static final boolean Mask__valid__y      = true;
	public static final boolean Mask__key_exists__n = false;
	public static final boolean Mask__key_exists__y = true;
	public static final boolean Mask__repeated__n   = false;
	public static final boolean Mask__repeated__y   = true;
	public static final boolean Mask__val_made__n   = false;
	public static final boolean Mask__val_made__y   = true;

	// selects the low 3 bits (value 7), which hold the quote tid
	private static final int Mask__qte__bits = (1 << 3) - 1;

	// Packs the quote tid and the four flags into one int.
	// qte_tid is used as-is for the starting value; each flag that is true ORs in its Mask__* bit.
	public static int Calc_atr_utl(int qte_tid, boolean valid, boolean repeated, boolean key_exists, boolean val_made) {
		return qte_tid
			| (valid      ? Mask__valid      : 0)
			| (repeated   ? Mask__repeated   : 0)
			| (key_exists ? Mask__key_exists : 0)
			| (val_made   ? Mask__val_made   : 0)
			;
	}

	// Extracts the quote tid (low 3 bits) from a packed atr_utl value.
	public static int Calc_qte_tid(int val) {
		return val & Mask__qte__bits;
	}

	// Reads the atr_utl slot at data_ary[idx + Mwh_atr_mgr.Idx_atr_utl] and maps its quote tid to a quote byte.
	// NOTE: only Qte_tid__apos yields Byte_ascii.Apos; every other tid (including Qte_tid__none) yields Byte_ascii.Quote
	public static byte Calc_qte_byte(int[] data_ary, int idx) {
		int atr_utl = data_ary[idx + Mwh_atr_mgr.Idx_atr_utl];
		if (Calc_qte_tid(atr_utl) == Qte_tid__apos)
			return Byte_ascii.Apos;
		return Byte_ascii.Quote;
	}

	// key type ids
	public static final byte Key_tid__generic = 0;
	public static final byte Key_tid__id      = 1;
	public static final byte Key_tid__style   = 2;
	public static final byte Key_tid__role    = 3;
}

View File

@@ -0,0 +1,21 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
// Callback contract for an object that receives parsed attribute items.
// NOTE(review): presumably implemented by nodes that own attributes and invoked once per attribute; confirm against callers
public interface Mwh_atr_itm_owner {
	// wiki        : wiki passed along with the attribute
	// src         : byte array passed along with the attribute; NOTE(review): presumably the source text that xatr's positions index into; confirm
	// xatr        : the attribute item to apply
	// xatr_id_obj : NOTE(review): presumably an identifier object for the attribute's key; exact type not visible here
	void Xatr__set(
		Xowe_wiki wiki,
		byte[] src,
		Mwh_atr_itm xatr,
		Object xatr_id_obj
	);
}

View File

@@ -64,7 +64,7 @@ public class Mwh_atr_mgr {
}
data_ary[data_idx + Idx_nde_uid] = nde_uid;
data_ary[data_idx + Idx_nde_tid] = nde_tid;
data_ary[data_idx + Idx_atr_utl] = Mwh_atr_itm.Calc_atr_utl(qte_tid, valid, repeated, key_exists, val_made);
data_ary[data_idx + Idx_atr_utl] = Mwh_atr_itm_.Calc_atr_utl(qte_tid, valid, repeated, key_exists, val_made);
data_ary[data_idx + Idx_atr_bgn] = atr_bgn;
data_ary[data_idx + Idx_atr_end] = atr_end;
data_ary[data_idx + Idx_key_bgn] = key_bgn;
@@ -78,7 +78,7 @@ public class Mwh_atr_mgr {
int atr_utl_idx = (atr_uid * Idx__mult) + Idx_atr_utl;
int atr_utl = data_ary[atr_utl_idx];
int val_bry_exists = atr_utl & Atr_utl__val_bry_exists;
data_ary[atr_utl_idx] = Mwh_atr_itm.Atr_tid__repeat | val_bry_exists;
data_ary[atr_utl_idx] = Mwh_atr_itm_.Atr_tid__repeat | val_bry_exists;
}
public static final int
Idx_nde_uid = 0

View File

@@ -21,19 +21,19 @@ public class Mwh_atr_mgr_tst {
private final Mwh_atr_mgr_fxt fxt = new Mwh_atr_mgr_fxt();
@Test public void Atr_utl_make() {
// key="val"
fxt.Test_atr_utl_make(Mwh_atr_itm.Qte_tid__qute, Mwh_atr_itm.Mask__valid__y, Mwh_atr_itm.Mask__repeated__n, Mwh_atr_itm.Mask__key_exists__y, Mwh_atr_itm.Mask__val_made__n, 42);
fxt.Test_atr_utl_make(Mwh_atr_itm_.Qte_tid__qute, Mwh_atr_itm_.Mask__valid__y, Mwh_atr_itm_.Mask__repeated__n, Mwh_atr_itm_.Mask__key_exists__y, Mwh_atr_itm_.Mask__val_made__n, 42);
// key=val key=v<nowiki/>al
fxt.Test_atr_utl_make(Mwh_atr_itm.Qte_tid__none, Mwh_atr_itm.Mask__valid__y, Mwh_atr_itm.Mask__repeated__y, Mwh_atr_itm.Mask__key_exists__y, Mwh_atr_itm.Mask__val_made__y, 120);
fxt.Test_atr_utl_make(Mwh_atr_itm_.Qte_tid__none, Mwh_atr_itm_.Mask__valid__y, Mwh_atr_itm_.Mask__repeated__y, Mwh_atr_itm_.Mask__key_exists__y, Mwh_atr_itm_.Mask__val_made__y, 120);
}
}
class Mwh_atr_mgr_fxt {
public void Test_atr_utl_make(int qte_tid, boolean valid, boolean repeated, boolean key_exists, boolean val_made, int expd) {
int atr_utl = Mwh_atr_itm.Calc_atr_utl(qte_tid, valid, repeated, key_exists, val_made);
int atr_utl = Mwh_atr_itm_.Calc_atr_utl(qte_tid, valid, repeated, key_exists, val_made);
Tfds.Eq_int(expd, atr_utl);
Tfds.Eq_int(qte_tid, Mwh_atr_itm.Calc_qte_tid(atr_utl));
Tfds.Eq_bool(valid, (atr_utl & Mwh_atr_itm.Mask__valid) == Mwh_atr_itm.Mask__valid);
Tfds.Eq_bool(repeated, (atr_utl & Mwh_atr_itm.Mask__repeated) == Mwh_atr_itm.Mask__repeated);
Tfds.Eq_bool(key_exists, (atr_utl & Mwh_atr_itm.Mask__key_exists) == Mwh_atr_itm.Mask__key_exists);
Tfds.Eq_bool(val_made, (atr_utl & Mwh_atr_itm.Mask__val_made) == Mwh_atr_itm.Mask__val_made);
Tfds.Eq_int(qte_tid, Mwh_atr_itm_.Calc_qte_tid(atr_utl));
Tfds.Eq_bool(valid, (atr_utl & Mwh_atr_itm_.Mask__valid) == Mwh_atr_itm_.Mask__valid);
Tfds.Eq_bool(repeated, (atr_utl & Mwh_atr_itm_.Mask__repeated) == Mwh_atr_itm_.Mask__repeated);
Tfds.Eq_bool(key_exists, (atr_utl & Mwh_atr_itm_.Mask__key_exists) == Mwh_atr_itm_.Mask__key_exists);
Tfds.Eq_bool(val_made, (atr_utl & Mwh_atr_itm_.Mask__val_made) == Mwh_atr_itm_.Mask__val_made);
}
}

View File

@@ -26,115 +26,58 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT
private byte area = Area__atr_limbo;
private int atr_bgn = -1, key_bgn = -1, key_end = -1, eql_pos = -1, val_bgn = -1, val_end = -1;
private byte qte_byte = Byte_ascii.Null;
private boolean key_bfr_on = false, val_bfr_on = false, ws_is_before_val = false;
private boolean key_bfr_on = false, val_bfr_on = false, ws_is_before_val = false, qte_closed = false;
private int nde_uid, nde_tid;
public Bry_obj_ref Bry_obj() {return bry_ref;} private final Bry_obj_ref bry_ref = Bry_obj_ref.null_();
public int Nde_end_tid() {return nde_end_tid;} private int nde_end_tid;
public int Parse(Mwh_doc_wkr wkr, int nde_uid, int nde_tid, byte[] src, int src_bgn, int src_end) {
this.nde_uid = nde_uid; this.nde_tid = nde_tid;
this.nde_end_tid = Mwh_doc_parser.Nde_end_tid__invalid;
this.atr_bgn = -1;
area = Area__atr_limbo;
boolean prv_is_ws = false;
int pos = src_bgn;
boolean loop = true;
while (loop) {
if (pos == src_end) {
if (area == Area__val_quote) { // quote still open
int reset_pos = Bry_find_.Find_fwd(src, Byte_ascii.Space, val_bgn, src_end); // try to find 1st space within quote; EX:"a='b c=d" should try to reset at c=d
boolean reset_found = reset_pos != Bry_find_.Not_found;
area = Area__invalid; val_end = reset_found ? reset_pos : src_end;
Make(src, val_end); // create invalid atr
if (reset_found) { // space found; resume from text after space; EX: "a='b c=d"; PAGE:en.w:Aubervilliers DATE:2014-06-25
pos = Bry_find_.Find_fwd_while_not_ws(src, reset_pos, src_end); // skip ws
atr_bgn = -1;
area = Area__atr_limbo;
val_bfr.Clear();
val_bfr_on = false;
ws_is_before_val = false;
continue;
}
else
if (pos >= src_end) {
switch (area) {
case Area__key: // EX: "a"
case Area__eql_limbo: // EX: "a "
case Area__val_naked: // EX: "a=b"
break; // valid atr
case Area__val_quote: // EX: "a='b'"
if (qte_closed)
Make(src, src_end);
else { // dangling; EX: "a='b c=d"
int reset_pos = Bry_find_.Find_fwd(src, Byte_ascii.Space, val_bgn, src_end); // try to find 1st space within quote; EX:"a='b c=d" should try to reset at c=d
boolean reset_found = reset_pos != Bry_find_.Not_found;
area = Area__invalid; val_end = reset_found ? reset_pos : src_end;
Make(src, val_end); // create invalid atr
if (reset_found) { // space found; resume from text after space; EX: "a='b c=d"; PAGE:en.w:Aubervilliers DATE:2014-06-25
pos = Bry_find_.Find_fwd_while_not_ws(src, reset_pos, src_end); // skip ws
atr_bgn = -1;
area = Area__atr_limbo;
continue;
}
}
break;
case Area__invalid: case Area__atr_limbo:
case Area__val_limbo:
area = Area__invalid;
break;
}
else {
if (area == Area__val_limbo) // NOTE: handle dangling "k=" else will be "k"; EX: <a b=> x> <a b>; PAGE:en.s:Notes_by_the_Way/Chapter_2; DATE:2015-01-31
area = Area__invalid;
if (atr_bgn != -1) { // atr_bgn will be -1 if atrs ends on quoted (EX:"a='b'"); else, pending atr that needs to be processed; EX: "a=b" b wil be in bfr
val_end = src_end;
Make(src, src_end);
}
break;
if (atr_bgn != -1) {
val_end = src_end;
Make(src, val_end);
}
}
else if (pos > src_end)
break;
}
byte b = src[pos];
switch (area) {
case Area__atr_limbo: // 1st area after node_name or attribute
switch (b) {
// gt -> stop iterating
case Byte_ascii.Gt:
nde_end_tid = Mwh_doc_parser.Nde_end_tid__gt;
loop = false;
break;
// slash -> check for "/>" or " / "
case Byte_ascii.Slash:
int nxt_pos = pos + 1;
if (nxt_pos == src_end) {
pos = nxt_pos;
return Mwh_doc_parser.Nde_end_tid__invalid;
}
else if (src[nxt_pos] == Byte_ascii.Gt) {
nde_end_tid = Mwh_doc_parser.Nde_end_tid__inline;
pos = nxt_pos;
loop = false;
}
else {
area = Area__invalid; atr_bgn = pos;
}
break;
// ws -> ignore; skip any ws in atr_limbo; note that once a non-ws char is encountered, it will immediately go into another area
case Byte_ascii.Space: case Byte_ascii.Nl: case Byte_ascii.Tab:
if (atr_bgn == -1) atr_bgn = pos;
break;
// alphanum -> enter Area__key
case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4:
case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9:
case Byte_ascii.Ltr_A: case Byte_ascii.Ltr_B: case Byte_ascii.Ltr_C: case Byte_ascii.Ltr_D: case Byte_ascii.Ltr_E:
case Byte_ascii.Ltr_F: case Byte_ascii.Ltr_G: case Byte_ascii.Ltr_H: case Byte_ascii.Ltr_I: case Byte_ascii.Ltr_J:
case Byte_ascii.Ltr_K: case Byte_ascii.Ltr_L: case Byte_ascii.Ltr_M: case Byte_ascii.Ltr_N: case Byte_ascii.Ltr_O:
case Byte_ascii.Ltr_P: case Byte_ascii.Ltr_Q: case Byte_ascii.Ltr_R: case Byte_ascii.Ltr_S: case Byte_ascii.Ltr_T:
case Byte_ascii.Ltr_U: case Byte_ascii.Ltr_V: case Byte_ascii.Ltr_W: case Byte_ascii.Ltr_X: case Byte_ascii.Ltr_Y: case Byte_ascii.Ltr_Z:
case Byte_ascii.Ltr_a: case Byte_ascii.Ltr_b: case Byte_ascii.Ltr_c: case Byte_ascii.Ltr_d: case Byte_ascii.Ltr_e:
case Byte_ascii.Ltr_f: case Byte_ascii.Ltr_g: case Byte_ascii.Ltr_h: case Byte_ascii.Ltr_i: case Byte_ascii.Ltr_j:
case Byte_ascii.Ltr_k: case Byte_ascii.Ltr_l: case Byte_ascii.Ltr_m: case Byte_ascii.Ltr_n: case Byte_ascii.Ltr_o:
case Byte_ascii.Ltr_p: case Byte_ascii.Ltr_q: case Byte_ascii.Ltr_r: case Byte_ascii.Ltr_s: case Byte_ascii.Ltr_t:
case Byte_ascii.Ltr_u: case Byte_ascii.Ltr_v: case Byte_ascii.Ltr_w: case Byte_ascii.Ltr_x: case Byte_ascii.Ltr_y: case Byte_ascii.Ltr_z:
case Byte_ascii.Colon:
area = Area__key;
if (atr_bgn == -1) atr_bgn = pos;
key_bgn = pos;
break;
// lt -> check for <nowiki>
case Byte_ascii.Lt: // handle "<nowiki>"
int gt_pos = Xnde_find_gt(src, pos, src_end);
if (gt_pos == Bry_find_.Not_found) {
area = Area__invalid;
atr_bgn = pos;
}
else
pos = gt_pos; // position after ">"; note that there is ++pos below and loop will continue at gt_pos + 1 (next character after)
break;
// rest -> invalid
default: // quote and other non-valid key characters are invalid until next space; EX: "<span 'key_cannot_be_quoted' id='123'"
area = Area__invalid; atr_bgn = pos;
break;
}
break;
case Area__invalid:
switch (b) {
// ws -> src_end invalid area
case Byte_ascii.Space: case Byte_ascii.Nl: case Byte_ascii.Tab:
// ws -> end invalid area
case Byte_ascii.Tab: case Byte_ascii.Nl: case Byte_ascii.Cr: case Byte_ascii.Space:
Make(src, pos);
area = Area__atr_limbo;
break;
@@ -143,9 +86,13 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT
break;
}
break;
case Area__key:
case Area__atr_limbo: // 1st area after (a) node_name, (b) attribute, (c) invalid_area
switch (b) {
// alphanum -> valid key chars
// ws -> ignore; skip any ws in atr_limbo; note that once a non-ws char is encountered, it will immediately go into another area
case Byte_ascii.Tab: case Byte_ascii.Nl: case Byte_ascii.Cr: case Byte_ascii.Space:
if (atr_bgn == -1) atr_bgn = pos; // NOTE: atr_bgn == -1 needed for multiple spaces; ALSO: cannot move above switch b/c of <nowiki>
break;
// attribFirst -> enter Area__key; REF.MW: $attribFirst = '[:A-Z_a-z0-9]';
case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4:
case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9:
case Byte_ascii.Ltr_A: case Byte_ascii.Ltr_B: case Byte_ascii.Ltr_C: case Byte_ascii.Ltr_D: case Byte_ascii.Ltr_E:
@@ -158,26 +105,61 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT
case Byte_ascii.Ltr_k: case Byte_ascii.Ltr_l: case Byte_ascii.Ltr_m: case Byte_ascii.Ltr_n: case Byte_ascii.Ltr_o:
case Byte_ascii.Ltr_p: case Byte_ascii.Ltr_q: case Byte_ascii.Ltr_r: case Byte_ascii.Ltr_s: case Byte_ascii.Ltr_t:
case Byte_ascii.Ltr_u: case Byte_ascii.Ltr_v: case Byte_ascii.Ltr_w: case Byte_ascii.Ltr_x: case Byte_ascii.Ltr_y: case Byte_ascii.Ltr_z:
case Byte_ascii.Colon: case Byte_ascii.Dash: case Byte_ascii.Dot: case Byte_ascii.Underline:
case Byte_ascii.Colon: case Byte_ascii.Underline:
area = Area__key;
if (atr_bgn == -1) atr_bgn = pos; // NOTE: atr_bgn == -1 needed b/c of spaces
key_bgn = pos;
break;
// angle_bgn -> check for <nowiki>
case Byte_ascii.Angle_bgn: // handle "<nowiki>"
int gt_pos = Xnde_find_gt(src, pos, src_end);
if (gt_pos == Bry_find_.Not_found) {
area = Area__invalid; if (atr_bgn == -1) atr_bgn = pos;
}
else
pos = gt_pos; // position after ">"; note that there is ++pos below and loop will continue at gt_pos + 1 (next character after)
break;
// rest -> invalid
default: // quote and other non-valid key characters are invalid until next space; EX: "<span 'key_cannot_be_quoted' id='123'"
area = Area__invalid; if (atr_bgn == -1) atr_bgn = pos;
break;
}
break;
case Area__key:
switch (b) {
// alphanum -> valid key chars; REF.MW: $attrib = '[:A-Z_a-z-.0-9]';
case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4:
case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9:
case Byte_ascii.Ltr_A: case Byte_ascii.Ltr_B: case Byte_ascii.Ltr_C: case Byte_ascii.Ltr_D: case Byte_ascii.Ltr_E:
case Byte_ascii.Ltr_F: case Byte_ascii.Ltr_G: case Byte_ascii.Ltr_H: case Byte_ascii.Ltr_I: case Byte_ascii.Ltr_J:
case Byte_ascii.Ltr_K: case Byte_ascii.Ltr_L: case Byte_ascii.Ltr_M: case Byte_ascii.Ltr_N: case Byte_ascii.Ltr_O:
case Byte_ascii.Ltr_P: case Byte_ascii.Ltr_Q: case Byte_ascii.Ltr_R: case Byte_ascii.Ltr_S: case Byte_ascii.Ltr_T:
case Byte_ascii.Ltr_U: case Byte_ascii.Ltr_V: case Byte_ascii.Ltr_W: case Byte_ascii.Ltr_X: case Byte_ascii.Ltr_Y: case Byte_ascii.Ltr_Z:
case Byte_ascii.Ltr_a: case Byte_ascii.Ltr_b: case Byte_ascii.Ltr_c: case Byte_ascii.Ltr_d: case Byte_ascii.Ltr_e:
case Byte_ascii.Ltr_f: case Byte_ascii.Ltr_g: case Byte_ascii.Ltr_h: case Byte_ascii.Ltr_i: case Byte_ascii.Ltr_j:
case Byte_ascii.Ltr_k: case Byte_ascii.Ltr_l: case Byte_ascii.Ltr_m: case Byte_ascii.Ltr_n: case Byte_ascii.Ltr_o:
case Byte_ascii.Ltr_p: case Byte_ascii.Ltr_q: case Byte_ascii.Ltr_r: case Byte_ascii.Ltr_s: case Byte_ascii.Ltr_t:
case Byte_ascii.Ltr_u: case Byte_ascii.Ltr_v: case Byte_ascii.Ltr_w: case Byte_ascii.Ltr_x: case Byte_ascii.Ltr_y: case Byte_ascii.Ltr_z:
case Byte_ascii.Colon: case Byte_ascii.Underline: case Byte_ascii.Dash: case Byte_ascii.Dot:
if (key_bfr_on) key_bfr.Add_byte(b);
break;
// ws -> src_end key
case Byte_ascii.Space: case Byte_ascii.Nl: case Byte_ascii.Tab:
// ws -> end key
case Byte_ascii.Tab: case Byte_ascii.Nl: case Byte_ascii.Cr: case Byte_ascii.Space:
area = Area__eql_limbo;
key_end = pos;
break;
// eq -> src_end key; skip Area_eq and go to Area_val_bgn
// eq -> end key; go to Area_val_limbo
case Byte_ascii.Eq:
area = Area__val_limbo;
key_end = eql_pos = pos;
break;
// lt -> check for <nowiki>
case Byte_ascii.Lt:
// angle_bgn -> check for <nowiki>
case Byte_ascii.Angle_bgn:
int gt_pos = Xnde_find_gt(src, pos, src_end);
if (gt_pos == Bry_find_.Not_found) // "<" should not be in key; EX: "ke<y"
area = Area__invalid;
else {
if (!key_bfr_on) {key_bfr.Add_mid(src, key_bgn, pos); key_bfr_on = true;}
if (!key_bfr_on) {key_bfr.Add_mid(src, key_bgn, pos); key_bfr_on = true;}
pos = gt_pos; // note that there is ++pos below and loop will continue at gt_pos + 1 (next character after)
}
break;
@@ -190,40 +172,14 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT
case Area__eql_limbo:
switch (b) {
// ws -> skip
case Byte_ascii.Space: case Byte_ascii.Nl: case Byte_ascii.Tab: // skip ws
if (key_end == -1) { // EX: "a = b"; key_end != -1 b/c 1st \s sets key_end; EX: "a b = c"; key_end
val_end = pos - 1;
Make(src, pos);
area = Area__atr_limbo;
continue;
}
case Byte_ascii.Tab: case Byte_ascii.Nl: case Byte_ascii.Cr: case Byte_ascii.Space: // skip ws
break;
// eq -> enter Area__eq
// eq -> enter Area__val_limbo
case Byte_ascii.Eq:
eql_pos = pos;
area = Area__val_limbo;
break;
// rest -> make atr and enter limbo
case Byte_ascii.Quote: case Byte_ascii.Apos: // FUTURE: previous word was key
default: // NOTE: added this late; xml_parser was not handling "line start=3" DATE:2013-07-03
val_end = pos - 1;
Make(src, pos);
area = Area__atr_limbo;
continue;
}
break;
case Area__val_limbo:
switch (b) {
// ws -> skip
case Byte_ascii.Space: case Byte_ascii.Nl: case Byte_ascii.Tab:
ws_is_before_val = true;
break;
// quote -> enter Area_val_quote
case Byte_ascii.Quote: case Byte_ascii.Apos:
area = Area__val_quote; qte_byte = b; prv_is_ws = false;
val_bgn = pos + 1;
break;
// alphanum -> enter Area_val_raw
// attribFirst -> enter Area__key; REF.MW: $attribFirst = '[:A-Z_a-z0-9]';
case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4:
case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9:
case Byte_ascii.Ltr_A: case Byte_ascii.Ltr_B: case Byte_ascii.Ltr_C: case Byte_ascii.Ltr_D: case Byte_ascii.Ltr_E:
@@ -236,64 +192,124 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT
case Byte_ascii.Ltr_k: case Byte_ascii.Ltr_l: case Byte_ascii.Ltr_m: case Byte_ascii.Ltr_n: case Byte_ascii.Ltr_o:
case Byte_ascii.Ltr_p: case Byte_ascii.Ltr_q: case Byte_ascii.Ltr_r: case Byte_ascii.Ltr_s: case Byte_ascii.Ltr_t:
case Byte_ascii.Ltr_u: case Byte_ascii.Ltr_v: case Byte_ascii.Ltr_w: case Byte_ascii.Ltr_x: case Byte_ascii.Ltr_y: case Byte_ascii.Ltr_z:
case Byte_ascii.Colon:
case Byte_ascii.Hash:
area = Area__val_naked;
val_bgn = pos;
case Byte_ascii.Colon: case Byte_ascii.Underline:
Make(src, pos);
area = Area__key;
atr_bgn = key_bgn = pos;
break;
// lt -> check for <nowiki>
case Byte_ascii.Lt:
int gt_pos = Xnde_find_gt(src, pos, src_end);
if (gt_pos == Bry_find_.Not_found)
area = Area__invalid;
else
pos = gt_pos; // note that there is ++pos below and loop will continue at gt_pos + 1 (next character after)
break;
// rest -> ignore (?)
// rest -> make atr and enter limbo
default:
area = Area__invalid;
break;
}
break;
case Area__val_quote: { // EX: "'val' " in "key = 'val'"
case Area__val_limbo:
switch (b) {
// ws -> skip
case Byte_ascii.Tab: case Byte_ascii.Nl: case Byte_ascii.Cr: case Byte_ascii.Space:
ws_is_before_val = true;
break;
// quote -> enter Area_val_quote
case Byte_ascii.Quote: case Byte_ascii.Apos:
area = Area__val_quote; qte_byte = b; qte_closed = false;
prv_is_ws = false;
val_bgn = pos + 1;
break;
// alphanum -> enter Area_val_raw; REF.MW: [a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+
case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4:
case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9:
case Byte_ascii.Ltr_A: case Byte_ascii.Ltr_B: case Byte_ascii.Ltr_C: case Byte_ascii.Ltr_D: case Byte_ascii.Ltr_E:
case Byte_ascii.Ltr_F: case Byte_ascii.Ltr_G: case Byte_ascii.Ltr_H: case Byte_ascii.Ltr_I: case Byte_ascii.Ltr_J:
case Byte_ascii.Ltr_K: case Byte_ascii.Ltr_L: case Byte_ascii.Ltr_M: case Byte_ascii.Ltr_N: case Byte_ascii.Ltr_O:
case Byte_ascii.Ltr_P: case Byte_ascii.Ltr_Q: case Byte_ascii.Ltr_R: case Byte_ascii.Ltr_S: case Byte_ascii.Ltr_T:
case Byte_ascii.Ltr_U: case Byte_ascii.Ltr_V: case Byte_ascii.Ltr_W: case Byte_ascii.Ltr_X: case Byte_ascii.Ltr_Y: case Byte_ascii.Ltr_Z:
case Byte_ascii.Ltr_a: case Byte_ascii.Ltr_b: case Byte_ascii.Ltr_c: case Byte_ascii.Ltr_d: case Byte_ascii.Ltr_e:
case Byte_ascii.Ltr_f: case Byte_ascii.Ltr_g: case Byte_ascii.Ltr_h: case Byte_ascii.Ltr_i: case Byte_ascii.Ltr_j:
case Byte_ascii.Ltr_k: case Byte_ascii.Ltr_l: case Byte_ascii.Ltr_m: case Byte_ascii.Ltr_n: case Byte_ascii.Ltr_o:
case Byte_ascii.Ltr_p: case Byte_ascii.Ltr_q: case Byte_ascii.Ltr_r: case Byte_ascii.Ltr_s: case Byte_ascii.Ltr_t:
case Byte_ascii.Ltr_u: case Byte_ascii.Ltr_v: case Byte_ascii.Ltr_w: case Byte_ascii.Ltr_x: case Byte_ascii.Ltr_y: case Byte_ascii.Ltr_z:
case Byte_ascii.Bang: case Byte_ascii.Hash: case Byte_ascii.Dollar: case Byte_ascii.Percent: case Byte_ascii.Amp:
case Byte_ascii.Paren_bgn: case Byte_ascii.Paren_end: case Byte_ascii.Star: case Byte_ascii.Comma: case Byte_ascii.Dash: case Byte_ascii.Dot:
case Byte_ascii.Backslash: case Byte_ascii.Slash: case Byte_ascii.Colon: case Byte_ascii.Semic:
case Byte_ascii.Question: case Byte_ascii.At:
case Byte_ascii.Brack_bgn: case Byte_ascii.Brack_end: case Byte_ascii.Pow: case Byte_ascii.Underline: case Byte_ascii.Tick:
case Byte_ascii.Curly_bgn: case Byte_ascii.Curly_end: case Byte_ascii.Pipe: case Byte_ascii.Tilde:
area = Area__val_naked;
val_bgn = pos;
break;
// case Byte_ascii.Angle_end: NOTE: valid in MW; making invalid now until finding counter-example
// angle_bgn -> check for <nowiki>
case Byte_ascii.Angle_bgn:
int gt_pos = Xnde_find_gt(src, pos, src_end);
if (gt_pos == Bry_find_.Not_found)
area = Area__invalid; // NOTE: valid in MW; making invalid now until finding counter-example
else
pos = gt_pos; // note that there is ++pos below and loop will continue at gt_pos + 1 (next character after)
break;
// rest -> ignore
default:
area = Area__invalid;
break;
}
break;
case Area__val_quote: { // EX: "'val' " in "key = 'val'"; REF.MW: \"([^<\"]*)\"
switch (b) {
// quote: check if same as opening quote
case Byte_ascii.Quote: case Byte_ascii.Apos:
if (qte_byte == b) { // quote closes val
val_end = pos;
Make(src, pos + 1); // NOTE: set atr_end *after* quote
if (qte_closed)
area = Area__invalid;
else {
if (qte_byte == b) { // quote closes val
qte_closed = true;
val_end = pos;
}
else { // quote is just char; EX: title="1 o'clock" or title='The "C" way'
prv_is_ws = false; if (val_bfr_on) val_bfr.Add_byte(b); // INLINE: add char
}
}
else { // quote is just char; EX: title="1 o'clock" or title='The "C" way'
prv_is_ws = false; if (val_bfr_on) val_bfr.Add_byte(b); // INLINE: add char
}
break;
// lt -> check for <nowiki>; EX: <span title='ab<nowiki>c</nowiki>de'>
case Byte_ascii.Lt:
if (!val_bfr_on) {val_bfr.Add_mid(src, val_bgn, pos); val_bfr_on = true;} // INLINE: val_bfr.init
int gt_pos = Xnde_find_gt(src, pos, src_end);
if (gt_pos == Bry_find_.Not_found)
// area = Area__invalid; // DELETE: 2012-11-13; unpaired < should not mark atr invalid; EX: style='margin:1em<f'
val_bfr.Add_byte(Byte_ascii.Lt);
else
pos = gt_pos; // note that there is ++pos below and loop will continue at gt_pos + 1 (next character after)
prv_is_ws = false;
break;
// ws -> convert all ws to \s; only allow 1 ws at any point in time
case Byte_ascii.Nl: case Byte_ascii.Tab: case Byte_ascii.Cr: // REF.MW:Sanitizer.php|decodeTagAttributes $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
case Byte_ascii.Space:
if (!val_bfr_on) {val_bfr.Add_mid(src, val_bgn, pos); val_bfr_on = true;} // INLINE: val_bfr.init
if (prv_is_ws) {} // noop; only allow one ws at a time; EX: "a b" -> "a b"; "a\n\nb" -> "a b"
else {
prv_is_ws = true; val_bfr.Add_byte(Byte_ascii.Space);
case Byte_ascii.Tab: case Byte_ascii.Nl: case Byte_ascii.Cr: case Byte_ascii.Space: // REF.MW:Sanitizer.php|decodeTagAttributes $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
if (qte_closed) {
Make(src, pos); // NOTE: set atr_end *after* quote
if (atr_bgn == -1) atr_bgn = pos; // NOTE: process ws just like Area__atr_limbo
}
else {
if (!val_bfr_on) {val_bfr.Add_mid(src, val_bgn, pos); val_bfr_on = true;} // INLINE: val_bfr.init
if (prv_is_ws) {} // noop; only allow one ws at a time; EX: "a b" -> "a b"; "a\n\nb" -> "a b"
else {
prv_is_ws = true; val_bfr.Add_byte(Byte_ascii.Space);
}
}
break;
// angle_bgn -> check for <nowiki>; EX: <span title='ab<nowiki>c</nowiki>de'>
case Byte_ascii.Angle_bgn:
int gt_pos = Xnde_find_gt(src, pos, src_end);
if (gt_pos == Bry_find_.Not_found) {
// area = Area__invalid; // "<" inside quote is invalid; EX: <span title='a<b'>c</span>
if (val_bfr_on) val_bfr.Add_byte(b); // INLINE: add char
}
else {
if (qte_closed) {}
else {
if (!val_bfr_on) {val_bfr.Add_mid(src, val_bgn, pos); val_bfr_on = true;} // INLINE: val_bfr.init
}
pos = gt_pos; // note that there is ++pos below and loop will continue at gt_pos + 1 (next character after)
}
prv_is_ws = false;
break;
// rest -> add to val
default:
prv_is_ws = false; if (val_bfr_on) val_bfr.Add_byte(b); // INLINE: add char
if (qte_closed)
area = Area__invalid;
else {
prv_is_ws = false; if (val_bfr_on) val_bfr.Add_byte(b); // INLINE: add char
}
break;
}
break;
}
case Area__val_naked: // no quotes; EX:a=bcd
case Area__val_naked: // no quotes; EX:a=bcd; REF.MW:([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
switch (b) {
// alphanum -> continue reading
case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4:
@@ -308,16 +324,28 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT
case Byte_ascii.Ltr_k: case Byte_ascii.Ltr_l: case Byte_ascii.Ltr_m: case Byte_ascii.Ltr_n: case Byte_ascii.Ltr_o:
case Byte_ascii.Ltr_p: case Byte_ascii.Ltr_q: case Byte_ascii.Ltr_r: case Byte_ascii.Ltr_s: case Byte_ascii.Ltr_t:
case Byte_ascii.Ltr_u: case Byte_ascii.Ltr_v: case Byte_ascii.Ltr_w: case Byte_ascii.Ltr_x: case Byte_ascii.Ltr_y: case Byte_ascii.Ltr_z:
case Byte_ascii.Bang: case Byte_ascii.Hash: case Byte_ascii.Dollar: case Byte_ascii.Percent:
case Byte_ascii.Amp: case Byte_ascii.Paren_bgn: case Byte_ascii.Paren_end: case Byte_ascii.Star:
case Byte_ascii.Comma: case Byte_ascii.Dash: case Byte_ascii.Dot: case Byte_ascii.Slash:
case Byte_ascii.Colon: case Byte_ascii.Semic: case Byte_ascii.Gt:
case Byte_ascii.Question: case Byte_ascii.At: case Byte_ascii.Brack_bgn: case Byte_ascii.Brack_end:
case Byte_ascii.Pow: case Byte_ascii.Underline: case Byte_ascii.Tick:
case Byte_ascii.Curly_bgn: case Byte_ascii.Pipe: case Byte_ascii.Curly_end: case Byte_ascii.Tilde:
case Byte_ascii.Bang: case Byte_ascii.Hash: case Byte_ascii.Dollar: case Byte_ascii.Percent: case Byte_ascii.Amp:
case Byte_ascii.Paren_bgn: case Byte_ascii.Paren_end: case Byte_ascii.Star: case Byte_ascii.Comma: case Byte_ascii.Dash: case Byte_ascii.Dot:
case Byte_ascii.Backslash: case Byte_ascii.Slash: case Byte_ascii.Colon: case Byte_ascii.Semic:
case Byte_ascii.Question: case Byte_ascii.At:
case Byte_ascii.Brack_bgn: case Byte_ascii.Brack_end: case Byte_ascii.Pow: case Byte_ascii.Underline: case Byte_ascii.Tick:
case Byte_ascii.Curly_bgn: case Byte_ascii.Curly_end: case Byte_ascii.Pipe: case Byte_ascii.Tilde:
if (val_bfr_on) val_bfr.Add_byte(b); // INLINE: add char
break;
// case Byte_ascii.Angle_end: NOTE: valid in MW; making invalid now until finding counter-example
// angle_bgn -> check for <nowiki>; EX: a=b<nowiki>c</nowiki>d
case Byte_ascii.Angle_bgn:
int gt_pos = Xnde_find_gt(src, pos, src_end);
if (gt_pos == Bry_find_.Not_found) {
area = Area__invalid; // NOTE: valid in MW; making invalid now until finding counter-example
}
else {
if (!val_bfr_on) {val_bfr.Add_mid(src, val_bgn, pos); val_bfr_on = true;} // INLINE: val_bfr.init
pos = gt_pos; // note that there is ++pos below and loop will continue at gt_pos + 1 (next character after)
}
break;
// ws -> src_end atr
case Byte_ascii.Space: case Byte_ascii.Tab: case Byte_ascii.Nl:
case Byte_ascii.Tab: case Byte_ascii.Nl: case Byte_ascii.Cr: case Byte_ascii.Space:
val_end = pos;
Make(src, pos);
break;
@@ -329,14 +357,9 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT
key_end = pos;
area = Area__val_limbo; // set area to val_bgn (basically, put after =)
}
else // "a=b=c"; discard all
else // "a=b=c"; discard all
area = Area__invalid;
break;
case Byte_ascii.Lt:
val_end = pos;
Make(src, pos);
--pos; // NOTE: --pos to include "<" as part of next atr; above ws excludes from next atr
break;
default:
area = Area__invalid;
break;
@@ -355,10 +378,10 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT
byte[] key_bry = text_ary[j * Mwh_atr_mgr.Text__mult];
byte[] val_bry_manual = null;
int atr_utl = data_ary[itm_idx + Mwh_atr_mgr.Idx_atr_utl];
boolean atr_valid = (atr_utl & Mwh_atr_itm.Mask__valid) == Mwh_atr_itm.Mask__valid;
boolean repeated = (atr_utl & Mwh_atr_itm.Mask__repeated) == Mwh_atr_itm.Mask__repeated;
boolean key_exists = (atr_utl & Mwh_atr_itm.Mask__key_exists) == Mwh_atr_itm.Mask__key_exists;
boolean val_made = (atr_utl & Mwh_atr_itm.Mask__val_made) == Mwh_atr_itm.Mask__val_made;
boolean atr_valid = (atr_utl & Mwh_atr_itm_.Mask__valid) == Mwh_atr_itm_.Mask__valid;
boolean repeated = (atr_utl & Mwh_atr_itm_.Mask__repeated) == Mwh_atr_itm_.Mask__repeated;
boolean key_exists = (atr_utl & Mwh_atr_itm_.Mask__key_exists) == Mwh_atr_itm_.Mask__key_exists;
boolean val_made = (atr_utl & Mwh_atr_itm_.Mask__val_made) == Mwh_atr_itm_.Mask__val_made;
if (val_made)
val_bry_manual = text_ary[(j * Mwh_atr_mgr.Text__mult) + 1];
wkr.On_atr_each(this, src, nde_tid, atr_valid, repeated, key_exists, key_bry, val_bry_manual, data_ary, itm_idx);
@@ -368,6 +391,48 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT
return pos;
}
/**
 * Closes out the attribute currently being scanned: adds it to atr_mgr, marks any earlier
 * attribute with the same key as repeated, and resets the per-attribute scan state.
 * @param src     bytes being parsed; the key is sliced from it when no key buffer is active
 * @param atr_end end position stored for the attribute
 */
private void Make(byte[] src, int atr_end) {
	// resolve final key / val for the atr
	final boolean is_valid = area != Area__invalid;
	boolean has_pair = false;
	byte[] key = null, val = null;
	if (is_valid) {
		has_pair = key_bgn != -1 && val_bgn != -1;	// key && val exists; EX: "<input id='123'>"
		if (!has_pair) {							// not a pair; EX: "<input checked>"
			if (key_end == -1) key_end = val_end;	// NOTE: key_end == -1 when eos; EX: "a" would have key_bgn = 0; key_end = -1; val_end = 1 DATE:2014-07-03
			val_bgn = -1;
			val_end = -1;
		}
		// always make key; needed for repeated_atrs as well as key_tid
		if (key_bfr_on)
			key = key_bfr.To_bry_and_clear();
		else
			key = Bry_.Mid(src, key_bgn, key_end);
		if (val_bfr_on) val = val_bfr.To_bry_and_clear();
	}
	else {
		key = Bry_.Empty;
		key_bfr.Clear();
		if (val_bgn == -1) val_bgn = atr_bgn;
		val_bfr.Clear();
	}

	// map quote byte to quote mask
	int qte_tid;
	if (qte_byte == Byte_ascii.Null)
		qte_tid = Mwh_atr_itm_.Mask__qte__none;
	else if (qte_byte == Byte_ascii.Quote)
		qte_tid = Mwh_atr_itm_.Mask__qte_qute;
	else
		qte_tid = Mwh_atr_itm_.Mask__qte__apos;

	int new_uid = atr_mgr.Add(nde_uid, nde_tid, is_valid, false, has_pair, atr_bgn, atr_end, key_bgn, key_end, key, eql_pos, qte_tid, val_bgn, val_end, val);

	// repeated atrs: earlier atr with same key is flagged; hash then points at the newest atr
	if (is_valid) {
		int prv_uid = repeated_atrs_hash.Get_as_int_or(key, -1);
		if (prv_uid != -1) {
			repeated_atrs_hash.Del(key);
			atr_mgr.Set_repeated(prv_uid);
		}
		repeated_atrs_hash.Add_bry_int(key, new_uid);
	}

	// reset temp variables for next atr
	area = Area__atr_limbo;
	qte_byte = Byte_ascii.Null;
	atr_bgn = -1; key_bgn = -1; key_end = -1;
	val_bgn = -1; val_end = -1; eql_pos = -1;
	key_bfr_on = false; val_bfr_on = false;
	ws_is_before_val = false; qte_closed = false;
}
public int Xnde_find_gt_find(byte[] src, int pos, int end) {
bry_ref.Val_(null);
byte b = src[pos];
@@ -381,7 +446,7 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT
return bry == null ? Bry_find_.Not_found : bry.length + pos;
}
private int Xnde_find_gt(byte[] src, int lt_pos, int end) {
int pos = lt_pos + 1;
int pos = lt_pos + 1; if (pos == end) return Bry_find_.Not_found;
byte b = src[pos];
if (b == Byte_ascii.Slash && pos + 1 < end) {
++pos;
@@ -406,47 +471,6 @@ public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATT
}
return Bry_find_.Not_found;
}
/**
 * Finalizes the attribute currently being scanned: registers it with atr_mgr, flags any earlier
 * attribute with the same key as repeated, and resets the per-attribute scan state.
 * @param src     bytes being parsed; the key is sliced from it when no key buffer was used
 * @param atr_end end position recorded for the attribute
 */
private void Make(byte[] src, int atr_end) {
	// calc final values for atr
	boolean key_exists = false;
	byte[] key_bry = null, val_bry = null;
	boolean atr_valid = true;
	if (area != Area__invalid) {
		if (key_bgn != -1 && val_bgn != -1)		// key && val exists; EX: "<input id='123'>"
			key_exists = true;
		else {									// not a pair; EX: "<input checked>"
			if (key_end == -1) key_end = val_end;	// NOTE: key_end == -1 when eos; EX: "a" would have key_bgn = 0; key_end = -1; val_end = 1 DATE:2014-07-03
			val_bgn = val_end = -1;
		}
		key_bry = key_bfr_on ? key_bfr.Xto_bry_and_clear() : Bry_.Mid(src, key_bgn, key_end);	// always make key_bry; needed for repeated_atrs as well as key_tid
		if (val_bfr_on) val_bry = val_bfr.Xto_bry_and_clear();
	}
	else {
		atr_valid = false;
		key_bry = Bry_.Empty;
		key_bfr.Clear();
		if (val_bgn == -1) val_bgn = atr_bgn;
		// discard any partially buffered value as well; val_bfr_on is reset to false below,
		// so leftover bytes would otherwise stay in val_bfr and precede the next buffered value
		val_bfr.Clear();
	}
	int qte_tid = Mwh_atr_itm.Mask__qte__none;
	if (qte_byte != Byte_ascii.Null)
		qte_tid = qte_byte == Byte_ascii.Quote ? Mwh_atr_itm.Mask__qte_qute : Mwh_atr_itm.Mask__qte__apos;
	int atr_uid = atr_mgr.Add(nde_uid, nde_tid, atr_valid, false, key_exists, atr_bgn, atr_end, key_bgn, key_end, key_bry, eql_pos, qte_tid, val_bgn, val_end, val_bry);
	// handle repeated atrs: the earlier atr with the same key is marked repeated; hash then tracks the newest uid
	if (atr_valid) {
		int repeated_uid = repeated_atrs_hash.Get_as_int_or(key_bry, -1);
		if (repeated_uid != -1) {
			repeated_atrs_hash.Del(key_bry);
			atr_mgr.Set_repeated(repeated_uid);
		}
		repeated_atrs_hash.Add_bry_int(key_bry, atr_uid);
	}
	// reset temp variables
	area = Area__atr_limbo; qte_byte = Byte_ascii.Null;
	atr_bgn = key_bgn = val_bgn = key_end = val_end = eql_pos = -1;
	key_bfr_on = val_bfr_on = ws_is_before_val = false;
}
private static final Hash_adp_bry xnde_hash = Hash_adp_bry.ci_a7()
.Add_bry_bry(Xop_xnde_tag_.Tag_nowiki.Name_bry())
.Add_bry_bry(Xop_xnde_tag_.Tag_noinclude.Name_bry())

View File

@@ -21,7 +21,7 @@ class Mwh_atr_parser_fxt {
private final Mwh_atr_parser parser = new Mwh_atr_parser();
private final Mwh_doc_wkr__atr_bldr wkr = new Mwh_doc_wkr__atr_bldr();
public Mwh_atr_itm Make_pair(String key, String val) {return new Mwh_atr_itm(Bry_.Empty, Bool_.Y, Bool_.N, Bool_.Y, -1, -1, -1, -1, Bry_.new_u8(key) , -1, -1, Bry_.new_u8(val) , -1, -1);}
public Mwh_atr_itm Make_name(String key) {return new Mwh_atr_itm(Bry_.Empty, Bool_.Y, Bool_.N, Bool_.N, -1, -1, -1, -1, Bry_.new_u8(key) , -1, -1, null , -1, -1);}
public Mwh_atr_itm Make_name(String key) {return new Mwh_atr_itm(Bry_.Empty, Bool_.Y, Bool_.N, Bool_.N, -1, -1, -1, -1, Bry_.new_u8(key) , -1, -1, Bry_.new_u8(key) , -1, -1);}
public Mwh_atr_itm Make_fail(int bgn, int end) {return new Mwh_atr_itm(Bry_.Empty, Bool_.N, Bool_.N, Bool_.N, bgn, end, -1, -1, null , -1, -1, null , -1, -1);}
public void Test_val_as_int(String raw, int expd) {
byte[] src = Bry_.new_u8(raw);
@@ -44,14 +44,14 @@ class Mwh_atr_parser_fxt {
for (int i = 0; i < len; ++i) {
To_bfr(expd_bfr, i < expd_len ? expd_ary[i] : null, actl_bfr, i < actl_len ? actl_ary[i] : null);
}
Tfds.Eq_str_lines(expd_bfr.Xto_str_and_clear(), actl_bfr.Xto_str_and_clear());
Tfds.Eq_str_lines(expd_bfr.To_str_and_clear(), actl_bfr.To_str_and_clear());
}
private void To_bfr(Bry_bfr expd_bfr, Mwh_atr_itm expd_itm, Bry_bfr actl_bfr, Mwh_atr_itm actl_itm) {
To_bfr__main(expd_bfr, expd_itm);
To_bfr__main(actl_bfr, actl_itm);
To_bfr__head(expd_bfr, expd_itm);
To_bfr__head(actl_bfr, actl_itm);
if (expd_itm.Atr_bgn() != -1) {
if (expd_itm != null && expd_itm.Atr_bgn() != -1) {
To_bfr__atr_rng(expd_bfr, expd_itm);
To_bfr__atr_rng(actl_bfr, actl_itm);
}
@@ -74,26 +74,3 @@ class Mwh_atr_parser_fxt {
bfr.Add_str_a7("rng:").Add_int_variable(itm.Atr_bgn()).Add_byte_semic().Add_int_variable(itm.Atr_end()).Add_byte_nl();
}
}
/** Mwh_doc_wkr that turns each reported attribute into a Mwh_atr_itm and keeps it until To_atr_ary() is called; all other callbacks are no-ops. */
class Mwh_doc_wkr__atr_bldr implements Mwh_doc_wkr {
	private final List_adp atrs = List_adp_.new_();
	public Hash_adp_bry Nde_regy() {return null;}
	public void On_atr_each(Mwh_atr_parser mgr, byte[] src, int nde_tid, boolean valid, boolean repeated, boolean key_exists, byte[] key_bry, byte[] val_bry_manual, int[] data_ary, int itm_idx) {
		// positions are read from data_ary at fixed Idx_* offsets from itm_idx; qte_tid is derived from the atr_utl slot
		atrs.Add(new Mwh_atr_itm
		( src, valid, repeated, key_exists
		, data_ary[itm_idx + Mwh_atr_mgr.Idx_atr_bgn]
		, data_ary[itm_idx + Mwh_atr_mgr.Idx_atr_end]
		, data_ary[itm_idx + Mwh_atr_mgr.Idx_key_bgn]
		, data_ary[itm_idx + Mwh_atr_mgr.Idx_key_end]
		, key_bry
		, data_ary[itm_idx + Mwh_atr_mgr.Idx_val_bgn]
		, data_ary[itm_idx + Mwh_atr_mgr.Idx_val_end]
		, val_bry_manual
		, data_ary[itm_idx + Mwh_atr_mgr.Idx_eql_pos]
		, Mwh_atr_itm.Calc_qte_tid(data_ary[itm_idx + Mwh_atr_mgr.Idx_atr_utl])
		));
	}
	// non-atr events are ignored
	public void On_txt_end(Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {
	}
	public void On_nde_head_bgn(Mwh_doc_parser mgr, byte[] src, int nde_tid, int key_bgn, int key_end) {
	}
	public void On_nde_head_end(Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end, boolean inline) {
	}
	public void On_nde_tail_end(Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {
	}
	public void On_comment_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {
	}
	public Mwh_atr_itm[] To_atr_ary() {
		Object rv = atrs.To_ary_and_clear(Mwh_atr_itm.class);
		return (Mwh_atr_itm[])rv;
	}
}

View File

@@ -22,6 +22,7 @@ public class Mwh_atr_parser_tst {
@Test public void Pair__quote__double() {fxt.Test_parse("a=\"b\"" , fxt.Make_pair("a" , "b"));}
@Test public void Pair__quote__single() {fxt.Test_parse("a='b'" , fxt.Make_pair("a" , "b"));}
@Test public void Pair__quote__none() {fxt.Test_parse("a=b" , fxt.Make_pair("a" , "b"));}
@Test public void Pair__quote__none__amp() {fxt.Test_parse("a=&bc" , fxt.Make_pair("a" , "&bc"));}
@Test public void Pair__empty() {fxt.Test_parse("a=''" , fxt.Make_pair("a" , ""));}
@Test public void Pair__key_w_underline() {fxt.Test_parse("a_b=c" , fxt.Make_pair("a_b" , "c"));}
@@ -46,18 +47,32 @@ public class Mwh_atr_parser_tst {
@Test public void Many__quote__apos() {fxt.Test_parse("a='b' c='d' e='f'" , fxt.Make_pair("a", "b"), fxt.Make_pair("c", "d"), fxt.Make_pair("e", "f"));}
@Test public void Many__naked() {fxt.Test_parse("a=b c=d e=f" , fxt.Make_pair("a", "b"), fxt.Make_pair("c", "d"), fxt.Make_pair("e", "f"));}
@Test public void Many__naked__pair() {fxt.Test_parse("a b=c" , fxt.Make_name("a"), fxt.Make_pair("b", "c"));}
@Test public void Val__ws__nl() {fxt.Test_parse("a='b\nc'" , fxt.Make_pair("a", "b c"));}
@Test public void Val__ws__mult() {fxt.Test_parse("a='b c'" , fxt.Make_pair("a", "b c"));}
@Test public void Val__ws__mult_mult() {fxt.Test_parse("a='b c d'" , fxt.Make_pair("a", "b c d"));} // PURPOSE: fix wherein 1st-gobble gobbled rest of spaces (was b cd)
@Test public void Val__apos() {fxt.Test_parse("a=\"b c'd\"" , fxt.Make_pair("a", "b c'd"));} // PURPOSE: fix wherein apos was gobbled up; PAGE:en.s:Alice's_Adventures_in_Wonderland; DATE:2013-11-22
@Test public void Val__apos_2() {fxt.Test_parse("a=\"b'c d\"" , fxt.Make_pair("a", "b'c d"));} // PURPOSE: fix wherein apos was causing "'b'c d"; PAGE:en.s:Grimm's_Household_Tales,_Volume_1; DATE:2013-12-22
@Test public void Quote__ws__nl() {fxt.Test_parse("a='b\nc'" , fxt.Make_pair("a", "b c"));}
@Test public void Quote__ws__mult() {fxt.Test_parse("a='b c'" , fxt.Make_pair("a", "b c"));}
@Test public void Quote__ws__mult_mult() {fxt.Test_parse("a='b c d'" , fxt.Make_pair("a", "b c d"));} // PURPOSE: fix wherein 1st-gobble gobbled rest of spaces (was b cd)
@Test public void Quote__apos() {fxt.Test_parse("a=\"b c'd\"" , fxt.Make_pair("a", "b c'd"));} // PURPOSE: fix wherein apos was gobbled up; PAGE:en.s:Alice's_Adventures_in_Wonderland; DATE:2013-11-22
@Test public void Quote__apos_2() {fxt.Test_parse("a=\"b'c d\"" , fxt.Make_pair("a", "b'c d"));} // PURPOSE: fix wherein apos was causing "'b'c d"; PAGE:en.s:Grimm's_Household_Tales,_Volume_1; DATE:2013-12-22
// @Test public void Quote__angle() {fxt.Test_parse("a='<'" , fxt.Make_fail(0, 5));} // PURPOSE: "<" inside quotes is always invalid
@Test public void Quote__invalid() {fxt.Test_parse("a='b'c" , fxt.Make_fail(0, 6));}
@Test public void Nowiki__val() {fxt.Test_parse("a=<nowiki>'b'</nowiki>" , fxt.Make_pair("a", "b").Atr_rng(0, 13));}
@Test public void Nowiki__key() {fxt.Test_parse("<nowiki>a=b</nowiki>" , fxt.Make_pair("a", "b").Atr_rng(8, 11));}
@Test public void Nowiki__key_2() {fxt.Test_parse("a<nowiki>b</nowiki>c=d" , fxt.Make_pair("abc", "d").Atr_rng(0, 22));}
@Test public void Nowiki__key_3() {fxt.Test_parse("a<nowiki>=</nowiki>\"b\"" , fxt.Make_pair("a", "b").Atr_rng(0, 22));} // EX:fr.w:{{Portail|Transpédia|Californie}}
@Test public void Nowiki__quote() {fxt.Test_parse("a=\"b<nowiki>c</nowiki>d<nowiki>e</nowiki>f\"", fxt.Make_pair("a", "bcdef"));}
@Test public void Nowiki__atr() {fxt.Test_parse("<nowiki>a=b</nowiki>" , fxt.Make_pair("a", "b").Atr_rng(8, 20));}
@Test public void Nowiki__key() {fxt.Test_parse("a<nowiki>b</nowiki>c=d" , fxt.Make_pair("abc", "d").Atr_rng(0, 22));}
@Test public void Nowiki__eql() {fxt.Test_parse("a<nowiki>=</nowiki>\"b\"" , fxt.Make_pair("a", "b").Atr_rng(0, 22));} // EX:fr.w:{{Portail|Transpédia|Californie}}
@Test public void Nowiki__val__naked() {fxt.Test_parse("a=b<nowiki>c</nowiki>d" , fxt.Make_pair("a", "bcd").Atr_rng(0, 22));}
@Test public void Nowiki__val__quote() {fxt.Test_parse("a=<nowiki>'b'</nowiki>" , fxt.Make_pair("a", "b").Atr_rng(0, 22));}
@Test public void Nowiki__val__quote_2() {fxt.Test_parse("a=\"b<nowiki>c</nowiki>d<nowiki>e</nowiki>f\"", fxt.Make_pair("a", "bcdef"));}
@Test public void Val__as_int() {fxt.Test_val_as_int("-123" , -123);}
// @Test public void Embedded() { // PURPOSE: handle html inside attrib; PAGE:en.w:Economy_of_Greece DATE:2015-10-15
// fxt.Test_parse("title='<sup id='cite_ref-a_1-0' class='reference'><a href='#cite_note-a-1'>[1]</a></sup> c'"
// , fxt.Make_fail(0, 11) // "title='<sup" invalid b/c of "<"
// , fxt.Make_pair("id", "cite_ref-a_1-0")
// , fxt.Make_fail(31, 52) // "class='reference'><a" invalid b/c no ws after '
// , fxt.Make_fail(53, 88) // "href='#cite_note-a-1'>[1]</a></sup>" invalid b/c no ws after '
// , fxt.Make_fail(89, 91) // " c'" invalid b/c name (c) cannot have apos
// );
// }
}

View File

@@ -21,5 +21,5 @@ class Mwh_doc_itm {
public int Itm_tid() {return itm_tid;} private final int itm_tid;
public byte[] Itm_bry() {return itm_bry;} private final byte[] itm_bry;
public int Nde_tid() {return nde_tid;} private final int nde_tid;
public static final int Itm_tid__txt = 0, Itm_tid__nde_head = 1, Itm_tid__nde_tail = 2, Itm_tid__comment = 3;
public static final int Itm_tid__txt = 0, Itm_tid__nde_head = 1, Itm_tid__nde_tail = 2, Itm_tid__comment = 3, Itm_tid__entity = 4;
}

View File

@@ -17,11 +17,12 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
import gplx.core.primitives.*;
import gplx.xowa.parsers.xndes.*;
import gplx.xowa.parsers.amps.*; import gplx.xowa.parsers.xndes.*;
public class Mwh_doc_parser {
private final Mwh_doc_mgr dom_mgr = new Mwh_doc_mgr(16);
private final Mwh_atr_parser atr_parser = new Mwh_atr_parser();
private final List_adp nde_stack = List_adp_.new_();
private final Xop_amp_mgr amp_mgr = Xop_amp_mgr.Instance; private final Xop_tkn_mkr tkn_mkr = new Xop_tkn_mkr();
private byte[] src; private int src_end;
private Mwh_doc_wkr wkr;
private Hash_adp_bry nde_regy;
@@ -34,11 +35,28 @@ public class Mwh_doc_parser {
int pos = txt_bgn = src_bgn;
nde_uid = cur_nde_tid = -1;
cur_nde = null;
while (pos < src_end) {
if (src[pos] == Byte_ascii.Angle_bgn) // "<": possible nde start
pos = Parse_nde(pos);
else // else, just increment
++pos;
byte b = src[pos];
switch (b) {
case Byte_ascii.Angle_bgn: // "<": possible nde start
pos = Parse_nde(pos);
break;
case Byte_ascii.Amp: // "&": check for entity; EX: &nbsp; in sr-ec -> sr-el
Xop_tkn_itm tkn = amp_mgr.Parse_as_tkn(tkn_mkr, src, src_end, pos, pos + 1);
if (tkn == null)
++pos;
else {
wkr.On_txt_end(this, src, cur_nde_tid, txt_bgn, pos);
wkr.On_entity_end(this, src, cur_nde_tid, tkn.Src_bgn(), tkn.Src_end());
pos = tkn.Src_end();
txt_bgn = pos;
}
break;
default: // else, just increment
++pos;
break;
}
}
if (src_end != txt_bgn) wkr.On_txt_end(this, src, cur_nde_tid, txt_bgn, pos);
}
@@ -142,10 +160,45 @@ public class Mwh_doc_parser {
break;
case Nde_end_tid__ws:
case Nde_end_tid__slash:
case Nde_end_tid__backslash: // handled above
pos = atr_parser.Parse(wkr, nde_uid, cur_nde_tid, src, pos, src_end);
nde_end_tid = atr_parser.Nde_end_tid();
txt_bgn = pos;
case Nde_end_tid__backslash:
// look for ">" or "/>"
int tmp_pos = pos, atrs_end = src_end, head_end = src_end;
boolean loop = true;
while (loop) {
byte b = src[tmp_pos];
switch (b) {
// angle_end -> stop iterating
case Byte_ascii.Angle_end:
atrs_end = tmp_pos;
head_end = tmp_pos + 1;
nde_end_tid = Mwh_doc_parser.Nde_end_tid__gt;
loop = false;
break;
// slash -> check for "/>" or " / "
case Byte_ascii.Slash:
int nxt_pos = tmp_pos + 1;
if (nxt_pos == src_end) {
nde_end_tid = Mwh_doc_parser.Nde_end_tid__invalid;
loop = false;
}
else if (src[nxt_pos] == Byte_ascii.Angle_end) {
atrs_end = tmp_pos;
head_end = tmp_pos + 2;
nde_end_tid = Mwh_doc_parser.Nde_end_tid__inline;
loop = false;
}
break;
}
if (loop) {
++tmp_pos;
if (tmp_pos == src_end) break;
}
else
break;
}
atr_parser.Parse(wkr, nde_uid, cur_nde_tid, src, pos, atrs_end);
pos = head_end;
txt_bgn = head_end;
break;
}
switch (nde_end_tid) {

View File

@@ -23,6 +23,7 @@ class Mwh_doc_parser_fxt {
public Mwh_doc_itm Make_txt (String raw) {return new Mwh_doc_itm(Mwh_doc_itm.Itm_tid__txt , -1, Bry_.new_u8(raw));}
public Mwh_doc_itm Make_txt (String raw, int nde_tid) {return new Mwh_doc_itm(Mwh_doc_itm.Itm_tid__txt , nde_tid, Bry_.new_u8(raw));}
public Mwh_doc_itm Make_comment (String raw) {return new Mwh_doc_itm(Mwh_doc_itm.Itm_tid__comment , -1, Bry_.new_u8(raw));}
public Mwh_doc_itm Make_entity (String raw) {return new Mwh_doc_itm(Mwh_doc_itm.Itm_tid__entity , -1, Bry_.new_u8(raw));}
public Mwh_doc_itm Make_nde_head(String raw) {return new Mwh_doc_itm(Mwh_doc_itm.Itm_tid__nde_head , -1, Bry_.new_u8(raw));}
public Mwh_doc_itm Make_nde_tail(String raw) {return new Mwh_doc_itm(Mwh_doc_itm.Itm_tid__nde_tail , -1, Bry_.new_u8(raw));}
public void Test_parse(String raw, Mwh_doc_itm... expd) {
@@ -41,7 +42,7 @@ class Mwh_doc_parser_fxt {
for (int i = 0; i < len; ++i) {
To_bfr(expd_bfr, i < expd_len ? expd_ary[i] : null, actl_bfr, i < actl_len ? actl_ary[i] : null);
}
Tfds.Eq_str_lines(expd_bfr.Xto_str_and_clear(), actl_bfr.Xto_str_and_clear());
Tfds.Eq_str_lines(expd_bfr.To_str_and_clear(), actl_bfr.To_str_and_clear());
}
private void To_bfr(Bry_bfr expd_bfr, Mwh_doc_itm expd_itm, Bry_bfr actl_bfr, Mwh_doc_itm actl_itm) {
To_bfr__main(expd_bfr, expd_itm); To_bfr__main(actl_bfr, actl_itm);
@@ -62,12 +63,13 @@ class Mwh_doc_parser_fxt {
// Mwh_doc_wkr used by the doc-parser fixture: records each text / node-head / node-tail / comment / entity event
// as a Mwh_doc_itm holding the item type, nde_tid and the src bytes between itm_bgn and itm_end.
class Mwh_doc_wkr__itm_bldr implements Mwh_doc_wkr {
private final List_adp list = List_adp_.new_();
// registry is obtained once from Mwh_doc_wkr_.Nde_regy__mw()
public Hash_adp_bry Nde_regy() {return nde_regy;} private final Hash_adp_bry nde_regy = Mwh_doc_wkr_.Nde_regy__mw();
// atr events are ignored by this worker (empty body)
public void On_atr_each (Mwh_atr_parser mgr, byte[] src, int nde_tid, boolean valid, boolean repeated, boolean key_exists, byte[] key_bry, byte[] val_bry_manual, int[] itm_ary, int itm_idx) {}
public void On_atr_each (Mwh_atr_parser mgr, byte[] src, int nde_tid, boolean valid, boolean repeated, boolean key_exists, byte[] key_bry, byte[] val_bry_manual, int[] itm_ary, int itm_idx) {}
// each handler below adds one Mwh_doc_itm with the matching Itm_tid__* constant
public void On_txt_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {list.Add(new Mwh_doc_itm(Mwh_doc_itm.Itm_tid__txt , nde_tid, Bry_.Mid(src, itm_bgn, itm_end)));}
// head_bgn adds nothing; the head is recorded once in On_nde_head_end
public void On_nde_head_bgn (Mwh_doc_parser mgr, byte[] src, int nde_tid, int key_bgn, int key_end) {}
public void On_nde_head_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end, boolean inline) {list.Add(new Mwh_doc_itm(Mwh_doc_itm.Itm_tid__nde_head , nde_tid, Bry_.Mid(src, itm_bgn, itm_end)));}
public void On_nde_tail_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {list.Add(new Mwh_doc_itm(Mwh_doc_itm.Itm_tid__nde_tail , nde_tid, Bry_.Mid(src, itm_bgn, itm_end)));}
public void On_comment_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {list.Add(new Mwh_doc_itm(Mwh_doc_itm.Itm_tid__comment , nde_tid, Bry_.Mid(src, itm_bgn, itm_end)));}
public void On_entity_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {list.Add(new Mwh_doc_itm(Mwh_doc_itm.Itm_tid__entity , nde_tid, Bry_.Mid(src, itm_bgn, itm_end)));}
// hands back the recorded items as a Mwh_doc_itm[] via list.To_ary_and_clear
public Mwh_doc_itm[] To_atr_ary() {return (Mwh_doc_itm[])list.To_ary_and_clear(Mwh_doc_itm.class);}
}

View File

@@ -21,6 +21,7 @@ public class Mwh_doc_parser_tst {
private final Mwh_doc_parser_fxt fxt = new Mwh_doc_parser_fxt();
@Test public void Text__basic() {fxt.Test_parse("abc" , fxt.Make_txt("abc"));}
@Test public void Comment() {fxt.Test_parse("a<!--b-->c" , fxt.Make_txt("a"), fxt.Make_comment("<!--b-->"), fxt.Make_txt("c"));}
@Test public void Entity() {fxt.Test_parse("a&nbsp;b" , fxt.Make_txt("a"), fxt.Make_entity("&nbsp;"), fxt.Make_txt("b"));}
@Test public void Fail__inline_eos() {fxt.Test_parse("a<b/" , fxt.Make_txt("a<b/"));}
@Test public void Fail__unknown() {fxt.Test_parse("a<bc/>d" , fxt.Make_txt("a<bc/>d"));}
@Test public void Node__inline() {fxt.Test_parse("a<b/>c" , fxt.Make_txt("a"), fxt.Make_nde_head("<b/>") , fxt.Make_txt("c"));}

View File

@@ -24,4 +24,5 @@ public interface Mwh_doc_wkr {
void On_nde_head_end(Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end, boolean inline);
void On_nde_tail_end(Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end);
void On_comment_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end);
void On_entity_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end);
}

View File

@@ -0,0 +1,43 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
/** Mwh_doc_wkr that collects every attribute reported by the atr parser as a Mwh_atr_itm; text, node, comment and entity events are ignored. */
public class Mwh_doc_wkr__atr_bldr implements Mwh_doc_wkr {
	private final List_adp atr_list = List_adp_.new_();
	public Hash_adp_bry Nde_regy() {return null;}
	public void On_atr_each(Mwh_atr_parser mgr, byte[] src, int nde_tid, boolean valid, boolean repeated, boolean key_exists, byte[] key_bry, byte[] val_bry_manual, int[] data_ary, int itm_idx) {
		// unpack positions stored at fixed Idx_* offsets from itm_idx
		final int bgn_of_atr = data_ary[itm_idx + Mwh_atr_mgr.Idx_atr_bgn];
		final int end_of_atr = data_ary[itm_idx + Mwh_atr_mgr.Idx_atr_end];
		final int bgn_of_key = data_ary[itm_idx + Mwh_atr_mgr.Idx_key_bgn];
		final int end_of_key = data_ary[itm_idx + Mwh_atr_mgr.Idx_key_end];
		final int bgn_of_val = data_ary[itm_idx + Mwh_atr_mgr.Idx_val_bgn];
		final int end_of_val = data_ary[itm_idx + Mwh_atr_mgr.Idx_val_end];
		final int pos_of_eql = data_ary[itm_idx + Mwh_atr_mgr.Idx_eql_pos];
		final int quote_tid = Mwh_atr_itm_.Calc_qte_tid(data_ary[itm_idx + Mwh_atr_mgr.Idx_atr_utl]);
		// when key_exists is false, the key bytes are passed as the val
		final byte[] val_bry = key_exists ? val_bry_manual : key_bry;
		atr_list.Add(new Mwh_atr_itm(src, valid, repeated, key_exists, bgn_of_atr, end_of_atr, bgn_of_key, end_of_key, key_bry, bgn_of_val, end_of_val, val_bry, pos_of_eql, quote_tid));
	}
	// remaining callbacks are intentionally empty
	public void On_txt_end(Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {
	}
	public void On_nde_head_bgn(Mwh_doc_parser mgr, byte[] src, int nde_tid, int key_bgn, int key_end) {
	}
	public void On_nde_head_end(Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end, boolean inline) {
	}
	public void On_nde_tail_end(Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {
	}
	public void On_comment_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {
	}
	public void On_entity_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {
	}
	public Mwh_atr_itm[] To_atr_ary() {
		Object rv = atr_list.To_ary_and_clear(Mwh_atr_itm.class);
		return (Mwh_atr_itm[])rv;
	}
}