1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2026-03-02 03:49:30 +00:00
This commit is contained in:
gnosygnu
2015-09-27 23:04:13 -04:00
parent fa70c05354
commit 8e18af05b6
84 changed files with 2795 additions and 507 deletions

View File

@@ -0,0 +1,86 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
/** One parsed html attribute: flags, byte offsets into src, and optional pre-built key / val byte arrays. */
public class Mwh_atr_itm {
	// low 3 bits of a packed atr_utl int hold the quote type
	private static final int Qte_tid_mask = (1 << 3) - 1;

	private final byte[] src;
	private final boolean valid;
	private final boolean repeated;
	private final boolean key_exists;
	private int atr_bgn;
	private int atr_end;
	private final int key_bgn;
	private final int key_end;
	private byte[] key_bry;
	private byte key_tid;
	private final int val_bgn;
	private final int val_end;
	private byte[] val_bry;	// lazily filled by Val_as_bry when null
	private final int eql_pos;
	private final int qte_tid;

	public Mwh_atr_itm
	( byte[] src, boolean valid, boolean repeated, boolean key_exists, int atr_bgn, int atr_end
	, int key_bgn, int key_end, byte[] key_bry
	, int val_bgn, int val_end, byte[] val_bry
	, int eql_pos, int qte_tid
	) {
		this.src = src;
		this.valid = valid;
		this.repeated = repeated;
		this.key_exists = key_exists;
		this.atr_bgn = atr_bgn;
		this.atr_end = atr_end;
		this.key_bgn = key_bgn;
		this.key_end = key_end;
		this.key_bry = key_bry;
		this.val_bgn = val_bgn;
		this.val_end = val_end;
		this.val_bry = val_bry;
		this.eql_pos = eql_pos;
		this.qte_tid = qte_tid;
	}

	// simple accessors
	public byte[] Src() {return src;}
	public boolean Valid() {return valid;}
	public boolean Key_exists() {return key_exists;}
	public boolean Repeated() {return repeated;}
	public int Atr_bgn() {return atr_bgn;}
	public int Atr_end() {return atr_end;}
	public int Key_bgn() {return key_bgn;}
	public int Key_end() {return key_end;}
	public byte[] Key_bry() {return key_bry;}
	public byte Key_tid() {return key_tid;}
	public int Val_bgn() {return val_bgn;}
	public int Val_end() {return val_end;}
	public byte[] Val_bry() {return val_bry;}
	public int Eql_pos() {return eql_pos;}
	public int Qte_tid() {return qte_tid;}

	// fluent setters
	public Mwh_atr_itm Key_tid_(byte v) {
		key_tid = v;
		return this;
	}
	public Mwh_atr_itm Atr_rng(int bgn, int end) {
		this.atr_bgn = bgn;
		this.atr_end = end;
		return this;
	}

	// value conversions
	public String Val_as_str() {
		return String_.new_u8(Val_as_bry());
	}
	/** Returns val_bry, building it from src[val_bgn..val_end) on first use and caching the result. */
	public byte[] Val_as_bry() {
		if (val_bry != null) return val_bry;
		val_bry = Bry_.Mid(src, val_bgn, val_end);
		return val_bry;
	}
	public byte[] Val_as_bry__blank_to_null() {
		byte[] rv = Val_as_bry();
		if (Bry_.Len_eq_0(rv)) return null;
		return rv;
	}
	/** Parses the value as an int; reads val_bry when it is already built, else parses directly from the src range. */
	public int Val_as_int_or(int or) {
		if (val_bry != null) return Bry_.To_int_or(val_bry, or);
		return Bry_.To_int_or__lax(src, val_bgn, val_end, or);
	}
	public boolean Val_as_bool_by_int() {
		return Val_as_int_or(0) == 1;
	}
	public boolean Val_as_bool() {
		byte[] lower = Bry_.Lcase__all(Val_as_bry());
		return Bry_.Eq(lower, Bool_.True_bry);
	}

	public static final Mwh_atr_itm[] Ary_empty = new Mwh_atr_itm[0];
	// attribute-type ids; each is a distinct power of two
	public static final int
	  Atr_tid__invalid = 1
	, Atr_tid__repeat = 2
	, Atr_tid__pair = 4
	, Atr_tid__name = 8
	;
	public static final int
	  Qte_tid__none = 0
	, Qte_tid__apos = 1
	, Qte_tid__qute = 2
	;
	public static final int
	  Mask__qte__none = 0
	, Mask__qte__apos = 1
	, Mask__qte_qute = 2
	;
	// flag bits of a packed atr_utl int; they sit above the 3 quote-type bits
	public static final int
	  Mask__valid = 8
	, Mask__repeated = 16
	, Mask__key_exists = 32
	, Mask__val_made = 64
	;
	// readable names for the boolean args of Calc_atr_utl
	public static final boolean Mask__valid__n = false, Mask__valid__y = true;
	public static final boolean Mask__key_exists__n = false, Mask__key_exists__y = true;
	public static final boolean Mask__repeated__n = false, Mask__repeated__y = true;
	public static final boolean Mask__val_made__n = false, Mask__val_made__y = true;

	/** Packs the quote type and the four flags into one int. */
	public static int Calc_atr_utl(int qte_tid, boolean valid, boolean repeated, boolean key_exists, boolean val_made) {
		int flags = 0;
		flags |= valid ? Mwh_atr_itm.Mask__valid : 0;
		flags |= repeated ? Mwh_atr_itm.Mask__repeated : 0;
		flags |= key_exists ? Mwh_atr_itm.Mask__key_exists : 0;
		flags |= val_made ? Mwh_atr_itm.Mask__val_made : 0;
		return qte_tid | flags;
	}
	/** Extracts the quote type (low 3 bits) from a packed atr_utl int. */
	public static int Calc_qte_tid(int val) {
		return val & Qte_tid_mask;
	}
	/** Returns the apos byte when the item at idx was apos-quoted; otherwise the double-quote byte. */
	public static byte Calc_qte_byte(int[] data_ary, int idx) {
		int atr_utl = data_ary[idx + Mwh_atr_mgr.Idx_atr_utl];
		if ((atr_utl & Qte_tid_mask) == Qte_tid__apos)
			return Byte_ascii.Apos;
		return Byte_ascii.Quote;
	}
	// public static final byte Key_tid_generic = 0, Key_tid_id = 1, Key_tid_style = 2, Key_tid_role = 3;
}

View File

@@ -0,0 +1,98 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
import gplx.core.brys.*;
/**
 * Flat storage for parsed attributes.
 * Each attribute occupies Idx__mult consecutive ints in Data_ary (see the Idx_* offsets)
 * and Text__mult consecutive slots in Text_ary (key_bry, then optional val_bry).
 */
public class Mwh_atr_mgr {
	private final int data_max_orig;	// int capacity requested at construction; Clear shrinks back to it
	private int itm_len;
	private int[] data_ary;
	private int data_max;				// current int capacity of data_ary
	private byte[][] text_ary;
	public Mwh_atr_mgr(int max) {
		this.data_max_orig = max * Idx__mult;
		this.Max_(max);
	}
	public int Len() {return itm_len;}
	public int[] Data_ary() {return data_ary;}
	public byte[][] Text_ary() {return text_ary;}
	// allocates fresh arrays sized for len attributes and resets the item count
	private void Max_(int len) {
		this.data_max = len * Idx__mult;
		this.data_ary = new int[data_max];
		this.text_ary = new byte[len * Text__mult][];
		this.itm_len = 0;
	}
	/** Removes all items; arrays are reused when still at the original capacity, else reallocated at the original capacity. */
	public void Clear() {
		if (data_max == data_max_orig)
			itm_len = 0;
		else
			Max_(data_max_orig / Idx__mult);
	}
	/**
	 * Appends one attribute and returns its index (usable as atr_uid for Set_repeated).
	 * val_bry may be null; it is only stored (and the val_made flag only set) when non-null.
	 */
	public int Add(int nde_uid, int nde_tid, boolean valid, boolean repeated, boolean key_exists, int atr_bgn, int atr_end, int key_bgn, int key_end, byte[] key_bry, int eql_pos, int qte_tid, int val_bgn, int val_end, byte[] val_bry) {
		int data_idx = itm_len * Idx__mult;
		if (data_idx == data_max)
			Grow();
		boolean val_made = false;
		int text_idx = itm_len * Text__mult;
		text_ary[text_idx] = key_bry;
		if (val_bry != null) {
			text_ary[text_idx + 1] = val_bry;
			val_made = true;
		}
		data_ary[data_idx + Idx_nde_uid] = nde_uid;
		data_ary[data_idx + Idx_nde_tid] = nde_tid;
		data_ary[data_idx + Idx_atr_utl] = Mwh_atr_itm.Calc_atr_utl(qte_tid, valid, repeated, key_exists, val_made);
		data_ary[data_idx + Idx_atr_bgn] = atr_bgn;
		data_ary[data_idx + Idx_atr_end] = atr_end;
		data_ary[data_idx + Idx_key_bgn] = key_bgn;
		data_ary[data_idx + Idx_key_end] = key_end;
		data_ary[data_idx + Idx_val_bgn] = val_bgn;
		data_ary[data_idx + Idx_val_end] = val_end;
		data_ary[data_idx + Idx_eql_pos] = eql_pos;
		return itm_len++;
	}
	// doubles both arrays (or allocates room for one item when empty), preserving existing items
	private void Grow() {
		int new_data_max = data_max == 0 ? Idx__mult : data_max * 2;
		int[] new_data_ary = new int[new_data_max];
		// FIX: copy into the NEW array; the previous code copied data_ary onto itself, so all items were lost on growth
		System.arraycopy(data_ary, 0, new_data_ary, 0, data_max);
		int text_max = text_ary.length;
		int new_text_max = data_max == 0 ? Text__mult : text_max * 2;
		byte[][] new_text_ary = new byte[new_text_max][];
		System.arraycopy(text_ary, 0, new_text_ary, 0, text_max);
		this.data_ary = new_data_ary;
		this.text_ary = new_text_ary;
		this.data_max = new_data_max;
	}
	/** Overwrites the atr_utl int of the item at atr_uid to mark it as repeated. */
	public void Set_repeated(int atr_uid) {
		int atr_utl_idx = (atr_uid * Idx__mult) + Idx_atr_utl;
		int atr_utl = data_ary[atr_utl_idx];
		// NOTE(review): Add stores atr_utl in the Mwh_atr_itm.Mask__* layout (repeated = 16, val_made = 64, quote type in low bits),
		// but this writes Atr_tid__repeat (2) and keeps only bit 16; result clears Mask__valid and does not set Mask__repeated
		// unless it was already set. Behavior kept as-is; confirm the intended layout against callers.
		int val_bry_exists = atr_utl & Atr_utl__val_bry_exists;
		data_ary[atr_utl_idx] = Mwh_atr_itm.Atr_tid__repeat | val_bry_exists;
	}
	// offsets of each field within one item's slice of Data_ary; Idx__mult is the slice width
	public static final int
	  Idx_nde_uid = 0
	, Idx_nde_tid = 1
	, Idx_atr_utl = 2
	, Idx_atr_bgn = 3
	, Idx_atr_end = 4
	, Idx_key_bgn = 5
	, Idx_key_end = 6
	, Idx_val_bgn = 7
	, Idx_val_end = 8
	, Idx_eql_pos = 9
	, Idx__mult = 10
	;
	public static final int Text__mult = 2;	// slots per item in Text_ary: key, val
	public static final int Atr_utl__val_bry_exists = 16;
}

View File

@@ -0,0 +1,39 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
import org.junit.*;
/** Checks that flags packed by Mwh_atr_itm.Calc_atr_utl produce the expected int and can be read back. */
public class Mwh_atr_mgr_tst {
	private final Mwh_atr_mgr_fxt fxt = new Mwh_atr_mgr_fxt();
	@Test public void Atr_utl_make() {
		// key="val" -> double-quoted, valid, key exists; 2 + 8 + 32
		fxt.Test_atr_utl_make
		( Mwh_atr_itm.Qte_tid__qute
		, Mwh_atr_itm.Mask__valid__y
		, Mwh_atr_itm.Mask__repeated__n
		, Mwh_atr_itm.Mask__key_exists__y
		, Mwh_atr_itm.Mask__val_made__n
		, 42
		);
		// key=val key=v<nowiki/>al -> unquoted with every flag set; 0 + 8 + 16 + 32 + 64
		fxt.Test_atr_utl_make
		( Mwh_atr_itm.Qte_tid__none
		, Mwh_atr_itm.Mask__valid__y
		, Mwh_atr_itm.Mask__repeated__y
		, Mwh_atr_itm.Mask__key_exists__y
		, Mwh_atr_itm.Mask__val_made__y
		, 120
		);
	}
}
/** Fixture: packs the given flags, then asserts the packed int and each flag read back from it. */
class Mwh_atr_mgr_fxt {
	public void Test_atr_utl_make(int qte_tid, boolean valid, boolean repeated, boolean key_exists, boolean val_made, int expd) {
		int packed = Mwh_atr_itm.Calc_atr_utl(qte_tid, valid, repeated, key_exists, val_made);
		// packed value itself
		Tfds.Eq_int(expd, packed);
		// round-trip of the quote type
		Tfds.Eq_int(qte_tid, Mwh_atr_itm.Calc_qte_tid(packed));
		// round-trip of each flag bit
		Tfds.Eq_bool(valid     , Has_mask(packed, Mwh_atr_itm.Mask__valid));
		Tfds.Eq_bool(repeated  , Has_mask(packed, Mwh_atr_itm.Mask__repeated));
		Tfds.Eq_bool(key_exists, Has_mask(packed, Mwh_atr_itm.Mask__key_exists));
		Tfds.Eq_bool(val_made  , Has_mask(packed, Mwh_atr_itm.Mask__val_made));
	}
	// true when every bit of mask is set in val
	private static boolean Has_mask(int val, int mask) {
		return (val & mask) == mask;
	}
}

View File

@@ -0,0 +1,457 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
import gplx.core.primitives.*;
import gplx.xowa.parsers.xndes.*; // for brys: <nowiki>, <noinclude>, <includeonly>, <onlyinclude>
public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATTRIBS_REGEX
// states of the Parse state machine; "area" holds the current one
private static final byte Area__invalid = 0, Area__atr_limbo = 1, Area__key = 2, Area__eql_limbo = 3, Area__val_limbo = 4, Area__val_quote = 5, Area__val_naked = 6;
// maps key_bry -> index of the most recent valid atr with that key (see Make); cleared at the end of Parse
private final Hash_adp_bry repeated_atrs_hash = Hash_adp_bry.ci_a7(); // ASCII:xnde_atrs
// accumulates atrs during Parse; iterated and cleared at the end of Parse
private final Mwh_atr_mgr atr_mgr = new Mwh_atr_mgr(16);
// buffers used only when key / val cannot be taken as one slice of src (key_bfr_on / val_bfr_on)
private final Bry_bfr key_bfr = Bry_bfr.new_(), val_bfr = Bry_bfr.new_();
private byte area = Area__atr_limbo;
// positions for the atr being scanned; -1 means "not set"; all reset to -1 by Make
private int atr_bgn = -1, key_bgn = -1, key_end = -1, eql_pos = -1, val_bgn = -1, val_end = -1;
// opening quote byte of the current val; Byte_ascii.Null when none seen
private byte qte_byte = Byte_ascii.Null;
private boolean key_bfr_on = false, val_bfr_on = false, ws_is_before_val = false;
// node ids passed to the current Parse call; forwarded to atr_mgr.Add
private int nde_uid, nde_tid;
// holds the tag name matched by the last Xnde_find_gt_find call, or null
public Bry_obj_ref Bry_obj() {return bry_ref;} private final Bry_obj_ref bry_ref = Bry_obj_ref.null_();
// how the node ended, as set by the last Parse call
public int Nde_end_tid() {return nde_end_tid;} private int nde_end_tid;
// Scans src from src_bgn up to src_end one byte at a time, switching on "area" (the current state).
// Each completed or invalidated atr is recorded through Make; after scanning, every recorded atr is
// reported to wkr.On_atr_each, then atr_mgr and repeated_atrs_hash are cleared.
// nde_end_tid starts as Nde_end_tid__invalid and is set to __gt on ">" or __inline on "/>" when seen in Area__atr_limbo.
// Returns the position at which scanning stopped (but see NOTE(review) in the slash case).
public int Parse(Mwh_doc_wkr wkr, int nde_uid, int nde_tid, byte[] src, int src_bgn, int src_end) {
this.nde_uid = nde_uid; this.nde_tid = nde_tid;
this.nde_end_tid = Mwh_doc_parser.Nde_end_tid__invalid;
area = Area__atr_limbo;
boolean prv_is_ws = false;
int pos = src_bgn;
boolean loop = true;
while (loop) {
// end of input: flush whatever is pending, then either stop or resume after an unclosed quote
if (pos == src_end) {
if (area == Area__val_quote) { // quote still open
int reset_pos = Bry_find_.Find_fwd(src, Byte_ascii.Space, val_bgn, src_end); // try to find 1st space within quote; EX:"a='b c=d" should try to reset at c=d
boolean reset_found = reset_pos != Bry_find_.Not_found;
area = Area__invalid; val_end = reset_found ? reset_pos : src_end;
Make(src, val_end); // create invalid atr
if (reset_found) { // space found; resume from text after space; EX: "a='b c=d"; PAGE:en.w:Aubervilliers DATE:2014-06-25
pos = Bry_find_.Find_fwd_while_not_ws(src, reset_pos, src_end); // skip ws
atr_bgn = -1;
area = Area__atr_limbo;
val_bfr.Clear();
val_bfr_on = false;
ws_is_before_val = false;
continue;
}
else
break;
}
else {
if (area == Area__val_limbo) // NOTE: handle dangling "k=" else will be "k"; EX: <a b=> x> <a b>; PAGE:en.s:Notes_by_the_Way/Chapter_2; DATE:2015-01-31
area = Area__invalid;
if (atr_bgn != -1) { // atr_bgn will be -1 if atrs ends on quoted (EX:"a='b'"); else, pending atr that needs to be processed; EX: "a=b" b wil be in bfr
val_end = src_end;
Make(src, src_end);
}
break;
}
}
// safety stop for the case where a branch below moved pos past src_end
else if (pos > src_end)
break;
byte b = src[pos];
switch (area) {
case Area__atr_limbo: // 1st area after node_name or attribute
switch (b) {
// gt -> stop iterating
case Byte_ascii.Gt:
nde_end_tid = Mwh_doc_parser.Nde_end_tid__gt;
loop = false;
break;
// slash -> check for "/>" or " / "
case Byte_ascii.Slash:
int nxt_pos = pos + 1;
if (nxt_pos == src_end) {
pos = nxt_pos;
// NOTE(review): every other exit returns pos, but this returns an Nde_end_tid constant;
// it also skips the On_atr_each loop and atr_mgr.Clear() below, so atrs already made stay in atr_mgr. Confirm intent.
return Mwh_doc_parser.Nde_end_tid__invalid;
}
else if (src[nxt_pos] == Byte_ascii.Gt) {
nde_end_tid = Mwh_doc_parser.Nde_end_tid__inline;
pos = nxt_pos;
loop = false;
}
else {
area = Area__invalid; atr_bgn = pos;
}
break;
// ws -> ignore; skip any ws in atr_limbo; note that once a non-ws char is encountered, it will immediately go into another area
case Byte_ascii.Space: case Byte_ascii.Nl: case Byte_ascii.Tab:
if (atr_bgn == -1) atr_bgn = pos;
break;
// alphanum -> enter Area__key
case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4:
case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9:
case Byte_ascii.Ltr_A: case Byte_ascii.Ltr_B: case Byte_ascii.Ltr_C: case Byte_ascii.Ltr_D: case Byte_ascii.Ltr_E:
case Byte_ascii.Ltr_F: case Byte_ascii.Ltr_G: case Byte_ascii.Ltr_H: case Byte_ascii.Ltr_I: case Byte_ascii.Ltr_J:
case Byte_ascii.Ltr_K: case Byte_ascii.Ltr_L: case Byte_ascii.Ltr_M: case Byte_ascii.Ltr_N: case Byte_ascii.Ltr_O:
case Byte_ascii.Ltr_P: case Byte_ascii.Ltr_Q: case Byte_ascii.Ltr_R: case Byte_ascii.Ltr_S: case Byte_ascii.Ltr_T:
case Byte_ascii.Ltr_U: case Byte_ascii.Ltr_V: case Byte_ascii.Ltr_W: case Byte_ascii.Ltr_X: case Byte_ascii.Ltr_Y: case Byte_ascii.Ltr_Z:
case Byte_ascii.Ltr_a: case Byte_ascii.Ltr_b: case Byte_ascii.Ltr_c: case Byte_ascii.Ltr_d: case Byte_ascii.Ltr_e:
case Byte_ascii.Ltr_f: case Byte_ascii.Ltr_g: case Byte_ascii.Ltr_h: case Byte_ascii.Ltr_i: case Byte_ascii.Ltr_j:
case Byte_ascii.Ltr_k: case Byte_ascii.Ltr_l: case Byte_ascii.Ltr_m: case Byte_ascii.Ltr_n: case Byte_ascii.Ltr_o:
case Byte_ascii.Ltr_p: case Byte_ascii.Ltr_q: case Byte_ascii.Ltr_r: case Byte_ascii.Ltr_s: case Byte_ascii.Ltr_t:
case Byte_ascii.Ltr_u: case Byte_ascii.Ltr_v: case Byte_ascii.Ltr_w: case Byte_ascii.Ltr_x: case Byte_ascii.Ltr_y: case Byte_ascii.Ltr_z:
case Byte_ascii.Colon:
area = Area__key;
if (atr_bgn == -1) atr_bgn = pos;
key_bgn = pos;
break;
// lt -> check for <nowiki>
case Byte_ascii.Lt: // handle "<nowiki>"
int gt_pos = Xnde_find_gt(src, pos, src_end);
if (gt_pos == Bry_find_.Not_found) {
area = Area__invalid;
atr_bgn = pos;
}
else
pos = gt_pos; // position after ">"; note that there is ++pos below and loop will continue at gt_pos + 1 (next character after)
break;
// rest -> invalid
default: // quote and other non-valid key characters are invalid until next space; EX: "<span 'key_cannot_be_quoted' id='123'"
area = Area__invalid; atr_bgn = pos;
break;
}
break;
case Area__invalid:
switch (b) {
// ws -> src_end invalid area
case Byte_ascii.Space: case Byte_ascii.Nl: case Byte_ascii.Tab:
Make(src, pos);
area = Area__atr_limbo;
break;
// rest -> continue eating up invalid chars
default:
break;
}
break;
case Area__key:
switch (b) {
// alphanum -> valid key chars
case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4:
case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9:
case Byte_ascii.Ltr_A: case Byte_ascii.Ltr_B: case Byte_ascii.Ltr_C: case Byte_ascii.Ltr_D: case Byte_ascii.Ltr_E:
case Byte_ascii.Ltr_F: case Byte_ascii.Ltr_G: case Byte_ascii.Ltr_H: case Byte_ascii.Ltr_I: case Byte_ascii.Ltr_J:
case Byte_ascii.Ltr_K: case Byte_ascii.Ltr_L: case Byte_ascii.Ltr_M: case Byte_ascii.Ltr_N: case Byte_ascii.Ltr_O:
case Byte_ascii.Ltr_P: case Byte_ascii.Ltr_Q: case Byte_ascii.Ltr_R: case Byte_ascii.Ltr_S: case Byte_ascii.Ltr_T:
case Byte_ascii.Ltr_U: case Byte_ascii.Ltr_V: case Byte_ascii.Ltr_W: case Byte_ascii.Ltr_X: case Byte_ascii.Ltr_Y: case Byte_ascii.Ltr_Z:
case Byte_ascii.Ltr_a: case Byte_ascii.Ltr_b: case Byte_ascii.Ltr_c: case Byte_ascii.Ltr_d: case Byte_ascii.Ltr_e:
case Byte_ascii.Ltr_f: case Byte_ascii.Ltr_g: case Byte_ascii.Ltr_h: case Byte_ascii.Ltr_i: case Byte_ascii.Ltr_j:
case Byte_ascii.Ltr_k: case Byte_ascii.Ltr_l: case Byte_ascii.Ltr_m: case Byte_ascii.Ltr_n: case Byte_ascii.Ltr_o:
case Byte_ascii.Ltr_p: case Byte_ascii.Ltr_q: case Byte_ascii.Ltr_r: case Byte_ascii.Ltr_s: case Byte_ascii.Ltr_t:
case Byte_ascii.Ltr_u: case Byte_ascii.Ltr_v: case Byte_ascii.Ltr_w: case Byte_ascii.Ltr_x: case Byte_ascii.Ltr_y: case Byte_ascii.Ltr_z:
case Byte_ascii.Colon: case Byte_ascii.Dash: case Byte_ascii.Dot: case Byte_ascii.Underline:
if (key_bfr_on) key_bfr.Add_byte(b);
break;
// ws -> src_end key
case Byte_ascii.Space: case Byte_ascii.Nl: case Byte_ascii.Tab:
area = Area__eql_limbo;
key_end = pos;
break;
// eq -> src_end key; skip Area_eq and go to Area_val_bgn
case Byte_ascii.Eq:
area = Area__val_limbo;
key_end = eql_pos = pos;
break;
// lt -> check for <nowiki>
case Byte_ascii.Lt:
int gt_pos = Xnde_find_gt(src, pos, src_end);
if (gt_pos == Bry_find_.Not_found) // "<" should not be in key; EX: "ke<y"
area = Area__invalid;
else {
if (!key_bfr_on) {key_bfr.Add_mid(src, key_bgn, pos); key_bfr_on = true;}
pos = gt_pos; // note that there is ++pos below and loop will continue at gt_pos + 1 (next character after)
}
break;
// rest -> enter invalid
default:
area = Area__invalid;
break;
}
break;
case Area__eql_limbo:
switch (b) {
// ws -> skip
case Byte_ascii.Space: case Byte_ascii.Nl: case Byte_ascii.Tab: // skip ws
if (key_end == -1) { // EX: "a = b"; key_end != -1 b/c 1st \s sets key_end; EX: "a b = c"; key_end
val_end = pos - 1;
Make(src, pos);
area = Area__atr_limbo;
continue;
}
break;
// eq -> enter Area__eq
case Byte_ascii.Eq:
eql_pos = pos;
area = Area__val_limbo;
break;
// rest -> make atr and enter limbo
case Byte_ascii.Quote: case Byte_ascii.Apos: // FUTURE: previous word was key
default: // NOTE: added this late; xml_parser was not handling "line start=3" DATE:2013-07-03
val_end = pos - 1;
Make(src, pos);
area = Area__atr_limbo;
// "continue" skips the ++pos below, so the current byte is re-read in Area__atr_limbo
continue;
}
break;
case Area__val_limbo:
switch (b) {
// ws -> skip
case Byte_ascii.Space: case Byte_ascii.Nl: case Byte_ascii.Tab:
ws_is_before_val = true;
break;
// quote -> enter Area_val_quote
case Byte_ascii.Quote: case Byte_ascii.Apos:
area = Area__val_quote; qte_byte = b; prv_is_ws = false;
val_bgn = pos + 1;
break;
// alphanum -> enter Area_val_raw
case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4:
case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9:
case Byte_ascii.Ltr_A: case Byte_ascii.Ltr_B: case Byte_ascii.Ltr_C: case Byte_ascii.Ltr_D: case Byte_ascii.Ltr_E:
case Byte_ascii.Ltr_F: case Byte_ascii.Ltr_G: case Byte_ascii.Ltr_H: case Byte_ascii.Ltr_I: case Byte_ascii.Ltr_J:
case Byte_ascii.Ltr_K: case Byte_ascii.Ltr_L: case Byte_ascii.Ltr_M: case Byte_ascii.Ltr_N: case Byte_ascii.Ltr_O:
case Byte_ascii.Ltr_P: case Byte_ascii.Ltr_Q: case Byte_ascii.Ltr_R: case Byte_ascii.Ltr_S: case Byte_ascii.Ltr_T:
case Byte_ascii.Ltr_U: case Byte_ascii.Ltr_V: case Byte_ascii.Ltr_W: case Byte_ascii.Ltr_X: case Byte_ascii.Ltr_Y: case Byte_ascii.Ltr_Z:
case Byte_ascii.Ltr_a: case Byte_ascii.Ltr_b: case Byte_ascii.Ltr_c: case Byte_ascii.Ltr_d: case Byte_ascii.Ltr_e:
case Byte_ascii.Ltr_f: case Byte_ascii.Ltr_g: case Byte_ascii.Ltr_h: case Byte_ascii.Ltr_i: case Byte_ascii.Ltr_j:
case Byte_ascii.Ltr_k: case Byte_ascii.Ltr_l: case Byte_ascii.Ltr_m: case Byte_ascii.Ltr_n: case Byte_ascii.Ltr_o:
case Byte_ascii.Ltr_p: case Byte_ascii.Ltr_q: case Byte_ascii.Ltr_r: case Byte_ascii.Ltr_s: case Byte_ascii.Ltr_t:
case Byte_ascii.Ltr_u: case Byte_ascii.Ltr_v: case Byte_ascii.Ltr_w: case Byte_ascii.Ltr_x: case Byte_ascii.Ltr_y: case Byte_ascii.Ltr_z:
case Byte_ascii.Colon:
case Byte_ascii.Hash:
area = Area__val_naked;
val_bgn = pos;
break;
// lt -> check for <nowiki>
case Byte_ascii.Lt:
int gt_pos = Xnde_find_gt(src, pos, src_end);
if (gt_pos == Bry_find_.Not_found)
area = Area__invalid;
else
pos = gt_pos; // note that there is ++pos below and loop will continue at gt_pos + 1 (next character after)
break;
// rest -> ignore (?)
default:
break;
}
break;
case Area__val_quote: { // EX: "'val' " in "key = 'val'"
switch (b) {
// quote: check if same as opening quote
case Byte_ascii.Quote: case Byte_ascii.Apos:
if (qte_byte == b) { // quote closes val
val_end = pos;
Make(src, pos + 1); // NOTE: set atr_end *after* quote
}
else { // quote is just char; EX: title="1 o'clock" or title='The "C" way'
prv_is_ws = false; if (val_bfr_on) val_bfr.Add_byte(b); // INLINE: add char
}
break;
// lt -> check for <nowiki>; EX: <span title='ab<nowiki>c</nowiki>de'>
case Byte_ascii.Lt:
if (!val_bfr_on) {val_bfr.Add_mid(src, val_bgn, pos); val_bfr_on = true;} // INLINE: val_bfr.init
int gt_pos = Xnde_find_gt(src, pos, src_end);
if (gt_pos == Bry_find_.Not_found)
// area = Area__invalid; // DELETE: 2012-11-13; unpaired < should not mark atr invalid; EX: style='margin:1em<f'
val_bfr.Add_byte(Byte_ascii.Lt);
else
pos = gt_pos; // note that there is ++pos below and loop will continue at gt_pos + 1 (next character after)
prv_is_ws = false;
break;
// ws -> convert all ws to \s; only allow 1 ws at any point in time
case Byte_ascii.Nl: case Byte_ascii.Tab: case Byte_ascii.Cr: // REF.MW:Sanitizer.php|decodeTagAttributes $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
case Byte_ascii.Space:
if (!val_bfr_on) {val_bfr.Add_mid(src, val_bgn, pos); val_bfr_on = true;} // INLINE: val_bfr.init
if (prv_is_ws) {} // noop; only allow one ws at a time; EX: "a b" -> "a b"; "a\n\nb" -> "a b"
else {
prv_is_ws = true; val_bfr.Add_byte(Byte_ascii.Space);
}
break;
// rest -> add to val
default:
prv_is_ws = false; if (val_bfr_on) val_bfr.Add_byte(b); // INLINE: add char
break;
}
break;
}
case Area__val_naked: // no quotes; EX:a=bcd
switch (b) {
// alphanum -> continue reading
case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4:
case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9:
case Byte_ascii.Ltr_A: case Byte_ascii.Ltr_B: case Byte_ascii.Ltr_C: case Byte_ascii.Ltr_D: case Byte_ascii.Ltr_E:
case Byte_ascii.Ltr_F: case Byte_ascii.Ltr_G: case Byte_ascii.Ltr_H: case Byte_ascii.Ltr_I: case Byte_ascii.Ltr_J:
case Byte_ascii.Ltr_K: case Byte_ascii.Ltr_L: case Byte_ascii.Ltr_M: case Byte_ascii.Ltr_N: case Byte_ascii.Ltr_O:
case Byte_ascii.Ltr_P: case Byte_ascii.Ltr_Q: case Byte_ascii.Ltr_R: case Byte_ascii.Ltr_S: case Byte_ascii.Ltr_T:
case Byte_ascii.Ltr_U: case Byte_ascii.Ltr_V: case Byte_ascii.Ltr_W: case Byte_ascii.Ltr_X: case Byte_ascii.Ltr_Y: case Byte_ascii.Ltr_Z:
case Byte_ascii.Ltr_a: case Byte_ascii.Ltr_b: case Byte_ascii.Ltr_c: case Byte_ascii.Ltr_d: case Byte_ascii.Ltr_e:
case Byte_ascii.Ltr_f: case Byte_ascii.Ltr_g: case Byte_ascii.Ltr_h: case Byte_ascii.Ltr_i: case Byte_ascii.Ltr_j:
case Byte_ascii.Ltr_k: case Byte_ascii.Ltr_l: case Byte_ascii.Ltr_m: case Byte_ascii.Ltr_n: case Byte_ascii.Ltr_o:
case Byte_ascii.Ltr_p: case Byte_ascii.Ltr_q: case Byte_ascii.Ltr_r: case Byte_ascii.Ltr_s: case Byte_ascii.Ltr_t:
case Byte_ascii.Ltr_u: case Byte_ascii.Ltr_v: case Byte_ascii.Ltr_w: case Byte_ascii.Ltr_x: case Byte_ascii.Ltr_y: case Byte_ascii.Ltr_z:
case Byte_ascii.Bang: case Byte_ascii.Hash: case Byte_ascii.Dollar: case Byte_ascii.Percent:
case Byte_ascii.Amp: case Byte_ascii.Paren_bgn: case Byte_ascii.Paren_end: case Byte_ascii.Star:
case Byte_ascii.Comma: case Byte_ascii.Dash: case Byte_ascii.Dot: case Byte_ascii.Slash:
case Byte_ascii.Colon: case Byte_ascii.Semic: case Byte_ascii.Gt:
case Byte_ascii.Question: case Byte_ascii.At: case Byte_ascii.Brack_bgn: case Byte_ascii.Brack_end:
case Byte_ascii.Pow: case Byte_ascii.Underline: case Byte_ascii.Tick:
case Byte_ascii.Curly_bgn: case Byte_ascii.Pipe: case Byte_ascii.Curly_end: case Byte_ascii.Tilde:
break;
// ws -> src_end atr
case Byte_ascii.Space: case Byte_ascii.Tab: case Byte_ascii.Nl:
val_end = pos;
Make(src, pos);
break;
case Byte_ascii.Eq: // EX:"a= b=c" or "a=b=c"; PAGE:en.w:2013_in_American_television
if (ws_is_before_val) { // "a= b=c"; discard 1st and resume at 2nd
int old_val_bgn = val_bgn;
area = Area__invalid; Make(src, val_bgn); // invalidate cur atr; EX:"a="
atr_bgn = key_bgn = old_val_bgn; // reset atr / key to new atr; EX: "b"
key_end = pos;
area = Area__val_limbo; // set area to val_bgn (basically, put after =)
}
else // "a=b=c"; discard all
area = Area__invalid;
break;
case Byte_ascii.Lt:
val_end = pos;
Make(src, pos);
--pos; // NOTE: --pos to include "<" as part of next atr; above ws excludes from next atr
break;
default:
area = Area__invalid;
break;
}
break;
}
++pos;
}
// iterate atrs and notify
int len = atr_mgr.Len();
int[] data_ary = atr_mgr.Data_ary();
byte[][] text_ary = atr_mgr.Text_ary();
for (int j = 0; j < len; ++j) {
int itm_idx = j * Mwh_atr_mgr.Idx__mult;
byte[] key_bry = text_ary[j * Mwh_atr_mgr.Text__mult];
byte[] val_bry_manual = null;
// unpack the flags stored by atr_mgr.Add
int atr_utl = data_ary[itm_idx + Mwh_atr_mgr.Idx_atr_utl];
boolean atr_valid = (atr_utl & Mwh_atr_itm.Mask__valid) == Mwh_atr_itm.Mask__valid;
boolean repeated = (atr_utl & Mwh_atr_itm.Mask__repeated) == Mwh_atr_itm.Mask__repeated;
boolean key_exists = (atr_utl & Mwh_atr_itm.Mask__key_exists) == Mwh_atr_itm.Mask__key_exists;
boolean val_made = (atr_utl & Mwh_atr_itm.Mask__val_made) == Mwh_atr_itm.Mask__val_made;
// the val slot in text_ary is only read when the val_made flag says it was written
if (val_made)
val_bry_manual = text_ary[(j * Mwh_atr_mgr.Text__mult) + 1];
wkr.On_atr_each(this, src, nde_tid, atr_valid, repeated, key_exists, key_bry, val_bry_manual, data_ary, itm_idx);
}
atr_mgr.Clear();
repeated_atrs_hash.Clear();
return pos;
}
/**
 * Checks whether the text at pos (optionally preceded by "/") up to the next ">" is one of the names in xnde_hash.
 * Stores the matched name in bry_ref (null when nothing matched).
 * @param src bytes being scanned
 * @param pos position of the first byte after "<"
 * @param end exclusive end of the scan range
 * @return position just after the matched name, or Bry_find_.Not_found
 */
public int Xnde_find_gt_find(byte[] src, int pos, int end) {
	bry_ref.Val_(null);
	if (pos >= end) return Bry_find_.Not_found;		// nothing left to read; also avoids reading src[pos] past the range
	if (src[pos] == Byte_ascii.Slash && pos + 1 < end)	// if </ move pos to after /
		++pos;
	int gt_pos = Bry_find_.Find_fwd(src, Byte_ascii.Gt, pos, end);
	if (gt_pos == Bry_find_.Not_found) return Bry_find_.Not_found;	// compare against the same constant family that Find_fwd belongs to
	byte[] bry = (byte[])xnde_hash.Get_by_mid(src, pos, gt_pos);
	bry_ref.Val_(bry);
	return bry == null ? Bry_find_.Not_found : bry.length + pos;
}
/**
 * Given the position of "<", tries to match a tag from xnde_hash and then its closing ">".
 * After the tag name only whitespace and at most one "/" may appear before ">".
 * @return position of the closing ">", or Bry_find_.Not_found
 */
private int Xnde_find_gt(byte[] src, int lt_pos, int end) {
	int pos = lt_pos + 1;
	if (pos >= end) return Bry_find_.Not_found;		// "<" is last byte in range; previously src[pos] could throw ArrayIndexOutOfBounds
	if (src[pos] == Byte_ascii.Slash && pos + 1 < end)	// skip "/" of a closing tag
		++pos;
	int match_pos = Xnde_find_gt_find(src, pos, end);
	if (match_pos == Bry_find_.Not_found) return Bry_find_.Not_found;
	boolean slash_found = false;
	for (int i = match_pos; i < end; i++) {
		switch (src[i]) {
			case Byte_ascii.Gt:
				return i;
			case Byte_ascii.Space: case Byte_ascii.Nl: case Byte_ascii.Tab:	// skip any ws
				break;
			case Byte_ascii.Slash:
				if (slash_found) return Bry_find_.Not_found;	// only allow one slash
				slash_found = true;
				break;
			default:
				return Bry_find_.Not_found;
		}
	}
	return Bry_find_.Not_found;
}
/**
 * Records the attribute currently being scanned in atr_mgr, updates repeated-key tracking, and resets all per-attribute state.
 * The attribute is recorded as invalid when area == Area__invalid at the time of the call.
 * @param src     bytes being parsed
 * @param atr_end end position to store for the attribute
 */
private void Make(byte[] src, int atr_end) {
	// calc final values for atr
	boolean key_exists = false;
	byte[] key_bry = null, val_bry = null;
	boolean atr_valid = true;
	if (area != Area__invalid) {
		if (key_bgn != -1 && val_bgn != -1)	// key && val exists; EX: "<input id='123'>"
			key_exists = true;
		else {								// not a pair; EX: "<input checked>"
			if (key_end == -1) key_end = val_end;	// NOTE: key_end == -1 when eos; EX: "a" would have key_bgn = 0; key_end = -1; val_end = 1 DATE:2014-07-03
			val_bgn = val_end = -1;
		}
		key_bry = key_bfr_on ? key_bfr.Xto_bry_and_clear() : Bry_.Mid(src, key_bgn, key_end);	// always make key_bry; needed for repeated_atrs as well as key_tid
		if (val_bfr_on) val_bry = val_bfr.Xto_bry_and_clear();
	}
	else {
		atr_valid = false;
		key_bry = Bry_.Empty;
		key_bfr.Clear();
		// FIX: also drop buffered val text; otherwise an invalid atr that had val_bfr_on (EX: unclosed "a='b<nowiki/>c")
		// leaves its bytes in val_bfr and they get prepended to the next buffered val
		val_bfr.Clear();
		if (val_bgn == -1) val_bgn = atr_bgn;
	}
	int qte_tid = Mwh_atr_itm.Mask__qte__none;
	if (qte_byte != Byte_ascii.Null)
		qte_tid = qte_byte == Byte_ascii.Quote ? Mwh_atr_itm.Mask__qte_qute : Mwh_atr_itm.Mask__qte__apos;
	int atr_uid = atr_mgr.Add(nde_uid, nde_tid, atr_valid, false, key_exists, atr_bgn, atr_end, key_bgn, key_end, key_bry, eql_pos, qte_tid, val_bgn, val_end, val_bry);
	// handle repeated atrs: an earlier atr with the same key is flagged, and the hash is repointed to the newest one
	if (atr_valid) {
		int repeated_uid = repeated_atrs_hash.Get_as_int_or(key_bry, -1);
		if (repeated_uid != -1) {
			repeated_atrs_hash.Del(key_bry);
			atr_mgr.Set_repeated(repeated_uid);
		}
		repeated_atrs_hash.Add_bry_int(key_bry, atr_uid);
	}
	// reset temp variables
	area = Area__atr_limbo; qte_byte = Byte_ascii.Null;
	atr_bgn = key_bgn = val_bgn = key_end = val_end = eql_pos = -1;
	key_bfr_on = val_bfr_on = ws_is_before_val = false;
}
// tag names that Xnde_find_gt_find recognizes inside attribute text: nowiki, noinclude, includeonly, onlyinclude
private static final Hash_adp_bry xnde_hash = Hash_adp_bry.ci_a7()
.Add_bry_bry(Xop_xnde_tag_.Tag_nowiki.Name_bry())
.Add_bry_bry(Xop_xnde_tag_.Tag_noinclude.Name_bry())
.Add_bry_bry(Xop_xnde_tag_.Tag_includeonly.Name_bry())
.Add_bry_bry(Xop_xnde_tag_.Tag_onlyinclude.Name_bry())
;
// NOTE(review): not referenced within this class; presumably used by callers — confirm
public static final int Key_tid__unknown = -1;
}

View File

@@ -0,0 +1,99 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
class Mwh_atr_parser_fxt {
private final Bry_bfr expd_bfr = Bry_bfr.new_(), actl_bfr = Bry_bfr.new_();
private final Mwh_atr_parser parser = new Mwh_atr_parser();
private final Mwh_doc_wkr__atr_bldr wkr = new Mwh_doc_wkr__atr_bldr();
// builds an expected key/val atr: valid, not repeated, key exists; every position is -1
public Mwh_atr_itm Make_pair(String key, String val) {
	byte[] key_bry = Bry_.new_u8(key);
	byte[] val_bry = Bry_.new_u8(val);
	return new Mwh_atr_itm
	( Bry_.Empty, Bool_.Y, Bool_.N, Bool_.Y, -1, -1
	, -1, -1, key_bry
	, -1, -1, val_bry
	, -1, -1
	);
}
public Mwh_atr_itm Make_name(String key) {return new Mwh_atr_itm(Bry_.Empty, Bool_.Y, Bool_.N, Bool_.N, -1, -1, -1, -1, Bry_.new_u8(key) , -1, -1, null , -1, -1);}
public Mwh_atr_itm Make_fail(int bgn, int end) {return new Mwh_atr_itm(Bry_.Empty, Bool_.N, Bool_.N, Bool_.N, bgn, end, -1, -1, null , -1, -1, null , -1, -1);}
public void Test_val_as_int(String raw, int expd) {
byte[] src = Bry_.new_u8(raw);
Mwh_atr_itm itm = new Mwh_atr_itm(src, true, false, false, 0, src.length, -1, -1, null, 0, src.length, src, -1, -1);
Tfds.Eq_int(expd, itm.Val_as_int_or(-1));
}
public void Test_parse(String raw, Mwh_atr_itm... expd) {
Mwh_atr_itm[] actl = Exec_parse(raw);
Test_print(expd, actl);
}
private Mwh_atr_itm[] Exec_parse(String raw) {
byte[] bry = Bry_.new_u8(raw);
parser.Parse(wkr, -1, -1, bry, 0, bry.length);
return wkr.To_atr_ary();
}
public void Test_print(Mwh_atr_itm[] expd_ary, Mwh_atr_itm[] actl_ary) {
int expd_len = expd_ary.length;
int actl_len = actl_ary.length;
int len = expd_len > actl_len ? expd_len : actl_len;
for (int i = 0; i < len; ++i) {
To_bfr(expd_bfr, i < expd_len ? expd_ary[i] : null, actl_bfr, i < actl_len ? actl_ary[i] : null);
}
Tfds.Eq_str_lines(expd_bfr.Xto_str_and_clear(), actl_bfr.Xto_str_and_clear());
}
private void To_bfr(Bry_bfr expd_bfr, Mwh_atr_itm expd_itm, Bry_bfr actl_bfr, Mwh_atr_itm actl_itm) {
To_bfr__main(expd_bfr, expd_itm);
To_bfr__main(actl_bfr, actl_itm);
To_bfr__head(expd_bfr, expd_itm);
To_bfr__head(actl_bfr, actl_itm);
if (expd_itm.Atr_bgn() != -1) {
To_bfr__atr_rng(expd_bfr, expd_itm);
To_bfr__atr_rng(actl_bfr, actl_itm);
}
}
private void To_bfr__head(Bry_bfr bfr, Mwh_atr_itm itm) {
if (itm == null) return;
bfr.Add_str_a7("head:").Add_yn(itm.Valid()).Add_byte_semic().Add_yn(itm.Repeated()).Add_byte_semic().Add_yn(itm.Key_exists()).Add_byte_nl();
}
private void To_bfr__main(Bry_bfr bfr, Mwh_atr_itm itm) {
if (itm == null) return;
if (itm.Valid()) {
bfr.Add_str_a7("key:").Add(itm.Key_bry()).Add_byte_nl();
bfr.Add_str_a7("val:").Add(itm.Val_as_bry()).Add_byte_nl();
}
// else
// To_bfr__atr_rng(bfr, itm);
}
private void To_bfr__atr_rng(Bry_bfr bfr, Mwh_atr_itm itm) {
if (itm == null) return;
bfr.Add_str_a7("rng:").Add_int_variable(itm.Atr_bgn()).Add_byte_semic().Add_int_variable(itm.Atr_end()).Add_byte_nl();
}
}
/**
 * Mwh_doc_wkr used by the attribute-parser tests: every attribute callback is turned into a
 * Mwh_atr_itm and collected; text / node / comment callbacks are ignored.
 */
class Mwh_doc_wkr__atr_bldr implements Mwh_doc_wkr {
	private final List_adp list = List_adp_.new_();

	/** No node registry is supplied by this worker. */
	public Hash_adp_bry Nde_regy() {
		return null;
	}

	/** Reads the attribute's positions out of data_ary (relative to itm_idx) and stores a new Mwh_atr_itm. */
	public void On_atr_each(Mwh_atr_parser mgr, byte[] src, int nde_tid, boolean valid, boolean repeated, boolean key_exists, byte[] key_bry, byte[] val_bry_manual, int[] data_ary, int itm_idx) {
		list.Add(new Mwh_atr_itm
		( src, valid, repeated, key_exists
		, data_ary[itm_idx + Mwh_atr_mgr.Idx_atr_bgn]
		, data_ary[itm_idx + Mwh_atr_mgr.Idx_atr_end]
		, data_ary[itm_idx + Mwh_atr_mgr.Idx_key_bgn]
		, data_ary[itm_idx + Mwh_atr_mgr.Idx_key_end]
		, key_bry
		, data_ary[itm_idx + Mwh_atr_mgr.Idx_val_bgn]
		, data_ary[itm_idx + Mwh_atr_mgr.Idx_val_end]
		, val_bry_manual
		, data_ary[itm_idx + Mwh_atr_mgr.Idx_eql_pos]
		, Mwh_atr_itm.Calc_qte_tid(data_ary[itm_idx + Mwh_atr_mgr.Idx_atr_utl])	// raw utl slot is converted to a quote tid
		));
	}

	public void On_txt_end(Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {
	}

	public void On_nde_head_bgn(Mwh_doc_parser mgr, byte[] src, int nde_tid, int key_bgn, int key_end) {
	}

	public void On_nde_head_end(Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end, boolean inline) {
	}

	public void On_nde_tail_end(Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {
	}

	public void On_comment_end(Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {
	}

	/** Returns the collected attributes and empties the internal list. */
	public Mwh_atr_itm[] To_atr_ary() {
		Object collected = list.To_ary_and_clear(Mwh_atr_itm.class);
		return (Mwh_atr_itm[])collected;
	}
}

View File

@@ -0,0 +1,63 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
import org.junit.*;
// JUnit tests for attribute parsing. Each test hands a raw attribute string to Mwh_atr_parser_fxt.Test_parse
// together with the expected items: Make_pair (key + val), Make_name (key only) or Make_fail (invalid range).
// Appending .Atr_rng(bgn, end) to an expected item makes the fixture also compare the attribute's range.
// NOTE(review): the whitespace-focused cases (Ws, Val__ws__mult, Val__ws__mult_mult) mention multiple spaces in
// their PURPOSE comments but show single spaces here; the literals may have been whitespace-collapsed -- confirm against the repository.
public class Mwh_atr_parser_tst {
private final Mwh_atr_parser_fxt fxt = new Mwh_atr_parser_fxt();
@Test public void Pair__quote__double() {fxt.Test_parse("a=\"b\"" , fxt.Make_pair("a" , "b"));}
@Test public void Pair__quote__single() {fxt.Test_parse("a='b'" , fxt.Make_pair("a" , "b"));}
@Test public void Pair__quote__none() {fxt.Test_parse("a=b" , fxt.Make_pair("a" , "b"));}
@Test public void Pair__empty() {fxt.Test_parse("a=''" , fxt.Make_pair("a" , ""));}
@Test public void Pair__key_w_underline() {fxt.Test_parse("a_b=c" , fxt.Make_pair("a_b" , "c"));}
@Test public void Name__quote__none() {fxt.Test_parse("b" , fxt.Make_name("b"));}
@Test public void Name__ws() {fxt.Test_parse(" b " , fxt.Make_name("b"));} // PURPOSE:discovered while writing test for ref's "lower-alpha" DATE:2014-07-03
@Test public void Name__mult() {fxt.Test_parse("a b1 c" , fxt.Make_name("a"), fxt.Make_name("b1"), fxt.Make_name("c"));}
@Test public void Fail__key_w_plus() {fxt.Test_parse("a+b" , fxt.Make_fail(0, 3));}
@Test public void Fail__key_w_plus__many() {fxt.Test_parse("a+b c=d" , fxt.Make_fail(0, 3) , fxt.Make_pair("c", "d"));}
@Test public void Fail__val_w_plus() {fxt.Test_parse("a=b+c" , fxt.Make_fail(0, 5));}
@Test public void Fail__recover() {fxt.Test_parse("* a=b" , fxt.Make_fail(0, 1) , fxt.Make_pair("a", "b"));} // PURPOSE: * is invalid, but should not stop parsing of a=b
@Test public void Fail__incomplete() {fxt.Test_parse("a= c=d" , fxt.Make_fail(0, 3) , fxt.Make_pair("c", "d"));} // PURPOSE: discard xatr if incomplete and followed by valid atr; PAGE:en.w:2013_in_American_television DATE:2014-09-25
@Test public void Fail__incomplete_2() {fxt.Test_parse("a=c=d" , fxt.Make_fail(0, 5));} // PURPOSE: variation of above; per MW regex, missing space invalidates entire attribute; DATE:2014-09-25
@Test public void Fail__incomplete_pair() {fxt.Test_parse("a= b=" , fxt.Make_fail(0, 3) , fxt.Make_fail(3, 5));} // PURPOSE: "b=" should be invalid not a kv of "b" = "b"; PAGE:en.s:Notes_by_the_Way/Chapter_2; DATE:2015-01-31
@Test public void Dangling_eos() {fxt.Test_parse("a='b' c='d" , fxt.Make_pair("a", "b") , fxt.Make_fail(5, 10));} // PURPOSE: handle dangling quote at eos; PAGE:en.w:Aubervilliers DATE:2014-06-25
@Test public void Dangling_bos() {fxt.Test_parse("a='b c=d" , fxt.Make_fail(0, 4) , fxt.Make_pair("c", "d"));}// PURPOSE: handle dangling quote at bos; resume at next valid atr; PAGE:en.w:Aubervilliers DATE:2014-06-25
@Test public void Ws__ini() {fxt.Test_parse(" a='b'" , fxt.Make_pair("a", "b").Atr_rng(0, 6));}
@Test public void Ws__end() {fxt.Test_parse(" a='b' c='d'" , fxt.Make_pair("a", "b").Atr_rng(0, 6), fxt.Make_pair("c", "d").Atr_rng(6, 12));}
@Test public void Ws() {fxt.Test_parse("a = 'b'" , fxt.Make_pair("a", "b"));} // PURPOSE: fix wherein multiple space was causing "a=a"; PAGE:fr.s:La_Sculpture_dans_les_cimetières_de_Paris/Père-Lachaise; DATE:2014-01-18
@Test public void Many__quote__apos() {fxt.Test_parse("a='b' c='d' e='f'" , fxt.Make_pair("a", "b"), fxt.Make_pair("c", "d"), fxt.Make_pair("e", "f"));}
@Test public void Many__naked() {fxt.Test_parse("a=b c=d e=f" , fxt.Make_pair("a", "b"), fxt.Make_pair("c", "d"), fxt.Make_pair("e", "f"));}
@Test public void Val__ws__nl() {fxt.Test_parse("a='b\nc'" , fxt.Make_pair("a", "b c"));}
@Test public void Val__ws__mult() {fxt.Test_parse("a='b c'" , fxt.Make_pair("a", "b c"));}
@Test public void Val__ws__mult_mult() {fxt.Test_parse("a='b c d'" , fxt.Make_pair("a", "b c d"));} // PURPOSE: fix wherein 1st-gobble gobbled rest of spaces (was b cd)
@Test public void Val__apos() {fxt.Test_parse("a=\"b c'd\"" , fxt.Make_pair("a", "b c'd"));} // PURPOSE: fix wherein apos was gobbled up; PAGE:en.s:Alice's_Adventures_in_Wonderland; DATE:2013-11-22
@Test public void Val__apos_2() {fxt.Test_parse("a=\"b'c d\"" , fxt.Make_pair("a", "b'c d"));} // PURPOSE: fix wherein apos was causing "'b'c d"; PAGE:en.s:Grimm's_Household_Tales,_Volume_1; DATE:2013-12-22
@Test public void Nowiki__val() {fxt.Test_parse("a=<nowiki>'b'</nowiki>" , fxt.Make_pair("a", "b").Atr_rng(0, 13));}
@Test public void Nowiki__key() {fxt.Test_parse("<nowiki>a=b</nowiki>" , fxt.Make_pair("a", "b").Atr_rng(8, 11));}
@Test public void Nowiki__key_2() {fxt.Test_parse("a<nowiki>b</nowiki>c=d" , fxt.Make_pair("abc", "d").Atr_rng(0, 22));}
@Test public void Nowiki__key_3() {fxt.Test_parse("a<nowiki>=</nowiki>\"b\"" , fxt.Make_pair("a", "b").Atr_rng(0, 22));} // EX:fr.w:{{Portail|Transpédia|Californie}}
@Test public void Nowiki__quote() {fxt.Test_parse("a=\"b<nowiki>c</nowiki>d<nowiki>e</nowiki>f\"", fxt.Make_pair("a", "bcdef"));}
@Test public void Val__as_int() {fxt.Test_val_as_int("-123" , -123);}
}

View File

@@ -0,0 +1,25 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
/** Immutable record of one parsed document item: its kind (itm_tid), the node id it was reported under (nde_tid), and its bytes. */
class Mwh_doc_itm {
	/** Item kinds accepted for itm_tid. */
	public static final int Itm_tid__txt = 0;
	public static final int Itm_tid__nde_head = 1;
	public static final int Itm_tid__nde_tail = 2;
	public static final int Itm_tid__comment = 3;

	private final int itm_tid;
	private final int nde_tid;
	private final byte[] itm_bry;

	/** Stores the arguments as-is; itm_bry is kept by reference, not copied. */
	public Mwh_doc_itm(int itm_tid, int nde_tid, byte[] itm_bry) {
		this.itm_tid = itm_tid;
		this.nde_tid = nde_tid;
		this.itm_bry = itm_bry;
	}

	/** Kind of item; one of the Itm_tid__* constants. */
	public int Itm_tid() {
		return itm_tid;
	}

	/** Bytes of the item, exactly as passed to the constructor. */
	public byte[] Itm_bry() {
		return itm_bry;
	}

	/** Node id supplied at construction. */
	public int Nde_tid() {
		return nde_tid;
	}
}

View File

@@ -0,0 +1,62 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
/**
 * Growable store of document items packed into a flat int[]: each item occupies Idx__mult
 * consecutive slots (uid, tid, src_bgn, src_end), addressed with the Idx_* offsets.
 */
class Mwh_doc_mgr {
	private final int data_max_orig;	// initial capacity in slots; Clear() shrinks back to this
	/** @param max initial capacity, counted in items (not slots) */
	public Mwh_doc_mgr(int max) {
		this.data_max_orig = max * Idx__mult;
		this.Max_(max);
	}
	/** Number of items added since construction or the last Clear(). */
	public int Len() {return itm_len;} private int itm_len;
	/** Backing array; only the first Len() * Idx__mult slots hold items. The reference changes whenever the store grows or is reallocated by Clear(). */
	public int[] Data_ary() {return data_ary;} private int[] data_ary; private int data_max;
	/** Allocates a fresh backing array for len items and resets the item count. */
	private void Max_(int len) {
		this.data_max = len * Idx__mult;
		this.data_ary = new int[data_max];
		this.itm_len = 0;
	}
	/** Removes all items; the backing array is reused if it never grew, else reallocated at the original size. */
	public void Clear() {
		if (data_max == data_max_orig)
			itm_len = 0;
		else
			Max_(data_max_orig / Idx__mult);
	}
	/**
	 * Appends one item and returns its uid (the zero-based index of the item).
	 * Doubles the backing array when it is full, preserving all existing items.
	 */
	public int Add(int dom_tid, int src_bgn, int src_end) {
		int data_idx = itm_len * Idx__mult;
		if (data_idx == data_max) {	// full: grow
			int new_data_max = data_max == 0 ? Idx__mult : data_max * 2;	// data_max is 0 when constructed with max = 0
			int[] new_data_ary = new int[new_data_max];
			System.arraycopy(data_ary, 0, new_data_ary, 0, data_max);	// copy into the NEW array; copying data_ary onto itself lost every earlier item
			this.data_ary = new_data_ary;
			this.data_max = new_data_max;
		}
		int dom_uid = itm_len;
		data_ary[data_idx + Idx_dom_uid] = dom_uid;
		data_ary[data_idx + Idx_dom_tid] = dom_tid;
		data_ary[data_idx + Idx_src_bgn] = src_bgn;
		data_ary[data_idx + Idx_src_end] = src_end;
		++itm_len;
		return dom_uid;
	}
	/** Slot offsets within one item, and the number of slots per item (Idx__mult). */
	public static final int
	  Idx_dom_uid = 0
	, Idx_dom_tid = 1
	, Idx_src_bgn = 2
	, Idx_src_end = 3
	, Idx__mult = 4
	;
}

View File

@@ -0,0 +1,191 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
import gplx.core.primitives.*;
import gplx.xowa.parsers.xndes.*;
/**
 * Scans a byte[] for "<"-introduced nodes and comments and reports text runs, node heads, node tails
 * and comments to a Mwh_doc_wkr. Attributes of a node head are delegated to Mwh_atr_parser.
 * Only names found in the worker's node registry are treated as nodes; everything else stays text.
 * Instances hold per-parse state in fields, so one instance handles one Parse call at a time.
 */
public class Mwh_doc_parser {
	// NOTE(review): dom_mgr is appended to for every node head / comment but is never cleared in Parse -- confirm whether it should be reset per parse
	private final Mwh_doc_mgr dom_mgr = new Mwh_doc_mgr(16);
	private final Mwh_atr_parser atr_parser = new Mwh_atr_parser();
	private final List_adp nde_stack = List_adp_.new_();	// enclosing nodes of cur_nde
	private byte[] src; private int src_end;
	private Mwh_doc_wkr wkr;
	private Hash_adp_bry nde_regy;
	private int txt_bgn, nde_uid;	// txt_bgn: start of the text run not yet reported to wkr
	private Xop_xnde_tag cur_nde; private int cur_nde_tid;	// current open node; tid is -1 when there is none
	/**
	 * Parses src from src_bgn up to (not including) src_end, invoking wkr callbacks in document order.
	 * Any text remaining after the last node is reported through On_txt_end.
	 */
	public void Parse(Mwh_doc_wkr wkr, byte[] src, int src_bgn, int src_end) {
		this.wkr = wkr; this.src = src; this.src_end = src_end;
		this.nde_regy = wkr.Nde_regy();
		nde_stack.Clear();
		int pos = txt_bgn = src_bgn;
		nde_uid = cur_nde_tid = -1;
		cur_nde = null;
		while (pos < src_end) {
			if (src[pos] == Byte_ascii.Angle_bgn)	// "<": possible nde start
				pos = Parse_nde(pos);
			else									// else, just increment
				++pos;
		}
		if (src_end != txt_bgn) wkr.On_txt_end(this, src, cur_nde_tid, txt_bgn, pos);	// flush trailing text
	}
	/**
	 * Tries to parse a node or comment starting at the "<" at pos.
	 * Returns the position at which scanning should resume. If the bytes do not form a known node or a
	 * complete comment, nothing is reported and txt_bgn is left alone, so the bytes remain part of the text run.
	 */
	private int Parse_nde(int pos) {
		int nde_end_tid = Nde_end_tid__invalid;
		boolean nde_is_head = true;
		int nde_bgn = pos;
		++pos;
		int name_bgn = pos;
		int name_end = pos;
		while (pos < src_end) {
			byte b = src[pos];
			switch (b) {
				// valid chars for name
				case Byte_ascii.Ltr_A: case Byte_ascii.Ltr_B: case Byte_ascii.Ltr_C: case Byte_ascii.Ltr_D: case Byte_ascii.Ltr_E:
				case Byte_ascii.Ltr_F: case Byte_ascii.Ltr_G: case Byte_ascii.Ltr_H: case Byte_ascii.Ltr_I: case Byte_ascii.Ltr_J:
				case Byte_ascii.Ltr_K: case Byte_ascii.Ltr_L: case Byte_ascii.Ltr_M: case Byte_ascii.Ltr_N: case Byte_ascii.Ltr_O:
				case Byte_ascii.Ltr_P: case Byte_ascii.Ltr_Q: case Byte_ascii.Ltr_R: case Byte_ascii.Ltr_S: case Byte_ascii.Ltr_T:
				case Byte_ascii.Ltr_U: case Byte_ascii.Ltr_V: case Byte_ascii.Ltr_W: case Byte_ascii.Ltr_X: case Byte_ascii.Ltr_Y: case Byte_ascii.Ltr_Z:
				case Byte_ascii.Ltr_a: case Byte_ascii.Ltr_b: case Byte_ascii.Ltr_c: case Byte_ascii.Ltr_d: case Byte_ascii.Ltr_e:
				case Byte_ascii.Ltr_f: case Byte_ascii.Ltr_g: case Byte_ascii.Ltr_h: case Byte_ascii.Ltr_i: case Byte_ascii.Ltr_j:
				case Byte_ascii.Ltr_k: case Byte_ascii.Ltr_l: case Byte_ascii.Ltr_m: case Byte_ascii.Ltr_n: case Byte_ascii.Ltr_o:
				case Byte_ascii.Ltr_p: case Byte_ascii.Ltr_q: case Byte_ascii.Ltr_r: case Byte_ascii.Ltr_s: case Byte_ascii.Ltr_t:
				case Byte_ascii.Ltr_u: case Byte_ascii.Ltr_v: case Byte_ascii.Ltr_w: case Byte_ascii.Ltr_x: case Byte_ascii.Ltr_y: case Byte_ascii.Ltr_z:
				case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4:
				case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9:
				case Byte_ascii.Dot: case Byte_ascii.Dash: case Byte_ascii.Underline: case Byte_ascii.Colon:	// XML allowed punctuation
				case Byte_ascii.Dollar:	// MW: handles <br$2>;
					++pos;
					break;
				// comment check
				case Byte_ascii.Bang:
					boolean comment_found = false;
					if (name_bgn == pos && Bry_.Eq(src, pos + 1, pos + 3, Comment_bgn)) {	// "!" directly after "<" and followed by "--"
						int comment_end_pos = Bry_find_.Find_fwd(src, Comment_end, pos + 3);
						if (comment_end_pos != Bry_find_.Not_found) {
							nde_end_tid = Nde_end_tid__comment;
							pos = comment_end_pos + 3;	// position after "-->"
							comment_found = true;
						}
					}
					if (!comment_found)
						return pos;	// "<!" without a complete comment; treat as text
					else
						break;
				// invalid char; not a node; treat as text; EX: "<!@#", "< /b>"
				default:
					return pos;
				// slash -> either "</b>" or "<b/>"
				case Byte_ascii.Slash:
					if (name_bgn == pos) {	// "</"; EX: "</b>"
						nde_is_head = false;
						++name_bgn;
						++pos;
						continue;
					}
					else {	// check for "/>"; NOTE: <pre/a>, <pre//> are allowed
						name_end = pos;
						++pos;
						if (pos == src_end) return pos;	// end of doc; treat as text; EX: "<b/EOS"
						if (src[pos] == Byte_ascii.Gt) {
							nde_end_tid = Nde_end_tid__inline;
							++pos;
						}
						else
							nde_end_tid = Nde_end_tid__slash;
					}
					break;
				// stops "name"
				case Byte_ascii.Gt:
					nde_end_tid = Nde_end_tid__gt;
					name_end = pos;
					++pos;
					break;
				case Byte_ascii.Tab: case Byte_ascii.Nl: case Byte_ascii.Cr: case Byte_ascii.Space:
					nde_end_tid = Nde_end_tid__ws;
					name_end = pos;
					break;
				case Byte_ascii.Backslash:	// MW: allows "<br\>" -> "<br/>"
					nde_end_tid = Nde_end_tid__backslash;
					name_end = pos;
					break;
			}
			if (nde_end_tid != Nde_end_tid__invalid) break;	// name terminator (or comment) found
		}
		// get name
		Xop_xnde_tag nde_itm = null;
		if (nde_end_tid != Nde_end_tid__comment) {
			nde_itm = (Xop_xnde_tag)nde_regy.Get_by_mid(src, name_bgn, name_end);
			if (nde_itm == null) return pos;	// not a known nde; exit
		}
		if (txt_bgn != nde_bgn) {	// notify txt
			wkr.On_txt_end(this, src, cur_nde_tid, txt_bgn, nde_bgn);
			txt_bgn = pos;
		}
		if (nde_is_head) {
			wkr.On_nde_head_bgn(this, src, cur_nde_tid, name_bgn, name_end);	// NOTE(review): also fires for comments (with an empty name range) -- confirm intended
			switch (nde_end_tid) {
				case Nde_end_tid__comment:
					wkr.On_comment_end(this, src, cur_nde_tid, nde_bgn, pos);
					// always move the text start past the comment; otherwise a comment that begins exactly at
					// txt_bgn (start of input, or directly after another node) is emitted again as text
					txt_bgn = pos;
					break;
				case Nde_end_tid__ws:
				case Nde_end_tid__slash:
				case Nde_end_tid__backslash:
					pos = atr_parser.Parse(wkr, nde_uid, cur_nde_tid, src, pos, src_end);
					nde_end_tid = atr_parser.Nde_end_tid();	// atr parser reports how the head actually ended
					txt_bgn = pos;
					break;
			}
			switch (nde_end_tid) {
				case Nde_end_tid__inline:
					wkr.On_nde_head_end(this, src, cur_nde_tid, nde_bgn, pos, Bool_.Y);
					txt_bgn = pos;
					break;
				case Nde_end_tid__gt:
					wkr.On_nde_head_end(this, src, cur_nde_tid, nde_bgn, pos, Bool_.N);
					txt_bgn = pos;
					if (	nde_itm != null
						&&	!nde_itm.Single_only_html()					// ignore <b>
						&&	(cur_nde == null || !cur_nde.Xtn())			// <pre> ignores inner
						) {
						if (cur_nde != null)
							nde_stack.Add(cur_nde);
						this.cur_nde = nde_itm;
						this.cur_nde_tid = nde_itm.Id();
					}
					break;
				case Nde_end_tid__ws:
				case Nde_end_tid__slash:
				case Nde_end_tid__backslash: break;	// handled above
			}
			nde_uid = dom_mgr.Add(Mwh_doc_itm.Itm_tid__nde_head, nde_bgn, pos);
		}
		else {
			switch (nde_end_tid) {	// only tails terminated directly by ">" are reported
				case Nde_end_tid__gt:
					wkr.On_nde_tail_end(this, src, cur_nde_tid, nde_bgn, pos);
					txt_bgn = pos;
					if (nde_itm.Id() == cur_nde_tid) {	// tail closes current node: restore the enclosing one
						cur_nde = (Xop_xnde_tag)List_adp_.Pop_or(nde_stack, null);
						cur_nde_tid = cur_nde == null ? -1 : cur_nde.Id();
					}
					break;
			}
		}
		return pos;
	}
	/** How a node's name was terminated: ">" (gt), whitespace (ws), "/>" (inline), "/" not followed by ">" (slash), "\" (backslash), or a complete comment. */
	public static final int Nde_end_tid__invalid = 0, Nde_end_tid__gt = 1, Nde_end_tid__ws = 2, Nde_end_tid__inline = 3, Nde_end_tid__slash = 4, Nde_end_tid__backslash = 5, Nde_end_tid__comment = 6;
	private static final byte[] Comment_bgn = Bry_.new_a7("--"), Comment_end = Bry_.new_a7("-->");
}

View File

@@ -0,0 +1,73 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
/**
 * Test fixture for Mwh_doc_parser: creates expected Mwh_doc_itm values, runs the parser with an
 * item-collecting worker, and compares the printed expected / actual items.
 */
class Mwh_doc_parser_fxt {
	private final Bry_bfr expd_bfr = Bry_bfr.new_(), actl_bfr = Bry_bfr.new_();
	private final Mwh_doc_parser parser = new Mwh_doc_parser();
	private final Mwh_doc_wkr__itm_bldr wkr = new Mwh_doc_wkr__itm_bldr();

	/** Expected text item with no node id (-1), so the node id is not compared. */
	public Mwh_doc_itm Make_txt(String raw) {
		return Make_txt(raw, -1);
	}

	/** Expected text item that must be reported under nde_tid. */
	public Mwh_doc_itm Make_txt(String raw, int nde_tid) {
		return new Mwh_doc_itm(Mwh_doc_itm.Itm_tid__txt, nde_tid, Bry_.new_u8(raw));
	}

	public Mwh_doc_itm Make_comment(String raw) {
		return new Mwh_doc_itm(Mwh_doc_itm.Itm_tid__comment, -1, Bry_.new_u8(raw));
	}

	public Mwh_doc_itm Make_nde_head(String raw) {
		return new Mwh_doc_itm(Mwh_doc_itm.Itm_tid__nde_head, -1, Bry_.new_u8(raw));
	}

	public Mwh_doc_itm Make_nde_tail(String raw) {
		return new Mwh_doc_itm(Mwh_doc_itm.Itm_tid__nde_tail, -1, Bry_.new_u8(raw));
	}

	/** Parses raw and checks the produced items against expd. */
	public void Test_parse(String raw, Mwh_doc_itm... expd) {
		Test_print(expd, Exec_parse(raw));
	}

	/** Runs the parser over all of raw and returns the items the worker collected. */
	public Mwh_doc_itm[] Exec_parse(String raw) {
		byte[] raw_bry = Bry_.new_u8(raw);
		parser.Parse(wkr, raw_bry, 0, raw_bry.length);
		return wkr.To_atr_ary();
	}

	/** Prints both arrays pairwise (padding the shorter with null) and compares the output line-by-line. */
	public void Test_print(Mwh_doc_itm[] expd_ary, Mwh_doc_itm[] actl_ary) {
		int max_len = Math.max(expd_ary.length, actl_ary.length);
		for (int idx = 0; idx < max_len; ++idx) {
			Mwh_doc_itm expd_itm = idx < expd_ary.length ? expd_ary[idx] : null;
			Mwh_doc_itm actl_itm = idx < actl_ary.length ? actl_ary[idx] : null;
			To_bfr(expd_bfr, expd_itm, actl_bfr, actl_itm);
		}
		String expd_str = expd_bfr.Xto_str_and_clear();
		String actl_str = actl_bfr.Xto_str_and_clear();
		Tfds.Eq_str_lines(expd_str, actl_str);
	}

	private void To_bfr(Bry_bfr expd_bfr, Mwh_doc_itm expd_itm, Bry_bfr actl_bfr, Mwh_doc_itm actl_itm) {
		To_bfr__main(expd_bfr, expd_itm);
		To_bfr__main(actl_bfr, actl_itm);
		// node id is only compared when the expected item names one
		if (expd_itm == null || expd_itm.Nde_tid() == -1) return;
		To_bfr__nde_tid(expd_bfr, expd_itm);
		To_bfr__nde_tid(actl_bfr, actl_itm);
	}

	private void To_bfr__main(Bry_bfr bfr, Mwh_doc_itm itm) {
		if (itm != null) {
			bfr.Add_str_a7("itm_tid:")
				.Add_int_variable(itm.Itm_tid())
				.Add_byte_nl();
			bfr.Add_str_a7("txt:")
				.Add(itm.Itm_bry())
				.Add_byte_nl();
		}
	}

	private void To_bfr__nde_tid(Bry_bfr bfr, Mwh_doc_itm itm) {
		if (itm != null) {
			bfr.Add_str_a7("nde_tid:")
				.Add_int_variable(itm.Nde_tid())
				.Add_byte_nl();
		}
	}
}
/**
 * Mwh_doc_wkr used by the document-parser tests: text, node-head, node-tail and comment callbacks
 * are each recorded as a Mwh_doc_itm holding a copy of the reported byte range.
 */
class Mwh_doc_wkr__itm_bldr implements Mwh_doc_wkr {
	private final List_adp list = List_adp_.new_();
	private final Hash_adp_bry nde_regy = Mwh_doc_wkr_.Nde_regy__mw();

	public Hash_adp_bry Nde_regy() {
		return nde_regy;
	}

	/** Attributes are not recorded by this worker. */
	public void On_atr_each(Mwh_atr_parser mgr, byte[] src, int nde_tid, boolean valid, boolean repeated, boolean key_exists, byte[] key_bry, byte[] val_bry_manual, int[] itm_ary, int itm_idx) {
	}

	public void On_txt_end(Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {
		Add_itm(Mwh_doc_itm.Itm_tid__txt, nde_tid, src, itm_bgn, itm_end);
	}

	/** Head start is ignored; the head is recorded when it ends. */
	public void On_nde_head_bgn(Mwh_doc_parser mgr, byte[] src, int nde_tid, int key_bgn, int key_end) {
	}

	public void On_nde_head_end(Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end, boolean inline) {
		Add_itm(Mwh_doc_itm.Itm_tid__nde_head, nde_tid, src, itm_bgn, itm_end);
	}

	public void On_nde_tail_end(Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {
		Add_itm(Mwh_doc_itm.Itm_tid__nde_tail, nde_tid, src, itm_bgn, itm_end);
	}

	public void On_comment_end(Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {
		Add_itm(Mwh_doc_itm.Itm_tid__comment, nde_tid, src, itm_bgn, itm_end);
	}

	/** Returns the recorded items and empties the internal list. */
	public Mwh_doc_itm[] To_atr_ary() {
		Object collected = list.To_ary_and_clear(Mwh_doc_itm.class);
		return (Mwh_doc_itm[])collected;
	}

	// records one item of the given kind from src[itm_bgn..itm_end)
	private void Add_itm(int itm_tid, int nde_tid, byte[] src, int itm_bgn, int itm_end) {
		byte[] itm_bry = Bry_.Mid(src, itm_bgn, itm_end);
		list.Add(new Mwh_doc_itm(itm_tid, nde_tid, itm_bry));
	}
}

View File

@@ -0,0 +1,60 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
import org.junit.*; import gplx.xowa.parsers.xndes.*;
// JUnit tests for Mwh_doc_parser. Each test parses a raw String through Mwh_doc_parser_fxt and lists the
// expected items in document order: Make_txt (text), Make_nde_head / Make_nde_tail (node tags), Make_comment.
// Make_txt with a second argument additionally checks the node id the text was reported under.
public class Mwh_doc_parser_tst {
private final Mwh_doc_parser_fxt fxt = new Mwh_doc_parser_fxt();
@Test public void Text__basic() {fxt.Test_parse("abc" , fxt.Make_txt("abc"));}
@Test public void Comment() {fxt.Test_parse("a<!--b-->c" , fxt.Make_txt("a"), fxt.Make_comment("<!--b-->"), fxt.Make_txt("c"));}
// "<b/" at end of input never closes, so it stays text
@Test public void Fail__inline_eos() {fxt.Test_parse("a<b/" , fxt.Make_txt("a<b/"));}
// "bc" is expected not to be a registered node name, so the whole input stays text
@Test public void Fail__unknown() {fxt.Test_parse("a<bc/>d" , fxt.Make_txt("a<bc/>d"));}
@Test public void Node__inline() {fxt.Test_parse("a<b/>c" , fxt.Make_txt("a"), fxt.Make_nde_head("<b/>") , fxt.Make_txt("c"));}
@Test public void Node__pair() {fxt.Test_parse("a<b>c</b>d" , fxt.Make_txt("a"), fxt.Make_nde_head("<b>") , fxt.Make_txt("c"), fxt.Make_nde_tail("</b>"), fxt.Make_txt("d"));}
@Test public void Atrs__pair() {
fxt.Test_parse("<div id='1'>a</div>"
, fxt.Make_nde_head("<div id='1'>")
, fxt.Make_txt("a")
, fxt.Make_nde_tail("</div>"));
}
@Test public void Atrs__inline() {
fxt.Test_parse("a<div id='1'/>b"
, fxt.Make_txt("a")
, fxt.Make_nde_head("<div id='1'/>")
, fxt.Make_txt("b"));
}
// a single-only tag (<br>) must not become the current node
@Test public void Node__single_only() {
fxt.Test_parse("<b>a<br>b</b>c"
, fxt.Make_nde_head("<b>")
, fxt.Make_txt("a", Xop_xnde_tag_.Tid_b)
, fxt.Make_nde_head("<br>")
, fxt.Make_txt("b", Xop_xnde_tag_.Tid_b) // <b> not <br>
, fxt.Make_nde_tail("</b>")
, fxt.Make_txt("c", Xop_xnde_tag_.Tid__null)
);
}
// nodes opened inside <pre> must not replace <pre> as the current node
@Test public void Node__pre() {
fxt.Test_parse("<pre>a<div>b</pre>c"
, fxt.Make_nde_head("<pre>")
, fxt.Make_txt("a", Xop_xnde_tag_.Tid_pre)
, fxt.Make_nde_head("<div>")
, fxt.Make_txt("b", Xop_xnde_tag_.Tid_pre) // <pre> not <div>
, fxt.Make_nde_tail("</pre>")
, fxt.Make_txt("c", Xop_xnde_tag_.Tid__null)
);
}
}

View File

@@ -0,0 +1,27 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
// Callback interface through which Mwh_doc_parser (and Mwh_atr_parser) report what they find in src.
// In the Mwh_doc_parser callbacks, positions are offsets into src with an exclusive end, and nde_tid is
// the id of the node the parser currently considers open (-1 when none).
public interface Mwh_doc_wkr {
// Registry used by Mwh_doc_parser to look up node names; values are cast to Xop_xnde_tag, and names not found are treated as text.
Hash_adp_bry Nde_regy();
// Called once per attribute. itm_ary is a flat int array and itm_idx the base offset of this attribute's
// entries (read with the Mwh_atr_mgr.Idx_* offsets). NOTE(review): exact meaning of val_bry_manual is defined in Mwh_atr_parser -- confirm there.
void On_atr_each (Mwh_atr_parser mgr, byte[] src, int nde_tid, boolean valid, boolean repeated, boolean key_exists, byte[] key_bry, byte[] val_bry_manual, int[] itm_ary, int itm_idx);
// Called for each run of text between nodes / comments, and for trailing text at the end of a parse.
void On_txt_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end);
// Called when a node head has been recognized; key_bgn / key_end delimit the node name.
void On_nde_head_bgn(Mwh_doc_parser mgr, byte[] src, int nde_tid, int key_bgn, int key_end);
// Called when a node head is complete; the range starts at "<". inline is true when the head ended with "/>".
void On_nde_head_end(Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end, boolean inline);
// Called for a closing tag of a known node (EX: "</b>"); the range covers "</" through ">".
void On_nde_tail_end(Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end);
// Called for a complete comment; the range covers "<!--" through "-->".
void On_comment_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end);
}

View File

@@ -0,0 +1,31 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
import gplx.xowa.parsers.xndes.*;
/** Static helpers for Mwh_doc_wkr implementations. */
public class Mwh_doc_wkr_ {
	/** Builds a new registry that maps the name of every tag in Xop_xnde_tag_.Ary to its Xop_xnde_tag. */
	public static Hash_adp_bry Nde_regy__mw() {
		Xop_xnde_tag[] tags = Xop_xnde_tag_.Ary;
		Hash_adp_bry regy = Hash_adp_bry.ci_a7();
		for (Xop_xnde_tag tag : tags)
			regy.Add(tag.Name_bry(), tag);
		return regy;
	}
}