mirror of
https://github.com/gnosygnu/xowa.git
synced 2026-03-02 03:49:30 +00:00
Source: Restore broken commit
This commit is contained in:
66
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_atr_itm.java
Normal file
66
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_atr_itm.java
Normal file
@@ -0,0 +1,66 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
|
||||
public class Mwh_atr_itm {
|
||||
public Mwh_atr_itm
|
||||
( byte[] src, boolean valid, boolean repeated, boolean key_exists, int atr_bgn, int atr_end
|
||||
, int key_bgn, int key_end, byte[] key_bry
|
||||
, int val_bgn, int val_end, byte[] val_bry
|
||||
, int eql_pos, int qte_tid
|
||||
) {
|
||||
this.src = src;
|
||||
this.valid = valid; this.repeated = repeated; this.key_exists = key_exists;
|
||||
this.atr_bgn = atr_bgn; this.atr_end = atr_end;
|
||||
this.key_bgn = key_bgn; this.key_end = key_end; this.key_bry = key_bry;
|
||||
this.val_bgn = val_bgn; this.val_end = val_end; this.val_bry = val_bry;
|
||||
this.eql_pos = eql_pos; this.qte_tid = qte_tid;
|
||||
}
|
||||
public byte[] Src() {return src;} private final byte[] src;
|
||||
public boolean Valid() {return valid;} private final boolean valid;
|
||||
public boolean Key_exists() {return key_exists;} private final boolean key_exists;
|
||||
public boolean Repeated() {return repeated;} private final boolean repeated;
|
||||
public boolean Invalid() {return repeated || !valid;}
|
||||
public int Atr_bgn() {return atr_bgn;} private int atr_bgn;
|
||||
public int Atr_end() {return atr_end;} private int atr_end;
|
||||
public int Key_bgn() {return key_bgn;} private final int key_bgn;
|
||||
public int Key_end() {return key_end;} private final int key_end;
|
||||
public byte[] Key_bry() {return key_bry;} private byte[] key_bry;
|
||||
public byte Key_tid() {return key_tid;} public Mwh_atr_itm Key_tid_(byte v) {key_tid = v; return this;} private byte key_tid;
|
||||
public int Val_bgn() {return val_bgn;} private final int val_bgn;
|
||||
public int Val_end() {return val_end;} private final int val_end;
|
||||
public byte[] Val_bry() {return val_bry;} private byte[] val_bry;
|
||||
public int Eql_pos() {return eql_pos;} private final int eql_pos;
|
||||
public int Qte_tid() {return qte_tid;} private final int qte_tid;
|
||||
public byte Qte_byte() {
|
||||
switch (qte_tid) {
|
||||
case Mwh_atr_itm_.Qte_tid__none: return Byte_ascii.Null;
|
||||
case Mwh_atr_itm_.Qte_tid__apos: return Byte_ascii.Apos;
|
||||
case Mwh_atr_itm_.Qte_tid__qute: return Byte_ascii.Quote;
|
||||
default: throw Err_.new_unhandled(qte_tid);
|
||||
}
|
||||
}
|
||||
public Mwh_atr_itm Atr_rng(int bgn, int end) {this.atr_bgn = bgn; this.atr_end = end; return this;}
|
||||
public void Key_bry_(byte[] v) {this.key_bry = v;}
|
||||
public void Val_bry_(byte[] v) {this.val_bry = v;}
|
||||
public String Val_as_str() {return String_.new_u8(Val_as_bry());}
|
||||
public byte[] Val_as_bry() {if (val_bry == null) val_bry = Bry_.Mid(src, val_bgn, val_end); return val_bry;} // NOTE: val_bry is cached
|
||||
public byte[] Val_as_bry__blank_to_null() {byte[] rv = Val_as_bry(); return Bry_.Len_eq_0(rv) ? null : rv;}
|
||||
public int Val_as_int_or(int or) {return val_bry == null ? Bry_.To_int_or__lax(src, val_bgn, val_end, or) : Bry_.To_int_or(val_bry, or);}
|
||||
public boolean Val_as_bool_by_int() {return Val_as_int_or(0) == 1;}
|
||||
public boolean Val_as_bool() {return Bry_.Eq(Bry_.Lcase__all(Val_as_bry()), Bool_.True_bry);}
|
||||
}
|
||||
51
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_atr_itm_.java
Normal file
51
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_atr_itm_.java
Normal file
@@ -0,0 +1,51 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
|
||||
public class Mwh_atr_itm_ {
|
||||
public static final Mwh_atr_itm[] Ary_empty = new Mwh_atr_itm[0];
|
||||
public static final int Atr_tid__invalid = 1, Atr_tid__repeat = 2, Atr_tid__pair = 4, Atr_tid__name = 8; // NOTE: id order is important; see above;
|
||||
public static final int Qte_tid__none = 0, Qte_tid__apos = 1, Qte_tid__qute = 2;
|
||||
public static final int Mask__qte__none = 0, Mask__qte__apos = 1, Mask__qte_qute = 2;
|
||||
public static final int
|
||||
Mask__valid = 8
|
||||
, Mask__repeated = 16
|
||||
, Mask__key_exists = 32
|
||||
, Mask__val_made = 64
|
||||
;
|
||||
public static final boolean Mask__valid__n = false, Mask__valid__y = true;
|
||||
public static final boolean Mask__key_exists__n = false, Mask__key_exists__y = true;
|
||||
public static final boolean Mask__repeated__n = false, Mask__repeated__y = true;
|
||||
public static final boolean Mask__val_made__n = false, Mask__val_made__y = true;
|
||||
public static int Calc_atr_utl(int qte_tid, boolean valid, boolean repeated, boolean key_exists, boolean val_made) {
|
||||
int rv = qte_tid;
|
||||
if (valid) rv |= Mwh_atr_itm_.Mask__valid;
|
||||
if (repeated) rv |= Mwh_atr_itm_.Mask__repeated;
|
||||
if (key_exists) rv |= Mwh_atr_itm_.Mask__key_exists;
|
||||
if (val_made) rv |= Mwh_atr_itm_.Mask__val_made;
|
||||
return rv;
|
||||
}
|
||||
public static int Calc_qte_tid(int val) {
|
||||
return val & ((1 << 3) - 1);
|
||||
}
|
||||
public static byte Calc_qte_byte(int[] data_ary, int idx) {
|
||||
int val = data_ary[idx + Mwh_atr_mgr.Idx_atr_utl];
|
||||
int qte_tid = (val & ((1 << 3) - 1));
|
||||
return qte_tid == Qte_tid__apos ? Byte_ascii.Apos : Byte_ascii.Quote;
|
||||
}
|
||||
public static final byte Key_tid__generic = 0, Key_tid__id = 1, Key_tid__style = 2, Key_tid__role = 3;
|
||||
}
|
||||
21
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_atr_itm_owner1.java
Normal file
21
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_atr_itm_owner1.java
Normal file
@@ -0,0 +1,21 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
|
||||
public interface Mwh_atr_itm_owner1 {
|
||||
void Xatr__set(Xowe_wiki wiki, byte[] src, Mwh_atr_itm xatr, Object xatr_id_obj);
|
||||
}
|
||||
21
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_atr_itm_owner2.java
Normal file
21
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_atr_itm_owner2.java
Normal file
@@ -0,0 +1,21 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
|
||||
public interface Mwh_atr_itm_owner2 {
|
||||
void Xatr__set(Xowe_wiki wiki, byte[] src, Mwh_atr_itm xatr, byte xatr_id);
|
||||
}
|
||||
98
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_atr_mgr.java
Normal file
98
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_atr_mgr.java
Normal file
@@ -0,0 +1,98 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
|
||||
import gplx.core.brys.*;
|
||||
public class Mwh_atr_mgr {
|
||||
private final int data_max_orig;
|
||||
public Mwh_atr_mgr(int max) {
|
||||
this.data_max_orig = max * Idx__mult;
|
||||
this.Max_(max);
|
||||
}
|
||||
public int Len() {return itm_len;} private int itm_len;
|
||||
public int[] Data_ary() {return data_ary;} private int[] data_ary; private int data_max;
|
||||
public byte[][] Text_ary() {return text_ary;} private byte[][] text_ary;
|
||||
private void Max_(int len) {
|
||||
this.data_max = len * Idx__mult;
|
||||
this.data_ary = new int[data_max];
|
||||
this.text_ary = new byte[len * Text__mult][];
|
||||
this.itm_len = 0;
|
||||
}
|
||||
public void Clear() {
|
||||
if (data_max == data_max_orig)
|
||||
itm_len = 0;
|
||||
else
|
||||
Max_(data_max_orig / Idx__mult);
|
||||
}
|
||||
public int Add(int nde_uid, int nde_tid, boolean valid, boolean repeated, boolean key_exists, int atr_bgn, int atr_end, int key_bgn, int key_end, byte[] key_bry, int eql_pos, int qte_tid, int val_bgn, int val_end, byte[] val_bry) {
|
||||
int data_idx = itm_len * Idx__mult;
|
||||
if (data_idx == data_max) {
|
||||
int new_data_max = data_max == 0 ? Idx__mult : data_max * 2;
|
||||
int[] new_data_ary = new int[new_data_max];
|
||||
Int_.Ary_copy_to(data_ary, data_max, data_ary);
|
||||
this.data_ary = new_data_ary;
|
||||
|
||||
int text_max = text_ary.length;
|
||||
int new_text_max = data_max == 0 ? Text__mult : text_max * 2;
|
||||
byte[][] new_text_ary = new byte[new_text_max][];
|
||||
for (int i = 0; i < text_max; ++i)
|
||||
new_text_ary[i] = text_ary[i];
|
||||
this.text_ary = new_text_ary;
|
||||
|
||||
this.data_max = new_data_max;
|
||||
}
|
||||
boolean val_made = false;
|
||||
int text_idx = itm_len * Text__mult;
|
||||
text_ary[text_idx] = key_bry;
|
||||
if (val_bry != null) {
|
||||
text_ary[text_idx + 1] = val_bry;
|
||||
val_made = true;
|
||||
}
|
||||
data_ary[data_idx + Idx_nde_uid] = nde_uid;
|
||||
data_ary[data_idx + Idx_nde_tid] = nde_tid;
|
||||
data_ary[data_idx + Idx_atr_utl] = Mwh_atr_itm_.Calc_atr_utl(qte_tid, valid, repeated, key_exists, val_made);
|
||||
data_ary[data_idx + Idx_atr_bgn] = atr_bgn;
|
||||
data_ary[data_idx + Idx_atr_end] = atr_end;
|
||||
data_ary[data_idx + Idx_key_bgn] = key_bgn;
|
||||
data_ary[data_idx + Idx_key_end] = key_end;
|
||||
data_ary[data_idx + Idx_val_bgn] = val_bgn;
|
||||
data_ary[data_idx + Idx_val_end] = val_end;
|
||||
data_ary[data_idx + Idx_eql_pos] = eql_pos;
|
||||
return itm_len++;
|
||||
}
|
||||
public void Set_repeated(int atr_uid) {
|
||||
int atr_utl_idx = (atr_uid * Idx__mult) + Idx_atr_utl;
|
||||
int atr_utl = data_ary[atr_utl_idx];
|
||||
int val_bry_exists = atr_utl & Atr_utl__val_bry_exists;
|
||||
data_ary[atr_utl_idx] = Mwh_atr_itm_.Atr_tid__repeat | val_bry_exists;
|
||||
}
|
||||
public static final int
|
||||
Idx_nde_uid = 0
|
||||
, Idx_nde_tid = 1
|
||||
, Idx_atr_utl = 2
|
||||
, Idx_atr_bgn = 3
|
||||
, Idx_atr_end = 4
|
||||
, Idx_key_bgn = 5
|
||||
, Idx_key_end = 6
|
||||
, Idx_val_bgn = 7
|
||||
, Idx_val_end = 8
|
||||
, Idx_eql_pos = 9
|
||||
, Idx__mult = 10
|
||||
;
|
||||
public static final int Text__mult = 2;
|
||||
public static final int Atr_utl__val_bry_exists = 16;
|
||||
}
|
||||
39
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_atr_mgr_tst.java
Normal file
39
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_atr_mgr_tst.java
Normal file
@@ -0,0 +1,39 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
|
||||
import org.junit.*;
|
||||
public class Mwh_atr_mgr_tst {
|
||||
private final Mwh_atr_mgr_fxt fxt = new Mwh_atr_mgr_fxt();
|
||||
@Test public void Atr_utl_make() {
|
||||
// key="val"
|
||||
fxt.Test_atr_utl_make(Mwh_atr_itm_.Qte_tid__qute, Mwh_atr_itm_.Mask__valid__y, Mwh_atr_itm_.Mask__repeated__n, Mwh_atr_itm_.Mask__key_exists__y, Mwh_atr_itm_.Mask__val_made__n, 42);
|
||||
// key=val key=v<nowiki/>al
|
||||
fxt.Test_atr_utl_make(Mwh_atr_itm_.Qte_tid__none, Mwh_atr_itm_.Mask__valid__y, Mwh_atr_itm_.Mask__repeated__y, Mwh_atr_itm_.Mask__key_exists__y, Mwh_atr_itm_.Mask__val_made__y, 120);
|
||||
}
|
||||
}
|
||||
class Mwh_atr_mgr_fxt {
|
||||
public void Test_atr_utl_make(int qte_tid, boolean valid, boolean repeated, boolean key_exists, boolean val_made, int expd) {
|
||||
int atr_utl = Mwh_atr_itm_.Calc_atr_utl(qte_tid, valid, repeated, key_exists, val_made);
|
||||
Tfds.Eq_int(expd, atr_utl);
|
||||
Tfds.Eq_int(qte_tid, Mwh_atr_itm_.Calc_qte_tid(atr_utl));
|
||||
Tfds.Eq_bool(valid, (atr_utl & Mwh_atr_itm_.Mask__valid) == Mwh_atr_itm_.Mask__valid);
|
||||
Tfds.Eq_bool(repeated, (atr_utl & Mwh_atr_itm_.Mask__repeated) == Mwh_atr_itm_.Mask__repeated);
|
||||
Tfds.Eq_bool(key_exists, (atr_utl & Mwh_atr_itm_.Mask__key_exists) == Mwh_atr_itm_.Mask__key_exists);
|
||||
Tfds.Eq_bool(val_made, (atr_utl & Mwh_atr_itm_.Mask__val_made) == Mwh_atr_itm_.Mask__val_made);
|
||||
}
|
||||
}
|
||||
481
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_atr_parser.java
Normal file
481
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_atr_parser.java
Normal file
@@ -0,0 +1,481 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
|
||||
import gplx.core.primitives.*;
|
||||
import gplx.xowa.parsers.xndes.*; // for brys: <nowiki>, <noinclude>, <includeonly>, <onlyinclude>
|
||||
public class Mwh_atr_parser { // REF.MW:Sanitizer.php|decodeTagAttributes;MW_ATTRIBS_REGEX
|
||||
private static final byte Area__invalid = 0, Area__atr_limbo = 1, Area__key = 2, Area__eql_limbo = 3, Area__val_limbo = 4, Area__val_quote = 5, Area__val_naked = 6;
|
||||
private final Hash_adp_bry repeated_atrs_hash = Hash_adp_bry.ci_a7(); // ASCII:xnde_atrs
|
||||
private final Mwh_atr_mgr atr_mgr = new Mwh_atr_mgr(16);
|
||||
private final Bry_bfr key_bfr = Bry_bfr_.New(), val_bfr = Bry_bfr_.New();
|
||||
private byte area = Area__atr_limbo;
|
||||
private int atr_bgn = -1, key_bgn = -1, key_end = -1, eql_pos = -1, val_bgn = -1, val_end = -1;
|
||||
private byte qte_byte = Byte_ascii.Null;
|
||||
private boolean key_bfr_on = false, val_bfr_on = false, ws_is_before_val = false, qte_closed = false;
|
||||
private int nde_uid, nde_tid;
|
||||
// Returns this parser's Bry_obj_ref instance; it is created once via Bry_obj_ref.New_empty() and never reassigned (final).
public Bry_obj_ref Bry_obj() {return bry_ref;} private final Bry_obj_ref bry_ref = Bry_obj_ref.New_empty();
|
||||
// Returns the current nde_end_tid; Parse() resets it to Mwh_doc_parser.Nde_end_tid__invalid at the start of each call.
public int Nde_end_tid() {return nde_end_tid;} private int nde_end_tid;
|
||||
public int Parse(Mwh_atr_wkr wkr, int nde_uid, int nde_tid, byte[] src, int src_bgn, int src_end) {
|
||||
this.nde_uid = nde_uid; this.nde_tid = nde_tid;
|
||||
this.nde_end_tid = Mwh_doc_parser.Nde_end_tid__invalid;
|
||||
this.atr_bgn = -1;
|
||||
area = Area__atr_limbo;
|
||||
boolean prv_is_ws = false;
|
||||
int pos = src_bgn;
|
||||
boolean loop = true;
|
||||
while (loop) {
|
||||
if (pos >= src_end) {
|
||||
switch (area) {
|
||||
case Area__key: // EX: "a"
|
||||
case Area__eql_limbo: // EX: "a "
|
||||
case Area__val_naked: // EX: "a=b"
|
||||
break; // valid atr
|
||||
case Area__val_quote: // EX: "a='b'"
|
||||
if (qte_closed)
|
||||
Make(src, src_end);
|
||||
else { // dangling; EX: "a='b c=d"
|
||||
int reset_pos = Bry_find_.Find_fwd(src, Byte_ascii.Space, val_bgn, src_end); // try to find 1st space within quote; EX:"a='b c=d" should try to reset at c=d
|
||||
boolean reset_found = reset_pos != Bry_find_.Not_found;
|
||||
area = Area__invalid; val_end = reset_found ? reset_pos : src_end;
|
||||
Make(src, val_end); // create invalid atr
|
||||
if (reset_found) { // space found; resume from text after space; EX: "a='b c=d"; PAGE:en.w:Aubervilliers DATE:2014-06-25
|
||||
pos = Bry_find_.Find_fwd_while_not_ws(src, reset_pos, src_end); // skip ws
|
||||
atr_bgn = -1;
|
||||
area = Area__atr_limbo;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case Area__invalid: case Area__atr_limbo:
|
||||
case Area__val_limbo:
|
||||
area = Area__invalid;
|
||||
break;
|
||||
}
|
||||
if (atr_bgn != -1) {
|
||||
val_end = src_end;
|
||||
Make(src, val_end);
|
||||
}
|
||||
break;
|
||||
}
|
||||
byte b = src[pos];
|
||||
switch (area) {
|
||||
case Area__invalid:
|
||||
switch (b) {
|
||||
// ws -> end invalid area
|
||||
case Byte_ascii.Tab: case Byte_ascii.Nl: case Byte_ascii.Cr: case Byte_ascii.Space:
|
||||
Make(src, pos);
|
||||
area = Area__atr_limbo;
|
||||
break;
|
||||
// rest -> continue eating up invalid chars
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case Area__atr_limbo: // 1st area after (a) node_name, (b) attribute, (c) invalid_area
|
||||
switch (b) {
|
||||
// ws -> ignore; skip any ws in atr_limbo; note that once a non-ws char is encountered, it will immediately go into another area
|
||||
case Byte_ascii.Tab: case Byte_ascii.Nl: case Byte_ascii.Cr: case Byte_ascii.Space:
|
||||
if (atr_bgn == -1) atr_bgn = pos; // NOTE: atr_bgn == -1 needed for multiple spaces; ALSO: cannot move above switch b/c of <nowiki>
|
||||
break;
|
||||
// attribFirst -> enter Area__key; REF.MW: $attribFirst = '[:A-Z_a-z0-9]';
|
||||
case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4:
|
||||
case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9:
|
||||
case Byte_ascii.Ltr_A: case Byte_ascii.Ltr_B: case Byte_ascii.Ltr_C: case Byte_ascii.Ltr_D: case Byte_ascii.Ltr_E:
|
||||
case Byte_ascii.Ltr_F: case Byte_ascii.Ltr_G: case Byte_ascii.Ltr_H: case Byte_ascii.Ltr_I: case Byte_ascii.Ltr_J:
|
||||
case Byte_ascii.Ltr_K: case Byte_ascii.Ltr_L: case Byte_ascii.Ltr_M: case Byte_ascii.Ltr_N: case Byte_ascii.Ltr_O:
|
||||
case Byte_ascii.Ltr_P: case Byte_ascii.Ltr_Q: case Byte_ascii.Ltr_R: case Byte_ascii.Ltr_S: case Byte_ascii.Ltr_T:
|
||||
case Byte_ascii.Ltr_U: case Byte_ascii.Ltr_V: case Byte_ascii.Ltr_W: case Byte_ascii.Ltr_X: case Byte_ascii.Ltr_Y: case Byte_ascii.Ltr_Z:
|
||||
case Byte_ascii.Ltr_a: case Byte_ascii.Ltr_b: case Byte_ascii.Ltr_c: case Byte_ascii.Ltr_d: case Byte_ascii.Ltr_e:
|
||||
case Byte_ascii.Ltr_f: case Byte_ascii.Ltr_g: case Byte_ascii.Ltr_h: case Byte_ascii.Ltr_i: case Byte_ascii.Ltr_j:
|
||||
case Byte_ascii.Ltr_k: case Byte_ascii.Ltr_l: case Byte_ascii.Ltr_m: case Byte_ascii.Ltr_n: case Byte_ascii.Ltr_o:
|
||||
case Byte_ascii.Ltr_p: case Byte_ascii.Ltr_q: case Byte_ascii.Ltr_r: case Byte_ascii.Ltr_s: case Byte_ascii.Ltr_t:
|
||||
case Byte_ascii.Ltr_u: case Byte_ascii.Ltr_v: case Byte_ascii.Ltr_w: case Byte_ascii.Ltr_x: case Byte_ascii.Ltr_y: case Byte_ascii.Ltr_z:
|
||||
case Byte_ascii.Colon: case Byte_ascii.Underline:
|
||||
area = Area__key;
|
||||
if (atr_bgn == -1) atr_bgn = pos; // NOTE: atr_bgn == -1 needed b/c of spaces
|
||||
key_bgn = pos;
|
||||
break;
|
||||
// angle_bgn -> check for <nowiki>
|
||||
case Byte_ascii.Angle_bgn: // handle "<nowiki>"
|
||||
int gt_pos = Xnde_find_gt(src, pos, src_end);
|
||||
if (gt_pos == Bry_find_.Not_found) {
|
||||
area = Area__invalid; if (atr_bgn == -1) atr_bgn = pos;
|
||||
}
|
||||
else
|
||||
pos = gt_pos; // position after ">"; note that there is ++pos below and loop will continue at gt_pos + 1 (next character after)
|
||||
break;
|
||||
// rest -> invalid
|
||||
default: // quote and other non-valid key characters are invalid until next space; EX: "<span 'key_cannot_be_quoted' id='123'"
|
||||
area = Area__invalid; if (atr_bgn == -1) atr_bgn = pos;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case Area__key:
|
||||
switch (b) {
|
||||
// alphanum -> valid key chars; REF.MW: $attrib = '[:A-Z_a-z-.0-9]';
|
||||
case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4:
|
||||
case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9:
|
||||
case Byte_ascii.Ltr_A: case Byte_ascii.Ltr_B: case Byte_ascii.Ltr_C: case Byte_ascii.Ltr_D: case Byte_ascii.Ltr_E:
|
||||
case Byte_ascii.Ltr_F: case Byte_ascii.Ltr_G: case Byte_ascii.Ltr_H: case Byte_ascii.Ltr_I: case Byte_ascii.Ltr_J:
|
||||
case Byte_ascii.Ltr_K: case Byte_ascii.Ltr_L: case Byte_ascii.Ltr_M: case Byte_ascii.Ltr_N: case Byte_ascii.Ltr_O:
|
||||
case Byte_ascii.Ltr_P: case Byte_ascii.Ltr_Q: case Byte_ascii.Ltr_R: case Byte_ascii.Ltr_S: case Byte_ascii.Ltr_T:
|
||||
case Byte_ascii.Ltr_U: case Byte_ascii.Ltr_V: case Byte_ascii.Ltr_W: case Byte_ascii.Ltr_X: case Byte_ascii.Ltr_Y: case Byte_ascii.Ltr_Z:
|
||||
case Byte_ascii.Ltr_a: case Byte_ascii.Ltr_b: case Byte_ascii.Ltr_c: case Byte_ascii.Ltr_d: case Byte_ascii.Ltr_e:
|
||||
case Byte_ascii.Ltr_f: case Byte_ascii.Ltr_g: case Byte_ascii.Ltr_h: case Byte_ascii.Ltr_i: case Byte_ascii.Ltr_j:
|
||||
case Byte_ascii.Ltr_k: case Byte_ascii.Ltr_l: case Byte_ascii.Ltr_m: case Byte_ascii.Ltr_n: case Byte_ascii.Ltr_o:
|
||||
case Byte_ascii.Ltr_p: case Byte_ascii.Ltr_q: case Byte_ascii.Ltr_r: case Byte_ascii.Ltr_s: case Byte_ascii.Ltr_t:
|
||||
case Byte_ascii.Ltr_u: case Byte_ascii.Ltr_v: case Byte_ascii.Ltr_w: case Byte_ascii.Ltr_x: case Byte_ascii.Ltr_y: case Byte_ascii.Ltr_z:
|
||||
case Byte_ascii.Colon: case Byte_ascii.Underline: case Byte_ascii.Dash: case Byte_ascii.Dot:
|
||||
if (key_bfr_on) key_bfr.Add_byte(b);
|
||||
break;
|
||||
// ws -> end key
|
||||
case Byte_ascii.Tab: case Byte_ascii.Nl: case Byte_ascii.Cr: case Byte_ascii.Space:
|
||||
area = Area__eql_limbo;
|
||||
key_end = pos;
|
||||
break;
|
||||
// eq -> end key; go to Area_val_limbo
|
||||
case Byte_ascii.Eq:
|
||||
area = Area__val_limbo;
|
||||
key_end = eql_pos = pos;
|
||||
break;
|
||||
// angle_bgn -> check for <nowiki>
|
||||
case Byte_ascii.Angle_bgn:
|
||||
int gt_pos = Xnde_find_gt(src, pos, src_end);
|
||||
if (gt_pos == Bry_find_.Not_found) // "<" should not be in key; EX: "ke<y"
|
||||
area = Area__invalid;
|
||||
else {
|
||||
if (!key_bfr_on) {key_bfr.Add_mid(src, key_bgn, pos); key_bfr_on = true;}
|
||||
pos = gt_pos; // note that there is ++pos below and loop will continue at gt_pos + 1 (next character after)
|
||||
}
|
||||
break;
|
||||
// rest -> enter invalid
|
||||
default:
|
||||
area = Area__invalid;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case Area__eql_limbo:
|
||||
switch (b) {
|
||||
// ws -> skip
|
||||
case Byte_ascii.Tab: case Byte_ascii.Nl: case Byte_ascii.Cr: case Byte_ascii.Space: // skip ws
|
||||
break;
|
||||
// eq -> enter Area__val_limbo
|
||||
case Byte_ascii.Eq:
|
||||
eql_pos = pos;
|
||||
area = Area__val_limbo;
|
||||
break;
|
||||
// attribFirst -> enter Area__key; REF.MW: $attribFirst = '[:A-Z_a-z0-9]';
|
||||
case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4:
|
||||
case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9:
|
||||
case Byte_ascii.Ltr_A: case Byte_ascii.Ltr_B: case Byte_ascii.Ltr_C: case Byte_ascii.Ltr_D: case Byte_ascii.Ltr_E:
|
||||
case Byte_ascii.Ltr_F: case Byte_ascii.Ltr_G: case Byte_ascii.Ltr_H: case Byte_ascii.Ltr_I: case Byte_ascii.Ltr_J:
|
||||
case Byte_ascii.Ltr_K: case Byte_ascii.Ltr_L: case Byte_ascii.Ltr_M: case Byte_ascii.Ltr_N: case Byte_ascii.Ltr_O:
|
||||
case Byte_ascii.Ltr_P: case Byte_ascii.Ltr_Q: case Byte_ascii.Ltr_R: case Byte_ascii.Ltr_S: case Byte_ascii.Ltr_T:
|
||||
case Byte_ascii.Ltr_U: case Byte_ascii.Ltr_V: case Byte_ascii.Ltr_W: case Byte_ascii.Ltr_X: case Byte_ascii.Ltr_Y: case Byte_ascii.Ltr_Z:
|
||||
case Byte_ascii.Ltr_a: case Byte_ascii.Ltr_b: case Byte_ascii.Ltr_c: case Byte_ascii.Ltr_d: case Byte_ascii.Ltr_e:
|
||||
case Byte_ascii.Ltr_f: case Byte_ascii.Ltr_g: case Byte_ascii.Ltr_h: case Byte_ascii.Ltr_i: case Byte_ascii.Ltr_j:
|
||||
case Byte_ascii.Ltr_k: case Byte_ascii.Ltr_l: case Byte_ascii.Ltr_m: case Byte_ascii.Ltr_n: case Byte_ascii.Ltr_o:
|
||||
case Byte_ascii.Ltr_p: case Byte_ascii.Ltr_q: case Byte_ascii.Ltr_r: case Byte_ascii.Ltr_s: case Byte_ascii.Ltr_t:
|
||||
case Byte_ascii.Ltr_u: case Byte_ascii.Ltr_v: case Byte_ascii.Ltr_w: case Byte_ascii.Ltr_x: case Byte_ascii.Ltr_y: case Byte_ascii.Ltr_z:
|
||||
case Byte_ascii.Colon: case Byte_ascii.Underline:
|
||||
Make(src, pos);
|
||||
area = Area__key;
|
||||
atr_bgn = key_bgn = pos;
|
||||
break;
|
||||
// rest -> make atr and enter limbo
|
||||
default:
|
||||
area = Area__invalid;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case Area__val_limbo:
|
||||
switch (b) {
|
||||
// ws -> skip
|
||||
case Byte_ascii.Tab: case Byte_ascii.Nl: case Byte_ascii.Cr: case Byte_ascii.Space:
|
||||
ws_is_before_val = true;
|
||||
break;
|
||||
// quote -> enter Area_val_quote
|
||||
case Byte_ascii.Quote: case Byte_ascii.Apos:
|
||||
area = Area__val_quote; qte_byte = b; qte_closed = false;
|
||||
prv_is_ws = false;
|
||||
val_bgn = pos + 1;
|
||||
break;
|
||||
// alphanum -> enter Area_val_raw; REF.MW: [a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+
|
||||
case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4:
|
||||
case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9:
|
||||
case Byte_ascii.Ltr_A: case Byte_ascii.Ltr_B: case Byte_ascii.Ltr_C: case Byte_ascii.Ltr_D: case Byte_ascii.Ltr_E:
|
||||
case Byte_ascii.Ltr_F: case Byte_ascii.Ltr_G: case Byte_ascii.Ltr_H: case Byte_ascii.Ltr_I: case Byte_ascii.Ltr_J:
|
||||
case Byte_ascii.Ltr_K: case Byte_ascii.Ltr_L: case Byte_ascii.Ltr_M: case Byte_ascii.Ltr_N: case Byte_ascii.Ltr_O:
|
||||
case Byte_ascii.Ltr_P: case Byte_ascii.Ltr_Q: case Byte_ascii.Ltr_R: case Byte_ascii.Ltr_S: case Byte_ascii.Ltr_T:
|
||||
case Byte_ascii.Ltr_U: case Byte_ascii.Ltr_V: case Byte_ascii.Ltr_W: case Byte_ascii.Ltr_X: case Byte_ascii.Ltr_Y: case Byte_ascii.Ltr_Z:
|
||||
case Byte_ascii.Ltr_a: case Byte_ascii.Ltr_b: case Byte_ascii.Ltr_c: case Byte_ascii.Ltr_d: case Byte_ascii.Ltr_e:
|
||||
case Byte_ascii.Ltr_f: case Byte_ascii.Ltr_g: case Byte_ascii.Ltr_h: case Byte_ascii.Ltr_i: case Byte_ascii.Ltr_j:
|
||||
case Byte_ascii.Ltr_k: case Byte_ascii.Ltr_l: case Byte_ascii.Ltr_m: case Byte_ascii.Ltr_n: case Byte_ascii.Ltr_o:
|
||||
case Byte_ascii.Ltr_p: case Byte_ascii.Ltr_q: case Byte_ascii.Ltr_r: case Byte_ascii.Ltr_s: case Byte_ascii.Ltr_t:
|
||||
case Byte_ascii.Ltr_u: case Byte_ascii.Ltr_v: case Byte_ascii.Ltr_w: case Byte_ascii.Ltr_x: case Byte_ascii.Ltr_y: case Byte_ascii.Ltr_z:
|
||||
case Byte_ascii.Bang: case Byte_ascii.Hash: case Byte_ascii.Dollar: case Byte_ascii.Percent: case Byte_ascii.Amp:
|
||||
case Byte_ascii.Paren_bgn: case Byte_ascii.Paren_end: case Byte_ascii.Star: case Byte_ascii.Comma: case Byte_ascii.Dash: case Byte_ascii.Dot:
|
||||
case Byte_ascii.Backslash: case Byte_ascii.Slash: case Byte_ascii.Colon: case Byte_ascii.Semic:
|
||||
case Byte_ascii.Question: case Byte_ascii.At:
|
||||
case Byte_ascii.Brack_bgn: case Byte_ascii.Brack_end: case Byte_ascii.Pow: case Byte_ascii.Underline: case Byte_ascii.Tick:
|
||||
case Byte_ascii.Curly_bgn: case Byte_ascii.Curly_end: case Byte_ascii.Pipe: case Byte_ascii.Tilde:
|
||||
area = Area__val_naked;
|
||||
val_bgn = pos;
|
||||
break;
|
||||
// case Byte_ascii.Angle_end: NOTE: valid in MW; making invalid now until finding counter-example
|
||||
// angle_bgn -> check for <nowiki>
|
||||
case Byte_ascii.Angle_bgn:
|
||||
int gt_pos = Xnde_find_gt(src, pos, src_end);
|
||||
if (gt_pos == Bry_find_.Not_found)
|
||||
area = Area__invalid; // NOTE: valid in MW; making invalid now until finding counter-example
|
||||
else
|
||||
pos = gt_pos; // note that there is ++pos below and loop will continue at gt_pos + 1 (next character after)
|
||||
break;
|
||||
// rest -> ignore
|
||||
default:
|
||||
area = Area__invalid;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case Area__val_quote: { // EX: "'val' " in "key = 'val'"; REF.MW: \"([^<\"]*)\"
|
||||
switch (b) {
|
||||
// quote: check if same as opening quote
|
||||
case Byte_ascii.Quote: case Byte_ascii.Apos:
|
||||
if (qte_closed)
|
||||
area = Area__invalid;
|
||||
else {
|
||||
if (qte_byte == b) { // quote closes val
|
||||
qte_closed = true;
|
||||
val_end = pos;
|
||||
}
|
||||
else { // quote is just char; EX: title="1 o'clock" or title='The "C" way'
|
||||
prv_is_ws = false; if (val_bfr_on) val_bfr.Add_byte(b); // INLINE: add char
|
||||
}
|
||||
}
|
||||
break;
|
||||
// ws -> convert all ws to \s; only allow 1 ws at any point in time
|
||||
case Byte_ascii.Tab: case Byte_ascii.Nl: case Byte_ascii.Cr: case Byte_ascii.Space: // REF.MW:Sanitizer.php|decodeTagAttributes $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
|
||||
if (qte_closed) {
|
||||
Make(src, pos); // NOTE: set atr_end *after* quote
|
||||
if (atr_bgn == -1) atr_bgn = pos; // NOTE: process ws just like Area__atr_limbo
|
||||
}
|
||||
else {
|
||||
if (!val_bfr_on) {val_bfr.Add_mid(src, val_bgn, pos); val_bfr_on = true;} // INLINE: val_bfr.init
|
||||
if (prv_is_ws) {} // noop; only allow one ws at a time; EX: "a b" -> "a b"; "a\n\nb" -> "a b"
|
||||
else {
|
||||
prv_is_ws = true; val_bfr.Add_byte(Byte_ascii.Space);
|
||||
}
|
||||
}
|
||||
break;
|
||||
// angle_bgn -> check for <nowiki>; EX: <span title='ab<nowiki>c</nowiki>de'>
|
||||
case Byte_ascii.Angle_bgn:
|
||||
int gt_pos = Xnde_find_gt(src, pos, src_end);
|
||||
if (gt_pos == Bry_find_.Not_found) {
|
||||
// area = Area__invalid; // "<" inside quote is invalid; EX: <span title='a<b'>c</span>
|
||||
if (val_bfr_on) val_bfr.Add_byte(b); // INLINE: add char
|
||||
}
|
||||
else {
|
||||
if (qte_closed) {}
|
||||
else {
|
||||
if (!val_bfr_on) {val_bfr.Add_mid(src, val_bgn, pos); val_bfr_on = true;} // INLINE: val_bfr.init
|
||||
}
|
||||
pos = gt_pos; // note that there is ++pos below and loop will continue at gt_pos + 1 (next character after)
|
||||
}
|
||||
prv_is_ws = false;
|
||||
break;
|
||||
// rest -> add to val
|
||||
default:
|
||||
if (qte_closed)
|
||||
area = Area__invalid;
|
||||
else {
|
||||
prv_is_ws = false; if (val_bfr_on) val_bfr.Add_byte(b); // INLINE: add char
|
||||
}
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case Area__val_naked: // no quotes; EX:a=bcd; REF.MW:([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
|
||||
switch (b) {
|
||||
// alphanum -> continue reading
|
||||
case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4:
|
||||
case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9:
|
||||
case Byte_ascii.Ltr_A: case Byte_ascii.Ltr_B: case Byte_ascii.Ltr_C: case Byte_ascii.Ltr_D: case Byte_ascii.Ltr_E:
|
||||
case Byte_ascii.Ltr_F: case Byte_ascii.Ltr_G: case Byte_ascii.Ltr_H: case Byte_ascii.Ltr_I: case Byte_ascii.Ltr_J:
|
||||
case Byte_ascii.Ltr_K: case Byte_ascii.Ltr_L: case Byte_ascii.Ltr_M: case Byte_ascii.Ltr_N: case Byte_ascii.Ltr_O:
|
||||
case Byte_ascii.Ltr_P: case Byte_ascii.Ltr_Q: case Byte_ascii.Ltr_R: case Byte_ascii.Ltr_S: case Byte_ascii.Ltr_T:
|
||||
case Byte_ascii.Ltr_U: case Byte_ascii.Ltr_V: case Byte_ascii.Ltr_W: case Byte_ascii.Ltr_X: case Byte_ascii.Ltr_Y: case Byte_ascii.Ltr_Z:
|
||||
case Byte_ascii.Ltr_a: case Byte_ascii.Ltr_b: case Byte_ascii.Ltr_c: case Byte_ascii.Ltr_d: case Byte_ascii.Ltr_e:
|
||||
case Byte_ascii.Ltr_f: case Byte_ascii.Ltr_g: case Byte_ascii.Ltr_h: case Byte_ascii.Ltr_i: case Byte_ascii.Ltr_j:
|
||||
case Byte_ascii.Ltr_k: case Byte_ascii.Ltr_l: case Byte_ascii.Ltr_m: case Byte_ascii.Ltr_n: case Byte_ascii.Ltr_o:
|
||||
case Byte_ascii.Ltr_p: case Byte_ascii.Ltr_q: case Byte_ascii.Ltr_r: case Byte_ascii.Ltr_s: case Byte_ascii.Ltr_t:
|
||||
case Byte_ascii.Ltr_u: case Byte_ascii.Ltr_v: case Byte_ascii.Ltr_w: case Byte_ascii.Ltr_x: case Byte_ascii.Ltr_y: case Byte_ascii.Ltr_z:
|
||||
case Byte_ascii.Bang: case Byte_ascii.Hash: case Byte_ascii.Dollar: case Byte_ascii.Percent: case Byte_ascii.Amp:
|
||||
case Byte_ascii.Paren_bgn: case Byte_ascii.Paren_end: case Byte_ascii.Star: case Byte_ascii.Comma: case Byte_ascii.Dash: case Byte_ascii.Dot:
|
||||
case Byte_ascii.Backslash: case Byte_ascii.Slash: case Byte_ascii.Colon: case Byte_ascii.Semic:
|
||||
case Byte_ascii.Question: case Byte_ascii.At:
|
||||
case Byte_ascii.Brack_bgn: case Byte_ascii.Brack_end: case Byte_ascii.Pow: case Byte_ascii.Underline: case Byte_ascii.Tick:
|
||||
case Byte_ascii.Curly_bgn: case Byte_ascii.Curly_end: case Byte_ascii.Pipe: case Byte_ascii.Tilde:
|
||||
if (val_bfr_on) val_bfr.Add_byte(b); // INLINE: add char
|
||||
break;
|
||||
// case Byte_ascii.Angle_end: NOTE: valid in MW; making invalid now until finding counter-example
|
||||
// angle_bgn -> check for <nowiki>; EX: a=b<nowiki>c</nowiki>d
|
||||
case Byte_ascii.Angle_bgn:
|
||||
int gt_pos = Xnde_find_gt(src, pos, src_end);
|
||||
if (gt_pos == Bry_find_.Not_found) {
|
||||
area = Area__invalid; // NOTE: valid in MW; making invalid now until finding counter-example
|
||||
}
|
||||
else {
|
||||
if (!val_bfr_on) {val_bfr.Add_mid(src, val_bgn, pos); val_bfr_on = true;} // INLINE: val_bfr.init
|
||||
pos = gt_pos; // note that there is ++pos below and loop will continue at gt_pos + 1 (next character after)
|
||||
}
|
||||
break;
|
||||
// ws -> src_end atr
|
||||
case Byte_ascii.Tab: case Byte_ascii.Nl: case Byte_ascii.Cr: case Byte_ascii.Space:
|
||||
val_end = pos;
|
||||
Make(src, pos);
|
||||
break;
|
||||
case Byte_ascii.Eq: // EX:"a= b=c" or "a=b=c"; PAGE:en.w:2013_in_American_television
|
||||
if (ws_is_before_val) { // "a= b=c"; discard 1st and resume at 2nd
|
||||
int old_val_bgn = val_bgn;
|
||||
area = Area__invalid; Make(src, val_bgn); // invalidate cur atr; EX:"a="
|
||||
atr_bgn = key_bgn = old_val_bgn; // reset atr / key to new atr; EX: "b"
|
||||
key_end = pos;
|
||||
area = Area__val_limbo; // set area to val_bgn (basically, put after =)
|
||||
}
|
||||
else // "a=b=c"; discard all
|
||||
area = Area__invalid;
|
||||
break;
|
||||
default:
|
||||
area = Area__invalid;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
++pos;
|
||||
}
|
||||
|
||||
// iterate atrs and notify
|
||||
int len = atr_mgr.Len();
|
||||
int[] data_ary = atr_mgr.Data_ary();
|
||||
byte[][] text_ary = atr_mgr.Text_ary();
|
||||
for (int j = 0; j < len; ++j) {
|
||||
int itm_idx = j * Mwh_atr_mgr.Idx__mult;
|
||||
byte[] key_bry = text_ary[j * Mwh_atr_mgr.Text__mult];
|
||||
byte[] val_bry_manual = null;
|
||||
int atr_utl = data_ary[itm_idx + Mwh_atr_mgr.Idx_atr_utl];
|
||||
boolean atr_valid = (atr_utl & Mwh_atr_itm_.Mask__valid) == Mwh_atr_itm_.Mask__valid;
|
||||
boolean repeated = (atr_utl & Mwh_atr_itm_.Mask__repeated) == Mwh_atr_itm_.Mask__repeated;
|
||||
boolean key_exists = (atr_utl & Mwh_atr_itm_.Mask__key_exists) == Mwh_atr_itm_.Mask__key_exists;
|
||||
boolean val_made = (atr_utl & Mwh_atr_itm_.Mask__val_made) == Mwh_atr_itm_.Mask__val_made;
|
||||
if (val_made)
|
||||
val_bry_manual = text_ary[(j * Mwh_atr_mgr.Text__mult) + 1];
|
||||
wkr.On_atr_each(this, src, nde_tid, atr_valid, repeated, key_exists, key_bry, val_bry_manual, data_ary, itm_idx);
|
||||
}
|
||||
atr_mgr.Clear();
|
||||
repeated_atrs_hash.Clear();
|
||||
|
||||
return pos;
|
||||
}
|
||||
private void Make(byte[] src, int atr_end) {
|
||||
// calc final values for atr
|
||||
boolean key_exists = false;
|
||||
byte[] key_bry = null, val_bry = null;
|
||||
boolean atr_valid = true;
|
||||
if (area == Area__invalid) {
|
||||
atr_valid = false;
|
||||
key_bry = Bry_.Empty;
|
||||
key_bfr.Clear();
|
||||
if (val_bgn == -1) val_bgn = atr_bgn;
|
||||
val_bfr.Clear();
|
||||
}
|
||||
else {
|
||||
if (key_bgn != -1 && val_bgn != -1) // key && val exists; EX: "<input id='123'>"
|
||||
key_exists = true;
|
||||
else { // not a pair; EX: "<input checked>"
|
||||
if (key_end == -1) key_end = val_end; // NOTE: key_end == -1 when eos; EX: "a" would have key_bgn = 0; key_end = -1; val_end = 1 DATE:2014-07-03
|
||||
val_bgn = val_end = -1;
|
||||
}
|
||||
key_bry = key_bfr_on ? key_bfr.To_bry_and_clear() : Bry_.Mid(src, key_bgn, key_end); // always make key_bry; needed for repeated_atrs as well as key_tid
|
||||
if (val_bfr_on) val_bry = val_bfr.To_bry_and_clear();
|
||||
}
|
||||
int qte_tid = Mwh_atr_itm_.Mask__qte__none;
|
||||
if (qte_byte != Byte_ascii.Null)
|
||||
qte_tid = qte_byte == Byte_ascii.Quote ? Mwh_atr_itm_.Mask__qte_qute : Mwh_atr_itm_.Mask__qte__apos;
|
||||
int atr_uid = atr_mgr.Add(nde_uid, nde_tid, atr_valid, false, key_exists, atr_bgn, atr_end, key_bgn, key_end, key_bry, eql_pos, qte_tid, val_bgn, val_end, val_bry);
|
||||
|
||||
// handle repeated atrs
|
||||
if (atr_valid) {
|
||||
int repeated_uid = repeated_atrs_hash.Get_as_int_or(key_bry, -1);
|
||||
if (repeated_uid != -1) {
|
||||
repeated_atrs_hash.Del(key_bry);
|
||||
atr_mgr.Set_repeated(repeated_uid);
|
||||
}
|
||||
repeated_atrs_hash.Add_bry_int(key_bry, atr_uid);
|
||||
}
|
||||
|
||||
// reset temp variables
|
||||
area = Area__atr_limbo; qte_byte = Byte_ascii.Null;
|
||||
atr_bgn = key_bgn = val_bgn = key_end = val_end = eql_pos = -1;
|
||||
key_bfr_on = val_bfr_on = ws_is_before_val = qte_closed = false;
|
||||
}
|
||||
public int Xnde_find_gt_find(byte[] src, int pos, int end) {
|
||||
bry_ref.Val_(Bry_.Empty);
|
||||
byte b = src[pos];
|
||||
if (b == Byte_ascii.Slash && pos + 1 < end) { // if </ move pos to after /
|
||||
++pos;
|
||||
b = src[pos];
|
||||
}
|
||||
int gt_pos = Bry_find_.Find_fwd(src, Byte_ascii.Gt, pos, end); if (gt_pos == Bry_find_.Not_found) return Bry_find_.Not_found;
|
||||
byte[] bry = (byte[])xnde_hash.Get_by_mid(src, pos, gt_pos); if (bry == null) return Bry_find_.Not_found;
|
||||
bry_ref.Val_(bry);
|
||||
return bry.length + pos;
|
||||
}
|
||||
private int Xnde_find_gt(byte[] src, int lt_pos, int end) {
|
||||
int pos = lt_pos + 1; if (pos == end) return Bry_find_.Not_found;
|
||||
byte b = src[pos];
|
||||
if (b == Byte_ascii.Slash && pos + 1 < end) {
|
||||
++pos;
|
||||
b = src[pos];
|
||||
}
|
||||
int match_pos = Xnde_find_gt_find(src, pos, end);
|
||||
if (match_pos == Bry_find_.Not_found) {return Bry_find_.Not_found;}
|
||||
boolean slash_found = false;
|
||||
for (int i = match_pos; i < end; i++) {
|
||||
b = src[i];
|
||||
switch (b) {
|
||||
case Byte_ascii.Gt: return i;
|
||||
case Byte_ascii.Space: case Byte_ascii.Nl: case Byte_ascii.Tab: // skip any ws
|
||||
break;
|
||||
case Byte_ascii.Slash:
|
||||
if (slash_found) {return Bry_find_.Not_found;} // only allow one slash
|
||||
else slash_found = true;
|
||||
break;
|
||||
default:
|
||||
return Bry_find_.Not_found;
|
||||
}
|
||||
}
|
||||
return Bry_find_.Not_found;
|
||||
}
|
||||
private static final Hash_adp_bry xnde_hash = Hash_adp_bry.ci_a7()
|
||||
.Add_bry_bry(Xop_xnde_tag_.Tag__nowiki.Name_bry())
|
||||
.Add_bry_bry(Xop_xnde_tag_.Tag__noinclude.Name_bry())
|
||||
.Add_bry_bry(Xop_xnde_tag_.Tag__includeonly.Name_bry())
|
||||
.Add_bry_bry(Xop_xnde_tag_.Tag__onlyinclude.Name_bry())
|
||||
;
|
||||
public static final int Key_tid__unknown = -1;
|
||||
}
|
||||
76
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_atr_parser_fxt.java
Normal file
76
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_atr_parser_fxt.java
Normal file
@@ -0,0 +1,76 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
|
||||
class Mwh_atr_parser_fxt {
|
||||
private final Bry_bfr expd_bfr = Bry_bfr_.New(), actl_bfr = Bry_bfr_.New();
|
||||
private final Mwh_atr_parser parser = new Mwh_atr_parser();
|
||||
private final Mwh_doc_wkr__atr_bldr wkr = new Mwh_doc_wkr__atr_bldr();
|
||||
public Mwh_atr_itm Make_pair(String key, String val) {return new Mwh_atr_itm(Bry_.Empty, Bool_.Y, Bool_.N, Bool_.Y, -1, -1, -1, -1, Bry_.new_u8(key) , -1, -1, Bry_.new_u8(val) , -1, -1);}
|
||||
public Mwh_atr_itm Make_name(String key) {return new Mwh_atr_itm(Bry_.Empty, Bool_.Y, Bool_.N, Bool_.N, -1, -1, -1, -1, Bry_.new_u8(key) , -1, -1, Bry_.new_u8(key) , -1, -1);}
|
||||
public Mwh_atr_itm Make_fail(int bgn, int end) {return new Mwh_atr_itm(Bry_.Empty, Bool_.N, Bool_.N, Bool_.N, bgn, end, -1, -1, null , -1, -1, null , -1, -1);}
|
||||
public void Test_val_as_int(String raw, int expd) {
|
||||
byte[] src = Bry_.new_u8(raw);
|
||||
Mwh_atr_itm itm = new Mwh_atr_itm(src, true, false, false, 0, src.length, -1, -1, null, 0, src.length, src, -1, -1);
|
||||
Tfds.Eq_int(expd, itm.Val_as_int_or(-1));
|
||||
}
|
||||
public void Test_parse(String raw, Mwh_atr_itm... expd) {
|
||||
Mwh_atr_itm[] actl = Exec_parse(raw);
|
||||
Test_print(expd, actl);
|
||||
}
|
||||
private Mwh_atr_itm[] Exec_parse(String raw) {
|
||||
byte[] bry = Bry_.new_u8(raw);
|
||||
parser.Parse(wkr, -1, -1, bry, 0, bry.length);
|
||||
return wkr.To_atr_ary();
|
||||
}
|
||||
public void Test_print(Mwh_atr_itm[] expd_ary, Mwh_atr_itm[] actl_ary) {
|
||||
int expd_len = expd_ary.length;
|
||||
int actl_len = actl_ary.length;
|
||||
int len = expd_len > actl_len ? expd_len : actl_len;
|
||||
for (int i = 0; i < len; ++i) {
|
||||
To_bfr(expd_bfr, i < expd_len ? expd_ary[i] : null, actl_bfr, i < actl_len ? actl_ary[i] : null);
|
||||
}
|
||||
Tfds.Eq_str_lines(expd_bfr.To_str_and_clear(), actl_bfr.To_str_and_clear());
|
||||
}
|
||||
private void To_bfr(Bry_bfr expd_bfr, Mwh_atr_itm expd_itm, Bry_bfr actl_bfr, Mwh_atr_itm actl_itm) {
|
||||
To_bfr__main(expd_bfr, expd_itm);
|
||||
To_bfr__main(actl_bfr, actl_itm);
|
||||
To_bfr__head(expd_bfr, expd_itm);
|
||||
To_bfr__head(actl_bfr, actl_itm);
|
||||
if (expd_itm != null && expd_itm.Atr_bgn() != -1) {
|
||||
To_bfr__atr_rng(expd_bfr, expd_itm);
|
||||
To_bfr__atr_rng(actl_bfr, actl_itm);
|
||||
}
|
||||
}
|
||||
private void To_bfr__head(Bry_bfr bfr, Mwh_atr_itm itm) {
|
||||
if (itm == null) return;
|
||||
bfr.Add_str_a7("head:").Add_yn(itm.Valid()).Add_byte_semic().Add_yn(itm.Repeated()).Add_byte_semic().Add_yn(itm.Key_exists()).Add_byte_nl();
|
||||
}
|
||||
private void To_bfr__main(Bry_bfr bfr, Mwh_atr_itm itm) {
|
||||
if (itm == null) return;
|
||||
if (itm.Valid()) {
|
||||
bfr.Add_str_a7("key:").Add(itm.Key_bry()).Add_byte_nl();
|
||||
bfr.Add_str_a7("val:").Add(itm.Val_as_bry()).Add_byte_nl();
|
||||
}
|
||||
// else
|
||||
// To_bfr__atr_rng(bfr, itm);
|
||||
}
|
||||
private void To_bfr__atr_rng(Bry_bfr bfr, Mwh_atr_itm itm) {
|
||||
if (itm == null) return;
|
||||
bfr.Add_str_a7("rng:").Add_int_variable(itm.Atr_bgn()).Add_byte_semic().Add_int_variable(itm.Atr_end()).Add_byte_nl();
|
||||
}
|
||||
}
|
||||
78
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_atr_parser_tst.java
Normal file
78
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_atr_parser_tst.java
Normal file
@@ -0,0 +1,78 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
|
||||
import org.junit.*;
|
||||
public class Mwh_atr_parser_tst {
|
||||
private final Mwh_atr_parser_fxt fxt = new Mwh_atr_parser_fxt();
|
||||
@Test public void Pair__quote__double() {fxt.Test_parse("a=\"b\"" , fxt.Make_pair("a" , "b"));}
|
||||
@Test public void Pair__quote__single() {fxt.Test_parse("a='b'" , fxt.Make_pair("a" , "b"));}
|
||||
@Test public void Pair__quote__none() {fxt.Test_parse("a=b" , fxt.Make_pair("a" , "b"));}
|
||||
@Test public void Pair__quote__none__amp() {fxt.Test_parse("a=&bc" , fxt.Make_pair("a" , "&bc"));}
|
||||
@Test public void Pair__empty() {fxt.Test_parse("a=''" , fxt.Make_pair("a" , ""));}
|
||||
@Test public void Pair__key_w_underline() {fxt.Test_parse("a_b=c" , fxt.Make_pair("a_b" , "c"));}
|
||||
|
||||
@Test public void Name__quote__none() {fxt.Test_parse("b" , fxt.Make_name("b"));}
|
||||
@Test public void Name__ws() {fxt.Test_parse(" b " , fxt.Make_name("b"));} // PURPOSE:discovered while writing test for ref's "lower-alpha" DATE:2014-07-03
|
||||
@Test public void Name__mult() {fxt.Test_parse("a b1 c" , fxt.Make_name("a"), fxt.Make_name("b1"), fxt.Make_name("c"));}
|
||||
|
||||
@Test public void Fail__key_w_plus() {fxt.Test_parse("a+b" , fxt.Make_fail(0, 3));}
|
||||
@Test public void Fail__key_w_plus__many() {fxt.Test_parse("a+b c=d" , fxt.Make_fail(0, 3) , fxt.Make_pair("c", "d"));}
|
||||
@Test public void Fail__val_w_plus() {fxt.Test_parse("a=b+c" , fxt.Make_fail(0, 5));}
|
||||
@Test public void Fail__recover() {fxt.Test_parse("* a=b" , fxt.Make_fail(0, 1) , fxt.Make_pair("a", "b"));} // PURPOSE: * is invalid, but should not stop parsing of a=b
|
||||
@Test public void Fail__incomplete() {fxt.Test_parse("a= c=d" , fxt.Make_fail(0, 3) , fxt.Make_pair("c", "d"));} // PURPOSE: discard xatr if incomplete and followed by valid atr; PAGE:en.w:2013_in_American_television DATE:2014-09-25
|
||||
@Test public void Fail__incomplete_2() {fxt.Test_parse("a=c=d" , fxt.Make_fail(0, 5));} // PURPOSE: variation of above; per MW regex, missing space invalidates entire attribute; DATE:2014-09-25
|
||||
@Test public void Fail__incomplete_pair() {fxt.Test_parse("a= b=" , fxt.Make_fail(0, 3) , fxt.Make_fail(3, 5));} // PURPOSE: "b=" should be invalid not a kv of "b" = "b"; PAGE:en.s:Notes_by_the_Way/Chapter_2; DATE:2015-01-31
|
||||
|
||||
@Test public void Dangling_eos() {fxt.Test_parse("a='b' c='d" , fxt.Make_pair("a", "b") , fxt.Make_fail(5, 10));} // PURPOSE: handle dangling quote at eos; PAGE:en.w:Aubervilliers DATE:2014-06-25
|
||||
@Test public void Dangling_bos() {fxt.Test_parse("a='b c=d" , fxt.Make_fail(0, 4) , fxt.Make_pair("c", "d"));}// PURPOSE: handle dangling quote at bos; resume at next valid atr; PAGE:en.w:Aubervilliers DATE:2014-06-25
|
||||
|
||||
@Test public void Ws__ini() {fxt.Test_parse(" a='b'" , fxt.Make_pair("a", "b").Atr_rng(0, 6));}
|
||||
@Test public void Ws__end() {fxt.Test_parse(" a='b' c='d'" , fxt.Make_pair("a", "b").Atr_rng(0, 6), fxt.Make_pair("c", "d").Atr_rng(6, 12));}
|
||||
@Test public void Ws() {fxt.Test_parse("a = 'b'" , fxt.Make_pair("a", "b"));} // PURPOSE: fix wherein multiple space was causing "a=a"; PAGE:fr.s:La_Sculpture_dans_les_cimetières_de_Paris/Père-Lachaise; DATE:2014-01-18
|
||||
|
||||
@Test public void Many__quote__apos() {fxt.Test_parse("a='b' c='d' e='f'" , fxt.Make_pair("a", "b"), fxt.Make_pair("c", "d"), fxt.Make_pair("e", "f"));}
|
||||
@Test public void Many__naked() {fxt.Test_parse("a=b c=d e=f" , fxt.Make_pair("a", "b"), fxt.Make_pair("c", "d"), fxt.Make_pair("e", "f"));}
|
||||
@Test public void Many__naked__pair() {fxt.Test_parse("a b=c" , fxt.Make_name("a"), fxt.Make_pair("b", "c"));}
|
||||
|
||||
@Test public void Quote__ws__nl() {fxt.Test_parse("a='b\nc'" , fxt.Make_pair("a", "b c"));}
|
||||
@Test public void Quote__ws__mult() {fxt.Test_parse("a='b c'" , fxt.Make_pair("a", "b c"));}
|
||||
@Test public void Quote__ws__mult_mult() {fxt.Test_parse("a='b c d'" , fxt.Make_pair("a", "b c d"));} // PURPOSE: fix wherein 1st-gobble gobbled rest of spaces (was b cd)
|
||||
@Test public void Quote__apos() {fxt.Test_parse("a=\"b c'd\"" , fxt.Make_pair("a", "b c'd"));} // PURPOSE: fix wherein apos was gobbled up; PAGE:en.s:Alice's_Adventures_in_Wonderland; DATE:2013-11-22
|
||||
@Test public void Quote__apos_2() {fxt.Test_parse("a=\"b'c d\"" , fxt.Make_pair("a", "b'c d"));} // PURPOSE: fix wherein apos was causing "'b'c d"; PAGE:en.s:Grimm's_Household_Tales,_Volume_1; DATE:2013-12-22
|
||||
// @Test public void Quote__angle() {fxt.Test_parse("a='<'" , fxt.Make_fail(0, 5));} // PURPOSE: "<" inside quotes is always invalid
|
||||
@Test public void Quote__invalid() {fxt.Test_parse("a='b'c" , fxt.Make_fail(0, 6));}
|
||||
|
||||
@Test public void Nowiki__atr() {fxt.Test_parse("<nowiki>a=b</nowiki>" , fxt.Make_pair("a", "b").Atr_rng(8, 20));}
|
||||
@Test public void Nowiki__key() {fxt.Test_parse("a<nowiki>b</nowiki>c=d" , fxt.Make_pair("abc", "d").Atr_rng(0, 22));}
|
||||
@Test public void Nowiki__eql() {fxt.Test_parse("a<nowiki>=</nowiki>\"b\"" , fxt.Make_pair("a", "b").Atr_rng(0, 22));} // EX:fr.w:{{Portail|Transpédia|Californie}}
|
||||
@Test public void Nowiki__val__naked() {fxt.Test_parse("a=b<nowiki>c</nowiki>d" , fxt.Make_pair("a", "bcd").Atr_rng(0, 22));}
|
||||
@Test public void Nowiki__val__quote() {fxt.Test_parse("a=<nowiki>'b'</nowiki>" , fxt.Make_pair("a", "b").Atr_rng(0, 22));}
|
||||
@Test public void Nowiki__val__quote_2() {fxt.Test_parse("a=\"b<nowiki>c</nowiki>d<nowiki>e</nowiki>f\"", fxt.Make_pair("a", "bcdef"));}
|
||||
|
||||
@Test public void Val__as_int() {fxt.Test_val_as_int("-123" , -123);}
|
||||
|
||||
// @Test public void Embedded() { // PURPOSE: handle html inside attrib; PAGE:en.w:Economy_of_Greece DATE:2015-10-15
|
||||
// fxt.Test_parse("title='<sup id='cite_ref-a_1-0' class='reference'><a href='#cite_note-a-1'>[1]</a></sup> c'"
|
||||
// , fxt.Make_fail(0, 11) // "title='<sup" invalid b/c of "<"
|
||||
// , fxt.Make_pair("id", "cite_ref-a_1-0")
|
||||
// , fxt.Make_fail(31, 52) // "class='reference'><a" invalid b/c no ws after '
|
||||
// , fxt.Make_fail(53, 88) // "href='#cite_note-a-1'>[1]</a></sup>" invalid b/c no ws after '
|
||||
// , fxt.Make_fail(89, 91) // " c'" invalid b/c name (c) cannot have apos
|
||||
// );
|
||||
// }
|
||||
}
|
||||
21
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_atr_wkr.java
Normal file
21
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_atr_wkr.java
Normal file
@@ -0,0 +1,21 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
|
||||
public interface Mwh_atr_wkr {
|
||||
void On_atr_each (Mwh_atr_parser mgr, byte[] src, int nde_tid, boolean valid, boolean repeated, boolean key_exists, byte[] key_bry, byte[] val_bry_manual, int[] itm_ary, int itm_idx);
|
||||
}
|
||||
25
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_doc_itm.java
Normal file
25
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_doc_itm.java
Normal file
@@ -0,0 +1,25 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
|
||||
// Immutable record of one document item: its item type (one of the Itm_tid__* constants), node type id, and bytes.
class Mwh_doc_itm {
	public static final int
	  Itm_tid__txt      = 0
	, Itm_tid__nde_head = 1
	, Itm_tid__nde_tail = 2
	, Itm_tid__comment  = 3
	, Itm_tid__entity   = 4
	;

	private final int itm_tid;
	private final int nde_tid;
	private final byte[] itm_bry;

	public Mwh_doc_itm(int itm_tid, int nde_tid, byte[] itm_bry) {
		this.itm_tid = itm_tid;
		this.nde_tid = nde_tid;
		this.itm_bry = itm_bry;	// stored by reference; not copied
	}

	public int Itm_tid() {
		return itm_tid;
	}
	public byte[] Itm_bry() {
		return itm_bry;
	}
	public int Nde_tid() {
		return nde_tid;
	}
}
|
||||
62
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_doc_mgr.java
Normal file
62
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_doc_mgr.java
Normal file
@@ -0,0 +1,62 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
|
||||
// Growable flat int table of document items. Each item occupies Idx__mult consecutive ints in data_ary:
// uid, type id, source begin, source end (see the Idx_* offsets).
class Mwh_doc_mgr {
	private final int data_max_orig;	// initial capacity in ints; Clear() shrinks back to this
	// max: initial capacity, in items
	public Mwh_doc_mgr(int max) {
		this.data_max_orig = max * Idx__mult;
		this.Max_(max);
	}
	// number of items added since construction / last Clear()
	public int Len() {return itm_len;} private int itm_len;
	// backing array; item N starts at N * Idx__mult. The reference changes whenever Add() grows the table or Clear() reallocates.
	public int[] Data_ary() {return data_ary;} private int[] data_ary; private int data_max;
	// (re)allocates the table for len items and empties it
	private void Max_(int len) {
		this.data_max = len * Idx__mult;
		this.data_ary = new int[data_max];
		this.itm_len = 0;
	}
	// Empties the table. If it grew beyond its initial capacity, it is reallocated at the initial size;
	// otherwise the existing array is reused and only the length is reset.
	public void Clear() {
		if (data_max == data_max_orig)
			itm_len = 0;
		else
			Max_(data_max_orig / Idx__mult);
	}
	// Appends one item and returns its uid (its 0-based position in the table).
	public int Add(int dom_tid, int src_bgn, int src_end) {
		int data_idx = itm_len * Idx__mult;
		if (data_idx == data_max) {	// table is full; double it (or start at one item if capacity was 0)
			int new_data_max = data_max == 0 ? Idx__mult : data_max * 2;
			int[] new_data_ary = new int[new_data_max];
			// FIX: copy the existing items into the NEW array; the previous code copied data_ary onto itself,
			// so every growth replaced the table with zeroes and lost all earlier items
			System.arraycopy(data_ary, 0, new_data_ary, 0, data_max);
			this.data_ary = new_data_ary;
			this.data_max = new_data_max;
		}
		int dom_uid = itm_len;
		data_ary[data_idx + Idx_dom_uid] = dom_uid;
		data_ary[data_idx + Idx_dom_tid] = dom_tid;
		data_ary[data_idx + Idx_src_bgn] = src_bgn;
		data_ary[data_idx + Idx_src_end] = src_end;
		++itm_len;
		return dom_uid;
	}
	// offsets of each field within an item, and the number of ints per item
	public static final int
	  Idx_dom_uid = 0
	, Idx_dom_tid = 1
	, Idx_src_bgn = 2
	, Idx_src_end = 3
	, Idx__mult   = 4
	;
}
|
||||
245
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_doc_parser.java
Normal file
245
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_doc_parser.java
Normal file
@@ -0,0 +1,245 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
|
||||
import gplx.core.primitives.*;
|
||||
import gplx.xowa.parsers.amps.*; import gplx.xowa.parsers.xndes.*;
|
||||
public class Mwh_doc_parser {
|
||||
private final Mwh_doc_mgr dom_mgr = new Mwh_doc_mgr(16);
|
||||
private final Mwh_atr_parser atr_parser = new Mwh_atr_parser();
|
||||
private final List_adp nde_stack = List_adp_.New();
|
||||
private final Xop_amp_mgr amp_mgr = Xop_amp_mgr.Instance; private final Xop_tkn_mkr tkn_mkr = new Xop_tkn_mkr();
|
||||
private byte[] src; private int src_end;
|
||||
private Mwh_doc_wkr wkr;
|
||||
private Hash_adp_bry nde_regy;
|
||||
private int txt_bgn, nde_uid;
|
||||
private Xop_xnde_tag cur_nde; private int cur_nde_tid;
|
||||
public void Parse(Mwh_doc_wkr wkr, byte[] src, int src_bgn, int src_end) {
|
||||
this.wkr = wkr; this.src = src; this.src_end = src_end;
|
||||
this.nde_regy = wkr.Nde_regy();
|
||||
nde_stack.Clear();
|
||||
int pos = txt_bgn = src_bgn;
|
||||
nde_uid = cur_nde_tid = -1;
|
||||
cur_nde = null;
|
||||
|
||||
while (pos < src_end) {
|
||||
byte b = src[pos];
|
||||
switch (b) {
|
||||
case Byte_ascii.Angle_bgn: // "<": possible nde start
|
||||
pos = Parse_nde(pos);
|
||||
break;
|
||||
case Byte_ascii.Amp: // "&": check for entity; EX: in sr-ec -> sr-el
|
||||
Xop_amp_mgr_rslt rv = amp_mgr.Parse_tkn(tkn_mkr, src, src_end, pos, pos + 1);
|
||||
Xop_tkn_itm rv_tkn = rv.Tkn();
|
||||
if (rv_tkn == null)
|
||||
++pos;
|
||||
else {
|
||||
wkr.On_txt_end(this, src, cur_nde_tid, txt_bgn, pos);
|
||||
wkr.On_entity_end(this, src, cur_nde_tid, rv_tkn.Src_bgn(), rv_tkn.Src_end());
|
||||
pos = rv_tkn.Src_end();
|
||||
txt_bgn = pos;
|
||||
}
|
||||
break;
|
||||
default: // else, just increment
|
||||
++pos;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (src_end != txt_bgn) wkr.On_txt_end(this, src, cur_nde_tid, txt_bgn, pos);
|
||||
}
|
||||
private int Parse_nde(int pos) {
|
||||
int nde_end_tid = Nde_end_tid__invalid;
|
||||
boolean nde_is_head = true;
|
||||
int nde_bgn = pos;
|
||||
++pos;
|
||||
int name_bgn = pos;
|
||||
int name_end = pos;
|
||||
while (pos < src_end) {
|
||||
byte b = src[pos];
|
||||
switch (b) {
|
||||
// valid chars for name
|
||||
case Byte_ascii.Ltr_A: case Byte_ascii.Ltr_B: case Byte_ascii.Ltr_C: case Byte_ascii.Ltr_D: case Byte_ascii.Ltr_E:
|
||||
case Byte_ascii.Ltr_F: case Byte_ascii.Ltr_G: case Byte_ascii.Ltr_H: case Byte_ascii.Ltr_I: case Byte_ascii.Ltr_J:
|
||||
case Byte_ascii.Ltr_K: case Byte_ascii.Ltr_L: case Byte_ascii.Ltr_M: case Byte_ascii.Ltr_N: case Byte_ascii.Ltr_O:
|
||||
case Byte_ascii.Ltr_P: case Byte_ascii.Ltr_Q: case Byte_ascii.Ltr_R: case Byte_ascii.Ltr_S: case Byte_ascii.Ltr_T:
|
||||
case Byte_ascii.Ltr_U: case Byte_ascii.Ltr_V: case Byte_ascii.Ltr_W: case Byte_ascii.Ltr_X: case Byte_ascii.Ltr_Y: case Byte_ascii.Ltr_Z:
|
||||
case Byte_ascii.Ltr_a: case Byte_ascii.Ltr_b: case Byte_ascii.Ltr_c: case Byte_ascii.Ltr_d: case Byte_ascii.Ltr_e:
|
||||
case Byte_ascii.Ltr_f: case Byte_ascii.Ltr_g: case Byte_ascii.Ltr_h: case Byte_ascii.Ltr_i: case Byte_ascii.Ltr_j:
|
||||
case Byte_ascii.Ltr_k: case Byte_ascii.Ltr_l: case Byte_ascii.Ltr_m: case Byte_ascii.Ltr_n: case Byte_ascii.Ltr_o:
|
||||
case Byte_ascii.Ltr_p: case Byte_ascii.Ltr_q: case Byte_ascii.Ltr_r: case Byte_ascii.Ltr_s: case Byte_ascii.Ltr_t:
|
||||
case Byte_ascii.Ltr_u: case Byte_ascii.Ltr_v: case Byte_ascii.Ltr_w: case Byte_ascii.Ltr_x: case Byte_ascii.Ltr_y: case Byte_ascii.Ltr_z:
|
||||
case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4:
|
||||
case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9:
|
||||
case Byte_ascii.Dot: case Byte_ascii.Dash: case Byte_ascii.Underline: case Byte_ascii.Colon: // XML allowed punctuation
|
||||
case Byte_ascii.Dollar:// MW: handles <br$2>;
|
||||
++pos;
|
||||
break;
|
||||
// comment check
|
||||
case Byte_ascii.Bang:
|
||||
boolean comment_found = false;
|
||||
if (name_bgn == pos && Bry_.Eq(src, pos + 1, pos + 3, Comment_bgn)) {
|
||||
int comment_end_pos = Bry_find_.Find_fwd(src, Comment_end, pos + 3);
|
||||
if (comment_end_pos != Bry_find_.Not_found) {
|
||||
nde_end_tid = Nde_end_tid__comment;
|
||||
pos = comment_end_pos + 3;
|
||||
comment_found = true;
|
||||
}
|
||||
}
|
||||
if (!comment_found)
|
||||
return pos;
|
||||
else
|
||||
break;
|
||||
// invalid char; not a node; treat as text; EX: "<!@#", "< /b>"
|
||||
default:
|
||||
return pos;
|
||||
// slash -> either "</b>" or "<b/>"
|
||||
case Byte_ascii.Slash:
|
||||
if (name_bgn == pos) { // "</"; EX: "</b>"
|
||||
nde_is_head = false;
|
||||
++name_bgn;
|
||||
++pos;
|
||||
continue;
|
||||
}
|
||||
else { // check for "/>"; NOTE: <pre/a>, <pre//> are allowed
|
||||
name_end = pos;
|
||||
++pos;
|
||||
if (pos == src_end) return pos; // end of doc; treat as text; EX: "<b/EOS"
|
||||
if (src[pos] == Byte_ascii.Gt) {
|
||||
nde_end_tid = Nde_end_tid__inline;
|
||||
++pos;
|
||||
}
|
||||
else
|
||||
nde_end_tid = Nde_end_tid__slash;
|
||||
}
|
||||
break;
|
||||
// stops "name"
|
||||
case Byte_ascii.Gt:
|
||||
nde_end_tid = Nde_end_tid__gt;
|
||||
name_end = pos;
|
||||
++pos;
|
||||
break;
|
||||
case Byte_ascii.Tab: case Byte_ascii.Nl: case Byte_ascii.Cr: case Byte_ascii.Space:
|
||||
nde_end_tid = Nde_end_tid__ws;
|
||||
name_end = pos;
|
||||
break;
|
||||
case Byte_ascii.Backslash: // MW: allows "<br\>" -> "<br/>"
|
||||
nde_end_tid = Nde_end_tid__backslash;
|
||||
name_end = pos;
|
||||
break;
|
||||
}
|
||||
if (nde_end_tid != Nde_end_tid__invalid) break;
|
||||
}
|
||||
// get name
|
||||
Xop_xnde_tag nde_itm = null;
|
||||
if (nde_end_tid != Nde_end_tid__comment) {
|
||||
nde_itm = (Xop_xnde_tag)nde_regy.Get_by_mid(src, name_bgn, name_end);
|
||||
if (nde_itm == null) return pos; // not a known nde; exit
|
||||
}
|
||||
if (txt_bgn != nde_bgn) { // notify txt
|
||||
wkr.On_txt_end(this, src, cur_nde_tid, txt_bgn, nde_bgn);
|
||||
txt_bgn = pos;
|
||||
}
|
||||
if (nde_is_head) {
|
||||
wkr.On_nde_head_bgn(this, src, cur_nde_tid, name_bgn, name_end);
|
||||
switch (nde_end_tid) {
|
||||
case Nde_end_tid__comment:
|
||||
wkr.On_comment_end(this, src, cur_nde_tid, nde_bgn, pos);
|
||||
break;
|
||||
case Nde_end_tid__ws:
|
||||
case Nde_end_tid__slash:
|
||||
case Nde_end_tid__backslash:
|
||||
// look for ">" or "/>"
|
||||
int tmp_pos = pos, atrs_end = src_end, head_end = src_end;
|
||||
boolean loop = true;
|
||||
while (loop) {
|
||||
byte b = src[tmp_pos];
|
||||
switch (b) {
|
||||
// angle_end -> stop iterating
|
||||
case Byte_ascii.Angle_end:
|
||||
atrs_end = tmp_pos;
|
||||
head_end = tmp_pos + 1;
|
||||
nde_end_tid = Mwh_doc_parser.Nde_end_tid__gt;
|
||||
loop = false;
|
||||
break;
|
||||
// slash -> check for "/>" or " / "
|
||||
case Byte_ascii.Slash:
|
||||
int nxt_pos = tmp_pos + 1;
|
||||
if (nxt_pos == src_end) {
|
||||
nde_end_tid = Mwh_doc_parser.Nde_end_tid__invalid;
|
||||
loop = false;
|
||||
}
|
||||
else if (src[nxt_pos] == Byte_ascii.Angle_end) {
|
||||
atrs_end = tmp_pos;
|
||||
head_end = tmp_pos + 2;
|
||||
nde_end_tid = Mwh_doc_parser.Nde_end_tid__inline;
|
||||
loop = false;
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (loop) {
|
||||
++tmp_pos;
|
||||
if (tmp_pos == src_end) break;
|
||||
}
|
||||
else
|
||||
break;
|
||||
}
|
||||
atr_parser.Parse(wkr, nde_uid, cur_nde_tid, src, pos, atrs_end);
|
||||
pos = head_end;
|
||||
txt_bgn = head_end;
|
||||
break;
|
||||
}
|
||||
switch (nde_end_tid) {
|
||||
case Nde_end_tid__inline:
|
||||
wkr.On_nde_head_end(this, src, cur_nde_tid, nde_bgn, pos, Bool_.Y);
|
||||
txt_bgn = pos;
|
||||
break;
|
||||
case Nde_end_tid__gt:
|
||||
wkr.On_nde_head_end(this, src, cur_nde_tid, nde_bgn, pos, Bool_.N);
|
||||
txt_bgn = pos;
|
||||
if ( nde_itm != null
|
||||
&& !nde_itm.Single_only_html() // ignore <b>
|
||||
&& (cur_nde == null || !cur_nde.Xtn()) // <pre> ignores inner
|
||||
) {
|
||||
if (cur_nde != null)
|
||||
nde_stack.Add(cur_nde);
|
||||
this.cur_nde = nde_itm;
|
||||
this.cur_nde_tid = nde_itm.Id();
|
||||
}
|
||||
break;
|
||||
case Nde_end_tid__ws:
|
||||
case Nde_end_tid__slash:
|
||||
case Nde_end_tid__backslash: break; // handled above
|
||||
}
|
||||
nde_uid = dom_mgr.Add(Mwh_doc_itm.Itm_tid__nde_head, nde_bgn, pos);
|
||||
}
|
||||
else {
|
||||
switch (nde_end_tid) {
|
||||
case Nde_end_tid__gt:
|
||||
wkr.On_nde_tail_end(this, src, cur_nde_tid, nde_bgn, pos);
|
||||
txt_bgn = pos;
|
||||
if (nde_itm.Id() == cur_nde_tid) {
|
||||
cur_nde = (Xop_xnde_tag)List_adp_.Pop_or(nde_stack, null);
|
||||
cur_nde_tid = cur_nde == null ? -1 : cur_nde.Id();
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
public static final int Nde_end_tid__invalid = 0, Nde_end_tid__gt = 1, Nde_end_tid__ws = 2, Nde_end_tid__inline = 3, Nde_end_tid__slash = 4, Nde_end_tid__backslash = 5, Nde_end_tid__comment = 6;
|
||||
private static final byte[] Comment_bgn = Bry_.new_a7("--"), Comment_end = Bry_.new_a7("-->");
|
||||
}
|
||||
75
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_doc_parser_fxt.java
Normal file
75
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_doc_parser_fxt.java
Normal file
@@ -0,0 +1,75 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
|
||||
class Mwh_doc_parser_fxt {
|
||||
private final Bry_bfr expd_bfr = Bry_bfr_.New(), actl_bfr = Bry_bfr_.New();
|
||||
private final Mwh_doc_parser parser = new Mwh_doc_parser();
|
||||
private final Mwh_doc_wkr__itm_bldr wkr = new Mwh_doc_wkr__itm_bldr();
|
||||
public Mwh_doc_itm Make_txt (String raw) {return new Mwh_doc_itm(Mwh_doc_itm.Itm_tid__txt , -1, Bry_.new_u8(raw));}
|
||||
public Mwh_doc_itm Make_txt (String raw, int nde_tid) {return new Mwh_doc_itm(Mwh_doc_itm.Itm_tid__txt , nde_tid, Bry_.new_u8(raw));}
|
||||
public Mwh_doc_itm Make_comment (String raw) {return new Mwh_doc_itm(Mwh_doc_itm.Itm_tid__comment , -1, Bry_.new_u8(raw));}
|
||||
public Mwh_doc_itm Make_entity (String raw) {return new Mwh_doc_itm(Mwh_doc_itm.Itm_tid__entity , -1, Bry_.new_u8(raw));}
|
||||
public Mwh_doc_itm Make_nde_head(String raw) {return new Mwh_doc_itm(Mwh_doc_itm.Itm_tid__nde_head , -1, Bry_.new_u8(raw));}
|
||||
public Mwh_doc_itm Make_nde_tail(String raw) {return new Mwh_doc_itm(Mwh_doc_itm.Itm_tid__nde_tail , -1, Bry_.new_u8(raw));}
|
||||
public void Test_parse(String raw, Mwh_doc_itm... expd) {
|
||||
Mwh_doc_itm[] actl = Exec_parse(raw);
|
||||
Test_print(expd, actl);
|
||||
}
|
||||
public Mwh_doc_itm[] Exec_parse(String raw) {
|
||||
byte[] bry = Bry_.new_u8(raw);
|
||||
parser.Parse(wkr, bry, 0, bry.length);
|
||||
return wkr.To_atr_ary();
|
||||
}
|
||||
public void Test_print(Mwh_doc_itm[] expd_ary, Mwh_doc_itm[] actl_ary) {
|
||||
int expd_len = expd_ary.length;
|
||||
int actl_len = actl_ary.length;
|
||||
int len = expd_len > actl_len ? expd_len : actl_len;
|
||||
for (int i = 0; i < len; ++i) {
|
||||
To_bfr(expd_bfr, i < expd_len ? expd_ary[i] : null, actl_bfr, i < actl_len ? actl_ary[i] : null);
|
||||
}
|
||||
Tfds.Eq_str_lines(expd_bfr.To_str_and_clear(), actl_bfr.To_str_and_clear());
|
||||
}
|
||||
private void To_bfr(Bry_bfr expd_bfr, Mwh_doc_itm expd_itm, Bry_bfr actl_bfr, Mwh_doc_itm actl_itm) {
|
||||
To_bfr__main(expd_bfr, expd_itm); To_bfr__main(actl_bfr, actl_itm);
|
||||
if (expd_itm != null && expd_itm.Nde_tid() != -1) {
|
||||
To_bfr__nde_tid(expd_bfr, expd_itm); To_bfr__nde_tid(actl_bfr, actl_itm);
|
||||
}
|
||||
}
|
||||
private void To_bfr__main(Bry_bfr bfr, Mwh_doc_itm itm) {
|
||||
if (itm == null) return;
|
||||
bfr.Add_str_a7("itm_tid:").Add_int_variable(itm.Itm_tid()).Add_byte_nl();
|
||||
bfr.Add_str_a7("txt:").Add(itm.Itm_bry()).Add_byte_nl();
|
||||
}
|
||||
private void To_bfr__nde_tid(Bry_bfr bfr, Mwh_doc_itm itm) {
|
||||
if (itm == null) return;
|
||||
bfr.Add_str_a7("nde_tid:").Add_int_variable(itm.Nde_tid()).Add_byte_nl();
|
||||
}
|
||||
}
|
||||
class Mwh_doc_wkr__itm_bldr implements Mwh_doc_wkr {
|
||||
private final List_adp list = List_adp_.New();
|
||||
public Hash_adp_bry Nde_regy() {return nde_regy;} private final Hash_adp_bry nde_regy = Mwh_doc_wkr_.Nde_regy__mw();
|
||||
public void On_atr_each (Mwh_atr_parser mgr, byte[] src, int nde_tid, boolean valid, boolean repeated, boolean key_exists, byte[] key_bry, byte[] val_bry_manual, int[] itm_ary, int itm_idx) {}
|
||||
public void On_txt_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {list.Add(new Mwh_doc_itm(Mwh_doc_itm.Itm_tid__txt , nde_tid, Bry_.Mid(src, itm_bgn, itm_end)));}
|
||||
public void On_nde_head_bgn (Mwh_doc_parser mgr, byte[] src, int nde_tid, int key_bgn, int key_end) {}
|
||||
public void On_nde_head_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end, boolean inline) {list.Add(new Mwh_doc_itm(Mwh_doc_itm.Itm_tid__nde_head , nde_tid, Bry_.Mid(src, itm_bgn, itm_end)));}
|
||||
public void On_nde_tail_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {list.Add(new Mwh_doc_itm(Mwh_doc_itm.Itm_tid__nde_tail , nde_tid, Bry_.Mid(src, itm_bgn, itm_end)));}
|
||||
public void On_comment_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {list.Add(new Mwh_doc_itm(Mwh_doc_itm.Itm_tid__comment , nde_tid, Bry_.Mid(src, itm_bgn, itm_end)));}
|
||||
public void On_entity_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {list.Add(new Mwh_doc_itm(Mwh_doc_itm.Itm_tid__entity , nde_tid, Bry_.Mid(src, itm_bgn, itm_end)));}
|
||||
|
||||
public Mwh_doc_itm[] To_atr_ary() {return (Mwh_doc_itm[])list.To_ary_and_clear(Mwh_doc_itm.class);}
|
||||
}
|
||||
61
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_doc_parser_tst.java
Normal file
61
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_doc_parser_tst.java
Normal file
@@ -0,0 +1,61 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
|
||||
import org.junit.*; import gplx.xowa.parsers.xndes.*;
|
||||
public class Mwh_doc_parser_tst {
|
||||
private final Mwh_doc_parser_fxt fxt = new Mwh_doc_parser_fxt();
|
||||
@Test public void Text__basic() {fxt.Test_parse("abc" , fxt.Make_txt("abc"));}
|
||||
@Test public void Comment() {fxt.Test_parse("a<!--b-->c" , fxt.Make_txt("a"), fxt.Make_comment("<!--b-->"), fxt.Make_txt("c"));}
|
||||
@Test public void Entity() {fxt.Test_parse("a b" , fxt.Make_txt("a"), fxt.Make_entity(" "), fxt.Make_txt("b"));}
|
||||
@Test public void Fail__inline_eos() {fxt.Test_parse("a<b/" , fxt.Make_txt("a<b/"));}
|
||||
@Test public void Fail__unknown() {fxt.Test_parse("a<bc/>d" , fxt.Make_txt("a<bc/>d"));}
|
||||
@Test public void Node__inline() {fxt.Test_parse("a<b/>c" , fxt.Make_txt("a"), fxt.Make_nde_head("<b/>") , fxt.Make_txt("c"));}
|
||||
@Test public void Node__pair() {fxt.Test_parse("a<b>c</b>d" , fxt.Make_txt("a"), fxt.Make_nde_head("<b>") , fxt.Make_txt("c"), fxt.Make_nde_tail("</b>"), fxt.Make_txt("d"));}
|
||||
@Test public void Atrs__pair() {
|
||||
fxt.Test_parse("<div id='1'>a</div>"
|
||||
, fxt.Make_nde_head("<div id='1'>")
|
||||
, fxt.Make_txt("a")
|
||||
, fxt.Make_nde_tail("</div>"));
|
||||
}
|
||||
@Test public void Atrs__inline() {
|
||||
fxt.Test_parse("a<div id='1'/>b"
|
||||
, fxt.Make_txt("a")
|
||||
, fxt.Make_nde_head("<div id='1'/>")
|
||||
, fxt.Make_txt("b"));
|
||||
}
|
||||
@Test public void Node__single_only() {
|
||||
fxt.Test_parse("<b>a<br>b</b>c"
|
||||
, fxt.Make_nde_head("<b>")
|
||||
, fxt.Make_txt("a", Xop_xnde_tag_.Tid__b)
|
||||
, fxt.Make_nde_head("<br>")
|
||||
, fxt.Make_txt("b", Xop_xnde_tag_.Tid__b) // <b> not <br>
|
||||
, fxt.Make_nde_tail("</b>")
|
||||
, fxt.Make_txt("c", Xop_xnde_tag_.Tid__null)
|
||||
);
|
||||
}
|
||||
@Test public void Node__pre() {
|
||||
fxt.Test_parse("<pre>a<div>b</pre>c"
|
||||
, fxt.Make_nde_head("<pre>")
|
||||
, fxt.Make_txt("a", Xop_xnde_tag_.Tid__pre)
|
||||
, fxt.Make_nde_head("<div>")
|
||||
, fxt.Make_txt("b", Xop_xnde_tag_.Tid__pre) // <pre> not <div>
|
||||
, fxt.Make_nde_tail("</pre>")
|
||||
, fxt.Make_txt("c", Xop_xnde_tag_.Tid__null)
|
||||
);
|
||||
}
|
||||
}
|
||||
27
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_doc_wkr.java
Normal file
27
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_doc_wkr.java
Normal file
@@ -0,0 +1,27 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
|
||||
public interface Mwh_doc_wkr extends Mwh_atr_wkr {
|
||||
Hash_adp_bry Nde_regy();
|
||||
void On_txt_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end);
|
||||
void On_nde_head_bgn(Mwh_doc_parser mgr, byte[] src, int nde_tid, int key_bgn, int key_end);
|
||||
void On_nde_head_end(Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end, boolean inline);
|
||||
void On_nde_tail_end(Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end);
|
||||
void On_comment_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end);
|
||||
void On_entity_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end);
|
||||
}
|
||||
31
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_doc_wkr_.java
Normal file
31
400_xowa/src/gplx/xowa/parsers/htmls/Mwh_doc_wkr_.java
Normal file
@@ -0,0 +1,31 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
|
||||
import gplx.xowa.parsers.xndes.*;
|
||||
public class Mwh_doc_wkr_ {
|
||||
public static Hash_adp_bry Nde_regy__mw() {
|
||||
Xop_xnde_tag[] ary = Xop_xnde_tag_.Ary;
|
||||
int len = ary.length;
|
||||
Hash_adp_bry rv = Hash_adp_bry.ci_a7();
|
||||
for (int i = 0; i < len; ++i) {
|
||||
Xop_xnde_tag itm = ary[i];
|
||||
rv.Add(itm.Name_bry(), itm);
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,47 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
|
||||
public class Mwh_doc_wkr__atr_bldr implements Mwh_doc_wkr {
|
||||
private final List_adp list = List_adp_.New();
|
||||
public Hash_adp_bry Nde_regy() {return null;}
|
||||
public void On_atr_each(Mwh_atr_parser mgr, byte[] src, int nde_tid, boolean valid, boolean repeated, boolean key_exists, byte[] key_bry, byte[] val_bry_manual, int[] data_ary, int itm_idx) {
|
||||
int atr_bgn = data_ary[itm_idx + Mwh_atr_mgr.Idx_atr_bgn];
|
||||
int atr_end = data_ary[itm_idx + Mwh_atr_mgr.Idx_atr_end];
|
||||
int key_bgn = data_ary[itm_idx + Mwh_atr_mgr.Idx_key_bgn];
|
||||
int key_end = data_ary[itm_idx + Mwh_atr_mgr.Idx_key_end];
|
||||
int val_bgn = data_ary[itm_idx + Mwh_atr_mgr.Idx_val_bgn];
|
||||
int val_end = data_ary[itm_idx + Mwh_atr_mgr.Idx_val_end];
|
||||
int eql_pos = data_ary[itm_idx + Mwh_atr_mgr.Idx_eql_pos];
|
||||
int qte_tid = data_ary[itm_idx + Mwh_atr_mgr.Idx_atr_utl];
|
||||
qte_tid = Mwh_atr_itm_.Calc_qte_tid(qte_tid);
|
||||
if (!key_exists) val_bry_manual = key_bry;
|
||||
Mwh_atr_itm atr = new Mwh_atr_itm(src, valid, repeated, key_exists, atr_bgn, atr_end, key_bgn, key_end, key_bry, val_bgn, val_end, val_bry_manual, eql_pos, qte_tid);
|
||||
list.Add(atr);
|
||||
}
|
||||
public void On_txt_end(Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {}
|
||||
public void On_nde_head_bgn(Mwh_doc_parser mgr, byte[] src, int nde_tid, int key_bgn, int key_end) {}
|
||||
public void On_nde_head_end(Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end, boolean inline) {}
|
||||
public void On_nde_tail_end(Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {}
|
||||
public void On_comment_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {}
|
||||
public void On_entity_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {}
|
||||
|
||||
public Mwh_atr_itm[] To_atr_ary() {return (Mwh_atr_itm[])list.To_ary_and_clear(Mwh_atr_itm.class);}
|
||||
public int Atrs__len() {return list.Len();}
|
||||
public Mwh_atr_itm Atrs__get_at(int i) {return (Mwh_atr_itm)list.Get_at(i);}
|
||||
public void Atrs__clear() {list.Clear();}
|
||||
}
|
||||
Reference in New Issue
Block a user