Mw_parse: Mass checkin of various mediawiki parse files

pull/620/head
gnosygnu 8 years ago
parent 6a5c114998
commit cef2d7e2f6

@ -114,4 +114,9 @@ public class Array_ {
Set_at(trg, i, Get_at(add, i - srcLen)); Set_at(trg, i, Get_at(add, i - srcLen));
return trg; return trg;
} }
// Creates a new array with the same component type and length as src and copies src into it.
// The copy is whatever Copy(src, trg) produces; element objects are not cloned here.
public static Object Clone(Object src) {
	Class<?> elem_type = Component_type(src);
	int elem_count = Len(src);
	Object copy = Create(elem_type, elem_count);
	Copy(src, copy);
	return copy;
}
} }

@ -18,6 +18,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
package gplx; package gplx;
import java.lang.*; import java.lang.*;
import gplx.core.brys.*; import gplx.core.primitives.*; import gplx.core.ios.*; import gplx.core.brys.*; import gplx.core.primitives.*; import gplx.core.ios.*;
import gplx.langs.htmls.entitys.*;
public class Bry_ { public class Bry_ {
public static final String Cls_val_name = "byte[]"; public static final String Cls_val_name = "byte[]";
public static final byte[] Empty = new byte[0]; public static final byte[] Empty = new byte[0];
@ -62,6 +63,7 @@ public class Bry_ {
public static byte[] new_u8(String str) { public static byte[] new_u8(String str) {
try { try {
int str_len = str.length(); int str_len = str.length();
if (str_len == 0) return Bry_.Empty;
int bry_len = new_u8__by_len(str, str_len); int bry_len = new_u8__by_len(str, str_len);
byte[] rv = new byte[bry_len]; byte[] rv = new byte[bry_len];
new_u8__write(str, str_len, rv, 0); new_u8__write(str, str_len, rv, 0);
@ -365,7 +367,7 @@ public class Bry_ {
|| (end < bgn) || (end < bgn)
) )
return or; return or;
return Mid(src, bgn, src.length); return bgn == src_len ? Bry_.Empty : Mid(src, bgn, src_len);
} }
public static byte[] Mid(byte[] src, int bgn, int end) { public static byte[] Mid(byte[] src, int bgn, int end) {
try { try {
@ -1001,6 +1003,33 @@ public class Bry_ {
} }
return rv; return rv;
} }
// Converts every ASCII letter in src to upper case (upper == true) or lower case (upper == false).
// Returns null for null src, src itself when no byte needed changing, else a new array built in tmp.
// Only bytes in a-z / A-Z are touched; all other bytes (including non-ASCII) are copied unchanged.
public static byte[] Xcase__build__all(Bry_bfr tmp, boolean upper, byte[] src) {
	if (src == null) return null;
	final int len = src.length;
	// exclusive bounds of the range to convert: (96,123) == a-z when upper-casing; (64,91) == A-Z when lower-casing
	final int lo = upper ? 96 : 64;
	final int hi = upper ? 123 : 91;
	final int delta = upper ? -32 : 32;
	boolean changed = false;
	for (int pos = 0; pos < len; pos++) {
		byte cur = src[pos];
		if (cur > lo && cur < hi) {
			if (!changed) {
				// first converted byte: flush the untouched prefix
				changed = true;
				tmp.Add_mid(src, 0, pos);
			}
			cur = (byte)(cur + delta);
		}
		if (changed)
			tmp.Add_byte(cur);
	}
	return changed ? tmp.To_bry_and_clear() : src;
}
public static byte[] Ucase__1st(byte[] src) {return Xcase__1st(Bool_.Y, src);} public static byte[] Ucase__1st(byte[] src) {return Xcase__1st(Bool_.Y, src);}
public static byte[] Lcase__1st(byte[] src) {return Xcase__1st(Bool_.N, src);} public static byte[] Lcase__1st(byte[] src) {return Xcase__1st(Bool_.N, src);}
private static byte[] Xcase__1st(boolean upper, byte[] src) { private static byte[] Xcase__1st(boolean upper, byte[] src) {
@ -1076,4 +1105,71 @@ public class Bry_ {
public static byte[] Replace_nl_w_tab(byte[] src, int bgn, int end) { public static byte[] Replace_nl_w_tab(byte[] src, int bgn, int end) {
return Bry_.Replace(Bry_.Mid(src, bgn, end), Byte_ascii.Nl, Byte_ascii.Tab); return Bry_.Replace(Bry_.Mid(src, bgn, end), Byte_ascii.Nl, Byte_ascii.Tab);
} }
// Escapes html special chars in all of src and returns the result as a byte[].
// Returns null when src is null (previously threw NullPointerException on src.length).
public static byte[] Escape_html(byte[] src) {
	if (src == null) return null;
	return Escape_html(null, src, 0, src.length);
}
// uses PHP rules for htmlspecialchars; REF.PHP:http://php.net/manual/en/function.htmlspecialchars.php
// Escapes & " ' < > in src[src_bgn..src_end).
// - bfr == null: returns the escaped bytes (src itself, or a Mid of it, when nothing was escaped)
// - bfr != null: appends the escaped bytes to bfr and returns null
public static byte[] Escape_html(Bry_bfr bfr, byte[] src, int src_bgn, int src_end) {
	final boolean owns_bfr = bfr == null;
	boolean found = false;
	int pending_bgn = src_bgn;	// start of the not-yet-flushed run of unescaped bytes
	for (int pos = src_bgn; pos != src_end; pos++) {
		byte[] entity;
		switch (src[pos]) {
			case Byte_ascii.Amp:	entity = Gfh_entity_.Amp_bry; break;
			case Byte_ascii.Quote:	entity = Gfh_entity_.Quote_bry; break;
			case Byte_ascii.Apos:	entity = Gfh_entity_.Apos_num_bry; break;
			case Byte_ascii.Lt:		entity = Gfh_entity_.Lt_bry; break;
			case Byte_ascii.Gt:		entity = Gfh_entity_.Gt_bry; break;
			default:				entity = null; break;
		}
		if (entity == null) continue;	// ordinary byte; stays in the pending run

		found = true;
		if (bfr == null) bfr = Bry_bfr_.New();	// lazily create a buffer only when something must be escaped
		if (pending_bgn < pos)
			bfr.Add_mid(src, pending_bgn, pos);
		bfr.Add(entity);
		pending_bgn = pos + 1;
	}

	if (found) {
		// flush the tail after the last escaped byte
		bfr.Add_mid(src, pending_bgn, src_end);
		return owns_bfr ? bfr.To_bry_and_clear() : null;
	}
	if (!owns_bfr) {
		// nothing escaped, but the caller's buffer still needs the raw bytes
		bfr.Add_mid(src, src_bgn, src_end);
		return null;
	}
	// nothing escaped and no buffer: avoid a copy when the whole array was requested
	return (src_bgn == 0 && src_end == src.length)
		? src
		: Bry_.Mid(src, src_bgn, src_end);
}
} }

@ -279,8 +279,13 @@ public class Bry__tst {
@Test public void Repeat_bry() { @Test public void Repeat_bry() {
fxt.Test__repeat_bry("abc" , 3, "abcabcabc"); fxt.Test__repeat_bry("abc" , 3, "abcabcabc");
} }
@Test public void Xcase__build__all() {
fxt.Test__xcase__build__all(Bool_.N, "abc", "abc");
fxt.Test__xcase__build__all(Bool_.N, "aBc", "abc");
}
} }
class Bry__fxt { class Bry__fxt {
private final Bry_bfr tmp = Bry_bfr_.New();
public void Test_trim_end(String raw, byte trim, String expd) { public void Test_trim_end(String raw, byte trim, String expd) {
byte[] raw_bry = Bry_.new_a7(raw); byte[] raw_bry = Bry_.new_a7(raw);
Tfds.Eq(expd, String_.new_u8(Bry_.Trim_end(raw_bry, trim, raw_bry.length))); Tfds.Eq(expd, String_.new_u8(Bry_.Trim_end(raw_bry, trim, raw_bry.length)));
@ -298,4 +303,7 @@ class Bry__fxt {
public void Test__repeat_bry(String s, int count, String expd) { public void Test__repeat_bry(String s, int count, String expd) {
Gftest.Eq__str(expd, Bry_.Repeat_bry(Bry_.new_u8(s), count)); Gftest.Eq__str(expd, Bry_.Repeat_bry(Bry_.new_u8(s), count));
} }
public void Test__xcase__build__all(boolean upper, String src, String expd) {
Gftest.Eq__str(expd, Bry_.Xcase__build__all(tmp, upper, Bry_.new_u8(src)));
}
} }

@ -297,35 +297,21 @@ public class Bry_bfr {
Add_mid(val, bgn, end); Add_mid(val, bgn, end);
return this; return this;
} }
public Bry_bfr Add_bry_escape_html(byte[] val) {return Add_bry_escape_html(val, 0, val.length);} public Bry_bfr Add_bry_many(byte[]... ary) {
public Bry_bfr Add_bry_escape_html(byte[] val, int bgn, int end) { // uses PHP rules for htmlspecialchars; REF.PHP:http://php.net/manual/en/function.htmlspecialchars.php int len = ary.length;
boolean clean = true; for (int i = 0; i < len; i++) {
for (int i = bgn; i < end; ++i) { byte[] bry = ary[i];
byte[] escaped = null; if (bry != null && bry.length > 0)
byte b = val[i]; this.Add(bry);
switch (b) {
case Byte_ascii.Amp: escaped = Gfh_entity_.Amp_bry; break;
case Byte_ascii.Quote: escaped = Gfh_entity_.Quote_bry; break;
case Byte_ascii.Apos: escaped = Gfh_entity_.Apos_num_bry; break;
case Byte_ascii.Lt: escaped = Gfh_entity_.Lt_bry; break;
case Byte_ascii.Gt: escaped = Gfh_entity_.Gt_bry; break;
}
if (escaped == null && clean) {
continue;
}
else {
if (clean) {
clean = false;
this.Add_mid(val, bgn, i);
} }
if (escaped == null) return this;
this.Add_byte(b);
else
this.Add(escaped);
} }
public Bry_bfr Add_bry_escape_html(byte[] val) {
if (val == null) return this;
return Add_bry_escape_html(val, 0, val.length);
} }
if (clean) public Bry_bfr Add_bry_escape_html(byte[] val, int bgn, int end) {
Add_mid(val, bgn, end); Bry_.Escape_html(this, val, bgn, end);
return this; return this;
} }
public Bry_bfr Add_str_u8_w_nl(String s) {Add_str_u8(s); return Add_byte_nl();} public Bry_bfr Add_str_u8_w_nl(String s) {Add_str_u8(s); return Add_byte_nl();}
@ -542,6 +528,30 @@ public class Bry_bfr {
this.Del_by(count); this.Del_by(count);
return this; return this;
} }
// Removes trailing whitespace bytes (as flagged in Trim_end_ws_ary) from the end of the buffer.
// Returns this for chaining.
public Bry_bfr Trim_end_ws() {
	if (bfr_len == 0) return this;
	int count = 0;
	for (int i = bfr_len - 1; i > -1; --i) {
		// Java bytes are signed: mask to 0-255 so bytes >= 0x80 (EX: UTF-8 multi-byte chars)
		// index the 256-slot table instead of throwing ArrayIndexOutOfBoundsException
		if (Trim_end_ws_ary[bfr[i] & 0xFF])
			++count;
		else
			break;
	}
	if (count > 0)
		this.Del_by(count);
	return this;
}
// Lookup table indexed by byte value (0-255); true marks a byte treated as trailing whitespace.
private static final boolean[] Trim_end_ws_ary = Trim_end_ws_new();
// Builds the whitespace table: space, tab, line feed, carriage return, vertical tab.
private static boolean[] Trim_end_ws_new() {
	boolean[] flags = new boolean[256];
	int[] ws_codes = {32, 9, 10, 13, 11};
	for (int code : ws_codes)
		flags[code] = true;
	return flags;
}
public Bry_bfr Concat_skip_empty(byte[] dlm, byte[]... ary) { public Bry_bfr Concat_skip_empty(byte[] dlm, byte[]... ary) {
int ary_len = ary.length; int ary_len = ary.length;
for (int i = 0; i < ary_len; i++) { for (int i = 0; i < ary_len; i++) {

@ -245,6 +245,13 @@ public class Bry_find_ {
cur += while_len; cur += while_len;
} }
} }
// Skips forward from cur while the byte at cur is flagged in while_ary.
// Returns the position of the first unflagged byte, or end if every byte in [cur, end) is flagged.
// while_ary is indexed by unsigned byte value and is expected to hold 256 entries.
public static int Find_fwd_while_in(byte[] src, int cur, int end, boolean[] while_ary) {
	while (cur < end) {
		// Java bytes are signed: mask to 0-255 so bytes >= 0x80 do not produce a negative index
		if (!while_ary[src[cur] & 0xFF]) return cur;
		cur++;
	}
	return end;
}
public static int Find_fwd_until(byte[] src, int cur, int end, byte until_byte) { public static int Find_fwd_until(byte[] src, int cur, int end, byte until_byte) {
while (true) { while (true) {
if ( cur == end if ( cur == end

@ -16,7 +16,7 @@ You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
package gplx; package gplx;
import org.junit.*; import org.junit.*; import gplx.core.tests.*;
public class Bry_find__tst { public class Bry_find__tst {
private Bry_find__fxt fxt = new Bry_find__fxt(); private Bry_find__fxt fxt = new Bry_find__fxt();
@Test public void Find_fwd() { @Test public void Find_fwd() {
@ -59,6 +59,10 @@ public class Bry_find__tst {
fxt.Test_Trim_bwd_space_tab("" , 0); fxt.Test_Trim_bwd_space_tab("" , 0);
fxt.Test_Trim_bwd_space_tab(" \t" , 0); fxt.Test_Trim_bwd_space_tab(" \t" , 0);
} }
@Test public void Find_fwd_while_in() {
boolean[] while_ary = fxt.Init__find_fwd_while_in(Byte_ascii.Space, Byte_ascii.Tab, Byte_ascii.Nl);
fxt.Test__find_fwd_while_in(" \t\na", while_ary, 3);
}
} }
class Bry_find__fxt { class Bry_find__fxt {
public void Test_Find_fwd(String src, String lkp, int bgn, int expd) {Tfds.Eq(expd, Bry_find_.Find_fwd(Bry_.new_u8(src), Bry_.new_u8(lkp), bgn));} public void Test_Find_fwd(String src, String lkp, int bgn, int expd) {Tfds.Eq(expd, Bry_find_.Find_fwd(Bry_.new_u8(src), Bry_.new_u8(lkp), bgn));}
@ -74,4 +78,15 @@ class Bry_find__fxt {
int actl = Bry_find_.Trim_fwd_space_tab(raw_bry, 0, raw_bry.length); int actl = Bry_find_.Trim_fwd_space_tab(raw_bry, 0, raw_bry.length);
Tfds.Eq(expd, actl, raw_str); Tfds.Eq(expd, actl, raw_str);
} }
// Builds a 256-slot lookup table with a true entry for each byte in ary.
public boolean[] Init__find_fwd_while_in(byte... ary) {
	boolean[] rv = new boolean[256];
	int len = ary.length;
	for (int i = 0; i < len; i++)
		rv[ary[i] & 0xFF] = true;	// mask: Java bytes are signed, so bytes >= 0x80 would otherwise index negatively
	return rv;
}
public void Test__find_fwd_while_in(String src, boolean[] ary, int expd) {
byte[] src_bry = Bry_.new_u8(src);
Gftest.Eq__int(expd, Bry_find_.Find_fwd_while_in(src_bry, 0, src_bry.length, ary));
}
} }

@ -48,7 +48,7 @@ public class Bry_split_ {
boolean reset = true; boolean reset = true;
if (itm_bgn == -1) { if (itm_bgn == -1) {
if (pos_is_last) {} // skip dlm at bgn / end; EX: "a," if (pos_is_last) {} // skip dlm at bgn / end; EX: "a,"
else {wkr.Split(src, itm_bgn, itm_end);} // else, process "empty" dlm; EX: ",a" else {wkr.Split(src, pos, pos );} // else, process "empty" dlm; EX: ",a"
} }
else { else {
int rv = wkr.Split(src, itm_bgn, itm_end); int rv = wkr.Split(src, itm_bgn, itm_end);

@ -43,6 +43,9 @@ public class Bry_split__tst {
fxt.Test_split("a|b|c|d" , 2, 6, "|", "b", "c"); fxt.Test_split("a|b|c|d" , 2, 6, "|", "b", "c");
fxt.Test_split("a|b|c|d" , 2, 4, "|", "b"); fxt.Test_split("a|b|c|d" , 2, 4, "|", "b");
} }
@Test public void Empty() {
fxt.Test_split("a\n\nb" , Byte_ascii.Nl, Bool_.N, "a", "", "b");
}
@Test public void Split_w_max() { @Test public void Split_w_max() {
fxt.Test__split_w_max("a|b|c|d" , Byte_ascii.Pipe, 2, "a", "b"); // max is less fxt.Test__split_w_max("a|b|c|d" , Byte_ascii.Pipe, 2, "a", "b"); // max is less
fxt.Test__split_w_max("a" , Byte_ascii.Pipe, 2, "a", null); // max is more fxt.Test__split_w_max("a" , Byte_ascii.Pipe, 2, "a", null); // max is more

@ -109,6 +109,14 @@ public class Btrie_slim_mgr implements Btrie_mgr {
Add_obj(Bry_.new_u8(ary[i]), bval); Add_obj(Bry_.new_u8(ary[i]), bval);
return this; return this;
} }
// Adds each string to the trie, using its UTF-8 bytes as both the key and the stored value.
// Returns this for chaining.
public Btrie_slim_mgr Add_many_str(String... ary) {
	for (String str : ary) {
		byte[] key = Bry_.new_u8(str);
		Add_obj(key, key);
	}
	return this;
}
public Btrie_slim_mgr Add_many_int(int val, String... ary) {return Add_many_int(val, Bry_.Ary(ary));} public Btrie_slim_mgr Add_many_int(int val, String... ary) {return Add_many_int(val, Bry_.Ary(ary));}
public Btrie_slim_mgr Add_many_int(int val, byte[]... ary) { public Btrie_slim_mgr Add_many_int(int val, byte[]... ary) {
int len = ary.length; int len = ary.length;

@ -87,11 +87,31 @@ public class Hex_utl_ {
public static void Write(byte[] bry, int bgn, int end, int val) { public static void Write(byte[] bry, int bgn, int end, int val) {
for (int i = end - 1; i > bgn - 1; i--) { for (int i = end - 1; i > bgn - 1; i--) {
int b = val % 16; int b = val % 16;
bry[i] = To_byte(b); bry[i] = To_byte_ucase(b);
val /= 16; val /= 16;
if (val == 0) break; if (val == 0) break;
} }
} }
// Appends val to bfr as hex digits with no padding; EX: 255 -> "ff" (lcase) or "FF" (ucase).
// NOTE(review): negative val produces negative nibbles (val % 16); result depends on To_byte_lcase / To_byte_ucase - confirm callers only pass val >= 0
public static void Write_bfr(Bry_bfr bfr, boolean lcase, int val) {
	// count hex digits; always at least one so that 0 is written as "0"
	int digits = 0;
	int remaining = val;
	do {
		remaining /= 16;
		digits++;
	} while (remaining != 0);

	// reserve space in the buffer; fetch the backing array only after reserving
	int hex_bgn = bfr.Len();
	bfr.Add_byte_repeat(Byte_ascii.Null, digits);
	byte[] raw = bfr.Bfr();

	// fill digits from right (least significant) to left
	for (int pos = hex_bgn + digits - 1; pos >= hex_bgn; pos--) {
		int nibble = val % 16;
		raw[pos] = lcase ? To_byte_lcase(nibble) : To_byte_ucase(nibble);
		val /= 16;
	}
}
public static boolean Is_hex_many(byte... ary) { public static boolean Is_hex_many(byte... ary) {
for (byte itm : ary) { for (byte itm : ary) {
switch (itm) { switch (itm) {
@ -123,7 +143,7 @@ public class Hex_utl_ {
default: throw Err_.new_parse("hexstring", Int_.To_str(val)); default: throw Err_.new_parse("hexstring", Int_.To_str(val));
} }
} }
private static byte To_byte(int v) { private static byte To_byte_ucase(int v) {
switch (v) { switch (v) {
case 0: return Byte_ascii.Num_0; case 1: return Byte_ascii.Num_1; case 2: return Byte_ascii.Num_2; case 3: return Byte_ascii.Num_3; case 4: return Byte_ascii.Num_4; case 0: return Byte_ascii.Num_0; case 1: return Byte_ascii.Num_1; case 2: return Byte_ascii.Num_2; case 3: return Byte_ascii.Num_3; case 4: return Byte_ascii.Num_4;
case 5: return Byte_ascii.Num_5; case 6: return Byte_ascii.Num_6; case 7: return Byte_ascii.Num_7; case 8: return Byte_ascii.Num_8; case 9: return Byte_ascii.Num_9; case 5: return Byte_ascii.Num_5; case 6: return Byte_ascii.Num_6; case 7: return Byte_ascii.Num_7; case 8: return Byte_ascii.Num_8; case 9: return Byte_ascii.Num_9;

@ -16,7 +16,7 @@ You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
package gplx.core.encoders; import gplx.*; import gplx.core.*; package gplx.core.encoders; import gplx.*; import gplx.core.*;
import org.junit.*; import org.junit.*; import gplx.core.tests.*;
public class Hex_utl__tst { public class Hex_utl__tst {
private final Hex_utl__fxt fxt = new Hex_utl__fxt(); private final Hex_utl__fxt fxt = new Hex_utl__fxt();
@Test public void To_int() { @Test public void To_int() {
@ -46,6 +46,15 @@ public class Hex_utl__tst {
fxt.Test__write("[00000000]", 1, 9, 15, "[0000000F]"); fxt.Test__write("[00000000]", 1, 9, 15, "[0000000F]");
fxt.Test__write("[00000000]", 1, 9, 255, "[000000FF]"); fxt.Test__write("[00000000]", 1, 9, 255, "[000000FF]");
} }
@Test public void Write_bfr() {
fxt.Test__write_bfr(Bool_.Y, 0, "0");
fxt.Test__write_bfr(Bool_.Y, 15, "f");
fxt.Test__write_bfr(Bool_.Y, 16, "10");
fxt.Test__write_bfr(Bool_.Y, 32, "20");
fxt.Test__write_bfr(Bool_.Y, 255, "ff");
fxt.Test__write_bfr(Bool_.Y, 256, "100");
fxt.Test__write_bfr(Bool_.Y, Int_.Max_value, "7fffffff");
}
} }
class Hex_utl__fxt { class Hex_utl__fxt {
public void Test__write(String s, int bgn, int end, int val, String expd) { public void Test__write(String s, int bgn, int end, int val, String expd) {
@ -63,6 +72,11 @@ class Hex_utl__fxt {
String actl = Hex_utl_.To_str(val, pad); String actl = Hex_utl_.To_str(val, pad);
Tfds.Eq(expd, actl); Tfds.Eq(expd, actl);
} }
private final Bry_bfr bfr = Bry_bfr_.New();
public void Test__write_bfr(boolean lcase, int val, String expd) {
Hex_utl_.Write_bfr(bfr, lcase, val);
Gftest.Eq__str(expd, bfr.To_str_and_clear());
}
// public void Test__encode_bry(int val, int pad, String expd) { // public void Test__encode_bry(int val, int pad, String expd) {
// String actl = Hex_utl_.To_str(val, pad); // String actl = Hex_utl_.To_str(val, pad);
// Tfds.Eq(expd, actl); // Tfds.Eq(expd, actl);

@ -16,6 +16,7 @@ You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
package gplx.langs.phps.utls; import gplx.*; import gplx.langs.*; import gplx.langs.phps.*; package gplx.langs.phps.utls; import gplx.*; import gplx.langs.*; import gplx.langs.phps.*;
import gplx.core.btries.*;
import gplx.core.primitives.*; import gplx.core.primitives.*;
public class Php_preg_ { public class Php_preg_ {
public static byte[][] Split(Int_list list, byte[] src, int src_bgn, int src_end, byte[] dlm, boolean extend) { public static byte[][] Split(Int_list list, byte[] src, int src_bgn, int src_end, byte[] dlm, boolean extend) {
@ -27,7 +28,7 @@ public class Php_preg_ {
while (true) { while (true) {
if (i == src_end) break; if (i == src_end) break;
int dlm_end = i + dlm_len; int dlm_end = i + dlm_len;
if (dlm_end < src_end && Bry_.Eq(src, i, dlm_end, dlm)) { if (dlm_end <= src_end && Bry_.Eq(src, i, dlm_end, dlm)) {
if (extend) { if (extend) {
dlm_end = Bry_find_.Find_fwd_while(src, i, src_end, dlm_nth); dlm_end = Bry_find_.Find_fwd_while(src, i, src_end, dlm_nth);
} }
@ -42,13 +43,33 @@ public class Php_preg_ {
// create brys // create brys
int rv_len = list.Len() - 1; int rv_len = list.Len() - 1;
if (rv_len == 1) return null; if (rv_len == 1) {
list.Clear();
return null;
}
if (list.Get_at(list.Len() - 2) == src_end) { // if 2nd to last elem == src_end, then last item is Bry_.Empty; ignore it; EX: "a''" -> "a", "''" x> "a", "''", ""
rv_len--;
}
byte[][] rv = new byte[rv_len][]; byte[][] rv = new byte[rv_len][];
for (i = 0; i < rv_len; i += 2) { for (i = 0; i < rv_len; i += 2) {
rv[i ] = Bry_.Mid(src, list.Get_at(i + 0), list.Get_at(i + 1)); rv[i ] = Bry_.Mid(src, list.Get_at(i + 0), list.Get_at(i + 1));
if (i + 1 == rv_len) break; if (i + 1 == rv_len) break;
rv[i + 1] = Bry_.Mid(src, list.Get_at(i + 1), list.Get_at(i + 2)); rv[i + 1] = Bry_.Mid(src, list.Get_at(i + 1), list.Get_at(i + 2));
} }
list.Clear();
return rv; return rv;
} }
// Scans src[src_bgn..src_end) and returns the first object the trie matches at any position; null if none.
// On a miss, advances by the length Utf8_ reports for the char's first byte.
public static Object Match(Btrie_slim_mgr trie, Btrie_rv trv, byte[] src, int src_bgn, int src_end) {
	for (int pos = src_bgn; pos < src_end; ) {
		byte first = src[pos];
		Object found = trie.Match_at_w_b0(trv, first, src, pos, src_end);
		if (found != null) return found;
		pos += gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(first);
	}
	return null;
}
} }

@ -21,11 +21,12 @@ public class Php_preg___tst {
private final Php_preg___fxt fxt = new Php_preg___fxt(); private final Php_preg___fxt fxt = new Php_preg___fxt();
@Test public void Basic() {fxt.Test__split("a''b''c" , "''", Bool_.Y, "a", "''", "b", "''", "c");} @Test public void Basic() {fxt.Test__split("a''b''c" , "''", Bool_.Y, "a", "''", "b", "''", "c");}
@Test public void Extend() {fxt.Test__split("a'''b'''c" , "''", Bool_.Y, "a", "'''", "b", "'''", "c");} @Test public void Extend() {fxt.Test__split("a'''b'''c" , "''", Bool_.Y, "a", "'''", "b", "'''", "c");}
@Test public void Eos() {fxt.Test__split("a''" , "''", Bool_.Y, "a", "''");}
} }
class Php_preg___fxt { class Php_preg___fxt {
private final gplx.core.primitives.Int_list rv = new gplx.core.primitives.Int_list();
public void Test__split(String src, String dlm, boolean extend, String... expd) {Test__split(src, 0, String_.Len(src), dlm, extend, expd);} public void Test__split(String src, String dlm, boolean extend, String... expd) {Test__split(src, 0, String_.Len(src), dlm, extend, expd);}
public void Test__split(String src, int src_bgn, int src_end, String dlm, boolean extend, String... expd) { public void Test__split(String src, int src_bgn, int src_end, String dlm, boolean extend, String... expd) {
gplx.core.primitives.Int_list rv = new gplx.core.primitives.Int_list();
byte[][] actl = Php_preg_.Split(rv, Bry_.new_u8(src), src_bgn, src_end, Bry_.new_u8(dlm), extend); byte[][] actl = Php_preg_.Split(rv, Bry_.new_u8(src), src_bgn, src_end, Bry_.new_u8(dlm), extend);
Gftest.Eq__ary(expd, String_.Ary(actl), "find_failed"); Gftest.Eq__ary(expd, String_.Ary(actl), "find_failed");
} }

@ -16,7 +16,11 @@ You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
package gplx.langs.phps.utls; import gplx.*; import gplx.langs.*; import gplx.langs.phps.*; package gplx.langs.phps.utls; import gplx.*; import gplx.langs.*; import gplx.langs.phps.*;
import gplx.core.btries.*;
public class Php_str_ { public class Php_str_ {
// Thin wrapper over Bry_find_.Find_fwd for a single byte between bgn and end.
// The not-found result is whatever Bry_find_.Find_fwd returns.
public static int Strpos(byte[] src, byte find, int bgn, int end) {
	int pos = Bry_find_.Find_fwd(src, find, bgn, end);
	return pos;
}
public static byte[] Substr(byte[] src, int bgn) {return Substr(src, bgn, src.length);} public static byte[] Substr(byte[] src, int bgn) {return Substr(src, bgn, src.length);}
public static byte[] Substr(byte[] src, int bgn, int len) { public static byte[] Substr(byte[] src, int bgn, int len) {
int src_len = src.length; int src_len = src.length;
@ -29,12 +33,24 @@ public class Php_str_ {
public static byte Substr_byte(byte[] src, int bgn) {return Substr_byte(src, bgn, src.length);} public static byte Substr_byte(byte[] src, int bgn) {return Substr_byte(src, bgn, src.length);}
public static byte Substr_byte(byte[] src, int bgn, int len) { public static byte Substr_byte(byte[] src, int bgn, int len) {
int src_len = src.length; int src_len = src.length;
if (src_len == 0) return Byte_ascii.Null;
if (bgn < 0) bgn = src_len + bgn; // handle negative if (bgn < 0) bgn = src_len + bgn; // handle negative
if (bgn < 0) bgn = 0; // handle out of bounds; EX: ("a", -1, -1) if (bgn < 0) bgn = 0; // handle out of bounds; EX: ("a", -1, -1)
int end = len < 0 ? src_len + len : bgn + len; int end = len < 0 ? src_len + len : bgn + len;
if (end > src.length) end = src.length;; // handle out of bounds; if (end > src.length) end = src.length;; // handle out of bounds;
return src[bgn]; return src[bgn];
} }
// Counts consecutive bytes, starting at bgn, that are flagged in find.
// Stops at the first unflagged byte, at src_len, or once max bytes are counted (max == -1 means no limit).
// find is indexed by unsigned byte value and is expected to hold 256 entries.
public static int Strspn_fwd__ary(byte[] src, boolean[] find, int bgn, int max, int src_len) {
	if (max == -1) max = src_len;
	int rv = 0;
	for (int i = bgn; i < src_len; i++) {
		// mask to 0-255: Java bytes are signed, so bytes >= 0x80 would otherwise throw ArrayIndexOutOfBoundsException
		if (rv >= max || !find[src[i] & 0xFF]) break;
		rv++;
	}
	return rv;
}
public static int Strspn_fwd__byte(byte[] src, byte find, int bgn, int max, int src_len) { public static int Strspn_fwd__byte(byte[] src, byte find, int bgn, int max, int src_len) {
if (max == -1) max = src_len; if (max == -1) max = src_len;
int rv = 0; int rv = 0;
@ -91,4 +107,31 @@ public class Php_str_ {
} }
return rv; return rv;
} }
// Replaces every trie match in src with the byte[] stored for that match.
// Returns src itself when nothing matched; otherwise a new array built in tmp.
// Trie values must be byte[]; after a match, scanning resumes at trv.Pos().
public static byte[] Strtr(byte[] src, Btrie_slim_mgr trie, Bry_bfr tmp, Btrie_rv trv) {
	final int len = src.length;
	boolean replaced = false;
	int pos = 0;
	while (pos != len) {
		byte cur = src[pos];
		Object val = trie.Match_at_w_b0(trv, cur, src, pos, len);
		if (val != null) {
			if (!replaced) {
				// first match: flush the untouched prefix
				replaced = true;
				tmp.Add_mid(src, 0, pos);
			}
			tmp.Add((byte[])val);
			pos = trv.Pos();
		}
		else {
			// no match: only copy the byte once a replacement has started
			if (replaced)
				tmp.Add_byte(cur);
			pos++;
		}
	}
	return replaced ? tmp.To_bry_and_clear() : src;
}
} }

@ -16,7 +16,7 @@ You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
package gplx.langs.phps.utls; import gplx.*; import gplx.langs.*; import gplx.langs.phps.*; package gplx.langs.phps.utls; import gplx.*; import gplx.langs.*; import gplx.langs.phps.*;
import org.junit.*; import gplx.core.tests.*; import org.junit.*; import gplx.core.tests.*; import gplx.core.btries.*;
public class Php_str___tst { public class Php_str___tst {
private final Php_str___fxt fxt = new Php_str___fxt(); private final Php_str___fxt fxt = new Php_str___fxt();
@Test public void Strspn_fwd__byte() { @Test public void Strspn_fwd__byte() {
@ -43,6 +43,14 @@ public class Php_str___tst {
fxt.Test__substr("abcde" , -1, "e"); fxt.Test__substr("abcde" , -1, "e");
fxt.Test__substr("abcde" , -3, -1, "cd"); fxt.Test__substr("abcde" , -3, -1, "cd");
} }
@Test public void Strtr() {
fxt.Init__strtr_by_trie("01", "89", "02", "79");
fxt.Test__strtr_by_trie("abc" , "abc"); // found=none
fxt.Test__strtr_by_trie("ab_01_cd" , "ab_89_cd"); // found=one
fxt.Test__strtr_by_trie("ab_01_cd_02_ef", "ab_89_cd_79_ef"); // found=many
fxt.Test__strtr_by_trie("01_ab" , "89_ab"); // BOS
fxt.Test__strtr_by_trie("ab_01" , "ab_89"); // EOS
}
} }
class Php_str___fxt { class Php_str___fxt {
public void Test__strspn_fwd__byte(String src_str, byte find, int bgn, int max, int expd) { public void Test__strspn_fwd__byte(String src_str, byte find, int bgn, int max, int expd) {
@ -63,4 +71,17 @@ class Php_str___fxt {
public void Test__substr(String src_str, int bgn, int len, String expd) { public void Test__substr(String src_str, int bgn, int len, String expd) {
Gftest.Eq__str(expd, Php_str_.Substr(Bry_.new_u8(src_str), bgn, len)); Gftest.Eq__str(expd, Php_str_.Substr(Bry_.new_u8(src_str), bgn, len));
} }
private Btrie_slim_mgr strtr_trie;
public void Init__strtr_by_trie(String... kvs) {
if (strtr_trie == null) strtr_trie = Btrie_slim_mgr.cs();
int len = kvs.length;
for (int i = 0; i < len; i += 2) {
strtr_trie.Add_str_str(kvs[i], kvs[i + 1]);
}
}
public void Test__strtr_by_trie(String src, String expd) {
Bry_bfr tmp = Bry_bfr_.New();
Btrie_rv trv = new Btrie_rv();
Gftest.Eq__str(expd, Php_str_.Strtr(Bry_.new_u8(src), strtr_trie, tmp, trv));
}
} }

@ -132,6 +132,67 @@ public class Xoa_ttl { // PAGE:en.w:http://en.wikipedia.org/wiki/Help:Link; REF.
return Bry_.Mid(full_txt, page_bgn, ques_pos == Bry_find_.Not_found ? full_txt_len : ques_pos); return Bry_.Mid(full_txt, page_bgn, ques_pos == Bry_find_.Not_found ? full_txt_len : ques_pos);
} }
public byte[] Get_prefixed_text() {return Full_txt_wo_qarg();}
public byte[] Get_prefixed_db_key() {return Full_db();}
public boolean Has_fragment() {return anch_bgn != -1;}
public byte[] Get_fragment() {return Anch_txt();}
// Returns the link url for this title: the wiki href prefix (Xoh_href_.Bry__wiki) followed by Full_db_w_anch().
// NOTE: query, query2 and proto are currently ignored; the commented-out PHP below is the reference logic that is not ported here.
public byte[] Get_link_url(byte[] query, boolean query2, boolean proto) {
// if ( $this->isExternal() || $proto !== false ) {
// $ret = $this->getFullURL( $query, $query2, $proto );
// }
// else if ( $this->getPrefixedText() === '' && $this->hasFragment() ) {
// $ret = $this->getFragmentForURL();
// }
// else {
// $ret = $this->getLocalURL( $query, $query2 ) . $this->getFragmentForURL();
// }
return Bry_.Add(gplx.xowa.htmls.hrefs.Xoh_href_.Bry__wiki, this.Full_db_w_anch());
}
// Stub: always returns false.
// The commented-out PHP below is the reference logic (hook, external titles, per-namespace checks); none of it is ported here.
public boolean Is_always_known() {
// $isKnown = null;
/**
* Allows overriding default behavior for determining if a page exists.
* If $isKnown is kept as null, regular checks happen. If it's
* a boolean, this value is returned by the isKnown method.
*
* @since 1.20
*
* @param Title $title
* @param boolean|null $isKnown
*/
// Hooks::run( 'TitleIsAlwaysKnown', [ $this, &$isKnown ] );
//
// if ( !is_null( $isKnown ) ) {
// return $isKnown;
// }
//
// if ( $this->isExternal() ) {
// return true; // any interwiki link might be viewable, for all we know
// }
//
// switch ( $this->mNamespace ) {
// case NS_MEDIA:
// case NS_FILE:
// // file exists, possibly in a foreign repo
// return (boolean)wfFindFile( $this );
// case NS_SPECIAL:
// // valid special page
// return SpecialPageFactory::exists( $this->getDBkey() );
// case NS_MAIN:
// // selflink, possibly with fragment
// return $this->mDbkeyform == '';
// case NS_MEDIAWIKI:
// // known system message
// return $this->hasSourceText() !== false;
// default:
// return false;
// }
// no checks are implemented: every title is reported as not "always known"
return false;
}
public boolean Is_external() {return this.wik_bgn != -1;}
public static final byte Subpage_spr = Byte_ascii.Slash; // EX: A/B/C public static final byte Subpage_spr = Byte_ascii.Slash; // EX: A/B/C
public static final Xoa_ttl Null = null; public static final Xoa_ttl Null = null;

@ -46,3 +46,12 @@ public class Xocfg_dflt_mgr {
gfs_mgr.Run_url(url); gfs_mgr.Run_url(url);
} }
} }
// Gfo_invk that ignores its arguments and always returns the String supplied at construction.
class Xocfg_dflt_itm__static implements Gfo_invk {
	private final String static_val;
	public Xocfg_dflt_itm__static(String val) {this.static_val = val;}
	// ctx, ikey, k and m are unused; the result is always the constructor value
	public Object Invk(GfsCtx ctx, int ikey, String k, GfoMsg m) {return static_val;}
}

@ -17,6 +17,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
package gplx.xowa.addons.bldrs.updates.files; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.bldrs.*; import gplx.xowa.addons.bldrs.updates.*; package gplx.xowa.addons.bldrs.updates.files; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.bldrs.*; import gplx.xowa.addons.bldrs.updates.*;
import gplx.xowa.bldrs.*; import gplx.xowa.bldrs.wkrs.*; import gplx.xowa.bldrs.*; import gplx.xowa.bldrs.wkrs.*;
import gplx.xowa.files.*;
public class Xodel_small_cmd extends Xob_cmd__base { public class Xodel_small_cmd extends Xob_cmd__base {
public Xodel_small_cmd(Xob_bldr bldr, Xowe_wiki wiki) {super(bldr, wiki);} public Xodel_small_cmd(Xob_bldr bldr, Xowe_wiki wiki) {super(bldr, wiki);}
private final int[] ext_max_ary = Xobldr__fsdb_db__delete_small_files_.New_ext_max_ary(); private final int[] ext_max_ary = Xobldr__fsdb_db__delete_small_files_.New_ext_max_ary();
@ -31,3 +32,20 @@ public class Xodel_small_cmd extends Xob_cmd__base {
public static final Xob_cmd Prototype = new Xodel_small_cmd(null, null); public static final Xob_cmd Prototype = new Xodel_small_cmd(null, null);
@Override public Xob_cmd Cmd_clone(Xob_bldr bldr, Xowe_wiki wiki) {return new Xodel_small_cmd(bldr, wiki);} @Override public Xob_cmd Cmd_clone(Xob_bldr bldr, Xowe_wiki wiki) {return new Xodel_small_cmd(bldr, wiki);}
} }
// Builds the per-extension "max" table consumed by Xodel_small_cmd (ext_max_ary).
// NOTE(review): the unit of the max values is not visible here - presumably a size threshold; confirm against Xodel_small_mgr.
class Xobldr__fsdb_db__delete_small_files_ {
	// Returns an array indexed by Xof_ext_ id; extensions not listed below keep the default of 0.
	public static int[] New_ext_max_ary() {
		int[] max_by_ext = new int[Xof_ext_.Id__max];
		Ext_max_(max_by_ext,   35, Xof_ext_.Id_svg);
		Ext_max_(max_by_ext,   40, Xof_ext_.Id_gif);
		Ext_max_(max_by_ext,  100, Xof_ext_.Id_png, Xof_ext_.Id_jpg, Xof_ext_.Id_jpeg);
		Ext_max_(max_by_ext,  500, Xof_ext_.Id_tif, Xof_ext_.Id_tiff);
		Ext_max_(max_by_ext,  500, Xof_ext_.Id_xcf);
		Ext_max_(max_by_ext, 1000, Xof_ext_.Id_bmp);
		Ext_max_(max_by_ext,  700, Xof_ext_.Id_webm);
		Ext_max_(max_by_ext, 1000, Xof_ext_.Id_ogv);
		Ext_max_(max_by_ext,  400, Xof_ext_.Id_pdf);
		Ext_max_(max_by_ext,  700, Xof_ext_.Id_djvu);
		return max_by_ext;
	}
	// Assigns max to the slot of every extension id in exts.
	private static void Ext_max_(int[] ary, int max, int... exts) {
		int exts_len = exts.length;
		for (int i = 0; i < exts_len; i++)
			ary[exts[i]] = max;
	}
}

@ -19,6 +19,7 @@ package gplx.xowa.addons.bldrs.updates.files; import gplx.*; import gplx.xowa.*;
import gplx.dbs.*; import gplx.dbs.*;
import gplx.xowa.bldrs.*; import gplx.xowa.bldrs.*;
import gplx.fsdb.*; import gplx.fsdb.meta.*; import gplx.xowa.files.*; import gplx.fsdb.*; import gplx.fsdb.meta.*; import gplx.xowa.files.*;
import gplx.xowa.bldrs.wkrs.*;
class Xodel_small_mgr { class Xodel_small_mgr {
public void Exec(Xowe_wiki wiki, int[] ext_max_ary) { public void Exec(Xowe_wiki wiki, int[] ext_max_ary) {
wiki.Init_assert(); wiki.Init_assert();
@ -53,20 +54,3 @@ class Xodel_small_mgr {
); );
} }
} }
/**
 * Factory for the per-extension "max" table handed to the small-file delete logic.
 * NOTE(review): the unit of each max value is not visible in this class -- confirm against the consumer.
 */
class Xobldr__fsdb_db__delete_small_files_ {
	/**
	 * Creates a new table indexed by Xof_ext_ id.
	 * Slots for the extensions listed below hold that extension's max; every other slot keeps the int default of 0.
	 */
	public static int[] New_ext_max_ary() {
		int[] max_by_ext = new int[Xof_ext_.Id__max];
		// vector / simple raster
		Ext_max_(max_by_ext,   35, Xof_ext_.Id_svg);
		Ext_max_(max_by_ext,   40, Xof_ext_.Id_gif);
		Ext_max_(max_by_ext,  100, Xof_ext_.Id_png, Xof_ext_.Id_jpg, Xof_ext_.Id_jpeg);
		Ext_max_(max_by_ext,  500, Xof_ext_.Id_tif, Xof_ext_.Id_tiff);
		Ext_max_(max_by_ext,  500, Xof_ext_.Id_xcf);
		Ext_max_(max_by_ext, 1000, Xof_ext_.Id_bmp);
		// video
		Ext_max_(max_by_ext,  700, Xof_ext_.Id_webm);
		Ext_max_(max_by_ext, 1000, Xof_ext_.Id_ogv);
		// documents
		Ext_max_(max_by_ext,  400, Xof_ext_.Id_pdf);
		Ext_max_(max_by_ext,  700, Xof_ext_.Id_djvu);
		return max_by_ext;
	}
	/** Stores max into ary at each extension id in exts. */
	private static void Ext_max_(int[] ary, int max, int... exts) {
		int exts_len = exts.length;
		for (int i = 0; i < exts_len; i++) {
			ary[exts[i]] = max;
		}
	}
}

@ -21,7 +21,7 @@ public class Xol_lnki_trail_mgr implements Gfo_invk {
public Xol_lnki_trail_mgr(Xol_lang_itm lang) {} public Xol_lnki_trail_mgr(Xol_lang_itm lang) {}
public void Clear() {trie.Clear();} public void Clear() {trie.Clear();}
public int Count() {return trie.Count();} public int Count() {return trie.Count();}
public Btrie_slim_mgr Trie() {return trie;} Btrie_slim_mgr trie = Btrie_slim_mgr.cs(); public Btrie_slim_mgr Trie() {return trie;} private final Btrie_slim_mgr trie = Btrie_slim_mgr.cs();
public void Add(byte[] v) {trie.Add_obj(v, v);} public void Add(byte[] v) {trie.Add_obj(v, v);}
public void Del(byte[] v) {trie.Del(v);} public void Del(byte[] v) {trie.Del(v);}
private void Add(String... ary) { private void Add(String... ary) {

@ -0,0 +1,819 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws; import gplx.*; import gplx.xowa.*;
import gplx.core.btries.*;
import gplx.xowa.mws.htmls.*;
public class Xomw_linker {
private final Bry_bfr tmp = Bry_bfr_.New();
private final Linker_rel_splitter splitter = new Linker_rel_splitter();
private final Xomw_html_utl html_utl = new Xomw_html_utl();
private byte[] wg_title = null;
private final Btrie_rv trv = new Btrie_rv();
private final byte[][] split_trail_rv = new byte[2][];
private Btrie_slim_mgr split_trail_trie;
private static final byte[] Atr__class = Bry_.new_a7("class"), Atr__rel = Bry_.new_a7("rel"), Atr__href = Bry_.new_a7("href"), Rel__nofollow = Bry_.new_a7("nofollow");
// Stores the trie that Split_trail matches link-trail bytes against.
// The field has no other initializer and Split_trail dereferences it, so this must be called before Split_trail / Make_self_link_obj.
public void Init_by_wiki(Btrie_slim_mgr trie) {
this.split_trail_trie = trie;
}
// /**
// * This function returns an HTML link to the given target. It serves a few
// * purposes:
// * 1) If $target is a Title, the correct URL to link to will be figured
// * out automatically.
// * 2) It automatically adds the usual classes for various types of link
// * targets: "new" for red links, "stub" for short articles, etc.
// * 3) It escapes all attribute values safely so there's no risk of XSS.
// * 4) It provides a default tooltip if the target is a Title (the page
// * name of the target).
// * link() replaces the old functions in the makeLink() family.
// *
// * @since 1.18 Method exists since 1.16 as non-static, made static in 1.18.
// * @deprecated since 1.28, use MediaWiki\Linker\LinkRenderer instead
// *
// * @param Title $target Can currently only be a Title, but this may
// * change to support Images, literal URLs, etc.
// * @param String $html The HTML contents of the <a> element, i.e.,
// * the link text. This is raw HTML and will not be escaped. If null,
// * defaults to the prefixed text of the Title; or if the Title is just a
// * fragment, the contents of the fragment.
// * @param array $customAttribs A key => value array of extra HTML attributes,
// * such as title and class. (href is ignored.) Classes will be
// * merged with the default classes, while other attributes will replace
// * default attributes. All passed attribute values will be HTML-escaped.
// * A false attribute value means to suppress that attribute.
// * @param array $query The query String to append to the URL
// * you're linking to, in key => value array form. Query keys and values
// * will be URL-encoded.
// * @param String|array $options String or array of strings:
// * 'known': Page is known to exist, so don't check if it does.
// * 'broken': Page is known not to exist, so don't check if it does.
// * 'noclasses': Don't add any classes automatically (includes "new",
// * "stub", "mw-redirect", "extiw"). Only use the class attribute
// * provided, if any, so you get a simple blue link with no funny i-
// * cons.
// * 'forcearticlepath': Use the article path always, even with a querystring.
// * Has compatibility issues on some setups, so avoid wherever possible.
// * 'http': Force a full URL with http:// as the scheme.
// * 'https': Force a full URL with https:// as the scheme.
// * 'stubThreshold' => (int): Stub threshold to use when determining link classes.
// * @return String HTML <a> attribute
// */
// public static function link(
// $target, $html = null, $customAttribs = [], $query = [], $options = []
// ) {
// if ( !$target instanceof Title ) {
// wfWarn( __METHOD__ . ': Requires $target to be a Title Object.', 2 );
// return "<!-- ERROR -->$html";
// }
//
// if ( is_string( $query ) ) {
// // some functions within core using this still hand over query strings
// wfDeprecated( __METHOD__ . ' with parameter $query as String (should be array)', '1.20' );
// $query = wfCgiToArray( $query );
// }
//
// $services = MediaWikiServices::getInstance();
// $options = (array)$options;
// if ( $options ) {
// // Custom options, create new LinkRenderer
// if ( !isset( $options['stubThreshold'] ) ) {
// $defaultLinkRenderer = $services->getLinkRenderer();
// $options['stubThreshold'] = $defaultLinkRenderer->getStubThreshold();
// }
// $linkRenderer = $services->getLinkRendererFactory()
// ->createFromLegacyOptions( $options );
// } else {
// $linkRenderer = $services->getLinkRenderer();
// }
//
// if ( $html !== null ) {
// $text = new HtmlArmor( $html );
// } else {
// $text = $html; // null
// }
// if ( in_array( 'known', $options, true ) ) {
// return $linkRenderer->makeKnownLink( $target, $text, $customAttribs, $query );
// } elseif ( in_array( 'broken', $options, true ) ) {
// return $linkRenderer->makeBrokenLink( $target, $text, $customAttribs, $query );
// } elseif ( in_array( 'noclasses', $options, true ) ) {
// return $linkRenderer->makePreloadedLink( $target, $text, '', $customAttribs, $query );
// } else {
// return $linkRenderer->makeLink( $target, $text, $customAttribs, $query );
// }
// }
/**
 * Writes a self-link to bfr as: &lt;strong class="selflink"&gt; + prefix + html + inside + &lt;/strong&gt; + trail,
 * where "inside" is the leading part of trail consumed by Split_trail.
 *
 * @param bfr    output buffer
 * @param nt     title; only used to supply the link text when html is null / empty
 * @param html   link contents; written as-is (not escaped here) when non-empty
 * @param query  currently unused by this port
 * @param trail  text following the link; must be non-null because Split_trail reads its length
 * @param prefix text placed before html inside the strong element
 */
public void Make_self_link_obj(Bry_bfr bfr, Xoa_ttl nt, byte[] html, byte[] query, byte[] trail, byte[] prefix) {
	// MW.HOOK:SelfLinkBegin
	// PHP: if ($html == '') -- a loose comparison, so null and any zero-length value both mean "no html".
	// Check the length rather than identity with Bry_.Empty so that every empty array is recognized.
	if (html == null || html.length == 0) {
		html = tmp.Add_bry_escape_html(nt.Get_prefixed_text()).To_bry_and_clear();
	}
	byte[][] split_trail = Split_trail(trail);
	byte[] inside = split_trail[0];	// null when no leading bytes of trail matched the trail trie
	trail = split_trail[1];
	bfr.Add_str_a7("<strong class=\"selflink\">");
	bfr.Add_bry_many(prefix, html, inside);
	bfr.Add_str_a7("</strong>");
	bfr.Add(trail);
}
/**
 * Writes an external-link anchor to bfr via html_utl.Raw_element.
 * The class attribute becomes "external[ link_type][ existing-class]", rel is merged with the value from
 * Get_external_link_rel, and href is set to url. attribs is modified in place.
 *
 * @param bfr       output buffer
 * @param url       link target; stored as the href attribute
 * @param text      anchor contents; HTML-escaped first when escape is true
 * @param escape    whether text must be HTML-escaped
 * @param link_type optional extra class name; null or empty means none
 * @param attribs   caller-supplied attributes; must be non-null
 * @param title     passed to Get_external_link_rel; falls back to wg_title when null
 */
public void Make_external_link(Bry_bfr bfr, byte[] url, byte[] text, boolean escape, byte[] link_type, Xomwh_atr_mgr attribs, byte[] title) {
	// $class = "external"; if ( $linktype ) $class .= " $linktype";
	tmp.Add_str_a7("external");
	if (link_type != null && link_type.length > 0) {	// skip empty link_type; else class ends with a stray space
		tmp.Add_byte_space().Add(link_type);
	}
	// if ( isset( $attribs['class'] ) && $attribs['class'] ) $class .= " {$attribs['class']}";
	Xomwh_atr_itm cls_itm = attribs.Get_by_or_make(Atr__class);
	byte[] cur_cls = cls_itm.Val();
	if (cur_cls != null && cur_cls.length > 0) {
		tmp.Add_byte_space().Add(cur_cls);	// space separator is required; else "external" and the existing class fuse into one token
	}
	cls_itm.Val_(tmp.To_bry_and_clear());
	if (escape)
		text = tmp.Add_bry_escape_html(text).To_bry_and_clear();
	if (title == null)
		title = wg_title;
	byte[] new_rel = Get_external_link_rel(url, title);
	Xomwh_atr_itm cur_rel_atr = attribs.Get_by_or_make(Atr__rel);
	byte[] cur_rel = cur_rel_atr.Val();
	if (cur_rel == null || cur_rel.length == 0) {	// if ( !isset( $attribs['rel'] ) || $attribs['rel'] === '' )
		cur_rel_atr.Val_(new_rel);
	}
	else if (new_rel != null && new_rel.length > 0) {	// elseif ( $newRel !== '' )
		// Merge the rel attributes. splitter accumulates unique items across both Split calls; new rels come first
		Bry_split_.Split(new_rel, 0, new_rel.length, Byte_ascii.Space, Bool_.N, splitter);	// $newRels = explode( ' ', $newRel );
		Bry_split_.Split(cur_rel, 0, cur_rel.length, Byte_ascii.Space, Bool_.N, splitter);	// $oldRels = explode( ' ', $attribs['rel'] );
		cur_rel_atr.Val_(splitter.To_bry());	// $attribs['rel'] = implode( ' ', $combined );
	}
	//$link = '';
	//$success = Hooks::run( 'LinkerMakeExternalLink',
	// [ &$url, &$text, &$link, &$attribs, $linktype ] );
	//if ( !$success ) {
	// wfDebug( "Hook LinkerMakeExternalLink changed the output of link "
	// . "with url {$url} and text {$text} to {$link}\n", true );
	// return $link;
	//}
	attribs.Set(Atr__href, url);
	html_utl.Raw_element(bfr, Bry_.new_a7("a"), attribs, text);
}
// Returns the rel value to apply to an external link.
// Only the "nofollow" branch of the PHP original is ported: the $wgNoFollow* checks below are still commented out,
// so url and title are currently ignored and Rel__nofollow is always returned.
private byte[] Get_external_link_rel(byte[] url, byte[] title) {
// global $wgNoFollowLinks, $wgNoFollowNsExceptions, $wgNoFollowDomainExceptions;
// $ns = $title ? $title->getNamespace() : false;
// if ( $wgNoFollowLinks && !in_array( $ns, $wgNoFollowNsExceptions )
// && !wfMatchesDomainList( $url, $wgNoFollowDomainExceptions )
// ) {
return Rel__nofollow;
// }
// return null;
}
/**
 * Resolves subpage-relative link targets ("/Foobar", "../Foobar", ...) against context_title and stores the
 * resulting link and display text in rv (via rv.Init).
 *
 * @param rv            receives the normalized link and the (possibly replaced) text
 * @param context_title page the link appears on; when null or in a namespace without subpages, target is passed through unchanged
 * @param target        raw link target; must be non-null
 * @param text          link text; only replaced when it is a zero-length array (PHP: $text === ''), never when null
 */
public void Normalize_subpage_link(Xomw_linker__normalize_subpage_link rv, Xoa_ttl context_title, byte[] target, byte[] text) {
	// Valid link forms:
	// Foobar -- normal
	// :Foobar -- override special treatment of prefix (images, language links)
	// /Foobar -- convert to CurrentPage/Foobar
	// /Foobar/ -- convert to CurrentPage/Foobar, strip the initial and final / from text
	// ../ -- convert to CurrentPage, from CurrentPage/CurrentSubPage
	// ../Foobar -- convert to CurrentPage/Foobar,
	// (from CurrentPage/CurrentSubPage)
	// ../Foobar/ -- convert to CurrentPage/Foobar, use 'Foobar' as text
	// (from CurrentPage/CurrentSubPage)
	byte[] ret = target; // default return value is no change
	// Some namespaces don't allow subpages,
	// so only perform processing if subpages are allowed
	if (context_title != null && context_title.Ns().Subpages_enabled()) {
		// split off "#fragment"; it is re-appended to whatever link is produced
		int hash = Bry_find_.Find_fwd(target, Byte_ascii.Hash);
		byte[] suffix = null;
		if (hash != Bry_find_.Not_found) {
			suffix = Bry_.Mid(target, hash);
			target = Bry_.Mid(target, 0, hash);
		}
		else {
			suffix = Bry_.Empty;
		}
		// bug 7425
		target = Bry_.Trim(target);
		// Look at the first character
		// NOTE: emptiness is tested by length, not by identity with Bry_.Empty; a zero-length array that is not
		// the shared Empty instance must not reach target[0]
		if (target.length > 0 && target[0] == Byte_ascii.Slash) {
			// / at end means we don't want the slash to be shown
			int target_len = target.length;
			int trailing_slashes_bgn = Bry_find_.Find_bwd_while(target, target_len, 0, Byte_ascii.Slash) + 1;
			byte[] no_slash = null;
			if (trailing_slashes_bgn != target_len) {
				no_slash = target = Bry_.Mid(target, 1, trailing_slashes_bgn);
			}
			else {
				no_slash = Bry_.Mid(target, 1);
			}
			ret = Bry_.Add(context_title.Get_prefixed_text(), Byte_ascii.Slash_bry, Bry_.Trim(no_slash), suffix);
			if (text != null && text.length == 0) {
				text = Bry_.Add(target, suffix);
			} // this might be changed for ugliness reasons
		}
		else {
			// check for .. subpage backlinks
			int dot2_count = 0;
			byte[] dot2_stripped = target;
			while (Bry_.Match(dot2_stripped, 0, 3, Bry__dot2)) {
				++dot2_count;
				dot2_stripped = Bry_.Mid(dot2_stripped, 3);
			}
			if (dot2_count > 0) {
				byte[][] exploded = Bry_split_.Split(context_title.Get_prefixed_text(), Byte_ascii.Slash);
				int exploded_len = exploded.length;
				if (exploded_len > dot2_count) { // not allowed to go below top level page
					// PORTED: ret = implode('/', array_slice($exploded, 0, -dot2_count));
					int implode_len = exploded_len - dot2_count;
					for (int i = 0; i < implode_len; i++) {
						if (i != 0) tmp.Add_byte(Byte_ascii.Slash);
						tmp.Add(exploded[i]);
					}
					// / at the end means don't show full path
					if (Bry_.Has_at_end(dot2_stripped, Byte_ascii.Slash)) {
						// PHP: rtrim($nodotdot, '/') removes every trailing slash, not only the last one
						int stripped_end = dot2_stripped.length;
						while (stripped_end > 0 && dot2_stripped[stripped_end - 1] == Byte_ascii.Slash) {
							--stripped_end;
						}
						dot2_stripped = Bry_.Mid(dot2_stripped, 0, stripped_end);
						if (text != null && text.length == 0) {
							text = Bry_.Add(dot2_stripped, suffix);
						}
					}
					dot2_stripped = Bry_.Trim(dot2_stripped);
					if (dot2_stripped.length > 0) {
						tmp.Add_bry_many(Byte_ascii.Slash_bry, dot2_stripped);
					}
					tmp.Add(suffix);
					ret = tmp.To_bry_and_clear();
				}
			}
		}
	}
	rv.Init(ret, text);
}
/**
 * Splits trail into the leading run of bytes matched by the trail trie ("inside") and the remainder.
 *
 * @param trail text following a link; must be non-null
 * @return a 2-element array: [0] is the matched leading part, or null when nothing matched; [1] is the rest
 *         (trail itself when nothing matched). The same array instance is reused on every call, so callers
 *         must read both slots before calling again.
 */
public byte[][] Split_trail(byte[] trail) {
	int trail_len = trail.length;
	int pos = 0;
	// keep consuming trie matches, each one starting where the previous match ended
	Object match;
	while ((match = split_trail_trie.Match_at(trv, trail, pos, trail_len)) != null) {
		pos += ((byte[])match).length;
	}
	boolean has_inside = pos != 0;
	split_trail_rv[0] = has_inside ? Bry_.Mid(trail, 0, pos) : null;
	split_trail_rv[1] = has_inside ? Bry_.Mid(trail, pos, trail_len) : trail;
	return split_trail_rv;
}
// NOT YET PORTED: this method has no executable statements; nothing is written to bfr and all parameters are ignored.
// The body is the commented-out PHP original (it appears to be MediaWiki's Parser::makeImage), kept as a porting reference.
public void Make_image(Bry_bfr bfr, Xoa_ttl title, byte[] options, boolean holders) {
// Check if the options text is of the form "options|alt text"
// Options are:
// * thumbnail make a thumbnail with enlarge-icon and caption, alignment depends on lang
// * left no resizing, just left align. label is used for alt= only
// * right same, but right aligned
// * none same, but not aligned
// * ___px scale to ___ pixels width, no aligning. e.g. use in taxobox
// * center center the image
// * frame Keep original image size, no magnify-button.
// * framed Same as "frame"
// * frameless like 'thumb' but without a frame. Keeps user preferences for width
// * upright reduce width for upright images, rounded to full __0 px
// * border draw a 1px border around the image
// * alt Text for HTML alt attribute (defaults to empty)
// * class Set a class for img node
// * link Set the target of the image link. Can be external, interwiki, or local
// vertical-align values (no % or length right now):
// * baseline
// * sub
// * super
// * top
// * text-top
// * middle
// * bottom
// * text-bottom
// Protect LanguageConverter markup when splitting into parts
// $parts = StringUtils::delimiterExplode(
// '-{', '}-', '|', $options, true /* allow nesting */
// );
// Give extensions a chance to select the file revision for us
// $options = [];
// $descQuery = false;
// MW.HOOK:BeforeParserFetchFileAndTitle
// Fetch and register the file (file title may be different via hooks)
// list($file, $title) = $this->fetchFileAndTitle($title, $options);
// Get parameter map
// $handler = $file ? $file->getHandler() : false;
// list($paramMap, $mwArray) = $this->getImageParams($handler);
// if (!$file) {
// $this->addTrackingCategory('broken-file-category');
// }
// Process the input parameters
// $caption = '';
// $params = [ 'frame' => [], 'handler' => [],
// 'horizAlign' => [], 'vertAlign' => [] ];
// $seenformat = false;
// foreach ($parts as $part) {
// $part = trim($part);
// list($magicName, $value) = $mwArray->matchVariableStartToEnd($part);
// $validated = false;
// if (isset($paramMap[$magicName])) {
// list($type, $paramName) = $paramMap[$magicName];
// Special case; width and height come in one variable together
// if ($type === 'handler' && $paramName === 'width') {
// $parsedWidthParam = $this->parseWidthParam($value);
// if (isset($parsedWidthParam['width'])) {
// $width = $parsedWidthParam['width'];
// if ($handler->validateParam('width', $width)) {
// $params[$type]['width'] = $width;
// $validated = true;
// }
// }
// if (isset($parsedWidthParam['height'])) {
// $height = $parsedWidthParam['height'];
// if ($handler->validateParam('height', $height)) {
// $params[$type]['height'] = $height;
// $validated = true;
// }
// }
// else no validation -- T15436
// } else {
// if ($type === 'handler') {
// // Validate handler parameter
// $validated = $handler->validateParam($paramName, $value);
// } else {
// // Validate internal parameters
// switch ($paramName) {
// case 'manualthumb':
// case 'alt':
// case 'class':
// @todo FIXME: Possibly check validity here for
// manualthumb? downstream behavior seems odd with
// missing manual thumbs.
// $validated = true;
// $value = $this->stripAltText($value, $holders);
// break;
// case 'link':
// $chars = self::EXT_LINK_URL_CLASS;
// $addr = self::EXT_LINK_ADDR;
// $prots = $this->mUrlProtocols;
// if ($value === '') {
// $paramName = 'no-link';
// $value = true;
// $validated = true;
// } elseif (preg_match("/^((?i)$prots)/", $value)) {
// if (preg_match("/^((?i)$prots)$addr$chars*$/u", $value, $m)) {
// $paramName = 'link-url';
// $this->mOutput->addExternalLink($value);
// if ($this->mOptions->getExternalLinkTarget()) {
// $params[$type]['link-target'] = $this->mOptions->getExternalLinkTarget();
// }
// $validated = true;
// }
// } else {
// $linkTitle = Title::newFromText($value);
// if ($linkTitle) {
// $paramName = 'link-title';
// $value = $linkTitle;
// $this->mOutput->addLink($linkTitle);
// $validated = true;
// }
// }
// break;
// case 'frameless':
// case 'framed':
// case 'thumbnail':
// // use first appearing option, discard others.
// $validated = !$seenformat;
// $seenformat = true;
// break;
// default:
// // Most other things appear to be empty or numeric...
// $validated = ($value === false || is_numeric(trim($value)));
// }
// }
// if ($validated) {
// $params[$type][$paramName] = $value;
// }
// }
// }
// if (!$validated) {
// $caption = $part;
// }
// }
// Process alignment parameters
// if ($params['horizAlign']) {
// $params['frame']['align'] = key($params['horizAlign']);
// }
// if ($params['vertAlign']) {
// $params['frame']['valign'] = key($params['vertAlign']);
// }
// $params['frame']['caption'] = $caption;
// Will the image be presented in a frame, with the caption below?
// $imageIsFramed = isset($params['frame']['frame'])
// || isset($params['frame']['framed'])
// || isset($params['frame']['thumbnail'])
// || isset($params['frame']['manualthumb']);
// In the old days, [[Image:Foo|text...]] would set alt text. Later it
// came to also set the caption, ordinary text after the image -- which
// makes no sense, because that just repeats the text multiple times in
// screen readers. It *also* came to set the title attribute.
// Now that we have an alt attribute, we should not set the alt text to
// equal the caption: that's worse than useless, it just repeats the
// text. This is the framed/thumbnail case. If there's no caption, we
// use the unnamed parameter for alt text as well, just for the time be-
// ing, if the unnamed param is set and the alt param is not.
// For the future, we need to figure out if we want to tweak this more,
// e.g., introducing a title= parameter for the title; ignoring the un-
// named parameter entirely for images without a caption; adding an ex-
// plicit caption= parameter and preserving the old magic unnamed para-
// meter for BC; ...
// if ($imageIsFramed) { // Framed image
// if ($caption === '' && !isset($params['frame']['alt'])) {
// // No caption or alt text, add the filename as the alt text so
// // that screen readers at least get some description of the image
// $params['frame']['alt'] = $title->getText();
// }
// Do not set $params['frame']['title'] because tooltips don't make sense
// for framed images
// } else { // Inline image
// if (!isset($params['frame']['alt'])) {
// // No alt text, use the "caption" for the alt text
// if ($caption !== '') {
// $params['frame']['alt'] = $this->stripAltText($caption, $holders);
// } else {
// // No caption, fall back to using the filename for the
// // alt text
// $params['frame']['alt'] = $title->getText();
// }
// }
// Use the "caption" for the tooltip text
// $params['frame']['title'] = $this->stripAltText($caption, $holders);
// }
// MW.HOOK:ParserMakeImageParams
// Linker does the rest
// $time = isset($options['time']) ? $options['time'] : false;
// $ret = Linker::makeImageLink($this, $title, $file, $params['frame'], $params['handler'],
// $time, $descQuery, $this->mOptions->getThumbSize());
// Give the handler a chance to modify the parser Object
// if ($handler) {
// $handler->parserTransformHook($this, $file);
// }
// return $ret;
}
// public function getImageParams($handler) {
// if ($handler) {
// $handlerClass = get_class($handler);
// }
// else {
// $handlerClass = '';
// }
// if (!isset($this->mImageParams[$handlerClass])) {
// Initialise static lists
// static $internalParamNames = [
// 'horizAlign' => [ 'left', 'right', 'center', 'none' ],
// 'vertAlign' => [ 'baseline', 'sub', 'super', 'top', 'text-top', 'middle',
// 'bottom', 'text-bottom' ],
// 'frame' => [ 'thumbnail', 'manualthumb', 'framed', 'frameless',
// 'upright', 'border', 'link', 'alt', 'class' ],
// ];
// static $internalParamMap;
// if (!$internalParamMap) {
// $internalParamMap = [];
// foreach ($internalParamNames as $type => $names) {
// foreach ($names as $name) {
// $magicName = str_replace('-', '_', "img_$name");
// $internalParamMap[$magicName] = [ $type, $name ];
// }
// }
// }
// Add handler params
// $paramMap = $internalParamMap;
// if ($handler) {
// $handlerParamMap = $handler->getParamMap();
// foreach ($handlerParamMap as $magic => $paramName) {
// $paramMap[$magic] = [ 'handler', $paramName ];
// }
// }
// $this->mImageParams[$handlerClass] = $paramMap;
// $this->mImageParamsMagicArray[$handlerClass] = new MagicWordArray(array_keys($paramMap));
// }
// return [ $this->mImageParams[$handlerClass], $this->mImageParamsMagicArray[$handlerClass] ];
// }
// /**
// * Make HTML for a thumbnail including image, border and caption
// * @param Title $title
// * @param File|boolean $file File Object or false if it doesn't exist
// * @param String $label
// * @param String $alt
// * @param String $align
// * @param array $params
// * @param boolean $framed
// * @param String $manualthumb
// * @return String
// */
// public static function makeThumbLinkObj( Title $title, $file, $label = '', $alt,
// $align = 'right', $params = [], $framed = false, $manualthumb = ""
// ) {
// $frameParams = [
// 'alt' => $alt,
// 'caption' => $label,
// 'align' => $align
// ];
// if ( $framed ) {
// $frameParams['framed'] = true;
// }
// if ( $manualthumb ) {
// $frameParams['manualthumb'] = $manualthumb;
// }
// return self::makeThumbLink2( $title, $file, $frameParams, $params );
// }
//
// /**
// * @param Title $title
// * @param File $file
// * @param array $frameParams
// * @param array $handlerParams
// * @param boolean $time
// * @param String $query
// * @return String
// */
// public static function makeThumbLink2( Title $title, $file, $frameParams = [],
// $handlerParams = [], $time = false, $query = ""
// ) {
// $exists = $file && $file->exists();
//
// $page = isset( $handlerParams['page'] ) ? $handlerParams['page'] : false;
// if ( !isset( $frameParams['align'] ) ) {
// $frameParams['align'] = 'right';
// }
// if ( !isset( $frameParams['alt'] ) ) {
// $frameParams['alt'] = '';
// }
// if ( !isset( $frameParams['title'] ) ) {
// $frameParams['title'] = '';
// }
// if ( !isset( $frameParams['caption'] ) ) {
// $frameParams['caption'] = '';
// }
//
// if ( empty( $handlerParams['width'] ) ) {
// // Reduce width for upright images when parameter 'upright' is used
// $handlerParams['width'] = isset( $frameParams['upright'] ) ? 130 : 180;
// }
// $thumb = false;
// $noscale = false;
// $manualthumb = false;
//
// if ( !$exists ) {
// $outerWidth = $handlerParams['width'] + 2;
// } else {
// if ( isset( $frameParams['manualthumb'] ) ) {
// # Use manually specified thumbnail
// $manual_title = Title::makeTitleSafe( NS_FILE, $frameParams['manualthumb'] );
// if ( $manual_title ) {
// $manual_img = wfFindFile( $manual_title );
// if ( $manual_img ) {
// $thumb = $manual_img->getUnscaledThumb( $handlerParams );
// $manualthumb = true;
// } else {
// $exists = false;
// }
// }
// } elseif ( isset( $frameParams['framed'] ) ) {
// // Use image dimensions, don't scale
// $thumb = $file->getUnscaledThumb( $handlerParams );
// $noscale = true;
// } else {
// # Do not present an image bigger than the source, for bitmap-style images
// # This is a hack to maintain compatibility with arbitrary pre-1.10 behavior
// $srcWidth = $file->getWidth( $page );
// if ( $srcWidth && !$file->mustRender() && $handlerParams['width'] > $srcWidth ) {
// $handlerParams['width'] = $srcWidth;
// }
// $thumb = $file->transform( $handlerParams );
// }
//
// if ( $thumb ) {
// $outerWidth = $thumb->getWidth() + 2;
// } else {
// $outerWidth = $handlerParams['width'] + 2;
// }
// }
//
// # ThumbnailImage::toHtml() already adds page= onto the end of DjVu URLs
// # So we don't need to pass it here in $query. However, the URL for the
// # zoom icon still needs it, so we make a unique query for it. See bug 14771
// $url = $title->getLocalURL( $query );
// if ( $page ) {
// $url = wfAppendQuery( $url, [ 'page' => $page ] );
// }
// if ( $manualthumb
// && !isset( $frameParams['link-title'] )
// && !isset( $frameParams['link-url'] )
// && !isset( $frameParams['no-link'] ) ) {
// $frameParams['link-url'] = $url;
// }
//
// $s = "<div class=\"thumb t{$frameParams['align']}\">"
// . "<div class=\"thumbinner\" style=\"width:{$outerWidth}px;\">";
//
// if ( !$exists ) {
// $s .= self::makeBrokenImageLinkObj( $title, $frameParams['title'], '', '', '', $time == true );
// $zoomIcon = '';
// } elseif ( !$thumb ) {
// $s .= wfMessage( 'thumbnail_error', '' )->escaped();
// $zoomIcon = '';
// } else {
// if ( !$noscale && !$manualthumb ) {
// self::processResponsiveImages( $file, $thumb, $handlerParams );
// }
// $params = [
// 'alt' => $frameParams['alt'],
// 'title' => $frameParams['title'],
// 'img-class' => ( isset( $frameParams['class'] ) && $frameParams['class'] !== ''
// ? $frameParams['class'] . ' '
// : '' ) . 'thumbimage'
// ];
// $params = self::getImageLinkMTOParams( $frameParams, $query ) + $params;
// $s .= $thumb->toHtml( $params );
// if ( isset( $frameParams['framed'] ) ) {
// $zoomIcon = "";
// } else {
// $zoomIcon = Html::rawElement( 'div', [ 'class' => 'magnify' ],
// Html::rawElement( 'a', [
// 'href' => $url,
// 'class' => 'internal',
// 'title' => wfMessage( 'thumbnail-more' )->text() ],
// "" ) );
// }
// }
// $s .= ' <div class="thumbcaption">' . $zoomIcon . $frameParams['caption'] . "</div></div></div>";
// return str_replace( "\n", ' ', $s );
// }
// /**
// * Make a "broken" link to an image
// *
// * @since 1.16.3
// * @param Title $title
// * @param String $label Link label (plain text)
// * @param String $query Query String
// * @param String $unused1 Unused parameter kept for b/c
// * @param String $unused2 Unused parameter kept for b/c
// * @param boolean $time A file of a certain timestamp was requested
// * @return String
// */
// public static function makeBrokenImageLinkObj( $title, $label = '',
// $query = '', $unused1 = '', $unused2 = '', $time = false
// ) {
// if ( !$title instanceof Title ) {
// wfWarn( __METHOD__ . ': Requires $title to be a Title Object.' );
// return "<!-- ERROR -->" . htmlspecialchars( $label );
// }
//
// global $wgEnableUploads, $wgUploadMissingFileUrl, $wgUploadNavigationUrl;
// if ( $label == '' ) {
// $label = $title->getPrefixedText();
// }
// $encLabel = htmlspecialchars( $label );
// $currentExists = $time ? ( wfFindFile( $title ) != false ) : false;
//
// if ( ( $wgUploadMissingFileUrl || $wgUploadNavigationUrl || $wgEnableUploads )
// && !$currentExists
// ) {
// $redir = RepoGroup::singleton()->getLocalRepo()->checkRedirect( $title );
//
// if ( $redir ) {
// // We already know it's a redirect, so mark it
// // accordingly
// return self::link(
// $title,
// $encLabel,
// [ 'class' => 'mw-redirect' ],
// wfCgiToArray( $query ),
// [ 'known', 'noclasses' ]
// );
// }
//
// $href = self::getUploadUrl( $title, $query );
//
// return '<a href="' . htmlspecialchars( $href ) . '" class="new" title="' .
// htmlspecialchars( $title->getPrefixedText(), ENT_QUOTES ) . '">' .
// $encLabel . '</a>';
// }
//
// return self::link( $title, $encLabel, [], wfCgiToArray( $query ), [ 'known', 'noclasses' ] );
// }
// /**
// * Create a direct link to a given uploaded file.
// *
// * @since 1.16.3
// * @param Title $title
// * @param String $html Pre-sanitized HTML
// * @param String $time MW timestamp of file creation time
// * @return String HTML
// */
// public static function makeMediaLinkObj( $title, $html = '', $time = false ) {
// $img = wfFindFile( $title, [ 'time' => $time ] );
// return self::makeMediaLinkFile( $title, $img, $html );
// }
//
// /**
// * Create a direct link to a given uploaded file.
// * This will make a broken link if $file is false.
// *
// * @since 1.16.3
// * @param Title $title
// * @param File|boolean $file File Object or false
// * @param String $html Pre-sanitized HTML
// * @return String HTML
// *
// * @todo Handle invalid or missing images better.
// */
// public static function makeMediaLinkFile( Title $title, $file, $html = '' ) {
// if ( $file && $file->exists() ) {
// $url = $file->getUrl();
// $class = 'internal';
// } else {
// $url = self::getUploadUrl( $title );
// $class = 'new';
// }
//
// $alt = $title->getText();
// if ( $html == '' ) {
// $html = $alt;
// }
//
// $ret = '';
// $attribs = [
// 'href' => $url,
// 'class' => $class,
// 'title' => $alt
// ];
//
// if ( !Hooks::run( 'LinkerMakeMediaLinkFile',
// [ $title, $file, &$html, &$attribs, &$ret ] ) ) {
// wfDebug( "Hook LinkerMakeMediaLinkFile changed the output of link "
// . "with url {$url} and text {$html} to {$ret}\n", true );
// return $ret;
// }
//
// return Html::rawElement( 'a', $attribs, $html );
// }
private static final byte[] Bry__dot2 = Bry_.new_a7("../");
}
/**
 * Split worker that collects the distinct items it is fed, in first-seen order, joined by single spaces.
 * State accumulates across Split calls (so several sources can be merged) until To_bry is called.
 */
class Linker_rel_splitter implements gplx.core.brys.Bry_split_wkr {
	private final Hash_adp_bry seen = Hash_adp_bry.cs();
	private final Bry_bfr joined = Bry_bfr_.New();
	/** Receives one item as src[itm_bgn..itm_end); appends it only if it has not been seen since the last To_bry. */
	public int Split(byte[] src, int itm_bgn, int itm_end) { // $combined = array_unique( array_merge( $newRels, $oldRels ) );
		byte[] prior = (byte[])seen.Get_by_mid(src, itm_bgn, itm_end);
		if (prior != null) return Bry_split_.Rv__ok;	// duplicate: nothing to add
		byte[] itm = Bry_.Mid(src, itm_bgn, itm_end);
		seen.Add_as_key_and_val(itm);
		if (joined.Len_gt_0()) joined.Add_byte_space();	// separator only between items
		joined.Add(itm);
		return Bry_split_.Rv__ok;
	}
	/** Returns the space-joined distinct items and resets both the seen-set and the buffer for the next merge. */
	public byte[] To_bry() {
		seen.Clear();
		return joined.To_bry_and_clear();
	}
}

@ -0,0 +1,27 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws; import gplx.*; import gplx.xowa.*;
/**
 * Mutable result holder for Xomw_linker.Normalize_subpage_link: the normalized link target and its display text.
 * Both fields are plain references to the arrays passed to Init; no copies are made.
 */
public class Xomw_linker__normalize_subpage_link {
	/** Normalized link target. */
	public byte[] link;
	/** Display text for the link. */
	public byte[] text;
	/** Overwrites both fields and returns this instance so the holder can be reused. */
	public Xomw_linker__normalize_subpage_link Init(byte[] link, byte[] text) {
		this.text = text;
		this.link = link;
		return this;
	}
}

@ -0,0 +1,43 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws; import gplx.*; import gplx.xowa.*;
import org.junit.*; import gplx.core.tests.*;
/**
 * Cases for Xomw_linker.Normalize_subpage_link.
 * Argument order for every call: page title, link target, link text, expected link, expected text.
 */
public class Xomw_linker__normalize_subpage_link__tst {
	private final Xomw_linker__normalize_subpage_link__fxt fxt = new Xomw_linker__normalize_subpage_link__fxt();
	// plain target: passed through untouched
	@Test public void None() {
		fxt.Test__normalize_subpage_link("A/B/C", "Z", "", "Z", "");
	}
	// "#fragment" is kept on both the link and the generated text
	@Test public void Hash() {
		fxt.Test__normalize_subpage_link("A/B/C", "/Y#Z", "", "A/B/C/Y#Z", "/Y#Z");
	}
	// leading slash: appended to the current page
	@Test public void Slash__basic() {
		fxt.Test__normalize_subpage_link("A/B/C", "/Z", "", "A/B/C/Z", "/Z");
	}
	// leading and trailing slash: slashes are dropped from the text
	@Test public void Slash__slash() {
		fxt.Test__normalize_subpage_link("A/B/C", "/Z/", "", "A/B/C/Z", "Z");
	}
	// bare "../": parent page, text left empty
	@Test public void Dot2__empty() {
		fxt.Test__normalize_subpage_link("A/B/C", "../", "", "A/B", "");
	}
	// two levels up; explicit text is preserved
	@Test public void Dot2__many() {
		fxt.Test__normalize_subpage_link("A/B/C", "../../Z", "z1", "A/Z", "z1");
	}
	// two levels up with trailing slash: text becomes the bare name
	@Test public void Dot2__trailing() {
		fxt.Test__normalize_subpage_link("A/B/C", "../../Z/", "", "A/Z", "Z");
	}
}
// Fixture: owns one linker, one edit wiki and one reusable result holder.
class Xomw_linker__normalize_subpage_link__fxt {
	private final Xomw_linker mgr = new Xomw_linker();
	private final Xowe_wiki wiki;
	private final Xomw_linker__normalize_subpage_link normalize_subpage_link = new Xomw_linker__normalize_subpage_link();
	public Xomw_linker__normalize_subpage_link__fxt() {
		this.wiki = Xoa_app_fxt.Make__wiki__edit(Xoa_app_fxt.Make__app__edit());
	}
	// Runs Normalize_subpage_link for the given page / link / text and checks both outputs written to the holder.
	public void Test__normalize_subpage_link(String page_title_str, String link, String text, String expd_link, String expd_text) {
		byte[] page_title_bry = Bry_.new_u8(page_title_str);
		byte[] link_bry = Bry_.new_u8(link);
		byte[] text_bry = Bry_.new_u8(text);
		mgr.Normalize_subpage_link(normalize_subpage_link, wiki.Ttl_parse(page_title_bry), link_bry, text_bry);
		String actl_link = String_.new_u8(normalize_subpage_link.link);
		Gftest.Eq__str(expd_link, actl_link);
		String actl_text = String_.new_u8(normalize_subpage_link.text);
		Gftest.Eq__str(expd_text, actl_text);
	}
}

@ -0,0 +1,39 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws; import gplx.*; import gplx.xowa.*;
import org.junit.*; import gplx.core.tests.*; import gplx.core.btries.*;
// Tests for Xomw_linker.Split_trail. Fixture argument order: trail, expected inside, expected remaining trail.
public class Xomw_linker__split_trail__tst {
	private final Xomw_linker__split_trail__fxt fxt = new Xomw_linker__split_trail__fxt();
	// leading chars known to the trie go to "inside"; the rest stays in the trail
	@Test public void Basic() {
		fxt.Test__split_trail("abc def", "abc", " def");
	}
	// trail starts with a space, so "inside" is expected to be null
	@Test public void None() {
		fxt.Test__split_trail(" abc", null, " abc");
	}
}
// Fixture: initializes a linker with a case-sensitive trie holding the single letters "a" through "f".
class Xomw_linker__split_trail__fxt {
	private final Xomw_linker linker = new Xomw_linker();
	private final Btrie_slim_mgr trie = Btrie_slim_mgr.cs();
	public Xomw_linker__split_trail__fxt() {
		String[] letters = new String[] {"a", "b", "c", "d", "e", "f"};
		int letters_len = letters.length;
		for (int i = 0; i < letters_len; i++) {
			String letter = letters[i];
			trie.Add_str_str(letter, letter);	// each letter is registered as both key and value
		}
		linker.Init_by_wiki(trie);
	}
	// Splits the trail and compares slot 0 ("inside") and slot 1 (remaining trail) against the expected strings.
	public void Test__split_trail(String trail_str, String expd_inside, String expd_trail) {
		byte[] trail_bry = Bry_.new_u8(trail_str);
		byte[][] actl = linker.Split_trail(trail_bry);
		String actl_inside = String_.new_u8(actl[0]);
		Gftest.Eq__str(expd_inside, actl_inside);
		String actl_trail = String_.new_u8(actl[1]);
		Gftest.Eq__str(expd_trail , actl_trail);
	}
}

@ -0,0 +1,538 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws; import gplx.*; import gplx.xowa.*;
import gplx.core.encoders.*; import gplx.langs.htmls.entitys.*;
import gplx.xowa.parsers.htmls.*;
import gplx.xowa.mws.parsers.*;
public class Xomw_sanitizer {
// receives the attributes found by atr_parser; cleared at the start of every Fix_tag_attributes call
private final Mwh_doc_wkr__atr_bldr atr_bldr = new Mwh_doc_wkr__atr_bldr();
// parses the raw attribute bytes handed to Fix_tag_attributes and reports them to atr_bldr
private final Mwh_atr_parser atr_parser = new Mwh_atr_parser();
// Parses the raw attribute bytes in "atrs" and appends each attribute to bfr as: space, key, '=', '"', value, '"'.
// The key is HTML-escaped; the value is appended as-is (see TODO below). tag_name is currently unused.
public void Fix_tag_attributes(Bry_bfr bfr, byte[] tag_name, byte[] atrs) {
	atr_bldr.Atrs__clear();
	atr_parser.Parse(atr_bldr, -1, -1, atrs, 0, atrs.length);
	// PORTED: Sanitizer.php|safeEncodeTagAttributes
	int atrs_len = atr_bldr.Atrs__len();
	int idx = 0;
	while (idx < atrs_len) {
		// $encAttribute = htmlspecialchars( $attribute );
		// $encValue = Sanitizer::safeEncodeAttribute( $value );
		// $attribs[] = "$encAttribute=\"$encValue\"";
		Mwh_atr_itm atr = atr_bldr.Atrs__get_at(idx);
		idx++;
		bfr.Add_byte_space(); // "return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';"
		bfr.Add_bry_escape_html(atr.Key_bry(), atr.Key_bgn(), atr.Key_end());
		bfr.Add_byte_eq().Add_byte_quote();
		bfr.Add(atr.Val_as_bry()).Add_byte_quote(); // TODO.XO:Sanitizer::encode
	}
}
// Parser-buffer overload: reads the bytes currently held by pbfr.Src(), switches the buffers,
// and writes the normalized output into the buffer that pbfr.Trg() returned before the switch.
public void Normalize_char_references(Xomw_parser_bfr pbfr) {
	// XO.PBFR
	Bry_bfr src_bfr = pbfr.Src();
	byte[] src = src_bfr.Bfr();
	int src_len = src_bfr.Len();	// only the first Len() bytes of the backing array are treated as content
	Bry_bfr trg_bfr = pbfr.Trg();
	pbfr.Switch();
	Normalize_char_references(trg_bfr, Bool_.N, src, 0, src_len);
}
// Normalizes "&...;" character references in src[src_bgn..src_end):
//   - "&name;"   is passed to Normalize_entity
//   - "&#NNN;"   is passed to Dec_char_reference
//   - "&#xHHH;"  is passed to Hex_char_reference
//   - any other "&" (or a numeric reference with an invalid code point) is rewritten with "&amp;"
//
// bfr      : output buffer; may be null, in which case a buffer is created on demand and the result is returned as byte[]
// lone_bfr : only used when bfr is not null and no "&" was found; if true, the unchanged range is still copied into bfr
// returns  : bfr == null -> the normalized bytes (src itself if nothing changed and the range covers all of src)
//            bfr != null -> Bry_.Empty if something was written because an "&" was found; otherwise null
public byte[] Normalize_char_references(Bry_bfr bfr, boolean lone_bfr, byte[] src, int src_bgn, int src_end) {
	// assert static structs
	if (Normalize__dec == null) {
		synchronized (Xomw_sanitizer.class) {
			// re-check under the lock so that a thread which lost the race does not rebuild the tables
			if (Normalize__dec == null) {
				html_entities = Html_entities_new();
				Normalize__hex = Bool_ary_bldr.New_u8()
					.Set_rng(Byte_ascii.Num_0, Byte_ascii.Num_9)
					.Set_rng(Byte_ascii.Ltr_A, Byte_ascii.Ltr_Z)
					.Set_rng(Byte_ascii.Ltr_a, Byte_ascii.Ltr_z)
					.To_ary();
				Normalize__ent = Bool_ary_bldr.New_u8()
					.Set_rng(Byte_ascii.Num_0, Byte_ascii.Num_9)
					.Set_rng(Byte_ascii.Ltr_A, Byte_ascii.Ltr_Z)
					.Set_rng(Byte_ascii.Ltr_a, Byte_ascii.Ltr_z)
					.Set_rng(128, 255)
					.To_ary();
				// Normalize__dec is the guard tested above; assign it last so it is never non-null while the other tables are still unset
				Normalize__dec = Bool_ary_bldr.New_u8().Set_rng(Byte_ascii.Num_0, Byte_ascii.Num_9).To_ary();
			}
		}
	}

	// XO.BRY_BFR
	boolean dirty = false;
	int cur = src_bgn;
	boolean called_by_bry = bfr == null;
	while (true) {
		// search for "&"
		int find_bgn = Bry_find_.Find_fwd(src, Byte_ascii.Amp, cur);
		// "&" not found, or found past src_end (src can be a backing array that holds stale bytes after src_end); exit
		if (find_bgn == Bry_find_.Not_found || find_bgn >= src_end) {
			if (dirty)
				bfr.Add_mid(src, cur, src_end);
			break;
		}
		int ent_bgn = find_bgn + 1;	// +1 to skip &

		// get regex; (a) dec (&#09;); (b) hex (&#xFF;); (c) entity (&alpha;);
		boolean[] regex = null;
		// check for #;
		if (ent_bgn < src_end && src[ent_bgn] == Byte_ascii.Hash) {
			ent_bgn++;
			if (ent_bgn < src_end) {
				byte nxt = src[ent_bgn];
				// check for x
				if (nxt == Byte_ascii.Ltr_X || nxt == Byte_ascii.Ltr_x) {
					ent_bgn++;
					regex = Normalize__hex;
				}
			}
			if (regex == null)
				regex = Normalize__dec;
		}
		else {
			regex = Normalize__ent;
		}

		// keep looping until invalid regex
		int ent_end = ent_bgn;
		byte b = Byte_ascii.Null;
		for (int i = ent_bgn; i < src_end; i++) {
			b = src[i];
			// Java bytes are signed: mask to 0-255 so that bytes >= 0x80 index the 256-slot table instead of throwing
			if (regex[b & 0xFF])
				ent_end++;
			else
				break;
		}

		// mark dirty; can optimize later by checking if "&lt;" already exists
		dirty = true;
		if (bfr == null) bfr = Bry_bfr_.New();
		bfr.Add_mid(src, cur, find_bgn);	// add everything before &

		// invalid <- regex ended, but not at semic
		if (b != Byte_ascii.Semic) {
			bfr.Add(Gfh_entity_.Amp_bry);	// transform "&" to "&amp;"
			cur = find_bgn + 1;				// position after "&"
			continue;
		}

		// do normalization
		byte[] name = Bry_.Mid(src, ent_bgn, ent_end);
		boolean ret = false;
		if (regex == Normalize__ent) {
			Normalize_entity(bfr, name);
			ret = true;
		}
		else if (regex == Normalize__dec) {
			ret = Dec_char_reference(bfr, name);
		}
		else if (regex == Normalize__hex) {
			ret = Hex_char_reference(bfr, name);
		}
		if (!ret) {
			bfr.Add(Gfh_entity_.Amp_bry);	// transform "&" to "&amp;"
			bfr.Add_bry_escape_html(src, find_bgn + 1, ent_end + 1);	// "find_bgn + 1" to start after "&"; "ent_end + 1" to include ";"
		}
		cur = ent_end + 1;	// +1 to position after ";"
	}

	// XO.BRY_BFR
	if (dirty) {
		if (called_by_bry)
			return bfr.To_bry_and_clear();
		else
			return Bry_.Empty;
	}
	else {
		if (called_by_bry) {
			if (src_bgn == 0 && src_end == src.length)
				return src;
			else
				return Bry_.Mid(src, src_bgn, src_end);
		}
		else {
			if (lone_bfr)
				bfr.Add_mid(src, src_bgn, src_end);
			return null;
		}
	}
}
// Appends the normalized form of the named entity "name" (passed without "&" and ";") to bfr.
// A name found in html_entities is replaced by that entry's stored html bytes: a numeric reference
// for the HTML 4.0 / XHTML 1.0 names, the entity itself for the core &lt; &gt; &amp; &quot;,
// and the HTML equivalent for MediaWiki-specific aliases.
// An unknown name is written as HTML-escaped pseudo-entity source (eg &amp;foo;)
private void Normalize_entity(Bry_bfr bfr, byte[] name) {
	Object found = html_entities.Get_by_bry(name);
	if (found != null) {
		bfr.Add(((Xomw_html_ent)found).html);
		return;
	}
	bfr.Add_str_a7("&amp;").Add(name).Add_byte_semic();
}
// Parses "codepoint" as a decimal int (-1 on failure). If the value passes Validate_codepoint,
// appends "&#" + value + ";" to bfr and returns true; otherwise appends nothing and returns false.
private boolean Dec_char_reference(Bry_bfr bfr, byte[] codepoint) {
	int point = Bry_.To_int_or(codepoint, -1);
	if (!Validate_codepoint(point))
		return false;
	bfr.Add_str_a7("&#").Add_int_variable(point).Add_byte_semic();
	return true;
}
// Parses "codepoint" as a hex int (-1 on failure). If the value passes Validate_codepoint,
// appends "&#x" + hex digits + ";" to bfr and returns true; otherwise appends nothing and returns false.
private boolean Hex_char_reference(Bry_bfr bfr, byte[] codepoint) {
	int point = Hex_utl_.Parse_or(codepoint, -1);
	boolean valid = Validate_codepoint(point);
	if (valid) {
		bfr.Add_str_a7("&#x");
		Hex_utl_.Write_bfr(bfr, Bool_.Y, point); // sprintf( '&#x%x;', $point )
		bfr.Add_byte_semic();
	}
	return valid;
}
// Returns true if the code point may be emitted as a numeric character reference.
// Accepted: tab, line feed, 0x20-0x7e, 0xa0-0xd7ff, 0xe000-0xfffd, 0x10000-0x10ffff.
private boolean Validate_codepoint(int codepoint) {
	// U+000C is valid in HTML5 but not allowed in XML.
	// U+000D is valid in XML but not allowed in HTML5.
	// U+007F - U+009F are disallowed in HTML5 (control characters).
	if (codepoint == 0x09 || codepoint == 0x0a) return true;
	if (codepoint < 0x20) return false;		// other control chars, and negative values from a failed parse
	if (codepoint <= 0x7e) return true;
	if (codepoint < 0xa0) return false;		// 0x7f - 0x9f
	if (codepoint <= 0xd7ff) return true;
	if (codepoint < 0xe000) return false;	// 0xd800 - 0xdfff
	if (codepoint <= 0xfffd) return true;
	return codepoint >= 0x10000 && codepoint <= 0x10ffff;
}
// 256-slot lookup tables (one flag per byte value) of the bytes allowed inside a decimal reference, a hex reference and a named entity.
// Built lazily on the first call to Normalize_char_references(Bry_bfr, ...); Normalize__dec == null is used there as the "not yet built" test.
// NOTE(review): these fields are read outside the synchronized block and are not volatile -- confirm whether concurrent first use is possible.
private static boolean[] Normalize__dec, Normalize__hex, Normalize__ent;
// entity name -> Xomw_html_ent; built by Html_entities_new at the same time as the tables above
private static Hash_adp_bry html_entities;
// Builds the case-sensitive table that maps an entity name (without "&" and ";") to its Xomw_html_ent.
// Registration order matters: Html_entities_set keeps the FIRST entry for a name, so the aliases and the
// four core entities (lt, gt, amp, quot) registered at the top win over the same names in the list below.
// Entries added through the (rv, tmp, name, code) overload get "&#" + code + ";" as their html bytes.
private static Hash_adp_bry Html_entities_new() {
Bry_bfr tmp = Bry_bfr_.New();
Hash_adp_bry rv = Hash_adp_bry.cs();
// aliases: both names are mapped to "&rlm;"
Html_entities_set(rv, Xomw_html_ent.Type__alias, -1, "רלמ", "&rlm;");
Html_entities_set(rv, Xomw_html_ent.Type__alias, -1, "رلم", "&rlm;");
// core entities: kept as named entities instead of numeric references
Html_entities_set(rv, Xomw_html_ent.Type__char, 60, "lt", "&lt;");
Html_entities_set(rv, Xomw_html_ent.Type__char, 62, "gt", "&gt;");
Html_entities_set(rv, Xomw_html_ent.Type__char, 38, "amp", "&amp;");
Html_entities_set(rv, Xomw_html_ent.Type__char, 34, "quot", "&quot;");
// List of all named character entities defined in HTML 4.01
// https://www.w3.org/TR/html4/sgml/entities.html
// As well as &apos; which is only defined starting in XHTML1.
Html_entities_set(rv, tmp, "Aacute" , 193);
Html_entities_set(rv, tmp, "aacute" , 225);
Html_entities_set(rv, tmp, "Acirc" , 194);
Html_entities_set(rv, tmp, "acirc" , 226);
Html_entities_set(rv, tmp, "acute" , 180);
Html_entities_set(rv, tmp, "AElig" , 198);
Html_entities_set(rv, tmp, "aelig" , 230);
Html_entities_set(rv, tmp, "Agrave" , 192);
Html_entities_set(rv, tmp, "agrave" , 224);
Html_entities_set(rv, tmp, "alefsym" , 8501);
Html_entities_set(rv, tmp, "Alpha" , 913);
Html_entities_set(rv, tmp, "alpha" , 945);
Html_entities_set(rv, tmp, "amp" , 38); // XO: identical to Type__char entry; note that Type__char should be evaluated first
Html_entities_set(rv, tmp, "and" , 8743);
Html_entities_set(rv, tmp, "ang" , 8736);
Html_entities_set(rv, tmp, "apos" , 39); // New in XHTML & HTML 5; avoid in output for compatibility with IE.
Html_entities_set(rv, tmp, "Aring" , 197);
Html_entities_set(rv, tmp, "aring" , 229);
Html_entities_set(rv, tmp, "asymp" , 8776);
Html_entities_set(rv, tmp, "Atilde" , 195);
Html_entities_set(rv, tmp, "atilde" , 227);
Html_entities_set(rv, tmp, "Auml" , 196);
Html_entities_set(rv, tmp, "auml" , 228);
Html_entities_set(rv, tmp, "bdquo" , 8222);
Html_entities_set(rv, tmp, "Beta" , 914);
Html_entities_set(rv, tmp, "beta" , 946);
Html_entities_set(rv, tmp, "brvbar" , 166);
Html_entities_set(rv, tmp, "bull" , 8226);
Html_entities_set(rv, tmp, "cap" , 8745);
Html_entities_set(rv, tmp, "Ccedil" , 199);
Html_entities_set(rv, tmp, "ccedil" , 231);
Html_entities_set(rv, tmp, "cedil" , 184);
Html_entities_set(rv, tmp, "cent" , 162);
Html_entities_set(rv, tmp, "Chi" , 935);
Html_entities_set(rv, tmp, "chi" , 967);
Html_entities_set(rv, tmp, "circ" , 710);
Html_entities_set(rv, tmp, "clubs" , 9827);
Html_entities_set(rv, tmp, "cong" , 8773);
Html_entities_set(rv, tmp, "copy" , 169);
Html_entities_set(rv, tmp, "crarr" , 8629);
Html_entities_set(rv, tmp, "cup" , 8746);
Html_entities_set(rv, tmp, "curren" , 164);
Html_entities_set(rv, tmp, "dagger" , 8224);
Html_entities_set(rv, tmp, "Dagger" , 8225);
Html_entities_set(rv, tmp, "darr" , 8595);
Html_entities_set(rv, tmp, "dArr" , 8659);
Html_entities_set(rv, tmp, "deg" , 176);
Html_entities_set(rv, tmp, "Delta" , 916);
Html_entities_set(rv, tmp, "delta" , 948);
Html_entities_set(rv, tmp, "diams" , 9830);
Html_entities_set(rv, tmp, "divide" , 247);
Html_entities_set(rv, tmp, "Eacute" , 201);
Html_entities_set(rv, tmp, "eacute" , 233);
Html_entities_set(rv, tmp, "Ecirc" , 202);
Html_entities_set(rv, tmp, "ecirc" , 234);
Html_entities_set(rv, tmp, "Egrave" , 200);
Html_entities_set(rv, tmp, "egrave" , 232);
Html_entities_set(rv, tmp, "empty" , 8709);
Html_entities_set(rv, tmp, "emsp" , 8195);
Html_entities_set(rv, tmp, "ensp" , 8194);
Html_entities_set(rv, tmp, "Epsilon" , 917);
Html_entities_set(rv, tmp, "epsilon" , 949);
Html_entities_set(rv, tmp, "equiv" , 8801);
Html_entities_set(rv, tmp, "Eta" , 919);
Html_entities_set(rv, tmp, "eta" , 951);
Html_entities_set(rv, tmp, "ETH" , 208);
Html_entities_set(rv, tmp, "eth" , 240);
Html_entities_set(rv, tmp, "Euml" , 203);
Html_entities_set(rv, tmp, "euml" , 235);
Html_entities_set(rv, tmp, "euro" , 8364);
Html_entities_set(rv, tmp, "exist" , 8707);
Html_entities_set(rv, tmp, "fnof" , 402);
Html_entities_set(rv, tmp, "forall" , 8704);
Html_entities_set(rv, tmp, "frac12" , 189);
Html_entities_set(rv, tmp, "frac14" , 188);
Html_entities_set(rv, tmp, "frac34" , 190);
Html_entities_set(rv, tmp, "frasl" , 8260);
Html_entities_set(rv, tmp, "Gamma" , 915);
Html_entities_set(rv, tmp, "gamma" , 947);
Html_entities_set(rv, tmp, "ge" , 8805);
Html_entities_set(rv, tmp, "gt" , 62);
Html_entities_set(rv, tmp, "harr" , 8596);
Html_entities_set(rv, tmp, "hArr" , 8660);
Html_entities_set(rv, tmp, "hearts" , 9829);
Html_entities_set(rv, tmp, "hellip" , 8230);
Html_entities_set(rv, tmp, "Iacute" , 205);
Html_entities_set(rv, tmp, "iacute" , 237);
Html_entities_set(rv, tmp, "Icirc" , 206);
Html_entities_set(rv, tmp, "icirc" , 238);
Html_entities_set(rv, tmp, "iexcl" , 161);
Html_entities_set(rv, tmp, "Igrave" , 204);
Html_entities_set(rv, tmp, "igrave" , 236);
Html_entities_set(rv, tmp, "image" , 8465);
Html_entities_set(rv, tmp, "infin" , 8734);
Html_entities_set(rv, tmp, "int" , 8747);
Html_entities_set(rv, tmp, "Iota" , 921);
Html_entities_set(rv, tmp, "iota" , 953);
Html_entities_set(rv, tmp, "iquest" , 191);
Html_entities_set(rv, tmp, "isin" , 8712);
Html_entities_set(rv, tmp, "Iuml" , 207);
Html_entities_set(rv, tmp, "iuml" , 239);
Html_entities_set(rv, tmp, "Kappa" , 922);
Html_entities_set(rv, tmp, "kappa" , 954);
Html_entities_set(rv, tmp, "Lambda" , 923);
Html_entities_set(rv, tmp, "lambda" , 955);
Html_entities_set(rv, tmp, "lang" , 9001);
Html_entities_set(rv, tmp, "laquo" , 171);
Html_entities_set(rv, tmp, "larr" , 8592);
Html_entities_set(rv, tmp, "lArr" , 8656);
Html_entities_set(rv, tmp, "lceil" , 8968);
Html_entities_set(rv, tmp, "ldquo" , 8220);
Html_entities_set(rv, tmp, "le" , 8804);
Html_entities_set(rv, tmp, "lfloor" , 8970);
Html_entities_set(rv, tmp, "lowast" , 8727);
Html_entities_set(rv, tmp, "loz" , 9674);
Html_entities_set(rv, tmp, "lrm" , 8206);
Html_entities_set(rv, tmp, "lsaquo" , 8249);
Html_entities_set(rv, tmp, "lsquo" , 8216);
Html_entities_set(rv, tmp, "lt" , 60);
Html_entities_set(rv, tmp, "macr" , 175);
Html_entities_set(rv, tmp, "mdash" , 8212);
Html_entities_set(rv, tmp, "micro" , 181);
Html_entities_set(rv, tmp, "middot" , 183);
Html_entities_set(rv, tmp, "minus" , 8722);
Html_entities_set(rv, tmp, "Mu" , 924);
Html_entities_set(rv, tmp, "mu" , 956);
Html_entities_set(rv, tmp, "nabla" , 8711);
Html_entities_set(rv, tmp, "nbsp" , 160);
Html_entities_set(rv, tmp, "ndash" , 8211);
Html_entities_set(rv, tmp, "ne" , 8800);
Html_entities_set(rv, tmp, "ni" , 8715);
Html_entities_set(rv, tmp, "not" , 172);
Html_entities_set(rv, tmp, "notin" , 8713);
Html_entities_set(rv, tmp, "nsub" , 8836);
Html_entities_set(rv, tmp, "Ntilde" , 209);
Html_entities_set(rv, tmp, "ntilde" , 241);
Html_entities_set(rv, tmp, "Nu" , 925);
Html_entities_set(rv, tmp, "nu" , 957);
Html_entities_set(rv, tmp, "Oacute" , 211);
Html_entities_set(rv, tmp, "oacute" , 243);
Html_entities_set(rv, tmp, "Ocirc" , 212);
Html_entities_set(rv, tmp, "ocirc" , 244);
Html_entities_set(rv, tmp, "OElig" , 338);
Html_entities_set(rv, tmp, "oelig" , 339);
Html_entities_set(rv, tmp, "Ograve" , 210);
Html_entities_set(rv, tmp, "ograve" , 242);
Html_entities_set(rv, tmp, "oline" , 8254);
Html_entities_set(rv, tmp, "Omega" , 937);
Html_entities_set(rv, tmp, "omega" , 969);
Html_entities_set(rv, tmp, "Omicron" , 927);
Html_entities_set(rv, tmp, "omicron" , 959);
Html_entities_set(rv, tmp, "oplus" , 8853);
Html_entities_set(rv, tmp, "or" , 8744);
Html_entities_set(rv, tmp, "ordf" , 170);
Html_entities_set(rv, tmp, "ordm" , 186);
Html_entities_set(rv, tmp, "Oslash" , 216);
Html_entities_set(rv, tmp, "oslash" , 248);
Html_entities_set(rv, tmp, "Otilde" , 213);
Html_entities_set(rv, tmp, "otilde" , 245);
Html_entities_set(rv, tmp, "otimes" , 8855);
Html_entities_set(rv, tmp, "Ouml" , 214);
Html_entities_set(rv, tmp, "ouml" , 246);
Html_entities_set(rv, tmp, "para" , 182);
Html_entities_set(rv, tmp, "part" , 8706);
Html_entities_set(rv, tmp, "permil" , 8240);
Html_entities_set(rv, tmp, "perp" , 8869);
Html_entities_set(rv, tmp, "Phi" , 934);
Html_entities_set(rv, tmp, "phi" , 966);
Html_entities_set(rv, tmp, "Pi" , 928);
Html_entities_set(rv, tmp, "pi" , 960);
Html_entities_set(rv, tmp, "piv" , 982);
Html_entities_set(rv, tmp, "plusmn" , 177);
Html_entities_set(rv, tmp, "pound" , 163);
Html_entities_set(rv, tmp, "prime" , 8242);
Html_entities_set(rv, tmp, "Prime" , 8243);
Html_entities_set(rv, tmp, "prod" , 8719);
Html_entities_set(rv, tmp, "prop" , 8733);
Html_entities_set(rv, tmp, "Psi" , 936);
Html_entities_set(rv, tmp, "psi" , 968);
Html_entities_set(rv, tmp, "quot" , 34);
Html_entities_set(rv, tmp, "radic" , 8730);
Html_entities_set(rv, tmp, "rang" , 9002);
Html_entities_set(rv, tmp, "raquo" , 187);
Html_entities_set(rv, tmp, "rarr" , 8594);
Html_entities_set(rv, tmp, "rArr" , 8658);
Html_entities_set(rv, tmp, "rceil" , 8969);
Html_entities_set(rv, tmp, "rdquo" , 8221);
Html_entities_set(rv, tmp, "real" , 8476);
Html_entities_set(rv, tmp, "reg" , 174);
Html_entities_set(rv, tmp, "rfloor" , 8971);
Html_entities_set(rv, tmp, "Rho" , 929);
Html_entities_set(rv, tmp, "rho" , 961);
Html_entities_set(rv, tmp, "rlm" , 8207);
Html_entities_set(rv, tmp, "rsaquo" , 8250);
Html_entities_set(rv, tmp, "rsquo" , 8217);
Html_entities_set(rv, tmp, "sbquo" , 8218);
Html_entities_set(rv, tmp, "Scaron" , 352);
Html_entities_set(rv, tmp, "scaron" , 353);
Html_entities_set(rv, tmp, "sdot" , 8901);
Html_entities_set(rv, tmp, "sect" , 167);
Html_entities_set(rv, tmp, "shy" , 173);
Html_entities_set(rv, tmp, "Sigma" , 931);
Html_entities_set(rv, tmp, "sigma" , 963);
Html_entities_set(rv, tmp, "sigmaf" , 962);
Html_entities_set(rv, tmp, "sim" , 8764);
Html_entities_set(rv, tmp, "spades" , 9824);
Html_entities_set(rv, tmp, "sub" , 8834);
Html_entities_set(rv, tmp, "sube" , 8838);
Html_entities_set(rv, tmp, "sum" , 8721);
Html_entities_set(rv, tmp, "sup" , 8835);
Html_entities_set(rv, tmp, "sup1" , 185);
Html_entities_set(rv, tmp, "sup2" , 178);
Html_entities_set(rv, tmp, "sup3" , 179);
Html_entities_set(rv, tmp, "supe" , 8839);
Html_entities_set(rv, tmp, "szlig" , 223);
Html_entities_set(rv, tmp, "Tau" , 932);
Html_entities_set(rv, tmp, "tau" , 964);
Html_entities_set(rv, tmp, "there4" , 8756);
Html_entities_set(rv, tmp, "Theta" , 920);
Html_entities_set(rv, tmp, "theta" , 952);
Html_entities_set(rv, tmp, "thetasym" , 977);
Html_entities_set(rv, tmp, "thinsp" , 8201);
Html_entities_set(rv, tmp, "THORN" , 222);
Html_entities_set(rv, tmp, "thorn" , 254);
Html_entities_set(rv, tmp, "tilde" , 732);
Html_entities_set(rv, tmp, "times" , 215);
Html_entities_set(rv, tmp, "trade" , 8482);
Html_entities_set(rv, tmp, "Uacute" , 218);
Html_entities_set(rv, tmp, "uacute" , 250);
Html_entities_set(rv, tmp, "uarr" , 8593);
Html_entities_set(rv, tmp, "uArr" , 8657);
Html_entities_set(rv, tmp, "Ucirc" , 219);
Html_entities_set(rv, tmp, "ucirc" , 251);
Html_entities_set(rv, tmp, "Ugrave" , 217);
Html_entities_set(rv, tmp, "ugrave" , 249);
Html_entities_set(rv, tmp, "uml" , 168);
Html_entities_set(rv, tmp, "upsih" , 978);
Html_entities_set(rv, tmp, "Upsilon" , 933);
Html_entities_set(rv, tmp, "upsilon" , 965);
Html_entities_set(rv, tmp, "Uuml" , 220);
Html_entities_set(rv, tmp, "uuml" , 252);
Html_entities_set(rv, tmp, "weierp" , 8472);
Html_entities_set(rv, tmp, "Xi" , 926);
Html_entities_set(rv, tmp, "xi" , 958);
Html_entities_set(rv, tmp, "Yacute" , 221);
Html_entities_set(rv, tmp, "yacute" , 253);
Html_entities_set(rv, tmp, "yen" , 165);
Html_entities_set(rv, tmp, "Yuml" , 376);
Html_entities_set(rv, tmp, "yuml" , 255);
Html_entities_set(rv, tmp, "Zeta" , 918);
Html_entities_set(rv, tmp, "zeta" , 950);
Html_entities_set(rv, tmp, "zwj" , 8205);
Html_entities_set(rv, tmp, "zwnj" , 8204);
return rv;
}
// Registers a Type__entity entry whose html bytes are the decimal reference "&#" + code + ";" (built in tmp, which is cleared afterwards).
private static void Html_entities_set(Hash_adp_bry rv, Bry_bfr tmp, String name_str, int code) {
	tmp.Add_str_a7("&#").Add_int_variable(code).Add_byte_semic();
	byte[] numeric_ref = tmp.To_bry_and_clear();
	Html_entities_set(rv, Xomw_html_ent.Type__entity, code, name_str, numeric_ref);
}
// Convenience overload: converts html_str to bytes and delegates to the byte[] overload.
private static void Html_entities_set(Hash_adp_bry rv, byte type, int code, String name_str, String html_str) {
	byte[] html_bry = Bry_.new_u8(html_str);
	Html_entities_set(rv, type, code, name_str, html_bry);
}
// Registers one entity under its name. If the name is already present, the existing entry is kept.
private static void Html_entities_set(Hash_adp_bry rv, byte type, int code, String name_str, byte[] html_bry) {
	byte[] key = Bry_.new_u8(name_str);
	Xomw_html_ent ent = new Xomw_html_ent(type, code, key, html_bry);
	rv.Add_if_dupe_use_1st(key, ent); // Add_dupe needed b/c "lt" and co. are added early; ignore subsequent call
}
}
// Immutable record for one entry of the sanitizer's entity table.
class Xomw_html_ent {
	// kinds of entry; stored in "type"
	public static final byte Type__null = 0, Type__alias = 1, Type__char = 2, Type__entity = 3;
	public final byte type;		// one of the Type__ constants
	public final int code;		// numeric code passed at construction (-1 is passed for aliases)
	public final byte[] name;	// entity name as used for lookup
	public final byte[] html;	// bytes written to the output when the entity is normalized
	public Xomw_html_ent(byte type, int code, byte[] name, byte[] html) {
		this.html = html;
		this.name = name;
		this.code = code;
		this.type = type;
	}
}
// Fluent builder for a boolean lookup table: every slot starts false and is switched on by index.
class Bool_ary_bldr {
	private final boolean[] ary;
	// Creates a table with "len" slots, all false.
	public Bool_ary_bldr(int len) {
		this.ary = new boolean[len];
	}
	// Creates a builder with 256 slots (one per unsigned byte value).
	public static Bool_ary_bldr New_u8() {
		return new Bool_ary_bldr(256);
	}
	// Switches on each listed index.
	public Bool_ary_bldr Set_many(int... v) {
		for (int idx : v) {
			ary[idx] = true;
		}
		return this;
	}
	// Switches on every index from bgn to end, both inclusive; does nothing when bgn > end.
	public Bool_ary_bldr Set_rng(int bgn, int end) {
		int idx = bgn;
		while (idx <= end) {
			ary[idx] = true;
			idx++;
		}
		return this;
	}
	// Returns the underlying array itself (not a copy).
	public boolean[] To_ary() {
		return ary;
	}
}

@ -0,0 +1,44 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws; import gplx.*; import gplx.xowa.*;
import org.junit.*; import gplx.core.tests.*; import gplx.core.btries.*;
// Tests for Xomw_sanitizer.Normalize_char_references. Fixture argument order: source text, expected output.
public class Xomw_sanitizer__tst {
	private final Xomw_sanitizer__fxt fxt = new Xomw_sanitizer__fxt();
	// no "&": text passes through unchanged
	@Test  public void Text()              {fxt.Test__normalize_char_references("abc"               , "abc");}
	// valid decimal reference (tab): kept as a reference, leading zero dropped
	@Test  public void Dec()               {fxt.Test__normalize_char_references("&#09;"             , "&#9;");}
	// decimal reference to a disallowed code point (0x08): the "&" is escaped instead
	@Test  public void Dec__invalid()      {fxt.Test__normalize_char_references("&#08;"             , "&amp;#08;");}
	// valid hex reference: digits are lower-cased
	@Test  public void Hex()               {fxt.Test__normalize_char_references("&#xFF;"            , "&#xff;");}
	// named entity: replaced by its decimal reference
	@Test  public void Entity()            {fxt.Test__normalize_char_references("&alpha;"           , "&#945;");}
	// core entity: kept as a named entity
	@Test  public void Entity__lt()        {fxt.Test__normalize_char_references("&lt;"              , "&lt;");}
	// "&" not followed by a well-formed reference: the "&" is escaped
	@Test  public void Invalid()           {fxt.Test__normalize_char_references("&(invalid);"       , "&amp;(invalid);");}
	// several references in one string
	@Test  public void Many() {
		fxt.Test__normalize_char_references
		( "a &#09; b &alpha; c &#xFF; d &(invalid); e"
		, "a &#9; b &#945; c &#xff; d &amp;(invalid); e"
		);
	}
}
// Fixture: runs the sanitizer against a reusable buffer and compares the buffer's content.
class Xomw_sanitizer__fxt {
	private final Xomw_sanitizer sanitizer = new Xomw_sanitizer();
	private final Bry_bfr tmp = Bry_bfr_.New();
	// Normalizes the whole of src_str into tmp (lone_bfr = Y, so unchanged text is copied as well), then checks and clears tmp.
	public void Test__normalize_char_references(String src_str, String expd) {
		byte[] src = Bry_.new_u8(src_str);
		int src_len = src.length;
		sanitizer.Normalize_char_references(tmp, Bool_.Y, src, 0, src_len);
		String actl = tmp.To_str_and_clear();
		Gftest.Eq__str(expd, actl);
	}
}

@ -0,0 +1,26 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
// Holds the name of one HTML element.
public class Xomw_html_elem {
public Xomw_html_elem(byte[] name) {
this.name = name;
}
public byte[] Name() {return name;} private final byte[] name; // EX: "a", "div", "img"
// Disabled draft of a void-element table. NOTE: the second entry must be "base" (the HTML5 void element); "super" was a keyword-conversion artifact.
//	private static final Hash_adp_bry void_elements = Hash_adp_bry.cs().Add_many_str("area", "base", "br", "col", "embed", "hr", "img", "input", "keygen", "link", "meta", "param", "source", "track", "wbr");
}

@ -0,0 +1,267 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
import gplx.core.btries.*;
import gplx.langs.phps.utls.*;
public class Xomw_html_utl {
// scratch buffer shared by Expand_attributes for lower-casing keys and for encoding values
private final Bry_bfr tmp = Bry_bfr_.New();
// reusable trie-result object passed to Php_str_.Strtr by Expand_attributes
private final Btrie_rv trv = new Btrie_rv();
// Appends a complete HTML element to bfr without escaping "contents".
//   void elements (see void_elements): "<name atrs/>"  ; contents is ignored
//   all other elements               : "<name atrs>contents</name>"
// element : tag name in any case; it is lower-cased before the void-element lookup and before being written
// attribs : attributes for the opening tag; may be modified by Open_element__lcased ("type" removed or added)
public void Raw_element(Bry_bfr bfr, byte[] element, Xomwh_atr_mgr attribs, byte[] contents) {
	// XO:lcase element; keep the returned array, because void_elements is case-sensitive and the name is written out below
	// NOTE(review): assumes Bry_.Lcase__all returns the lower-cased bytes (new array or same array) -- confirm
	element = Bry_.Lcase__all(element);
	Open_element__lcased(bfr, element, attribs);
	if (void_elements.Has(element)) {
		bfr.Del_by_1().Add(Bry__elem__lhs__inl);	// replace the trailing ">" of the opening tag with "/>"
	}
	else {
		bfr.Add(contents);
		Close_element__lcased(bfr, element);
	}
}
// Appends the opening tag "<element atrs>" to bfr. The caller must pass "element" already lower-cased.
// Side effects on attribs: an unrecognized "type" is deleted for <input>; a missing "type" is set to "submit" for <button>.
private void Open_element__lcased(Bry_bfr bfr, byte[] element, Xomwh_atr_mgr attribs) {
	// This is not required in HTML5, but let's do it anyway, for
	// consistency and better compression.
	// $element = strtolower($element);	// XO:handled by callers

	// Remove invalid input types
	if (Bry_.Eq(element, Tag__input)) {
		// PORTED.HEADER:valid_input_types
		byte[] input_type = attribs.Get_val_or_null(Atr__type);
		boolean type_is_invalid = input_type != null && !valid_input_types.Has(input_type);
		if (type_is_invalid)
			attribs.Del(Atr__type);
	}

	// According to standard the default type for <button> elements is "submit".
	// Depending on compatibility mode IE might use "button", instead.
	// We enforce the standard "submit".
	if (Bry_.Eq(element, Tag__button)) {
		if (attribs.Get_val_or_null(Atr__type) == null)
			attribs.Set(Atr__type, Val__type__submit);
	}

	bfr.Add_byte(Byte_ascii.Angle_bgn);
	bfr.Add(element);
	Expand_attributes(bfr, attribs);	// TODO.XO:self::dropDefaults($element, $attribs)
	bfr.Add_byte(Byte_ascii.Angle_end);
}
// Appends every attribute in atrs to bfr, each preceded by one space.
//   boolean attribute (lower-cased key is in bool_attribs) : key=""
//   any other attribute                                    : key="value", with the value encoded through atr_val_encodings
// Attributes whose value is null are skipped.
// An integer-keyed entry whose value is a boolean attribute name is treated as that attribute. EX: [ 'checked' ]
public void Expand_attributes(Bry_bfr bfr, Xomwh_atr_mgr atrs) {
	int len = atrs.Len();
	for (int i = 0; i < len; i++) {
		Xomwh_atr_itm atr = (Xomwh_atr_itm)atrs.Get_at(i);
		byte[] key = atr.Key_bry();
		byte[] val = atr.Val();

		// Support intuitive [ 'checked' => true/false ] form
		if (val == null) {	// TESTME
			continue;
		}

		// For boolean attributes, support [ 'foo' ] instead of
		// requiring [ 'foo' => 'meaningless' ].
		if (atr.Key_int() != -1 && bool_attribs.Has(val)) {
			key = val;
		}

		// Not technically required in HTML5 but we'd like consistency
		// and better compression anyway.
		key = Bry_.Xcase__build__all(tmp, Bool_.N, key);

		// PORTED.HEADER:$spaceSeparatedListAttributes
		// Specific features for attributes that allow a list of space-separated values
		if (space_separated_list_attributes.Has(key)) {
			// Apply some normalization and remove duplicates
			// Convert into correct array. Array can contain space-separated
			// values. Implode/explode to get those into the main array as well.
//				if (is_array($value)) {
				// If input wasn't an array, we can skip this step
//					$newValue = [];
//					foreach ($value as $k => $v) {
//						if (is_string($v)) {
						// String values should be normal `array('foo')`
						// Just append them
//							if (!isset($value[$v])) {
							// As a special case don't set 'foo' if a
							// separate 'foo' => true/false exists in the array
							// keys should be authoritative
//								$newValue[] = $v;
//							}
//						}
//						elseif ($v) {
						// If the value is truthy but not a String this is likely
						// an [ 'foo' => true ], falsy values don't add strings
//							$newValue[] = $k;
//						}
//					}
//					$value = implode(' ', $newValue);
//				}
//				$value = explode(' ', $value);

			// Normalize spacing by fixing up cases where people used
			// more than 1 space and/or a trailing/leading space
//				$value = array_diff($value, [ '', ' ' ]);

			// Remove duplicates and create the String
//				$value = implode(' ', array_unique($value));
		}
		// DELETE
		// elseif (is_array($value)) {
		//	throw new MWException("HTML attribute $key can not contain a list of values");
		// }

		// The decision is made on the KEY, as in MediaWiki's "in_array( $key, self::$boolAttribs )".
		// Testing the value instead would turn class="hidden" into class="" and would keep disabled="x" as is.
		if (bool_attribs.Has(key)) {
			bfr.Add_byte_space().Add(key).Add(Bry__atr__val__empty); // $ret .= " $key=\"\"";
		}
		else {
			// PORTED.HEADER:atr_val_encodings
			val = Php_str_.Strtr(val, atr_val_encodings, tmp, trv);
			bfr.Add_byte_space().Add(key).Add(Bry__atr__val__quote).Add(val).Add_byte_quote();
		}
	}
}
// Appends the closing tag for "element" to bfr; the name is written exactly as passed in.
private void Close_element__lcased(Bry_bfr bfr, byte[] element) {
	// EX: "</", element, ">";
	bfr.Add(Bry__elem__rhs__bgn);
	bfr.Add(element);
	bfr.Add_byte(Byte_ascii.Angle_end);
}
// Byte constants for the fixed pieces of markup, tag names and attribute names used by this class.
private static final byte[]
Bry__elem__lhs__inl = Bry_.new_a7("/>") // end of a self-closed (void) element
, Bry__elem__rhs__bgn = Bry_.new_a7("</") // start of a closing tag
, Bry__atr__val__quote = Bry_.new_a7("=\"") // separator between attribute key and quoted value
, Bry__atr__val__empty = Bry_.new_a7("=\"\"") // empty value written for boolean attributes
, Tag__input = Bry_.new_a7("input")
, Tag__button = Bry_.new_a7("button")
, Atr__type = Bry_.new_a7("type")
, Val__type__submit = Bry_.new_a7("submit") // default "type" enforced for <button>
;
// List of void elements from HTML5, section 8.1.2 as of 2016-09-19
// Case-sensitive: names must be lower-cased before lookup. Elements in this list are written as "<name/>" by Raw_element.
private static final Hash_adp_bry void_elements = Hash_adp_bry.cs().Add_many_str
(
	"area",
	"base",	// was "super": keyword-conversion artifact; the HTML5 void element is <base>
	"br",
	"col",
	"embed",
	"hr",
	"img",
	"input",
	"keygen",
	"link",
	"meta",
	"param",
	"source",
	"track",
	"wbr"
);
// Boolean attributes, which may have the value omitted entirely.  Manually
// collected from the HTML5 spec as of 2011-08-12.
// Lookup is case-insensitive (ASCII).
private static final Hash_adp_bry bool_attribs = Hash_adp_bry.ci_a7().Add_many_str(
	"async",
	"autofocus",
	"autoplay",
	"checked",
	"controls",
	"default",
	"defer",
	"disabled",
	"formnovalidate",
	"hidden",
	"ismap",
	// "itemscope", //XO:duplicate; added below
	"loop",
	"multiple",
	"muted",
	"novalidate",
	"open",
	"pubdate",
	"readonly",	// was "final " (with trailing space): keyword-conversion artifact; the HTML attribute is "readonly"
	"required",
	"reversed",
	"scoped",
	"seamless",
	"selected",
	"truespeed",
	"typemustmatch",
	// HTML5 Microdata
	"itemscope"
);
// Replacement table for attribute values: each Add_str_str pair maps a raw character to its entity.
// Passed to Php_str_.Strtr when a non-boolean attribute value is written (see PORTED.HEADER:atr_val_encodings).
private static final Btrie_slim_mgr atr_val_encodings = Btrie_slim_mgr.cs()
// Apparently we need to entity-encode \n, \r, \t, although the
// spec doesn't mention that. Since we're doing strtr() anyway,
// we may as well not call htmlspecialchars().
// @todo FIXME: Verify that we actually need to
// escape \n\r\t here, and explain why, exactly.
// We could call Sanitizer::encodeAttribute() for this, but we
// don't because we're stubborn and like our marginal savings on
// byte size from not having to encode unnecessary quotes.
// The only difference between this transform and the one by
// Sanitizer::encodeAttribute() is ' is not encoded.
.Add_str_str("&" , "&amp;")
.Add_str_str("\"" , "&quot;")
.Add_str_str(">" , "&gt;")
// '<' allegedly allowed per spec
// but breaks some tools if not escaped.
.Add_str_str("<" , "&lt;")
.Add_str_str("\n" , "&#10;")
.Add_str_str("\r" , "&#13;")
.Add_str_str("\t" , "&#9;");
// Attribute names whose value is a space-separated list of tokens.
// https://www.w3.org/TR/html401/index/attributes.html ("space-separated")
// https://www.w3.org/TR/html5/index.html#attributes-1 ("space-separated")
// NOTE(review): built with Hash_adp_bry.ci_a7(), presumably case-insensitive ASCII keys -- confirm against Hash_adp_bry.
private static final Hash_adp_bry space_separated_list_attributes = Hash_adp_bry.ci_a7().Add_many_str(
"class", // html4, html5
"accesskey", // as of html5, multiple space-separated values allowed
// html4-spec doesn't document rel= as space-separated
// but has been used like that and is now documented as such
// in the html5-spec.
"rel"
);
// Set of values accepted for the "type" attribute of an <input> element.
// NOTE(review): built with Hash_adp_bry.ci_a7(), presumably case-insensitive ASCII keys -- confirm against Hash_adp_bry.
private static final Hash_adp_bry valid_input_types = Hash_adp_bry.ci_a7().Add_many_str(
// Remove invalid input types
// (comment carried over from the MediaWiki source: types not listed here are the ones to be removed)
"hidden",
"text",
"password",
"checkbox",
"radio",
"file",
"submit",
"image",
"reset",
"button",
// HTML input types
"datetime",
"datetime-local",
"date",
"month",
"time",
"week",
"number",
"range",
"email",
"url",
"search",
"tel",
"color"
);
}

@ -0,0 +1,39 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
import org.junit.*; import gplx.core.tests.*;
public class Xomw_html_utl__expand_attributes__tst {
  private final Xomw_html_utl__expand_attributes__fxt fxt = new Xomw_html_utl__expand_attributes__fxt();
  // One key / val pair is expected to render as: space, key, ="val"
  @Test public void Basic() {
    String expd = " a=\"b\"";
    fxt.Test__expand_attributes(expd, "a", "b");
  }
}
// Test fixture: builds an attribute manager from flat key / val pairs and checks what Expand_attributes writes.
class Xomw_html_utl__expand_attributes__fxt {
  private final Xomw_html_utl utl = new Xomw_html_utl();
  private final Bry_bfr bfr = Bry_bfr_.New();
  // expd: expected output of Expand_attributes
  // kvs : alternating keys and vals; EX: "k0", "v0", "k1", "v1"
  public void Test__expand_attributes(String expd, String... kvs) {
    Xomwh_atr_mgr atrs = new Xomwh_atr_mgr();
    for (int idx = 0; idx < kvs.length; idx += 2) {
      byte[] atr_key = Bry_.new_a7(kvs[idx]);
      byte[] atr_val = Bry_.new_a7(kvs[idx + 1]);
      atrs.Add(atr_key, atr_val);  // creates the item with key_int = -1
    }
    utl.Expand_attributes(bfr, atrs);
    String actl = bfr.To_str_and_clear();
    Gftest.Eq__str(expd, actl);
  }
}

@ -15,13 +15,15 @@ GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
package gplx.xowa.addons.apps.cfgs.mgrs.dflts; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.apps.*; import gplx.xowa.addons.apps.cfgs.*; import gplx.xowa.addons.apps.cfgs.mgrs.*; package gplx.xowa.mws.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
class Xocfg_dflt_itm__static implements Gfo_invk { public class Xomwh_atr_itm {
private final String val; public Xomwh_atr_itm(int key_int, byte[] key, byte[] val) {
public Xocfg_dflt_itm__static(String val) { this.key_int = key_int;
this.key_bry = key;
this.val = val; this.val = val;
} }
public Object Invk(GfsCtx ctx, int ikey, String k, GfoMsg m) { public int Key_int() {return key_int;} private int key_int;
return val; public byte[] Key_bry() {return key_bry;} private byte[] key_bry;
} public byte[] Val() {return val;} private byte[] val;
public void Val_(byte[] v) {this.val = v;}
} }

@ -0,0 +1,53 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
// Ordered collection of HTML attributes, keyed by the attribute's byte[] name.
public class Xomwh_atr_mgr {
  private final Ordered_hash hash = Ordered_hash_.New();
  // number of attributes currently held
  public int Len() {
    return hash.Len();
  }
  // attribute at position i
  public Xomwh_atr_itm Get_at(int i) {
    return (Xomwh_atr_itm)hash.Get_at(i);
  }
  // removes all attributes; returns this manager for chaining
  public Xomwh_atr_mgr Clear() {
    hash.Clear();
    return this;
  }
  // adds a new attribute built from key / val; key_int is -1
  public void Add(byte[] key, byte[] val) {
    Xomwh_atr_itm itm = new Xomwh_atr_itm(-1, key, val);
    hash.Add(key, itm);
  }
  // adds an existing attribute, keyed by its Key_bry()
  public void Add(Xomwh_atr_itm itm) {
    hash.Add(itm.Key_bry(), itm);
  }
  // removes the attribute stored under key
  public void Del(byte[] key) {
    hash.Del(key);
  }
  // sets the val of the attribute stored under key, creating the attribute if it is missing
  public void Set(byte[] key, byte[] val) {
    Get_by_or_make(key).Val_(val);
  }
  // adds src if no attribute has its key; otherwise copies src's val onto the existing attribute
  public void Add_or_set(Xomwh_atr_itm src) {
    Xomwh_atr_itm cur = (Xomwh_atr_itm)hash.Get_by(src.Key_bry());
    if (cur != null) {
      cur.Val_(src.Val());
      return;
    }
    hash.Add(src.Key_bry(), src);
  }
  // attribute stored under k, or null
  public Xomwh_atr_itm Get_by_or_null(byte[] k) {
    Object found = hash.Get_by(k);
    return (Xomwh_atr_itm)found;
  }
  // attribute stored under k; if missing, a new attribute with a null val is added and returned
  public Xomwh_atr_itm Get_by_or_make(byte[] k) {
    Xomwh_atr_itm found = (Xomwh_atr_itm)hash.Get_by(k);
    if (found != null) return found;
    Xomwh_atr_itm made = new Xomwh_atr_itm(-1, k, null);
    hash.Add(made.Key_bry(), made);
    return made;
  }
  // val of the attribute stored under k, or null if there is no such attribute
  public byte[] Get_val_or_null(byte[] k) {
    Xomwh_atr_itm found = (Xomwh_atr_itm)hash.Get_by(k);
    if (found == null) return null;
    return found.Val();
  }
}

@ -0,0 +1,137 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.linkers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
import gplx.langs.htmls.*;
import gplx.xowa.mws.htmls.*;
// Port of MediaWiki's LinkRenderer: builds <a> elements for link targets.
// NOTE: "is this byte[] empty" checks use Bry_.Len_gt_0 (length-based) instead of comparing
// references against Bry_.Empty; a reference check only works if every producer returns
// the shared Bry_.Empty instance, which this class cannot guarantee for its inputs.
public class Xomw_link_renderer {
  private boolean expand_urls = false;
  private final Xomw_html_utl html_utl = new Xomw_html_utl();
  private final Xomwh_atr_mgr attribs = new Xomwh_atr_mgr();  // reused across calls; cleared at the start of Make_preloaded_link
  // If you have already looked up the proper CSS classes using LinkRenderer::getLinkClasses()
  // or some other method, use this to avoid looking it up again.
  //   bfr        : buffer that receives the <a> element
  //   target     : title being linked to
  //   text       : link caption; if null, Get_link_text(target) is used
  //   classes    : value for the class attribute; skipped when null or empty
  //   extra_atrs : attributes added to, or overriding, the ones built here
  //   query      : passed through to target.Get_link_url
  public void Make_preloaded_link(Bry_bfr bfr, Xoa_ttl target, byte[] text, byte[] classes, Xomwh_atr_mgr extra_atrs, byte[] query) {
    // IGNORE: $this->runBeginHook --> 'HtmlPageLinkRendererBegin', 'LinkBegin'
    // $target = $this->normalizeTarget( $target ); // normalizeSpecialPage
    byte[] url = Get_link_url(target, query);
    attribs.Clear();
    attribs.Add(Gfh_atr_.Bry__href, url); // NOTE: add url 1st; MW does attribs["url", url] + attribs + extra_attribs
    if (Bry_.Len_gt_0(classes)) // XO:do not bother adding if empty
      attribs.Add(Gfh_atr_.Bry__class, classes);
    byte[] prefixed_text = target.Get_prefixed_text();
    if (Bry_.Len_gt_0(prefixed_text)) { // MW: $prefixedText !== ''
      attribs.Add(Gfh_atr_.Bry__title, prefixed_text);
    }
    int extra_atrs_len = extra_atrs.Len();
    for (int i = 0; i < extra_atrs_len; i++) {
      attribs.Add_or_set(extra_atrs.Get_at(i));
    }
    if (text == null) {
      text = this.Get_link_text(target);
    }
    Build_a_element(bfr, target,text, attribs, true);
  }
  // Writes the <a> element with the given attributes and inner html. target and is_known are not used here
  // because the MW hooks that would consume them are ignored.
  private void Build_a_element(Bry_bfr bfr, Xoa_ttl target, byte[] text, Xomwh_atr_mgr attribs, boolean is_known) {
    // IGNORE: if ( !Hooks::run( 'HtmlPageLinkRendererEnd',
    byte[] html = text;
    // $html = HtmlArmor::getHtml( $text );
    // IGNORE: if ( Hooks::isRegistered( 'LinkEnd' ) ) {
    html_utl.Raw_element(bfr, Gfh_tag_.Bry__a, attribs, html);
  }
  // Returns the url for target by delegating to target.Get_link_url with the expand_urls setting.
  private byte[] Get_link_url(Xoa_ttl target, byte[] query) {
    // TODO: Use a LinkTargetResolver service instead of Title
    // if ( $this->forceArticlePath ) {
    // $realQuery = $query;
    // $query = [];
    // }
    // else {
    // $realQuery = [];
    // }
    byte[] url = target.Get_link_url(query, false, expand_urls);
    // if ( $this->forceArticlePath && $realQuery ) {
    // $url = wfAppendQuery( $url, $realQuery );
    // }
    return url;
  }
  // Returns the default caption for a link to target.
  private byte[] Get_link_text(Xoa_ttl target) {
    byte[] prefixed_text = target.Get_prefixed_text();
    // If the target is just a fragment, with no title, we return the fragment
    // text. Otherwise, we return the title text itself.
    if (!Bry_.Len_gt_0(prefixed_text) && target.Has_fragment()) { // MW: $prefixedText === ''
      return target.Get_fragment();
    }
    return prefixed_text;
  }
  // private function normalizeTarget( LinkTarget $target ) {
  // return Linker::normaliseSpecialPage( $target );
  // }
  // public static function normaliseSpecialPage( LinkTarget $target ) {
  // if ( $target->getNamespace() == NS_SPECIAL && !$target->isExternal() ) {
  // list( $name, $subpage ) = SpecialPageFactory::resolveAlias( $target->getDBkey() );
  // if ( !$name ) {
  // return $target;
  // }
  // $ret = SpecialPage::getTitleValueFor( $name, $subpage, $target->getFragment() );
  // return $ret;
  // } else {
  // return $target;
  // }
  // }
  private static final byte[] Bry__classes__extiw = Bry_.new_a7("extiw");
  // Builds the class list for a known target ("extiw" for external targets, plus the class from Get_link_classes)
  // and delegates to Make_preloaded_link.
  public void Make_known_link(Bry_bfr bfr, Xoa_ttl target, byte[] text, Xomwh_atr_mgr extra_atrs, byte[] query) {
    byte[] classes = Bry_.Empty;
    if (target.Is_external()) {
      classes = Bry__classes__extiw;
    }
    byte[] colour = Get_link_classes(target);
    if (Bry_.Len_gt_0(colour)) { // MW: $colour !== ''
      // MW implodes the class array with ' '; only emit the separator when there is a class before it
      classes = Bry_.Len_gt_0(classes)
        ? Bry_.Add(classes, Byte_ascii.Space_bry, colour)
        : colour;
    }
    Make_preloaded_link(bfr, target, text, classes, extra_atrs, query);
  }
  // Returns the CSS class describing the target's state. The MW lookups are not ported yet,
  // so this currently always returns Bry_.Empty.
  public byte[] Get_link_classes(Xoa_ttl target) {
    // Make sure the target is in the cache
    // $id = $this->linkCache->addLinkObj( $target );
    // if ( $id == 0 ) {
    // // Doesn't exist
    // return '';
    // }
    // if ( $this->linkCache->getGoodLinkFieldObj( $target, 'redirect' ) ) {
    // Page is a redirect
    // return 'mw-redirect';
    // }
    // elseif ( $this->stubThreshold > 0 && MWNamespace::isContent( $target->getNamespace() )
    // && $this->linkCache->getGoodLinkFieldObj( $target, 'length' ) < $this->stubThreshold
    // ) {
    // Page is a stub
    // return 'stub';
    // }
    return Bry_.Empty;
  }
}

@ -0,0 +1,584 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
import gplx.core.btries.*;
import gplx.langs.htmls.*;
import gplx.langs.phps.utls.*;
public class Xomw_block_level_pass {
// Per-instance parser state. Do_block_levels resets in_pre, dt_open and last_section at the start of each call.
private final Bry_bfr tmp = Bry_bfr_.New(); // scratch buffer used by the helper methods to assemble short tag strings
private final Btrie_rv trv = new Btrie_rv(); // result holder for trie matches; the position after a match is read via trv.Pos()
private boolean in_pre, dt_open; // in_pre: currently inside a <pre> section; dt_open: the open definition-list item is a <dt>
private int last_section; // one of the Last_section__* constants (none / para / pre)
private byte[] find_colon_no_links__before, find_colon_no_links__after; // results of Find_colon_no_links: text before / after the colon it found
// Port of MediaWiki's BlockLevelPass::execute / Parser::doBlockLevels.
// Reads the text in pbfr.Src() line by line and writes it to pbfr.Trg() with block-level markup added:
// paragraphs (<p>), indent-pre (<pre>) and lists built from lines starting with * # : ;
//   pctx       : not used by this method
//   pbfr       : source / target buffer pair; Switch() is called so the output becomes the next source
//   line_start : if false, the first line is copied through unchanged and block processing starts on the next line
// Changes from the earlier version of this method:
//   * when line_start is false, the first line is now consumed after being copied; it used to be processed a 2nd time
//   * the end-of-line search is clamped to src_end, because src is the buffer's backing array and may hold bytes past Len()
//   * after the closing list tags at the end, the newline is added once the last list is closed (MW: if (!$prefixLength))
//   * "is the trimmed line empty" uses a length check instead of a reference comparison with Bry_.Empty
public void Do_block_levels(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr, boolean line_start) {
  // XO.PBFR
  Bry_bfr src_bfr = pbfr.Src();
  byte[] src = src_bfr.Bfr();
  int src_bgn = 0;
  int src_end = src_bfr.Len();
  Bry_bfr bfr = pbfr.Trg();
  pbfr.Switch();
  // XO.STATIC
  if (block_chars_ary == null) {
    synchronized (Type_adp_.ClassOf_obj(this)) {
      block_chars_ary = Block_chars_ary__new();
      open_match_trie = Btrie_slim_mgr.ci_a7().Add_many_str
      ("<table", "<h1", "<h2", "<h3", "<h4", "<h5", "<h6", "<pre", "<tr", "<p", "<ul", "<ol", "<dl", "<li", "</tr", "</td", "</th");
      close_match_trie = Btrie_slim_mgr.ci_a7().Add_many_str
      ( "</table", "</h1", "</h2", "</h3", "</h4", "</h5", "</h6", "<td", "<th", "<blockquote", "</blockquote", "<div", "</div", "<hr"
      , "</pre", "</p", "</mw:", Xomw_strip_state.Str__marker_bgn + "-pre", "</li", "</ul", "</ol", "</dl", "<center", "</center");
      blockquote_trie = Btrie_slim_mgr.ci_a7().Add_many_str("<blockquote", "</blockquote");
      pre_trie = Btrie_slim_mgr.ci_a7().Add_str_int("<pre", Pre__bgn).Add_str_int("</pre", Pre__end);
    }
  }
  // Parsing through the text line by line. The main thing
  // happening here is handling of block-level elements p, pre,
  // and making lists from lines starting with * # : etc.
  byte[] last_prefix = Bry_.Empty;
  bfr.Clear();
  this.dt_open = false;
  boolean in_block_elem = false;
  int prefix_len = 0;
  byte para_stack = Para_stack__none;
  boolean in_blockquote = false;
  this.in_pre = false;
  this.last_section = Last_section__none;
  byte[] prefix2 = null;
  // PORTED.SPLIT: $textLines = StringUtils::explode("\n", $text);
  int line_bgn = src_bgn;
  while (line_bgn < src_end) {
    int line_end = Bry_find_.Find_fwd(src, Byte_ascii.Nl, line_bgn);
    // no newline, or a newline found in stale bytes past the logical end of the buffer: line runs to src_end
    if (line_end == Bry_find_.Not_found || line_end > src_end)
      line_end = src_end;
    // Fix up line_start
    if (!line_start) {
      bfr.Add_mid(src, line_bgn, line_end);
      line_start = true;
      line_bgn = line_end + 1; // consume the line; MW's foreach moves to the next line on "continue"
      continue;
    }
    // * = ul
    // # = ol
    // ; = dt
    // : = dd
    int last_prefix_len = last_prefix.length;
    // PORTED: pre_close_match = preg_match('/<\\/pre/i', $oLine); pre_open_match = preg_match('/<pre/i', $oLine);
    int pre_cur = line_bgn;
    boolean pre_close_match = false;
    boolean pre_open_match = false;
    while (true) {
      if (pre_cur >= line_end)
        break;
      Object o = pre_trie.Match_at(trv, src, pre_cur, line_end);
      if (o == null)
        pre_cur++;
      else {
        int pre_tid = (int)o;
        if (pre_tid == Pre__bgn)
          pre_open_match = true;
        else if (pre_tid == Pre__end)
          pre_close_match = true;
        pre_cur = trv.Pos();
      }
    }
    byte[] prefix = null, t = null;
    // If not in a <pre> element, scan for and figure out what prefixes are there.
    if (!in_pre) {
      // Multiple prefixes may abut each other for nested lists.
      prefix_len = Php_str_.Strspn_fwd__ary(src, block_chars_ary, line_bgn, line_end, line_end); // strspn($oLine, '*#:;');
      prefix = Php_str_.Substr(src, line_bgn, prefix_len);
      // eh?
      // ; and : are both from definition-lists, so they're equivalent
      // for the purposes of determining whether or not we need to open/close
      // elements.
      // substr( $inputLine, $prefixLength );
      prefix2 = Bry_.Replace(prefix, Byte_ascii.Semic, Byte_ascii.Colon);
      t = Bry_.Mid(src, line_bgn + prefix_len, line_end);
      in_pre = pre_open_match;
    }
    else {
      // Don't interpret any other prefixes in preformatted text
      prefix_len = 0;
      prefix = prefix2 = Bry_.Empty;
      t = Bry_.Mid(src, line_bgn, line_end);
    }
    // List generation
    byte[] term = null, t2 = null;
    int common_prefix_len = -1;
    if (prefix_len > 0 && Bry_.Eq(last_prefix, prefix2)) {
      // Same as the last item, so no need to deal with nesting or opening stuff
      bfr.Add(Next_item(Php_str_.Substr_byte(prefix, -1)));
      para_stack = Para_stack__none;
      if (prefix_len > 0 && prefix[prefix_len - 1] == Byte_ascii.Semic) {
        // The one nasty exception: definition lists work like this:
        // ; title : definition text
        // So we check for : in the remainder text to split up the
        // title and definition, without b0rking links.
        term = t2 = Bry_.Empty;
        if (Find_colon_no_links(t, term, t2) != Bry_find_.Not_found) {
          term = find_colon_no_links__before;
          t2 = find_colon_no_links__after;
          t = t2;
          bfr.Add(term).Add(Next_item(Byte_ascii.Colon));
        }
      }
    }
    else if (prefix_len > 0 || last_prefix_len > 0) {
      // We need to open or close prefixes, or both.
      // Either open or close a level...
      common_prefix_len = Get_common(prefix, last_prefix);
      para_stack = Para_stack__none;
      // Close all the prefixes which aren't shared.
      while (common_prefix_len < last_prefix_len) {
        bfr.Add(Close_list(last_prefix[last_prefix_len - 1]));
        last_prefix_len--;
      }
      // Continue the current prefix if appropriate.
      if (prefix_len <= common_prefix_len && common_prefix_len > 0) {
        bfr.Add(Next_item(prefix[common_prefix_len - 1]));
      }
      // Open prefixes where appropriate.
      if (Bry_.Len_gt_0(last_prefix) && prefix_len > common_prefix_len) {
        bfr.Add_byte_nl();
      }
      while (prefix_len > common_prefix_len) {
        byte c = Php_str_.Substr_byte(prefix, common_prefix_len, 1);
        bfr.Add(Open_list(c));
        if (c == Byte_ascii.Semic) {
          // @todo FIXME: This is dupe of code above
          if (Find_colon_no_links(t, term, t2) != Bry_find_.Not_found) {
            term = find_colon_no_links__before;
            t2 = find_colon_no_links__after;
            t = t2;
            bfr.Add(term).Add(Next_item(Byte_ascii.Colon));
          }
        }
        ++common_prefix_len;
      }
      if (prefix_len == 0 && Bry_.Len_gt_0(last_prefix)) {
        bfr.Add_byte_nl();
      }
      last_prefix = prefix2;
    }
    // If we have no prefixes, go to paragraph mode.
    if (0 == prefix_len) {
      // No prefix (not in list)--go to paragraph mode
      // XXX: use a stack for nestable elements like span, table and div
      int t_len = t.length;
      boolean open_match = Php_preg_.Match(open_match_trie, trv, t, 0, t_len) != null;
      boolean close_match = Php_preg_.Match(close_match_trie, trv, t, 0, t_len) != null;
      if (open_match || close_match) {
        para_stack = Para_stack__none;
        // @todo bug 5718: paragraph closed
        bfr.Add(Close_paragraph());
        if (pre_open_match && !pre_close_match) {
          in_pre = true;
        }
        int bq_offset = 0;
        // PORTED:preg_match('/<(\\/?)blockquote[\s>]/i', t, $bqMatch, PREG_OFFSET_CAPTURE, $bq_offset)
        while (true) {
          Object o = Php_preg_.Match(blockquote_trie, trv, t, bq_offset, t_len);
          if (o == null) { // no more blockquotes found; exit
            break;
          }
          else {
            byte[] bq_bry = (byte[])o;
            in_blockquote = bq_bry[1] != Byte_ascii.Slash; // is this a close tag?
            bq_offset = trv.Pos();
          }
        }
        in_block_elem = !close_match;
      }
      else if (!in_block_elem && !in_pre) {
        if ( Php_str_.Substr_byte(t, 0) == Byte_ascii.Space
          && (last_section == Last_section__pre || Bry_.Len_gt_0(Bry_.Trim(t)))
          && !in_blockquote
          ) {
          // pre
          if (last_section != Last_section__pre) {
            para_stack = Para_stack__none;
            bfr.Add(Close_paragraph()).Add(Gfh_tag_.Pre_lhs);
            last_section = Last_section__pre;
          }
          t = Bry_.Mid(t, 1);
        }
        else {
          // paragraph
          if (!Bry_.Len_gt_0(Bry_.Trim(t))) {
            if (para_stack != Para_stack__none) {
              Para_stack_bfr(bfr, para_stack);
              bfr.Add_str_a7("<br />");
              para_stack = Para_stack__none;
              last_section = Last_section__para;
            }
            else {
              if (last_section != Last_section__para) {
                bfr.Add(Close_paragraph());
                last_section = Last_section__none;
                para_stack = Para_stack__bgn;
              }
              else {
                para_stack = Para_stack__mid;
              }
            }
          }
          else {
            if (para_stack != Para_stack__none) {
              Para_stack_bfr(bfr, para_stack);
              para_stack = Para_stack__none;
              last_section = Last_section__para;
            }
            else if (last_section != Last_section__para) {
              bfr.Add(Close_paragraph()).Add(Gfh_tag_.P_lhs);
              this.last_section = Last_section__para;
            }
          }
        }
      }
    }
    // somewhere above we forget to get out of pre block (bug 785)
    if (pre_close_match && in_pre) {
      in_pre = false;
    }
    if (para_stack == Para_stack__none) {
      bfr.Add(t);
      if (prefix_len == 0) {
        bfr.Add_byte_nl();
      }
    }
    line_bgn = line_end + 1;
  }
  // close any lists still open after the last line
  while (prefix_len > 0) {
    bfr.Add(Close_list(prefix2[prefix_len - 1]));
    prefix_len--;
    if (prefix_len == 0) { // MW: if ( !$prefixLength ) $output .= "\n";
      bfr.Add_byte_nl();
    }
  }
  // close a paragraph / pre still open after the last line
  if (last_section != Last_section__none) {
    bfr.Add(last_section == Last_section__para ? Gfh_tag_.P_rhs : Gfh_tag_.Pre_rhs);
    last_section = Last_section__none;
  }
}
// Returns the close tag (followed by a newline) for the open <p> or <pre> section, or an empty
// byte array when no section is open. In both cases the in_pre flag and last_section are reset.
public byte[] Close_paragraph() {
  byte[] rv;
  if (last_section == Last_section__none) {
    rv = Bry_.Empty;
  }
  else {
    tmp.Add(last_section == Last_section__para ? Gfh_tag_.P_rhs : Gfh_tag_.Pre_rhs);
    tmp.Add_byte_nl();
    rv = tmp.To_bry_and_clear();
  }
  in_pre = false;
  last_section = Last_section__none;
  return rv;
}
// getCommon(): length of the longest run of bytes that both arguments share, counted from index 0.
// Returns 0 when either argument is empty or the first bytes differ.
private int Get_common(byte[] st1, byte[] st2) {
  int max = Math.min(st1.length, st2.length);
  int rv = 0;
  while (rv < max && st1[rv] == st2[rv])
    rv++;
  return rv;
}
// Open the list item element identified by the prefix character.
// Any open paragraph / pre is closed first and its close tag is prepended to the result.
//   '*' -> <ul><li>    '#' -> <ol><li>    ':' -> <dl><dd>    ';' -> <dl><dt> (also sets dt_open)
// Any other character yields the marker "<!-- ERR 1 -->".
// NOTE: the <dl><dd> branch tests Byte_ascii.Colon; it previously repeated the Byte_ascii.Hash test,
// which made the branch unreachable and sent ':' to the error marker.
private byte[] Open_list(byte c) {
  byte[] result = Close_paragraph();
  if (c == Byte_ascii.Star)
    result = tmp.Add(result).Add_str_a7("<ul><li>").To_bry_and_clear();
  else if (c == Byte_ascii.Hash)
    result = tmp.Add(result).Add_str_a7("<ol><li>").To_bry_and_clear();
  else if (c == Byte_ascii.Colon)
    result = tmp.Add(result).Add_str_a7("<dl><dd>").To_bry_and_clear();
  else if (c == Byte_ascii.Semic) {
    result = tmp.Add(result).Add_str_a7("<dl><dt>").To_bry_and_clear();
    dt_open = true;
  }
  else
    result = tmp.Add_str_a7("<!-- ERR 1 -->").To_bry_and_clear();
  return result;
}
// Ends the current list item and starts the next one of the kind given by the prefix character.
//   '*' / '#' : "</li>\n<li>"
//   ':' / ';' : closes the open <dt> or <dd> (chosen by dt_open), then opens <dt> for ';' or <dd> for ':',
//               and updates dt_open to match
// Any other character yields the marker "<!-- ERR 2 -->".
private byte[] Next_item(byte c) {
  switch (c) {
    case Byte_ascii.Star:
    case Byte_ascii.Hash:
      return tmp.Add_str_a7("</li>\n<li>").To_bry_and_clear();
    case Byte_ascii.Colon:
    case Byte_ascii.Semic: {
      boolean next_is_dt = c == Byte_ascii.Semic;
      tmp.Add_str_a7(dt_open ? "</dt>\n" : "</dd>\n");
      dt_open = next_is_dt;
      tmp.Add_str_a7(next_is_dt ? "<dt>" : "<dd>");
      return tmp.To_bry_and_clear();
    }
    default:
      return tmp.Add_str_a7("<!-- ERR 2 -->").To_bry_and_clear();
  }
}
// Returns the tags that end the innermost list for the given prefix character.
//   '*' : </li></ul>    '#' : </li></ol>
//   ':' : </dt></dl> when a <dt> is open (dt_open is then cleared); otherwise </dd></dl>
// Any other character yields the marker "<!-- ERR 3 -->".
private byte[] Close_list(byte c) {
  switch (c) {
    case Byte_ascii.Star:
      return Bry_.new_a7("</li></ul>");
    case Byte_ascii.Hash:
      return Bry_.new_a7("</li></ol>");
    case Byte_ascii.Colon:
      if (!dt_open)
        return Bry_.new_a7("</dd></dl>");
      dt_open = false;
      return Bry_.new_a7("</dt></dl>");
    default:
      return Bry_.new_a7("<!-- ERR 3 -->");
  }
}
// Split up a String on ':', ignoring any occurrences inside tags
// to prevent illegal overlapping.
// Port of MediaWiki's findColonNoLinks.
//   str           : text to search
//   before, after : not used; PHP passes these by reference, but here the two halves are stored in
//                   the fields find_colon_no_links__before / find_colon_no_links__after instead
// Returns Bry_find_.Not_found if there is no usable colon; any other value means the fields were set.
// NOTE: the "skip ahead" branch returns the scan position i rather than the colon position (as MW does),
// so callers should only compare the result with Bry_find_.Not_found.
// NOTE: every check for the end of a tag or comment tests '>' (Byte_ascii.Angle_end). These checks
// previously tested '<' (Byte_ascii.Angle_bgn), so the state machine never left a tag once it entered one.
private int Find_colon_no_links(byte[] str, byte[] before, byte[] after) {
  int len = str.length;
  int colon_pos = Php_str_.Strpos(str, Byte_ascii.Colon, 0, len);
  if (colon_pos == Bry_find_.Not_found) {
    // Nothing to find!
    return Bry_find_.Not_found;
  }
  int lt_pos = Php_str_.Strpos(str, Byte_ascii.Angle_bgn, 0, len);
  if (lt_pos == Bry_find_.Not_found || lt_pos > colon_pos) {
    // Easy; no tag nesting to worry about
    find_colon_no_links__before = Php_str_.Substr(str, 0, colon_pos);
    find_colon_no_links__after = Php_str_.Substr(str, colon_pos + 1);
    return colon_pos;
  }
  // Ugly state machine to walk through avoiding tags.
  int state = COLON_STATE_TEXT;
  int level = 0; // tag nesting depth: +1 at the end of an opening tag, -1 at the end of a closing tag
  for (int i = 0; i < len; i++) {
    byte c = str[i];
    switch (state) {
      case COLON_STATE_TEXT:
        switch (c) {
          case Byte_ascii.Angle_bgn:
            // Could be either a <start> tag or an </end> tag
            state = COLON_STATE_TAGSTART;
            break;
          case Byte_ascii.Colon:
            if (level == 0) {
              // We found it!
              find_colon_no_links__before = Php_str_.Substr(str, 0, i);
              find_colon_no_links__after = Php_str_.Substr(str, i + 1);
              return i;
            }
            // Embedded in a tag; don't break it.
            break;
          default:
            // Skip ahead looking for something interesting
            colon_pos = Php_str_.Strpos(str, Byte_ascii.Colon, i, len);
            if (colon_pos == Bry_find_.Not_found) {
              // Nothing else interesting
              return Bry_find_.Not_found;
            }
            lt_pos = Php_str_.Strpos(str, Byte_ascii.Angle_bgn, i, len);
            if (level == 0) {
              if (lt_pos == Bry_find_.Not_found || colon_pos < lt_pos) {
                // We found it!
                find_colon_no_links__before = Php_str_.Substr(str, 0, colon_pos);
                find_colon_no_links__after = Php_str_.Substr(str, colon_pos + 1);
                return i;
              }
            }
            if (lt_pos == Bry_find_.Not_found) {
              // Nothing else interesting to find; abort!
              // We're nested, but there's no close tags left. Abort!
              i = len; // break 2
              break;
            }
            // Skip ahead to next tag start
            i = lt_pos;
            state = COLON_STATE_TAGSTART;
            break;
        }
        break;
      case COLON_STATE_TAG:
        // In a <tag>
        switch (c) {
          case Byte_ascii.Angle_end:
            // end of an opening tag: one level deeper
            level++;
            state = COLON_STATE_TEXT;
            break;
          case Byte_ascii.Slash:
            // Slash may be followed by >?
            state = COLON_STATE_TAGSLASH;
            break;
          default:
            // ignore
            break;
        }
        break;
      case COLON_STATE_TAGSTART:
        switch (c) {
          case Byte_ascii.Slash:
            state = COLON_STATE_CLOSETAG;
            break;
          case Byte_ascii.Bang:
            state = COLON_STATE_COMMENT;
            break;
          case Byte_ascii.Angle_end:
            // Illegal early close? This shouldn't happen D:
            state = COLON_STATE_TEXT;
            break;
          default:
            state = COLON_STATE_TAG;
            break;
        }
        break;
      case COLON_STATE_CLOSETAG:
        // In a </tag>
        if (c == Byte_ascii.Angle_end) {
          level--;
          if (level < 0) {
            Gfo_usr_dlg_.Instance.Warn_many("", "", "Invalid input; too many close tags");
            return Bry_find_.Not_found;
          }
          state = COLON_STATE_TEXT;
        }
        break;
      case COLON_STATE_TAGSLASH:
        if (c == Byte_ascii.Angle_end) {
          // Yes, a self-closed tag <blah/>
          state = COLON_STATE_TEXT;
        }
        else {
          // Probably we're jumping the gun, and this is an attribute
          state = COLON_STATE_TAG;
        }
        break;
      case COLON_STATE_COMMENT:
        if (c == Byte_ascii.Dash) {
          state = COLON_STATE_COMMENTDASH;
        }
        break;
      case COLON_STATE_COMMENTDASH:
        if (c == Byte_ascii.Dash) {
          state = COLON_STATE_COMMENTDASHDASH;
        }
        else {
          state = COLON_STATE_COMMENT;
        }
        break;
      case COLON_STATE_COMMENTDASHDASH:
        // "--" seen; '>' ends the comment
        if (c == Byte_ascii.Angle_end) {
          state = COLON_STATE_TEXT;
        }
        else {
          state = COLON_STATE_COMMENT;
        }
        break;
      default:
        throw Err_.new_wo_type("State machine error");
    }
  }
  if (level > 0) {
    Gfo_usr_dlg_.Instance.Warn_many("", "", "Invalid input; not enough close tags (level ~{0}, state ~{1})", level, state);
    return Bry_find_.Not_found;
  }
  return Bry_find_.Not_found;
}
// States of the tag-skipping state machine in Find_colon_no_links.
private static final int
COLON_STATE_TEXT = 0
, COLON_STATE_TAG = 1
, COLON_STATE_TAGSTART = 2
, COLON_STATE_CLOSETAG = 3
, COLON_STATE_TAGSLASH = 4
, COLON_STATE_COMMENT = 5
, COLON_STATE_COMMENTDASH = 6
, COLON_STATE_COMMENTDASHDASH = 7
;
// Values of last_section: which section (if any) is currently open. The trailing comments give the PHP string each replaces.
private static final byte
Last_section__none = 0 // ''
, Last_section__para = 1 // p
, Last_section__pre = 2 // pre
;
// Values of the pending-paragraph marker in Do_block_levels. The trailing comments give the PHP value each replaces;
// Para_stack_bfr writes the corresponding markup.
private static final byte
Para_stack__none = 0 // false
, Para_stack__bgn = 1 // <p>
, Para_stack__mid = 2 // </p><p>
;
// Ids stored in pre_trie for "<pre" and "</pre".
private static final int Pre__bgn = 0, Pre__end = 1;
// Built on first use in Do_block_levels (XO.STATIC section).
private static Btrie_slim_mgr pre_trie;
// Lookup table indexed by byte value; true for the list-prefix characters. Also serves as the "statics are built" flag in Do_block_levels.
private static boolean[] block_chars_ary;
// Builds a 256-entry lookup table, indexed by byte value, that is true only for the
// list-prefix characters: star, hash, colon and semicolon.
private static boolean[] Block_chars_ary__new() {
  byte[] block_chars = new byte[] {Byte_ascii.Star, Byte_ascii.Hash, Byte_ascii.Colon, Byte_ascii.Semic};
  boolean[] rv = new boolean[256];
  for (int i = 0; i < block_chars.length; i++)
    rv[block_chars[i]] = true;
  return rv;
}
// Tag-prefix tries used in paragraph mode; built on first use in Do_block_levels (XO.STATIC section).
private static Btrie_slim_mgr open_match_trie, close_match_trie, blockquote_trie;
// Writes the markup for a pending-paragraph marker: "<p>" for Para_stack__bgn, "</p><p>" for Para_stack__mid.
// Any other id (including Para_stack__none) is an error.
private static void Para_stack_bfr(Bry_bfr bfr, int id) {
  if (id == Para_stack__bgn) {
    bfr.Add_str_a7("<p>");
  }
  else if (id == Para_stack__mid) {
    bfr.Add_str_a7("</p><p>");
  }
  else {
    throw Err_.new_unhandled_default(id);
  }
}
}

@ -0,0 +1,42 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
import org.junit.*; import gplx.core.tests.*;
import gplx.xowa.mws.linkers.*;
public class Xomw_block_level_pass__tst {
  private final Xomw_block_level_pass__fxt fxt = new Xomw_block_level_pass__fxt();
  // A single line of plain text is wrapped in a paragraph, with the close tag on its own line.
  @Test public void Basic() {
    String src = String_.Concat_lines_nl_skip_last
    ( "a"
    );
    String expd = String_.Concat_lines_nl_skip_last
    ( "<p>a"
    , "</p>"
    );
    fxt.Test__do_block_levels(src, expd);
  }
}
// Test fixture: runs Do_block_levels over a source string and compares the result with the expected string.
class Xomw_block_level_pass__fxt {
  private final Xomw_block_level_pass block_level_pass = new Xomw_block_level_pass();
  private final Xomw_parser_ctx pctx = new Xomw_parser_ctx();
  private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr();
  private boolean apos = true;  // when true, the expected string is passed through Gfh_utl.Replace_apos first
  public void Test__do_block_levels(String src, String expd) {
    String expd_str = expd;
    if (apos) {
      expd_str = gplx.langs.htmls.Gfh_utl.Replace_apos(expd);
    }
    byte[] src_bry = Bry_.new_u8(src);
    block_level_pass.Do_block_levels(pctx, pbfr.Init(src_bry), true);
    String actl_str = pbfr.Rslt().To_str_and_clear();
    Gftest.Eq__str(expd_str, actl_str);
  }
}

@ -0,0 +1,250 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
import gplx.langs.htmls.*;
import gplx.xowa.mws.*;
import gplx.xowa.mws.htmls.*;
import gplx.xowa.mws.linkers.*;
public class Xomw_link_holders {
private final Xomw_link_renderer link_renderer; // supplied by the constructor
private final Bry_bfr tmp; // scratch buffer supplied by the constructor; Make_holder uses it to concatenate prefix + text + inside
private int link_id = 0; // MOVED:Parser.php
// link_id is the next key to hand out; it is post-incremented per holder and reset to 0 by Clear()
private final Xomw_link_holder_list internals = new Xomw_link_holder_list(); // holder items for internal links, stored by key
private final Xomwh_atr_mgr extra_atrs = new Xomwh_atr_mgr();
// Stores the renderer and the scratch buffer; both are kept by reference.
public Xomw_link_holders(Xomw_link_renderer link_renderer, Bry_bfr tmp) {
  this.tmp = tmp;
  this.link_renderer = link_renderer;
}
// Forgets all registered holders and restarts key numbering at 0.
public void Clear() {
  link_id = 0;
  internals.Clear();
}
// Registers a holder item for the link and writes its placeholder comment, followed by trail, to bfr.
// When nt is null nothing is registered: an error comment followed by prefix, text and trail is written instead.
public void Make_holder(Bry_bfr bfr, Xoa_ttl nt, byte[] text, byte[][] query, byte[] trail, byte[] prefix) {
  if (nt == null) {
    // Fail gracefully
    bfr.Add_str_a7("<!-- ERROR -->").Add(prefix).Add(text).Add(trail);
    return;
  }
  // Separate the link trail from the rest of the link
  // list( $inside, $trail ) = Linker::splitTrail( $trail );
  byte[] inside = Bry_.Empty;
  byte[] entry_text = tmp.Add_bry_many(prefix, text, inside).To_bry_and_clear();
  Xomw_link_holder_item entry = new Xomw_link_holder_item(nt, entry_text, query);
  boolean is_external = false; // $nt->isExternal()
  if (is_external) {
    // Use a globally unique ID to keep the objects mergable
    // $key = $this->parent->nextLinkID();
    // $this->interwikis[$key] = $entry;
    // $retVal = "<!--IWLINK $key-->{$trail}";
  }
  else {
    int key = link_id++;
    internals.Add(key, entry);
    // "<!--LINK $ns:$key-->{$trail}";
    bfr.Add(Bry__link__bgn);
    bfr.Add_int_variable(key);
    bfr.Add(Gfh_tag_.Comm_end);
    bfr.Add(trail);
  }
}
// Test helper: registers a holder item for ttl / capt with an empty query under the next key.
// Nothing is written to any buffer.
public void Test__add(Xoa_ttl ttl, byte[] capt) {
  Xomw_link_holder_item item = new Xomw_link_holder_item(ttl, capt, Bry_.Ary_empty);
  int key = link_id;
  link_id = key + 1;
  internals.Add(key, item);
}
// Replaces link placeholders in pbfr with rendered links.
// Only internal links are handled; interwiki replacement is not ported. pctx is currently unused.
public void Replace(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) {
this.Replace_internal(pbfr);
// $this->replaceInterwiki( $text );
}
// Scans the source buffer for "<!--LINK n-->" placeholders and writes the text to the target buffer
// with each known placeholder replaced by its rendered link.
// Placeholders whose key is missing, non-numeric or was never registered are left in the text unchanged
// (instead of failing on the array lookup / null item).
private void Replace_internal(Xomw_parser_bfr pbfr) {
	if (internals.Len() == 0)
		return;
	// $colours = [];
	// $linkCache = LinkCache::singleton();
	// $output = $this->parent->getOutput();
	// $linkRenderer = $this->parent->getLinkRenderer();
	// $linkcolour_ids = [];
	// SKIP:Replace_internals does db lookup to identify redlinks;
	// Construct search and replace arrays
	Bry_bfr src_bfr = pbfr.Src();
	byte[] src = src_bfr.Bfr();
	int src_bgn = 0;
	int src_end = src_bfr.Len();
	Bry_bfr bfr = pbfr.Trg();
	// NOTE: src / src_end are captured before Switch because Switch clears the old source bfr
	pbfr.Switch();
	int cur = src_bgn;
	int prv = 0;
	// keys are handed out sequentially from 0 (see Make_holder / Test__add), so valid keys are [0, Len)
	int key_max = internals.Len();
	while (true) {
		int link_bgn = Bry_find_.Find_fwd(src, Bry__link__bgn, cur, src_end);
		if (link_bgn == Bry_find_.Not_found) {
			// no more placeholders; copy the remainder
			bfr.Add_mid(src, prv, src_end);
			break;
		}
		int key_bgn = link_bgn + Bry__link__bgn.length;
		int key_end = Bry_find_.Find_fwd_while_num(src, key_bgn, src_end);
		int link_key = Bry_.To_int_or(src, key_bgn, key_end, -1);
		Xomw_link_holder_item item = (link_key >= 0 && link_key < key_max) ? internals.Get_by(link_key) : null;
		if (item == null) {
			// not one of our placeholders: keep the text as-is and resume searching after the prefix
			// (prv is left alone so the skipped bytes are copied by the next Add_mid)
			cur = key_bgn;
			continue;
		}
		// $pdbk = $entry['pdbk'];
		// $title = $entry['title'];
		// $query = isset( $entry['query'] ) ? $entry['query'] : [];
		// $key = "$ns:$index";
		// $searchkey = "<!--LINK $key-->";
		// $displayText = $entry['text'];
		// if ( isset( $entry['selflink'] ) ) {
		// $replacePairs[$searchkey] = Linker::makeSelfLinkObj( $title, $displayText, $query );
		// continue;
		// }
		// if ( $displayText === '' ) {
		// $displayText = null;
		// } else {
		// $displayText = new HtmlArmor( $displayText );
		// }
		// if ( !isset( $colours[$pdbk] ) ) {
		// $colours[$pdbk] = 'new';
		// }
		// $attribs = [];
		// if ( $colours[$pdbk] == 'new' ) {
		// $linkCache->addBadLinkObj( $title );
		// $output->addLink( $title, 0 );
		// $link = $linkRenderer->makeBrokenLink(
		// $title, $displayText, $attribs, $query
		// );
		// } else {
		// $link = $linkRenderer->makePreloadedLink(
		// $title, $displayText, $colours[$pdbk], $attribs, $query
		// );
		// }
		// copy the text before the placeholder, then the rendered link
		bfr.Add_mid(src, prv, link_bgn);
		link_renderer.Make_preloaded_link(bfr, item.Title(), item.Text(), Bry_.Empty, extra_atrs, Bry_.Empty);
		// skip the key and the closing "-->"; clamp in case the placeholder is truncated at end of text
		// NOTE(review): the closing "-->" is assumed, not verified
		cur = key_end + Gfh_tag_.Comm_end_len;
		if (cur > src_end) cur = src_end;
		prv = cur;
	}
}
// private void Replace_internal__db() {
// // Generate query
// $lb = new LinkBatch();
// $lb->setCaller( __METHOD__ );
//
// foreach ( $this->internals as $ns => $entries ) {
// foreach ( $entries as $entry ) {
// /** @var Title $title */
// $title = $entry['title'];
// $pdbk = $entry['pdbk'];
//
// # Skip invalid entries.
// # Result will be ugly, but prevents crash.
// if ( is_null( $title ) ) {
// continue;
// }
//
// # Check if it's a static known link, e.g. interwiki
// if ( $title->isAlwaysKnown() ) {
// $colours[$pdbk] = '';
// } elseif ( $ns == NS_SPECIAL ) {
// $colours[$pdbk] = 'new';
// } else {
// $id = $linkCache->getGoodLinkID( $pdbk );
// if ( $id != 0 ) {
// $colours[$pdbk] = $linkRenderer->getLinkClasses( $title );
// $output->addLink( $title, $id );
// $linkcolour_ids[$id] = $pdbk;
// } elseif ( $linkCache->isBadLink( $pdbk ) ) {
// $colours[$pdbk] = 'new';
// } else {
// # Not in the link cache, add it to the query
// $lb->addObj( $title );
// }
// }
// }
// }
// if ( !$lb->isEmpty() ) {
// $fields = array_merge(
// LinkCache::getSelectFields(),
// [ 'page_namespace', 'page_title' ]
// );
//
// $res = $dbr->select(
// 'page',
// $fields,
// $lb->constructSet( 'page', $dbr ),
// __METHOD__
// );
//
// # Fetch data and form into an associative array
// # non-existent = broken
// foreach ( $res as $s ) {
// $title = Title::makeTitle( $s->page_namespace, $s->page_title );
// $pdbk = $title->getPrefixedDBkey();
// $linkCache->addGoodLinkObjFromRow( $title, $s );
// $output->addLink( $title, $s->page_id );
// $colours[$pdbk] = $linkRenderer->getLinkClasses( $title );
// // add id to the extension todolist
// $linkcolour_ids[$s->page_id] = $pdbk;
// }
// unset( $res );
// }
// if ( count( $linkcolour_ids ) ) {
// // pass an array of page_ids to an extension
// Hooks::run( 'GetLinkColours', [ $linkcolour_ids, &$colours ] );
// }
//
// # Do a second query for different language variants of links and categories
// if ( $wgContLang->hasVariants() ) {
// $this->doVariants( $colours );
// }
// }
private static final byte[] Bry__link__bgn = Bry_.new_a7("<!--LINK ");
}
// Growable array of link-holder items indexed directly by link key.
class Xomw_link_holder_list {
	private static final int Ary_max__init = 128;
	private int ary_len = 0, ary_max = Ary_max__init;
	private Xomw_link_holder_item[] ary = new Xomw_link_holder_item[Ary_max__init];
	// Number of Add calls since the last Clear.
	public int Len() {return ary_len;}
	// Empties the list. A fresh array is always allocated so that
	// (a) ary_max stays in sync with the real array length (previously the array shrank to 128 while ary_max kept
	//     the grown size, so the next Add at key >= 128 overran the array), and
	// (b) items from a previous parse can no longer be returned by Get_by.
	public void Clear() {
		ary_len = 0;
		ary_max = Ary_max__init;
		ary = new Xomw_link_holder_item[Ary_max__init];
	}
	// Stores item at index key, growing the array by doubling until key fits.
	public void Add(int key, Xomw_link_holder_item item) {
		if (key >= ary_max) {
			int new_max = ary_max * 2;
			while (key >= new_max)	// a single doubling is not enough if key jumps ahead
				new_max *= 2;
			ary = (Xomw_link_holder_item[])Array_.Resize(ary, new_max);
			ary_max = new_max;
		}
		ary[key] = item;
		ary_len++;
	}
	// Returns the item stored under key, or null when key is out of range or nothing was stored there.
	public Xomw_link_holder_item Get_by(int key) {
		return (key < 0 || key >= ary_max) ? null : ary[key];
	}
}
// Immutable record for one held link: its title, display text and query.
class Xomw_link_holder_item {
	private final Xoa_ttl title;
	private final byte[] text;
	private final byte[][] query;

	public Xomw_link_holder_item(Xoa_ttl title, byte[] text, byte[][] query) {
		this.title = title;
		this.text = text;
		this.query = query;
	}

	// Link target
	public Xoa_ttl Title() {
		return title;
	}
	// Display text
	public byte[] Text() {
		return text;
	}
	// Prefixed db key, taken from the title
	public byte[] Pdbk() {
		return title.Get_prefixed_db_key();
	}
	// Query passed at creation
	public byte[][] Query() {
		return query;
	}
}

@ -0,0 +1,45 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
import org.junit.*; import gplx.core.tests.*;
import gplx.xowa.mws.linkers.*;
// Unit test for Xomw_link_holders placeholder replacement.
public class Xomw_link_holders__tst {
private final Xomw_link_holders__fxt fxt = new Xomw_link_holders__fxt();
// One registered link (title "A", caption "a") gets key 0; "<!--LINK 0-->" must be replaced by its anchor.
@Test public void Replace__basic() {
fxt.Init__add("A", "a");
fxt.Test__replace("a <!--LINK 0--> b", "a <a href='/wiki/A' title='A'>a</a> b");
}
}
// Test fixture: owns a Xomw_link_holders instance, a parser bfr and a wiki used to parse titles.
class Xomw_link_holders__fxt {
private final Xomw_link_holders holders = new Xomw_link_holders(new Xomw_link_renderer(), Bry_bfr_.New());
private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr();
private final Xowe_wiki wiki;
// when true, expected strings are passed through Gfh_utl.Replace_apos before comparing
// (presumably so tests can write ' instead of escaped " -- confirm against Gfh_utl); never set to false in this class
private boolean apos = true;
public Xomw_link_holders__fxt() {
Xoae_app app = Xoa_app_fxt.Make__app__edit();
this.wiki = Xoa_app_fxt.Make__wiki__edit(app);
}
// Registers a link with the given title and caption; keys are assigned in call order starting at 0.
public void Init__add(String ttl, String capt) {
holders.Test__add(wiki.Ttl_parse(Bry_.new_u8(ttl)), Bry_.new_u8(capt));
}
// Loads src into the parser bfr, runs Replace and compares the result with expd.
public void Test__replace(String src, String expd) {
if (apos) expd = gplx.langs.htmls.Gfh_utl.Replace_apos(expd);
holders.Replace(new Xomw_parser_ctx(), pbfr.Init(Bry_.new_u8(src)));
Gftest.Eq__str(expd, pbfr.Rslt().To_str_and_clear());
}
}

@ -0,0 +1,27 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
// Output-type ids for the parser; each comment names the MediaWiki entry point the type corresponds to.
public class Xomw_output_type {
public static final byte
Tid__html = 1 // like parse()
, Tid__wiki = 2 // like preSaveTransform()
, Tid__preprocess = 3 // like preprocess()
, Tid__msg = 3 // NOTE(review): same value as Tid__preprocess; looks intentional (MediaWiki's OT_MSG appears to share OT_PREPROCESS's value) -- confirm
, Tid__plain = 4 // like extractSections() - portions of the original are returned unchanged.
;
}

@ -0,0 +1,257 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
import gplx.core.btries.*; import gplx.core.net.*;
import gplx.xowa.mws.parsers.prepros.*; import gplx.xowa.mws.parsers.headings.*;
import gplx.xowa.mws.parsers.quotes.*; import gplx.xowa.mws.parsers.tables.*; import gplx.xowa.mws.parsers.hrs.*; import gplx.xowa.mws.parsers.nbsps.*;
import gplx.xowa.mws.parsers.lnkes.*; import gplx.xowa.mws.parsers.lnkis.*;
import gplx.xowa.mws.utls.*; import gplx.xowa.mws.linkers.*;
public class Xomw_parser {
private final Xomw_parser_ctx pctx = new Xomw_parser_ctx();
private final Xomw_table_wkr table_wkr;
private final Xomw_hr_wkr hr_wkr = new Xomw_hr_wkr();
private final Xomw_lnke_wkr lnke_wkr;
private final Xomw_nbsp_wkr nbsp_wkr = new Xomw_nbsp_wkr();
private final Xomw_block_level_pass block_wkr = new Xomw_block_level_pass();
private final Xomw_heading_wkr heading_wkr = new Xomw_heading_wkr();
private final Xomw_link_renderer link_renderer = new Xomw_link_renderer();
private final Xomw_link_holders holders;
private final Xomw_heading_cbk__html heading_wkr_cbk;
private final Btrie_slim_mgr protocols_trie;
private final Btrie_rv trv = new Btrie_rv();
private int marker_index = 0;
// private final Xomw_prepro_wkr prepro_wkr = new Xomw_prepro_wkr();
public Xomw_strip_state Strip_state() {return strip_state;} private final Xomw_strip_state strip_state = new Xomw_strip_state();
public Xomw_sanitizer Sanitizer() {return sanitizer;} private final Xomw_sanitizer sanitizer = new Xomw_sanitizer();
public Xomw_linker Linker() {return linker;} private final Xomw_linker linker = new Xomw_linker();
public Bry_bfr Tmp() {return tmp;} private final Bry_bfr tmp = Bry_bfr_.New();
public Xomw_quote_wkr Quote_wkr() {return quote_wkr;} private final Xomw_quote_wkr quote_wkr;
public Xomw_lnki_wkr Lnki_wkr() {return lnki_wkr;} private final Xomw_lnki_wkr lnki_wkr;
public boolean Output_type__wiki() {return output_type__wiki;} private final boolean output_type__wiki = false;
// Wires up the workers. The protocols trie and the link holders are created first because
// lnki_wkr receives both, and holders shares this parser's tmp bfr and link_renderer.
public Xomw_parser() {
this.protocols_trie = Xomw_parser.Protocols__dflt();
this.holders = new Xomw_link_holders(link_renderer, tmp);
this.table_wkr = new Xomw_table_wkr(this);
this.quote_wkr = new Xomw_quote_wkr(this);
this.lnke_wkr = new Xomw_lnke_wkr(this);
this.lnki_wkr = new Xomw_lnki_wkr(this, holders, link_renderer, protocols_trie);
this.heading_wkr_cbk = new Xomw_heading_cbk__html();
}
// Passes wiki-specific data to the workers: the language's link-trail trie to the linker,
// the protocols trie to the external-link worker, and the wiki itself to the internal-link worker.
public void Init_by_wiki(Xowe_wiki wiki) {
linker.Init_by_wiki(wiki.Lang().Lnki_trail_mgr().Trie());
lnke_wkr.Init_by_wiki(protocols_trie);
lnki_wkr.Init_by_wiki(wiki);
}
// First parsing pass: loads text into pbfr and runs, in order, the table, hr, heading, internal-link,
// quote and external-link workers, then strips the NOPARSE markers.
// Each worker reads and writes through pbfr. Steps that are not ported yet are kept as PHP comments.
public void Internal_parse(Xomw_parser_bfr pbfr, byte[] text) {
pbfr.Init(text);
// $origText = text;
// MW.HOOK:ParserBeforeInternalParse
// if ($frame) {
// use frame depth to infer how include/noinclude tags should be handled
// depth=0 means this is the top-level document; otherwise it's an included document
// boolean for_inclusion = false;
// if (!$frame->depth) {
// $flag = 0;
// } else {
// $flag = Parser::PTD_FOR_INCLUSION;
// }
// text = prepro_wkr.Preprocess_to_xml(text, for_inclusion);
// text = $frame->expand($dom);
// } else {
// // if $frame is not provided, then use old-style replaceVariables
// text = $this->replaceVariables(text);
// }
// MW.HOOK:InternalParseBeforeSanitize
// text = Sanitizer::removeHTMLtags(
// text,
// [ &$this, 'attributeStripCallback' ],
// false,
// array_keys($this->mTransparentTagHooks),
// [],
// [ &$this, 'addTrackingCategory' ]
// );
// MW.HOOK:InternalParseBeforeLinks
// Tables need to come after variable replacement for things to work
// properly; putting them before other transformations should keep
// exciting things like link expansions from showing up in surprising
// places.
table_wkr.Do_table_stuff(pctx, pbfr);
hr_wkr.Replace_hrs(pctx, pbfr);
// text = $this->doDoubleUnderscore(text);
heading_wkr.Do_headings(pctx, pbfr, heading_wkr_cbk);
lnki_wkr.Replace_internal_links(pctx, pbfr);
quote_wkr.Do_all_quotes(pctx, pbfr);
lnke_wkr.Replace_external_links(pctx, pbfr);
// replaceInternalLinks may sometimes leave behind
// absolute URLs, which have to be masked to hide them from replaceExternalLinks
// external links are done at this point, so the NOPARSE markers can be removed
Xomw_parser_bfr_.Replace(pbfr, Bry__marker__noparse, Bry_.Empty);
// $text = $this->doMagicLinks($text);
// $text = $this->formatHeadings($text, $origText, $isMain);
}
// Second parsing pass over the output of Internal_parse: unstrips general markers, handles nbsp,
// builds block-level structure, replaces link holders, unstrips nowiki and general markers again,
// and normalizes char references.
// line_start is forwarded to the block-level pass; is_main is currently unused.
// Tidy-related steps are not ported and are kept as PHP comments.
public void Internal_parse_half_parsed(Xomw_parser_bfr pbfr, boolean is_main, boolean line_start) {
strip_state.Unstrip_general(pbfr);
// MW.HOOK:ParserAfterUnstrip
// Clean up special characters, only run once, next-to-last before doBlockLevels
nbsp_wkr.Do_nbsp(pctx, pbfr);
block_wkr.Do_block_levels(pctx, pbfr, line_start);
lnki_wkr.Replace_link_holders(pctx, pbfr);
// The input doesn't get language converted if
// a) It's disabled
// b) Content isn't converted
// c) It's a conversion table
// d) it is an interface message (which is in the user language)
// if ( !( $this->mOptions->getDisableContentConversion()
// || isset( $this->mDoubleUnderscores['nocontentconvert'] ) )
// ) {
// if ( !$this->mOptions->getInterfaceMessage() ) {
// // The position of the convert() call should not be changed. it
// // assumes that the links are all replaced and the only thing left
// // is the <nowiki> mark.
// $text = $this->getConverterLanguage()->convert( $text );
// }
// }
strip_state.Unstrip_nowiki(pbfr);
// MW.HOOK:ParserBeforeTidy
// $text = $this->replaceTransparentTags( $text );
// general markers are unstripped a second time here, after the nowiki pass
strip_state.Unstrip_general(pbfr);
sanitizer.Normalize_char_references(pbfr);
// if ( MWTidy::isEnabled() ) {
// if ( $this->mOptions->getTidy() ) {
// $text = MWTidy::tidy( $text );
// }
// }
// else {
// attempt to sanitize at least some nesting problems
// (T4702 and quite a few others)
// $tidyregs = [
// // ''Something [http://www.cool.com cool''] -->
// // <i>Something</i><a href="http://www.cool.com"..><i>cool></i></a>
// '/(<([bi])>)(<([bi])>)?([^<]*)(<\/?a[^<]*>)([^<]*)(<\/\\4>)?(<\/\\2>)/' =>
// '\\1\\3\\5\\8\\9\\6\\1\\3\\7\\8\\9',
// // fix up an anchor inside another anchor, only
// // at least for a single single nested link (T5695)
// '/(<a[^>]+>)([^<]*)(<a[^>]+>[^<]*)<\/a>(.*)<\/a>/' =>
// '\\1\\2</a>\\3</a>\\1\\4</a>',
// // fix div inside inline elements- doBlockLevels won't wrap a line which
// // contains a div, so fix it up here; replace
// // div with escaped text
// '/(<([aib]) [^>]+>)([^<]*)(<div([^>]*)>)(.*)(<\/div>)([^<]*)(<\/\\2>)/' =>
// '\\1\\3&lt;div\\5&gt;\\6&lt;/div&gt;\\8\\9',
// // remove empty italic or bold tag pairs, some
// // introduced by rules above
// '/<([bi])><\/\\1>/' => '',
// ];
// $text = preg_replace(
// array_keys( $tidyregs ),
// array_values( $tidyregs ),
// $text );
// }
// MW.HOOK:ParserAfterTidy
}
// Inserts the strip-marker prefix + "NOPARSE" in front of every url protocol found in src[src_bgn..src_end).
// Two calling modes:
// * trg == null ("called by bry"): returns the armored bytes; when nothing matched, returns src itself
//   if the range covers all of src, else a copy of the range
// * trg != null: appends the (armored or unchanged) range to trg and returns null
// FIX: text between matches is now copied to the output (it was dropped before), and in byte[] mode the
// working bfr is created once instead of being replaced on every match (which discarded earlier output).
public byte[] Armor_links(Bry_bfr trg, byte[] src, int src_bgn, int src_end) {
	// PORTED:preg_replace( '/\b((?i)' . $this->mUrlProtocols . ')/', self::MARKER_PREFIX . "NOPARSE$1", $text )
	// NOTE(review): the regex's \b word-boundary check is not applied here
	int cur = src_bgn;
	int prv = cur;
	boolean dirty = false;
	boolean called_by_bry = trg == null;
	while (true) {
		// exit if EOS
		if (cur >= src_end) {
			// if dirty, add rest of String
			if (dirty)
				trg.Add_mid(src, prv, src_end);
			break;
		}
		// check if cur matches protocol
		Object protocol_obj = protocols_trie.Match_at(trv, src, cur, src_end);
		// no match; continue
		if (protocol_obj == null) {
			cur++;
			continue;
		}
		// match; add to bfr
		byte[] protocol_bry = (byte[])protocol_obj;
		if (trg == null) trg = Bry_bfr_.New();	// byte[] mode: lazily create the bfr on the first match only
		trg.Add_mid(src, prv, cur);				// text since the previous match
		// NOTE(review): this writes the trie's stored key, not the matched source bytes; if the trie is
		// case-insensitive the original casing of the protocol is lost -- confirm whether that matters
		trg.Add_bry_many(Xomw_strip_state.Bry__marker__bgn, Bry__noparse, protocol_bry);
		dirty = true;
		cur += protocol_bry.length;
		prv = cur;
	}
	if (called_by_bry) {
		if (dirty)
			return trg.To_bry_and_clear();
		// nothing matched: avoid a copy when the range is the whole array
		if (src_bgn == 0 && src_end == src.length)
			return src;
		return Bry_.Mid(src, src_bgn, src_end);
	}
	// bfr mode: when dirty, everything was already written inside the loop
	if (!dirty)
		trg.Add_mid(src, src_bgn, src_end);
	return null;
}
// Registers text in the strip state under a newly generated marker and returns that marker.
// The marker is: marker-begin + "-item-" + running index + marker-end; the index is incremented per call.
public byte[] Insert_strip_item(byte[] text) {
tmp.Add_bry_many(Xomw_strip_state.Bry__marker__bgn, Bry__strip_state_item);
tmp.Add_int_variable(marker_index);
tmp.Add(Xomw_strip_state.Bry__marker__end);
byte[] marker = tmp.To_bry_and_clear();
marker_index++;
// stored as a "general" item
strip_state.Add_general(marker, text);
return marker;
}
private static final byte[] Bry__strip_state_item = Bry_.new_a7("-item-"), Bry__noparse = Bry_.new_a7("NOPARSE");
private static final byte[] Bry__marker__noparse = Bry_.Add(Xomw_strip_state.Bry__marker__bgn, Bry__noparse);
// Builds the default protocol trie: one entry per item in Gfo_protocol_itm.Ary(),
// with the protocol text serving as both key and value.
public static Btrie_slim_mgr Protocols__dflt() {
	Btrie_slim_mgr trie = Btrie_slim_mgr.ci_a7();
	Gfo_protocol_itm[] itms = Gfo_protocol_itm.Ary();
	int itms_len = itms.length;
	for (int i = 0; i < itms_len; i++) {
		byte[] protocol = itms[i].Text_bry(); // EX: "https://"
		trie.Add_obj(protocol, protocol);
	}
	return trie;
}
}

@ -0,0 +1,72 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
import org.junit.*;
// End-to-end test for Xomw_parser: runs both parse passes over a small wikitext sample.
public class Xomw_parser__tst {
private final Xomw_parser__fxt fxt = new Xomw_parser__fxt();
// Covers headings, paragraphs, hr, a table, italics, an external link, an internal link and nbsp handling.
@Test public void Basic() {
fxt.Test__parse(String_.Concat_lines_nl_skip_last
( "== heading_1 =="
, "para_1"
, "== heading_2 =="
, "para_2"
, "-----"
, "{|"
, "|-"
, "|a"
, "|}"
, "''italics''"
, "[https://a.org b]"
, "[[A|abc]]"
, "a »b« &#160;!important c"
), String_.Concat_lines_nl_skip_last
( "<h2> heading_1 </h2>"
, "<p>para_1"
, "</p>"
, "<h2> heading_2 </h2>"
, "<p>para_2"
, "</p>"
, "<hr />"
, "<table>"
, ""
, "<tr>"
, "<td>a"
, "</td></tr></table>"
, "<p><i>italics</i>"
, "<a class=\"external text\" rel=\"nofollow\" href=\"https://a.org\">b</a>"
, "<a href=\"/wiki/A\" title=\"A\">abc</a>"
, "a&#160;»b«&#160; !important c"
, "</p>"
));
}
}
// Test fixture: a parser initialized against a test wiki, plus a reusable parser bfr.
class Xomw_parser__fxt {
private final Xomw_parser mgr = new Xomw_parser();
private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr();
public Xomw_parser__fxt() {
Xoae_app app = Xoa_app_fxt.Make__app__edit();
Xowe_wiki wiki = Xoa_app_fxt.Make__wiki__edit(app);
mgr.Init_by_wiki(wiki);
}
// Runs Internal_parse then Internal_parse_half_parsed on src_str and compares the output with expd line by line.
public void Test__parse(String src_str, String expd) {
byte[] src_bry = Bry_.new_u8(src_str);
mgr.Internal_parse(pbfr, src_bry);
mgr.Internal_parse_half_parsed(pbfr, true, true);
Tfds.Eq_str_lines(expd, pbfr.Rslt().To_str_and_clear(), src_str);
}
}

@ -0,0 +1,48 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
// Pair of bfrs that swap roles: a worker reads from Src, writes to Trg, then calls Switch so that
// its output becomes the next worker's Src.
public class Xomw_parser_bfr { // manages 2 bfrs to eliminate multiple calls to new memory allocations ("return bfr.To_bry_and_clear()")
private final Bry_bfr bfr_1 = Bry_bfr_.New(), bfr_2 = Bry_bfr_.New();
private Bry_bfr src, trg;
public Xomw_parser_bfr() {
this.src = bfr_1;
this.trg = bfr_2;
}
// bfr holding the current text (input of the next worker)
public Bry_bfr Src() {return src;}
// bfr the next worker writes to
public Bry_bfr Trg() {return trg;}
// final result; same bfr as Src, since every completed step ends with Switch
public Bry_bfr Rslt() {return src;}
// Resets both bfrs and loads text into Src. Returns this for chaining.
public Xomw_parser_bfr Init(byte[] text) {
// resize each bfr once by guessing that html_len = text_len * 2
// NOTE(review): this runs on every Init call, not once per instance; for empty text html_len is 0 -- confirm Bry_bfr.Resize handles both cases
int text_len = text.length;
int html_len = text_len * 2;
src.Resize(html_len);
trg.Resize(html_len);
// clear and add
src.Clear();
trg.Clear();
src.Add(text);
return this;
}
// Swaps Src and Trg, then clears the new Trg (the old Src).
// Callers that still need the old source must capture its byte[] and length before calling this.
public void Switch() {
Bry_bfr tmp = src;
this.src = trg;
this.trg = tmp;
trg.Clear();
}
}

@ -0,0 +1,69 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
// Static helpers for Xomw_parser_bfr.
public class Xomw_parser_bfr_ {
	// Replaces every occurrence of find in pbfr's source bfr with repl.
	// The result is written to the target bfr and the bfrs are switched only if at least one replacement was made.
	public static void Replace(Xomw_parser_bfr pbfr, byte[] find, byte[] repl) {
		// XO.PBFR
		Bry_bfr src_bfr = pbfr.Src();
		byte[] src = src_bfr.Bfr();
		int src_bgn = 0;
		int src_end = src_bfr.Len();
		Bry_bfr bfr = pbfr.Trg();
		if (Replace(bfr, Bool_.N, src, src_bgn, src_end, find, repl) != null)
			pbfr.Switch();
	}
	// Core replace over src[src_bgn..src_end).
	// * bfr == null ("called by bry"): returns the replaced bytes, or src / a copy of the range when nothing matched
	// * bfr != null: writes to bfr when something matched and returns Bry_.Empty as a "dirty" flag;
	//   returns null when nothing matched (the range is copied to bfr only if lone_bfr is true)
	// FIXES:
	// * the search is bounded by src_end (src is a bfr's backing array and may hold stale bytes past src_end)
	// * cur advances to just past the match (it was only incremented by find.length, so the same match was found again)
	// * repl is actually written
	// * in byte[] mode the bfr is created once, not on every match
	// * an empty find is treated as "no match" instead of looping forever
	private static byte[] Replace(Bry_bfr bfr, boolean lone_bfr, byte[] src, int src_bgn, int src_end, byte[] find, byte[] repl) {
		boolean dirty = false;
		int cur = src_bgn;
		boolean called_by_bry = bfr == null;
		int find_len = find.length;
		while (find_len > 0) {
			int find_bgn = Bry_find_.Find_fwd(src, find, cur, src_end);
			if (find_bgn == Bry_find_.Not_found) {
				// copy the tail only if something was written before it
				if (dirty)
					bfr.Add_mid(src, cur, src_end);
				break;
			}
			if (bfr == null) bfr = Bry_bfr_.New();
			bfr.Add_mid(src, cur, find_bgn);	// text before the match
			if (repl != null)
				bfr.Add(repl);					// replacement
			cur = find_bgn + find_len;			// resume after the match
			dirty = true;
		}
		if (dirty) {
			if (called_by_bry)
				return bfr.To_bry_and_clear();
			else
				return Bry_.Empty;	// non-null signals "bfr was written"
		}
		else {
			if (called_by_bry) {
				// nothing replaced: avoid a copy when the range is the whole array
				if (src_bgn == 0 && src_end == src.length)
					return src;
				else
					return Bry_.Mid(src, src_bgn, src_end);
			}
			else {
				if (lone_bfr)
					bfr.Add_mid(src, src_bgn, src_end);
				return null;
			}
		}
	}
}

@ -15,13 +15,13 @@ GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
package gplx.xowa.parsers.mws; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
import gplx.xowa.parsers.htmls.*;
import gplx.xowa.parsers.mws.utils.*;
import gplx.xowa.parsers.uniqs.*;
public class Xomw_parser_ctx { public class Xomw_parser_ctx {
public Xomw_sanitizer_mgr Sanitizer() {return sanitizer;} private final Xomw_sanitizer_mgr sanitizer = new Xomw_sanitizer_mgr(); public Xoa_ttl Page_title() {return page_title;} private Xoa_ttl page_title;
public Xop_uniq_mgr Uniq_mgr() {return uniq_mgr;} private final Xop_uniq_mgr uniq_mgr = new Xop_uniq_mgr();
public void Init_by_page(Xoa_ttl page_title) {
this.page_title = page_title;
}
public static final int Pos__bos = -1; public static final int Pos__bos = -1;
} }

@ -0,0 +1,139 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
import gplx.core.btries.*;
public class Xomw_strip_state { // REF.MW:/parser/StripState.php
private final Btrie_slim_mgr trie = Btrie_slim_mgr.cs();
private final Btrie_rv trv = new Btrie_rv();
private final Bry_bfr tmp_1 = Bry_bfr_.New();
private final Bry_bfr tmp_2 = Bry_bfr_.New();
private boolean tmp_2_used = false;
private int general_len, nowiki_len;
// Removes all strip items and resets the per-type counters and the tmp_2 in-use flag.
public void Clear() {
trie.Clear();
general_len = nowiki_len = 0;
tmp_2_used = false;
}
// Registers val under marker as a "general" item.
public void Add_general(byte[] marker, byte[] val) {Add_item(Tid__general, marker, val);}
// Registers val under marker as a "nowiki" item.
public void Add_nowiki (byte[] marker, byte[] val) {Add_item(Tid__nowiki, marker, val);}
// Registers val under marker with the given type and bumps that type's counter.
// Any tid other than Tid__general is counted as nowiki.
public void Add_item(byte tid, byte[] marker, byte[] val) {
trie.Add_obj(marker, new Xomw_strip_item(tid, marker, val));
if (tid == Tid__general)
general_len++;
else
nowiki_len++;
}
// byte[] variants: return text with markers of the given type replaced by their stored values.
public byte[] Unstrip_general(byte[] text) {return Unstrip(Tid__general, text);}
public byte[] Unstrip_nowiki (byte[] text) {return Unstrip(Tid__nowiki , text);}
public byte[] Unstrip_both (byte[] text) {return Unstrip(Tid__both , text);}
// Returns the same text instance when nothing was replaced.
// Uses the shared tmp_1 bfr as the output bfr.
public byte[] Unstrip(byte tid, byte[] text) {
boolean dirty = Unstrip(tid, tmp_1, text, 0, text.length);
return dirty ? tmp_1.To_bry_and_clear() : text;
}
// Parser-bfr variants: unstrip pbfr's source into its target.
public void Unstrip_general(Xomw_parser_bfr pbfr) {Unstrip(Tid__general, pbfr);}
public void Unstrip_nowiki (Xomw_parser_bfr pbfr) {Unstrip(Tid__nowiki , pbfr);}
public void Unstrip_both (Xomw_parser_bfr pbfr) {Unstrip(Tid__both , pbfr);}
// Reads from pbfr.Src, writes to pbfr.Trg, and switches the bfrs only when something was replaced.
// Returns true if something was replaced.
private boolean Unstrip(byte tid, Xomw_parser_bfr pbfr) {
// XO.PBFR
Bry_bfr src_bfr = pbfr.Src();
byte[] src = src_bfr.Bfr();
boolean dirty = Unstrip(tid, pbfr.Trg(), src, 0, src_bfr.Len());
if (dirty)
pbfr.Switch();
return dirty;
}
// Core unstrip: scans src[src_bgn..src_end) for registered markers whose type is included in tid and
// writes the text to trg with each such marker replaced by its (recursively unstripped) value.
// Returns true if at least one marker was replaced; when false, nothing has been written to trg.
// FIX: the early exit now checks every type requested by tid. Previously Tid__both returned early whenever
// there were no general items, even if nowiki items existed.
private boolean Unstrip(byte tid, Bry_bfr trg, byte[] src, int src_bgn, int src_end) {
	// exit early if no items exist for any of the requested types
	boolean has_general = (tid & Tid__general) == Tid__general && general_len > 0;
	boolean has_nowiki  = (tid & Tid__nowiki ) == Tid__nowiki  && nowiki_len  > 0;
	if (!has_general && !has_nowiki)
		return false;
	int cur = src_bgn;
	int prv = cur;
	boolean dirty = false;
	// loop over each src char
	while (true) {
		// EOS: exit
		if (cur == src_end) {
			if (dirty) // add remainder if dirty
				trg.Add_mid(src, prv, src_end);
			break;
		}
		// check if current pos matches strip state
		Object o = trie.Match_at(trv, src, cur, src_end);
		if (o != null) { // match
			Xomw_strip_item item = (Xomw_strip_item)o;
			byte item_tid = item.Tid();
			if ((tid & item_tid) == item_tid) { // check if types match
				// get bfr for recursion: reuse tmp_2 unless an outer call on the stack already holds it
				Bry_bfr nested_bfr = null;
				boolean tmp_2_release = false;
				if (tmp_2_used) {
					nested_bfr = Bry_bfr_.New();
				}
				else {
					nested_bfr = tmp_2;
					tmp_2_used = true;
					tmp_2_release = true;
				}
				// recurse: the stored value may itself contain markers
				byte[] item_val = item.Val();
				if (Unstrip(tid, nested_bfr, item_val, 0, item_val.length))
					item_val = nested_bfr.To_bry_and_clear();
				if (tmp_2_release)
					tmp_2_used = false;
				// add to trg
				trg.Add_mid(src, prv, cur);
				trg.Add(item_val);
				// update vars
				dirty = true;
				cur += item.Key().length;
				prv = cur;
				continue;
			}
		}
		cur++;
	}
	return dirty;
}
public static final String Str__marker_bgn = "\u007f'\"`UNIQ-";
public static final byte[]
Bry__marker__bgn = Bry_.new_a7(Str__marker_bgn)
, Bry__marker__end = Bry_.new_a7("-QINU`\"'\u007f")
;
public static final byte Tid__general = 1, Tid__nowiki = 2, Tid__both = 3;
}
// Immutable strip-state entry: the item type, the marker it is registered under, and the stripped value.
class Xomw_strip_item {
	private final byte tid;
	private final byte[] key;
	private final byte[] val;

	public Xomw_strip_item(byte tid, byte[] key, byte[] val) {
		this.tid = tid;
		this.key = key;
		this.val = val;
	}

	// Item type
	public byte Tid() {
		return tid;
	}
	// Marker
	public byte[] Key() {
		return key;
	}
	// Value that replaces the marker
	public byte[] Val() {
		return val;
	}
}

@ -0,0 +1,44 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
import org.junit.*; import gplx.core.tests.*;
// Unit tests for Xomw_strip_state.Unstrip.
public class Xomw_strip_state__tst {
private final Xomw_strip_state__fxt fxt = new Xomw_strip_state__fxt();
// A general item is left alone by a nowiki unstrip and replaced by a general or a both unstrip.
@Test public void Basic() {
fxt.Init__add (Xomw_strip_state.Tid__general, "\u007f'\"`UNIQ-key-1-QINU`\"'\u007f", "val-1");
fxt.Test__nostrip(Xomw_strip_state.Tid__nowiki , "a \u007f'\"`UNIQ-key-1-QINU`\"'\u007f b");
fxt.Test__unstrip(Xomw_strip_state.Tid__general, "a \u007f'\"`UNIQ-key-1-QINU`\"'\u007f b", "a val-1 b");
fxt.Test__unstrip(Xomw_strip_state.Tid__both , "a \u007f'\"`UNIQ-key-1-QINU`\"'\u007f b", "a val-1 b");
}
// An item whose value is another marker is resolved recursively.
@Test public void Recurse() {
fxt.Init__add (Xomw_strip_state.Tid__general, "\u007f'\"`UNIQ-key-1-QINU`\"'\u007f", "val-1");
fxt.Init__add (Xomw_strip_state.Tid__general, "\u007f'\"`UNIQ-key-2-QINU`\"'\u007f", "\u007f'\"`UNIQ-key-1-QINU`\"'\u007f");
fxt.Test__unstrip(Xomw_strip_state.Tid__general, "a \u007f'\"`UNIQ-key-2-QINU`\"'\u007f b", "a val-1 b");
}
}
// Test fixture wrapping a single Xomw_strip_state with String-based helpers.
class Xomw_strip_state__fxt {
private final Xomw_strip_state strip_state = new Xomw_strip_state();
// Registers val under marker with the given type.
public void Init__add(byte tid, String marker, String val) {
strip_state.Add_item(tid, Bry_.new_u8(marker), Bry_.new_u8(val));
}
// Asserts that unstripping src with tid leaves it unchanged.
public void Test__nostrip(byte tid, String src) {Test__unstrip(tid, src, src);}
// Asserts that unstripping src with tid yields expd.
public void Test__unstrip(byte tid, String src, String expd) {
byte[] actl = strip_state.Unstrip(tid, Bry_.new_u8(src));
Gftest.Eq__str(expd, String_.new_u8(actl));
}
}
}

@ -0,0 +1,84 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers.doubleunders; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
// Partial port of the parser's double-underscore handling (__TOC__, __NOTOC__, ...).
// Most of the MediaWiki logic is still present only as PHP comments.
class Xomw_doubleunder_wkr {
public boolean show_toc;
public boolean force_toc_position;
public boolean output__no_gallery ;
public Xomw_doubleunder_data doubleunderscore_data = new Xomw_doubleunder_data();
// Stub: currently only resets the flags; it does not inspect or modify text.
private void Match_and_remove(byte[] text, Xomw_doubleunder_data doubleunderscore_data) {
doubleunderscore_data.Reset();
}
// Sets the toc / gallery flags for the given text.
// NOTE(review): the "__TOC__ present" check is commented out, so show_toc and force_toc_position are set to true
// unconditionally and the no_toc branch below can never clear show_toc -- confirm this is an intentional placeholder.
public void Do_double_underscore(byte[] text) {
// The position of __TOC__ needs to be recorded
// $mw = MagicWord::get( 'toc' );
// if ( $mw->match( $text ) ) {
this.show_toc = true;
this.force_toc_position = true;
// Set a placeholder. At the end we'll fill it in with the TOC.
// $text = $mw->replace( '<!--MWTOC-->', $text, 1 );
// Only keep the first one.
// $text = $mw->replace( '', $text );
// }
// Now match and remove the rest of them
// $mwa = MagicWord::getDoubleUnderscoreArray();
Match_and_remove(text, doubleunderscore_data);
if (doubleunderscore_data.no_gallery) {
output__no_gallery = true;
}
if (doubleunderscore_data.no_toc && !force_toc_position) {
this.show_toc = false;
}
// the three branches below are intentionally empty until the tracking-category / index-policy calls are ported
if ( doubleunderscore_data.hidden_cat
// && $this->mTitle->getNamespace() == NS_CATEGORY
) {
//$this->addTrackingCategory( 'hidden-category-category' );
}
// (T10068) Allow control over whether robots index a page.
// __INDEX__ always overrides __NOINDEX__, see T16899
if (doubleunderscore_data.no_index // && $this->mTitle->canUseNoindex()
) {
// $this->mOutput->setIndexPolicy( 'noindex' );
// $this->addTrackingCategory( 'noindex-category' );
}
if (doubleunderscore_data.index //&& $this->mTitle->canUseNoindex()
) {
// $this->mOutput->setIndexPolicy( 'index' );
// $this->addTrackingCategory( 'index-category' );
}
// Cache all double underscores in the database
// foreach ( $this->mDoubleUnderscores as $key => $val ) {
// $this->mOutput->setProperty( $key, '' );
// }
}
}
/** Plain flag holder: one boolean per double-underscore switch read by Xomw_doubleunder_wkr. */
class Xomw_doubleunder_data {
	public boolean no_gallery;
	public boolean no_toc;
	public boolean hidden_cat;
	public boolean no_index;
	public boolean index;
	/** Clears every flag back to false. */
	public void Reset() {
		this.no_gallery = false;
		this.no_toc     = false;
		this.hidden_cat = false;
		this.no_index   = false;
		this.index      = false;
	}
}

@ -0,0 +1,22 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers.headings; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
/**
 * Callback contract used together with Xomw_heading_wkr; both methods receive the parser context
 * and the worker itself so an implementation can read the worker's current positions.
 */
public interface Xomw_heading_cbk {
	/** NOTE(review): presumably invoked each time the worker identifies a heading -- confirm against Xomw_heading_wkr.Parse. */
	void On_hdr_seen(Xomw_parser_ctx pctx, Xomw_heading_wkr wkr);

	/** NOTE(review): presumably invoked once when the worker reaches the end of the source -- confirm against Xomw_heading_wkr.Parse. */
	void On_src_done(Xomw_parser_ctx pctx, Xomw_heading_wkr wkr);
}

@ -15,10 +15,14 @@ GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
package gplx.xowa.parsers.mws.wkrs; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*; package gplx.xowa.mws.parsers.headings; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
public class Xomw_hdr_cbk__html implements Xomw_hdr_cbk { public class Xomw_heading_cbk__html implements Xomw_heading_cbk {
public Bry_bfr Bfr() {return bfr;} private final Bry_bfr bfr = Bry_bfr_.New(); public Bry_bfr Bfr() {return bfr;} private Bry_bfr bfr;
public void On_hdr_seen(Xomw_parser_ctx pctx, Xomw_hdr_wkr wkr) { public Xomw_heading_cbk__html Bfr_(Bry_bfr bfr) {
this.bfr = bfr;
return this;
}
public void On_hdr_seen(Xomw_parser_ctx pctx, Xomw_heading_wkr wkr) {
// add from txt_bgn to hdr_bgn; EX: "abc\n==A==\n"; "\n==" seen -> add "abc" // add from txt_bgn to hdr_bgn; EX: "abc\n==A==\n"; "\n==" seen -> add "abc"
byte[] src = wkr.Src(); byte[] src = wkr.Src();
int hdr_bgn = wkr.Hdr_bgn(), txt_bgn = wkr.Txt_bgn(); int hdr_bgn = wkr.Hdr_bgn(), txt_bgn = wkr.Txt_bgn();
@ -34,7 +38,7 @@ public class Xomw_hdr_cbk__html implements Xomw_hdr_cbk {
bfr.Add_mid(wkr.Src(), wkr.Hdr_lhs_end(), wkr.Hdr_rhs_bgn()); bfr.Add_mid(wkr.Src(), wkr.Hdr_lhs_end(), wkr.Hdr_rhs_bgn());
bfr.Add(Tag__rhs).Add_int_digits(1, hdr_num).Add(Byte_ascii.Angle_end_bry); // </h2> bfr.Add(Tag__rhs).Add_int_digits(1, hdr_num).Add(Byte_ascii.Angle_end_bry); // </h2>
} }
public void On_src_done(Xomw_parser_ctx pctx, Xomw_hdr_wkr wkr) { public void On_src_done(Xomw_parser_ctx pctx, Xomw_heading_wkr wkr) {
// add from txt_bgn to EOS; // add from txt_bgn to EOS;
byte[] src = wkr.Src(); byte[] src = wkr.Src();
int txt_bgn = wkr.Txt_bgn(), src_end = wkr.Src_end(); int txt_bgn = wkr.Txt_bgn(), src_end = wkr.Src_end();

@ -15,11 +15,11 @@ GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
package gplx.xowa.parsers.mws.wkrs; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*; package gplx.xowa.mws.parsers.headings; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
import gplx.core.btries.*; import gplx.xowa.langs.*; import gplx.core.btries.*; import gplx.xowa.langs.*;
public class Xomw_hdr_wkr { public class Xomw_heading_wkr {
private Xomw_parser_ctx pctx; private Xomw_parser_ctx pctx;
private Xomw_hdr_cbk cbk; private Xomw_heading_cbk cbk;
public byte[] Src() {return src;} private byte[] src; public byte[] Src() {return src;} private byte[] src;
public int Src_end() {return src_end;} private int src_end; public int Src_end() {return src_end;} private int src_end;
public int Txt_bgn() {return txt_bgn;} private int txt_bgn; public int Txt_bgn() {return txt_bgn;} private int txt_bgn;
@ -30,13 +30,27 @@ public class Xomw_hdr_wkr {
public int Hdr_lhs_end() {return hdr_lhs_end;} private int hdr_lhs_end; public int Hdr_lhs_end() {return hdr_lhs_end;} private int hdr_lhs_end;
public int Hdr_rhs_bgn() {return hdr_rhs_bgn;} private int hdr_rhs_bgn; public int Hdr_rhs_bgn() {return hdr_rhs_bgn;} private int hdr_rhs_bgn;
public int Hdr_rhs_end() {return hdr_rhs_end;} private int hdr_rhs_end; public int Hdr_rhs_end() {return hdr_rhs_end;} private int hdr_rhs_end;
public void Parse(Xomw_parser_ctx pctx, byte[] src, int src_bgn, int src_end, Xomw_hdr_cbk cbk) { // REF.MW: /includes/parser/Parser.php|doHeadings public void Do_headings(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr, Xomw_heading_cbk__html cbk) {
Bry_bfr src_bfr = pbfr.Src();
byte[] src_bry = src_bfr.Bfr();
int src_end = src_bfr.Len();
cbk.Bfr_(pbfr.Trg());
pbfr.Switch();
Parse(pctx, src_bry, 0, src_end, cbk);
}
public void Parse(Xomw_parser_ctx pctx, byte[] src, int src_bgn, int src_end, Xomw_heading_cbk cbk) { // REF.MW: /includes/parser/Parser.php|doHeadings
// init members // init members
this.pctx = pctx; this.pctx = pctx;
this.src = src; this.src = src;
this.src_end = src_end; this.src_end = src_end;
this.cbk = cbk; this.cbk = cbk;
// PORTED:
// for ( $i = 6; $i >= 1; --$i ) {
// $h = str_repeat( '=', $i );
// $text = preg_replace( "/^$h(.+)$h\\s*$/m", "<h$i>\\1</h$i>", $text );
// }
// do loop // do loop
int pos = src_bgn; int pos = src_bgn;
this.txt_bgn = pos == Xomw_parser_ctx.Pos__bos ? 0 : pos; this.txt_bgn = pos == Xomw_parser_ctx.Pos__bos ? 0 : pos;
@ -92,7 +106,3 @@ public class Xomw_hdr_wkr {
return nl_rhs; return nl_rhs;
} }
} }
// for ( $i = 6; $i >= 1; --$i ) {
// $h = str_repeat( '=', $i );
// $text = preg_replace( "/^$h(.+)$h\\s*$/m", "<h$i>\\1</h$i>", $text );
// }

@ -15,10 +15,10 @@ GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
package gplx.xowa.parsers.mws.wkrs; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*; package gplx.xowa.mws.parsers.headings; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
import org.junit.*; import org.junit.*;
public class Xomw_hdr_wkr_tst { public class Xomw_heading_wkr__tst {
private final Xomw_hdr_wkr_fxt fxt = new Xomw_hdr_wkr_fxt(); private final Xomw_heading_wkr__fxt fxt = new Xomw_heading_wkr__fxt();
@Test public void Basic() { @Test public void Basic() {
fxt.Test__parse("==A==" , "<h2>A</h2>"); fxt.Test__parse("==A==" , "<h2>A</h2>");
fxt.Test__parse("abc\n==A==\ndef" , "abc\n<h2>A</h2>\ndef"); fxt.Test__parse("abc\n==A==\ndef" , "abc\n<h2>A</h2>\ndef");
@ -28,10 +28,11 @@ public class Xomw_hdr_wkr_tst {
fxt.Test__parse("abc\n==" , "abc\n<h1></h1>"); fxt.Test__parse("abc\n==" , "abc\n<h1></h1>");
} }
} }
class Xomw_hdr_wkr_fxt { class Xomw_heading_wkr__fxt {
private final Xomw_hdr_wkr wkr = new Xomw_hdr_wkr(); private final Xomw_heading_wkr wkr = new Xomw_heading_wkr();
private final Xomw_hdr_cbk__html cbk = new Xomw_hdr_cbk__html(); private final Xomw_heading_cbk__html cbk = new Xomw_heading_cbk__html().Bfr_(Bry_bfr_.New());
private final Xomw_parser_ctx pctx = new Xomw_parser_ctx(); private final Xomw_parser_ctx pctx = new Xomw_parser_ctx();
public void Test__parse(String src_str, String expd) { public void Test__parse(String src_str, String expd) {
byte[] src_bry = Bry_.new_u8(src_str); byte[] src_bry = Bry_.new_u8(src_str);
wkr.Parse(pctx, src_bry, -1, src_bry.length, cbk); wkr.Parse(pctx, src_bry, -1, src_bry.length, cbk);

@ -0,0 +1,81 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers.hrs; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
import gplx.langs.phps.utls.*;
/**
 * Replaces wikitext horizontal rules (a run of dashes at the start of the text or right after "\n")
 * with "<hr />".
 *
 * NOTE(review): the referenced MediaWiki regex '/(^|\n)-----*&#47;' is "four dashes followed by zero or
 * more dashes", but the tokens below require FIVE dashes, so "----" is left untouched. The existing
 * unit test "Not_found" pins the five-dash behaviour, so it is intentionally not changed here -- confirm.
 */
public class Xomw_hr_wkr {// THREAD.UNSAFE: caching for repeated calls
	private Bry_bfr bfr;	// target buffer of the current Replace_hrs call; shared with Replace_hr
	/**
	 * Scans pbfr.Src() and writes the converted text to pbfr.Trg().
	 * Nothing is written and the buffers are not switched when no rule is found.
	 */
	public void Replace_hrs(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) {	// REF.MW: text = preg_replace('/(^|\n)-----*/', '\\1<hr />', text);
		// XO.PBFR
		Bry_bfr src_bfr = pbfr.Src();
		byte[] src = src_bfr.Bfr();
		int src_bgn = 0;
		int src_end = src_bfr.Len();
		this.bfr = pbfr.Trg();
		boolean dirty = false;

		// do separate check for "-----" at start of String;
		// the length guard keeps the comparison inside [src_bgn, src_end): src comes from Bfr() while the
		// logical length comes from Len(), so bytes at or after src_end must never be compared
		int cur = src_bgn;
		if (	src_end - src_bgn >= Len__wtxt__hr__bos
			&&	Bry_.Eq(src, src_bgn, src_bgn + Len__wtxt__hr__bos, Bry__wtxt__hr__bos)) {
			cur = Replace_hr(Bool_.N, src, src_bgn, src_end, src_bgn, Len__wtxt__hr__bos);
			dirty = true;
		}

		// loop
		while (true) {
			// find next "\n-----"
			int find_bgn = Bry_find_.Find_fwd(src, Bry__wtxt__hr__mid, cur, src_end);

			// nothing found; exit
			if (find_bgn == Bry_find_.Not_found) {
				// only flush the tail if something was already written; otherwise src is left as the result
				if (dirty) {
					bfr.Add_mid(src, cur, src_end);
				}
				break;
			}

			// something found
			cur = Replace_hr(Bool_.Y, src, cur, src_end, find_bgn, Len__wtxt__hr__mid);
			dirty = true;
		}
		if (dirty)
			pbfr.Switch();
	}
	/**
	 * Writes one "<hr />" to bfr and returns the position just after the dash run.
	 * @param mid      true for a "\n-----" match: the text before the match and the "\n" are written first
	 * @param cur      start of the not-yet-written text
	 * @param find_bgn position of the matched token
	 * @param tkn_len  length of the matched token
	 */
	private int Replace_hr(boolean mid, byte[] src, int cur, int src_end, int find_bgn, int tkn_len) {
		// something found; add to bfr
		if (mid) {
			bfr.Add_mid(src, cur, find_bgn);	// add everything before "\n-----"
			bfr.Add_byte_nl();
		}
		bfr.Add(Bry__html__hr);

		// set dirty / cur and continue
		cur = find_bgn + tkn_len;
		cur = Bry_find_.Find_fwd_while(src, cur, src_end, Byte_ascii.Dash);	// gobble up trailing "-"; the "*" in "-----*" from the regex above
		return cur;
	}
	private static final byte[]
	  Bry__wtxt__hr__mid = Bry_.new_a7("\n-----")
	, Bry__wtxt__hr__bos = Bry_.new_a7("-----")
	, Bry__html__hr      = Bry_.new_a7("<hr />")
	;
	private static final int
	  Len__wtxt__hr__mid = Bry__wtxt__hr__mid.length
	, Len__wtxt__hr__bos = Bry__wtxt__hr__bos.length
	;
}

@ -0,0 +1,36 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers.hrs; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
import org.junit.*;
/** Unit tests for Xomw_hr_wkr: each case feeds wikitext and compares the produced text. */
public class Xomw_hr_wkr__tst {
	private final Xomw_hr_wkr__fxt fxt = new Xomw_hr_wkr__fxt();
	// five dashes after a newline become a rule
	@Test public void Basic() {
		fxt.Test__parse("a\n-----b", "a\n<hr />b");
	}
	// extra dashes are swallowed by the same rule
	@Test public void Extend() {
		fxt.Test__parse("a\n------b", "a\n<hr />b");
	}
	// four dashes are left untouched
	@Test public void Not_found() {
		fxt.Test__parse("a\n----b", "a\n----b");
	}
	// dashes at the very start of the text
	@Test public void Bos() {
		fxt.Test__parse("-----a", "<hr />a");
	}
	// one rule at the start and one after a newline
	@Test public void Bos_and_mid() {
		fxt.Test__parse("-----a\n-----b", "<hr />a\n<hr />b");
	}
}
/** Test fixture: runs Xomw_hr_wkr over a String and checks the resulting text. */
class Xomw_hr_wkr__fxt {
	private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr();
	private final Xomw_hr_wkr wkr = new Xomw_hr_wkr();
	/** Parses src_str and compares the result with expd; src_str doubles as the failure message. */
	public void Test__parse(String src_str, String expd) {
		byte[] src_bry = Bry_.new_u8(src_str);
		Xomw_parser_ctx pctx = new Xomw_parser_ctx();
		Xomw_parser_bfr inited_pbfr = pbfr.Init(src_bry);
		wkr.Replace_hrs(pctx, inited_pbfr);
		String actl = pbfr.Rslt().To_str_and_clear();
		Tfds.Eq_str_lines(expd, actl, src_str);
	}
}

@ -0,0 +1,282 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers.lnkes; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
import gplx.core.btries.*; import gplx.core.primitives.*;
import gplx.langs.phps.utls.*;
import gplx.xowa.mws.htmls.*;
// TODO.XO: add proto-rel; EX: [//a.org b]
/**
 * Converts bracketed external links ("[https://a.org text]") into anchors via Xomw_linker.
 * Port of MediaWiki's Parser::replaceExternalLinks; the regex is replaced by a hand-written scan
 * that uses three tries (invalid url chars, space chars, invalid text chars).
 */
public class Xomw_lnke_wkr {// THREAD.UNSAFE: caching for repeated calls
	private final Bry_bfr tmp;
	private Btrie_slim_mgr protocol_trie; private final Btrie_rv trv = new Btrie_rv();
	private int autonumber;
	private final Xomw_linker linker;
	private final Xomwh_atr_mgr attribs = new Xomwh_atr_mgr();
	public Xomw_lnke_wkr(Xomw_parser mgr) {
		this.tmp = mgr.Tmp();
		this.linker = mgr.Linker();
	}
	/** Sets the trie of accepted protocols; must be called before Replace_external_links. */
	public void Init_by_wiki(Btrie_slim_mgr protocol_trie) {
		this.protocol_trie = protocol_trie;
	}
	/**
	 * Reads pbfr.Src(), writes the converted text to pbfr.Trg() and switches the buffers.
	 * Text that does not form a valid bracketed link is copied through unchanged.
	 * Links without text are numbered "[1]", "[2]", ... per call.
	 */
	public void Replace_external_links(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) {
		// XO.PBFR
		Bry_bfr src_bfr = pbfr.Src();
		byte[] src = src_bfr.Bfr();
		int src_bgn = 0;
		int src_end = src_bfr.Len();
		Bry_bfr bfr = pbfr.Trg();
		pbfr.Switch();

		int cur = src_bgn;
		this.autonumber = 1;

		// find regex
		int prv = 0;	// start of text not yet copied to bfr; equals cur at the top of every iteration
		while (true) {
			// PORTED.BGN: $bits = preg_split( $this->mExtLinkBracketedRegex, $text, -1, PREG_SPLIT_DELIM_CAPTURE );
			// $this->mExtLinkBracketedRegex = '/\[(((?i)' . $this->mUrlProtocols . ')' .
			//	self::EXT_LINK_ADDR .
			//	self::EXT_LINK_URL_CLASS . '*)\p{Zs}*([^\]\\x00-\\x08\\x0a-\\x1F]*?)\]/Su';
			//
			// REGEX: "[" + "protocol" + "url-char"* + "space"* + "text"* + "]";
			//   protocol  -> ((?i)' . $this->mUrlProtocols . ')      -> "http://", "HTTps://"
			//   url-char* -> (EXT_LINK_ADDR . EXT_LINK_URL_CLASS*)  -> "255.255.255.255", "a.b.c"; NOTE: "http:///" is valid
			//   space*    -> \p{Zs}*
			//   text      -> ([^\]\\x00-\\x08\\x0a-\\x1F]*?)         -> "abcd"
			// NOTE: /S=extra analysis of pattern /u = unicode support; REF.MW:http://php.net/manual/en/reference.pcre.pattern.modifiers.php

			// Simplified expression to match an IPv4 or IPv6 address, or
			// at least one character of a host name (embeds EXT_LINK_URL_CLASS)
			// static final EXT_LINK_ADDR = '(?:[0-9.]+|\\[(?i:[0-9a-f:.]+)\\]|[^][<>"\\x00-\\x20\\x7F\p{Zs}])';
			//
			// REGEX: "IPv4" | "IPv6" | "url-char"
			//   IPv4     -> [0-9.]+                           -> "255."
			//   IPv6     -> \\[(?i:[0-9a-f:.]+)\\]            -> "2001:"
			//   url-char -> [^][<>"\\x00-\\x20\\x7F\p{Zs}]    -> "abcde"

			// Constants needed for external link processing
			// Everything except bracket, space, or control characters
			// \p{Zs} is unicode 'separator, space' category. It covers the space 0x20
			// as well as U+3000 is IDEOGRAPHIC SPACE for T21052
			// static final EXT_LINK_URL_CLASS = '[^][<>"\\x00-\\x20\\x7F\p{Zs}]';
			//
			// REGEX: NOT [ "symbols" | "control" | "whitespace" ]
			//   symbols    -> ^][<>"
			//   control    -> \\x00-\\x20\\x7F
			//   whitespace -> \p{Zs}

			// search for "["
			int lnke_bgn = Bry_find_.Find_fwd(src, Byte_ascii.Brack_bgn, cur, src_end);
			if (lnke_bgn == Bry_find_.Not_found) {
				bfr.Add_mid(src, cur, src_end);
				break;	// no more "["; stop
			}

			// check for protocol; EX: "https://"
			cur = lnke_bgn + 1;
			int url_bgn = cur;
			Object protocol_bry = protocol_trie.Match_at(trv, src, cur, src_end);
			if (protocol_bry == null) {
				bfr.Add_mid(src, prv, cur);
				prv = cur;
				continue;	// unknown protocol; ignore "["
			}
			cur += ((byte[])protocol_bry).length;

			// check for one-or-more url chars; [^][<>"\\x00-\\x20\\x7F\p{Zs}]
			// the loop is bounded by src_end so an unterminated link ("[https://a.org") never reads past the text
			int domain_bgn = cur;
			while (cur < src_end) {
				byte b = src[cur];
				Object url_char_byte = invalid_url_chars_trie.Match_at_w_b0(trv, b, src, cur, src_end);
				if (url_char_byte != null) break;
				cur += gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(b);
			}
			if (cur > src_end) cur = src_end;	// a truncated multi-byte char must not push cur past the text
			if (cur - domain_bgn == 0) {
				bfr.Add_mid(src, prv, cur);
				prv = cur;
				continue;	// no chars found; invalid; EX: "[https://"abcde"]"
			}
			int url_end = cur;

			// skip ws (if any)
			while (cur < src_end) {
				Object space_byte = space_chars_trie.Match_at(trv, src, cur, src_end);
				if (space_byte == null) break;
				cur += ((Int_obj_val)space_byte).Val();
			}

			// get text (if any)
			int text_bgn = -1, text_end = -1;
			while (cur < src_end) {
				byte b = src[cur];
				Object invalid_text_char = invalid_text_chars_trie.Match_at_w_b0(trv, b, src, cur, src_end);
				if (invalid_text_char != null) break;
				if (text_bgn == -1) text_bgn = cur;
				cur += gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(b);
				text_end = cur;
			}
			if (cur > src_end) cur = src_end;

			// check for "]"; reaching the end of the text without one is also "not a link"
			if (cur >= src_end || src[cur] != Byte_ascii.Brack_end) {
				bfr.Add_mid(src, prv, cur);
				prv = cur;
				continue;
			}
			cur++;
			// PORTED.END: $bits = preg_split( $this->mExtLinkBracketedRegex, $text, -1, PREG_SPLIT_DELIM_CAPTURE );

			// The characters '<' and '>' (which were escaped by
			// removeHTMLtags()) should not be included in
			// URLs, per RFC 2396.
			// TODO.XO:
			//$m2 = [];
			//if ( preg_match( '/&(lt|gt);/', $url, $m2, PREG_OFFSET_CAPTURE ) ) {
			//	$text = substr( $url, $m2[0][1] ) . ' ' . $text;
			//	$url = substr( $url, 0, $m2[0][1] );
			//}

			// If the link text is an image URL, replace it with an <img> tag
			// This happened by accident in the original parser, but some people used it extensively
			// TODO.XO:
			//$img = $this->maybeMakeExternalImage( $text );
			//if ( $img !== false ) {
			//	$text = $img;
			//}
			//
			//$dtrail = '';

			// Set linktype for CSS - if URL==text, link is essentially free
			// NOTE(review): the PHP comment describes a url == text comparison; this port keys on "text missing"
			// instead, and that value is always replaced by "autonumber" below -- confirm whether "free" should be ported
			boolean text_missing = text_bgn == -1;
			byte[] link_type = text_missing ? Link_type__free : Link_type__text;
			byte[] text;
			// No link text, e.g. [http://domain.tld/some.link]
			if (text_missing) {
				// Autonumber; EX: "[123]"
				tmp.Add_byte(Byte_ascii.Brack_bgn);
				tmp.Add_int_variable(autonumber++);	// TODO.XO:$langObj->formatNum( ++$this->mAutonumber );
				tmp.Add_byte(Byte_ascii.Brack_end);
				link_type = Link_type__autonumber;
				// use the generated "[n]" as the link text and empty tmp so it does not grow across links
				text = tmp.To_bry_and_clear();
			}
			else {
				text = Bry_.Mid(src, text_bgn, text_end);
				// Have link text, e.g. [http://domain.tld/some.link text]s
				// Check for trail
				// TODO.XO:
				// list( $dtrail, $trail ) = Linker::splitTrail( $trail );
			}

			// TODO.XO:
			// $text = $this->getConverterLanguage()->markNoConversion( $text );

			// TODO.XO:
			// $url = Sanitizer::cleanUrl( $url );

			bfr.Add_mid(src, prv, lnke_bgn);
			prv = cur;

			// Use the encoded URL
			// This means that users can paste URLs directly into the text
			// Funny characters like ö aren't valid in URLs anyway
			// This was changed in August 2004
			// TODO.XO:getExternalLinkAttribs
			attribs.Clear();
			linker.Make_external_link(bfr, Bry_.Mid(src, url_bgn, url_end), text, Bool_.N, link_type, attribs, Bry_.Empty);

			// Register link in the output Object.
			// Replace unnecessary URL escape codes with the referenced character
			// This prevents spammers from hiding links from the filters
			// $pasteurized = self::normalizeLinkUrl( $url );
			// $this->mOutput->addExternalLink( $pasteurized );
		}
	}
//	public function getExternalLinkAttribs( $url ) {
//		$attribs = [];
//		$rel = self::getExternalLinkRel( $url, $this->mTitle );
//
//		$target = $this->mOptions->getExternalLinkTarget();
//		if ( $target ) {
//			$attribs['target'] = $target;
//			if ( !in_array( $target, [ '_self', '_parent', '_top' ] ) ) {
//				// T133507. New windows can navigate parent cross-origin.
//				// Including noreferrer due to lacking browser
//				// support of noopener. Eventually noreferrer should be removed.
//				if ( $rel !== '' ) {
//					$rel .= ' ';
//				}
//				$rel .= 'noreferrer noopener';
//			}
//		}
//		$attribs['rel'] = $rel;
//		return $attribs;
//	}
//	public static function getExternalLinkRel( $url = false, $title = null ) {
//		global $wgNoFollowLinks, $wgNoFollowNsExceptions, $wgNoFollowDomainExceptions;
//		$ns = $title ? $title->getNamespace() : false;
//		if ( $wgNoFollowLinks && !in_array( $ns, $wgNoFollowNsExceptions )
//			&& !wfMatchesDomainList( $url, $wgNoFollowDomainExceptions )
//		) {
//			return 'nofollow';
//		}
//		return null;
//	}
	private static final byte[]
	  Link_type__free       = Bry_.new_a7("free")
	, Link_type__text       = Bry_.new_a7("text")
	, Link_type__autonumber = Bry_.new_a7("autonumber")
	;
	private static final Btrie_slim_mgr
	  invalid_url_chars_trie  = New__invalid_url_chars_trie()
	, space_chars_trie        = New__space_chars_trie()
	, invalid_text_chars_trie = New__invalid_text_chars_trie()
	;
	/** Builds the trie of bytes / sequences that terminate a url. */
	private static Btrie_slim_mgr New__invalid_url_chars_trie() {	// REGEX:[^][<>"\\x00-\\x20\\x7F\p{Zs}]; NOTE: val is just a marker
		Btrie_slim_mgr rv = Btrie_slim_mgr.cs();
		rv.Add_str_byte__many(Byte_.Zero, "[", "]", "<", ">", "\"");
		for (byte i = 0; i < 33; i++) {	// x00-x20
			rv.Add_bry_byte(new byte[] {i}, Byte_.Zero);
		}
		rv.Add_bry_byte(Bry_.New_by_ints(127), Byte_.Zero);				// x7F
		rv.Add_bry_byte(Bry_.New_by_ints(227, 128, 128), Byte_.Zero);	// \p{Zs} // e3 80 80; https://phabricator.wikimedia.org/T21052
		return rv;
	}
	/** Builds the trie of space sequences allowed between url and text. */
	private static Btrie_slim_mgr New__space_chars_trie() {	// REGEX:\p{Zs}; NOTE: val is key.length
		Btrie_slim_mgr rv = Btrie_slim_mgr.cs();
		New__trie_itm__by_len(rv, 32);
		New__trie_itm__by_len(rv, 227, 128, 128);	// \p{Zs} // e3 80 80; https://phabricator.wikimedia.org/T21052
		return rv;
	}
	/** Builds the trie of bytes that terminate the link text. */
	private static Btrie_slim_mgr New__invalid_text_chars_trie() {	// REGEX:([^\]\\x00-\\x08\\x0a-\\x1F]*?); NOTE: val is key.length
		Btrie_slim_mgr rv = Btrie_slim_mgr.cs();
		New__trie_itm__by_len(rv, Byte_ascii.Brack_end);
		for (int i = 0; i <= 8; i++) {		// x00-x08
			New__trie_itm__by_len(rv, i);
		}
		for (int i = 10; i <= 31; i++) {	// x0a-x1F
			New__trie_itm__by_len(rv, i);
		}
		return rv;
	}
	/** Adds the byte sequence given as ints to the trie, storing the sequence length as its value. */
	private static void New__trie_itm__by_len(Btrie_slim_mgr mgr, int... ary) {
		mgr.Add_obj(Bry_.New_by_ints(ary), new Int_obj_val(ary.length));
	}
}

@ -0,0 +1,56 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers.lnkes; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
import org.junit.*;
/** Unit tests for Xomw_lnke_wkr: valid links become anchors, malformed ones pass through unchanged. */
public class Xomw_lnke_wkr__tst {
	private final Xomw_lnke_wkr__fxt fxt = new Xomw_lnke_wkr__fxt();
	// url followed by text
	@Test public void Basic() {
		fxt.Test__parse("[https://a.org b]", "<a class='external text' rel='nofollow' href='https://a.org'>b</a>");
	}
	// protocol not recognized
	@Test public void Invaild__protocol() {
		fxt.Test__parse("[httpz:a.org]", "[httpz:a.org]");
	}
	// protocol lacks the slashes
	@Test public void Invaild__protocol_slash() {
		fxt.Test__parse("[https:a.org]", "[https:a.org]");
	}
	// nothing after the protocol
	@Test public void Invaild__urlchars__0() {
		fxt.Test__parse("[https://]", "[https://]");
	}
	// invalid url char directly after the protocol
	@Test public void Invaild__urlchars__bad() {
		fxt.Test__parse("[https://\"]", "[https://\"]");
	}
	// several links separated by plain lines
	@Test public void Many() {
		String src = String_.Concat_lines_nl_apos_skip_last
		( "a"
		, "[https://b.org c]"
		, "d"
		, "[https://e.org f]"
		, "g"
		);
		String expd = String_.Concat_lines_nl_apos_skip_last
		( "a"
		, "<a class='external text' rel='nofollow' href='https://b.org'>c</a>"
		, "d"
		, "<a class='external text' rel='nofollow' href='https://e.org'>f</a>"
		, "g"
		);
		fxt.Test__parse(src, expd);
	}
}
/** Test fixture: wires Xomw_lnke_wkr with the default protocols and compares parsed output. */
class Xomw_lnke_wkr__fxt {
	private final Xomw_lnke_wkr wkr = new Xomw_lnke_wkr(new Xomw_parser());
	private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr();
	private boolean apos = true;	// when true, expected Strings are passed through Gfh_utl.Replace_apos before comparing
	public Xomw_lnke_wkr__fxt() {
		Btrie_slim_mgr_holder: {
			wkr.Init_by_wiki(Xomw_parser.Protocols__dflt());
		}
	}
	/** Parses src_str and compares the result with expd; src_str doubles as the failure message. */
	public void Test__parse(String src_str, String expd) {
		byte[] src_bry = Bry_.new_u8(src_str);
		Xomw_parser_ctx pctx = new Xomw_parser_ctx();
		Xomw_parser_bfr inited_pbfr = pbfr.Init(src_bry);
		wkr.Replace_external_links(pctx, inited_pbfr);
		String expd_str = apos ? gplx.langs.htmls.Gfh_utl.Replace_apos(expd) : expd;
		String actl = pbfr.Rslt().To_str_and_clear();
		Tfds.Eq_str_lines(expd_str, actl, src_str);
	}
}

@ -0,0 +1,462 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers.lnkis; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
import gplx.core.btries.*; import gplx.core.primitives.*;
import gplx.langs.phps.utls.*;
import gplx.xowa.wikis.nss.*; import gplx.xowa.wikis.xwikis.*;
import gplx.xowa.mws.parsers.*; import gplx.xowa.mws.parsers.quotes.*;
import gplx.xowa.mws.htmls.*; import gplx.xowa.mws.linkers.*;
import gplx.xowa.mws.utls.*;
import gplx.xowa.parsers.uniqs.*;
public class Xomw_lnki_wkr {// THREAD.UNSAFE: caching for repeated calls
private final Xomw_link_holders holders;
private final Xomw_linker linker;
private final Xomw_link_renderer link_renderer;
// private final Btrie_slim_mgr protocols_trie;
private final Xomw_quote_wkr quote_wkr;
private final Xomw_strip_state strip_state;
private Xow_wiki wiki;
private Xoa_ttl page_title;
private final Xomw_linker__normalize_subpage_link normalize_subpage_link = new Xomw_linker__normalize_subpage_link();
private final Bry_bfr tmp;
private final Xomw_parser parser;
private final Xomwh_atr_mgr extra_atrs = new Xomwh_atr_mgr();
public Xomw_lnki_wkr(Xomw_parser parser, Xomw_link_holders holders, Xomw_link_renderer link_renderer, Btrie_slim_mgr protocols_trie) {
this.parser = parser;
this.holders = holders;
this.link_renderer = link_renderer;
// this.protocols_trie = protocols_trie;
this.linker = parser.Linker();
this.quote_wkr = parser.Quote_wkr();
this.tmp = parser.Tmp();
this.strip_state = parser.Strip_state();
}
public void Init_by_wiki(Xow_wiki wiki) {
this.wiki = wiki;
if (title_chars_for_lnki == null) {
title_chars_for_lnki = (boolean[])Array_.Clone(Xomw_ttl_utl.Title_chars_valid());
// the % is needed to support urlencoded titles as well
title_chars_for_lnki[Byte_ascii.Hash] = true;
title_chars_for_lnki[Byte_ascii.Percent] = true;
}
}
public void Clear_state() {
holders.Clear();
}
public void Replace_internal_links(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) {
// XO.PBFR
Bry_bfr src_bfr = pbfr.Src();
byte[] src = src_bfr.Bfr();
int src_bgn = 0;
int src_end = src_bfr.Len();
Bry_bfr bfr = pbfr.Trg();
pbfr.Switch();
this.page_title = pctx.Page_title();
Replace_internal_links(bfr, src, src_bgn, src_end);
}
public void Replace_internal_links(Bry_bfr bfr, byte[] src, int src_bgn, int src_end) {
// PORTED: regex for tc move to header; e1 and e1_img moved to code
// split the entire text String on occurrences of [[
int cur = src_bgn;
int prv = cur;
while (true) {
int lnki_bgn = Bry_find_.Find_fwd(src, Bry__wtxt__lnki__bgn, cur, src_end); // $a = StringUtils::explode('[[', ' ' . $s);
if (lnki_bgn == Bry_find_.Not_found) { // no more "[["; stop loop
bfr.Add_mid(src, cur, src_end);
break;
}
cur = lnki_bgn + 2; // 2="[[".length
// IGNORE: handles strange split logic of adding space to String; "$s = substr($s, 1);"
// TODO.XO:lnke_bgn; EX: b[[A]]
// $useLinkPrefixExtension = $this->getTargetLanguage()->linkPrefixExtension();
// $e2 = null;
// if ($useLinkPrefixExtension) {
// // Match the end of a line for a word that's not followed by whitespace,
// // e.g. in the case of 'The Arab al[[Razi]]', 'al' will be matched
// global $wgContLang;
// $charset = $wgContLang->linkPrefixCharset();
// $e2 = "/^((?>.*[^$charset]|))(.+)$/sDu";
// }
// IGNORE: throw new MWException(__METHOD__ . ": \$this->mTitle is null\n");
// $nottalk = !$this->mTitle->isTalkPage();
// TODO.XO:lnke_bgn
byte[] prefix = Bry_.Empty;
//if ($useLinkPrefixExtension) {
// $m = [];
// if (preg_match($e2, $s, $m)) {
// $first_prefix = $m[2];
// } else {
// $first_prefix = false;
// }
//} else {
// $prefix = '';
//}
// IGNORE: "Check for excessive memory usage"
// TODO.XO:lnke_bgn; EX: b[[A]]
//if ($useLinkPrefixExtension) {
// if (preg_match($e2, $s, $m)) {
// $prefix = $m[2];
// $s = $m[1];
// } else {
// $prefix = '';
// }
// // first link
// if ($first_prefix) {
// $prefix = $first_prefix;
// $first_prefix = false;
// }
//}
// PORTED.BGN: if (preg_match($e1, $line, $m)) && else if (preg_match($e1_img, $line, $m))
// NOTE: both e1 and e1_img are effectively the same; e1_img allows nested "[["; EX: "[[A|b[[c]]d]]" will stop at "[[A|b"
int ttl_bgn = cur;
int ttl_end = Xomw_ttl_utl.Find_fwd_while_title(src, cur, src_end, title_chars_for_lnki);
cur = ttl_end;
int capt_bgn = -1, capt_end = -1;
int nxt_lnki = -1;
boolean might_be_img = false;
if (ttl_end > ttl_bgn) { // at least one valid title-char found; check for "|" or "]]" EX: "[[a"
byte nxt_byte = src[ttl_end];
if (nxt_byte == Byte_ascii.Pipe) { // handles lnki with capt ([[A|a]])and lnki with file ([[File:A.png|b|c|d]])
cur = ttl_end + 1;
// find next "[["
nxt_lnki = Bry_find_.Find_fwd(src, Bry__wtxt__lnki__bgn, cur, src_end);
if (nxt_lnki == Bry_find_.Not_found)
nxt_lnki = src_end;
// find end "]]"
capt_bgn = cur;
capt_end = Bry_find_.Find_fwd(src, Bry__wtxt__lnki__end, cur, nxt_lnki);
if (capt_end == Bry_find_.Not_found) {
capt_end = nxt_lnki;
cur = nxt_lnki;
might_be_img = true;
}
else {
cur = capt_end + Bry__wtxt__lnki__end.length;
}
}
else if (Bry_.Match(src, ttl_end, ttl_end + 2, Bry__wtxt__lnki__end)) { // handles simple lnki; EX: [[A]]
cur = ttl_end + 2;
}
else {
ttl_end = -1;
}
}
else
ttl_end = -1;
if (ttl_end == -1) { // either (a) no valid title-chars ("[[<") or (b) title char, but has stray "]" ("[[a]b]]")
// Invalid form; output directly
bfr.Add_mid(src, cur, src_end);
continue;
}
// PORTED.END: if (preg_match($e1, $line, $m)) && else if (preg_match($e1_img, $line, $m))
byte[] text = Bry_.Mid(src, capt_bgn, capt_end);
byte[] trail = Bry_.Empty;
if (!might_be_img) {
// If we get a ] at the beginning of $m[3] that means we have a link that's something like:
// [[Image:Foo.jpg|[http://example.com desc]]] <- having three ] in a row fucks up,
// the real problem is with the $e1 regex
// See T1500.
// Still some problems for cases where the ] is meant to be outside punctuation,
// and no image is in sight. See T4095.
// if ($text !== ''
// && substr($m[3], 0, 1) === ']'
// && strpos($text, '[') !== false
// ) {
// $text .= ']'; // so that replaceExternalLinks($text) works later
// $m[3] = substr($m[3], 1);
// }
// fix up urlencoded title texts
// if (strpos($m[1], '%') !== false) {
// // Should anchors '#' also be rejected?
// $m[1] = str_replace([ '<', '>' ], [ '&lt;', '&gt;' ], rawurldecode($m[1]));
// }
// $trail = $m[3];
}
else {
// Invalid, but might be an image with a link in its caption
// $text = $m[2];
// if (strpos($m[1], '%') !== false) {
// $m[1] = str_replace([ '<', '>' ], [ '&lt;', '&gt;' ], rawurldecode($m[1]));
// }
// $trail = "";
}
byte[] orig_link = Bry_.Mid(src, ttl_bgn, ttl_end);
// TODO.XO: handle "[[http://a.org]]"
// Don't allow @gplx.Internal protected links to pages containing
// PROTO: where PROTO is a valid URL protocol; these
// should be external links.
// if (preg_match('/^(?i:' . $this->mUrlProtocols . ')/', $origLink)) {
// $s .= $prefix . '[[' . $line;
// continue;
// }
byte[] link = orig_link;
boolean no_force = orig_link[0] != Byte_ascii.Colon;
if (!no_force) {
// Strip off leading ':'
link = Bry_.Mid(link, 1);
}
Xoa_ttl nt = wiki.Ttl_parse(link);
// Make subpage if necessary
boolean subpages_enabled = nt.Ns().Subpages_enabled();
if (subpages_enabled) {
Maybe_do_subpage_link(normalize_subpage_link, orig_link, text);
link = normalize_subpage_link.link;
text = normalize_subpage_link.text;
nt = wiki.Ttl_parse(link);
}
// IGNORE: handled in rewrite above
// else {
// link = orig_link;
// }
byte[] unstrip = strip_state.Unstrip_nowiki(link);
if (!Bry_.Eq(unstrip, link))
nt = wiki.Ttl_parse(unstrip);
if (nt == null) {
bfr.Add_mid(src, prv, lnki_bgn + 2); // $s .= $prefix . '[[' . $line;
cur = lnki_bgn + 2;
prv = cur;
continue;
}
Xow_ns ns = nt.Ns();
Xow_xwiki_itm iw = nt.Wik_itm();
if (might_be_img) { // if this is actually an invalid link
if (ns.Id_is_file() && no_force) { // but might be an image
boolean found = false;
// while (true) {
// // look at the next 'line' to see if we can close it there
// a->next();
// next_line = a->current();
// if (next_line === false || next_line === null) {
// break;
// }
// m = explode(']]', next_line, 3);
// if (count(m) == 3) {
// // the first ]] closes the inner link, the second the image
// found = true;
// text .= "[[{m[0]}]]{m[1]}";
// trail = m[2];
// break;
// } else if (count(m) == 2) {
// // if there's exactly one ]] that's fine, we'll keep looking
// text .= "[[{m[0]}]]{m[1]}";
// } else {
// // if next_line is invalid too, we need look no further
// text .= '[[' . next_line;
// break;
// }
// }
if (!found) {
// we couldn't find the end of this imageLink, so output it raw
// but don't ignore what might be perfectly normal links in the text we've examined
Bry_bfr nested = wiki.Utl__bfr_mkr().Get_b128();
this.Replace_internal_links(nested, text, 0, text.length);
nested.Mkr_rls();
bfr.Add(prefix).Add(Bry__wtxt__lnki__bgn).Add(link).Add_byte_pipe().Add(text); // s .= "{prefix}[[link|text";
// note: no trail, because without an end, there *is* no trail
continue;
}
}
else { // it's not an image, so output it raw
bfr.Add(prefix).Add(Bry__wtxt__lnki__bgn).Add(link).Add_byte_pipe().Add(text); // s .= "{prefix}[[link|text";
// note: no trail, because without an end, there *is* no trail
continue;
}
}
boolean was_blank = text.length == 0;
if (was_blank) {
text = link;
}
else {
// T6598 madness. Handle the quotes only if they come from the alternate part
// [[Lista d''e paise d''o munno]] -> <a href="...">Lista d''e paise d''o munno</a>
// [[Criticism of Harry Potter|Criticism of ''Harry Potter'']]
// -> <a href="Criticism of Harry Potter">Criticism of <i>Harry Potter</i></a>
text = quote_wkr.Do_quotes(tmp, text);
}
// Link not escaped by : , create the various objects
// if (no_force && !nt->wasLocalInterwiki()) {
// Interwikis
// if (
// iw && this->mOptions->getInterwikiMagic() && nottalk && (
// Language::fetchLanguageName(iw, null, 'mw') ||
// in_array(iw, wgExtraInterlanguageLinkPrefixes)
// )
// ) {
// T26502: filter duplicates
// if (!isset(this->mLangLinkLanguages[iw])) {
// this->mLangLinkLanguages[iw] = true;
// this->mOutput->addLanguageLink(nt->getFullText());
// }
//
// s = rtrim(s . prefix);
// s .= trim(trail, "\n") == '' ? '': prefix . trail;
// continue;
// }
//
if (ns.Id_is_file()) {
// if (!wfIsBadImage(nt->getDBkey(), this->mTitle)) {
// if (wasblank) {
// // if no parameters were passed, text
// // becomes something like "File:Foo.png",
// // which we don't want to pass on to the
// // image generator
// text = '';
// } else {
// // recursively parse links inside the image caption
// // actually, this will parse them in any other parameters, too,
// // but it might be hard to fix that, and it doesn't matter ATM
// text = this->replaceExternalLinks(text);
// holders->merge(this->replaceInternalLinks2(text));
// }
// // cloak any absolute URLs inside the image markup, so replaceExternalLinks() won't touch them
// s .= prefix . this->armorLinks(
// this->makeImage(nt, text, holders)) . trail;
// continue;
// }
}
else if (ns.Id_is_ctg()) {
bfr.Trim_end_ws(); // s = rtrim(s . "\n"); // T2087
if (was_blank) {
// sortkey = this->getDefaultSort();
}
else {
// sortkey = text;
}
// sortkey = Sanitizer::decodeCharReferences(sortkey);
// sortkey = str_replace("\n", '', sortkey);
// sortkey = this->getConverterLanguage()->convertCategoryKey(sortkey);
// this->mOutput->addCategory(nt->getDBkey(), sortkey);
//
// Strip the whitespace Category links produce, see T2087
// s .= trim(prefix . trail, "\n") == '' ? '' : prefix . trail;
continue;
}
// }
// Self-link checking. For some languages, variants of the title are checked in
// LinkHolderArray::doVariants() to allow batching the existence checks necessary
// for linking to a different variant.
if (!ns.Id_is_special() && nt.Eq_full_db(page_title) && !nt.Has_fragment()) {
bfr.Add(prefix);
linker.Make_self_link_obj(bfr, nt, text, Bry_.Empty, trail, Bry_.Empty);
continue;
}
// NS_MEDIA is a pseudo-namespace for linking directly to a file
// @todo FIXME: Should do batch file existence checks, see comment below
if (ns.Id_is_media()) {
// Give extensions a chance to select the file revision for us
// options = [];
// descQuery = false;
// MW.HOOK:BeforeParserFetchFileAndTitle
// Fetch and register the file (file title may be different via hooks)
// list(file, nt) = this->fetchFileAndTitle(nt, options);
// Cloak with NOPARSE to avoid replacement in replaceExternalLinks
// s .= prefix . this->armorLinks(
// Linker::makeMediaLinkFile(nt, file, text)) . trail;
// continue;
}
// Some titles, such as valid special pages or files in foreign repos, should
// be shown as bluelinks even though they're not included in the page table
// @todo FIXME: isAlwaysKnown() can be expensive for file links; we should really do
// batch file existence checks for NS_FILE and NS_MEDIA
bfr.Add_mid(src, prv, lnki_bgn);
prv = cur;
if (iw == null && nt.Is_always_known()) {
// this->mOutput->addLink(nt);
Make_known_link_holder(bfr, nt, text, trail, prefix);
}
else {
// Links will be added to the output link list after checking
holders.Make_holder(bfr, nt, text, Bry_.Ary_empty, trail, prefix);
}
}
}
// Thin wrapper: delegates to linker.Normalize_subpage_link, supplying this worker's page_title.
// rv     : result holder; the caller in this class reads rv.link and rv.text after the call
// target : raw link target as it appeared in the wikitext
// text   : caption bytes for the link
public void Maybe_do_subpage_link(Xomw_linker__normalize_subpage_link rv, byte[] target, byte[] text) {
linker.Normalize_subpage_link(rv, page_title, target, text);
}
// Thin wrapper: delegates to holders.Replace(pctx, pbfr).
// NOTE(review): presumably this swaps the placeholders written by holders.Make_holder for final link html; confirm against the holders implementation.
public void Replace_link_holders(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) {
holders.Replace(pctx, pbfr);
}
// Renders a link for a title that needs no existence check and appends it, followed by the
// unused part of the trail, to bfr.
//   bfr    : output buffer; content already written by the caller is left untouched
//   nt     : parsed title of the link target
//   text   : caption bytes; when empty, the escaped prefixed text of nt is used instead
//   trail  : text following "]]"; split by linker.Split_trail into the part placed inside the link and the rest
//   prefix : text preceding "[[" that is placed inside the link
public void Make_known_link_holder(Bry_bfr bfr, Xoa_ttl nt, byte[] text, byte[] trail, byte[] prefix) {
	// index 0 is rendered inside the link; index 1 is appended after it
	byte[][] split_trail = linker.Split_trail(trail);
	byte[] inside = split_trail[0];
	trail = split_trail[1];

	// test for emptiness by length: a reference comparison against Bry_.Empty misses
	// zero-length arrays that are not the shared constant
	if (text != null && text.length == 0) {
		text = Bry_.Escape_html(nt.Get_prefixed_text());
	}

	// PORTED:new HtmlArmor( "$prefix$text$inside" )
	tmp.Add_bry_escape_html(prefix);
	tmp.Add_bry_escape_html(text);
	tmp.Add_bry_escape_html(inside);
	text = tmp.To_bry_and_clear();

	// render into tmp (empty again after To_bry_and_clear above) rather than into bfr:
	// draining bfr here would also pull out everything the caller wrote earlier
	// and pass it through Armor_links again on every link
	link_renderer.Make_known_link(tmp, nt, text, extra_atrs, Bry_.Empty);
	byte[] link = tmp.To_bry_and_clear();
	parser.Armor_links(bfr, link, 0, link.length);
	bfr.Add(trail);
}
// Table passed to Xomw_ttl_utl.Find_fwd_while_title when scanning a lnki target.
// NOTE(review): not assigned in this part of the file; presumably initialized during setup -- confirm.
private static boolean[] title_chars_for_lnki;
// Wikitext delimiters that open ("[[") and close ("]]") an internal link.
private static final byte[] Bry__wtxt__lnki__bgn = Bry_.new_a7("[["), Bry__wtxt__lnki__end = Bry_.new_a7("]]");
// $e1 = "/^([{$tc}]+)(?:\\|(.+?))?]](.*)\$/sD";
//
// REGEX: "title-char"(1+) + "pipe + caption"(0-1) + "]]"(exactly 1) + "other chars up to next [["
// title-char -> ([{$tc}]+)
// pipe + caption -> (?:\\|(.+?))?
// ]] -> ]]
// other chars... -> (.*)
// $e1_img = "/^([{$tc}]+)\\|(.*)\$/sD";
//
// REGEX: "title-char"(1+) + "pipe"(exactly 1) + "other chars up to next [["
// title-char -> ([{$tc}]+)
// pipe -> \\|
// other chars... -> (.*)
}

@ -0,0 +1,63 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers.lnkis; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
import org.junit.*;
// Tests for Xomw_lnki_wkr: internal-link ("[[...]]") parsing via the fixture below.
public class Xomw_lnki_wkr__tst {
	private final Xomw_lnki_wkr__fxt fxt = new Xomw_lnki_wkr__fxt();

	// clear worker state before every test
	@Before
	public void init() {
		fxt.Clear();
	}

	// @Test public void Basic() {fxt.Test__parse("[[A]]" , "<!--LINK 0-->");}

	// link without caption is written as a link-holder comment
	@Test
	public void Text() {
		fxt.Test__parse("a [[A]] z", "a <!--LINK 0--> z");
	}

	// link with caption is written as a link-holder comment
	@Test
	public void Capt() {
		fxt.Test__parse("a [[A|a]] z", "a <!--LINK 0--> z");
	}

	// @Test public void Text() {fxt.Test__parse("a [[A]] z" , "a <a href='/wiki/A' title='A'>A</a> z");}
	// @Test public void Capt() {fxt.Test__parse("a [[A|a]] z" , "a <a href='/wiki/A' title='A'>a</a> z");}
	// @Test public void Text() {fxt.Test__parse("a [[A]] z" , "a <!--LINK 0--> z");}
	// @Test public void Invalid__char() {fxt.Test__parse("[[<A>]]" , "[[<A>]]");}

	// link to the current page ("Page_1" in the fixture) becomes a selflink
	@Test
	public void Self() {
		fxt.Test__to_html("[[Page_1]]", "<strong class='selflink'>Page_1</strong>");
	}
}
// Fixture for Xomw_lnki_wkr tests: builds an edit app / wiki, a parser and a page context for "Page_1".
class Xomw_lnki_wkr__fxt {
	private final Xomw_lnki_wkr wkr;
	private final Xomw_parser_ctx pctx;
	private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr();
	private boolean apos = true;	// when true, expected strings are passed through Gfh_utl.Replace_apos

	public Xomw_lnki_wkr__fxt() {
		Xoae_app edit_app = Xoa_app_fxt.Make__app__edit();
		Xowe_wiki edit_wiki = Xoa_app_fxt.Make__wiki__edit(edit_app);
		Xomw_parser mw_parser = new Xomw_parser();
		this.wkr = mw_parser.Lnki_wkr();
		mw_parser.Init_by_wiki(edit_wiki);
		this.pctx = new Xomw_parser_ctx();
		this.pctx.Init_by_page(edit_wiki.Ttl_parse(Bry_.new_a7("Page_1")));
	}

	// resets the worker between tests
	public void Clear() {
		wkr.Clear_state();
	}

	// runs only Replace_internal_links and compares the buffer result with expd
	public void Test__parse(String src_str, String expd) {
		Run_replace_internal_links(src_str);
		Check_rslt(src_str, expd);
	}

	// runs Replace_internal_links followed by Replace_link_holders and compares the buffer result with expd
	public void Test__to_html(String src_str, String expd) {
		Run_replace_internal_links(src_str);
		wkr.Replace_link_holders(pctx, pbfr);
		Check_rslt(src_str, expd);
	}

	// loads src_str into the parser buffer and runs the internal-link pass
	private void Run_replace_internal_links(String src_str) {
		byte[] src_bry = Bry_.new_u8(src_str);
		wkr.Replace_internal_links(pctx, pbfr.Init(src_bry));
	}

	// optionally applies Replace_apos to the expectation, then asserts against the buffer result
	private void Check_rslt(String src_str, String expd) {
		String expd_final = expd;
		if (apos) {
			expd_final = gplx.langs.htmls.Gfh_utl.Replace_apos(expd);
		}
		String actl = pbfr.Rslt().To_str_and_clear();
		Tfds.Eq_str_lines(expd_final, actl, src_str);
	}
}

@ -0,0 +1,331 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers.magiclinks; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
import gplx.core.primitives.*; import gplx.core.btries.*; import gplx.core.net.*;
import gplx.langs.phps.utls.*;
// public class Xomw_magiclinks_wkr {
// private final Btrie_slim_mgr regex_trie = Btrie_slim_mgr.ci_a7(); // NOTE: must be ci to handle protocols; EX: "https:" and "HTTPS:"
// private final Btrie_rv trv = new Btrie_rv();
// public Xomw_magiclinks_wkr() {
// }
// private static byte[] Tag__anch__rhs, Prefix__rfc, Prefix__pmid;
//
// private static final byte Space__tab = 1, Space__nbsp_ent = 2, Space__nbsp_dec = 3, Space__nbsp_hex = 4;
// private static Btrie_slim_mgr space_trie;
// // static final SPACE_NOT_NL = '(?:\t|&nbsp;|&\#0*160;|&\#[Xx]0*[Aa]0;|\p{Zs})';
//// public void Test() {
//// regex.Add("\t", Space__tab);
//// regex.Add("&nbsp;", Space__nbsp__ent);
//// regex.Add(Regex.Make("&#").Star("0").Add("160;"), Space__nbsp__dec);
//// regex.Add(Regex.Make("&#").Brack("X", "x").Star("0").Brack("A", "a").Add("0"), Space__nbsp__hex);
//// }
// public int Find_fwd_space(byte[] src, int cur, int src_end) {
// return -1;
// }
//
// private static final byte Regex__anch = 1, Regex__elem = 2, Regex__free = 3, Regex__rfc = 5, Regex__isbn = 6, Regex__pmid = 7;
// public void Init_by_wiki() {
// regex_trie.Add_str_byte("<a", Regex__anch);
// regex_trie.Add_str_byte("<" , Regex__elem);
//
// Gfo_protocol_itm[] protocol_ary = Gfo_protocol_itm.Ary();
// int protocol_len = protocol_ary.length;
// for (int i = 0; i < protocol_len; i++) {
// Gfo_protocol_itm itm = protocol_ary[i];
// regex_trie.Add_bry_byte(itm.Key_w_colon_bry(), Regex__free);
// }
// regex_trie.Add_str_byte("RFC " , Regex__rfc);
// regex_trie.Add_str_byte("PMID " , Regex__rfc);
// regex_trie.Add_str_byte("ISBN ", Regex__rfc);
//
// if (Tag__anch__rhs == null) {
// synchronized (Type_adp_.ClassOf_obj(this)) {
// Tag__anch__rhs = Bry_.new_a7("</a>");
// Prefix__rfc = Bry_.new_a7("RFC");
// Prefix__pmid = Bry_.new_a7("PMID");
// space_trie = Btrie_slim_mgr.ci_a7()
// .Add_str_byte("\t", Space__tab)
// .Add_str_byte("&nbsp;", Space__nbsp_ent)
// .Add_str_byte("&#", Space__nbsp_dec)
// .Add_str_byte("&x", Space__nbsp_hex)
// ;
// }
// }
// }
//
// // Replace special strings like "ISBN xxx" and "RFC xxx" with
// // magic external links.
// public void Do_magic_links(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) {
// // XO.PBFR
// Bry_bfr src_bfr = pbfr.Src();
// byte[] src = src_bfr.Bfr();
// int src_bgn = 0;
// int src_end = src_bfr.Len();
// Bry_bfr bfr = pbfr.Trg();
//
// int cur = src_bgn;
// int prv = cur;
// boolean dirty = true;
// while (true) {
// if (cur == src_end) {
// if (dirty)
// bfr.Add_mid(src, prv, src_end);
// break;
// }
//
// byte b = src[cur];
// Object o = regex_trie.Match_at_w_b0(trv, b, src, cur, src_end);
// // current byte doesn't look like magiclink; continue;
// if (o == null) {
// cur++;
// continue;
// }
// // looks like magiclink; do additional processing
// byte regex_tid = ((Byte_obj_ref)o).Val();
// int trv_pos = trv.Pos();
// int nxt_pos = trv_pos;
// boolean regex_valid = true;
// switch (regex_tid) {
// case Regex__anch: // (<a[ \t\r\n>].*?</a>) | // m[1]: Skip link text
// if (trv_pos < src_end) {
// // find ws in "[ \t\r\n>]"
// byte ws_byte = src[cur];
// switch (ws_byte) {
// case Byte_ascii.Space:
// case Byte_ascii.Tab:
// case Byte_ascii.Cr:
// case Byte_ascii.Nl:
// break;
// default:
// regex_valid = false;
// break;
// }
// if (regex_valid) {
// // find </a>
// nxt_pos++;
// int anch_end = Bry_find_.Find_fwd(src, Tag__anch__rhs, nxt_pos, src_end);
// if (anch_end == Bry_find_.Not_found) {
// regex_valid = false;
// }
// else {
// cur = anch_end + Tag__anch__rhs.length;
// }
// }
// }
// else {
// regex_valid = false;
// }
// break;
// case Regex__elem: // (<.*?>) | // m[2]: Skip stuff inside
// // just find ">"
// int elem_end = Bry_find_.Find_fwd(src, Byte_ascii.Angle_end, nxt_pos, src_end);
// if (elem_end == Bry_find_.Not_found)
// regex_valid = false;
// else
// cur = elem_end + 1;
// break;
// case Regex__free:
// // addr; urlchar
// break;
// case Regex__rfc:
// case Regex__pmid:
// // byte[] prefix = regex == Regex__rfc ? Prefix__rfc : Prefix__pmid;
// // match previous for case sensitivity
//// if (Bry_.Eq(src, trv_pos - prefix.length - 1, trv_pos - 1, prefix)) {
////
//// }
//// else {
//// regex_valid = false;
//// }
// break;
// }
//
//// '!(?: // Start cases
//// (<a[ \t\r\n>].*?</a>) | // m[1]: Skip link text
//// (<.*?>) | // m[2]: Skip stuff inside
//// // HTML elements' . "
//// (\b(?i:$prots)($addr$urlChar*)) | // m[3]: Free external links
//// // m[4]: Post-protocol path
//// \b(?:RFC|PMID) $spaces // m[5]: RFC or PMID, capture number
//// ([0-9]+)\b |
//// \bISBN $spaces ( // m[6]: ISBN, capture number
//// (?: 97[89] $spdash?)? // optional 13-digit ISBN prefix
//// (?: [0-9] $spdash?){9} // 9 digits with opt. delimiters
//// [0-9Xx] // check digit
//// )\b
//
// }
// if (dirty)
// pbfr.Switch();
// $prots = wfUrlProtocolsWithoutProtRel();
// $urlChar = self::EXT_LINK_URL_CLASS;
// $addr = self::EXT_LINK_ADDR;
// $space = self::SPACE_NOT_NL; // non-newline space
// $spdash = "(?:-|$space)"; // a dash or a non-newline space
// $spaces = "$space++"; // possessive match of 1 or more spaces
// $text = preg_replace_callback(
// '!(?: // Start cases
// (<a[ \t\r\n>].*?</a>) | // m[1]: Skip link text
// (<.*?>) | // m[2]: Skip stuff inside
// // HTML elements' . "
// (\b(?i:$prots)($addr$urlChar*)) | // m[3]: Free external links
// // m[4]: Post-protocol path
// \b(?:RFC|PMID) $spaces // m[5]: RFC or PMID, capture number
// ([0-9]+)\b |
// \bISBN $spaces ( // m[6]: ISBN, capture number
// (?: 97[89] $spdash?)? // optional 13-digit ISBN prefix
// (?: [0-9] $spdash?){9} // 9 digits with opt. delimiters
// [0-9Xx] // check digit
// )\b
// )!xu", [ &$this, 'magicLinkCallback' ], $text);
// return $text;
// }
// public function magicLinkCallback($m) {
// if (isset($m[1]) && $m[1] !== '') {
// // Skip anchor
// return $m[0];
// } else if (isset($m[2]) && $m[2] !== '') {
// // Skip HTML element
// return $m[0];
// } else if (isset($m[3]) && $m[3] !== '') {
// // Free external link
// return $this->makeFreeExternalLink($m[0], strlen($m[4]));
// } else if (isset($m[5]) && $m[5] !== '') {
// // RFC or PMID
// if (substr($m[0], 0, 3) === 'RFC') {
// if (!$this->mOptions->getMagicRFCLinks()) {
// return $m[0];
// }
// $keyword = 'RFC';
// $urlmsg = 'rfcurl';
// $cssClass = 'mw-magiclink-rfc';
// $trackingCat = 'magiclink-tracking-rfc';
// $id = $m[5];
// } else if (substr($m[0], 0, 4) === 'PMID') {
// if (!$this->mOptions->getMagicPMIDLinks()) {
// return $m[0];
// }
// $keyword = 'PMID';
// $urlmsg = 'pubmedurl';
// $cssClass = 'mw-magiclink-pmid';
// $trackingCat = 'magiclink-tracking-pmid';
// $id = $m[5];
// } else {
// throw new MWException(__METHOD__ . ': unrecognised match type "' .
// substr($m[0], 0, 20) . '"');
// }
// $url = wfMessage($urlmsg, $id)->inContentLanguage()->text();
// $this->addTrackingCategory($trackingCat);
// return Linker::makeExternalLink($url, "{$keyword} {$id}", true, $cssClass, [], $this->mTitle);
// } else if (isset($m[6]) && $m[6] !== ''
// && $this->mOptions->getMagicISBNLinks()
// ) {
// // ISBN
// $isbn = $m[6];
// $space = self::SPACE_NOT_NL; // non-newline space
// $isbn = preg_replace("/$space/", ' ', $isbn);
// $num = strtr($isbn, [
// '-' => '',
// ' ' => '',
// 'x' => 'X',
// ]);
// $this->addTrackingCategory('magiclink-tracking-isbn');
// return $this->getLinkRenderer()->makeKnownLink(
// SpecialPage::getTitleFor('Booksources', $num),
// "ISBN $isbn",
// [
// 'class' => '@gplx.Internal protected mw-magiclink-isbn',
// 'title' => false // suppress title attribute
// ]
// );
// } else {
// return $m[0];
// }
// Make a free external link, given a user-supplied URL
// public void Make_free_external_link(byte[] url, int num_post_proto) {
// byte[] trail = Bry_.Empty;
// The characters '<' and '>' (which were escaped by
// removeHTMLtags()) should not be included in
// URLs, per RFC 2396.
// Make &nbsp; terminate a URL as well (bug T84937)
// $m2 = [];
// if (preg_match(
// '/&(lt|gt|nbsp|#x0*(3[CcEe]|[Aa]0)|#0*(60|62|160));/',
// $url,
// $m2,
// PREG_OFFSET_CAPTURE
// )) {
// trail = substr($url, $m2[0][1]) . $trail;
// $url = substr($url, 0, $m2[0][1]);
// }
// Move trailing punctuation to $trail
// $sep = ',;\.:!?';
// If there is no left bracket, then consider right brackets fair game too
// if (strpos($url, '(') === false) {
// $sep .= ')';
// }
// $urlRev = strrev($url);
// $numSepChars = strspn($urlRev, $sep);
// Don't break a trailing HTML entity by moving the ; into $trail
// This is in hot code, so use substr_compare to avoid having to
// create a new String Object for the comparison
// if ($numSepChars && substr_compare($url, ";", -$numSepChars, 1) === 0) {
// more optimization: instead of running preg_match with a $
// anchor, which can be slow, do the match on the reversed
// String starting at the desired offset.
// un-reversed regexp is: /&([a-z]+|#x[\da-f]+|#\d+)$/i
// if (preg_match('/\G([a-z]+|[\da-f]+x#|\d+#)&/i', $urlRev, $m2, 0, $numSepChars)) {
// $numSepChars--;
// }
// }
// if ($numSepChars) {
// $trail = substr($url, -$numSepChars) . $trail;
// $url = substr($url, 0, -$numSepChars);
// }
// Verify that we still have a real URL after trail removal, and
// not just lone protocol
// if (strlen($trail) >= $numPostProto) {
// return $url . $trail;
// }
// $url = Sanitizer::cleanUrl($url);
// Is this an external image?
// $text = $this->maybeMakeExternalImage($url);
// if ($text === false) {
// Not an image, make a link
// $text = Linker::makeExternalLink($url,
// $this->getConverterLanguage()->markNoConversion($url, true),
// true, 'free',
// $this->getExternalLinkAttribs($url), $this->mTitle);
// Register it in the output Object...
// Replace unnecessary URL escape codes with their equivalent characters
// $pasteurized = self::normalizeLinkUrl($url);
// $this->mOutput->addExternalLink($pasteurized);
// }
// return $text . $trail;
// }
// }
// }

@ -0,0 +1,134 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers.nbsps; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
import gplx.core.btries.*;
// Ports the "French spaces" clean-up of the MediaWiki parser: certain spaces are replaced
// with "&#160;", and "&#160;" before "! important" is turned back into a plain space.
public class Xomw_nbsp_wkr {
	private final Btrie_rv trv = new Btrie_rv();
	// Reads pbfr.Src(), writes the transformed text to pbfr.Trg() and then calls pbfr.Switch().
	public void Do_nbsp(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) {
		// PORTED:
		// Clean up special characters, only run once, next-to-last before doBlockLevels
		// $fixtags = [
		// // French spaces, last one Guillemet-left
		// // only if there is something before the space
		// '/(.) (?=\\?|:|;|!|%|\\302\\273)/' => '\\1&#160;',
		// // french spaces, Guillemet-right
		// '/(\\302\\253) /' => '\\1&#160;',
		// '/&#160;(!\s*important)/' => ' \\1', // Beware of CSS magic word !important, T13874.
		// ];
		// $text = preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text );

		// XO.PBFR
		Bry_bfr src_bfr = pbfr.Src();
		byte[] src = src_bfr.Bfr();
		int src_bgn = 0;
		int src_end = src_bfr.Len();
		Bry_bfr bfr = pbfr.Trg();

		Btrie_slim_mgr match_trie = Trie__get();

		int cur = src_bgn;
		int prv = cur;
		// NOTE: always true, so the unmatched remainder is always copied and the buffers are always switched
		boolean dirty = true;
		// search forward for...
		// "\s" before ? : ; ! % 302,273; EX: "a :"
		// "\s" after 302,253
		// "&160;!\simportant"
		while (true) {
			if (cur == src_end) {
				if (dirty)
					bfr.Add_mid(src, prv, src_end);
				break;
			}

			Object o = match_trie.Match_at(trv, src, cur, src_end);
			if (o == null) {
				cur++;
				continue;
			}

			Xomw_nbsp_itm itm = (Xomw_nbsp_itm)o;
			byte itm_tid = itm.Tid();

			// '/&#160;(!\s*important)/' => ' \\1'
			if (itm_tid == Tid__important) {
				int space_bgn = cur + itm.Key().length;
				int space_end = Bry_find_.Find_fwd_while(src, space_bgn, src_end, Byte_ascii.Space);
				int important_end = space_end + Bry__important.length;
				// reject when "important" would run past the logical end of src or does not follow
				if (	important_end > src_end
					||	!Bry_.Match(src, space_end, important_end, Bry__important)) {
					cur++;	// must advance; otherwise the same position matches again and the loop never ends
					continue;
				}
			}

			dirty = true;
			bfr.Add_mid(src, prv, cur);
			switch (itm_tid) {
				case Tid__space_lhs:
					bfr.Add_bry_many(Bry__nbsp, itm.Val());
					break;
				case Tid__space_rhs:
					bfr.Add_bry_many(itm.Val(), Bry__nbsp);
					break;
				case Tid__important:
					bfr.Add(Bry__important__repl);
					break;
			}
			cur += itm.Key().length;
			prv = cur;
		}
		if (dirty)
			pbfr.Switch();
	}
	private static final byte Tid__space_lhs = 0, Tid__space_rhs = 1, Tid__important = 2;
	private static volatile Btrie_slim_mgr trie;
	// Returns the shared trie, building it on first use.
	// The trie is fully populated in a local before being published, so other threads never see it half-built;
	// the lock is on the class literal so that all instances share one monitor.
	private static Btrie_slim_mgr Trie__get() {
		Btrie_slim_mgr rv = trie;
		if (rv == null) {
			synchronized (Xomw_nbsp_wkr.class) {
				rv = trie;
				if (rv == null) {
					rv = Btrie_slim_mgr.cs();
					Trie__add(rv, Tid__space_lhs, " ?");
					Trie__add(rv, Tid__space_lhs, " :");
					Trie__add(rv, Tid__space_lhs, " ;");
					Trie__add(rv, Tid__space_lhs, " !");
					Trie__add(rv, Tid__space_lhs, " %");	// "%" is part of the ported regex alternation
					Trie__add(rv, Tid__space_lhs, " »");
					Trie__add(rv, Tid__space_rhs, "« ");
					Trie__add(rv, Tid__important, "&#160;!");
					trie = rv;
				}
			}
		}
		return rv;
	}
	// Registers one key in the trie. The stored val is the key without the space that gets replaced:
	// space_lhs drops the first byte, space_rhs drops the last byte, important keeps the whole key.
	private static void Trie__add(Btrie_slim_mgr trie, byte tid, String key_str) {
		byte[] key_bry = Bry_.new_u8(key_str);
		byte[] val_bry = null;
		switch (tid) {
			case Tid__space_lhs:
				val_bry = Bry_.Mid(key_bry, 1);
				break;
			case Tid__space_rhs:
				val_bry = Bry_.Mid(key_bry, 0, key_bry.length - 1);
				break;
			case Tid__important:
				val_bry = key_bry;
				break;
		}
		Xomw_nbsp_itm itm = new Xomw_nbsp_itm(tid, key_bry, val_bry);
		trie.Add_obj(key_bry, itm);
	}
	private static final byte[] Bry__nbsp = Bry_.new_a7("&#160;"), Bry__important = Bry_.new_a7("important"), Bry__important__repl = Bry_.new_a7(" !");
}
// Immutable trie entry for Xomw_nbsp_wkr: a type id plus the matched key and the value stored with it.
class Xomw_nbsp_itm {
	private final byte tid;
	private final byte[] key;
	private final byte[] val;

	public Xomw_nbsp_itm(byte tid, byte[] key, byte[] val) {
		this.tid = tid;
		this.key = key;
		this.val = val;
	}

	// type id of the entry
	public byte Tid() {
		return this.tid;
	}

	// key bytes as registered in the trie (same array instance that was passed in)
	public byte[] Key() {
		return this.key;
	}

	// value bytes (same array instance that was passed in)
	public byte[] Val() {
		return this.val;
	}
}

@ -0,0 +1,40 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers.nbsps; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
import org.junit.*;
// Tests for Xomw_nbsp_wkr: spaces that become "&#160;" and the "! important" exception.
public class Xomw_nbsp_wkr__tst {
	private final Xomw_nbsp_wkr__fxt fxt = new Xomw_nbsp_wkr__fxt();

	// text without any trigger sequence is returned unchanged
	@Test
	public void Noop() {
		fxt.Test__parse("abc", "abc");
	}

	// space before ":" is replaced
	@Test
	public void Space_lhs__colon() {
		fxt.Test__parse("a :b c", "a&#160;:b c");
	}

	// space before "»" is replaced
	@Test
	public void Space_lhs__laquo() {
		fxt.Test__parse("a »b c", "a&#160;»b c");
	}

	// space after "«" is replaced
	@Test
	public void Space_rhs() {
		fxt.Test__parse("a« b c", "a«&#160;b c");
	}

	// "&#160;" before "! important" is turned back into a space
	@Test
	public void Important() {
		fxt.Test__parse("a &#160;! important b", "a ! important b");
	}
}
// Fixture for Xomw_nbsp_wkr tests: feeds a string through Do_nbsp and compares the buffer result.
class Xomw_nbsp_wkr__fxt {
	private final Xomw_nbsp_wkr wkr = new Xomw_nbsp_wkr();
	private final Xomw_parser_ctx pctx = new Xomw_parser_ctx();
	private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr();
	private boolean apos = true;	// when true, expected strings are passed through Gfh_utl.Replace_apos

	// runs Do_nbsp on src_str and asserts that the result equals expd
	public void Test__parse(String src_str, String expd) {
		pbfr.Init(Bry_.new_u8(src_str));
		wkr.Do_nbsp(pctx, pbfr);

		String expd_final = expd;
		if (apos) {
			expd_final = gplx.langs.htmls.Gfh_utl.Replace_apos(expd);
		}
		String actl = pbfr.Rslt().To_str_and_clear();
		Tfds.Eq_str_lines(expd_final, actl, src_str);
	}
}

@ -15,8 +15,9 @@ GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
package gplx.xowa.parsers.mws.wkrs; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*; package gplx.xowa.mws.parsers.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
public interface Xomw_hdr_cbk { public class Xomw_frame_itm {
void On_hdr_seen(Xomw_parser_ctx pctx, Xomw_hdr_wkr wkr); public byte[] Expand(byte[] ttl) {
void On_src_done(Xomw_parser_ctx pctx, Xomw_hdr_wkr wkr); return null;
}
} }

@ -0,0 +1,564 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
// public class Xomw_frame_wkr { // THREAD.UNSAFE: caching for repeated calls
// private final Xomw_parser parser;
// public Xomw_frame_wkr(Xomw_parser parser) {
// this.parser = parser;
// }
// \\ Replace magic variables, templates, and template arguments
// \\ with the appropriate text. Templates are substituted recursively,
// \\ taking care to avoid infinite loops.
// \\
// \\ Note that the substitution depends on value of $mOutputType:
// \\ self::OT_WIKI: only {{subst:}} templates
// \\ self::OT_PREPROCESS: templates but not extension tags
// \\ self::OT_HTML: all templates and extension tags
// \\
// \\ @param String $text The text to transform
// \\ @param boolean|PPFrame $frame Object describing the arguments passed to the
// \\ template. Arguments may also be provided as an associative array, as
// \\ was the usual case before MW1.12. Providing arguments this way may be
// \\ useful for extensions wishing to perform variable replacement
// \\ explicitly.
// \\ @param boolean $argsOnly Only do argument (triple-brace) expansion, not
// \\ double-brace expansion.
// \\ @return String
// public function replaceVariables($text, $frame = false, $argsOnly = false) {
// // Is there any text? Also, Prevent too big inclusions!
// $textSize = strlen($text);
// if ($textSize < 1 || $textSize > $this->mOptions->getMaxIncludeSize()) {
// return $text;
// }
//
// if ($frame == false) {
// $frame = $this->getPreprocessor()->newFrame();
// } elseif (!($frame instanceof PPFrame)) {
// wfDebug(__METHOD__ . " called using plain parameters instead of "
// . "a PPFrame instance. Creating custom frame.\n");
// $frame = $this->getPreprocessor()->newCustomFrame($frame);
// }
//
// $dom = $this->preprocessToDom($text);
// $flags = $argsOnly ? PPFrame::NO_TEMPLATES : 0;
// $text = $frame->expand($dom, $flags);
//
// return $text;
// }
//
// \\ Clean up argument array - refactored in 1.9 so parserfunctions can use it, too.
// public static function createAssocArgs($args) {
// $assocArgs = [];
// $index = 1;
// foreach ($args as $arg) {
// $eqpos = strpos($arg, '=');
// if ($eqpos == false) {
// $assocArgs[$index++] = $arg;
// } else {
// $name = trim(substr($arg, 0, $eqpos));
// $value = trim(substr($arg, $eqpos + 1));
// if ($value == false) {
// $value = '';
// }
// if ($name != false) {
// $assocArgs[$name] = $value;
// }
// }
// }
//
// return $assocArgs;
// }
// \\ Return the text of a template, after recursively
// \\ replacing any variables or templates within the template.
// \\
// \\ @param array $piece The parts of the template
// \\ $piece['title']: the title, i.e. the part before the |
// \\ $piece['parts']: the parameter array
// \\ $piece['lineStart']: whether the brace was at the start of a line
// \\ @param PPFrame $frame The current frame, contains template arguments
// \\ @throws Exception
// \\ @return String The text of the template
// public void Brace_substitution(Xomw_prepro_node__template piece, Xomw_frame_itm frame) {
// // Flags
//
// // $text has been filled
// boolean found = false;
// // wiki markup in $text should be escaped
// boolean nowiki = false;
// // $text is HTML, armour it against wikitext transformation
// boolean is_html = false;
// // Force interwiki transclusion to be done in raw mode not rendered
// boolean force_raw_interwiki = false;
// // $text is a DOM node needing expansion in a child frame
// boolean is_child_obj = false;
// // $text is a DOM node needing expansion in the current frame
// boolean is_local_obj = false;
//
// // Title Object, where $text came from
// byte[] title = null;
//
// // $part1 is the bit before the first |, and must contain only title characters.
// // Various prefixes will be stripped from it later.
// byte[] title_with_spaces = frame.Expand(piece.Title());
// byte[] part1 = Bry_.Trim(title_with_spaces);
// byte[] title_text = null;
//
// // Original title text preserved for various purposes
// byte[] originalTitle = part1;
//
// // $args is a list of argument nodes, starting from index 0, not including $part1
// // @todo FIXME: If piece['parts'] is null then the call to getLength()
// // below won't work b/c this $args isn't an Object
// Xomw_prepro_node__part[] args = (null == piece.Parts()) ? null : piece.Parts();
//
// byte[] profile_section = null; // profile templates
//
// Tfds.Write(nowiki, is_html, force_raw_interwiki, is_child_obj, is_local_obj, title, title_text, profile_section);
// // SUBST
// if (!found) {
// String subst_match = null; // $this->mSubstWords->matchStartAndRemove($part1);
// boolean literal = false;
//
// // Possibilities for substMatch: "subst", "safesubst" or FALSE
// // Decide whether to expand template or keep wikitext as-is.
// if (parser.Output_type__wiki()) {
// if (subst_match == null) {
// literal = true; // literal when in PST with no prefix
// }
// else {
// literal = false; // expand when in PST with subst: or safesubst:
// }
// }
// else {
// if (subst_match == "subst") {
// literal = true; // literal when not in PST with plain subst:
// }
// else {
// literal = false; // expand when not in PST with safesubst: or no prefix
// }
// }
// if (literal) {
//// $text = $frame->virtualBracketedImplode('{{', '|', '}}', title_with_spaces, $args);
// is_local_obj = true;
// found = true;
// }
// }
//
// // Variables
// if (!found && args.length == 0) {
//// $id = $this->mVariables->matchStartToEnd($part1);
//// if ($id != false) {
//// $text = $this->getVariableValue($id, $frame);
//// if (MagicWord::getCacheTTL($id) > -1) {
//// $this->mOutput->updateCacheExpiry(MagicWord::getCacheTTL($id));
//// }
// found = true;
//// }
// }
//
// // MSG, MSGNW and RAW
// if (!found) {
// // Check for MSGNW:
//// $mwMsgnw = MagicWord::get('msgnw');
//// if ($mwMsgnw->matchStartAndRemove($part1)) {
// nowiki = true;
//// }
//// else {
// // Remove obsolete MSG:
//// $mwMsg = MagicWord::get('msg');
//// $mwMsg->matchStartAndRemove($part1);
//// }
//
// // Check for RAW:
//// $mwRaw = MagicWord::get('raw');
//// if ($mwRaw->matchStartAndRemove($part1)) {
//// force_raw_interwiki = true;
//// }
// }
// Parser functions
// if (!found) {
// $colonPos = strpos($part1, ':');
// if ($colonPos != false) {
// $func = substr($part1, 0, $colonPos);
// $funcArgs = [ trim(substr($part1, $colonPos + 1)) ];
// $argsLength = $args->getLength();
// for ($i = 0; $i < $argsLength; $i++) {
// $funcArgs[] = $args->item($i);
// }
// try {
// $result = $this->callParserFunction($frame, $func, $funcArgs);
// } catch (Exception $ex) {
// throw $ex;
// }
// The interface for parser functions allows for extracting
// flags into the local scope. Extract any forwarded flags
// here.
// extract($result);
// }
// }
// Finish mangling title and then check for loops.
// Set title to a Title Object and $title_text to the PDBK
// if (!found) {
// $ns = NS_TEMPLATE;
// Split the title into page and subpage
// $subpage = '';
// $relative = $this->maybeDoSubpageLink($part1, $subpage);
// if ($part1 != $relative) {
// $part1 = $relative;
// $ns = $this->mTitle->getNamespace();
// }
// title = Title::newFromText($part1, $ns);
// if (title) {
// $title_text = title->getPrefixedText();
// // Check for language variants if the template is not found
// if ($this->getConverterLanguage()->hasVariants() && title->getArticleID() == 0) {
// $this->getConverterLanguage()->findVariantLink($part1, title, true);
// }
// // Do recursion depth check
// $limit = $this->mOptions->getMaxTemplateDepth();
// if ($frame->depth >= $limit) {
// found = true;
// $text = '<span class="error">'
// . wfMessage('parser-template-recursion-depth-warning')
// ->numParams($limit)->inContentLanguage()->text()
// . '</span>';
// }
// }
// }
// Load from database
// if (!found && title) {
// $profile_section = $this->mProfiler->scopedProfileIn(title->getPrefixedDBkey());
// if (!title->isExternal()) {
// if (title->isSpecialPage()
// && $this->mOptions->getAllowSpecialInclusion()
// && $this->ot['html']
// ) {
// $specialPage = SpecialPageFactory::getPage(title->getDBkey());
// // Pass the template arguments as URL parameters.
// // "uselang" will have no effect since the Language Object
// // is forced to the one defined in ParserOptions.
// $pageArgs = [];
// $argsLength = $args->getLength();
// for ($i = 0; $i < $argsLength; $i++) {
// $bits = $args->item($i)->splitArg();
// if (strval($bits['index']) == '') {
// $name = trim($frame->expand($bits['name'], PPFrame::STRIP_COMMENTS));
// $value = trim($frame->expand($bits['value']));
// $pageArgs[$name] = $value;
// }
// }
//
// // Create a new context to execute the special page
// $context = new RequestContext;
// $context->setTitle(title);
// $context->setRequest(new FauxRequest($pageArgs));
// if ($specialPage && $specialPage->maxIncludeCacheTime() == 0) {
// $context->setUser($this->getUser());
// } else {
// // If this page is cached, then we better not be per user.
// $context->setUser(User::newFromName('127.0.0.1', false));
// }
// $context->setLanguage($this->mOptions->getUserLangObj());
// $ret = SpecialPageFactory::capturePath(
// title, $context, $this->getLinkRenderer());
// if ($ret) {
// $text = $context->getOutput()->getHTML();
// $this->mOutput->addOutputPageMetadata($context->getOutput());
// found = true;
// is_html = true;
// if ($specialPage && $specialPage->maxIncludeCacheTime() != false) {
// $this->mOutput->updateRuntimeAdaptiveExpiry(
// $specialPage->maxIncludeCacheTime()
// );
// }
// }
// } elseif (MWNamespace::isNonincludable(title->getNamespace())) {
// found = false; // access denied
// wfDebug(__METHOD__ . ": template inclusion denied for " .
// title->getPrefixedDBkey() . "\n");
// } else {
// list($text, title) = $this->getTemplateDom(title);
// if ($text != false) {
// found = true;
// is_child_obj = true;
// }
// }
//
// // If the title is valid but undisplayable, make a link to it
// if (!found && ($this->ot['html'] || $this->ot['pre'])) {
// $text = "[[:$title_text]]";
// found = true;
// }
// } elseif (title->isTrans()) {
// // Interwiki transclusion
// if ($this->ot['html'] && !force_raw_interwiki) {
// $text = $this->interwikiTransclude(title, 'render');
// is_html = true;
// } else {
// $text = $this->interwikiTransclude(title, 'raw');
// // Preprocess it like a template
// $text = $this->preprocessToDom($text, self::PTD_FOR_INCLUSION);
// is_child_obj = true;
// }
// found = true;
// }
//
// // Do infinite loop check
// // This has to be done after redirect resolution to avoid infinite loops via redirects
// if (!$frame->loopCheck(title)) {
// found = true;
// $text = '<span class="error">'
// . wfMessage('parser-template-loop-warning', $title_text)->inContentLanguage()->text()
// . '</span>';
// wfDebug(__METHOD__ . ": template loop broken at '$title_text'\n");
// }
// }
// If we haven't found text to substitute by now, we're done
// Recover the source wikitext and return it
// if (!found) {
// $text = $frame->virtualBracketedImplode('{{', '|', '}}', title_with_spaces, $args);
// if ($profile_section) {
// $this->mProfiler->scopedProfileOut($profile_section);
// }
// return [ 'Object' => $text ];
// }
// Expand DOM-style return values in a child frame
// if (is_child_obj) {
// // Clean up argument array
// $newFrame = $frame->newChild($args, title);
//
// if (nowiki) {
// $text = $newFrame->expand($text, PPFrame::RECOVER_ORIG);
// } elseif ($title_text != false && $newFrame->isEmpty()) {
// // Expansion is eligible for the empty-frame cache
// $text = $newFrame->cachedExpand($title_text, $text);
// } else {
// // Uncached expansion
// $text = $newFrame->expand($text);
// }
// }
// if (is_local_obj && nowiki) {
// $text = $frame->expand($text, PPFrame::RECOVER_ORIG);
// is_local_obj = false;
// }
// if ($profile_section) {
// $this->mProfiler->scopedProfileOut($profile_section);
// }
// Replace raw HTML by a placeholder
// if (is_html) {
// $text = $this->insertStripItem($text);
// } elseif (nowiki && ($this->ot['html'] || $this->ot['pre'])) {
// // Escape nowiki-style return values
// $text = wfEscapeWikiText($text);
// } elseif (is_string($text)
// && !$piece['lineStart']
// && preg_match('/^(?:{\\||:|;|#|\*)/', $text)
// ) {
// // T2529: if the template begins with a table or block-level
// // element, it should be treated as beginning a new line.
// // This behavior is somewhat controversial.
// $text = "\n" . $text;
// }
// if (is_string($text) && !$this->incrementIncludeSize('post-expand', strlen($text))) {
// // Error, oversize inclusion
// if ($title_text != false) {
// // Make a working, properly escaped link if possible (T25588)
// $text = "[[:$title_text]]";
// } else {
// // This will probably not be a working link, but at least it may
// // provide some hint of where the problem is
// preg_replace('/^:/', '', $originalTitle);
// $text = "[[:$originalTitle]]";
// }
// $text .= $this->insertStripItem('<!-- WARNING: template omitted, '
// . 'post-expand include size too large -->');
// $this->limitationWarn('post-expand-template-inclusion');
// }
//
// if (is_local_obj) {
// $ret = [ 'Object' => $text ];
// } else {
// $ret = [ 'text' => $text ];
// }
// return $ret;
// }
// \\ Triple brace replacement -- used for template arguments
// public function argSubstitution($piece, $frame) {
//
// $error = false;
// $parts = $piece['parts'];
// $nameWithSpaces = $frame->expand($piece['title']);
// $argName = trim($nameWithSpaces);
// $Object = false;
// $text = $frame->getArgument($argName);
// if ($text == false && $parts->getLength() > 0
// && ($this->ot['html']
// || $this->ot['pre']
// || ($this->ot['wiki'] && $frame->isTemplate())
// )
// ) {
// // No match in frame, use the supplied default
// $Object = $parts->item(0)->getChildren();
// }
// if (!$this->incrementIncludeSize('arg', strlen($text))) {
// $error = '<!-- WARNING: argument omitted, expansion size too large -->';
// $this->limitationWarn('post-expand-template-argument');
// }
//
// if ($text == false && $Object == false) {
// // No match anywhere
// $Object = $frame->virtualBracketedImplode('{{{', '|', '}}}', $nameWithSpaces, $parts);
// }
// if ($error != false) {
// $text .= $error;
// }
// if ($Object != false) {
// $ret = [ 'Object' => $Object ];
// } else {
// $ret = [ 'text' => $text ];
// }
//
// return $ret;
// }
//
// /**
// \\ Return the text to be used for a given extension tag.
// \\ This is the ghost of strip().
// \\
// \\ @param array $params Associative array of parameters:
// \\ name PPNode for the tag name
// \\ attr PPNode for unparsed text where tag attributes are thought to be
// \\ attributes Optional associative array of parsed attributes
// \\ inner Contents of extension element
// \\ noClose Original text did not have a close tag
// \\ @param PPFrame $frame
// \\
// \\ @throws MWException
// \\ @return String
// \\/
// public function extensionSubstitution($params, $frame) {
// static $errorStr = '<span class="error">';
// static $errorLen = 20;
//
// $name = $frame->expand($params['name']);
// if (substr($name, 0, $errorLen) == $errorStr) {
// // Probably expansion depth or node count exceeded. Just punt the
// // error up.
// return $name;
// }
//
// $attrText = !isset($params['attr']) ? null : $frame->expand($params['attr']);
// if (substr($attrText, 0, $errorLen) == $errorStr) {
// // See above
// return $attrText;
// }
//
// // We can't safely check if the expansion for $content resulted in an
// // error, because the content could happen to be the error String
// // (T149622).
// $content = !isset($params['inner']) ? null : $frame->expand($params['inner']);
//
// $marker = self::MARKER_PREFIX . "-$name-"
// . sprintf('%08X', $this->mMarkerIndex++) . self::MARKER_SUFFIX;
//
// $isFunctionTag = isset($this->mFunctionTagHooks[strtolower($name)]) &&
// ($this->ot['html'] || $this->ot['pre']);
// if ($isFunctionTag) {
// $markerType = 'none';
// } else {
// $markerType = 'general';
// }
// if ($this->ot['html'] || $isFunctionTag) {
// $name = strtolower($name);
// $attributes = Sanitizer::decodeTagAttributes($attrText);
// if (isset($params['attributes'])) {
// $attributes = $attributes + $params['attributes'];
// }
//
// if (isset($this->mTagHooks[$name])) {
// // Workaround for PHP bug 35229 and similar
// if (!is_callable($this->mTagHooks[$name])) {
// throw new MWException("Tag hook for $name is not callable\n");
// }
// $output = call_user_func_array($this->mTagHooks[$name],
// [ $content, $attributes, $this, $frame ]);
// } elseif (isset($this->mFunctionTagHooks[$name])) {
// list($callback,) = $this->mFunctionTagHooks[$name];
// if (!is_callable($callback)) {
// throw new MWException("Tag hook for $name is not callable\n");
// }
//
// $output = call_user_func_array($callback, [ &$this, $frame, $content, $attributes ]);
// } else {
// $output = '<span class="error">Invalid tag extension name: ' .
// htmlspecialchars($name) . '</span>';
// }
//
// if (is_array($output)) {
// // Extract flags to local scope (to override $markerType)
// $flags = $output;
// $output = $flags[0];
// unset($flags[0]);
// extract($flags);
// }
// } else {
// if (is_null($attrText)) {
// $attrText = '';
// }
// if (isset($params['attributes'])) {
// foreach ($params['attributes'] as $attrName => $attrValue) {
// $attrText .= ' ' . htmlspecialchars($attrName) . '="' .
// htmlspecialchars($attrValue) . '"';
// }
// }
// if ($content == null) {
// $output = "<$name$attrText/>";
// } else {
// $close = is_null($params['close']) ? '' : $frame->expand($params['close']);
// if (substr($close, 0, $errorLen) == $errorStr) {
// // See above
// return $close;
// }
// $output = "<$name$attrText>$content$close";
// }
// }
//
// if ($markerType == 'none') {
// return $output;
// } elseif ($markerType == 'nowiki') {
// $this->mStripState->addNoWiki($marker, $output);
// } elseif ($markerType == 'general') {
// $this->mStripState->addGeneral($marker, $output);
// } else {
// throw new MWException(__METHOD__ . ': invalid marker type');
// }
// return $marker;
// }
// }

@ -0,0 +1,98 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
// Node of the preprocessor tree: holds an ordered list of child nodes and can serialize itself as XML.
public interface Xomw_prepro_node {
// Number of child nodes currently attached to this node.
int Subs__len();
// Child node at position i.
Xomw_prepro_node Subs__get_at(int i);
// Appends sub as the last child of this node.
void Subs__add(Xomw_prepro_node sub);
// Appends this node's XML representation to bfr.
void To_xml(Bry_bfr bfr);
}
// Plain-text node: serialized as its raw bytes, with no wrapping element.
class Xomw_prepro_node__text extends Xomw_prepro_node__base {
	protected final byte[] bry;	// raw text of the node
	public Xomw_prepro_node__text(byte[] bry) {
		this.bry = bry;
	}
	// Raw text bytes supplied at construction.
	public byte[] Bry() {
		return this.bry;
	}
	// Appends the raw text bytes to bfr unchanged.
	@Override public void To_xml(Bry_bfr bfr) {
		bfr.Add(this.bry);
	}
}
// Comment node: serialized as <comment>...</comment> around the stored bytes.
class Xomw_prepro_node__comment extends Xomw_prepro_node__base {
	protected final byte[] bry;	// bytes placed between the <comment> tags
	public Xomw_prepro_node__comment(byte[] bry) {
		this.bry = bry;
	}
	// Comment bytes supplied at construction.
	public byte[] Bry() {
		return this.bry;
	}
	// Appends "<comment>" + bry + "</comment>" to bfr.
	@Override public void To_xml(Bry_bfr bfr) {
		bfr.Add_str_a7("<comment>").Add(this.bry).Add_str_a7("</comment>");
	}
}
// Extension-tag node: serialized as <ext><name/><attr/>[<inner/>][<close/>]</ext>.
class Xomw_prepro_node__ext extends Xomw_prepro_node__base {
	private final byte[] name;	// tag name
	private final byte[] attr;	// unparsed attribute text
	private final byte[] inner;	// contents of the element; may be null
	private final byte[] close;	// close-tag text; may be null
	public Xomw_prepro_node__ext(byte[] name, byte[] attr, byte[] inner, byte[] close) {
		this.name = name;
		this.attr = attr;
		this.inner = inner;
		this.close = close;
	}
	public byte[] Name()  {return name;}
	public byte[] Attr()  {return attr;}
	public byte[] Inner() {return inner;}
	public byte[] Close() {return close;}
	// Appends the <ext> element to bfr.
	// - the attribute element is written as <attr> (was misspelled <atr>), matching the field / accessor name
	// - <inner> and <close> are optional: they are skipped when null instead of handing null to the buffer
	@Override public void To_xml(Bry_bfr bfr) {
		bfr.Add_str_a7("<ext>");
		bfr.Add_str_a7("<name>").Add(name).Add_str_a7("</name>");
		bfr.Add_str_a7("<attr>");
		if (attr != null) bfr.Add(attr);	// NOTE(review): attr is presumably never null; guarded only for safety
		bfr.Add_str_a7("</attr>");
		if (inner != null)
			bfr.Add_str_a7("<inner>").Add(inner).Add_str_a7("</inner>");
		if (close != null)
			bfr.Add_str_a7("<close>").Add(close).Add_str_a7("</close>");
		bfr.Add_str_a7("</ext>");
	}
}
// Heading node: serialized as <h level="..." i="...">text</h>.
class Xomw_prepro_node__heading extends Xomw_prepro_node__base {
	private final int heading_index;	// written to the "level" attribute
	private final int title_index;		// written to the "i" attribute
	private final byte[] text;			// bytes placed between <h ...> and </h>
	public Xomw_prepro_node__heading(int heading_index, int title_index, byte[] text) {
		this.heading_index = heading_index;
		this.title_index = title_index;
		this.text = text;
	}
	public int Heading_index() {return heading_index;}
	public int Title_index()   {return title_index;}
	public byte[] Text()       {return text;}
	// Appends the <h> element to bfr.
	// The tag name and first attribute are separated by exactly one space;
	// previously "<h " followed by " level=" produced a doubled space ("<h  level=").
	@Override public void To_xml(Bry_bfr bfr) {
		bfr.Add_str_a7("<h");
		bfr.Add_str_a7(" level=\"").Add_int_variable(heading_index);
		bfr.Add_str_a7("\" i=\"").Add_int_variable(title_index);
		bfr.Add_str_a7("\">");
		bfr.Add(text);
		bfr.Add_str_a7("</h>");
	}
}
// Template-argument node ({{{title|part...}}}): serialized as <tplarg><title/>part*</tplarg>.
class Xomw_prepro_node__tplarg extends Xomw_prepro_node__base {
	private final byte[] title;						// argument name
	private final Xomw_prepro_node__part[] parts;	// parts following the title; may be null
	public Xomw_prepro_node__tplarg(byte[] title, Xomw_prepro_node__part[] parts) {
		this.title = title; this.parts = parts;
	}
	public byte[] Title() {return title;}
	public Xomw_prepro_node__part[] Parts() {return parts;}
	// Appends the <tplarg> element to bfr: the title, then each part in array order.
	// A null parts array is treated as "no parts" rather than raising a NullPointerException.
	@Override public void To_xml(Bry_bfr bfr) {
		bfr.Add_str_a7("<tplarg>");
		bfr.Add_str_a7("<title>").Add(title);
		bfr.Add_str_a7("</title>");
		if (parts != null) {
			for (Xomw_prepro_node__part part : parts)
				part.To_xml(bfr);
		}
		bfr.Add_str_a7("</tplarg>");
	}
}

@ -0,0 +1,28 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
// Shared child-list handling for preprocessor nodes; subclasses only supply To_xml.
public abstract class Xomw_prepro_node__base implements Xomw_prepro_node {
	private List_adp subs;	// child list; stays null until the first Subs__add
	// Number of children; 0 while no child has been added.
	public int Subs__len() {
		if (subs == null) return 0;
		return subs.Len();
	}
	// Child at position i; null while no child has been added.
	public Xomw_prepro_node Subs__get_at(int i) {
		if (subs == null) return null;
		return (Xomw_prepro_node)subs.Get_at(i);
	}
	// Appends sub, creating the child list on first use.
	public void Subs__add(Xomw_prepro_node sub) {
		if (subs == null) {
			subs = List_adp_.New();
		}
		subs.Add(sub);
	}
	public abstract void To_xml(Bry_bfr bfr);
}

@ -0,0 +1,45 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
// One part of a template / template-argument: either positional (idx > 0) or named (key).
public class Xomw_prepro_node__part extends Xomw_prepro_node__base {
	private final int idx;		// positional index; values > 0 select the indexed form
	private final byte[] key;	// name bytes; written only when idx <= 0
	private final byte[] val;	// value bytes
	public Xomw_prepro_node__part(int idx, byte[] key, byte[] val) {
		this.idx = idx;
		this.key = key;
		this.val = val;
	}
	public int Idx() {
		return idx;
	}
	public byte[] Key() {
		return key;
	}
	public byte[] Val() {
		return val;
	}
	// Appends the <part> element to bfr:
	//   idx > 0 : <part><name index="idx" /><value>val</value></part>
	//   else    : <part><name>key</name>=<value>val</value></part>
	@Override public void To_xml(Bry_bfr bfr) {
		boolean positional = idx > 0;
		bfr.Add_str_a7("<part>").Add_str_a7("<name");
		if (!positional) {
			// named form: full <name> element followed by the "=" separator
			bfr.Add_str_a7(">").Add(key).Add_str_a7("</name>").Add_str_a7("=");
		}
		else {
			// positional form: self-closing <name> carrying only the index
			bfr.Add_str_a7(" index=\"").Add_int_variable(idx).Add_str_a7("\" />");
		}
		bfr.Add_str_a7("<value>").Add(val).Add_str_a7("</value>");
		bfr.Add_str_a7("</part>");
	}
}

@ -0,0 +1,36 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
// Template node ({{title|part...}}): serialized as <template [lineStart="n"]><title/>part*</template>.
public class Xomw_prepro_node__template extends Xomw_prepro_node__base {
	private final byte[] title;						// template title bytes
	private final Xomw_prepro_node__part[] parts;	// parts following the title; may be null
	private final int line_start;					// written as the lineStart attribute when > 0
	public Xomw_prepro_node__template(byte[] title, Xomw_prepro_node__part[] parts, int line_start) {
		this.title = title; this.parts = parts; this.line_start = line_start;
	}
	public byte[] Title() {return title;}
	public Xomw_prepro_node__part[] Parts() {return parts;}
	public int Line_start() {return line_start;}
	// Appends the <template> element to bfr: optional lineStart attribute, the title, then each part in array order.
	// A null parts array is treated as "no parts" rather than raising a NullPointerException.
	@Override public void To_xml(Bry_bfr bfr) {
		bfr.Add_str_a7("<template");
		if (line_start > 0) bfr.Add_str_a7(" lineStart=\"").Add_int_variable(line_start).Add_byte_quote();
		bfr.Add_byte(Byte_ascii.Angle_end);
		bfr.Add_str_a7("<title>").Add(title);
		bfr.Add_str_a7("</title>");
		if (parts != null) {
			for (Xomw_prepro_node__part part : parts)
				part.To_xml(bfr);
		}
		bfr.Add_str_a7("</template>");
	}
}

@ -15,7 +15,7 @@ GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
package gplx.xowa.parsers.mws.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*; package gplx.xowa.mws.parsers.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
class Xomw_prepro_rule { class Xomw_prepro_rule {
public Xomw_prepro_rule(byte[] bgn, byte[] end, int min, int max, int[] names) { public Xomw_prepro_rule(byte[] bgn, byte[] end, int min, int max, int[] names) {
this.bgn = bgn; this.bgn = bgn;

@ -15,7 +15,7 @@ GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
package gplx.xowa.parsers.mws.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*; package gplx.xowa.mws.parsers.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
class Xomw_prepro_stack { class Xomw_prepro_stack {
public List_adp stack = List_adp_.New(); public List_adp stack = List_adp_.New();
public Xomw_prepro_piece top; public Xomw_prepro_piece top;

@ -15,7 +15,7 @@ GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
package gplx.xowa.parsers.mws.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*; package gplx.xowa.mws.parsers.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
import gplx.core.btries.*; import gplx.core.btries.*;
import gplx.langs.phps.utls.*; import gplx.langs.phps.utls.*;
public class Xomw_prepro_wkr { // THREAD.UNSAFE: caching for repeated calls public class Xomw_prepro_wkr { // THREAD.UNSAFE: caching for repeated calls

@ -15,7 +15,7 @@ GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
package gplx.xowa.parsers.mws.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*; package gplx.xowa.mws.parsers.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
import org.junit.*; import org.junit.*;
public class Xomw_prepro_wkr__tst { public class Xomw_prepro_wkr__tst {
private final Xomw_prepro_wkr__fxt fxt = new Xomw_prepro_wkr__fxt(); private final Xomw_prepro_wkr__fxt fxt = new Xomw_prepro_wkr__fxt();
@ -55,6 +55,9 @@ public class Xomw_prepro_wkr__tst {
@Test public void Tplarg() { @Test public void Tplarg() {
fxt.Test__parse("a{{{b}}}c", "<root>a<tplarg><title>b</title></tplarg>c</root>"); fxt.Test__parse("a{{{b}}}c", "<root>a<tplarg><title>b</title></tplarg>c</root>");
} }
@Test public void Tplarg__dflt() {
fxt.Test__parse("a{{{b|c}}}d", "<root>a<tplarg><title>b</title><part><name index=\"1\" /><value>c</value></part></tplarg>d</root>");
}
@Test public void Comment() { @Test public void Comment() {
fxt.Test__parse("a<!--b-->c", "<root>a<comment>&lt;!--b--&gt;</comment>c</root>"); fxt.Test__parse("a<!--b-->c", "<root>a<comment>&lt;!--b--&gt;</comment>c</root>");
} }

@ -15,26 +15,53 @@ GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
package gplx.xowa.parsers.mws.quotes; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*; package gplx.xowa.mws.parsers.quotes; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
import gplx.langs.phps.utls.*; import gplx.langs.phps.utls.*;
import gplx.xowa.parsers.htmls.*; import gplx.xowa.parsers.htmls.*;
import gplx.core.primitives.*; import gplx.core.primitives.*;
public class Xomw_quote_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.UNSAFE: caching for repeated calls public class Xomw_quote_wkr {// THREAD.UNSAFE: caching for repeated calls
private final Bry_bfr bfr = Bry_bfr_.New(); private Bry_bfr tmp;
private final Bry_bfr tmp = Bry_bfr_.New();
private final Int_list apos_pos_ary = new Int_list(32); private final Int_list apos_pos_ary = new Int_list(32);
public byte[] Do_all_quotes(byte[] src) { public Xomw_quote_wkr(Xomw_parser mgr) {
Bry_split_.Split(src, 0, src.length, Byte_ascii.Nl, Bool_.N, this); // PORTED.SPLIT: $lines = StringUtils::explode( "\n", $text ); this.tmp = mgr.Tmp();
}
public void Do_all_quotes(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) {
Bry_bfr src_bfr = pbfr.Src();
byte[] src = src_bfr.Bfr();
int src_bgn = 0;
int src_end = src_bfr.Len();
Bry_bfr bfr = pbfr.Trg();
pbfr.Switch();
int cur = src_bgn;
int line_bgn = cur;
while (true) {
int line_end = Bry_find_.Find_fwd(src, Byte_ascii.Nl, line_bgn, src_end);
if (line_end == Bry_find_.Not_found) {
line_end = src_end;
}
Do_quotes(bfr, Bool_.Y, src, line_bgn, line_end);
if (line_end == src_end)
break;
else
line_bgn = line_end + 1; // 1=\n.length
}
// Bry_split_.Split(src, src_bgn, src_end, Byte_ascii.Nl, Bool_.N, this); // PORTED.SPLIT: $lines = StringUtils::explode( "\n", $text );
if (bfr.Match_end_byt(Byte_ascii.Nl))
bfr.Del_by_1(); // REF.MW: $outtext = substr( $outtext, 0, -1 ); bfr.Del_by_1(); // REF.MW: $outtext = substr( $outtext, 0, -1 );
apos_pos_ary.Clear(); apos_pos_ary.Clear();
return bfr.To_bry_and_clear();
} }
private static final byte[] Wtxt__apos = Bry_.new_a7("''"); public byte[] Do_quotes(Bry_bfr tmp, byte[] src) {
public int Split(byte[] src, int itm_bgn, int itm_end) { boolean found = Do_quotes(tmp, Bool_.N, src, 0, src.length);
byte[][] arr = Php_preg_.Split(apos_pos_ary, src, itm_bgn, itm_end, Wtxt__apos, Bool_.Y); // PORTED.REGX: arr = preg_split("/(''+)/", text, -1, PREG_SPLIT_DELIM_CAPTURE); return found ? tmp.To_bry_and_clear() : src;
}
private boolean Do_quotes(Bry_bfr bfr, boolean all_quotes_mode, byte[] src, int line_bgn, int line_end) {
byte[][] arr = Php_preg_.Split(apos_pos_ary, src, line_bgn, line_end, Wtxt__apos, Bool_.Y); // PORTED.REGX: arr = preg_split("/(''+)/", text, -1, PREG_SPLIT_DELIM_CAPTURE);
if (arr == null) { if (arr == null) {
bfr.Add_mid(src, itm_bgn, itm_end).Add_byte_nl(); if (all_quotes_mode) {
return Bry_split_.Rv__ok; bfr.Add_mid(src, line_bgn, line_end).Add_byte_nl();
}
return false;
} }
int arr_len = arr.length; int arr_len = arr.length;
@ -226,7 +253,7 @@ public class Xomw_quote_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U
bfr.Add_str_a7("<b><i>").Add_bfr_and_clear(tmp).Add_str_a7("</i></b>"); bfr.Add_str_a7("<b><i>").Add_bfr_and_clear(tmp).Add_str_a7("</i></b>");
} }
bfr.Add_byte_nl(); bfr.Add_byte_nl();
return Bry_split_.Rv__ok; return true;
} }
private static final int private static final int
State__empty = 0 State__empty = 0
@ -236,4 +263,5 @@ public class Xomw_quote_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U
, State__ib = 4 , State__ib = 4
, State__both = 5 , State__both = 5
; ;
private static final byte[] Wtxt__apos = Bry_.new_a7("''");
} }

@ -15,7 +15,7 @@ GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
package gplx.xowa.parsers.mws.quotes; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*; package gplx.xowa.mws.parsers.quotes; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
import org.junit.*; import org.junit.*;
public class Xomw_quote_wkr__tst { public class Xomw_quote_wkr__tst {
private final Xomw_quote_wkr__fxt fxt = new Xomw_quote_wkr__fxt(); private final Xomw_quote_wkr__fxt fxt = new Xomw_quote_wkr__fxt();
@ -32,12 +32,14 @@ public class Xomw_quote_wkr__tst {
@Test public void Dangling__b() {fxt.Test__parse("a'''b" , "a<b>b</b>");} // COVERS: "if (state == State__b || state == State__ib)" @Test public void Dangling__b() {fxt.Test__parse("a'''b" , "a<b>b</b>");} // COVERS: "if (state == State__b || state == State__ib)"
@Test public void Dangling__i() {fxt.Test__parse("a''b" , "a<i>b</i>");} // COVERS: "if (state == State__i || state == State__bi || state == State__ib)" @Test public void Dangling__i() {fxt.Test__parse("a''b" , "a<i>b</i>");} // COVERS: "if (state == State__i || state == State__bi || state == State__ib)"
@Test public void Dangling__lone(){fxt.Test__parse("a'''''b" , "a<b><i>b</i></b>");} // COVERS: "There might be lonely ''''', so make sure we have a buffer" @Test public void Dangling__lone(){fxt.Test__parse("a'''''b" , "a<b><i>b</i></b>");} // COVERS: "There might be lonely ''''', so make sure we have a buffer"
@Test public void Nl__text() {fxt.Test__parse("a\nb''c''d\n\ne" , "a\nb<i>c</i>d\n\ne");}
} }
class Xomw_quote_wkr__fxt { class Xomw_quote_wkr__fxt {
private final Xomw_quote_wkr wkr = new Xomw_quote_wkr(); private final Xomw_quote_wkr wkr = new Xomw_quote_wkr(new Xomw_parser());
private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr();
public void Test__parse(String src_str, String expd) { public void Test__parse(String src_str, String expd) {
byte[] src_bry = Bry_.new_u8(src_str); byte[] src_bry = Bry_.new_u8(src_str);
byte[] actl = wkr.Do_all_quotes(src_bry); wkr.Do_all_quotes(new Xomw_parser_ctx(), pbfr.Init(src_bry));
Tfds.Eq_str_lines(expd, String_.new_u8(actl), src_str); Tfds.Eq_str_lines(expd, pbfr.Rslt().To_str_and_clear(), src_str);
} }
} }

@ -15,12 +15,14 @@ GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
package gplx.xowa.parsers.mws.tables; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*; package gplx.xowa.mws.parsers.tables; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
import gplx.langs.phps.utls.*; import gplx.langs.phps.utls.*;
import gplx.xowa.parsers.htmls.*; import gplx.xowa.parsers.htmls.*;
import gplx.xowa.parsers.mws.utils.*; import gplx.xowa.parsers.uniqs.*; import gplx.xowa.mws.utls.*; import gplx.xowa.parsers.uniqs.*;
public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.UNSAFE: caching for repeated calls public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.UNSAFE: caching for repeated calls
private final Bry_bfr bfr = Bry_bfr_.New(), tmp_bfr = Bry_bfr_.New(); private final Bry_bfr tmp;
private Bry_bfr bfr;
private final Xomw_sanitizer sanitizer; private final Xomw_strip_state strip_state;
private final List_adp private final List_adp
td_history = List_adp_.New() // Is currently a td tag open? td_history = List_adp_.New() // Is currently a td tag open?
, last_tag_history = List_adp_.New() // Save history of last lag activated (td, th or caption) , last_tag_history = List_adp_.New() // Save history of last lag activated (td, th or caption)
@ -30,14 +32,22 @@ public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U
; ;
private int indent_level = 0; // indent level of the table private int indent_level = 0; // indent level of the table
private byte[] first_2 = new byte[2]; private byte[] first_2 = new byte[2];
private Xomw_sanitizer_mgr sanitizer; public Xomw_table_wkr(Xomw_parser parser) {
private Xop_uniq_mgr uniq_mgr; this.tmp = parser.Tmp();
public byte[] Do_table_stuff(Xomw_parser_ctx ctx, byte[] src) { this.sanitizer = parser.Sanitizer();
this.sanitizer = ctx.Sanitizer(); this.strip_state = parser.Strip_state();
this.uniq_mgr = ctx.Uniq_mgr(); }
public void Do_table_stuff(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) {
Bry_bfr src_bfr = pbfr.Src();
byte[] src = src_bfr.Bfr();
int src_bgn = 0;
int src_end = src_bfr.Len();
this.bfr = pbfr.Trg();
pbfr.Switch();
indent_level = 0; indent_level = 0;
Bry_split_.Split(src, 0, src.length, Byte_ascii.Nl, Bool_.N, this); // PORTED.SPLIT: $lines = StringUtils::explode("\n", $text); Bry_split_.Split(src, src_bgn, src_end, Byte_ascii.Nl, Bool_.N, this); // PORTED.SPLIT: $lines = StringUtils::explode("\n", $text);
// Closing open td, tr && table // Closing open td, tr && table
while (td_history.Len() > 0) { while (td_history.Len() > 0) {
@ -62,9 +72,8 @@ public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U
if ( bfr.Len() == Len__tb__empty if ( bfr.Len() == Len__tb__empty
&& Bry_.Eq(bfr.Bfr(), 0, Len__tb__empty, Html__tb__empty)) { && Bry_.Eq(bfr.Bfr(), 0, Len__tb__empty, Html__tb__empty)) {
bfr.Clear(); bfr.Clear();
return Bry_.Empty; return;
} }
return bfr.To_bry_and_clear();
} }
public int Split(byte[] src, int itm_bgn, int itm_end) { public int Split(byte[] src, int itm_bgn, int itm_end) {
byte[] out_line = Bry_.Mid(src, itm_bgn, itm_end); // MW: "$outLine" byte[] out_line = Bry_.Mid(src, itm_bgn, itm_end); // MW: "$outLine"
@ -78,7 +87,7 @@ public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U
byte first_char = line[0]; byte first_char = line[0];
first_2[0] = line[0]; first_2[0] = line[0];
if (line_len > 1) first_2[1] = line[1]; first_2[1] = line_len == 1 ? Byte_ascii.Null : line[1];
// PORTED: preg_match('/^(:*)\s*\{\|(.*)$/', $line, $matches) // PORTED: preg_match('/^(:*)\s*\{\|(.*)$/', $line, $matches)
byte[] tblw_atrs = null; byte[] tblw_atrs = null;
@ -94,15 +103,15 @@ public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U
// First check if we are starting a new table // First check if we are starting a new table
indent_level = colons_end; indent_level = colons_end;
tblw_atrs = uniq_mgr.Convert(tblw_atrs); tblw_atrs = strip_state.Unstrip_both(tblw_atrs);
// PORTED: out_line = str_repeat('<dl><dd>', $indent_level) . "<table{atrs}>"; // PORTED: out_line = str_repeat('<dl><dd>', $indent_level) . "<table{atrs}>";
for (int j = 0; j < indent_level; j++) for (int j = 0; j < indent_level; j++)
tmp_bfr.Add(Html__dl__bgn); tmp.Add(Html__dl__bgn);
tmp_bfr.Add_str_a7("<table"); tmp.Add_str_a7("<table");
sanitizer.Fix_tag_attributes(tmp_bfr, Name__table, tblw_atrs); sanitizer.Fix_tag_attributes(tmp, Name__table, tblw_atrs);
tmp_bfr.Add_byte(Byte_ascii.Angle_end); tmp.Add_byte(Byte_ascii.Angle_end);
out_line = tmp_bfr.To_bry_and_clear(); out_line = tmp.To_bry_and_clear();
td_history.Add(false); td_history.Add(false);
last_tag_history.Add(Bry_.Empty); last_tag_history.Add(Bry_.Empty);
tr_history.Add(false); tr_history.Add(false);
@ -116,35 +125,35 @@ public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U
} }
else if (Bry_.Eq(first_2, Wtxt__tb__end)) { else if (Bry_.Eq(first_2, Wtxt__tb__end)) {
// We are ending a table // We are ending a table
line = tmp_bfr.Add_str_a7("</table>").Add_mid(line, 2, line.length).To_bry_and_clear(); line = tmp.Add_str_a7("</table>").Add_mid(line, 2, line.length).To_bry_and_clear();
byte[] last_tag = Php_ary_.Pop_bry_or_null(last_tag_history); byte[] last_tag = Php_ary_.Pop_bry_or_null(last_tag_history);
if (!Php_ary_.Pop_bool_or_n(has_opened_tr)) { if (!Php_ary_.Pop_bool_or_n(has_opened_tr)) {
line = tmp_bfr.Add_str_a7("<tr><td></td></tr>").Add(line).To_bry_and_clear(); line = tmp.Add_str_a7("<tr><td></td></tr>").Add(line).To_bry_and_clear();
} }
if (Php_ary_.Pop_bool_or_n(tr_history)) { if (Php_ary_.Pop_bool_or_n(tr_history)) {
line = tmp_bfr.Add_str_a7("</tr>").Add(line).To_bry_and_clear(); line = tmp.Add_str_a7("</tr>").Add(line).To_bry_and_clear();
} }
if (Php_ary_.Pop_bool_or_n(td_history)) { if (Php_ary_.Pop_bool_or_n(td_history)) {
line = tmp_bfr.Add_str_a7("</").Add(last_tag).Add_byte(Byte_ascii.Angle_end).Add(line).To_bry_and_clear(); line = tmp.Add_str_a7("</").Add(last_tag).Add_byte(Byte_ascii.Angle_end).Add(line).To_bry_and_clear();
} }
Php_ary_.Pop_bry_or_null(tr_attributes); Php_ary_.Pop_bry_or_null(tr_attributes);
// PORTED:$outLine = $line . str_repeat( '</dd></dl>', $indent_level ); // PORTED:$outLine = $line . str_repeat( '</dd></dl>', $indent_level );
tmp_bfr.Add(line); tmp.Add(line);
for (int j = 0; j < indent_level; j++) for (int j = 0; j < indent_level; j++)
tmp_bfr.Add(Html__dl__end); tmp.Add(Html__dl__end);
out_line = tmp_bfr.To_bry_and_clear(); out_line = tmp.To_bry_and_clear();
} }
else if (Bry_.Eq(first_2, Wtxt__tr)) { else if (Bry_.Eq(first_2, Wtxt__tr)) {
// Now we have a table row // Now we have a table row
line = Bry_.Mid(line, 2); // PORTED: $line = preg_replace('#^\|-+#', '', $line); line = Bry_.Mid(line, 2); // PORTED: $line = preg_replace('#^\|-+#', '', $line);
// Whats after the tag is now only attributes // Whats after the tag is now only attributes
byte[] atrs = uniq_mgr.Unstrip_both(line); byte[] atrs = strip_state.Unstrip_both(line);
sanitizer.Fix_tag_attributes(tmp_bfr, Name__tr, atrs); sanitizer.Fix_tag_attributes(tmp, Name__tr, atrs);
atrs = tmp_bfr.To_bry_and_clear(); atrs = tmp.To_bry_and_clear();
Php_ary_.Pop_bry_or_null(tr_attributes); Php_ary_.Pop_bry_or_null(tr_attributes);
tr_attributes.Add(atrs); tr_attributes.Add(atrs);
@ -159,7 +168,7 @@ public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U
} }
if (Php_ary_.Pop_bool_or_n(td_history)) { if (Php_ary_.Pop_bool_or_n(td_history)) {
line = tmp_bfr.Add_str_a7("</").Add(last_tag).Add_byte(Byte_ascii.Gt).Add(line).To_bry_and_clear(); line = tmp.Add_str_a7("</").Add(last_tag).Add_byte(Byte_ascii.Gt).Add(line).To_bry_and_clear();
} }
out_line = line; out_line = line;
@ -181,13 +190,14 @@ public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U
// Implies both are valid for table headings. // Implies both are valid for table headings.
if (first_char == Byte_ascii.Bang) { if (first_char == Byte_ascii.Bang) {
Xomw_string_utils.Replace_markup(line, 0, line.length, Wtxt__th2, Wtxt__td2); // $line = StringUtils::replaceMarkup('!!', '||', $line); Xomw_string_utl.Replace_markup(line, 0, line.length, Wtxt__th2, Wtxt__td2); // $line = StringUtils::replaceMarkup('!!', '||', $line);
} }
// Split up multiple cells on the same line. // Split up multiple cells on the same line.
// FIXME : This can result in improper nesting of tags processed // FIXME : This can result in improper nesting of tags processed
// by earlier parser steps. // by earlier parser steps.
byte[][] cells = Bry_split_.Split(line, Wtxt__td2); byte[][] cells = Bry_split_.Split(line, Wtxt__td2);
if (cells.length == 0) cells = Cells__empty; // handle "\n|\n" which should still generate "<tr><td></td></tr>", not ""; see TEST
out_line = Bry_.Empty; out_line = Bry_.Empty;
@ -200,7 +210,7 @@ public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U
if (first_char != Byte_ascii.Plus) { if (first_char != Byte_ascii.Plus) {
byte[] tr_after = Php_ary_.Pop_bry_or_null(tr_attributes); byte[] tr_after = Php_ary_.Pop_bry_or_null(tr_attributes);
if (!Php_ary_.Pop_bool_or_n(tr_history)) { if (!Php_ary_.Pop_bool_or_n(tr_history)) {
previous = tmp_bfr.Add_str_a7("<tr").Add(tr_after).Add_str_a7(">\n").To_bry_and_clear(); previous = tmp.Add_str_a7("<tr").Add(tr_after).Add_str_a7(">\n").To_bry_and_clear();
} }
tr_history.Add(true); tr_history.Add(true);
tr_attributes.Add(Bry_.Empty); tr_attributes.Add(Bry_.Empty);
@ -211,7 +221,7 @@ public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U
byte[] last_tag = Php_ary_.Pop_bry_or_null(last_tag_history); byte[] last_tag = Php_ary_.Pop_bry_or_null(last_tag_history);
if (Php_ary_.Pop_bool_or_n(td_history)) { if (Php_ary_.Pop_bool_or_n(td_history)) {
previous = tmp_bfr.Add_str_a7("</").Add(last_tag).Add_str_a7(">\n").Add(previous).To_bry_and_clear(); previous = tmp.Add_str_a7("</").Add(last_tag).Add_str_a7(">\n").Add(previous).To_bry_and_clear();
} }
if (first_char == Byte_ascii.Pipe) { if (first_char == Byte_ascii.Pipe) {
@ -237,17 +247,17 @@ public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U
byte[] cell_data_0 = cell_data[0]; byte[] cell_data_0 = cell_data[0];
byte[] cell_data_1 = cell_data[1]; byte[] cell_data_1 = cell_data[1];
if (Bry_find_.Find_fwd(cell_data_0, Wtxt__lnki__bgn) != Bry_find_.Not_found) { if (Bry_find_.Find_fwd(cell_data_0, Wtxt__lnki__bgn) != Bry_find_.Not_found) {
cell = tmp_bfr.Add(previous).Add_byte(Byte_ascii.Angle_bgn).Add(last_tag).Add_byte(Byte_ascii.Angle_end).Add(cell).To_bry_and_clear(); cell = tmp.Add(previous).Add_byte(Byte_ascii.Angle_bgn).Add(last_tag).Add_byte(Byte_ascii.Angle_end).Add(cell).To_bry_and_clear();
} }
else if (cell_data_1 == null) { else if (cell_data_1 == null) {
cell = tmp_bfr.Add(previous).Add_byte(Byte_ascii.Angle_bgn).Add(last_tag).Add_byte(Byte_ascii.Angle_end).Add(cell_data_0).To_bry_and_clear(); cell = tmp.Add(previous).Add_byte(Byte_ascii.Angle_bgn).Add(last_tag).Add_byte(Byte_ascii.Angle_end).Add(cell_data_0).To_bry_and_clear();
} }
else { else {
byte[] atrs = uniq_mgr.Unstrip_both(cell_data_0); byte[] atrs = strip_state.Unstrip_both(cell_data_0);
tmp_bfr.Add(previous).Add_byte(Byte_ascii.Angle_bgn).Add(last_tag); tmp.Add(previous).Add_byte(Byte_ascii.Angle_bgn).Add(last_tag);
sanitizer.Fix_tag_attributes(tmp_bfr, last_tag, atrs); sanitizer.Fix_tag_attributes(tmp, last_tag, atrs);
tmp_bfr.Add_byte(Byte_ascii.Angle_end).Add(cell_data_1); tmp.Add_byte(Byte_ascii.Angle_end).Add(cell_data_1);
cell = tmp_bfr.To_bry_and_clear(); cell = tmp.To_bry_and_clear();
} }
out_line = Bry_.Add(out_line, cell); out_line = Bry_.Add(out_line, cell);
@ -278,4 +288,5 @@ public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U
, Html__tb__empty = Bry_.new_a7("<table>\n<tr><td></td></tr>\n</table>") , Html__tb__empty = Bry_.new_a7("<table>\n<tr><td></td></tr>\n</table>")
; ;
private static final int Len__tb__empty = Html__tb__empty.length; private static final int Len__tb__empty = Html__tb__empty.length;
private static final byte[][] Cells__empty = new byte[][] {Bry_.Empty};
} }

@ -15,7 +15,7 @@ GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
package gplx.xowa.parsers.mws.tables; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*; package gplx.xowa.mws.parsers.tables; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
import org.junit.*; import org.junit.*;
public class Xomw_table_wkr__tst { public class Xomw_table_wkr__tst {
private final Xomw_table_wkr__fxt fxt = new Xomw_table_wkr__fxt(); private final Xomw_table_wkr__fxt fxt = new Xomw_table_wkr__fxt();
@ -101,13 +101,29 @@ public class Xomw_table_wkr__tst {
, "<tr><td></td></tr></table>" , "<tr><td></td></tr></table>"
)); ));
} }
@Test public void Td__empty() { // PURPOSE: handles (a) failure due to "first_2" array not handling "\n|\n"; (b) missing <tr><td></td></tr>
fxt.Test__parse(String_.Concat_lines_nl_skip_last
( "{|"
, "|-"
, "|"
, "|}"
), String_.Concat_lines_nl_skip_last
( "<table>"
, ""
, "<tr>"
, "<td>"
, "</td></tr></table>"
));
}
} }
class Xomw_table_wkr__fxt { class Xomw_table_wkr__fxt {
private final Xomw_parser_ctx ctx = new Xomw_parser_ctx(); private final Xomw_parser_bfr parser_bfr = new Xomw_parser_bfr();
private final Xomw_table_wkr wkr = new Xomw_table_wkr(); private final Xomw_parser_ctx pctx = new Xomw_parser_ctx();
private final Xomw_table_wkr wkr = new Xomw_table_wkr(new Xomw_parser());
public void Test__parse(String src_str, String expd) { public void Test__parse(String src_str, String expd) {
byte[] src_bry = Bry_.new_u8(src_str); byte[] src_bry = Bry_.new_u8(src_str);
byte[] actl = wkr.Do_table_stuff(ctx, src_bry); parser_bfr.Init(src_bry);
Tfds.Eq_str_lines(expd, String_.new_u8(actl), src_str); wkr.Do_table_stuff(pctx, parser_bfr);
Tfds.Eq_str_lines(expd, parser_bfr.Rslt().To_str_and_clear(), src_str);
} }
} }

@ -15,8 +15,8 @@ GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
package gplx.xowa.parsers.mws.utils; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*; package gplx.xowa.mws.utls; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
public class Xomw_string_utils { public class Xomw_string_utl {
public static void Replace_markup(byte[] src, int src_bgn, int src_end, byte[] find, byte[] repl) { // REF:/includes/libs/StringUtils.php|replaceMarkup public static void Replace_markup(byte[] src, int src_bgn, int src_end, byte[] find, byte[] repl) { // REF:/includes/libs/StringUtils.php|replaceMarkup
// PORTED: avoiding multiple regex calls / String creations // PORTED: avoiding multiple regex calls / String creations
// $placeholder = "\x00"; // $placeholder = "\x00";

@ -15,10 +15,10 @@ GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
package gplx.xowa.parsers.mws.utils; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*; package gplx.xowa.mws.utls; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
import org.junit.*; import org.junit.*; import gplx.core.tests.*;
public class Xomw_string_utils__tst { public class Xomw_string_utl__tst {
private final Xomw_string_utils__fxt fxt = new Xomw_string_utils__fxt(); private final Xomw_string_utl__fxt fxt = new Xomw_string_utl__fxt();
@Test public void Basic() { @Test public void Basic() {
fxt.Test__replace_markup("a!!b" , "!!", "||", "a||b"); fxt.Test__replace_markup("a!!b" , "!!", "||", "a||b");
} }
@ -38,10 +38,10 @@ public class Xomw_string_utils__tst {
fxt.Test__replace_markup("a!!b<!!>!!>!!c" , "!!", "||", "a||b<!!>||>||c"); // NOTE: should probably be "!!>!!>", but unmatched ">" are escaped to "&gt;" fxt.Test__replace_markup("a!!b<!!>!!>!!c" , "!!", "||", "a||b<!!>||>||c"); // NOTE: should probably be "!!>!!>", but unmatched ">" are escaped to "&gt;"
} }
} }
class Xomw_string_utils__fxt { class Xomw_string_utl__fxt {
public void Test__replace_markup(String src_str, String find, String repl, String expd) { public void Test__replace_markup(String src_str, String find, String repl, String expd) {
byte[] src_bry = Bry_.new_u8(src_str); byte[] src_bry = Bry_.new_u8(src_str);
Xomw_string_utils.Replace_markup(src_bry, 0, src_bry.length, Bry_.new_a7(find), Bry_.new_a7(repl)); Xomw_string_utl.Replace_markup(src_bry, 0, src_bry.length, Bry_.new_a7(find), Bry_.new_a7(repl));
Tfds.Eq_str(expd, src_bry); Gftest.Eq__str(expd, src_bry);
} }
} }

@ -0,0 +1,120 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.utls; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
public class Xomw_ttl_utl {
	// REF.MW: DefaultSettings.php
	// Allowed title characters -- regex character class
	// Don't change this unless you know what you're doing
	//
	// Problematic punctuation:
	//   -  []{}|#    Are needed for link syntax, never enable these
	//   -  <>        Causes problems with HTML escaping, don't use
	//   -  %         Enabled by default, minor problems with path to query rewrite rules, see below
	//   -  +         Enabled by default, but doesn't work with path to query rewrite rules,
	//                corrupted by apache
	//   -  ?         Enabled by default, but doesn't work with path to PATH_INFO rewrites
	//
	// All three of these punctuation problems can be avoided by using an alias,
	// instead of a rewrite rule of either variety.
	//
	// The problem with % is that when using a path to query rewrite rule, URLs are
	// double-unescaped: once by Apache's path conversion code, and again by PHP. So
	// %253F, for example, becomes "?". Our code does not double-escape to compensate
	// for this, indeed double escaping would break if the double-escaped title was
	// passed in the query String rather than the path. This is a minor security issue
	// because articles can be created such that they are hard to view or edit.
	//
	// In some rare cases you may wish to remove + for compatibility with old links.
	//
	// Theoretically 0x80-0x9F of ISO 8859-1 should be disallowed, but
	// this breaks interlanguage links
	// $wgLegalTitleChars = " %!\"$&'()*,\\-.\\/0-9:;=?@A-Z\\\\^_`a-z~\\x80-\\xFF+";
	//
	// REGEX:
	//   without-backslash escaping --> \s%!"$&'()*,-./0-9:;=?@A-Z\^_`a-z~x80-xFF+
	//   rearranged
	//     letters                  --> 0-9A-Za-z
	//     unicode-chars            --> x80-xFF
	//     symbols                  --> \s%!"$&'()*,-./:;=?@\^_`~+"
	//   deliberately ignores
	//     control chars: 00-31,127
	//     []{}|#<>

	/**
	 * Scans forward from src_bgn while the bytes form legal title characters.
	 *
	 * @param src     bytes to scan
	 * @param src_bgn position to start scanning at
	 * @param src_end exclusive upper bound of the scan
	 * @param valid   lookup table indexed by ASCII code (see Title_chars_valid); true marks a legal char
	 * @return position of the first illegal ASCII byte, or src_end if none was found;
	 *         never greater than src_end when src_bgn <= src_end
	 */
	public static int Find_fwd_while_title(byte[] src, int src_bgn, int src_end, boolean[] valid) {
		int cur = src_bgn;
		// "<" instead of "==": a multi-byte step may not land exactly on src_end
		while (cur < src_end) {
			byte b = src[cur];
			int b_len = gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(b);
			if (b_len == 1) {
				int idx = b & 0xFF;	// bytes are signed in Java; never index with a negative value
				// NOTE(review): presumably the helper only reports 1 for ASCII; should it ever report 1
				// for a byte >= 0x80, that byte is accepted because $wgLegalTitleChars allows \x80-\xFF
				if (idx < 128 && !valid[idx])	// invalid; EX: "<title>"
					break;
				cur++;							// valid; EX: "a0A B&$"
			}
			else {								// Multi-byte UTF8; NOTE: all sequences are valid
				cur += b_len;
				if (cur > src_end) {			// sequence truncated by src_end; do not report a position past the bound
					cur = src_end;
					break;
				}
			}
		}
		return cur;
	}
	private static boolean[] title_chars_valid;
	/**
	 * Returns the shared 128-entry table of legal ASCII title characters, building it on first use.
	 * The same array instance is returned on every call; callers must treat it as read-only.
	 */
	public static boolean[] Title_chars_valid() {
		if (title_chars_valid == null) {
			// populate a local first so the field never references a half-filled table
			// (this is not a full thread-safety guarantee)
			boolean[] rv = new boolean[128];
			// add num and alpha
			for (int i = Byte_ascii.Num_0; i <= Byte_ascii.Num_9; i++)
				rv[i] = true;
			for (int i = Byte_ascii.Ltr_A; i <= Byte_ascii.Ltr_Z; i++)
				rv[i] = true;
			for (int i = Byte_ascii.Ltr_a; i <= Byte_ascii.Ltr_z; i++)
				rv[i] = true;

			// add symbols: \s%!"$&'()*,-./:;=?@\^_`~+"
			byte[] symbols = new byte[]
			{ Byte_ascii.Space
			, Byte_ascii.Percent
			, Byte_ascii.Bang
			, Byte_ascii.Quote
			, Byte_ascii.Dollar		// listed in $wgLegalTitleChars; was missing from this table
			, Byte_ascii.Amp
			, Byte_ascii.Apos
			, Byte_ascii.Paren_bgn
			, Byte_ascii.Paren_end
			, Byte_ascii.Star
			, Byte_ascii.Comma
			, Byte_ascii.Dash
			, Byte_ascii.Dot
			, Byte_ascii.Slash
			, Byte_ascii.Colon
			, Byte_ascii.Semic
			, Byte_ascii.Eq
			, Byte_ascii.Question
			, Byte_ascii.At
			, Byte_ascii.Backslash
			, Byte_ascii.Pow
			, Byte_ascii.Underline
			, Byte_ascii.Tick
			, Byte_ascii.Tilde
			, Byte_ascii.Plus
			};
			int symbols_len = symbols.length;
			for (int i = 0; i < symbols_len; i++)
				rv[symbols[i]] = true;
			title_chars_valid = rv;
		}
		return title_chars_valid;
	}
}

@ -0,0 +1,30 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.utls; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
import org.junit.*; import gplx.core.tests.*;
public class Xomw_ttl_utl__tst {
	private final Xomw_ttl_utl__fxt fxt = new Xomw_ttl_utl__fxt();
	// every byte is accepted, so the scan is expected to end at the input length
	@Test public void Alphanum() {
		fxt.Test__find_fwd_while_title("0aB", 3);
	}
	// the scan is expected to stop at the offset of "<"
	@Test public void Angle() {
		fxt.Test__find_fwd_while_title("0a<", 2);
	}
}
class Xomw_ttl_utl__fxt {
	// Runs Find_fwd_while_title over the whole of src_str and checks the returned position against expd.
	public void Test__find_fwd_while_title(String src_str, int expd) {
		byte[] src_bry = Bry_.new_u8(src_str);
		boolean[] valid = Xomw_ttl_utl.Title_chars_valid();
		int actl = Xomw_ttl_utl.Find_fwd_while_title(src_bry, 0, src_bry.length, valid);
		Gftest.Eq__int(expd, actl);
	}
}

@ -16,10 +16,10 @@ You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
package gplx.xowa.parsers.hdrs.sections; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.hdrs.*; package gplx.xowa.parsers.hdrs.sections; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.hdrs.*;
import gplx.xowa.parsers.mws.*; import gplx.xowa.parsers.mws.wkrs.*; import gplx.xowa.mws.parsers.*; import gplx.xowa.mws.parsers.headings.*;
import gplx.xowa.addons.htmls.tocs.*; import gplx.xowa.htmls.core.htmls.tidy.*; import gplx.xowa.addons.htmls.tocs.*; import gplx.xowa.htmls.core.htmls.tidy.*;
class Xop_section_list implements Xomw_hdr_cbk { class Xop_section_list implements Xomw_heading_cbk {
private final Xomw_hdr_wkr hdr_wkr = new Xomw_hdr_wkr(); private final Xomw_heading_wkr hdr_wkr = new Xomw_heading_wkr();
private final Ordered_hash hash = Ordered_hash_.New_bry(); private final Ordered_hash hash = Ordered_hash_.New_bry();
private final Xoh_toc_mgr toc_mgr = new Xoh_toc_mgr(); private final Xoh_toc_mgr toc_mgr = new Xoh_toc_mgr();
private byte[] src; private byte[] src;
@ -92,7 +92,7 @@ class Xop_section_list implements Xomw_hdr_cbk {
return new int[] {src_bgn, src_end}; return new int[] {src_bgn, src_end};
} }
public void On_hdr_seen(Xomw_parser_ctx pctx, Xomw_hdr_wkr wkr) { public void On_hdr_seen(Xomw_parser_ctx pctx, Xomw_heading_wkr wkr) {
// get key by taking everything between ==; EX: "== abc ==" -> " abc " // get key by taking everything between ==; EX: "== abc ==" -> " abc "
byte[] src = wkr.Src(); byte[] src = wkr.Src();
int hdr_txt_bgn = wkr.Hdr_lhs_end(); int hdr_txt_bgn = wkr.Hdr_lhs_end();
@ -117,5 +117,5 @@ class Xop_section_list implements Xomw_hdr_cbk {
Xop_section_itm itm = new Xop_section_itm(hash.Count(), num, key, wkr.Hdr_bgn(), wkr.Hdr_end()); Xop_section_itm itm = new Xop_section_itm(hash.Count(), num, key, wkr.Hdr_bgn(), wkr.Hdr_end());
hash.Add(key, itm); hash.Add(key, itm);
} }
public void On_src_done(Xomw_parser_ctx pctx, Xomw_hdr_wkr wkr) {} public void On_src_done(Xomw_parser_ctx pctx, Xomw_heading_wkr wkr) {}
} }

@ -17,7 +17,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
package gplx.xowa.parsers.hdrs.sections; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.hdrs.*; package gplx.xowa.parsers.hdrs.sections; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.hdrs.*;
import gplx.langs.htmls.*; import gplx.langs.htmls.*;
import gplx.xowa.parsers.mws.*; import gplx.xowa.parsers.mws.wkrs.*; import gplx.xowa.parsers.hdrs.*; import gplx.xowa.htmls.core.htmls.tidy.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*; import gplx.xowa.parsers.hdrs.*; import gplx.xowa.htmls.core.htmls.tidy.*;
public class Xop_section_mgr implements Gfo_invk { public class Xop_section_mgr implements Gfo_invk {
private Xoae_app app; private Xowe_wiki wiki; private Xoae_app app; private Xowe_wiki wiki;
private Xow_tidy_mgr_interface tidy_mgr; private Xow_tidy_mgr_interface tidy_mgr;

@ -1,261 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.mws.blocks; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
import gplx.langs.phps.utls.*;
/**
 * Line-by-line block-level pass over wikitext (paragraphs, pre, and list prefixes * # : ;).
 *
 * NOTE(review): this looks like an in-progress port of MediaWiki's PHP block-level pass; the PHP
 * statements that are not ported yet are kept below as comments next to the place they belong.
 * Until they are ported, several flags (pre_open_match, pre_close_match, open_match, close_match)
 * are hard-coded to false and prefix_len is always 0.
 *
 * The instance keeps per-call state in fields and reuses one buffer, so a single instance must not
 * be used by more than one caller at a time.
 */
public class Xomw_block_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.UNSAFE: caching for repeated calls
	private final Bry_bfr bfr = Bry_bfr_.New();
	private byte[] last_prefix, last_section;
	private boolean line_start, dt_open, in_block_elem, para_stack, in_blockquote, in_pre = false;
	private int prefix_len;
	private int src_len;
	/**
	 * Processes src one line at a time (split on "\n") and returns the accumulated output.
	 *
	 * @param src        text to process
	 * @param line_start false if the first line of src should be copied through untouched;
	 *                   true if the first line is itself the start of a line
	 * @return the contents of the internal buffer, which is cleared afterwards
	 */
	public byte[] Do_block_levels(byte[] src, boolean line_start) {
		this.src_len = src.length;
		this.line_start = line_start;
		// Parsing through the text line by line. The main thing
		// happening here is handling of block-level elements p, pre,
		// and making lists from lines starting with * # : etc.
		this.last_prefix = Bry_.Empty;
		bfr.Clear();
		this.dt_open = this.in_block_elem = false;
		this.prefix_len = 0;
		this.para_stack = false;
		this.in_blockquote = false;
		// PORTED.SPLIT: $textLines = StringUtils::explode("\n", $text);
		Bry_split_.Split(src, 0, src_len, Byte_ascii.Nl, Bool_.N, this);
		// close any list levels still open after the last line
		while (prefix_len > 0) {
			// bfr .= this.closeList(prefix2[prefix_len - 1]);
			prefix_len--;
			if (prefix_len > 0) {
				bfr.Add_byte_nl();
			}
		}
		// close the last open section, if any
		if (Bry_.Len_gt_0(last_section)) {
			bfr.Add_str_a7("</").Add(last_section).Add_str_a7(">");
			this.last_section = Bry_.Empty;
		}
		// intentionally empty: only reads the flags that are not otherwise used yet
		if (dt_open || in_block_elem || para_stack || in_blockquote || in_pre) {
		}
		return bfr.To_bry_and_clear();
	}
	/**
	 * Callback invoked by Bry_split_.Split for each line of the text passed to Do_block_levels.
	 *
	 * @param src     the full text being split
	 * @param itm_bgn start offset of the current line in src
	 * @param itm_end end offset of the current line in src
	 * @return always Bry_split_.Rv__ok
	 */
	public int Split(byte[] src, int itm_bgn, int itm_end) {
		// Fix up line_start
		if (!line_start) {
			bfr.Add_mid(src, itm_bgn, itm_end);
			line_start = true;
			return Bry_split_.Rv__ok;
		}
		// * = ul
		// # = ol
		// ; = dt
		// : = dd
		int last_prefix_len = last_prefix.length;
		boolean pre_close_match = false; //preg_match('/<\\/pre/i', $oLine);
		boolean pre_open_match = false; //preg_match('/<pre/i', $oLine);
		byte[] prefix = null, prefix2 = null, t = null;
		// If not in a <pre> element, scan for and figure out what prefixes are there.
		if (!in_pre) {
			// Multiple prefixes may abut each other for nested lists.
			prefix_len = 0;// strspn($oLine, '*#:;');
			prefix = Php_str_.Substr(src, itm_bgn, prefix_len);
			// eh?
			// ; and : are both from definition-lists, so they're equivalent
			// for the purposes of determining whether or not we need to open/close
			// elements.
			prefix2 = Bry_.Replace(prefix, Byte_ascii.Semic, Byte_ascii.Colon);
			t = Bry_.Mid(src, itm_bgn + prefix_len, itm_end);
			// this.in_pre = (boolean)pre_open_match;
		}
		else {
			// Don't interpret any other prefixes in preformatted text
			prefix_len = 0;
			prefix = prefix2 = Bry_.Empty;
			t = Bry_.Mid(src, itm_bgn, itm_end);
		}
		// List generation
		byte[] term = null, t2 = null;
		int common_prefix_len = -1;
		if (prefix_len > 0 && Bry_.Eq(last_prefix, prefix2)) {
			// Same as the last item, so no need to deal with nesting or opening stuff
			// bfr .= this.nextItem(substr(prefix, -1));
			para_stack = false;
			if (prefix_len > 0 && prefix[prefix_len - 1] == Byte_ascii.Semic) {
				// The one nasty exception: definition lists work like this:
				// ; title : definition text
				// So we check for : in the remainder text to split up the
				// title and definition, without b0rking links.
				term = t2 = Bry_.Empty;
				// if (this.findColonNoLinks(t, term, t2) !== false) {
				t = t2;
				bfr.Add(term); // . this.nextItem(':');
				// }
			}
		}
		else if (prefix_len > 0 || last_prefix_len > 0) {
			// We need to open or close prefixes, or both.
			// Either open or close a level...
			// common_prefix_len = this.getCommon(prefix, last_prefix);
			para_stack = false;
			// Close all the prefixes which aren't shared.
			while (common_prefix_len < last_prefix_len) {
				// bfr .= this.closeList(last_prefix[last_prefix_len - 1]);
				last_prefix_len--;
			}
			//
			// Continue the current prefix if appropriate.
			if (prefix_len <= common_prefix_len && common_prefix_len > 0) {
				// bfr .= this.nextItem(prefix[common_prefix_len - 1]);
			}
			// Open prefixes where appropriate.
			if (Bry_.Len_gt_0(last_prefix) && prefix_len > common_prefix_len) {
				bfr.Add_byte_nl();
			}
			while (prefix_len > common_prefix_len) {
				// $char = substr(prefix, common_prefix_len, 1);
				// bfr .= this.openList($char);
				//
				// if (';' == $char) {
				// // @todo FIXME: This is dupe of code above
				// if (this.findColonNoLinks(t, term, t2) !== false) {
				// t = t2;
				// bfr .= term . this.nextItem(':');
				// }
				// }
				++common_prefix_len;
			}
			if (prefix_len == 0 && Bry_.Len_gt_0(last_prefix)) {
				bfr.Add_byte_nl();
			}
			last_prefix = prefix2;
		}
		// If we have no prefixes, go to paragraph mode.
		if (0 == prefix_len) {
			// No prefix (not in list)--go to paragraph mode
			// XXX: use a stack for nestable elements like span, table and div
			boolean open_match = false, close_match = false;
			// open_match = preg_match(
			// '/(?:<table|<h1|<h2|<h3|<h4|<h5|<h6|<pre|<tr|'
			// . '<p|<ul|<ol|<dl|<li|<\\/tr|<\\/td|<\\/th)/iS',
			// t
			// );
			// close_match = preg_match(
			// '/(?:<\\/table|<\\/h1|<\\/h2|<\\/h3|<\\/h4|<\\/h5|<\\/h6|'
			// . '<td|<th|<\\/?blockquote|<\\/?div|<hr|<\\/pre|<\\/p|<\\/mw:|'
			// . self::MARKER_PREFIX
			// . '-pre|<\\/li|<\\/ul|<\\/ol|<\\/dl|<\\/?center)/iS',
			// t
			// );
			if (open_match || close_match) {
				para_stack = false;
				// @todo bug 5718: paragraph closed
				// bfr .= this.closeParagraph();
				if (pre_open_match && !pre_close_match) {
					this.in_pre = true;
				}
				// $bqOffset = 0;
				// while (preg_match('/<(\\/?)blockquote[\s>]/i', t,
				// $bqMatch, PREG_OFFSET_CAPTURE, $bqOffset)
				// ) {
				// in_blockquote = !$bqMatch[1][0]; // is this a close tag?
				// $bqOffset = $bqMatch[0][1] + strlen($bqMatch[0][0]);
				// }
				in_block_elem = !close_match;
			}
			else if (!in_block_elem && !this.in_pre) {
				// length guard: an empty line has no first byte, so it can never be a
				// space-indented (pre) line; without the guard t[0] throws on blank lines
				if (   t.length > 0
					&& Byte_ascii.Space == t[0]
					// && (last_section == 'pre' || trim(t) != '')
					&& !in_blockquote
					) {
					// pre
					// if (this.last_section !== 'pre') {
					para_stack = false;
					// bfr .= this.closeParagraph() . '<pre>';
					// this.last_section = 'pre';
					// }
					t = Bry_.Mid(t, 1);
				}
				else {
					// paragraph
					// if (trim(t) == '') {
					if (para_stack) {
						// bfr .= para_stack . '<br />';
						para_stack = false;
						// this.last_section = 'p';
					}
					else {
						// if (this.last_section !== 'p') {
						// bfr .= this.closeParagraph();
						// this.last_section = '';
						// para_stack = '<p>';
						// }
						// else {
						// para_stack = '</p><p>';
						// }
					}
					// }
					// else {
					if (para_stack) {
						// bfr .= para_stack;
						para_stack = false;
						// this.last_section = 'p';
					}
					// else if (this.last_section !== 'p') {
					// bfr .= this.closeParagraph() . '<p>';
					// this.last_section = 'p';
					// }
					// }
				}
			}
		}
		// somewhere above we forget to get out of pre block (bug 785)
		if (pre_close_match && this.in_pre) {
			this.in_pre = false;
		}
		if (para_stack == false) {
			bfr.Add(t);
			if (prefix_len == 0) {
				bfr.Add_byte_nl();
			}
		}
		// intentionally empty: only reads the two counters after the list-handling code above
		if (last_prefix_len == -1 || common_prefix_len == -1) {
		}
		return Bry_split_.Rv__ok;
	}
	// private static final int
	// Para_stack_none = 0 // false
	// , Para_stack_bgn = 1 // <p>
	// , Para_stack_mid = 2 // </p><p>
	// ;
	// private static final byte
	// Mode_none = 0 // ''
	// , Mode_para = 1 // p
	// , Mode_pre = 2 // pre
	// ;
}

@ -1,41 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.mws.utils; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
import gplx.xowa.parsers.htmls.*;
/**
 * Re-serializes the attributes of an HTML tag.
 *
 * The raw attribute text is parsed into key/value items, and each item is appended to the
 * output buffer as {@code  key="val"} (with a leading space per attribute).
 */
public class Xomw_sanitizer_mgr {
	private final Mwh_doc_wkr__atr_bldr atr_bldr = new Mwh_doc_wkr__atr_bldr();
	private final Mwh_atr_parser atr_parser = new Mwh_atr_parser();
	/**
	 * Parses atrs and appends every attribute found to bfr as {@code  key="val"}.
	 *
	 * @param bfr      buffer that receives the serialized attributes
	 * @param tag_name name of the tag the attributes belong to (currently not read)
	 * @param atrs     raw attribute text; must not be null
	 */
	public void Fix_tag_attributes(Bry_bfr bfr, byte[] tag_name, byte[] atrs) {
		atr_bldr.Atrs__clear();
		atr_parser.Parse(atr_bldr, -1, -1, atrs, 0, atrs.length);
		int len = atr_bldr.Atrs__len();
		// PORTED: Sanitizer.php|safeEncodeTagAttributes
		for (int i = 0; i < len; i++) {
			// $encAttribute = htmlspecialchars( $attribute );
			// $encValue = Sanitizer::safeEncodeAttribute( $value );
			// $attribs[] = "$encAttribute=\"$encValue\"";
			Mwh_atr_itm itm = atr_bldr.Atrs__get_at(i);
			bfr.Add_byte_space(); // "return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';"
			bfr.Add_bry_escape_html(itm.Key_bry(), itm.Key_bgn(), itm.Key_end());
			bfr.Add_byte_eq().Add_byte_quote();
			// The value is always emitted between double quotes, so a literal '"' inside it
			// (e.g. from a single-quoted source attribute) must be encoded or it would end
			// the attribute early and let the rest be read as new attributes.
			Add_val_escape_quote(bfr, itm.Val_as_bry()); // TODO.XO:Sanitizer::encode (full safeEncodeAttribute parity)
			bfr.Add_byte_quote();
		}
	}
	// Appends val to bfr, writing each double-quote byte as "&quot;". Only '"' is rewritten, so
	// entities already present in val are not double-encoded.
	private static void Add_val_escape_quote(Bry_bfr bfr, byte[] val) {
		int val_len = val.length;
		for (int i = 0; i < val_len; i++) {
			byte b = val[i];
			if (b == Byte_ascii.Quote)
				bfr.Add_str_a7("&quot;");
			else
				bfr.Add_byte(b);
		}
	}
}

@ -22,6 +22,7 @@ public class Xop_uniq_mgr { // REF.MW:/parser/StripState.php
private final Bry_bfr key_bfr = Bry_bfr_.New_w_size(32); private final Bry_bfr key_bfr = Bry_bfr_.New_w_size(32);
private int idx = -1; private int idx = -1;
public void Clear() {idx = -1; general_trie.Clear();} public void Clear() {idx = -1; general_trie.Clear();}
// Looks up key in general_trie, matching over the key's full length, and returns the hit as a byte array.
public byte[] Get(byte[] key) {
	Object found = general_trie.Match_exact(key, 0, key.length);
	return (byte[])found;
}
public byte[] Add(byte[] val) { // "<b>" -> "\u007fUNIQ-item-1--QINU\u007f" public byte[] Add(byte[] val) { // "<b>" -> "\u007fUNIQ-item-1--QINU\u007f"
byte[] key = key_bfr byte[] key = key_bfr
.Add(Bry__uniq__add__bgn) .Add(Bry__uniq__add__bgn)
@ -30,10 +31,6 @@ public class Xop_uniq_mgr { // REF.MW:/parser/StripState.php
general_trie.Add_bry_bry(key, val); general_trie.Add_bry_bry(key, val);
return key; return key;
} }
// Looks up key in general_trie, matching over the key's full length, and returns the hit as a byte array.
public byte[] Get(byte[] key) {
	Object found = general_trie.Match_exact(key, 0, key.length);
	return (byte[])found;
}
// Thin alias: hands src to Convert and returns whatever Convert produces.
public byte[] Unstrip_both(byte[] src) {
	byte[] converted = Convert(src);
	return converted;
}
public byte[] Convert(byte[] src) { public byte[] Convert(byte[] src) {
if (general_trie.Count() == 0) return src; if (general_trie.Count() == 0) return src;

Loading…
Cancel
Save