diff --git a/100_core/src/gplx/Array_.java b/100_core/src/gplx/Array_.java index 732329566..1c0d5765e 100644 --- a/100_core/src/gplx/Array_.java +++ b/100_core/src/gplx/Array_.java @@ -114,4 +114,9 @@ public class Array_ { Set_at(trg, i, Get_at(add, i - srcLen)); return trg; } + public static Object Clone(Object src) { + Object trg = Create(Component_type(src), Len(src)); + Copy(src, trg); + return trg; + } } diff --git a/100_core/src/gplx/Bry_.java b/100_core/src/gplx/Bry_.java index e9fff3ca4..df78d9751 100644 --- a/100_core/src/gplx/Bry_.java +++ b/100_core/src/gplx/Bry_.java @@ -18,6 +18,7 @@ along with this program. If not, see . package gplx; import java.lang.*; import gplx.core.brys.*; import gplx.core.primitives.*; import gplx.core.ios.*; +import gplx.langs.htmls.entitys.*; public class Bry_ { public static final String Cls_val_name = "byte[]"; public static final byte[] Empty = new byte[0]; @@ -62,6 +63,7 @@ public class Bry_ { public static byte[] new_u8(String str) { try { int str_len = str.length(); + if (str_len == 0) return Bry_.Empty; int bry_len = new_u8__by_len(str, str_len); byte[] rv = new byte[bry_len]; new_u8__write(str, str_len, rv, 0); @@ -365,7 +367,7 @@ public class Bry_ { || (end < bgn) ) return or; - return Mid(src, bgn, src.length); + return bgn == src_len ? Bry_.Empty : Mid(src, bgn, src_len); } public static byte[] Mid(byte[] src, int bgn, int end) { try { @@ -1001,6 +1003,33 @@ public class Bry_ { } return rv; } + public static byte[] Xcase__build__all(Bry_bfr tmp, boolean upper, byte[] src) { + if (src == null) return null; + int src_bgn = 0; + int src_end = src.length; + int lbound = 96, ubound = 123; + if (!upper) { + lbound = 64; ubound = 91; + } + + boolean dirty = false; + for (int i = src_bgn; i < src_end; i++) { + byte b = src[i]; + if (b > lbound && b < ubound) { + if (!dirty) { + dirty = true; + tmp.Add_mid(src, src_bgn, i); + } + if (upper) + b -= 32; + else + b += 32; + } + if (dirty) + tmp.Add_byte(b); + } + return dirty ? tmp.To_bry_and_clear() : src; + } public static byte[] Ucase__1st(byte[] src) {return Xcase__1st(Bool_.Y, src);} public static byte[] Lcase__1st(byte[] src) {return Xcase__1st(Bool_.N, src);} private static byte[] Xcase__1st(boolean upper, byte[] src) { @@ -1076,4 +1105,71 @@ public class Bry_ { public static byte[] Replace_nl_w_tab(byte[] src, int bgn, int end) { return Bry_.Replace(Bry_.Mid(src, bgn, end), Byte_ascii.Nl, Byte_ascii.Tab); } + public static byte[] Escape_html(byte[] src) { + return Escape_html(null, src, 0, src.length); + } + public static byte[] Escape_html(Bry_bfr bfr, byte[] src, int src_bgn, int src_end) { // uses PHP rules for htmlspecialchars; REF.PHP:http://php.net/manual/en/function.htmlspecialchars.php + boolean dirty = false; + int cur = src_bgn; + int prv = cur; + boolean called_by_bry = bfr == null; + + // loop over chars + while (true) { + // if EOS, exit + if (cur == src_end) { + if (dirty) { + bfr.Add_mid(src, prv, src_end); + } + break; + } + + // check current byte if escaped + byte b = src[cur]; + byte[] escaped = null; + switch (b) { + case Byte_ascii.Amp: escaped = Gfh_entity_.Amp_bry; break; + case Byte_ascii.Quote: escaped = Gfh_entity_.Quote_bry; break; + case Byte_ascii.Apos: escaped = Gfh_entity_.Apos_num_bry; break; + case Byte_ascii.Lt: escaped = Gfh_entity_.Lt_bry; break; + case Byte_ascii.Gt: escaped = Gfh_entity_.Gt_bry; break; + } + + // not escaped; increment and continue + if (escaped == null) { + cur++; + continue; + } + // escaped + else { + dirty = true; + if (bfr == null) bfr = Bry_bfr_.New(); + + if (prv < cur) + bfr.Add_mid(src, prv, cur); + bfr.Add(escaped); + cur++; + prv = cur; + } + } + + if (dirty) { + if (called_by_bry) + return bfr.To_bry_and_clear(); + else + return null; + } + else { + if (called_by_bry) { + if (src_bgn == 0 && src_end == src.length) + return src; + else + return Bry_.Mid(src, src_bgn, src_end); + } + else { + bfr.Add_mid(src, src_bgn, src_end); + return null; + } + } + } } diff --git a/100_core/src/gplx/Bry__tst.java b/100_core/src/gplx/Bry__tst.java index c02d5f75f..31e109f6f 100644 --- a/100_core/src/gplx/Bry__tst.java +++ b/100_core/src/gplx/Bry__tst.java @@ -279,8 +279,13 @@ public class Bry__tst { @Test public void Repeat_bry() { fxt.Test__repeat_bry("abc" , 3, "abcabcabc"); } + @Test public void Xcase__build__all() { + fxt.Test__xcase__build__all(Bool_.N, "abc", "abc"); + fxt.Test__xcase__build__all(Bool_.N, "aBc", "abc"); + } } class Bry__fxt { + private final Bry_bfr tmp = Bry_bfr_.New(); public void Test_trim_end(String raw, byte trim, String expd) { byte[] raw_bry = Bry_.new_a7(raw); Tfds.Eq(expd, String_.new_u8(Bry_.Trim_end(raw_bry, trim, raw_bry.length))); @@ -298,4 +303,7 @@ class Bry__fxt { public void Test__repeat_bry(String s, int count, String expd) { Gftest.Eq__str(expd, Bry_.Repeat_bry(Bry_.new_u8(s), count)); } + public void Test__xcase__build__all(boolean upper, String src, String expd) { + Gftest.Eq__str(expd, Bry_.Xcase__build__all(tmp, upper, Bry_.new_u8(src))); + } } diff --git a/100_core/src/gplx/Bry_bfr.java b/100_core/src/gplx/Bry_bfr.java index 195be3313..4d2c59cc9 100644 --- a/100_core/src/gplx/Bry_bfr.java +++ b/100_core/src/gplx/Bry_bfr.java @@ -297,35 +297,21 @@ public class Bry_bfr { Add_mid(val, bgn, end); return this; } - public Bry_bfr Add_bry_escape_html(byte[] val) {return Add_bry_escape_html(val, 0, val.length);} - public Bry_bfr Add_bry_escape_html(byte[] val, int bgn, int end) { // uses PHP rules for htmlspecialchars; REF.PHP:http://php.net/manual/en/function.htmlspecialchars.php - boolean clean = true; - for (int i = bgn; i < end; ++i) { - byte[] escaped = null; - byte b = val[i]; - switch (b) { - case Byte_ascii.Amp: escaped = Gfh_entity_.Amp_bry; break; - case Byte_ascii.Quote: escaped = Gfh_entity_.Quote_bry; break; - case Byte_ascii.Apos: escaped = Gfh_entity_.Apos_num_bry; break; - case Byte_ascii.Lt: escaped = Gfh_entity_.Lt_bry; break; - case Byte_ascii.Gt: escaped = Gfh_entity_.Gt_bry; break; - } - if (escaped == null && clean) { - continue; - } - else { - if (clean) { - clean = false; - this.Add_mid(val, bgn, i); - } - if (escaped == null) - this.Add_byte(b); - else - this.Add(escaped); - } + public Bry_bfr Add_bry_many(byte[]... ary) { + int len = ary.length; + for (int i = 0; i < len; i++) { + byte[] bry = ary[i]; + if (bry != null && bry.length > 0) + this.Add(bry); } - if (clean) - Add_mid(val, bgn, end); + return this; + } + public Bry_bfr Add_bry_escape_html(byte[] val) { + if (val == null) return this; + return Add_bry_escape_html(val, 0, val.length); + } + public Bry_bfr Add_bry_escape_html(byte[] val, int bgn, int end) { + Bry_.Escape_html(this, val, bgn, end); return this; } public Bry_bfr Add_str_u8_w_nl(String s) {Add_str_u8(s); return Add_byte_nl();} @@ -542,6 +528,30 @@ public class Bry_bfr { this.Del_by(count); return this; } + public Bry_bfr Trim_end_ws() { + if (bfr_len == 0) return this; + int count = 0; + for (int i = bfr_len - 1; i > -1; --i) { + byte b = bfr[i]; + if (Trim_end_ws_ary[b]) + ++count; + else + break; + } + if (count > 0) + this.Del_by(count); + return this; + } + private static final boolean[] Trim_end_ws_ary = Trim_end_ws_new(); + private static boolean[] Trim_end_ws_new() { + boolean[] rv = new boolean[256]; + rv[32] = true; + rv[ 9] = true; + rv[10] = true; + rv[13] = true; + rv[11] = true; + return rv; + } public Bry_bfr Concat_skip_empty(byte[] dlm, byte[]... ary) { int ary_len = ary.length; for (int i = 0; i < ary_len; i++) { diff --git a/100_core/src/gplx/Bry_find_.java b/100_core/src/gplx/Bry_find_.java index 18f2314d2..de34e783e 100644 --- a/100_core/src/gplx/Bry_find_.java +++ b/100_core/src/gplx/Bry_find_.java @@ -245,6 +245,13 @@ public class Bry_find_ { cur += while_len; } } + public static int Find_fwd_while_in(byte[] src, int cur, int end, boolean[] while_ary) { + while (cur < end) { + if (cur == end || !while_ary[src[cur]]) return cur; + cur++; + } + return end; + } public static int Find_fwd_until(byte[] src, int cur, int end, byte until_byte) { while (true) { if ( cur == end diff --git a/100_core/src/gplx/Bry_find__tst.java b/100_core/src/gplx/Bry_find__tst.java index ee58f8dd5..098609553 100644 --- a/100_core/src/gplx/Bry_find__tst.java +++ b/100_core/src/gplx/Bry_find__tst.java @@ -16,7 +16,7 @@ You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ package gplx; -import org.junit.*; +import org.junit.*; import gplx.core.tests.*; public class Bry_find__tst { private Bry_find__fxt fxt = new Bry_find__fxt(); @Test public void Find_fwd() { @@ -59,6 +59,10 @@ public class Bry_find__tst { fxt.Test_Trim_bwd_space_tab("" , 0); fxt.Test_Trim_bwd_space_tab(" \t" , 0); } + @Test public void Find_fwd_while_in() { + boolean[] while_ary = fxt.Init__find_fwd_while_in(Byte_ascii.Space, Byte_ascii.Tab, Byte_ascii.Nl); + fxt.Test__find_fwd_while_in(" \t\na", while_ary, 3); + } } class Bry_find__fxt { public void Test_Find_fwd(String src, String lkp, int bgn, int expd) {Tfds.Eq(expd, Bry_find_.Find_fwd(Bry_.new_u8(src), Bry_.new_u8(lkp), bgn));} @@ -74,4 +78,15 @@ class Bry_find__fxt { int actl = Bry_find_.Trim_fwd_space_tab(raw_bry, 0, raw_bry.length); Tfds.Eq(expd, actl, raw_str); } + public boolean[] Init__find_fwd_while_in(byte... ary) { + boolean[] rv = new boolean[256]; + int len = ary.length; + for (int i = 0; i < len; i++) + rv[ary[i]] = true; + return rv; + } + public void Test__find_fwd_while_in(String src, boolean[] ary, int expd) { + byte[] src_bry = Bry_.new_u8(src); + Gftest.Eq__int(expd, Bry_find_.Find_fwd_while_in(src_bry, 0, src_bry.length, ary)); + } } diff --git a/100_core/src/gplx/Bry_split_.java b/100_core/src/gplx/Bry_split_.java index 096dcfa13..f4744a7ec 100644 --- a/100_core/src/gplx/Bry_split_.java +++ b/100_core/src/gplx/Bry_split_.java @@ -48,7 +48,7 @@ public class Bry_split_ { boolean reset = true; if (itm_bgn == -1) { if (pos_is_last) {} // skip dlm at bgn / end; EX: "a," - else {wkr.Split(src, itm_bgn, itm_end);} // else, process "empty" dlm; EX: ",a" + else {wkr.Split(src, pos, pos );} // else, process "empty" dlm; EX: ",a" } else { int rv = wkr.Split(src, itm_bgn, itm_end); diff --git a/100_core/src/gplx/Bry_split__tst.java b/100_core/src/gplx/Bry_split__tst.java index a77bb678d..2c38ffe32 100644 --- a/100_core/src/gplx/Bry_split__tst.java +++ b/100_core/src/gplx/Bry_split__tst.java @@ -43,6 +43,9 @@ public class Bry_split__tst { fxt.Test_split("a|b|c|d" , 2, 6, "|", "b", "c"); fxt.Test_split("a|b|c|d" , 2, 4, "|", "b"); } + @Test public void Empty() { + fxt.Test_split("a\n\nb" , Byte_ascii.Nl, Bool_.N, "a", "", "b"); + } @Test public void Split_w_max() { fxt.Test__split_w_max("a|b|c|d" , Byte_ascii.Pipe, 2, "a", "b"); // max is less fxt.Test__split_w_max("a" , Byte_ascii.Pipe, 2, "a", null); // max is more diff --git a/100_core/src/gplx/core/btries/Btrie_slim_mgr.java b/100_core/src/gplx/core/btries/Btrie_slim_mgr.java index f6a57241f..02ebf8766 100644 --- a/100_core/src/gplx/core/btries/Btrie_slim_mgr.java +++ b/100_core/src/gplx/core/btries/Btrie_slim_mgr.java @@ -109,6 +109,14 @@ public class Btrie_slim_mgr implements Btrie_mgr { Add_obj(Bry_.new_u8(ary[i]), bval); return this; } + public Btrie_slim_mgr Add_many_str(String... ary) { + int len = ary.length; + for (int i = 0; i < len; i++) { + byte[] itm = Bry_.new_u8(ary[i]); + Add_obj(itm, itm); + } + return this; + } public Btrie_slim_mgr Add_many_int(int val, String... ary) {return Add_many_int(val, Bry_.Ary(ary));} public Btrie_slim_mgr Add_many_int(int val, byte[]... ary) { int len = ary.length; diff --git a/100_core/src/gplx/core/encoders/Hex_utl_.java b/100_core/src/gplx/core/encoders/Hex_utl_.java index 1a9a0232d..cd99ddab5 100644 --- a/100_core/src/gplx/core/encoders/Hex_utl_.java +++ b/100_core/src/gplx/core/encoders/Hex_utl_.java @@ -87,11 +87,31 @@ public class Hex_utl_ { public static void Write(byte[] bry, int bgn, int end, int val) { for (int i = end - 1; i > bgn - 1; i--) { int b = val % 16; - bry[i] = To_byte(b); + bry[i] = To_byte_ucase(b); val /= 16; if (val == 0) break; } } + public static void Write_bfr(Bry_bfr bfr, boolean lcase, int val) { + // count bytes + int val_len = 0; + int tmp = val; + while (true) { + tmp /= 16; + val_len++; + if (tmp == 0) break; + } + + // fill bytes from right to left + int hex_bgn = bfr.Len(); + bfr.Add_byte_repeat(Byte_ascii.Null, val_len); + byte[] bry = bfr.Bfr(); + for (int i = 0; i < val_len; i++) { + int b = val % 16; + bry[hex_bgn + val_len - i - 1] = lcase ? To_byte_lcase(b) : To_byte_ucase(b); + val /= 16; + } + } public static boolean Is_hex_many(byte... ary) { for (byte itm : ary) { switch (itm) { @@ -123,7 +143,7 @@ public class Hex_utl_ { default: throw Err_.new_parse("hexstring", Int_.To_str(val)); } } - private static byte To_byte(int v) { + private static byte To_byte_ucase(int v) { switch (v) { case 0: return Byte_ascii.Num_0; case 1: return Byte_ascii.Num_1; case 2: return Byte_ascii.Num_2; case 3: return Byte_ascii.Num_3; case 4: return Byte_ascii.Num_4; case 5: return Byte_ascii.Num_5; case 6: return Byte_ascii.Num_6; case 7: return Byte_ascii.Num_7; case 8: return Byte_ascii.Num_8; case 9: return Byte_ascii.Num_9; diff --git a/100_core/src/gplx/core/encoders/Hex_utl__tst.java b/100_core/src/gplx/core/encoders/Hex_utl__tst.java index 9579d71da..fac2cd5f3 100644 --- a/100_core/src/gplx/core/encoders/Hex_utl__tst.java +++ b/100_core/src/gplx/core/encoders/Hex_utl__tst.java @@ -16,7 +16,7 @@ You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ package gplx.core.encoders; import gplx.*; import gplx.core.*; -import org.junit.*; +import org.junit.*; import gplx.core.tests.*; public class Hex_utl__tst { private final Hex_utl__fxt fxt = new Hex_utl__fxt(); @Test public void To_int() { @@ -46,6 +46,15 @@ public class Hex_utl__tst { fxt.Test__write("[00000000]", 1, 9, 15, "[0000000F]"); fxt.Test__write("[00000000]", 1, 9, 255, "[000000FF]"); } + @Test public void Write_bfr() { + fxt.Test__write_bfr(Bool_.Y, 0, "0"); + fxt.Test__write_bfr(Bool_.Y, 15, "f"); + fxt.Test__write_bfr(Bool_.Y, 16, "10"); + fxt.Test__write_bfr(Bool_.Y, 32, "20"); + fxt.Test__write_bfr(Bool_.Y, 255, "ff"); + fxt.Test__write_bfr(Bool_.Y, 256, "100"); + fxt.Test__write_bfr(Bool_.Y, Int_.Max_value, "7fffffff"); + } } class Hex_utl__fxt { public void Test__write(String s, int bgn, int end, int val, String expd) { @@ -63,6 +72,11 @@ class Hex_utl__fxt { String actl = Hex_utl_.To_str(val, pad); Tfds.Eq(expd, actl); } + private final Bry_bfr bfr = Bry_bfr_.New(); + public void Test__write_bfr(boolean lcase, int val, String expd) { + Hex_utl_.Write_bfr(bfr, lcase, val); + Gftest.Eq__str(expd, bfr.To_str_and_clear()); + } // public void Test__encode_bry(int val, int pad, String expd) { // String actl = Hex_utl_.To_str(val, pad); // Tfds.Eq(expd, actl); diff --git a/400_xowa/src/gplx/langs/phps/utls/Php_preg_.java b/400_xowa/src/gplx/langs/phps/utls/Php_preg_.java index d2a952d9c..97ccebe41 100644 --- a/400_xowa/src/gplx/langs/phps/utls/Php_preg_.java +++ b/400_xowa/src/gplx/langs/phps/utls/Php_preg_.java @@ -16,6 +16,7 @@ You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ package gplx.langs.phps.utls; import gplx.*; import gplx.langs.*; import gplx.langs.phps.*; +import gplx.core.btries.*; import gplx.core.primitives.*; public class Php_preg_ { public static byte[][] Split(Int_list list, byte[] src, int src_bgn, int src_end, byte[] dlm, boolean extend) { @@ -27,7 +28,7 @@ public class Php_preg_ { while (true) { if (i == src_end) break; int dlm_end = i + dlm_len; - if (dlm_end < src_end && Bry_.Eq(src, i, dlm_end, dlm)) { + if (dlm_end <= src_end && Bry_.Eq(src, i, dlm_end, dlm)) { if (extend) { dlm_end = Bry_find_.Find_fwd_while(src, i, src_end, dlm_nth); } @@ -42,13 +43,33 @@ public class Php_preg_ { // create brys int rv_len = list.Len() - 1; - if (rv_len == 1) return null; + if (rv_len == 1) { + list.Clear(); + return null; + } + if (list.Get_at(list.Len() - 2) == src_end) { // if 2nd to last elem == src_end, then last item is Bry_.Empty; ignore it; EX: "a''" -> "a", "''" x> "a", "''", "" + rv_len--; + } byte[][] rv = new byte[rv_len][]; for (i = 0; i < rv_len; i += 2) { rv[i ] = Bry_.Mid(src, list.Get_at(i + 0), list.Get_at(i + 1)); if (i + 1 == rv_len) break; rv[i + 1] = Bry_.Mid(src, list.Get_at(i + 1), list.Get_at(i + 2)); } + list.Clear(); return rv; } + public static Object Match(Btrie_slim_mgr trie, Btrie_rv trv, byte[] src, int src_bgn, int src_end) { + int cur = src_bgn; + while (cur < src_end) { + byte b = src[cur]; + Object o = trie.Match_at_w_b0(trv, b, src, cur, src_end); + if (o == null) + cur += gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(b); + else { + return o; + } + } + return null; + } } diff --git a/400_xowa/src/gplx/langs/phps/utls/Php_preg___tst.java b/400_xowa/src/gplx/langs/phps/utls/Php_preg___tst.java index ea65075cd..dd47a1dc5 100644 --- a/400_xowa/src/gplx/langs/phps/utls/Php_preg___tst.java +++ b/400_xowa/src/gplx/langs/phps/utls/Php_preg___tst.java @@ -21,11 +21,12 @@ public class Php_preg___tst { private final Php_preg___fxt fxt = new Php_preg___fxt(); @Test public void Basic() {fxt.Test__split("a''b''c" , "''", Bool_.Y, "a", "''", "b", "''", "c");} @Test public void Extend() {fxt.Test__split("a'''b'''c" , "''", Bool_.Y, "a", "'''", "b", "'''", "c");} + @Test public void Eos() {fxt.Test__split("a''" , "''", Bool_.Y, "a", "''");} } class Php_preg___fxt { + private final gplx.core.primitives.Int_list rv = new gplx.core.primitives.Int_list(); public void Test__split(String src, String dlm, boolean extend, String... expd) {Test__split(src, 0, String_.Len(src), dlm, extend, expd);} public void Test__split(String src, int src_bgn, int src_end, String dlm, boolean extend, String... expd) { - gplx.core.primitives.Int_list rv = new gplx.core.primitives.Int_list(); byte[][] actl = Php_preg_.Split(rv, Bry_.new_u8(src), src_bgn, src_end, Bry_.new_u8(dlm), extend); Gftest.Eq__ary(expd, String_.Ary(actl), "find_failed"); } diff --git a/400_xowa/src/gplx/langs/phps/utls/Php_str_.java b/400_xowa/src/gplx/langs/phps/utls/Php_str_.java index 52898cc51..63de16d2a 100644 --- a/400_xowa/src/gplx/langs/phps/utls/Php_str_.java +++ b/400_xowa/src/gplx/langs/phps/utls/Php_str_.java @@ -16,7 +16,11 @@ You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ package gplx.langs.phps.utls; import gplx.*; import gplx.langs.*; import gplx.langs.phps.*; +import gplx.core.btries.*; public class Php_str_ { + public static int Strpos(byte[] src, byte find, int bgn, int end) { + return Bry_find_.Find_fwd(src, find, bgn, end); + } public static byte[] Substr(byte[] src, int bgn) {return Substr(src, bgn, src.length);} public static byte[] Substr(byte[] src, int bgn, int len) { int src_len = src.length; @@ -29,11 +33,23 @@ public class Php_str_ { public static byte Substr_byte(byte[] src, int bgn) {return Substr_byte(src, bgn, src.length);} public static byte Substr_byte(byte[] src, int bgn, int len) { int src_len = src.length; + if (src_len == 0) return Byte_ascii.Null; if (bgn < 0) bgn = src_len + bgn; // handle negative if (bgn < 0) bgn = 0; // handle out of bounds; EX: ("a", -1, -1) int end = len < 0 ? src_len + len : bgn + len; if (end > src.length) end = src.length;; // handle out of bounds; return src[bgn]; + } + public static int Strspn_fwd__ary(byte[] src, boolean[] find, int bgn, int max, int src_len) { + if (max == -1) max = src_len; + int rv = 0; + for (int i = bgn; i < src_len; i++) { + if (find[src[i]] && rv < max) + rv++; + else + break; + } + return rv; } public static int Strspn_fwd__byte(byte[] src, byte find, int bgn, int max, int src_len) { if (max == -1) max = src_len; @@ -91,4 +107,31 @@ public class Php_str_ { } return rv; } + public static byte[] Strtr(byte[] src, Btrie_slim_mgr trie, Bry_bfr tmp, Btrie_rv trv) { + boolean dirty = false; + int src_bgn = 0; + int src_end = src.length; + int i = src_bgn; + + while (true) { + if (i == src_end) break; + byte b = src[i]; + Object o = trie.Match_at_w_b0(trv, b, src, i, src_end); + if (o == null) { + if (dirty) { + tmp.Add_byte(b); + } + i++; + } + else { + if (!dirty) { + dirty = true; + tmp.Add_mid(src, 0, i); + } + tmp.Add((byte[])o); + i = trv.Pos(); + } + } + return dirty ? tmp.To_bry_and_clear() : src; + } } diff --git a/400_xowa/src/gplx/langs/phps/utls/Php_str___tst.java b/400_xowa/src/gplx/langs/phps/utls/Php_str___tst.java index 87048591b..53b978f0b 100644 --- a/400_xowa/src/gplx/langs/phps/utls/Php_str___tst.java +++ b/400_xowa/src/gplx/langs/phps/utls/Php_str___tst.java @@ -16,7 +16,7 @@ You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ package gplx.langs.phps.utls; import gplx.*; import gplx.langs.*; import gplx.langs.phps.*; -import org.junit.*; import gplx.core.tests.*; +import org.junit.*; import gplx.core.tests.*; import gplx.core.btries.*; public class Php_str___tst { private final Php_str___fxt fxt = new Php_str___fxt(); @Test public void Strspn_fwd__byte() { @@ -43,6 +43,14 @@ public class Php_str___tst { fxt.Test__substr("abcde" , -1, "e"); fxt.Test__substr("abcde" , -3, -1, "cd"); } + @Test public void Strtr() { + fxt.Init__strtr_by_trie("01", "89", "02", "79"); + fxt.Test__strtr_by_trie("abc" , "abc"); // found=none + fxt.Test__strtr_by_trie("ab_01_cd" , "ab_89_cd"); // found=one + fxt.Test__strtr_by_trie("ab_01_cd_02_ef", "ab_89_cd_79_ef"); // found=many + fxt.Test__strtr_by_trie("01_ab" , "89_ab"); // BOS + fxt.Test__strtr_by_trie("ab_01" , "ab_89"); // EOS + } } class Php_str___fxt { public void Test__strspn_fwd__byte(String src_str, byte find, int bgn, int max, int expd) { @@ -63,4 +71,17 @@ class Php_str___fxt { public void Test__substr(String src_str, int bgn, int len, String expd) { Gftest.Eq__str(expd, Php_str_.Substr(Bry_.new_u8(src_str), bgn, len)); } + private Btrie_slim_mgr strtr_trie; + public void Init__strtr_by_trie(String... kvs) { + if (strtr_trie == null) strtr_trie = Btrie_slim_mgr.cs(); + int len = kvs.length; + for (int i = 0; i < len; i += 2) { + strtr_trie.Add_str_str(kvs[i], kvs[i + 1]); + } + } + public void Test__strtr_by_trie(String src, String expd) { + Bry_bfr tmp = Bry_bfr_.New(); + Btrie_rv trv = new Btrie_rv(); + Gftest.Eq__str(expd, Php_str_.Strtr(Bry_.new_u8(src), strtr_trie, tmp, trv)); + } } diff --git a/400_xowa/src/gplx/xowa/Xoa_ttl.java b/400_xowa/src/gplx/xowa/Xoa_ttl.java index 12512cd2e..f906c501b 100644 --- a/400_xowa/src/gplx/xowa/Xoa_ttl.java +++ b/400_xowa/src/gplx/xowa/Xoa_ttl.java @@ -132,6 +132,67 @@ public class Xoa_ttl { // PAGE:en.w:http://en.wikipedia.org/wiki/Help:Link; REF. return Bry_.Mid(full_txt, page_bgn, ques_pos == Bry_find_.Not_found ? full_txt_len : ques_pos); } + public byte[] Get_prefixed_text() {return Full_txt_wo_qarg();} + public byte[] Get_prefixed_db_key() {return Full_db();} + public boolean Has_fragment() {return anch_bgn != -1;} + public byte[] Get_fragment() {return Anch_txt();} + public byte[] Get_link_url(byte[] query, boolean query2, boolean proto) { + // if ( $this->isExternal() || $proto !== false ) { + // $ret = $this->getFullURL( $query, $query2, $proto ); + // } + // else if ( $this->getPrefixedText() === '' && $this->hasFragment() ) { + // $ret = $this->getFragmentForURL(); + // } + // else { + // $ret = $this->getLocalURL( $query, $query2 ) . $this->getFragmentForURL(); + // } + return Bry_.Add(gplx.xowa.htmls.hrefs.Xoh_href_.Bry__wiki, this.Full_db_w_anch()); + } + public boolean Is_always_known() { +// $isKnown = null; + + /** + * Allows overriding default behavior for determining if a page exists. + * If $isKnown is kept as null, regular checks happen. If it's + * a boolean, this value is returned by the isKnown method. + * + * @since 1.20 + * + * @param Title $title + * @param boolean|null $isKnown + */ +// Hooks::run( 'TitleIsAlwaysKnown', [ $this, &$isKnown ] ); +// +// if ( !is_null( $isKnown ) ) { +// return $isKnown; +// } +// +// if ( $this->isExternal() ) { +// return true; // any interwiki link might be viewable, for all we know +// } +// +// switch ( $this->mNamespace ) { +// case NS_MEDIA: +// case NS_FILE: +// // file exists, possibly in a foreign repo +// return (boolean)wfFindFile( $this ); +// case NS_SPECIAL: +// // valid special page +// return SpecialPageFactory::exists( $this->getDBkey() ); +// case NS_MAIN: +// // selflink, possibly with fragment +// return $this->mDbkeyform == ''; +// case NS_MEDIAWIKI: +// // known system message +// return $this->hasSourceText() !== false; +// default: +// return false; +// } + return false; + } + + public boolean Is_external() {return this.wik_bgn != -1;} + public static final byte Subpage_spr = Byte_ascii.Slash; // EX: A/B/C public static final Xoa_ttl Null = null; diff --git a/400_xowa/src/gplx/xowa/addons/apps/cfgs/mgrs/dflts/Xocfg_dflt_mgr.java b/400_xowa/src/gplx/xowa/addons/apps/cfgs/mgrs/dflts/Xocfg_dflt_mgr.java index 184b591c0..3f05b5fff 100644 --- a/400_xowa/src/gplx/xowa/addons/apps/cfgs/mgrs/dflts/Xocfg_dflt_mgr.java +++ b/400_xowa/src/gplx/xowa/addons/apps/cfgs/mgrs/dflts/Xocfg_dflt_mgr.java @@ -46,3 +46,12 @@ public class Xocfg_dflt_mgr { gfs_mgr.Run_url(url); } } +class Xocfg_dflt_itm__static implements Gfo_invk { + private final String val; + public Xocfg_dflt_itm__static(String val) { + this.val = val; + } + public Object Invk(GfsCtx ctx, int ikey, String k, GfoMsg m) { + return val; + } +} diff --git a/400_xowa/src/gplx/xowa/addons/bldrs/updates/files/Xodel_small_cmd.java b/400_xowa/src/gplx/xowa/addons/bldrs/updates/files/Xodel_small_cmd.java index a527c7d23..33db57231 100644 --- a/400_xowa/src/gplx/xowa/addons/bldrs/updates/files/Xodel_small_cmd.java +++ b/400_xowa/src/gplx/xowa/addons/bldrs/updates/files/Xodel_small_cmd.java @@ -17,6 +17,7 @@ along with this program. If not, see . */ package gplx.xowa.addons.bldrs.updates.files; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.bldrs.*; import gplx.xowa.addons.bldrs.updates.*; import gplx.xowa.bldrs.*; import gplx.xowa.bldrs.wkrs.*; +import gplx.xowa.files.*; public class Xodel_small_cmd extends Xob_cmd__base { public Xodel_small_cmd(Xob_bldr bldr, Xowe_wiki wiki) {super(bldr, wiki);} private final int[] ext_max_ary = Xobldr__fsdb_db__delete_small_files_.New_ext_max_ary(); @@ -31,3 +32,20 @@ public class Xodel_small_cmd extends Xob_cmd__base { public static final Xob_cmd Prototype = new Xodel_small_cmd(null, null); @Override public Xob_cmd Cmd_clone(Xob_bldr bldr, Xowe_wiki wiki) {return new Xodel_small_cmd(bldr, wiki);} } +class Xobldr__fsdb_db__delete_small_files_ { + public static int[] New_ext_max_ary() { + int[] rv = new int[Xof_ext_.Id__max]; + Ext_max_(rv, 35, Xof_ext_.Id_svg); + Ext_max_(rv, 40, Xof_ext_.Id_gif); + Ext_max_(rv, 100, Xof_ext_.Id_png, Xof_ext_.Id_jpg, Xof_ext_.Id_jpeg); + Ext_max_(rv, 500, Xof_ext_.Id_tif, Xof_ext_.Id_tiff); + Ext_max_(rv, 500, Xof_ext_.Id_xcf); + Ext_max_(rv, 1000, Xof_ext_.Id_bmp); + Ext_max_(rv, 700, Xof_ext_.Id_webm); + Ext_max_(rv, 1000, Xof_ext_.Id_ogv); + Ext_max_(rv, 400, Xof_ext_.Id_pdf); + Ext_max_(rv, 700, Xof_ext_.Id_djvu); + return rv; + } + private static void Ext_max_(int[] ary, int max, int... exts) {for (int ext : exts) ary[ext] = max;} +} diff --git a/400_xowa/src/gplx/xowa/addons/bldrs/updates/files/Xodel_small_mgr.java b/400_xowa/src/gplx/xowa/addons/bldrs/updates/files/Xodel_small_mgr.java index c61a57b63..1183a30b3 100644 --- a/400_xowa/src/gplx/xowa/addons/bldrs/updates/files/Xodel_small_mgr.java +++ b/400_xowa/src/gplx/xowa/addons/bldrs/updates/files/Xodel_small_mgr.java @@ -19,6 +19,7 @@ package gplx.xowa.addons.bldrs.updates.files; import gplx.*; import gplx.xowa.*; import gplx.dbs.*; import gplx.xowa.bldrs.*; import gplx.fsdb.*; import gplx.fsdb.meta.*; import gplx.xowa.files.*; +import gplx.xowa.bldrs.wkrs.*; class Xodel_small_mgr { public void Exec(Xowe_wiki wiki, int[] ext_max_ary) { wiki.Init_assert(); @@ -53,20 +54,3 @@ class Xodel_small_mgr { ); } } -class Xobldr__fsdb_db__delete_small_files_ { - public static int[] New_ext_max_ary() { - int[] rv = new int[Xof_ext_.Id__max]; - Ext_max_(rv, 35, Xof_ext_.Id_svg); - Ext_max_(rv, 40, Xof_ext_.Id_gif); - Ext_max_(rv, 100, Xof_ext_.Id_png, Xof_ext_.Id_jpg, Xof_ext_.Id_jpeg); - Ext_max_(rv, 500, Xof_ext_.Id_tif, Xof_ext_.Id_tiff); - Ext_max_(rv, 500, Xof_ext_.Id_xcf); - Ext_max_(rv, 1000, Xof_ext_.Id_bmp); - Ext_max_(rv, 700, Xof_ext_.Id_webm); - Ext_max_(rv, 1000, Xof_ext_.Id_ogv); - Ext_max_(rv, 400, Xof_ext_.Id_pdf); - Ext_max_(rv, 700, Xof_ext_.Id_djvu); - return rv; - } - private static void Ext_max_(int[] ary, int max, int... exts) {for (int ext : exts) ary[ext] = max;} -} diff --git a/400_xowa/src/gplx/xowa/langs/lnki_trails/Xol_lnki_trail_mgr.java b/400_xowa/src/gplx/xowa/langs/lnki_trails/Xol_lnki_trail_mgr.java index 73abcd6b2..4e44c9416 100644 --- a/400_xowa/src/gplx/xowa/langs/lnki_trails/Xol_lnki_trail_mgr.java +++ b/400_xowa/src/gplx/xowa/langs/lnki_trails/Xol_lnki_trail_mgr.java @@ -21,7 +21,7 @@ public class Xol_lnki_trail_mgr implements Gfo_invk { public Xol_lnki_trail_mgr(Xol_lang_itm lang) {} public void Clear() {trie.Clear();} public int Count() {return trie.Count();} - public Btrie_slim_mgr Trie() {return trie;} Btrie_slim_mgr trie = Btrie_slim_mgr.cs(); + public Btrie_slim_mgr Trie() {return trie;} private final Btrie_slim_mgr trie = Btrie_slim_mgr.cs(); public void Add(byte[] v) {trie.Add_obj(v, v);} public void Del(byte[] v) {trie.Del(v);} private void Add(String... ary) { diff --git a/400_xowa/src/gplx/xowa/mws/Xomw_linker.java b/400_xowa/src/gplx/xowa/mws/Xomw_linker.java new file mode 100644 index 000000000..fde27a9fc --- /dev/null +++ b/400_xowa/src/gplx/xowa/mws/Xomw_linker.java @@ -0,0 +1,819 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012 gnosygnu@gmail.com + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as +published by the Free Software Foundation, either version 3 of the +License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ +package gplx.xowa.mws; import gplx.*; import gplx.xowa.*; +import gplx.core.btries.*; +import gplx.xowa.mws.htmls.*; +public class Xomw_linker { + private final Bry_bfr tmp = Bry_bfr_.New(); + private final Linker_rel_splitter splitter = new Linker_rel_splitter(); + private final Xomw_html_utl html_utl = new Xomw_html_utl(); + private byte[] wg_title = null; + private final Btrie_rv trv = new Btrie_rv(); + private final byte[][] split_trail_rv = new byte[2][]; + private Btrie_slim_mgr split_trail_trie; + private static final byte[] Atr__class = Bry_.new_a7("class"), Atr__rel = Bry_.new_a7("rel"), Atr__href = Bry_.new_a7("href"), Rel__nofollow = Bry_.new_a7("nofollow"); + public void Init_by_wiki(Btrie_slim_mgr trie) { + this.split_trail_trie = trie; + } +// /** +// * This function returns an HTML link to the given target. It serves a few +// * purposes: +// * 1) If $target is a Title, the correct URL to link to will be figured +// * out automatically. +// * 2) It automatically adds the usual classes for various types of link +// * targets: "new" for red links, "stub" for short articles, etc. +// * 3) It escapes all attribute values safely so there's no risk of XSS. +// * 4) It provides a default tooltip if the target is a Title (the page +// * name of the target). +// * link() replaces the old functions in the makeLink() family. +// * +// * @since 1.18 Method exists since 1.16 as non-static, made static in 1.18. +// * @deprecated since 1.28, use MediaWiki\Linker\LinkRenderer instead +// * +// * @param Title $target Can currently only be a Title, but this may +// * change to support Images, literal URLs, etc. +// * @param String $html The HTML contents of the element, i.e., +// * the link text. This is raw HTML and will not be escaped. If null, +// * defaults to the prefixed text of the Title; or if the Title is just a +// * fragment, the contents of the fragment. +// * @param array $customAttribs A key => value array of extra HTML attributes, +// * such as title and class. (href is ignored.) Classes will be +// * merged with the default classes, while other attributes will replace +// * default attributes. All passed attribute values will be HTML-escaped. +// * A false attribute value means to suppress that attribute. +// * @param array $query The query String to append to the URL +// * you're linking to, in key => value array form. Query keys and values +// * will be URL-encoded. +// * @param String|array $options String or array of strings: +// * 'known': Page is known to exist, so don't check if it does. +// * 'broken': Page is known not to exist, so don't check if it does. +// * 'noclasses': Don't add any classes automatically (includes "new", +// * "stub", "mw-redirect", "extiw"). Only use the class attribute +// * provided, if any, so you get a simple blue link with no funny i- +// * cons. +// * 'forcearticlepath': Use the article path always, even with a querystring. +// * Has compatibility issues on some setups, so avoid wherever possible. +// * 'http': Force a full URL with http:// as the scheme. +// * 'https': Force a full URL with https:// as the scheme. +// * 'stubThreshold' => (int): Stub threshold to use when determining link classes. +// * @return String HTML attribute +// */ +// public static function link( +// $target, $html = null, $customAttribs = [], $query = [], $options = [] +// ) { +// if ( !$target instanceof Title ) { +// wfWarn( __METHOD__ . ': Requires $target to be a Title Object.', 2 ); +// return "$html"; +// } +// +// if ( is_string( $query ) ) { +// // some functions withing core using this still hand over query strings +// wfDeprecated( __METHOD__ . ' with parameter $query as String (should be array)', '1.20' ); +// $query = wfCgiToArray( $query ); +// } +// +// $services = MediaWikiServices::getInstance(); +// $options = (array)$options; +// if ( $options ) { +// // Custom options, create new LinkRenderer +// if ( !isset( $options['stubThreshold'] ) ) { +// $defaultLinkRenderer = $services->getLinkRenderer(); +// $options['stubThreshold'] = $defaultLinkRenderer->getStubThreshold(); +// } +// $linkRenderer = $services->getLinkRendererFactory() +// ->createFromLegacyOptions( $options ); +// } else { +// $linkRenderer = $services->getLinkRenderer(); +// } +// +// if ( $html !== null ) { +// $text = new HtmlArmor( $html ); +// } else { +// $text = $html; // null +// } +// if ( in_array( 'known', $options, true ) ) { +// return $linkRenderer->makeKnownLink( $target, $text, $customAttribs, $query ); +// } elseif ( in_array( 'broken', $options, true ) ) { +// return $linkRenderer->makeBrokenLink( $target, $text, $customAttribs, $query ); +// } elseif ( in_array( 'noclasses', $options, true ) ) { +// return $linkRenderer->makePreloadedLink( $target, $text, '', $customAttribs, $query ); +// } else { +// return $linkRenderer->makeLink( $target, $text, $customAttribs, $query ); +// } +// } + public void Make_self_link_obj(Bry_bfr bfr, Xoa_ttl nt, byte[] html, byte[] query, byte[] trail, byte[] prefix) { + // MW.HOOK:SelfLinkBegin + if (html == Bry_.Empty) { + html = tmp.Add_bry_escape_html(nt.Get_prefixed_text()).To_bry_and_clear(); + } + byte[] inside = Bry_.Empty; + byte[][] split_trail = Split_trail(trail); + inside = split_trail[0]; + trail = split_trail[1]; + bfr.Add_str_a7(""); + bfr.Add_bry_many(prefix, html, inside); + bfr.Add_str_a7(""); + bfr.Add(trail); + } + public void Make_external_link(Bry_bfr bfr, byte[] url, byte[] text, boolean escape, byte[] link_type, Xomwh_atr_mgr attribs, byte[] title) { + tmp.Add_str_a7("external"); + if (link_type != null) { + tmp.Add_byte_space().Add(link_type); + } + Xomwh_atr_itm cls_itm = attribs.Get_by_or_make(Atr__class); + if (cls_itm.Val() != null) { + tmp.Add(cls_itm.Val()); + } + cls_itm.Val_(tmp.To_bry_and_clear()); + + if (escape) + text = tmp.Add_bry_escape_html(text).To_bry_and_clear(); + + if (title == null) + title = wg_title; + + byte[] new_rel = Get_external_link_rel(url, title); + Xomwh_atr_itm cur_rel_atr = attribs.Get_by_or_make(Atr__rel); + if (cur_rel_atr.Val() == null) { + cur_rel_atr.Val_(new_rel); + } + else { + // Merge the rel attributes. + byte[] cur_rel = cur_rel_atr.Val(); + Bry_split_.Split(new_rel, 0, new_rel.length, Byte_ascii.Space, Bool_.N, splitter); // $newRels = explode( ' ', $newRel ); + Bry_split_.Split(cur_rel, 0, cur_rel.length, Byte_ascii.Space, Bool_.N, splitter); // $oldRels = explode( ' ', $attribs['rel'] ); + cur_rel_atr.Val_(splitter.To_bry()); // $attribs['rel'] = implode( ' ', $combined ); + } + //$link = ''; + //$success = Hooks::run( 'LinkerMakeExternalLink', + // [ &$url, &$text, &$link, &$attribs, $linktype ] ); + //if ( !$success ) { + // wfDebug( "Hook LinkerMakeExternalLink changed the output of link " + // . "with url {$url} and text {$text} to {$link}\n", true ); + // return $link; + //} + attribs.Set(Atr__href, url); + + html_utl.Raw_element(bfr, Bry_.new_a7("a"), attribs, text); + } + private byte[] Get_external_link_rel(byte[] url, byte[] title) { + // global $wgNoFollowLinks, $wgNoFollowNsExceptions, $wgNoFollowDomainExceptions; + // $ns = $title ? $title->getNamespace() : false; + // if ( $wgNoFollowLinks && !in_array( $ns, $wgNoFollowNsExceptions ) + // && !wfMatchesDomainList( $url, $wgNoFollowDomainExceptions ) + // ) { + return Rel__nofollow; + // } + // return null; + } + public void Normalize_subpage_link(Xomw_linker__normalize_subpage_link rv, Xoa_ttl context_title, byte[] target, byte[] text) { + // Valid link forms: + // Foobar -- normal + // :Foobar -- override special treatment of prefix (images, language links) + // /Foobar -- convert to CurrentPage/Foobar + // /Foobar/ -- convert to CurrentPage/Foobar, strip the initial and final / from text + // ../ -- convert to CurrentPage, from CurrentPage/CurrentSubPage + // ../Foobar -- convert to CurrentPage/Foobar, + // (from CurrentPage/CurrentSubPage) + // ../Foobar/ -- convert to CurrentPage/Foobar, use 'Foobar' as text + // (from CurrentPage/CurrentSubPage) + + byte[] ret = target; // default return value is no change + + // Some namespaces don't allow subpages, + // so only perform processing if subpages are allowed + if (context_title != null && context_title.Ns().Subpages_enabled()) { + int hash = Bry_find_.Find_fwd(target, Byte_ascii.Hash); + byte[] suffix = null; + if (hash != Bry_find_.Not_found) { + suffix = Bry_.Mid(target, hash); + target = Bry_.Mid(target, 0, hash); + } + else { + suffix = Bry_.Empty; + } + // bug 7425 + target = Bry_.Trim(target); + // Look at the first character + if (target != Bry_.Empty && target[0] == Byte_ascii.Slash) { + // / at end means we don't want the slash to be shown + int target_len = target.length; + int trailing_slashes_bgn = Bry_find_.Find_bwd_while(target, target_len, 0, Byte_ascii.Slash) + 1; + byte[] no_slash = null; + if (trailing_slashes_bgn != target_len) { + no_slash = target = Bry_.Mid(target, 1, trailing_slashes_bgn); + } + else { + no_slash = Bry_.Mid(target, 1); + } + + ret = Bry_.Add(context_title.Get_prefixed_text(), Byte_ascii.Slash_bry, Bry_.Trim(no_slash), suffix); + if (text == Bry_.Empty) { + text = Bry_.Add(target, suffix); + } // this might be changed for ugliness reasons + } + else { + // check for .. subpage backlinks + int dot2_count = 0; + byte[] dot2_stripped = target; + while (Bry_.Match(dot2_stripped, 0, 3, Bry__dot2)) { + ++dot2_count; + dot2_stripped = Bry_.Mid(dot2_stripped, 3); + } + if (dot2_count > 0) { + byte[][] exploded = Bry_split_.Split(context_title.Get_prefixed_text(), Byte_ascii.Slash); + int exploded_len = exploded.length; + if (exploded_len > dot2_count) { // not allowed to go below top level page + // PORTED: ret = implode('/', array_slice($exploded, 0, -dot2_count)); + int implode_len = exploded_len - dot2_count; + for (int i = 0; i < implode_len; i++) { + if (i != 0) tmp.Add_byte(Byte_ascii.Slash); + tmp.Add(exploded[i]); + } + // / at the end means don't show full path + if (Bry_.Has_at_end(dot2_stripped, Byte_ascii.Slash)) { + dot2_stripped = Bry_.Mid(dot2_stripped, 0, dot2_stripped.length - 1); + if (text == Bry_.Empty) { + text = Bry_.Add(dot2_stripped, suffix); + } + } + dot2_stripped = Bry_.Trim(dot2_stripped); + if (dot2_stripped != Bry_.Empty) { + tmp.Add_bry_many(Byte_ascii.Slash_bry, dot2_stripped); + } + tmp.Add(suffix); + ret = tmp.To_bry_and_clear(); + } + } + } + } + + rv.Init(ret, text); + } + public byte[][] Split_trail(byte[] trail) { + int cur = 0; + int src_end = trail.length; + while (true) { + Object o = split_trail_trie.Match_at(trv, trail, cur, src_end); + if (o == null) break; + byte[] bry = (byte[])o; + cur += bry.length; + } + + if (cur == 0) { // no trail + split_trail_rv[0] = null; + split_trail_rv[1] = trail; + } + else { + split_trail_rv[0] = Bry_.Mid(trail, 0, cur); + split_trail_rv[1] = Bry_.Mid(trail, cur, src_end); + } + return split_trail_rv; + } + public void Make_image(Bry_bfr bfr, Xoa_ttl title, byte[] options, boolean holders) { + // Check if the options text is of the form "options|alt text" + // Options are: + // * thumbnail make a thumbnail with enlarge-icon and caption, alignment depends on lang + // * left no resizing, just left align. label is used for alt= only + // * right same, but right aligned + // * none same, but not aligned + // * ___px scale to ___ pixels width, no aligning. e.g. use in taxobox + // * center center the image + // * frame Keep original image size, no magnify-button. + // * framed Same as "frame" + // * frameless like 'thumb' but without a frame. Keeps user preferences for width + // * upright reduce width for upright images, rounded to full __0 px + // * border draw a 1px border around the image + // * alt Text for HTML alt attribute (defaults to empty) + // * class Set a class for img node + // * link Set the target of the image link. Can be external, interwiki, or local + // vertical-align values (no % or length right now): + // * baseline + // * sub + // * super + // * top + // * text-top + // * middle + // * bottom + // * text-bottom + + // Protect LanguageConverter markup when splitting into parts +// $parts = StringUtils::delimiterExplode( +// '-{', '}-', '|', $options, true /* allow nesting */ +// ); + + // Give extensions a chance to select the file revision for us +// $options = []; +// $descQuery = false; + // MW.HOOK:BeforeParserFetchFileAndTitle + + // Fetch and register the file (file title may be different via hooks) +// list($file, $title) = $this->fetchFileAndTitle($title, $options); + + // Get parameter map +// $handler = $file ? $file->getHandler() : false; + +// list($paramMap, $mwArray) = $this->getImageParams($handler); + +// if (!$file) { +// $this->addTrackingCategory('broken-file-category'); +// } + + // Process the input parameters +// $caption = ''; +// $params = [ 'frame' => [], 'handler' => [], +// 'horizAlign' => [], 'vertAlign' => [] ]; +// $seenformat = false; +// foreach ($parts as $part) { +// $part = trim($part); +// list($magicName, $value) = $mwArray->matchVariableStartToEnd($part); +// $validated = false; +// if (isset($paramMap[$magicName])) { +// list($type, $paramName) = $paramMap[$magicName]; + + // Special case; width and height come in one variable together +// if ($type === 'handler' && $paramName === 'width') { +// $parsedWidthParam = $this->parseWidthParam($value); +// if (isset($parsedWidthParam['width'])) { +// $width = $parsedWidthParam['width']; +// if ($handler->validateParam('width', $width)) { +// $params[$type]['width'] = $width; +// $validated = true; +// } +// } +// if (isset($parsedWidthParam['height'])) { +// $height = $parsedWidthParam['height']; +// if ($handler->validateParam('height', $height)) { +// $params[$type]['height'] = $height; +// $validated = true; +// } +// } + // else no validation -- T15436 +// } else { +// if ($type === 'handler') { +// // Validate handler parameter +// $validated = $handler->validateParam($paramName, $value); +// } else { +// // Validate @gplx.Internal protected parameters +// switch ($paramName) { +// case 'manualthumb': +// case 'alt': +// case 'class': + // @todo FIXME: Possibly check validity here for + // manualthumb? downstream behavior seems odd with + // missing manual thumbs. +// $validated = true; +// $value = $this->stripAltText($value, $holders); +// break; +// case 'link': +// $chars = self::EXT_LINK_URL_CLASS; +// $addr = self::EXT_LINK_ADDR; +// $prots = $this->mUrlProtocols; +// if ($value === '') { +// $paramName = 'no-link'; +// $value = true; +// $validated = true; +// } elseif (preg_match("/^((?i)$prots)/", $value)) { +// if (preg_match("/^((?i)$prots)$addr$chars*$/u", $value, $m)) { +// $paramName = 'link-url'; +// $this->mOutput->addExternalLink($value); +// if ($this->mOptions->getExternalLinkTarget()) { +// $params[$type]['link-target'] = $this->mOptions->getExternalLinkTarget(); +// } +// $validated = true; +// } +// } else { +// $linkTitle = Title::newFromText($value); +// if ($linkTitle) { +// $paramName = 'link-title'; +// $value = $linkTitle; +// $this->mOutput->addLink($linkTitle); +// $validated = true; +// } +// } +// break; +// case 'frameless': +// case 'framed': +// case 'thumbnail': +// // use first appearing option, discard others. +// $validated = !$seenformat; +// $seenformat = true; +// break; +// default: +// // Most other things appear to be empty or numeric... +// $validated = ($value === false || is_numeric(trim($value))); +// } +// } + +// if ($validated) { +// $params[$type][$paramName] = $value; +// } +// } +// } +// if (!$validated) { +// $caption = $part; +// } +// } + + // Process alignment parameters +// if ($params['horizAlign']) { +// $params['frame']['align'] = key($params['horizAlign']); +// } +// if ($params['vertAlign']) { +// $params['frame']['valign'] = key($params['vertAlign']); +// } + +// $params['frame']['caption'] = $caption; + + // Will the image be presented in a frame, with the caption below? +// $imageIsFramed = isset($params['frame']['frame']) +// || isset($params['frame']['framed']) +// || isset($params['frame']['thumbnail']) +// || isset($params['frame']['manualthumb']); + + // In the old days, [[Image:Foo|text...]] would set alt text. Later it + // came to also set the caption, ordinary text after the image -- which + // makes no sense, because that just repeats the text multiple times in + // screen readers. It *also* came to set the title attribute. + // Now that we have an alt attribute, we should not set the alt text to + // equal the caption: that's worse than useless, it just repeats the + // text. This is the framed/thumbnail case. If there's no caption, we + // use the unnamed parameter for alt text as well, just for the time be- + // ing, if the unnamed param is set and the alt param is not. + // For the future, we need to figure out if we want to tweak this more, + // e.g., introducing a title= parameter for the title; ignoring the un- + // named parameter entirely for images without a caption; adding an ex- + // plicit caption= parameter and preserving the old magic unnamed para- + // meter for BC; ... +// if ($imageIsFramed) { // Framed image +// if ($caption === '' && !isset($params['frame']['alt'])) { +// // No caption or alt text, add the filename as the alt text so +// // that screen readers at least get some description of the image +// $params['frame']['alt'] = $title->getText(); +// } + // Do not set $params['frame']['title'] because tooltips don't make sense + // for framed images +// } else { // Inline image +// if (!isset($params['frame']['alt'])) { +// // No alt text, use the "caption" for the alt text +// if ($caption !== '') { +// $params['frame']['alt'] = $this->stripAltText($caption, $holders); +// } else { +// // No caption, fall back to using the filename for the +// // alt text +// $params['frame']['alt'] = $title->getText(); +// } +// } + // Use the "caption" for the tooltip text +// $params['frame']['title'] = $this->stripAltText($caption, $holders); +// } + + // MW.HOOK:ParserMakeImageParams + + // Linker does the rest +// $time = isset($options['time']) ? $options['time'] : false; +// $ret = Linker::makeImageLink($this, $title, $file, $params['frame'], $params['handler'], +// $time, $descQuery, $this->mOptions->getThumbSize()); + + // Give the handler a chance to modify the parser Object +// if ($handler) { +// $handler->parserTransformHook($this, $file); +// } + +// return $ret; + } +// public function getImageParams($handler) { +// if ($handler) { +// $handlerClass = get_class($handler); +// } +// else { +// $handlerClass = ''; +// } +// if (!isset($this->mImageParams[$handlerClass])) { + // Initialise static lists +// static $internalParamNames = [ +// 'horizAlign' => [ 'left', 'right', 'center', 'none' ], +// 'vertAlign' => [ 'baseline', 'sub', 'super', 'top', 'text-top', 'middle', +// 'bottom', 'text-bottom' ], +// 'frame' => [ 'thumbnail', 'manualthumb', 'framed', 'frameless', +// 'upright', 'border', 'link', 'alt', 'class' ], +// ]; +// static $internalParamMap; +// if (!$internalParamMap) { +// $internalParamMap = []; +// foreach ($internalParamNames as $type => $names) { +// foreach ($names as $name) { +// $magicName = str_replace('-', '_', "img_$name"); +// $internalParamMap[$magicName] = [ $type, $name ]; +// } +// } +// } + + // Add handler params +// $paramMap = $internalParamMap; +// if ($handler) { +// $handlerParamMap = $handler->getParamMap(); +// foreach ($handlerParamMap as $magic => $paramName) { +// $paramMap[$magic] = [ 'handler', $paramName ]; +// } +// } +// $this->mImageParams[$handlerClass] = $paramMap; +// $this->mImageParamsMagicArray[$handlerClass] = new MagicWordArray(array_keys($paramMap)); +// } +// return [ $this->mImageParams[$handlerClass], $this->mImageParamsMagicArray[$handlerClass] ]; +// } +// /** +// * Make HTML for a thumbnail including image, border and caption +// * @param Title $title +// * @param File|boolean $file File Object or false if it doesn't exist +// * @param String $label +// * @param String $alt +// * @param String $align +// * @param array $params +// * @param boolean $framed +// * @param String $manualthumb +// * @return String +// */ +// public static function makeThumbLinkObj( Title $title, $file, $label = '', $alt, +// $align = 'right', $params = [], $framed = false, $manualthumb = "" +// ) { +// $frameParams = [ +// 'alt' => $alt, +// 'caption' => $label, +// 'align' => $align +// ]; +// if ( $framed ) { +// $frameParams['framed'] = true; +// } +// if ( $manualthumb ) { +// $frameParams['manualthumb'] = $manualthumb; +// } +// return self::makeThumbLink2( $title, $file, $frameParams, $params ); +// } +// +// /** +// * @param Title $title +// * @param File $file +// * @param array $frameParams +// * @param array $handlerParams +// * @param boolean $time +// * @param String $query +// * @return String +// */ +// public static function makeThumbLink2( Title $title, $file, $frameParams = [], +// $handlerParams = [], $time = false, $query = "" +// ) { +// $exists = $file && $file->exists(); +// +// $page = isset( $handlerParams['page'] ) ? $handlerParams['page'] : false; +// if ( !isset( $frameParams['align'] ) ) { +// $frameParams['align'] = 'right'; +// } +// if ( !isset( $frameParams['alt'] ) ) { +// $frameParams['alt'] = ''; +// } +// if ( !isset( $frameParams['title'] ) ) { +// $frameParams['title'] = ''; +// } +// if ( !isset( $frameParams['caption'] ) ) { +// $frameParams['caption'] = ''; +// } +// +// if ( empty( $handlerParams['width'] ) ) { +// // Reduce width for upright images when parameter 'upright' is used +// $handlerParams['width'] = isset( $frameParams['upright'] ) ? 130 : 180; +// } +// $thumb = false; +// $noscale = false; +// $manualthumb = false; +// +// if ( !$exists ) { +// $outerWidth = $handlerParams['width'] + 2; +// } else { +// if ( isset( $frameParams['manualthumb'] ) ) { +// # Use manually specified thumbnail +// $manual_title = Title::makeTitleSafe( NS_FILE, $frameParams['manualthumb'] ); +// if ( $manual_title ) { +// $manual_img = wfFindFile( $manual_title ); +// if ( $manual_img ) { +// $thumb = $manual_img->getUnscaledThumb( $handlerParams ); +// $manualthumb = true; +// } else { +// $exists = false; +// } +// } +// } elseif ( isset( $frameParams['framed'] ) ) { +// // Use image dimensions, don't scale +// $thumb = $file->getUnscaledThumb( $handlerParams ); +// $noscale = true; +// } else { +// # Do not present an image bigger than the source, for bitmap-style images +// # This is a hack to maintain compatibility with arbitrary pre-1.10 behavior +// $srcWidth = $file->getWidth( $page ); +// if ( $srcWidth && !$file->mustRender() && $handlerParams['width'] > $srcWidth ) { +// $handlerParams['width'] = $srcWidth; +// } +// $thumb = $file->transform( $handlerParams ); +// } +// +// if ( $thumb ) { +// $outerWidth = $thumb->getWidth() + 2; +// } else { +// $outerWidth = $handlerParams['width'] + 2; +// } +// } +// +// # ThumbnailImage::toHtml() already adds page= onto the end of DjVu URLs +// # So we don't need to pass it here in $query. However, the URL for the +// # zoom icon still needs it, so we make a unique query for it. See bug 14771 +// $url = $title->getLocalURL( $query ); +// if ( $page ) { +// $url = wfAppendQuery( $url, [ 'page' => $page ] ); +// } +// if ( $manualthumb +// && !isset( $frameParams['link-title'] ) +// && !isset( $frameParams['link-url'] ) +// && !isset( $frameParams['no-link'] ) ) { +// $frameParams['link-url'] = $url; +// } +// +// $s = "
" +// . "
"; +// +// if ( !$exists ) { +// $s .= self::makeBrokenImageLinkObj( $title, $frameParams['title'], '', '', '', $time == true ); +// $zoomIcon = ''; +// } elseif ( !$thumb ) { +// $s .= wfMessage( 'thumbnail_error', '' )->escaped(); +// $zoomIcon = ''; +// } else { +// if ( !$noscale && !$manualthumb ) { +// self::processResponsiveImages( $file, $thumb, $handlerParams ); +// } +// $params = [ +// 'alt' => $frameParams['alt'], +// 'title' => $frameParams['title'], +// 'img-class' => ( isset( $frameParams['class'] ) && $frameParams['class'] !== '' +// ? $frameParams['class'] . ' ' +// : '' ) . 'thumbimage' +// ]; +// $params = self::getImageLinkMTOParams( $frameParams, $query ) + $params; +// $s .= $thumb->toHtml( $params ); +// if ( isset( $frameParams['framed'] ) ) { +// $zoomIcon = ""; +// } else { +// $zoomIcon = Html::rawElement( 'div', [ 'class' => 'magnify' ], +// Html::rawElement( 'a', [ +// 'href' => $url, +// 'class' => '@gplx.Internal protected', +// 'title' => wfMessage( 'thumbnail-more' )->text() ], +// "" ) ); +// } +// } +// $s .= '
' . $zoomIcon . $frameParams['caption'] . "
"; +// return str_replace( "\n", ' ', $s ); +// } +// /** +// * Make a "broken" link to an image +// * +// * @since 1.16.3 +// * @param Title $title +// * @param String $label Link label (plain text) +// * @param String $query Query String +// * @param String $unused1 Unused parameter kept for b/c +// * @param String $unused2 Unused parameter kept for b/c +// * @param boolean $time A file of a certain timestamp was requested +// * @return String +// */ +// public static function makeBrokenImageLinkObj( $title, $label = '', +// $query = '', $unused1 = '', $unused2 = '', $time = false +// ) { +// if ( !$title instanceof Title ) { +// wfWarn( __METHOD__ . ': Requires $title to be a Title Object.' ); +// return "" . htmlspecialchars( $label ); +// } +// +// global $wgEnableUploads, $wgUploadMissingFileUrl, $wgUploadNavigationUrl; +// if ( $label == '' ) { +// $label = $title->getPrefixedText(); +// } +// $encLabel = htmlspecialchars( $label ); +// $currentExists = $time ? ( wfFindFile( $title ) != false ) : false; +// +// if ( ( $wgUploadMissingFileUrl || $wgUploadNavigationUrl || $wgEnableUploads ) +// && !$currentExists +// ) { +// $redir = RepoGroup::singleton()->getLocalRepo()->checkRedirect( $title ); +// +// if ( $redir ) { +// // We already know it's a redirect, so mark it +// // accordingly +// return self::link( +// $title, +// $encLabel, +// [ 'class' => 'mw-redirect' ], +// wfCgiToArray( $query ), +// [ 'known', 'noclasses' ] +// ); +// } +// +// $href = self::getUploadUrl( $title, $query ); +// +// return '
' . +// $encLabel . ''; +// } +// +// return self::link( $title, $encLabel, [], wfCgiToArray( $query ), [ 'known', 'noclasses' ] ); +// } +// /** +// * Create a direct link to a given uploaded file. +// * +// * @since 1.16.3 +// * @param Title $title +// * @param String $html Pre-sanitized HTML +// * @param String $time MW timestamp of file creation time +// * @return String HTML +// */ +// public static function makeMediaLinkObj( $title, $html = '', $time = false ) { +// $img = wfFindFile( $title, [ 'time' => $time ] ); +// return self::makeMediaLinkFile( $title, $img, $html ); +// } +// +// /** +// * Create a direct link to a given uploaded file. +// * This will make a broken link if $file is false. +// * +// * @since 1.16.3 +// * @param Title $title +// * @param File|boolean $file File Object or false +// * @param String $html Pre-sanitized HTML +// * @return String HTML +// * +// * @todo Handle invalid or missing images better. +// */ +// public static function makeMediaLinkFile( Title $title, $file, $html = '' ) { +// if ( $file && $file->exists() ) { +// $url = $file->getUrl(); +// $class = '@gplx.Internal protected'; +// } else { +// $url = self::getUploadUrl( $title ); +// $class = 'new'; +// } +// +// $alt = $title->getText(); +// if ( $html == '' ) { +// $html = $alt; +// } +// +// $ret = ''; +// $attribs = [ +// 'href' => $url, +// 'class' => $class, +// 'title' => $alt +// ]; +// +// if ( !Hooks::run( 'LinkerMakeMediaLinkFile', +// [ $title, $file, &$html, &$attribs, &$ret ] ) ) { +// wfDebug( "Hook LinkerMakeMediaLinkFile changed the output of link " +// . "with url {$url} and text {$html} to {$ret}\n", true ); +// return $ret; +// } +// +// return Html::rawElement( 'a', $attribs, $html ); +// } + private static final byte[] Bry__dot2 = Bry_.new_a7("../"); +} +class Linker_rel_splitter implements gplx.core.brys.Bry_split_wkr { + private final Hash_adp_bry hash = Hash_adp_bry.cs(); + private final Bry_bfr bfr = Bry_bfr_.New(); + public int Split(byte[] src, int itm_bgn, int itm_end) { // $combined = array_unique( array_merge( $newRels, $oldRels ) ); + byte[] val = (byte[])hash.Get_by_mid(src, itm_bgn, itm_end); + if (val == null) { + val = Bry_.Mid(src, itm_bgn, itm_end); + hash.Add_as_key_and_val(val); + if (bfr.Len_gt_0()) bfr.Add_byte_space(); + bfr.Add(val); + } + return Bry_split_.Rv__ok; + } + public byte[] To_bry() { + hash.Clear(); + return bfr.To_bry_and_clear(); + } +} diff --git a/400_xowa/src/gplx/xowa/mws/Xomw_linker__normalize_subpage_link.java b/400_xowa/src/gplx/xowa/mws/Xomw_linker__normalize_subpage_link.java new file mode 100644 index 000000000..66d2b27cd --- /dev/null +++ b/400_xowa/src/gplx/xowa/mws/Xomw_linker__normalize_subpage_link.java @@ -0,0 +1,27 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012 gnosygnu@gmail.com + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as +published by the Free Software Foundation, either version 3 of the +License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ +package gplx.xowa.mws; import gplx.*; import gplx.xowa.*; +public class Xomw_linker__normalize_subpage_link { + public byte[] link; + public byte[] text; + public Xomw_linker__normalize_subpage_link Init(byte[] link, byte[] text) { + this.link = link; + this.text = text; + return this; + } +} diff --git a/400_xowa/src/gplx/xowa/mws/Xomw_linker__normalize_subpage_link__tst.java b/400_xowa/src/gplx/xowa/mws/Xomw_linker__normalize_subpage_link__tst.java new file mode 100644 index 000000000..52242692f --- /dev/null +++ b/400_xowa/src/gplx/xowa/mws/Xomw_linker__normalize_subpage_link__tst.java @@ -0,0 +1,43 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012 gnosygnu@gmail.com + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as +published by the Free Software Foundation, either version 3 of the +License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ +package gplx.xowa.mws; import gplx.*; import gplx.xowa.*; +import org.junit.*; import gplx.core.tests.*; +public class Xomw_linker__normalize_subpage_link__tst { + private final Xomw_linker__normalize_subpage_link__fxt fxt = new Xomw_linker__normalize_subpage_link__fxt(); + @Test public void None() {fxt.Test__normalize_subpage_link("A/B/C" , "Z" , "" , "Z" , "");} + @Test public void Hash() {fxt.Test__normalize_subpage_link("A/B/C" , "/Y#Z" , "" , "A/B/C/Y#Z" , "/Y#Z");} + @Test public void Slash__basic() {fxt.Test__normalize_subpage_link("A/B/C" , "/Z" , "" , "A/B/C/Z" , "/Z");} + @Test public void Slash__slash() {fxt.Test__normalize_subpage_link("A/B/C" , "/Z/" , "" , "A/B/C/Z" , "Z");} + @Test public void Dot2__empty() {fxt.Test__normalize_subpage_link("A/B/C" , "../" , "" , "A/B" , "");} + @Test public void Dot2__many() {fxt.Test__normalize_subpage_link("A/B/C" , "../../Z" , "z1" , "A/Z" , "z1");} + @Test public void Dot2__trailing() {fxt.Test__normalize_subpage_link("A/B/C" , "../../Z/" , "" , "A/Z" , "Z");} +} +class Xomw_linker__normalize_subpage_link__fxt { + private final Xomw_linker mgr = new Xomw_linker(); + private final Xowe_wiki wiki; + private final Xomw_linker__normalize_subpage_link normalize_subpage_link = new Xomw_linker__normalize_subpage_link(); + public Xomw_linker__normalize_subpage_link__fxt() { + Xoae_app app = Xoa_app_fxt.Make__app__edit(); + this.wiki = Xoa_app_fxt.Make__wiki__edit(app); + } + public void Test__normalize_subpage_link(String page_title_str, String link, String text, String expd_link, String expd_text) { + mgr.Normalize_subpage_link(normalize_subpage_link, wiki.Ttl_parse(Bry_.new_u8(page_title_str)), Bry_.new_u8(link), Bry_.new_u8(text)); + Gftest.Eq__str(expd_link, String_.new_u8(normalize_subpage_link.link)); + Gftest.Eq__str(expd_text, String_.new_u8(normalize_subpage_link.text)); + } +} diff --git a/400_xowa/src/gplx/xowa/mws/Xomw_linker__split_trail__tst.java b/400_xowa/src/gplx/xowa/mws/Xomw_linker__split_trail__tst.java new file mode 100644 index 000000000..a9fb0f647 --- /dev/null +++ b/400_xowa/src/gplx/xowa/mws/Xomw_linker__split_trail__tst.java @@ -0,0 +1,39 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012 gnosygnu@gmail.com + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as +published by the Free Software Foundation, either version 3 of the +License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ +package gplx.xowa.mws; import gplx.*; import gplx.xowa.*; +import org.junit.*; import gplx.core.tests.*; import gplx.core.btries.*; +public class Xomw_linker__split_trail__tst { + private final Xomw_linker__split_trail__fxt fxt = new Xomw_linker__split_trail__fxt(); + @Test public void Basic() {fxt.Test__split_trail("abc def" , "abc" , " def");} + @Test public void None() {fxt.Test__split_trail(" abc" , null , " abc");} +} +class Xomw_linker__split_trail__fxt { + private final Xomw_linker linker = new Xomw_linker(); + private final Btrie_slim_mgr trie = Btrie_slim_mgr.cs(); + public Xomw_linker__split_trail__fxt() { + String[] ary = new String[] {"a", "b", "c", "d", "e", "f"}; + for (String itm : ary) + trie.Add_str_str(itm, itm); + linker.Init_by_wiki(trie); + } + public void Test__split_trail(String trail_str, String expd_inside, String expd_trail) { + byte[][] split_trail = linker.Split_trail(Bry_.new_u8(trail_str)); + Gftest.Eq__str(expd_inside, String_.new_u8(split_trail[0])); + Gftest.Eq__str(expd_trail , String_.new_u8(split_trail[1])); + } +} diff --git a/400_xowa/src/gplx/xowa/mws/Xomw_sanitizer.java b/400_xowa/src/gplx/xowa/mws/Xomw_sanitizer.java new file mode 100644 index 000000000..6fe7a43cf --- /dev/null +++ b/400_xowa/src/gplx/xowa/mws/Xomw_sanitizer.java @@ -0,0 +1,538 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012 gnosygnu@gmail.com + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as +published by the Free Software Foundation, either version 3 of the +License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ +package gplx.xowa.mws; import gplx.*; import gplx.xowa.*; +import gplx.core.encoders.*; import gplx.langs.htmls.entitys.*; +import gplx.xowa.parsers.htmls.*; +import gplx.xowa.mws.parsers.*; +public class Xomw_sanitizer { + private final Mwh_doc_wkr__atr_bldr atr_bldr = new Mwh_doc_wkr__atr_bldr(); + private final Mwh_atr_parser atr_parser = new Mwh_atr_parser(); + public void Fix_tag_attributes(Bry_bfr bfr, byte[] tag_name, byte[] atrs) { + atr_bldr.Atrs__clear(); + atr_parser.Parse(atr_bldr, -1, -1, atrs, 0, atrs.length); + int len = atr_bldr.Atrs__len(); + + // PORTED: Sanitizer.php|safeEncodeTagAttributes + for (int i = 0; i < len; i++) { + // $encAttribute = htmlspecialchars( $attribute ); + // $encValue = Sanitizer::safeEncodeAttribute( $value ); + // $attribs[] = "$encAttribute=\"$encValue\""; + Mwh_atr_itm itm = atr_bldr.Atrs__get_at(i); + bfr.Add_byte_space(); // "return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';" + bfr.Add_bry_escape_html(itm.Key_bry(), itm.Key_bgn(), itm.Key_end()); + bfr.Add_byte_eq().Add_byte_quote(); + bfr.Add(itm.Val_as_bry()); // TODO.XO:Sanitizer::encode + bfr.Add_byte_quote(); + } + } + public void Normalize_char_references(Xomw_parser_bfr pbfr) { + // XO.PBFR + Bry_bfr src_bfr = pbfr.Src(); + byte[] src = src_bfr.Bfr(); + int src_bgn = 0; + int src_end = src_bfr.Len(); + Bry_bfr bfr = pbfr.Trg(); + pbfr.Switch(); + + Normalize_char_references(bfr, Bool_.N, src, src_bgn, src_end); + } + public byte[] Normalize_char_references(Bry_bfr bfr, boolean lone_bfr, byte[] src, int src_bgn, int src_end) { + // assert static structs + if (Normalize__dec == null) { + synchronized (Xomw_sanitizer.class) { + html_entities = Html_entities_new(); + Normalize__dec = Bool_ary_bldr.New_u8().Set_rng(Byte_ascii.Num_0, Byte_ascii.Num_9).To_ary(); + Normalize__hex = Bool_ary_bldr.New_u8() + .Set_rng(Byte_ascii.Num_0, Byte_ascii.Num_9) + .Set_rng(Byte_ascii.Ltr_A, Byte_ascii.Ltr_Z) + .Set_rng(Byte_ascii.Ltr_a, Byte_ascii.Ltr_z) + .To_ary(); + Normalize__ent = Bool_ary_bldr.New_u8() + .Set_rng(Byte_ascii.Num_0, Byte_ascii.Num_9) + .Set_rng(Byte_ascii.Ltr_A, Byte_ascii.Ltr_Z) + .Set_rng(Byte_ascii.Ltr_a, Byte_ascii.Ltr_z) + .Set_rng(128, 255) + .To_ary(); + } + } + + // XO.BRY_BFR + boolean dirty = false; + int cur = src_bgn; + boolean called_by_bry = bfr == null; + + while (true) { + // search for "&" + int find_bgn = Bry_find_.Find_fwd(src, Byte_ascii.Amp, cur); + if (find_bgn == Bry_find_.Not_found) { // "&" not found; exit + if (dirty) + bfr.Add_mid(src, cur, src_end); + break; + } + int ent_bgn = find_bgn + 1; // +1 to skip & + + // get regex; (a) dec ( ); (b) hex (ÿ); (c) entity (α); + boolean[] regex = null; + // check for #; + if (ent_bgn < src_end && src[ent_bgn] == Byte_ascii.Hash) { + ent_bgn++; + if (ent_bgn < src_end) { + byte nxt = src[ent_bgn]; + // check for x + if (nxt == Byte_ascii.Ltr_X || nxt == Byte_ascii.Ltr_x) { + ent_bgn++; + regex = Normalize__hex; + } + } + if (regex == null) + regex = Normalize__dec; + } + else { + regex = Normalize__ent; + } + + // keep looping until invalid regex + int ent_end = ent_bgn; + byte b = Byte_ascii.Null; + for (int i = ent_bgn; i < src_end; i++) { + b = src[i]; + if (regex[b]) + ent_end++; + else + break; + } + + // mark dirty; can optimize later by checking if "<" already exists + dirty = true; + if (bfr == null) bfr = Bry_bfr_.New(); + bfr.Add_mid(src, cur, find_bgn); // add everything before & + + // invalid <- regex ended, but not at semic + if (b != Byte_ascii.Semic) { + bfr.Add(Gfh_entity_.Amp_bry); // transform "&" to "&" + cur = find_bgn + 1; // position after "&" + continue; + } + + // do normalization + byte[] name = Bry_.Mid(src, ent_bgn, ent_end); + boolean ret = false; + if (regex == Normalize__ent) { + Normalize_entity(bfr, name); + ret = true; + } + else if (regex == Normalize__dec) { + ret = Dec_char_reference(bfr, name); + } + else if (regex == Normalize__hex) { + ret = Hex_char_reference(bfr, name); + } + if (!ret) { + bfr.Add(Gfh_entity_.Amp_bry); // transform "&" to "&" + bfr.Add_bry_escape_html(src, find_bgn + 1, ent_end + 1); // "find_bgn + 1" to start after "&"; "ent_end + 1" to include ";" + } + + cur = ent_end + 1; // +1 to position after ";" + } + + // XO.BRY_BFR + if (dirty) { + if (called_by_bry) + return bfr.To_bry_and_clear(); + else + return Bry_.Empty; + } + else { + if (called_by_bry) { + if (src_bgn == 0 && src_end == src.length) + return src; + else + return Bry_.Mid(src, src_bgn, src_end); + } + else { + if (lone_bfr) + bfr.Add_mid(src, src_bgn, src_end); + return null; + } + } + } + + // If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, + // return the equivalent numeric entity reference (except for the core < + // > & "). If the entity is a MediaWiki-specific alias, returns + // the HTML equivalent. Otherwise, returns HTML-escaped text of + // pseudo-entity source (eg &foo;) + private void Normalize_entity(Bry_bfr bfr, byte[] name) { + Object o = html_entities.Get_by_bry(name); + if (o == null) { + bfr.Add_str_a7("&").Add(name).Add_byte_semic(); + } + else { + Xomw_html_ent entity = (Xomw_html_ent)o; + bfr.Add(entity.html); + } + } + + private boolean Dec_char_reference(Bry_bfr bfr, byte[] codepoint) { + int point = Bry_.To_int_or(codepoint, -1); + if (Validate_codepoint(point)) { + bfr.Add_str_a7("&#").Add_int_variable(point).Add_byte_semic(); + return true; + } + return false; + } + + private boolean Hex_char_reference(Bry_bfr bfr, byte[] codepoint) { + int point = Hex_utl_.Parse_or(codepoint, -1); + if (Validate_codepoint(point)) { + bfr.Add_str_a7("&#x"); + Hex_utl_.Write_bfr(bfr, Bool_.Y, point); // sprintf( '&#x%x;', $point ) + bfr.Add_byte_semic(); + return true; + } + return false; + } + + private boolean Validate_codepoint(int codepoint) { + // U+000C is valid in HTML5 but not allowed in XML. + // U+000D is valid in XML but not allowed in HTML5. + // U+007F - U+009F are disallowed in HTML5 (control characters). + return codepoint == 0x09 + || codepoint == 0x0a + || (codepoint >= 0x20 && codepoint <= 0x7e) + || (codepoint >= 0xa0 && codepoint <= 0xd7ff) + || (codepoint >= 0xe000 && codepoint <= 0xfffd) + || (codepoint >= 0x10000 && codepoint <= 0x10ffff); + } + + private static boolean[] Normalize__dec, Normalize__hex, Normalize__ent; + private static Hash_adp_bry html_entities; + private static Hash_adp_bry Html_entities_new() { + Bry_bfr tmp = Bry_bfr_.New(); + Hash_adp_bry rv = Hash_adp_bry.cs(); + + Html_entities_set(rv, Xomw_html_ent.Type__alias, -1, "רלמ", "‏"); + Html_entities_set(rv, Xomw_html_ent.Type__alias, -1, "رلم", "‏"); + + Html_entities_set(rv, Xomw_html_ent.Type__char, 60, "lt", "<"); + Html_entities_set(rv, Xomw_html_ent.Type__char, 62, "gt", ">"); + Html_entities_set(rv, Xomw_html_ent.Type__char, 38, "amp", "&"); + Html_entities_set(rv, Xomw_html_ent.Type__char, 34, "quot", """); + + // List of all named character entities defined in HTML 4.01 + // https://www.w3.org/TR/html4/sgml/entities.html + // As well as ' which is only defined starting in XHTML1. + Html_entities_set(rv, tmp, "Aacute" , 193); + Html_entities_set(rv, tmp, "aacute" , 225); + Html_entities_set(rv, tmp, "Acirc" , 194); + Html_entities_set(rv, tmp, "acirc" , 226); + Html_entities_set(rv, tmp, "acute" , 180); + Html_entities_set(rv, tmp, "AElig" , 198); + Html_entities_set(rv, tmp, "aelig" , 230); + Html_entities_set(rv, tmp, "Agrave" , 192); + Html_entities_set(rv, tmp, "agrave" , 224); + Html_entities_set(rv, tmp, "alefsym" , 8501); + Html_entities_set(rv, tmp, "Alpha" , 913); + Html_entities_set(rv, tmp, "alpha" , 945); + Html_entities_set(rv, tmp, "amp" , 38); // XO: identical to Type__char entry; note that Type__char should be evaluated first + Html_entities_set(rv, tmp, "and" , 8743); + Html_entities_set(rv, tmp, "ang" , 8736); + Html_entities_set(rv, tmp, "apos" , 39); // New in XHTML & HTML 5; avoid in output for compatibility with IE. + Html_entities_set(rv, tmp, "Aring" , 197); + Html_entities_set(rv, tmp, "aring" , 229); + Html_entities_set(rv, tmp, "asymp" , 8776); + Html_entities_set(rv, tmp, "Atilde" , 195); + Html_entities_set(rv, tmp, "atilde" , 227); + Html_entities_set(rv, tmp, "Auml" , 196); + Html_entities_set(rv, tmp, "auml" , 228); + Html_entities_set(rv, tmp, "bdquo" , 8222); + Html_entities_set(rv, tmp, "Beta" , 914); + Html_entities_set(rv, tmp, "beta" , 946); + Html_entities_set(rv, tmp, "brvbar" , 166); + Html_entities_set(rv, tmp, "bull" , 8226); + Html_entities_set(rv, tmp, "cap" , 8745); + Html_entities_set(rv, tmp, "Ccedil" , 199); + Html_entities_set(rv, tmp, "ccedil" , 231); + Html_entities_set(rv, tmp, "cedil" , 184); + Html_entities_set(rv, tmp, "cent" , 162); + Html_entities_set(rv, tmp, "Chi" , 935); + Html_entities_set(rv, tmp, "chi" , 967); + Html_entities_set(rv, tmp, "circ" , 710); + Html_entities_set(rv, tmp, "clubs" , 9827); + Html_entities_set(rv, tmp, "cong" , 8773); + Html_entities_set(rv, tmp, "copy" , 169); + Html_entities_set(rv, tmp, "crarr" , 8629); + Html_entities_set(rv, tmp, "cup" , 8746); + Html_entities_set(rv, tmp, "curren" , 164); + Html_entities_set(rv, tmp, "dagger" , 8224); + Html_entities_set(rv, tmp, "Dagger" , 8225); + Html_entities_set(rv, tmp, "darr" , 8595); + Html_entities_set(rv, tmp, "dArr" , 8659); + Html_entities_set(rv, tmp, "deg" , 176); + Html_entities_set(rv, tmp, "Delta" , 916); + Html_entities_set(rv, tmp, "delta" , 948); + Html_entities_set(rv, tmp, "diams" , 9830); + Html_entities_set(rv, tmp, "divide" , 247); + Html_entities_set(rv, tmp, "Eacute" , 201); + Html_entities_set(rv, tmp, "eacute" , 233); + Html_entities_set(rv, tmp, "Ecirc" , 202); + Html_entities_set(rv, tmp, "ecirc" , 234); + Html_entities_set(rv, tmp, "Egrave" , 200); + Html_entities_set(rv, tmp, "egrave" , 232); + Html_entities_set(rv, tmp, "empty" , 8709); + Html_entities_set(rv, tmp, "emsp" , 8195); + Html_entities_set(rv, tmp, "ensp" , 8194); + Html_entities_set(rv, tmp, "Epsilon" , 917); + Html_entities_set(rv, tmp, "epsilon" , 949); + Html_entities_set(rv, tmp, "equiv" , 8801); + Html_entities_set(rv, tmp, "Eta" , 919); + Html_entities_set(rv, tmp, "eta" , 951); + Html_entities_set(rv, tmp, "ETH" , 208); + Html_entities_set(rv, tmp, "eth" , 240); + Html_entities_set(rv, tmp, "Euml" , 203); + Html_entities_set(rv, tmp, "euml" , 235); + Html_entities_set(rv, tmp, "euro" , 8364); + Html_entities_set(rv, tmp, "exist" , 8707); + Html_entities_set(rv, tmp, "fnof" , 402); + Html_entities_set(rv, tmp, "forall" , 8704); + Html_entities_set(rv, tmp, "frac12" , 189); + Html_entities_set(rv, tmp, "frac14" , 188); + Html_entities_set(rv, tmp, "frac34" , 190); + Html_entities_set(rv, tmp, "frasl" , 8260); + Html_entities_set(rv, tmp, "Gamma" , 915); + Html_entities_set(rv, tmp, "gamma" , 947); + Html_entities_set(rv, tmp, "ge" , 8805); + Html_entities_set(rv, tmp, "gt" , 62); + Html_entities_set(rv, tmp, "harr" , 8596); + Html_entities_set(rv, tmp, "hArr" , 8660); + Html_entities_set(rv, tmp, "hearts" , 9829); + Html_entities_set(rv, tmp, "hellip" , 8230); + Html_entities_set(rv, tmp, "Iacute" , 205); + Html_entities_set(rv, tmp, "iacute" , 237); + Html_entities_set(rv, tmp, "Icirc" , 206); + Html_entities_set(rv, tmp, "icirc" , 238); + Html_entities_set(rv, tmp, "iexcl" , 161); + Html_entities_set(rv, tmp, "Igrave" , 204); + Html_entities_set(rv, tmp, "igrave" , 236); + Html_entities_set(rv, tmp, "image" , 8465); + Html_entities_set(rv, tmp, "infin" , 8734); + Html_entities_set(rv, tmp, "int" , 8747); + Html_entities_set(rv, tmp, "Iota" , 921); + Html_entities_set(rv, tmp, "iota" , 953); + Html_entities_set(rv, tmp, "iquest" , 191); + Html_entities_set(rv, tmp, "isin" , 8712); + Html_entities_set(rv, tmp, "Iuml" , 207); + Html_entities_set(rv, tmp, "iuml" , 239); + Html_entities_set(rv, tmp, "Kappa" , 922); + Html_entities_set(rv, tmp, "kappa" , 954); + Html_entities_set(rv, tmp, "Lambda" , 923); + Html_entities_set(rv, tmp, "lambda" , 955); + Html_entities_set(rv, tmp, "lang" , 9001); + Html_entities_set(rv, tmp, "laquo" , 171); + Html_entities_set(rv, tmp, "larr" , 8592); + Html_entities_set(rv, tmp, "lArr" , 8656); + Html_entities_set(rv, tmp, "lceil" , 8968); + Html_entities_set(rv, tmp, "ldquo" , 8220); + Html_entities_set(rv, tmp, "le" , 8804); + Html_entities_set(rv, tmp, "lfloor" , 8970); + Html_entities_set(rv, tmp, "lowast" , 8727); + Html_entities_set(rv, tmp, "loz" , 9674); + Html_entities_set(rv, tmp, "lrm" , 8206); + Html_entities_set(rv, tmp, "lsaquo" , 8249); + Html_entities_set(rv, tmp, "lsquo" , 8216); + Html_entities_set(rv, tmp, "lt" , 60); + Html_entities_set(rv, tmp, "macr" , 175); + Html_entities_set(rv, tmp, "mdash" , 8212); + Html_entities_set(rv, tmp, "micro" , 181); + Html_entities_set(rv, tmp, "middot" , 183); + Html_entities_set(rv, tmp, "minus" , 8722); + Html_entities_set(rv, tmp, "Mu" , 924); + Html_entities_set(rv, tmp, "mu" , 956); + Html_entities_set(rv, tmp, "nabla" , 8711); + Html_entities_set(rv, tmp, "nbsp" , 160); + Html_entities_set(rv, tmp, "ndash" , 8211); + Html_entities_set(rv, tmp, "ne" , 8800); + Html_entities_set(rv, tmp, "ni" , 8715); + Html_entities_set(rv, tmp, "not" , 172); + Html_entities_set(rv, tmp, "notin" , 8713); + Html_entities_set(rv, tmp, "nsub" , 8836); + Html_entities_set(rv, tmp, "Ntilde" , 209); + Html_entities_set(rv, tmp, "ntilde" , 241); + Html_entities_set(rv, tmp, "Nu" , 925); + Html_entities_set(rv, tmp, "nu" , 957); + Html_entities_set(rv, tmp, "Oacute" , 211); + Html_entities_set(rv, tmp, "oacute" , 243); + Html_entities_set(rv, tmp, "Ocirc" , 212); + Html_entities_set(rv, tmp, "ocirc" , 244); + Html_entities_set(rv, tmp, "OElig" , 338); + Html_entities_set(rv, tmp, "oelig" , 339); + Html_entities_set(rv, tmp, "Ograve" , 210); + Html_entities_set(rv, tmp, "ograve" , 242); + Html_entities_set(rv, tmp, "oline" , 8254); + Html_entities_set(rv, tmp, "Omega" , 937); + Html_entities_set(rv, tmp, "omega" , 969); + Html_entities_set(rv, tmp, "Omicron" , 927); + Html_entities_set(rv, tmp, "omicron" , 959); + Html_entities_set(rv, tmp, "oplus" , 8853); + Html_entities_set(rv, tmp, "or" , 8744); + Html_entities_set(rv, tmp, "ordf" , 170); + Html_entities_set(rv, tmp, "ordm" , 186); + Html_entities_set(rv, tmp, "Oslash" , 216); + Html_entities_set(rv, tmp, "oslash" , 248); + Html_entities_set(rv, tmp, "Otilde" , 213); + Html_entities_set(rv, tmp, "otilde" , 245); + Html_entities_set(rv, tmp, "otimes" , 8855); + Html_entities_set(rv, tmp, "Ouml" , 214); + Html_entities_set(rv, tmp, "ouml" , 246); + Html_entities_set(rv, tmp, "para" , 182); + Html_entities_set(rv, tmp, "part" , 8706); + Html_entities_set(rv, tmp, "permil" , 8240); + Html_entities_set(rv, tmp, "perp" , 8869); + Html_entities_set(rv, tmp, "Phi" , 934); + Html_entities_set(rv, tmp, "phi" , 966); + Html_entities_set(rv, tmp, "Pi" , 928); + Html_entities_set(rv, tmp, "pi" , 960); + Html_entities_set(rv, tmp, "piv" , 982); + Html_entities_set(rv, tmp, "plusmn" , 177); + Html_entities_set(rv, tmp, "pound" , 163); + Html_entities_set(rv, tmp, "prime" , 8242); + Html_entities_set(rv, tmp, "Prime" , 8243); + Html_entities_set(rv, tmp, "prod" , 8719); + Html_entities_set(rv, tmp, "prop" , 8733); + Html_entities_set(rv, tmp, "Psi" , 936); + Html_entities_set(rv, tmp, "psi" , 968); + Html_entities_set(rv, tmp, "quot" , 34); + Html_entities_set(rv, tmp, "radic" , 8730); + Html_entities_set(rv, tmp, "rang" , 9002); + Html_entities_set(rv, tmp, "raquo" , 187); + Html_entities_set(rv, tmp, "rarr" , 8594); + Html_entities_set(rv, tmp, "rArr" , 8658); + Html_entities_set(rv, tmp, "rceil" , 8969); + Html_entities_set(rv, tmp, "rdquo" , 8221); + Html_entities_set(rv, tmp, "real" , 8476); + Html_entities_set(rv, tmp, "reg" , 174); + Html_entities_set(rv, tmp, "rfloor" , 8971); + Html_entities_set(rv, tmp, "Rho" , 929); + Html_entities_set(rv, tmp, "rho" , 961); + Html_entities_set(rv, tmp, "rlm" , 8207); + Html_entities_set(rv, tmp, "rsaquo" , 8250); + Html_entities_set(rv, tmp, "rsquo" , 8217); + Html_entities_set(rv, tmp, "sbquo" , 8218); + Html_entities_set(rv, tmp, "Scaron" , 352); + Html_entities_set(rv, tmp, "scaron" , 353); + Html_entities_set(rv, tmp, "sdot" , 8901); + Html_entities_set(rv, tmp, "sect" , 167); + Html_entities_set(rv, tmp, "shy" , 173); + Html_entities_set(rv, tmp, "Sigma" , 931); + Html_entities_set(rv, tmp, "sigma" , 963); + Html_entities_set(rv, tmp, "sigmaf" , 962); + Html_entities_set(rv, tmp, "sim" , 8764); + Html_entities_set(rv, tmp, "spades" , 9824); + Html_entities_set(rv, tmp, "sub" , 8834); + Html_entities_set(rv, tmp, "sube" , 8838); + Html_entities_set(rv, tmp, "sum" , 8721); + Html_entities_set(rv, tmp, "sup" , 8835); + Html_entities_set(rv, tmp, "sup1" , 185); + Html_entities_set(rv, tmp, "sup2" , 178); + Html_entities_set(rv, tmp, "sup3" , 179); + Html_entities_set(rv, tmp, "supe" , 8839); + Html_entities_set(rv, tmp, "szlig" , 223); + Html_entities_set(rv, tmp, "Tau" , 932); + Html_entities_set(rv, tmp, "tau" , 964); + Html_entities_set(rv, tmp, "there4" , 8756); + Html_entities_set(rv, tmp, "Theta" , 920); + Html_entities_set(rv, tmp, "theta" , 952); + Html_entities_set(rv, tmp, "thetasym" , 977); + Html_entities_set(rv, tmp, "thinsp" , 8201); + Html_entities_set(rv, tmp, "THORN" , 222); + Html_entities_set(rv, tmp, "thorn" , 254); + Html_entities_set(rv, tmp, "tilde" , 732); + Html_entities_set(rv, tmp, "times" , 215); + Html_entities_set(rv, tmp, "trade" , 8482); + Html_entities_set(rv, tmp, "Uacute" , 218); + Html_entities_set(rv, tmp, "uacute" , 250); + Html_entities_set(rv, tmp, "uarr" , 8593); + Html_entities_set(rv, tmp, "uArr" , 8657); + Html_entities_set(rv, tmp, "Ucirc" , 219); + Html_entities_set(rv, tmp, "ucirc" , 251); + Html_entities_set(rv, tmp, "Ugrave" , 217); + Html_entities_set(rv, tmp, "ugrave" , 249); + Html_entities_set(rv, tmp, "uml" , 168); + Html_entities_set(rv, tmp, "upsih" , 978); + Html_entities_set(rv, tmp, "Upsilon" , 933); + Html_entities_set(rv, tmp, "upsilon" , 965); + Html_entities_set(rv, tmp, "Uuml" , 220); + Html_entities_set(rv, tmp, "uuml" , 252); + Html_entities_set(rv, tmp, "weierp" , 8472); + Html_entities_set(rv, tmp, "Xi" , 926); + Html_entities_set(rv, tmp, "xi" , 958); + Html_entities_set(rv, tmp, "Yacute" , 221); + Html_entities_set(rv, tmp, "yacute" , 253); + Html_entities_set(rv, tmp, "yen" , 165); + Html_entities_set(rv, tmp, "Yuml" , 376); + Html_entities_set(rv, tmp, "yuml" , 255); + Html_entities_set(rv, tmp, "Zeta" , 918); + Html_entities_set(rv, tmp, "zeta" , 950); + Html_entities_set(rv, tmp, "zwj" , 8205); + Html_entities_set(rv, tmp, "zwnj" , 8204); + return rv; + } + private static void Html_entities_set(Hash_adp_bry rv, Bry_bfr tmp, String name_str, int code) { + byte[] html_bry = tmp.Add_str_a7("&#").Add_int_variable(code).Add_byte_semic().To_bry_and_clear(); + Html_entities_set(rv, Xomw_html_ent.Type__entity, code, name_str, html_bry); + } + private static void Html_entities_set(Hash_adp_bry rv, byte type, int code, String name_str, String html_str) {Html_entities_set(rv, type, code, name_str, Bry_.new_u8(html_str));} + private static void Html_entities_set(Hash_adp_bry rv, byte type, int code, String name_str, byte[] html_bry) { + byte[] name_bry = Bry_.new_u8(name_str); + rv.Add_if_dupe_use_1st(name_bry, new Xomw_html_ent(type, code, name_bry, html_bry)); // Add_dupe needed b/c "lt" and co. are added early; ignore subsequent call + } +} +class Xomw_html_ent { + public Xomw_html_ent(byte type, int code, byte[] name, byte[] html) { + this.type = type; + this.code = code; + this.name = name; + this.html = html; + } + public final byte type; + public final int code; + public final byte[] name; + public final byte[] html; + public static final byte Type__null = 0, Type__alias = 1, Type__char = 2, Type__entity = 3; +} +class Bool_ary_bldr { + private final boolean[] ary; + public Bool_ary_bldr(int len) { + this.ary = new boolean[len]; + } + public Bool_ary_bldr Set_many(int... v) { + int len = v.length; + for (int i = 0; i < len; i++) + ary[v[i]] = true; + return this; + } + public Bool_ary_bldr Set_rng(int bgn, int end) { + for (int i = bgn; i <= end; i++) + ary[i] = true; + return this; + } + public boolean[] To_ary() { + return ary; + } + public static Bool_ary_bldr New_u8() {return new Bool_ary_bldr(256);} +} diff --git a/400_xowa/src/gplx/xowa/mws/Xomw_sanitizer__tst.java b/400_xowa/src/gplx/xowa/mws/Xomw_sanitizer__tst.java new file mode 100644 index 000000000..dfd711538 --- /dev/null +++ b/400_xowa/src/gplx/xowa/mws/Xomw_sanitizer__tst.java @@ -0,0 +1,44 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012 gnosygnu@gmail.com + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as +published by the Free Software Foundation, either version 3 of the +License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ +package gplx.xowa.mws; import gplx.*; import gplx.xowa.*; +import org.junit.*; import gplx.core.tests.*; import gplx.core.btries.*; +public class Xomw_sanitizer__tst { + private final Xomw_sanitizer__fxt fxt = new Xomw_sanitizer__fxt(); + @Test public void Text() {fxt.Test__normalize_char_references("abc" , "abc");} + @Test public void Dec() {fxt.Test__normalize_char_references("" , "&#08;");} + @Test public void Dec__invalid() {fxt.Test__normalize_char_references(" " , " ");} + @Test public void Hex() {fxt.Test__normalize_char_references("ÿ" , "ÿ");} + @Test public void Entity() {fxt.Test__normalize_char_references("α" , "α");} + @Test public void Entity__lt() {fxt.Test__normalize_char_references("<" , "<");} + @Test public void Invalid() {fxt.Test__normalize_char_references("&(invalid);" , "&(invalid);");} + @Test public void Many() { + fxt.Test__normalize_char_references + ( "a b α c ÿ d &(invalid); e" + , "a b α c ÿ d &(invalid); e" + ); + } +} +class Xomw_sanitizer__fxt { + private final Xomw_sanitizer sanitizer = new Xomw_sanitizer(); + private final Bry_bfr tmp = Bry_bfr_.New(); + public void Test__normalize_char_references(String src_str, String expd) { + byte[] src_bry = Bry_.new_u8(src_str); + sanitizer.Normalize_char_references(tmp, Bool_.Y, src_bry, 0, src_bry.length); + Gftest.Eq__str(expd, tmp.To_str_and_clear()); + } +} diff --git a/400_xowa/src/gplx/xowa/mws/htmls/Xomw_html_elem.java b/400_xowa/src/gplx/xowa/mws/htmls/Xomw_html_elem.java new file mode 100644 index 000000000..5beee99d1 --- /dev/null +++ b/400_xowa/src/gplx/xowa/mws/htmls/Xomw_html_elem.java @@ -0,0 +1,26 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012 gnosygnu@gmail.com + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as +published by the Free Software Foundation, either version 3 of the +License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ +package gplx.xowa.mws.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; +public class Xomw_html_elem { + public Xomw_html_elem(byte[] name) { + this.name = name; + } + public byte[] Name() {return name;} private final byte[] name; // EX: "a", "div", "img" + +// private static final Hash_adp_bry void_elements = Hash_adp_bry.cs().Add_many_str("area", "super", "br", "col", "embed", "hr", "img", "input", "keygen", "link", "meta", "param", "source", "track", "wbr"); +} diff --git a/400_xowa/src/gplx/xowa/mws/htmls/Xomw_html_utl.java b/400_xowa/src/gplx/xowa/mws/htmls/Xomw_html_utl.java new file mode 100644 index 000000000..be42394f4 --- /dev/null +++ b/400_xowa/src/gplx/xowa/mws/htmls/Xomw_html_utl.java @@ -0,0 +1,267 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012 gnosygnu@gmail.com + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as +published by the Free Software Foundation, either version 3 of the +License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ +package gplx.xowa.mws.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; +import gplx.core.btries.*; +import gplx.langs.phps.utls.*; +public class Xomw_html_utl { + private final Bry_bfr tmp = Bry_bfr_.New(); + private final Btrie_rv trv = new Btrie_rv(); + public void Raw_element(Bry_bfr bfr, byte[] element, Xomwh_atr_mgr attribs, byte[] contents) { + Bry_.Lcase__all(element); // XO:lcase element + + Open_element__lcased(bfr, element, attribs); + if (void_elements.Has(element)) { + bfr.Del_by_1().Add(Bry__elem__lhs__inl); + } + else { + bfr.Add(contents); + Close_element__lcased(bfr, element); + } + } + private void Open_element__lcased(Bry_bfr bfr, byte[] element, Xomwh_atr_mgr attribs) { + // This is not required in HTML5, but let's do it anyway, for + // consistency and better compression. + // $element = strtolower($element); // XO:handled by callers + + // Remove invalid input types + if (Bry_.Eq(element, Tag__input)) { + // PORTED.HEADER:valid_input_types + byte[] type_atr_val = attribs.Get_val_or_null(Atr__type); + if (type_atr_val != null && !valid_input_types.Has(type_atr_val)) { + attribs.Del(Atr__type); + } + } + + // According to standard the default type for