diff --git a/100_core/src/gplx/Array_.java b/100_core/src/gplx/Array_.java index 3efe7d815..cb0771c15 100644 --- a/100_core/src/gplx/Array_.java +++ b/100_core/src/gplx/Array_.java @@ -54,6 +54,15 @@ public class Array_ { Copy_to(src, 0, trg, 0, copy_len); return trg; } + public static Object Extract_by_pos(Object src, int src_bgn) { + return Extract_by_pos(src, src_bgn, Array.getLength(src)); + } + public static Object Extract_by_pos(Object src, int src_bgn, int src_end) { + int trg_len = src_end - src_bgn; + Object trg = Create(Component_type(src), trg_len); + Copy_to(src, src_bgn, trg, 0, src_end - src_bgn); + return trg; + } public static List_adp To_list(Object ary) { int aryLen = Array_.Len(ary); List_adp rv = List_adp_.New(); diff --git a/100_core/src/gplx/Keyval_.java b/100_core/src/gplx/Keyval_.java index 8fb562139..970fe6409 100644 --- a/100_core/src/gplx/Keyval_.java +++ b/100_core/src/gplx/Keyval_.java @@ -70,6 +70,14 @@ public class Keyval_ { Ary__to_str__nest__ary(bfr, 0, true, ary); return bfr.To_str_and_clear(); } + public static Object[] Ary__to_objary__val(Keyval[] ary) { + int ary_len = ary.length; + Object[] rv = new Object[ary_len]; + for (int i = 0; i < ary_len; i++) { + rv[i] = ary[i].Val(); + } + return rv; + } private static void Ary__to_str__nest__ary(Bry_bfr bfr, int indent, boolean is_kv, Object[] ary) { int len = ary.length; for (int i = 0; i < len; ++i) { diff --git a/100_core/src/gplx/core/tests/Gftest.java b/100_core/src/gplx/core/tests/Gftest.java index 44f583d72..31dc41d1a 100644 --- a/100_core/src/gplx/core/tests/Gftest.java +++ b/100_core/src/gplx/core/tests/Gftest.java @@ -17,6 +17,7 @@ package gplx.core.tests; import gplx.*; import gplx.core.*; import gplx.core.brys.*; public class Gftest { private static final Bry_bfr bfr = Bry_bfr_.New(); + public static void Eq__ary(Object[] expd, Object[] actl, String msg_fmt, Object... msg_args) {Eq__array(Type_ids_.Id__obj, expd, actl, msg_fmt, msg_args);} public static void Eq__ary(boolean[] expd, boolean[] actl, String msg_fmt, Object... msg_args) {Eq__array(Type_ids_.Id__bool, expd, actl, msg_fmt, msg_args);} public static void Eq__ary(int[] expd, int[] actl, String msg_fmt, Object... msg_args) {Eq__array(Type_ids_.Id__int, expd, actl, msg_fmt, msg_args);} public static void Eq__ary(long[] expd, long[] actl, String msg_fmt, Object... msg_args) {Eq__array(Type_ids_.Id__long, expd, actl, msg_fmt, msg_args);} @@ -151,12 +152,14 @@ public class Gftest { } private static void Write__itm(Bry_bfr bfr, int type_id, Object ary, int len, int idx) { if (idx < len) { + Object val = Array_.Get_at(ary, idx); switch (type_id) { - case Type_ids_.Id__bool: bfr.Add_yn(Bool_.Cast(Array_.Get_at(ary, idx))); break; - case Type_ids_.Id__bry: bfr.Add_safe((byte[])Array_.Get_at(ary, idx)); break; - case Type_ids_.Id__long: bfr.Add_long_variable(Long_.cast(Array_.Get_at(ary, idx))); break; - case Type_ids_.Id__int: bfr.Add_int_variable(Int_.Cast(Array_.Get_at(ary, idx))); break; - case Type_ids_.Id__byte: bfr.Add_int_variable((int)(Byte_.Cast(Array_.Get_at(ary, idx)))); break; + case Type_ids_.Id__bool: bfr.Add_yn(Bool_.Cast(val)); break; + case Type_ids_.Id__bry: bfr.Add_safe((byte[])val); break; + case Type_ids_.Id__long: bfr.Add_long_variable(Long_.cast(val)); break; + case Type_ids_.Id__int: bfr.Add_int_variable(Int_.Cast(val)); break; + case Type_ids_.Id__byte: bfr.Add_int_variable((int)(Byte_.Cast(val))); break; + case Type_ids_.Id__obj: bfr.Add_str_u8(Object_.Xto_str_strict_or_null_mark(val)); break; default: throw Err_.new_unhandled_default(type_id); } } @@ -182,6 +185,7 @@ public class Gftest { case Type_ids_.Id__long: eq = Long_.cast(expd_obj) == Long_.cast(actl_obj); break; case Type_ids_.Id__int: eq = Int_.Cast(expd_obj) == Int_.Cast(actl_obj); break; case Type_ids_.Id__byte: eq = Byte_.Cast(expd_obj) == Byte_.Cast(actl_obj); break; + case Type_ids_.Id__obj: eq = Object_.Eq(expd_obj, actl_obj); break; } } if (!eq) { diff --git a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring.java b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring.java index dd7eb67b3..5ecaea81e 100644 --- a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring.java +++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring.java @@ -14,7 +14,7 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt */ package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*; -import gplx.langs.regxs.*; import gplx.core.intls.*; +import gplx.core.intls.*; import gplx.langs.regxs.*; import gplx.xowa.parsers.*; import gplx.xowa.xtns.scribunto.procs.*; public class Scrib_lib_ustring implements Scrib_lib { @@ -118,30 +118,9 @@ public class Scrib_lib_ustring implements Scrib_lib { AddCapturesFromMatch(tmp_list, regx_rslts[0], text, regx_converter.Capt_ary(), true); return rslt.Init_many_list(tmp_list); } - private Scrib_lib_ustring_gsub_mgr[] gsub_mgr_ary = Scrib_lib_ustring_gsub_mgr.Ary_empty; - private int gsub_mgr_max = 0, gsub_mgr_len = -1; - private final Object gsub_mgr_lock = new Object(); public boolean Gsub(Scrib_proc_args args, Scrib_proc_rslt rslt) { - boolean rv = false; - synchronized (gsub_mgr_lock) { // handle recursive gsub calls; PAGE:en.d:כלב; DATE:2016-01-22 - Scrib_regx_converter regx_converter = new Scrib_regx_converter(); - int new_len = gsub_mgr_len + 1; - if (new_len == gsub_mgr_max) { - this.gsub_mgr_max = new_len == 0 ? 2 : new_len * 2; - Scrib_lib_ustring_gsub_mgr[] new_gsub_mgr_ary = new Scrib_lib_ustring_gsub_mgr[gsub_mgr_max]; - Array_.Copy(gsub_mgr_ary, new_gsub_mgr_ary); - gsub_mgr_ary = new_gsub_mgr_ary; - } - Scrib_lib_ustring_gsub_mgr cur = gsub_mgr_ary[new_len]; - if (cur == null) { - cur = new Scrib_lib_ustring_gsub_mgr(core, regx_converter); - gsub_mgr_ary[new_len] = cur; - } - this.gsub_mgr_len = new_len; - rv = cur.Exec(args, rslt); - --gsub_mgr_len; - } - return rv; + Scrib_lib_ustring_gsub_mgr gsub_mgr = new Scrib_lib_ustring_gsub_mgr(core, new Scrib_regx_converter()); + return gsub_mgr.Exec(args, rslt); } public boolean Gmatch_init(Scrib_proc_args args, Scrib_proc_rslt rslt) { // String text = Scrib_kv_utl_.Val_to_str(values, 0); @@ -195,189 +174,3 @@ public class Scrib_lib_ustring implements Scrib_lib { private static final int Base1 = 1 , End_adj = 1; // lua / php uses "end" as <= not <; EX: "abc" and bgn=0, end= 1; for XOWA, this is "a"; for MW / PHP it is "ab" } -class Scrib_lib_ustring_gsub_mgr { - private Scrib_regx_converter regx_converter; - public Scrib_lib_ustring_gsub_mgr(Scrib_core core, Scrib_regx_converter regx_converter) {this.core = core; this.regx_converter = regx_converter;} private Scrib_core core; - private byte tmp_repl_tid = Repl_tid_null; private byte[] tmp_repl_bry = null; - private Hash_adp repl_hash = null; private Scrib_lua_proc repl_func = null; - private int repl_count = 0; - public boolean Exec(Scrib_proc_args args, Scrib_proc_rslt rslt) { - Object text_obj = args.Cast_obj_or_null(0); - String text = String_.as_(text_obj); - if (text == null) text = Object_.Xto_str_strict_or_empty(text_obj); - String regx = args.Xstr_str_or_null(1); // NOTE: @pattern sometimes int; PAGE:en.d:λύω; DATE:2014-09-02 - if (args.Len() == 2) return rslt.Init_obj(text); // if no replace arg, return self; PAGE:en.d:'orse; DATE:2013-10-13 - Object repl_obj = args.Cast_obj_or_null(2); - regx = regx_converter.patternToRegex(Bry_.new_u8(regx), Scrib_regx_converter.Anchor_pow); - int limit = args.Cast_int_or(3, -1); - repl_count = 0; - Identify_repl(repl_obj); - String repl = Exec_repl(tmp_repl_tid, tmp_repl_bry, text, regx, limit); - return rslt.Init_many_objs(repl, repl_count); - } - private void Identify_repl(Object repl_obj) { - Class repl_type = repl_obj.getClass(); - if (Object_.Eq(repl_type, String_.Cls_ref_type)) { - tmp_repl_tid = Repl_tid_string; - tmp_repl_bry = Bry_.new_u8((String)repl_obj); - } - else if (Object_.Eq(repl_type, Int_.Cls_ref_type)) { // NOTE:@replace sometimes int; PAGE:en.d:λύω; DATE:2014-09-02 - tmp_repl_tid = Repl_tid_string; - tmp_repl_bry = Bry_.new_u8(Int_.To_str(Int_.Cast(repl_obj))); - } - else if (Object_.Eq(repl_type, Keyval[].class)) { - tmp_repl_tid = Repl_tid_table; - Keyval[] repl_tbl = (Keyval[])repl_obj; - if (repl_hash == null) - repl_hash = Hash_adp_.New(); - else - repl_hash.Clear(); - int repl_tbl_len = repl_tbl.length; - for (int i = 0; i < repl_tbl_len; i++) { - Keyval repl_itm = repl_tbl[i]; - String repl_itm_val = repl_itm.Val_to_str_or_empty(); - repl_hash.Add(repl_itm.Key(), Bry_.new_u8(repl_itm_val)); - } - } - else if (Object_.Eq(repl_type, Scrib_lua_proc.class)) { - tmp_repl_tid = Repl_tid_luacbk; - repl_func = (Scrib_lua_proc)repl_obj; - } - else if (Object_.Eq(repl_type, Double_.Cls_ref_type)) { // NOTE:@replace sometimes double; PAGE:de.v:Wikivoyage:Wikidata/Test_Modul:Wikidata2; DATE:2016-04-21 - tmp_repl_tid = Repl_tid_string; - tmp_repl_bry = Bry_.new_u8(Double_.To_str(Double_.cast(repl_obj))); - } - else throw Err_.new_unhandled(Type_.Name(repl_type)); - } - private String Exec_repl(byte repl_tid, byte[] repl_bry, String text, String regx, int limit) { - Regx_adp regx_mgr = Scrib_lib_ustring.RegxAdp_new_(core.Ctx(), regx); - Regx_match[] rslts = regx_mgr.Match_all(text, 0); - if ( rslts.length == 0 // PHP: If matches are found, the new subject will be returned, otherwise subject will be returned unchanged.; http://php.net/manual/en/function.preg-replace-callback.php - || regx_mgr.Pattern_is_invalid() // NOTE: invalid patterns should return self; EX:[^]; DATE:2014-09-02 - ) return text; - rslts = regx_converter.Adjust_balanced(rslts); - Bry_bfr tmp_bfr = Bry_bfr_.New(); - int len = rslts.length; - int pos = 0; - for (int i = 0; i < len; i++) { - if (limit > -1 && repl_count == limit) break; - Regx_match rslt = rslts[i]; - tmp_bfr.Add_str_u8(String_.Mid(text, pos, rslt.Find_bgn())); // NOTE: regx returns char pos (not bry); must add as String, not bry; DATE:2013-07-17 - if (!Exec_repl_itm(tmp_bfr, repl_tid, repl_bry, text, rslt)) { // will be false when gsub_proc returns nothing; PAGE:en.d:tracer PAGE:en.d:שלום DATE:2017-04-22; - tmp_bfr.Add_str_u8(String_.Mid(text, rslt.Find_bgn(), rslt.Find_end())); - } - pos = rslt.Find_end(); - ++repl_count; - } - int text_len = String_.Len(text); - if (pos < text_len) - tmp_bfr.Add_str_u8(String_.Mid(text, pos, text_len)); // NOTE: regx returns char pos (not bry); must add as String, not bry; DATE:2013-07-17 - return tmp_bfr.To_str_and_clear(); - } - private boolean Exec_repl_itm(Bry_bfr tmp_bfr, byte repl_tid, byte[] repl_bry, String text, Regx_match match) { - switch (repl_tid) { - case Repl_tid_string: - int len = repl_bry.length; - for (int i = 0; i < len; i++) { - byte b = repl_bry[i]; - switch (b) { - case Byte_ascii.Percent: { - ++i; - if (i == len) // % at end of stream; just add %; - tmp_bfr.Add_byte(b); - else { - b = repl_bry[i]; - switch (b) { - case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4: - case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9: - int idx = b - Byte_ascii.Num_0; - if (idx == 0) // NOTE: 0 means take result; REF.MW:if ($x === '0'); return $m[0]; PAGE:Wikipedia:Wikipedia_Signpost/Templates/Voter/testcases; DATE:2015-08-02 - tmp_bfr.Add_str_u8(String_.Mid(text, match.Find_bgn(), match.Find_end())); - else { // NOTE: > 0 means get from groups if it exists; REF.MW:elseif (isset($m["m$x"])) return $m["m$x"]; PAGE:Wikipedia:Wikipedia_Signpost/Templates/Voter/testcases; DATE:2015-08-02 - idx -= List_adp_.Base1; - if (idx < match.Groups().length) { // retrieve numbered capture; TODO_OLD: support more than 9 captures - Regx_group grp = match.Groups()[idx]; - tmp_bfr.Add_str_u8(String_.Mid(text, grp.Bgn(), grp.End())); // NOTE: grp.Bgn() / .End() is for String pos (bry pos will fail for utf8 strings) - } - else { - tmp_bfr.Add_byte(Byte_ascii.Percent); - tmp_bfr.Add_byte(b); - } - } - break; - case Byte_ascii.Percent: - tmp_bfr.Add_byte(Byte_ascii.Percent); - break; - default: // not a number; add literal - tmp_bfr.Add_byte(Byte_ascii.Percent); - tmp_bfr.Add_byte(b); - break; - } - } - break; - } - default: - tmp_bfr.Add_byte(b); - break; - } - } - break; - case Repl_tid_table: { - int match_bgn = -1, match_end = -1; - Regx_group[] grps = match.Groups(); - if (grps.length == 0) { - match_bgn = match.Find_bgn(); - match_end = match.Find_end(); - } - else { // group exists, take first one (logic matches Scribunto); PAGE:en.w:Bannered_routes_of_U.S._Route_60; DATE:2014-08-15 - Regx_group grp = grps[0]; - match_bgn = grp.Bgn(); - match_end = grp.End(); - } - String find_str = String_.Mid(text, match_bgn, match_end); // NOTE: rslt.Bgn() / .End() is for String pos (bry pos will fail for utf8 strings) - Object actl_repl_obj = repl_hash.Get_by(find_str); - if (actl_repl_obj == null) // match found, but no replacement specified; EX:"abc", "[ab]", "a:A"; "b" in regex but not in tbl; EX:d:DVD; DATE:2014-03-31 - tmp_bfr.Add_str_u8(find_str); - else - tmp_bfr.Add((byte[])actl_repl_obj); - break; - } - case Repl_tid_luacbk: { - // TOMBSTONE: was causing garbled text on PAGE:en.w:Template:Infobox_kommune DATE:2018-07-02 - /* - String find_str = String_.Mid(text, match.Find_bgn(), match.Find_end()); - Keyval[] luacbk_args = Scrib_kv_utl_.base1_obj_(find_str); - */ - Keyval[] luacbk_args = null; - Regx_group[] grps = match.Groups(); - int grps_len = grps.length; - if (grps_len == 0) { // no match; use original String - String find_str = String_.Mid(text, match.Find_bgn(), match.Find_end()); - luacbk_args = Scrib_kv_utl_.base1_obj_(find_str); - } - else { // match; build ary of matches; (see UStringLibrary.php) - luacbk_args = new Keyval[grps_len]; - for (int i = 0; i < grps_len; i++) { - Regx_group grp = grps[i]; - String find_str = String_.Mid(text, grp.Bgn(), grp.End()); - luacbk_args[i] = Keyval_.int_(i + Scrib_core.Base_1, find_str); - } - } - /* - */ - Keyval[] rslts = core.Interpreter().CallFunction(repl_func.Id(), luacbk_args); - if (rslts.length == 0) // will be 0 when gsub_proc returns nil; PAGE:en.d:tracer; DATE:2017-04-22 - return false; - else { // ArrayIndex check - Object rslt_obj = rslts[0].Val(); // 0th idx has result - tmp_bfr.Add_str_u8(Object_.Xto_str_strict_or_empty(rslt_obj)); // NOTE: always convert to String; rslt_obj can be int; PAGE:en.d:seven DATE:2016-04-27 - } - break; - } - default: throw Err_.new_unhandled(repl_tid); - } - return true; - } - private static final byte Repl_tid_null = 0, Repl_tid_string = 1, Repl_tid_table = 2, Repl_tid_luacbk = 3; - public static final Scrib_lib_ustring_gsub_mgr[] Ary_empty = new Scrib_lib_ustring_gsub_mgr[0]; -} diff --git a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__gsub__tst.java b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__gsub__tst.java index f945bbf44..1b05c9ce0 100644 --- a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__gsub__tst.java +++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__gsub__tst.java @@ -14,7 +14,8 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt */ package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*; -import org.junit.*; import gplx.langs.regxs.*; import gplx.xowa.xtns.scribunto.engines.mocks.*; +import org.junit.*; import gplx.core.tests.*; +import gplx.langs.regxs.*; import gplx.xowa.xtns.scribunto.engines.mocks.*; public class Scrib_lib_ustring__gsub__tst { private final Mock_scrib_fxt fxt = new Mock_scrib_fxt(); private Scrib_lib lib; @Before public void init() { @@ -30,6 +31,9 @@ public class Scrib_lib_ustring__gsub__tst { // TOMBSTONE: tested with local MW and {{#invoke:Test|test16|a|[^]|b}} -> Lua error: Missing close-bracket for character set beginning at pattern character 1.; DATE:2018-07-02 // Exec_gsub("a" , "[^]" , 1, "b" , "a;0"); // invalid regx should not fail; should return self; DATE:2013-10-20 } + @Test public void Find__int() {// PURPOSE: gsub with integer arg should not fail; DATE:2013-11-06 + fxt.Test__proc__kvps__flat(lib, Scrib_lib_ustring.Invk_gsub, Scrib_kv_utl_.base1_many_(1, "[1]", "2", 1), "2;1"); // NOTE: text is integer (lua / php are type-less) + } @Test public void Replace__none() {// PURPOSE: gsub with no replace argument should not fail; EX:d:'orse; DATE:2013-10-14 fxt.Test__proc__objs__flat(lib, Scrib_lib_ustring.Invk_gsub, Object_.Ary("text", "regx") , "text"); // NOTE: repl, limit deliberately omitted } @@ -99,6 +103,20 @@ public class Scrib_lib_ustring__gsub__tst { Tfds.Eq(Bool_.Y, Regx_adp_.Match("\0", "[\\x]")); // \0 matched by any_char Tfds.Eq(Bool_.Y, Regx_adp_.Match("\0", "[\\X]")); // \0 matched by !any_char } + @Test public void Luacbk__basic() { + String text = "ad2f1e3z"; + String regx = "([1d])([2e])([3f])"; + Mock_proc__verify_args proc = new Mock_proc__verify_args(0, new Object[]{"B", "d", "2", "f"}, new Object[]{"Y", "1", "e", "3"}); + fxt.Init__cbk(proc); + Exec_gsub(text, regx, -1, proc.To_scrib_lua_proc(), "aBYz;2"); + } + @Test public void Luacbk__anypos() { + String text = "ad2f1e3z"; + String regx = "()([1d])([2e])([3f])"; // "()" is anypos, which inserts find_pos to results + Mock_proc__verify_args proc = new Mock_proc__verify_args(0, new Object[]{"B", 1, "d", "2", "f"}, new Object[]{"Y", 4, "1", "e", "3"}); + fxt.Init__cbk(proc); + Exec_gsub(text, regx, -1, proc.To_scrib_lua_proc(), "aBYz;2"); + } private void Exec_gsub(String text, Object regx, int limit, Object repl, String expd) { fxt.Test__proc__kvps__flat(lib, Scrib_lib_ustring.Invk_gsub, Scrib_kv_utl_.base1_many_(text, regx, repl, limit), expd); } @@ -133,3 +151,17 @@ class Mock_proc__empty extends Mock_proc_fxt { private final String find, rep return String_.Eq(text, find) ? Keyval_.Ary(Keyval_.new_("0", repl)) : Keyval_.Ary_empty; } } +class Mock_proc__verify_args extends Mock_proc_fxt { private final Object[][] expd_ary; + private int expd_idx = -1; + public Mock_proc__verify_args(int id, Object[]... expd_ary) {super(id, "number"); + this.expd_ary = expd_ary; + } + @Override public Keyval[] Exec_by_scrib(Keyval[] args) { + Object[] expd_args = expd_ary[++expd_idx]; + Object rv = expd_args[0]; + expd_args = (Object[])Array_.Extract_by_pos(expd_args, 1); + Object[] actl_args = Keyval_.Ary__to_objary__val(args); + Gftest.Eq__ary(expd_args, actl_args, "failed lua_cbk"); + return Keyval_.Ary(Keyval_.int_(0, rv)); + } +} diff --git a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring_gsub_mgr.java b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring_gsub_mgr.java new file mode 100644 index 000000000..6377961c9 --- /dev/null +++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring_gsub_mgr.java @@ -0,0 +1,235 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012-2017 gnosygnu@gmail.com + +XOWA is licensed under the terms of the General Public License (GPL) Version 3, +or alternatively under the terms of the Apache License Version 2.0. + +You may use XOWA according to either of these licenses as is most appropriate +for your project on a case-by-case basis. + +The terms of each license can be found in the source code repository: + +GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt +Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt +*/ +package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*; +import gplx.langs.regxs.*; +import gplx.xowa.xtns.scribunto.procs.*; +class Scrib_lib_ustring_gsub_mgr { + private final Scrib_core core; + private final Scrib_regx_converter regx_converter; + private byte[] repl_bry; private Hash_adp repl_hash; private Scrib_lua_proc repl_func; + private int repl_count = 0; + public Scrib_lib_ustring_gsub_mgr(Scrib_core core, Scrib_regx_converter regx_converter) { + this.core = core; + this.regx_converter = regx_converter; + } + public boolean Exec(Scrib_proc_args args, Scrib_proc_rslt rslt) { + // get @text; NOTE: sometimes int; DATE:2013-11-06 + String text = args.Xstr_str_or_null(0); + if (args.Len() == 2) return rslt.Init_obj(text); // if no @replace, return @text; PAGE:en.d:'orse; DATE:2013-10-13 + + // get @pattern; NOTE: sometimes int; PAGE:en.d:λύω; DATE:2014-09-02 + String regx = args.Xstr_str_or_null(1); + regx = regx_converter.patternToRegex(Bry_.new_u8(regx), Scrib_regx_converter.Anchor_pow); + + // get @repl + Object repl_obj = args.Cast_obj_or_null(2); + byte repl_tid = Identify_repl(repl_obj); + + // get @limit; reset repl_count + int limit = args.Cast_int_or(3, -1); + repl_count = 0; + + // do repl + String repl = Exec_repl(repl_tid, text, regx, limit); + return rslt.Init_many_objs(repl, repl_count); + } + private byte Identify_repl(Object repl_obj) { + byte repl_tid = Repl_tid_null; + // @repl can be String, int, table, func + Class repl_type = repl_obj.getClass(); + if (Object_.Eq(repl_type, String_.Cls_ref_type)) { + repl_tid = Repl_tid_string; + repl_bry = Bry_.new_u8((String)repl_obj); + } + else if (Object_.Eq(repl_type, Int_.Cls_ref_type)) { // NOTE:@replace sometimes int; PAGE:en.d:λύω; DATE:2014-09-02 + repl_tid = Repl_tid_string; + repl_bry = Bry_.new_u8(Int_.To_str(Int_.Cast(repl_obj))); + } + else if (Object_.Eq(repl_type, Keyval[].class)) { + repl_tid = Repl_tid_table; + repl_hash = Hash_adp_.New(); + Keyval[] kvs = (Keyval[])repl_obj; + int kvs_len = kvs.length; + for (int i = 0; i < kvs_len; i++) { + Keyval kv = kvs[i]; + repl_hash.Add(kv.Key(), Bry_.new_u8(kv.Val_to_str_or_empty())); + } + } + else if (Object_.Eq(repl_type, Scrib_lua_proc.class)) { + repl_tid = Repl_tid_luacbk; + repl_func = (Scrib_lua_proc)repl_obj; + } + else if (Object_.Eq(repl_type, Double_.Cls_ref_type)) { // NOTE:@replace sometimes double; PAGE:de.v:Wikivoyage:Wikidata/Test_Modul:Wikidata2; DATE:2016-04-21 + repl_tid = Repl_tid_string; + repl_bry = Bry_.new_u8(Double_.To_str(Double_.cast(repl_obj))); + } + else + throw Err_.new_unhandled(Type_.Name(repl_type)); + return repl_tid; + } + private String Exec_repl(byte repl_tid, String text, String regx, int limit) { + // parse regx + Regx_adp regx_mgr = Scrib_lib_ustring.RegxAdp_new_(core.Ctx(), regx); + if (regx_mgr.Pattern_is_invalid()) return text; // NOTE: invalid patterns should return self; EX:[^]; DATE:2014-09-02) + + // exec regx + Regx_match[] rslts = regx_mgr.Match_all(text, 0); + if (rslts.length == 0) return text; // PHP: If matches are found, the new subject will be returned, otherwise subject will be returned unchanged.; http://php.net/manual/en/function.preg-replace-callback.php + rslts = regx_converter.Adjust_balanced(rslts); + + Bry_bfr tmp_bfr = Bry_bfr_.New(); + int rslts_len = rslts.length; + int text_pos = 0; + for (int i = 0; i < rslts_len; i++) { + if (repl_count == limit) break; // stop if repl_count reaches limit; note that limit = -1 by default, unless specified + + // add text up to find.bgn + Regx_match rslt = rslts[i]; + tmp_bfr.Add_str_u8(String_.Mid(text, text_pos, rslt.Find_bgn())); // NOTE: regx returns char text_pos (not bry); must add as String, not bry; DATE:2013-07-17 + + // replace result + if (!Exec_repl_itm(tmp_bfr, repl_tid, text, rslt)) { + // will be false when gsub_proc returns nothing; PAGE:en.d:tracer PAGE:en.d:שלום DATE:2017-04-22; + tmp_bfr.Add_str_u8(String_.Mid(text, rslt.Find_bgn(), rslt.Find_end())); + } + + // update + text_pos = rslt.Find_end(); + repl_count++; + } + + // add rest of String + int text_len = String_.Len(text); + if (text_pos < text_len) + tmp_bfr.Add_str_u8(String_.Mid(text, text_pos, text_len)); // NOTE: regx returns char text_pos (not bry); must add as String, not bry; DATE:2013-07-17 + return tmp_bfr.To_str_and_clear(); + } + private boolean Exec_repl_itm(Bry_bfr tmp_bfr, byte repl_tid, String text, Regx_match match) { + switch (repl_tid) { + case Repl_tid_string: + int len = repl_bry.length; + for (int i = 0; i < len; i++) { + byte b = repl_bry[i]; + switch (b) { + case Byte_ascii.Percent: { + ++i; + if (i == len) // % at end of stream; just add %; + tmp_bfr.Add_byte(b); + else { + b = repl_bry[i]; + switch (b) { + case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4: + case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9: + int idx = b - Byte_ascii.Num_0; + if (idx == 0) // NOTE: 0 means take result; REF.MW:if ($x === '0'); return $m[0]; PAGE:Wikipedia:Wikipedia_Signpost/Templates/Voter/testcases; DATE:2015-08-02 + tmp_bfr.Add_str_u8(String_.Mid(text, match.Find_bgn(), match.Find_end())); + else { // NOTE: > 0 means get from groups if it exists; REF.MW:elseif (isset($m["m$x"])) return $m["m$x"]; PAGE:Wikipedia:Wikipedia_Signpost/Templates/Voter/testcases; DATE:2015-08-02 + idx -= List_adp_.Base1; + if (idx < match.Groups().length) { // retrieve numbered capture; TODO_OLD: support more than 9 captures + Regx_group grp = match.Groups()[idx]; + tmp_bfr.Add_str_u8(String_.Mid(text, grp.Bgn(), grp.End())); // NOTE: grp.Bgn() / .End() is for String pos (bry pos will fail for utf8 strings) + } + else { + tmp_bfr.Add_byte(Byte_ascii.Percent); + tmp_bfr.Add_byte(b); + } + } + break; + case Byte_ascii.Percent: + tmp_bfr.Add_byte(Byte_ascii.Percent); + break; + default: // not a number; add literal + tmp_bfr.Add_byte(Byte_ascii.Percent); + tmp_bfr.Add_byte(b); + break; + } + } + break; + } + default: + tmp_bfr.Add_byte(b); + break; + } + } + break; + case Repl_tid_table: { + int match_bgn = -1, match_end = -1; + Regx_group[] grps = match.Groups(); + if (grps.length == 0) { + match_bgn = match.Find_bgn(); + match_end = match.Find_end(); + } + else { // group exists, take first one (logic matches Scribunto); PAGE:en.w:Bannered_routes_of_U.S._Route_60; DATE:2014-08-15 + Regx_group grp = grps[0]; + match_bgn = grp.Bgn(); + match_end = grp.End(); + } + String find_str = String_.Mid(text, match_bgn, match_end); // NOTE: rslt.Bgn() / .End() is for String pos (bry pos will fail for utf8 strings) + Object actl_repl_obj = repl_hash.Get_by(find_str); + if (actl_repl_obj == null) // match found, but no replacement specified; EX:"abc", "[ab]", "a:A"; "b" in regex but not in tbl; EX:d:DVD; DATE:2014-03-31 + tmp_bfr.Add_str_u8(find_str); + else + tmp_bfr.Add((byte[])actl_repl_obj); + break; + } + case Repl_tid_luacbk: { + Keyval[] luacbk_args = null; + Regx_group[] grps = match.Groups(); + int grps_len = grps.length; + // no grps; pass 1 arg based on @match: EX: ("ace", "[b-d]"); args -> ("c") + if (grps_len == 0) { + String find_str = String_.Mid(text, match.Find_bgn(), match.Find_end()); + luacbk_args = Scrib_kv_utl_.base1_obj_(find_str); + } + // grps exist; pass n args based on grp[n].match; EX: ("acfg", "([b-d])([e-g])"); args -> ("c", "f") + else { + // memoize any_pos args for loop + boolean any_pos = regx_converter.Any_pos(); + Keyval[] capt_ary = regx_converter.Capt_ary(); + int capt_ary_len = capt_ary.length; + + // loop grps; for each grp, create corresponding arg in luacbk + luacbk_args = new Keyval[grps_len]; + for (int i = 0; i < grps_len; i++) { + Regx_group grp = grps[i]; + + // anypos will create @offset arg; everything else creates a @match arg based on grp + Object val = any_pos && i < capt_ary_len && Bool_.Cast(capt_ary[i].Val()) + ? (Object)grp.Bgn() + : (Object)String_.Mid(text, grp.Bgn(), grp.End()); + luacbk_args[i] = Keyval_.int_(i + Scrib_core.Base_1, val); + } + } + + // do callback + Keyval[] rslts = core.Interpreter().CallFunction(repl_func.Id(), luacbk_args); + + // eval result + if (rslts.length == 0) // will be 0 when gsub_proc returns nil; PAGE:en.d:tracer; DATE:2017-04-22 + return false; + else { // ArrayIndex check + Object rslt_obj = rslts[0].Val(); // 0th idx has result + tmp_bfr.Add_str_u8(Object_.Xto_str_strict_or_empty(rslt_obj)); // NOTE: always convert to String; rslt_obj can be int; PAGE:en.d:seven DATE:2016-04-27 + } + break; + } + default: throw Err_.new_unhandled(repl_tid); + } + return true; + } + private static final byte Repl_tid_null = 0, Repl_tid_string = 1, Repl_tid_table = 2, Repl_tid_luacbk = 3; + public static final Scrib_lib_ustring_gsub_mgr[] Ary_empty = new Scrib_lib_ustring_gsub_mgr[0]; +} diff --git a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_converter.java b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_converter.java index 02d7f2976..112ee5c41 100644 --- a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_converter.java +++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_converter.java @@ -37,6 +37,8 @@ public class Scrib_regx_converter { int len = pat.length; int grps_len = 0; int bct = 0; + + // REF.MW: https://github.com/wikimedia/mediawiki-extensions-Scribunto/blob/master/includes/engines/LuaCommon/UstringLibrary.php#L415 for (int i = 0; i < len; i++) { int i_end = i + 1; q_flag = false; // must be reset; REF.MW:UstringLibrary.php|patternToRegex; DATE:2014-02-08 @@ -44,24 +46,28 @@ public class Scrib_regx_converter { switch (cur) { case Byte_ascii.Pow: q_flag = i != 0; - bfr.Add((anchor == Anchor_null || q_flag) ? Bry_pow_escaped : anchor); // NOTE: must add anchor \G when using offsets; EX:cs.n:Category:1._zárí_2008; DATE:2014-05-07 + bfr.Add((anchor == Anchor_null || q_flag) ? Bry_pow_escaped : anchor); // NOTE: must add anchor \G when using offsets; EX:cs.n:Category:1._zárí_2008; DATE:2014-05-07 break; case Byte_ascii.Dollar: q_flag = i < len - 1; bfr.Add(q_flag ? Bry_dollar_escaped : Bry_dollar_literal); break; case Byte_ascii.Paren_bgn: { + // fail if "(EOS" if (i + 1 >= len) throw Err_.new_wo_type("Unmatched open-paren at pattern character " + Int_.To_str(i_end)); int grp_idx = grp_mgr.Capt__len() + 1; - boolean is_empty_capture = pat[i + 1] == Byte_ascii.Paren_end; // current is "()" + + // check for "()"; enables anypos flag + boolean is_empty_capture = pat[i + 1] == Byte_ascii.Paren_end; if (is_empty_capture) any_pos = true; - bfr.Add_byte(Byte_ascii.Paren_bgn); // $re .= "(?"; grp_mgr.Capt__add__real(grp_idx, is_empty_capture); + bfr.Add_byte(Byte_ascii.Paren_bgn); // $re .= "(?"; break; } case Byte_ascii.Paren_end: + // fail if ")" without preceding "(" if (grp_mgr.Open__len() <= 0) throw Err_.new_wo_type("Unmatched close-paren at pattern character " + Int_.To_str(i_end)); grp_mgr.Open__pop();