1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2024-10-27 20:34:16 +00:00

Scribunto: Handle anypos flag [#337]

This commit is contained in:
gnosygnu 2019-01-27 21:18:20 -05:00
parent f44a1874a8
commit 4626203f16
7 changed files with 306 additions and 219 deletions

View File

@ -54,6 +54,15 @@ public class Array_ {
Copy_to(src, 0, trg, 0, copy_len);
return trg;
}
// Extracts from src_bgn through the end of src; delegates to the 3-arg overload, passing src's length as src_end
public static Object Extract_by_pos(Object src, int src_bgn) {
	int src_end = Array.getLength(src);
	return Extract_by_pos(src, src_bgn, src_end);
}
// Returns a new array (built with Create / Component_type(src)) holding the (src_end - src_bgn) items of src starting at src_bgn
public static Object Extract_by_pos(Object src, int src_bgn, int src_end) {
	int extract_len = src_end - src_bgn;
	Object rv = Create(Component_type(src), extract_len);
	Copy_to(src, src_bgn, rv, 0, extract_len);	// copy into rv starting at idx 0
	return rv;
}
public static List_adp To_list(Object ary) {
int aryLen = Array_.Len(ary);
List_adp rv = List_adp_.New();

View File

@ -70,6 +70,14 @@ public class Keyval_ {
Ary__to_str__nest__ary(bfr, 0, true, ary);
return bfr.To_str_and_clear();
}
// Returns a new Object[] holding the Val() of each Keyval in ary, in the same order
public static Object[] Ary__to_objary__val(Keyval[] ary) {
	Object[] rv = new Object[ary.length];
	int rv_idx = 0;
	for (Keyval itm : ary)
		rv[rv_idx++] = itm.Val();
	return rv;
}
private static void Ary__to_str__nest__ary(Bry_bfr bfr, int indent, boolean is_kv, Object[] ary) {
int len = ary.length;
for (int i = 0; i < len; ++i) {

View File

@ -17,6 +17,7 @@ package gplx.core.tests; import gplx.*; import gplx.core.*;
import gplx.core.brys.*;
public class Gftest {
private static final Bry_bfr bfr = Bry_bfr_.New();
// Object[] overload: forwards to Eq__array with Type_ids_.Id__obj as the type_id
public static void Eq__ary(Object[] expd, Object[] actl, String msg_fmt, Object... msg_args) {Eq__array(Type_ids_.Id__obj, expd, actl, msg_fmt, msg_args);}
public static void Eq__ary(boolean[] expd, boolean[] actl, String msg_fmt, Object... msg_args) {Eq__array(Type_ids_.Id__bool, expd, actl, msg_fmt, msg_args);}
public static void Eq__ary(int[] expd, int[] actl, String msg_fmt, Object... msg_args) {Eq__array(Type_ids_.Id__int, expd, actl, msg_fmt, msg_args);}
public static void Eq__ary(long[] expd, long[] actl, String msg_fmt, Object... msg_args) {Eq__array(Type_ids_.Id__long, expd, actl, msg_fmt, msg_args);}
@ -151,12 +152,14 @@ public class Gftest {
}
private static void Write__itm(Bry_bfr bfr, int type_id, Object ary, int len, int idx) {
if (idx < len) {
Object val = Array_.Get_at(ary, idx);
switch (type_id) {
case Type_ids_.Id__bool: bfr.Add_yn(Bool_.Cast(Array_.Get_at(ary, idx))); break;
case Type_ids_.Id__bry: bfr.Add_safe((byte[])Array_.Get_at(ary, idx)); break;
case Type_ids_.Id__long: bfr.Add_long_variable(Long_.cast(Array_.Get_at(ary, idx))); break;
case Type_ids_.Id__int: bfr.Add_int_variable(Int_.Cast(Array_.Get_at(ary, idx))); break;
case Type_ids_.Id__byte: bfr.Add_int_variable((int)(Byte_.Cast(Array_.Get_at(ary, idx)))); break;
case Type_ids_.Id__bool: bfr.Add_yn(Bool_.Cast(val)); break;
case Type_ids_.Id__bry: bfr.Add_safe((byte[])val); break;
case Type_ids_.Id__long: bfr.Add_long_variable(Long_.cast(val)); break;
case Type_ids_.Id__int: bfr.Add_int_variable(Int_.Cast(val)); break;
case Type_ids_.Id__byte: bfr.Add_int_variable((int)(Byte_.Cast(val))); break;
case Type_ids_.Id__obj: bfr.Add_str_u8(Object_.Xto_str_strict_or_null_mark(val)); break;
default: throw Err_.new_unhandled_default(type_id);
}
}
@ -182,6 +185,7 @@ public class Gftest {
case Type_ids_.Id__long: eq = Long_.cast(expd_obj) == Long_.cast(actl_obj); break;
case Type_ids_.Id__int: eq = Int_.Cast(expd_obj) == Int_.Cast(actl_obj); break;
case Type_ids_.Id__byte: eq = Byte_.Cast(expd_obj) == Byte_.Cast(actl_obj); break;
case Type_ids_.Id__obj: eq = Object_.Eq(expd_obj, actl_obj); break;
}
}
if (!eq) {

View File

@ -14,7 +14,7 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*;
import gplx.langs.regxs.*; import gplx.core.intls.*;
import gplx.core.intls.*; import gplx.langs.regxs.*;
import gplx.xowa.parsers.*;
import gplx.xowa.xtns.scribunto.procs.*;
public class Scrib_lib_ustring implements Scrib_lib {
@ -118,30 +118,9 @@ public class Scrib_lib_ustring implements Scrib_lib {
AddCapturesFromMatch(tmp_list, regx_rslts[0], text, regx_converter.Capt_ary(), true);
return rslt.Init_many_list(tmp_list);
}
private Scrib_lib_ustring_gsub_mgr[] gsub_mgr_ary = Scrib_lib_ustring_gsub_mgr.Ary_empty;
private int gsub_mgr_max = 0, gsub_mgr_len = -1;
private final Object gsub_mgr_lock = new Object();
public boolean Gsub(Scrib_proc_args args, Scrib_proc_rslt rslt) {
boolean rv = false;
synchronized (gsub_mgr_lock) { // handle recursive gsub calls; PAGE:en.d:כלב; DATE:2016-01-22
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
int new_len = gsub_mgr_len + 1;
if (new_len == gsub_mgr_max) {
this.gsub_mgr_max = new_len == 0 ? 2 : new_len * 2;
Scrib_lib_ustring_gsub_mgr[] new_gsub_mgr_ary = new Scrib_lib_ustring_gsub_mgr[gsub_mgr_max];
Array_.Copy(gsub_mgr_ary, new_gsub_mgr_ary);
gsub_mgr_ary = new_gsub_mgr_ary;
}
Scrib_lib_ustring_gsub_mgr cur = gsub_mgr_ary[new_len];
if (cur == null) {
cur = new Scrib_lib_ustring_gsub_mgr(core, regx_converter);
gsub_mgr_ary[new_len] = cur;
}
this.gsub_mgr_len = new_len;
rv = cur.Exec(args, rslt);
--gsub_mgr_len;
}
return rv;
Scrib_lib_ustring_gsub_mgr gsub_mgr = new Scrib_lib_ustring_gsub_mgr(core, new Scrib_regx_converter());
return gsub_mgr.Exec(args, rslt);
}
public boolean Gmatch_init(Scrib_proc_args args, Scrib_proc_rslt rslt) {
// String text = Scrib_kv_utl_.Val_to_str(values, 0);
@ -195,189 +174,3 @@ public class Scrib_lib_ustring implements Scrib_lib {
private static final int Base1 = 1
, End_adj = 1; // lua / php uses "end" as <= not <; EX: "abc" and bgn=0, end= 1; for XOWA, this is "a"; for MW / PHP it is "ab"
}
class Scrib_lib_ustring_gsub_mgr {
private Scrib_regx_converter regx_converter;
public Scrib_lib_ustring_gsub_mgr(Scrib_core core, Scrib_regx_converter regx_converter) {this.core = core; this.regx_converter = regx_converter;} private Scrib_core core;
private byte tmp_repl_tid = Repl_tid_null; private byte[] tmp_repl_bry = null;
private Hash_adp repl_hash = null; private Scrib_lua_proc repl_func = null;
private int repl_count = 0;
public boolean Exec(Scrib_proc_args args, Scrib_proc_rslt rslt) {
Object text_obj = args.Cast_obj_or_null(0);
String text = String_.as_(text_obj);
if (text == null) text = Object_.Xto_str_strict_or_empty(text_obj);
String regx = args.Xstr_str_or_null(1); // NOTE: @pattern sometimes int; PAGE:en.d:λύω; DATE:2014-09-02
if (args.Len() == 2) return rslt.Init_obj(text); // if no replace arg, return self; PAGE:en.d:'orse; DATE:2013-10-13
Object repl_obj = args.Cast_obj_or_null(2);
regx = regx_converter.patternToRegex(Bry_.new_u8(regx), Scrib_regx_converter.Anchor_pow);
int limit = args.Cast_int_or(3, -1);
repl_count = 0;
Identify_repl(repl_obj);
String repl = Exec_repl(tmp_repl_tid, tmp_repl_bry, text, regx, limit);
return rslt.Init_many_objs(repl, repl_count);
}
private void Identify_repl(Object repl_obj) {
Class<?> repl_type = repl_obj.getClass();
if (Object_.Eq(repl_type, String_.Cls_ref_type)) {
tmp_repl_tid = Repl_tid_string;
tmp_repl_bry = Bry_.new_u8((String)repl_obj);
}
else if (Object_.Eq(repl_type, Int_.Cls_ref_type)) { // NOTE:@replace sometimes int; PAGE:en.d:λύω; DATE:2014-09-02
tmp_repl_tid = Repl_tid_string;
tmp_repl_bry = Bry_.new_u8(Int_.To_str(Int_.Cast(repl_obj)));
}
else if (Object_.Eq(repl_type, Keyval[].class)) {
tmp_repl_tid = Repl_tid_table;
Keyval[] repl_tbl = (Keyval[])repl_obj;
if (repl_hash == null)
repl_hash = Hash_adp_.New();
else
repl_hash.Clear();
int repl_tbl_len = repl_tbl.length;
for (int i = 0; i < repl_tbl_len; i++) {
Keyval repl_itm = repl_tbl[i];
String repl_itm_val = repl_itm.Val_to_str_or_empty();
repl_hash.Add(repl_itm.Key(), Bry_.new_u8(repl_itm_val));
}
}
else if (Object_.Eq(repl_type, Scrib_lua_proc.class)) {
tmp_repl_tid = Repl_tid_luacbk;
repl_func = (Scrib_lua_proc)repl_obj;
}
else if (Object_.Eq(repl_type, Double_.Cls_ref_type)) { // NOTE:@replace sometimes double; PAGE:de.v:Wikivoyage:Wikidata/Test_Modul:Wikidata2; DATE:2016-04-21
tmp_repl_tid = Repl_tid_string;
tmp_repl_bry = Bry_.new_u8(Double_.To_str(Double_.cast(repl_obj)));
}
else throw Err_.new_unhandled(Type_.Name(repl_type));
}
private String Exec_repl(byte repl_tid, byte[] repl_bry, String text, String regx, int limit) {
Regx_adp regx_mgr = Scrib_lib_ustring.RegxAdp_new_(core.Ctx(), regx);
Regx_match[] rslts = regx_mgr.Match_all(text, 0);
if ( rslts.length == 0 // PHP: If matches are found, the new subject will be returned, otherwise subject will be returned unchanged.; http://php.net/manual/en/function.preg-replace-callback.php
|| regx_mgr.Pattern_is_invalid() // NOTE: invalid patterns should return self; EX:[^]; DATE:2014-09-02
) return text;
rslts = regx_converter.Adjust_balanced(rslts);
Bry_bfr tmp_bfr = Bry_bfr_.New();
int len = rslts.length;
int pos = 0;
for (int i = 0; i < len; i++) {
if (limit > -1 && repl_count == limit) break;
Regx_match rslt = rslts[i];
tmp_bfr.Add_str_u8(String_.Mid(text, pos, rslt.Find_bgn())); // NOTE: regx returns char pos (not bry); must add as String, not bry; DATE:2013-07-17
if (!Exec_repl_itm(tmp_bfr, repl_tid, repl_bry, text, rslt)) { // will be false when gsub_proc returns nothing; PAGE:en.d:tracer PAGE:en.d:שלום DATE:2017-04-22;
tmp_bfr.Add_str_u8(String_.Mid(text, rslt.Find_bgn(), rslt.Find_end()));
}
pos = rslt.Find_end();
++repl_count;
}
int text_len = String_.Len(text);
if (pos < text_len)
tmp_bfr.Add_str_u8(String_.Mid(text, pos, text_len)); // NOTE: regx returns char pos (not bry); must add as String, not bry; DATE:2013-07-17
return tmp_bfr.To_str_and_clear();
}
private boolean Exec_repl_itm(Bry_bfr tmp_bfr, byte repl_tid, byte[] repl_bry, String text, Regx_match match) {
switch (repl_tid) {
case Repl_tid_string:
int len = repl_bry.length;
for (int i = 0; i < len; i++) {
byte b = repl_bry[i];
switch (b) {
case Byte_ascii.Percent: {
++i;
if (i == len) // % at end of stream; just add %;
tmp_bfr.Add_byte(b);
else {
b = repl_bry[i];
switch (b) {
case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4:
case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9:
int idx = b - Byte_ascii.Num_0;
if (idx == 0) // NOTE: 0 means take result; REF.MW:if ($x === '0'); return $m[0]; PAGE:Wikipedia:Wikipedia_Signpost/Templates/Voter/testcases; DATE:2015-08-02
tmp_bfr.Add_str_u8(String_.Mid(text, match.Find_bgn(), match.Find_end()));
else { // NOTE: > 0 means get from groups if it exists; REF.MW:elseif (isset($m["m$x"])) return $m["m$x"]; PAGE:Wikipedia:Wikipedia_Signpost/Templates/Voter/testcases; DATE:2015-08-02
idx -= List_adp_.Base1;
if (idx < match.Groups().length) { // retrieve numbered capture; TODO_OLD: support more than 9 captures
Regx_group grp = match.Groups()[idx];
tmp_bfr.Add_str_u8(String_.Mid(text, grp.Bgn(), grp.End())); // NOTE: grp.Bgn() / .End() is for String pos (bry pos will fail for utf8 strings)
}
else {
tmp_bfr.Add_byte(Byte_ascii.Percent);
tmp_bfr.Add_byte(b);
}
}
break;
case Byte_ascii.Percent:
tmp_bfr.Add_byte(Byte_ascii.Percent);
break;
default: // not a number; add literal
tmp_bfr.Add_byte(Byte_ascii.Percent);
tmp_bfr.Add_byte(b);
break;
}
}
break;
}
default:
tmp_bfr.Add_byte(b);
break;
}
}
break;
case Repl_tid_table: {
int match_bgn = -1, match_end = -1;
Regx_group[] grps = match.Groups();
if (grps.length == 0) {
match_bgn = match.Find_bgn();
match_end = match.Find_end();
}
else { // group exists, take first one (logic matches Scribunto); PAGE:en.w:Bannered_routes_of_U.S._Route_60; DATE:2014-08-15
Regx_group grp = grps[0];
match_bgn = grp.Bgn();
match_end = grp.End();
}
String find_str = String_.Mid(text, match_bgn, match_end); // NOTE: rslt.Bgn() / .End() is for String pos (bry pos will fail for utf8 strings)
Object actl_repl_obj = repl_hash.Get_by(find_str);
if (actl_repl_obj == null) // match found, but no replacement specified; EX:"abc", "[ab]", "a:A"; "b" in regex but not in tbl; EX:d:DVD; DATE:2014-03-31
tmp_bfr.Add_str_u8(find_str);
else
tmp_bfr.Add((byte[])actl_repl_obj);
break;
}
case Repl_tid_luacbk: {
// TOMBSTONE: was causing garbled text on PAGE:en.w:Template:Infobox_kommune DATE:2018-07-02
/*
String find_str = String_.Mid(text, match.Find_bgn(), match.Find_end());
Keyval[] luacbk_args = Scrib_kv_utl_.base1_obj_(find_str);
*/
Keyval[] luacbk_args = null;
Regx_group[] grps = match.Groups();
int grps_len = grps.length;
if (grps_len == 0) { // no match; use original String
String find_str = String_.Mid(text, match.Find_bgn(), match.Find_end());
luacbk_args = Scrib_kv_utl_.base1_obj_(find_str);
}
else { // match; build ary of matches; (see UStringLibrary.php)
luacbk_args = new Keyval[grps_len];
for (int i = 0; i < grps_len; i++) {
Regx_group grp = grps[i];
String find_str = String_.Mid(text, grp.Bgn(), grp.End());
luacbk_args[i] = Keyval_.int_(i + Scrib_core.Base_1, find_str);
}
}
/*
*/
Keyval[] rslts = core.Interpreter().CallFunction(repl_func.Id(), luacbk_args);
if (rslts.length == 0) // will be 0 when gsub_proc returns nil; PAGE:en.d:tracer; DATE:2017-04-22
return false;
else { // ArrayIndex check
Object rslt_obj = rslts[0].Val(); // 0th idx has result
tmp_bfr.Add_str_u8(Object_.Xto_str_strict_or_empty(rslt_obj)); // NOTE: always convert to String; rslt_obj can be int; PAGE:en.d:seven DATE:2016-04-27
}
break;
}
default: throw Err_.new_unhandled(repl_tid);
}
return true;
}
private static final byte Repl_tid_null = 0, Repl_tid_string = 1, Repl_tid_table = 2, Repl_tid_luacbk = 3;
public static final Scrib_lib_ustring_gsub_mgr[] Ary_empty = new Scrib_lib_ustring_gsub_mgr[0];
}

View File

@ -14,7 +14,8 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*;
import org.junit.*; import gplx.langs.regxs.*; import gplx.xowa.xtns.scribunto.engines.mocks.*;
import org.junit.*; import gplx.core.tests.*;
import gplx.langs.regxs.*; import gplx.xowa.xtns.scribunto.engines.mocks.*;
public class Scrib_lib_ustring__gsub__tst {
private final Mock_scrib_fxt fxt = new Mock_scrib_fxt(); private Scrib_lib lib;
@Before public void init() {
@ -30,6 +31,9 @@ public class Scrib_lib_ustring__gsub__tst {
// TOMBSTONE: tested with local MW and {{#invoke:Test|test16|a|[^]|b}} -> Lua error: Missing close-bracket for character set beginning at pattern character 1.; DATE:2018-07-02
// Exec_gsub("a" , "[^]" , 1, "b" , "a;0"); // invalid regx should not fail; should return self; DATE:2013-10-20
}
// PURPOSE: gsub must not fail when @text is an int instead of a String (lua / php are type-less); DATE:2013-11-06
@Test public void Find__int() {
	String expd = "2;1";
	fxt.Test__proc__kvps__flat
		( lib, Scrib_lib_ustring.Invk_gsub
		, Scrib_kv_utl_.base1_many_(1, "[1]", "2", 1)	// NOTE: 1st arg (text) is integer; others are pattern, repl, limit
		, expd
		);
}
// PURPOSE: gsub with only @text and @pattern (no replace argument) should not fail and should return @text; EX:d:'orse; DATE:2013-10-14
@Test public void Replace__none() {
	fxt.Test__proc__objs__flat
		( lib, Scrib_lib_ustring.Invk_gsub
		, Object_.Ary("text", "regx")	// NOTE: repl, limit deliberately omitted
		, "text"
		);
}
@ -99,6 +103,20 @@ public class Scrib_lib_ustring__gsub__tst {
Tfds.Eq(Bool_.Y, Regx_adp_.Match("\0", "[\\x]")); // \0 matched by any_char
Tfds.Eq(Bool_.Y, Regx_adp_.Match("\0", "[\\X]")); // \0 matched by !any_char
}
// PURPOSE: lua callback receives one String arg per capture group
@Test public void Luacbk__basic() {
	// one Object[] per expected callback call: [0] is the value the mock returns; the rest are the args the mock must receive
	Mock_proc__verify_args proc = new Mock_proc__verify_args
		( 0
		, new Object[]{"B", "d", "2", "f"}
		, new Object[]{"Y", "1", "e", "3"}
		);
	fxt.Init__cbk(proc);
	Exec_gsub("ad2f1e3z", "([1d])([2e])([3f])", -1, proc.To_scrib_lua_proc(), "aBYz;2");
}
// PURPOSE: "()" is anypos, which inserts find_pos to results; callback receives the offset as an int ahead of the String captures
@Test public void Luacbk__anypos() {
	// one Object[] per expected callback call: [0] is the value the mock returns; the rest are the args the mock must receive
	Mock_proc__verify_args proc = new Mock_proc__verify_args
		( 0
		, new Object[]{"B", 1, "d", "2", "f"}
		, new Object[]{"Y", 4, "1", "e", "3"}
		);
	fxt.Init__cbk(proc);
	Exec_gsub("ad2f1e3z", "()([1d])([2e])([3f])", -1, proc.To_scrib_lua_proc(), "aBYz;2");
}
// Invokes gsub through fxt with base1 args in the order (text, regx, repl, limit) and tests the result against expd
private void Exec_gsub(String text, Object regx, int limit, Object repl, String expd) {
	fxt.Test__proc__kvps__flat
		( lib, Scrib_lib_ustring.Invk_gsub
		, Scrib_kv_utl_.base1_many_(text, regx, repl, limit)
		, expd
		);
}
@ -133,3 +151,17 @@ class Mock_proc__empty extends Mock_proc_fxt { private final String find, rep
return String_.Eq(text, find) ? Keyval_.Ary(Keyval_.new_("0", repl)) : Keyval_.Ary_empty;
}
}
// Mock callback which checks the args of each call against a pre-scripted row and returns that row's scripted value
class Mock_proc__verify_args extends Mock_proc_fxt {
	private final Object[][] expd_ary;	// one row per expected call: [0] = value to return; [1..] = expected arg vals
	private int expd_idx = -1;			// idx of row used by the most recent call; -1 before the 1st call
	public Mock_proc__verify_args(int id, Object[]... expd_ary) {
		super(id, "number");
		this.expd_ary = expd_ary;
	}
	@Override public Keyval[] Exec_by_scrib(Keyval[] args) {
		// move to the row for this call
		expd_idx++;
		Object[] row = expd_ary[expd_idx];
		Object rv = row[0];

		// compare everything after row[0] to the vals actually passed in
		Object[] expd_vals = (Object[])Array_.Extract_by_pos(row, 1);
		Object[] actl_vals = Keyval_.Ary__to_objary__val(args);
		Gftest.Eq__ary(expd_vals, actl_vals, "failed lua_cbk");

		// return row[0] as the only result
		return Keyval_.Ary(Keyval_.int_(0, rv));
	}
}

View File

@ -0,0 +1,235 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*;
import gplx.langs.regxs.*;
import gplx.xowa.xtns.scribunto.procs.*;
class Scrib_lib_ustring_gsub_mgr {
private final Scrib_core core;
private final Scrib_regx_converter regx_converter;
private byte[] repl_bry; private Hash_adp repl_hash; private Scrib_lua_proc repl_func;
private int repl_count = 0;
// Stores core and regx_converter in the final fields; no other work is done at construction
public Scrib_lib_ustring_gsub_mgr(Scrib_core core, Scrib_regx_converter regx_converter) {
this.core = core;
this.regx_converter = regx_converter;
}
// Runs one gsub call: reads @text, @pattern, @repl, @limit from args (idx 0 - 3) and passes the new text and repl_count to rslt
public boolean Exec(Scrib_proc_args args, Scrib_proc_rslt rslt) {
	// @text; NOTE: sometimes int; DATE:2013-11-06
	String text = args.Xstr_str_or_null(0);

	// if no @replace, return @text; PAGE:en.d:'orse; DATE:2013-10-13
	if (args.Len() == 2)
		return rslt.Init_obj(text);

	// @pattern; NOTE: sometimes int; PAGE:en.d:λύω; DATE:2014-09-02
	String pattern = args.Xstr_str_or_null(1);
	String regx = regx_converter.patternToRegex(Bry_.new_u8(pattern), Scrib_regx_converter.Anchor_pow);

	// @repl; Identify_repl also stores the parsed replacement in repl_bry / repl_hash / repl_func
	byte repl_tid = Identify_repl(args.Cast_obj_or_null(2));

	// @limit; Cast_int_or is given -1 as its fallback
	int limit = args.Cast_int_or(3, -1);

	// reset repl_count; do repl; return new text and repl_count
	this.repl_count = 0;
	String new_text = Exec_repl(repl_tid, text, regx, limit);
	return rslt.Init_many_objs(new_text, repl_count);
}
// Classifies @repl and stores its parsed form in a member for later use by Exec_repl_itm
//   String / int / double -> Repl_tid_string; utf8 bytes stored in repl_bry
//   Keyval[] (table)      -> Repl_tid_table;  key -> utf8 bytes of val stored in repl_hash
//   Scrib_lua_proc (func) -> Repl_tid_luacbk; proc stored in repl_func
// Throws Err if repl_obj is null or is any other type
private byte Identify_repl(Object repl_obj) {
	// fail with an explicit message; otherwise getClass() below throws a bare NullPointerException
	if (repl_obj == null)
		throw Err_.new_wo_type("gsub: @repl is null; expected String, number, table or function");

	byte repl_tid = Repl_tid_null;
	// @repl can be String, int, table, func
	Class<?> repl_type = repl_obj.getClass();
	if (Object_.Eq(repl_type, String_.Cls_ref_type)) {
		repl_tid = Repl_tid_string;
		repl_bry = Bry_.new_u8((String)repl_obj);
	}
	else if (Object_.Eq(repl_type, Int_.Cls_ref_type)) {	// NOTE:@replace sometimes int; PAGE:en.d:λύω; DATE:2014-09-02
		repl_tid = Repl_tid_string;
		repl_bry = Bry_.new_u8(Int_.To_str(Int_.Cast(repl_obj)));
	}
	else if (Object_.Eq(repl_type, Keyval[].class)) {
		repl_tid = Repl_tid_table;
		repl_hash = Hash_adp_.New();
		Keyval[] kvs = (Keyval[])repl_obj;
		int kvs_len = kvs.length;
		for (int i = 0; i < kvs_len; i++) {
			Keyval kv = kvs[i];
			repl_hash.Add(kv.Key(), Bry_.new_u8(kv.Val_to_str_or_empty()));
		}
	}
	else if (Object_.Eq(repl_type, Scrib_lua_proc.class)) {
		repl_tid = Repl_tid_luacbk;
		repl_func = (Scrib_lua_proc)repl_obj;
	}
	else if (Object_.Eq(repl_type, Double_.Cls_ref_type)) {	// NOTE:@replace sometimes double; PAGE:de.v:Wikivoyage:Wikidata/Test_Modul:Wikidata2; DATE:2016-04-21
		repl_tid = Repl_tid_string;
		repl_bry = Bry_.new_u8(Double_.To_str(Double_.cast(repl_obj)));
	}
	else
		throw Err_.new_unhandled(Type_.Name(repl_type));
	return repl_tid;
}
// Replaces matches of regx in text, stopping once repl_count equals limit; returns text itself if regx is invalid or nothing matches
// NOTE: regx returns char pos (not bry pos); all text is added to the bfr as String, not bry; DATE:2013-07-17
private String Exec_repl(byte repl_tid, String text, String regx, int limit) {
	// parse regx; NOTE: invalid patterns should return self; EX:[^]; DATE:2014-09-02
	Regx_adp regx_mgr = Scrib_lib_ustring.RegxAdp_new_(core.Ctx(), regx);
	if (regx_mgr.Pattern_is_invalid())
		return text;

	// exec regx; PHP: If matches are found, the new subject will be returned, otherwise subject will be returned unchanged.; http://php.net/manual/en/function.preg-replace-callback.php
	Regx_match[] matches = regx_mgr.Match_all(text, 0);
	if (matches.length == 0)
		return text;
	matches = regx_converter.Adjust_balanced(matches);

	Bry_bfr tmp_bfr = Bry_bfr_.New();
	int prv_end = 0;	// end of previous match; unmatched text is copied from here
	for (Regx_match match : matches) {
		// stop if repl_count reaches limit; note that limit = -1 by default, unless specified
		if (repl_count == limit) break;

		// add text between previous match and this match
		tmp_bfr.Add_str_u8(String_.Mid(text, prv_end, match.Find_bgn()));

		// add replacement; if none was added, re-add the matched text; will be false when gsub_proc returns nothing; PAGE:en.d:tracer PAGE:en.d:שלום DATE:2017-04-22;
		boolean replaced = Exec_repl_itm(tmp_bfr, repl_tid, text, match);
		if (!replaced)
			tmp_bfr.Add_str_u8(String_.Mid(text, match.Find_bgn(), match.Find_end()));

		prv_end = match.Find_end();
		repl_count++;
	}

	// add text after last match
	int text_len = String_.Len(text);
	if (prv_end < text_len)
		tmp_bfr.Add_str_u8(String_.Mid(text, prv_end, text_len));
	return tmp_bfr.To_str_and_clear();
}
// Adds the replacement for one match to tmp_bfr; dispatches on repl_tid to one helper per replacement kind
// Returns false only when the lua callback returns no results (caller then re-adds the matched text); throws Err if repl_tid is unhandled
private boolean Exec_repl_itm(Bry_bfr tmp_bfr, byte repl_tid, String text, Regx_match match) {
	switch (repl_tid) {
		case Repl_tid_string:	Exec_repl_itm__string(tmp_bfr, text, match); return true;
		case Repl_tid_table:	Exec_repl_itm__table(tmp_bfr, text, match); return true;
		case Repl_tid_luacbk:	return Exec_repl_itm__luacbk(tmp_bfr, text, match);
		default:				throw Err_.new_unhandled(repl_tid);
	}
}
// String @repl: copies repl_bry to tmp_bfr; expands "%0" to the match, "%1" - "%9" to a capture group, and "%%" to "%"
private void Exec_repl_itm__string(Bry_bfr tmp_bfr, String text, Regx_match match) {
	int len = repl_bry.length;
	for (int i = 0; i < len; i++) {
		byte b = repl_bry[i];
		switch (b) {
			case Byte_ascii.Percent: {
				++i;
				if (i == len)	// % at end of stream; just add %;
					tmp_bfr.Add_byte(b);
				else {
					b = repl_bry[i];
					switch (b) {
						case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4:
						case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9: {
							int idx = b - Byte_ascii.Num_0;
							if (idx == 0)	// NOTE: 0 means take result; REF.MW:if ($x === '0'); return $m[0]; PAGE:Wikipedia:Wikipedia_Signpost/Templates/Voter/testcases; DATE:2015-08-02
								tmp_bfr.Add_str_u8(String_.Mid(text, match.Find_bgn(), match.Find_end()));
							else {			// NOTE: > 0 means get from groups if it exists; REF.MW:elseif (isset($m["m$x"])) return $m["m$x"]; PAGE:Wikipedia:Wikipedia_Signpost/Templates/Voter/testcases; DATE:2015-08-02
								idx -= List_adp_.Base1;
								if (idx < match.Groups().length) {	// retrieve numbered capture; TODO_OLD: support more than 9 captures
									Regx_group grp = match.Groups()[idx];
									tmp_bfr.Add_str_u8(String_.Mid(text, grp.Bgn(), grp.End()));	// NOTE: grp.Bgn() / .End() is for String pos (bry pos will fail for utf8 strings)
								}
								else {								// no such group; add "%" and digit as literal
									tmp_bfr.Add_byte(Byte_ascii.Percent);
									tmp_bfr.Add_byte(b);
								}
							}
							break;
						}
						case Byte_ascii.Percent:	// "%%" -> "%"
							tmp_bfr.Add_byte(Byte_ascii.Percent);
							break;
						default:					// not a number; add literal
							tmp_bfr.Add_byte(Byte_ascii.Percent);
							tmp_bfr.Add_byte(b);
							break;
					}
				}
				break;
			}
			default:
				tmp_bfr.Add_byte(b);
				break;
		}
	}
}
// Table @repl: looks up the matched text (or the 1st group's text, if groups exist) in repl_hash; adds the found val, else the looked-up text itself
private void Exec_repl_itm__table(Bry_bfr tmp_bfr, String text, Regx_match match) {
	int match_bgn = -1, match_end = -1;
	Regx_group[] grps = match.Groups();
	if (grps.length == 0) {
		match_bgn = match.Find_bgn();
		match_end = match.Find_end();
	}
	else {	// group exists, take first one (logic matches Scribunto); PAGE:en.w:Bannered_routes_of_U.S._Route_60; DATE:2014-08-15
		Regx_group grp = grps[0];
		match_bgn = grp.Bgn();
		match_end = grp.End();
	}
	String find_str = String_.Mid(text, match_bgn, match_end);	// NOTE: rslt.Bgn() / .End() is for String pos (bry pos will fail for utf8 strings)
	Object actl_repl_obj = repl_hash.Get_by(find_str);
	if (actl_repl_obj == null)	// match found, but no replacement specified; EX:"abc", "[ab]", "a:A"; "b" in regex but not in tbl; EX:d:DVD; DATE:2014-03-31
		tmp_bfr.Add_str_u8(find_str);
	else
		tmp_bfr.Add((byte[])actl_repl_obj);
}
// Func @repl: calls repl_func with the match (or one arg per group) and adds the 1st result as a String; returns false if the call returns no results
private boolean Exec_repl_itm__luacbk(Bry_bfr tmp_bfr, String text, Regx_match match) {
	Keyval[] luacbk_args = null;
	Regx_group[] grps = match.Groups();
	int grps_len = grps.length;
	// no grps; pass 1 arg based on @match: EX: ("ace", "[b-d]"); args -> ("c")
	if (grps_len == 0) {
		String find_str = String_.Mid(text, match.Find_bgn(), match.Find_end());
		luacbk_args = Scrib_kv_utl_.base1_obj_(find_str);
	}
	// grps exist; pass n args based on grp[n].match; EX: ("acfg", "([b-d])([e-g])"); args -> ("c", "f")
	else {
		// memoize any_pos args for loop
		boolean any_pos = regx_converter.Any_pos();
		Keyval[] capt_ary = regx_converter.Capt_ary();
		int capt_ary_len = capt_ary.length;
		// loop grps; for each grp, create corresponding arg in luacbk
		luacbk_args = new Keyval[grps_len];
		for (int i = 0; i < grps_len; i++) {
			Regx_group grp = grps[i];
			// anypos will create @offset arg; everything else creates a @match arg based on grp
			// NOTE(review): @offset is grp.Bgn() unadjusted; Lua position captures are 1-based -- confirm whether a Base1 adjustment is needed
			Object val = any_pos && i < capt_ary_len && Bool_.Cast(capt_ary[i].Val())
				? (Object)grp.Bgn()
				: (Object)String_.Mid(text, grp.Bgn(), grp.End());
			luacbk_args[i] = Keyval_.int_(i + Scrib_core.Base_1, val);
		}
	}
	// do callback
	Keyval[] rslts = core.Interpreter().CallFunction(repl_func.Id(), luacbk_args);
	// eval result
	if (rslts.length == 0)	// will be 0 when gsub_proc returns nil; PAGE:en.d:tracer; DATE:2017-04-22
		return false;
	Object rslt_obj = rslts[0].Val();	// 0th idx has result
	tmp_bfr.Add_str_u8(Object_.Xto_str_strict_or_empty(rslt_obj));	// NOTE: always convert to String; rslt_obj can be int; PAGE:en.d:seven DATE:2016-04-27
	return true;
}
private static final byte Repl_tid_null = 0, Repl_tid_string = 1, Repl_tid_table = 2, Repl_tid_luacbk = 3;
public static final Scrib_lib_ustring_gsub_mgr[] Ary_empty = new Scrib_lib_ustring_gsub_mgr[0];
}

View File

@ -37,6 +37,8 @@ public class Scrib_regx_converter {
int len = pat.length;
int grps_len = 0;
int bct = 0;
// REF.MW: https://github.com/wikimedia/mediawiki-extensions-Scribunto/blob/master/includes/engines/LuaCommon/UstringLibrary.php#L415
for (int i = 0; i < len; i++) {
int i_end = i + 1;
q_flag = false; // must be reset; REF.MW:UstringLibrary.php|patternToRegex; DATE:2014-02-08
@ -44,24 +46,28 @@ public class Scrib_regx_converter {
switch (cur) {
case Byte_ascii.Pow:
q_flag = i != 0;
bfr.Add((anchor == Anchor_null || q_flag) ? Bry_pow_escaped : anchor); // NOTE: must add anchor \G when using offsets; EX:cs.n:Category:1._zárí_2008; DATE:2014-05-07
bfr.Add((anchor == Anchor_null || q_flag) ? Bry_pow_escaped : anchor); // NOTE: must add anchor \G when using offsets; EX:cs.n:Category:1._zárí_2008; DATE:2014-05-07
break;
case Byte_ascii.Dollar:
q_flag = i < len - 1;
bfr.Add(q_flag ? Bry_dollar_escaped : Bry_dollar_literal);
break;
case Byte_ascii.Paren_bgn: {
// fail if "(EOS"
if (i + 1 >= len)
throw Err_.new_wo_type("Unmatched open-paren at pattern character " + Int_.To_str(i_end));
int grp_idx = grp_mgr.Capt__len() + 1;
boolean is_empty_capture = pat[i + 1] == Byte_ascii.Paren_end; // current is "()"
// check for "()"; enables anypos flag
boolean is_empty_capture = pat[i + 1] == Byte_ascii.Paren_end;
if (is_empty_capture)
any_pos = true;
bfr.Add_byte(Byte_ascii.Paren_bgn); // $re .= "(?<m$n>";
grp_mgr.Capt__add__real(grp_idx, is_empty_capture);
bfr.Add_byte(Byte_ascii.Paren_bgn); // $re .= "(?<m$n>";
break;
}
case Byte_ascii.Paren_end:
// fail if ")" without preceding "("
if (grp_mgr.Open__len() <= 0)
throw Err_.new_wo_type("Unmatched close-paren at pattern character " + Int_.To_str(i_end));
grp_mgr.Open__pop();