diff --git a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__gsub__tst.java b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__gsub__tst.java index 4e2cb8cab..6f77b0708 100644 --- a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__gsub__tst.java +++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__gsub__tst.java @@ -61,6 +61,10 @@ public class Scrib_lib_ustring__gsub__tst { @Test public void Replace__double() { // PURPOSE: do not fail if double is passed in for @replace; PAGE:de.v:Wikivoyage:Wikidata/Test_Modul:Wikidata2 DATE:2016-04-21 Exec_gsub("abcd", 1 , -1, 1.23d , "abcd;0"); } + @Test public void Replace__utf8() { // PURPOSE:do not cut off utf8-strings PAGE:en.d:𠮟 DATE:2020-05-31 + String regx = "^[\t]*(.-)[\t]*$"; // from mwtext.trim + Exec_gsub("𠮟a", regx, -1, "%1", "𠮟a;1"); // fails with "𠮟;1" + } @Test public void Replace__anypos() { // PURPOSE:LUAJ_PATTERN_REPLACEMENT; DATE:2019-04-16 Exec_gsub("'''a'''b", "()'''(.-'*)'''", 1, "z", "zb;1"); } @@ -171,6 +175,17 @@ public class Scrib_lib_ustring__gsub__tst { fxt.Init__cbk(proc); Exec_gsub(text, regx, -1, proc.To_scrib_lua_proc(), expd); } + @Test public void Luacbk__anypos__utf8() { // PURPOSE:handle UTF-8 chars with anypos match ISSUE#:726; DATE:2020-05-29 + String text = "𤭢 a"; + String regx = "()[𤭢a]()"; + String expd = "B C;2"; + Mock_proc__verify_args proc = new Mock_proc__verify_args(0 + , new Object[]{"B", 1, 2} // fails if 3 instead of 2 + , new Object[]{"C", 3, 4} + ); + fxt.Init__cbk(proc); + Exec_gsub(text, regx, -1, proc.To_scrib_lua_proc(), expd); + } // Mock_proc__verify_args proc = new Mock_proc__verify_args(0, new Object[]{"x", "{{yes2}}"}, new Object[]{"x", "{{flagicon|USA}}"}); // fxt.Init__cbk(proc); // Exec_gsub("}\n|-\n|28\n|{{yes2}}Win\n|28–0\n|style=\"text-align:left;\"|{{flagicon|USA}}", "%b{}", -1, proc.To_scrib_lua_proc(), "}axbx;2"); } diff --git a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring_gsub_mgr.java 
b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring_gsub_mgr.java index 6606926aa..57652f03d 100644 --- a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring_gsub_mgr.java +++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring_gsub_mgr.java @@ -1,6 +1,6 @@ /* XOWA: the XOWA Offline Wiki Application -Copyright (C) 2012-2017 gnosygnu@gmail.com +Copyright (C) 2012-2020 gnosygnu@gmail.com XOWA is licensed under the terms of the General Public License (GPL) Version 3, or alternatively under the terms of the Apache License Version 2.0. @@ -13,11 +13,34 @@ The terms of each license can be found in the source code repository: GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt */ -package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*; -import gplx.langs.regxs.*; -import gplx.objects.strings.unicodes.*; -import gplx.xowa.xtns.scribunto.libs.patterns.*; -import gplx.xowa.xtns.scribunto.procs.*; +package gplx.xowa.xtns.scribunto.libs; + +import gplx.Bool_; +import gplx.Bry_; +import gplx.Bry_bfr; +import gplx.Byte_ascii; +import gplx.Char_; +import gplx.Double_; +import gplx.Err_; +import gplx.Hash_adp; +import gplx.Hash_adp_; +import gplx.Int_; +import gplx.Keyval; +import gplx.Keyval_; +import gplx.List_adp_; +import gplx.Object_; +import gplx.String_; +import gplx.Type_; +import gplx.langs.regxs.Regx_group; +import gplx.langs.regxs.Regx_match; +import gplx.objects.strings.unicodes.Ustring_; +import gplx.xowa.xtns.scribunto.Scrib_core; +import gplx.xowa.xtns.scribunto.Scrib_kv_utl_; +import gplx.xowa.xtns.scribunto.Scrib_lua_proc; +import gplx.xowa.xtns.scribunto.libs.patterns.Scrib_pattern_matcher; +import gplx.xowa.xtns.scribunto.procs.Scrib_proc_args; +import gplx.xowa.xtns.scribunto.procs.Scrib_proc_rslt; + public class Scrib_lib_ustring_gsub_mgr { // 
THREAD.UNSAFE:LOCAL_VALUES private final Scrib_core core; private String src_str; @@ -108,7 +131,7 @@ public class Scrib_lib_ustring_gsub_mgr { // THREAD.UNSAFE:LOCAL_VALUES // NOTE: > 0 means get from groups if it exists; REF.MW:elseif (isset($m["m$x"])) return $m["m$x"]; PAGE:Wikipedia:Wikipedia_Signpost/Templates/Voter/testcases; DATE:2015-08-02 else if (idx - 1 < match.Groups().length) { // retrieve numbered capture; TODO_OLD: support more than 9 captures Regx_group grp = match.Groups()[idx - 1]; - tmp_bfr.Add_str_u8(String_.Mid(src_str, grp.Bgn(), grp.End())); // NOTE: grp.Bgn() / .End() is for String pos (bry pos will fail for utf8 strings) + tmp_bfr.Add_str_u8(grp.Val()); // NOTE: changed from String_.Mid(src_str, grp.Bgn(), grp.End()); DATE:2020-05-31 } // NOTE: 1 per MW "Match undocumented Lua String.gsub behavior"; PAGE:en.d:Wiktionary:Scripts ISSUE#:393; DATE:2019-03-20 else if (idx == 1) { @@ -136,18 +159,15 @@ public class Scrib_lib_ustring_gsub_mgr { // THREAD.UNSAFE:LOCAL_VALUES } break; case Repl_tid_table: { - int match_bgn = -1, match_end = -1; Regx_group[] grps = match.Groups(); + String find_str = null; if (grps.length == 0) { - match_bgn = match.Find_bgn(); - match_end = match.Find_end(); + find_str = String_.Mid(src_str, match.Find_bgn(), match.Find_end()); // NOTE: rslt.Bgn() / .End() is for String pos (bry pos will fail for utf8 strings) } else { // group exists, take first one (logic matches Scribunto); PAGE:en.w:Bannered_routes_of_U.S._Route_60; DATE:2014-08-15 Regx_group grp = grps[0]; - match_bgn = grp.Bgn(); - match_end = grp.End(); + find_str = grp.Val(); } - String find_str = String_.Mid(src_str, match_bgn, match_end); // NOTE: rslt.Bgn() / .End() is for String pos (bry pos will fail for utf8 strings) Object actl_repl_obj = repl_hash.Get_by(find_str); if (actl_repl_obj == null) // match found, but no replacement specified; EX:"abc", "[ab]", "a:A"; "b" in regex but not in tbl; EX:d:DVD; DATE:2014-03-31 tmp_bfr.Add_str_u8(find_str); @@ 
-176,10 +196,17 @@ public class Scrib_lib_ustring_gsub_mgr { // THREAD.UNSAFE:LOCAL_VALUES for (int i = 0; i < grps_len; i++) { Regx_group grp = grps[i]; - // anypos will create @offset arg; everything else creates a @match arg based on grp - Object val = any_pos && i < capt_ary_len && Bool_.Cast(capt_ary[i].Val()) - ? (Object)(grp.Bgn() + List_adp_.Base1) // NOTE: must normalize to base-1 b/c lua callbacks expect base-1 arguments, not base-0; ISSUE#:726; DATE:2020-05-17; - : (Object)String_.Mid(src_str, grp.Bgn(), grp.End()); + // anypos will create @offset arg; everything else creates a @match arg based on grp; FOOTNOTE:CAPTURES + boolean anyposExists = any_pos && i < capt_ary_len && Bool_.Cast(capt_ary[i].Val()); + Object val = null; + if (anyposExists) { + // emptyCapture ("anypos" or `()`) must pass integer position; must normalize to base-1 b/c lua callbacks expect base-1 arguments, not base-0; ISSUE#:726; DATE:2020-05-17; + val = (Object)(grp.Bgn() + List_adp_.Base1); + } + else { + // standardCapture must pass string match + val = grp.Val(); + } luacbk_args[i] = Keyval_.int_(i + Scrib_core.Base_1, val); } } @@ -203,3 +230,30 @@ public class Scrib_lib_ustring_gsub_mgr { // THREAD.UNSAFE:LOCAL_VALUES private static final byte Repl_tid_null = 0, Repl_tid_string = 1, Repl_tid_table = 2, Repl_tid_luacbk = 3; public static final Scrib_lib_ustring_gsub_mgr[] Ary_empty = new Scrib_lib_ustring_gsub_mgr[0]; } +/* +== FOOTNOTE:CAPTURES [ISSUE#:726; DATE:2020-05-17] == +There are two types of captures: +* '''basicCaptures''': EX: given `abcd`, `a(bc)d` captureValues will be 1, 3 b/c `(bc)` captures the start / end of the match +* '''emptyCaptures''': EX: given `abcd`, `()bcd` captureValues will be 1, 2 b/c `()` captures the position of the match + +The above captureValues are base0 b/c Str_find_mgr__xowa uses base0 +* Keep in mind that XOWA is base0 b/c it works directly with byte arrays and needs base0 to index into these 0-based arrays + +In contrast, Lua is base1.
However, this base1-ness is not exposed anywhere, except in gsub's FunctionCallback. +Even then, it is only exposed for emptyCaptures, not basicCaptures due to how Lua passes parameters + +For example, consider this code: +``` +function p.test_726_anypos() + mw.ustring.gsub("abcd", "a(bc)d", function(arg1) + mw.log('basic', arg1); -- arg1 is the matched string or "bc" + end) + + mw.ustring.gsub("abcd", "()bcd", function(arg1) + mw.log('empty', arg1); -- arg1 is the position of the empty capture or "2" + end) +end +``` + +SEE:FOOTNOTE:REGX_GROUP +*/ \ No newline at end of file diff --git a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/patterns/Scrib_pattern_matcher__xowa.java b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/patterns/Scrib_pattern_matcher__xowa.java index cfc06399c..c4fe4934c 100644 --- a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/patterns/Scrib_pattern_matcher__xowa.java +++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/patterns/Scrib_pattern_matcher__xowa.java @@ -122,15 +122,35 @@ class Scrib_pattern_matcher__xowa extends Scrib_pattern_matcher { for (int i = 0; i < captures_len; i += 2) { int capture_bgn = captures[i]; int capture_end = captures[i + 1]; - // ISSUE#:726; DATE:2020-05-17; - // NOTE: capture values are base-0 and are added by any pattern captures, including: - // * standard captures EX: `a(bc)d` for `abcd` will have 1, 3 - // * empty captures EX: `()bc` for `abcd` will have 1, 2 - // Note that empty captures will be normalized to base-1 in Scrib_lib_ustring_gsub_mgr inside the any_pos code - capture_bgn = src_ucs.Map_data_to_char(capture_bgn); - capture_end = src_ucs.Map_data_to_char(capture_end); - groups[i / 2] = new Regx_group(true, capture_bgn, capture_end, String_.Mid(src_ucs.Src(), capture_bgn, capture_end)); + // FOOTNOTE:REGX_GROUP + int bgn_in_chars = src_ucs.Map_data_to_char(capture_bgn); + int end_in_chars = src_ucs.Map_data_to_char(capture_end); + String val = String_.Mid(src_ucs.Src(), bgn_in_chars, end_in_chars); + groups[i /
2] = new Regx_group(true, capture_bgn, capture_end, val); } return groups; } } + +/* +== FOOTNOTE:REGX_GROUP [ISSUE#:726; DATE:2020-05-17] == +The XOWA Regx_group is a quasi-adapter for java.util.regex.Matcher and its group-related methods. + +Consider a Regx_group with varName `grp` and a Matcher with varName `match` +* `grp.Bgn()` <- `match.start()` +* `grp.End()` <- `match.end()` +* `grp.Val()` <- `match.group(i)` + +Note that all callers of `grp` would be expecting REGEX convention (not LUA pattern convention). As such: +* '''base0''': `grp.Bgn()` and `grp.End()` must be base0 not base1 (REGEX is base0) +** Fortunately, Str_find_mgr__xowa uses base0, so there is no need to convert from base1 to base0 +** However, Scrib_lib_ustring_gsub_mgr will convert base0 to base1 in the gsub FunctionCallback code '''IF''' anypos is present in the pattern +* '''charIndexes''': `grp.Bgn()` and `grp.End()` should represent charIndexes, not byteIndexes (REGEX is chars) +** Str_find_mgr__xowa uses codepointIndexes b/c of Ustring_ucs +** In theory, should convert to charIndexes b/c REGEX uses charIndexes. However: +*** Regx_group.Bgn() is only used by anypos for LuaCallbacks +*** anypos needs codepointIndexes +*** so, be lazy, and don't bother double converting to charIndex only to convert back to codepointIndex + +SEE:FOOTNOTE:CAPTURES +*/ \ No newline at end of file