Scribunto: Create match string using char-indexes, not codepointIndexes (to handle UTF-8 strings) [#726]

staging
gnosygnu 4 years ago
parent be072de8d9
commit 35f2027b20

@ -61,6 +61,10 @@ public class Scrib_lib_ustring__gsub__tst {
@Test public void Replace__double() { // PURPOSE: do not fail if double is passed in for @replace; PAGE:de.v:Wikivoyage:Wikidata/Test_Modul:Wikidata2 DATE:2016-04-21 @Test public void Replace__double() { // PURPOSE: do not fail if double is passed in for @replace; PAGE:de.v:Wikivoyage:Wikidata/Test_Modul:Wikidata2 DATE:2016-04-21
Exec_gsub("abcd", 1 , -1, 1.23d , "abcd;0"); Exec_gsub("abcd", 1 , -1, 1.23d , "abcd;0");
} }
@Test public void Replace__utf8() { // PURPOSE:gsub must not truncate a string that contains a surrogate-pair char; PAGE:en.d:𠮟 DATE:2020-05-31
// pattern taken from mwtext.trim: optional leading tabs, lazy capture, optional trailing tabs
String trim_pattern = "^[\t]*(.-)[\t]*$";
// expected: capture %1 returns the whole source, with 1 replacement; the regression produced "𠮟;1" (trailing "a" lost)
Exec_gsub("𠮟a", trim_pattern, -1, "%1", "𠮟a;1");
}
@Test public void Replace__anypos() { // PURPOSE:LUAJ_PATTERN_REPLACEMENT; DATE:2019-04-16 @Test public void Replace__anypos() { // PURPOSE:LUAJ_PATTERN_REPLACEMENT; DATE:2019-04-16
Exec_gsub("'''a'''b", "()'''(.-'*)'''", 1, "z", "zb;1"); Exec_gsub("'''a'''b", "()'''(.-'*)'''", 1, "z", "zb;1");
} }
@ -171,6 +175,17 @@ public class Scrib_lib_ustring__gsub__tst {
fxt.Init__cbk(proc); fxt.Init__cbk(proc);
Exec_gsub(text, regx, -1, proc.To_scrib_lua_proc(), expd); Exec_gsub(text, regx, -1, proc.To_scrib_lua_proc(), expd);
} }
@Test public void Luacbk__anypos__utf8() { // PURPOSE:anypos captures must report correct positions when the source has a surrogate-pair char ISSUE#:726; DATE:2020-05-29
// NOTE(review): each Object[] presumably holds {callback return value, expected arg 1, expected arg 2} -- confirm against Mock_proc__verify_args
Mock_proc__verify_args cbk = new Mock_proc__verify_args(0
, new Object[]{"B", 1, 2} // 1st match ("𤭢"): end position must be 2; the regression reported 3
, new Object[]{"C", 3, 4} // 2nd match ("a")
);
fxt.Init__cbk(cbk);
// the pattern wraps the char class in two empty captures `()`, so the callback receives positions rather than strings
Exec_gsub("𤭢 a", "()[𤭢a]()", -1, cbk.To_scrib_lua_proc(), "B C;2");
}
// Mock_proc__verify_args proc = new Mock_proc__verify_args(0, new Object[]{"x", "{{yes2}}"}, new Object[]{"x", "{{flagicon|USA}}"}); // Mock_proc__verify_args proc = new Mock_proc__verify_args(0, new Object[]{"x", "{{yes2}}"}, new Object[]{"x", "{{flagicon|USA}}"});
// fxt.Init__cbk(proc); // fxt.Init__cbk(proc);
// Exec_gsub("}\n|-\n|28\n|{{yes2}}Win\n|280\n|style=\"text-align:left;\"|{{flagicon|USA}}", "%b{}", -1, proc.To_scrib_lua_proc(), "}axbx;2"); } // Exec_gsub("}\n|-\n|28\n|{{yes2}}Win\n|280\n|style=\"text-align:left;\"|{{flagicon|USA}}", "%b{}", -1, proc.To_scrib_lua_proc(), "}axbx;2"); }

@ -1,6 +1,6 @@
/* /*
XOWA: the XOWA Offline Wiki Application XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com Copyright (C) 2012-2020 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3, XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0. or alternatively under the terms of the Apache License Version 2.0.
@ -13,11 +13,34 @@ The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/ */
package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*; package gplx.xowa.xtns.scribunto.libs;
import gplx.langs.regxs.*;
import gplx.objects.strings.unicodes.*; import gplx.Bool_;
import gplx.xowa.xtns.scribunto.libs.patterns.*; import gplx.Bry_;
import gplx.xowa.xtns.scribunto.procs.*; import gplx.Bry_bfr;
import gplx.Byte_ascii;
import gplx.Char_;
import gplx.Double_;
import gplx.Err_;
import gplx.Hash_adp;
import gplx.Hash_adp_;
import gplx.Int_;
import gplx.Keyval;
import gplx.Keyval_;
import gplx.List_adp_;
import gplx.Object_;
import gplx.String_;
import gplx.Type_;
import gplx.langs.regxs.Regx_group;
import gplx.langs.regxs.Regx_match;
import gplx.objects.strings.unicodes.Ustring_;
import gplx.xowa.xtns.scribunto.Scrib_core;
import gplx.xowa.xtns.scribunto.Scrib_kv_utl_;
import gplx.xowa.xtns.scribunto.Scrib_lua_proc;
import gplx.xowa.xtns.scribunto.libs.patterns.Scrib_pattern_matcher;
import gplx.xowa.xtns.scribunto.procs.Scrib_proc_args;
import gplx.xowa.xtns.scribunto.procs.Scrib_proc_rslt;
public class Scrib_lib_ustring_gsub_mgr { // THREAD.UNSAFE:LOCAL_VALUES public class Scrib_lib_ustring_gsub_mgr { // THREAD.UNSAFE:LOCAL_VALUES
private final Scrib_core core; private final Scrib_core core;
private String src_str; private String src_str;
@ -108,7 +131,7 @@ public class Scrib_lib_ustring_gsub_mgr { // THREAD.UNSAFE:LOCAL_VALUES
// NOTE: > 0 means get from groups if it exists; REF.MW:elseif (isset($m["m$x"])) return $m["m$x"]; PAGE:Wikipedia:Wikipedia_Signpost/Templates/Voter/testcases; DATE:2015-08-02 // NOTE: > 0 means get from groups if it exists; REF.MW:elseif (isset($m["m$x"])) return $m["m$x"]; PAGE:Wikipedia:Wikipedia_Signpost/Templates/Voter/testcases; DATE:2015-08-02
else if (idx - 1 < match.Groups().length) { // retrieve numbered capture; TODO_OLD: support more than 9 captures else if (idx - 1 < match.Groups().length) { // retrieve numbered capture; TODO_OLD: support more than 9 captures
Regx_group grp = match.Groups()[idx - 1]; Regx_group grp = match.Groups()[idx - 1];
tmp_bfr.Add_str_u8(String_.Mid(src_str, grp.Bgn(), grp.End())); // NOTE: grp.Bgn() / .End() is for String pos (bry pos will fail for utf8 strings) tmp_bfr.Add_str_u8(grp.Val()); // NOTE: changed from String_.Mid(src_str, grp.Bgn(), grp.End()); DATE:2020-05-31
} }
// NOTE: 1 per MW "Match undocumented Lua String.gsub behavior"; PAGE:en.d:Wiktionary:Scripts ISSUE#:393; DATE:2019-03-20 // NOTE: 1 per MW "Match undocumented Lua String.gsub behavior"; PAGE:en.d:Wiktionary:Scripts ISSUE#:393; DATE:2019-03-20
else if (idx == 1) { else if (idx == 1) {
@ -136,18 +159,15 @@ public class Scrib_lib_ustring_gsub_mgr { // THREAD.UNSAFE:LOCAL_VALUES
} }
break; break;
case Repl_tid_table: { case Repl_tid_table: {
int match_bgn = -1, match_end = -1;
Regx_group[] grps = match.Groups(); Regx_group[] grps = match.Groups();
String find_str = null;
if (grps.length == 0) { if (grps.length == 0) {
match_bgn = match.Find_bgn(); find_str = String_.Mid(src_str, match.Find_bgn(), match.Find_end()); // NOTE: rslt.Bgn() / .End() is for String pos (bry pos will fail for utf8 strings)
match_end = match.Find_end();
} }
else { // group exists, take first one (logic matches Scribunto); PAGE:en.w:Bannered_routes_of_U.S._Route_60; DATE:2014-08-15 else { // group exists, take first one (logic matches Scribunto); PAGE:en.w:Bannered_routes_of_U.S._Route_60; DATE:2014-08-15
Regx_group grp = grps[0]; Regx_group grp = grps[0];
match_bgn = grp.Bgn(); find_str = grp.Val();
match_end = grp.End();
} }
String find_str = String_.Mid(src_str, match_bgn, match_end); // NOTE: rslt.Bgn() / .End() is for String pos (bry pos will fail for utf8 strings)
Object actl_repl_obj = repl_hash.Get_by(find_str); Object actl_repl_obj = repl_hash.Get_by(find_str);
if (actl_repl_obj == null) // match found, but no replacement specified; EX:"abc", "[ab]", "a:A"; "b" in regex but not in tbl; EX:d:DVD; DATE:2014-03-31 if (actl_repl_obj == null) // match found, but no replacement specified; EX:"abc", "[ab]", "a:A"; "b" in regex but not in tbl; EX:d:DVD; DATE:2014-03-31
tmp_bfr.Add_str_u8(find_str); tmp_bfr.Add_str_u8(find_str);
@ -176,10 +196,17 @@ public class Scrib_lib_ustring_gsub_mgr { // THREAD.UNSAFE:LOCAL_VALUES
for (int i = 0; i < grps_len; i++) { for (int i = 0; i < grps_len; i++) {
Regx_group grp = grps[i]; Regx_group grp = grps[i];
// anypos will create @offset arg; everything else creates a @match arg based on grp // anypos will create @offset arg; everything else creates a @match arg based on grp; FOOTNOTE:CAPTURES
Object val = any_pos && i < capt_ary_len && Bool_.Cast(capt_ary[i].Val()) boolean anyposExists = any_pos && i < capt_ary_len && Bool_.Cast(capt_ary[i].Val());
? (Object)(grp.Bgn() + List_adp_.Base1) // NOTE: must normalize to base-1 b/c lua callbacks expect base-1 arguments, not base-0; ISSUE#:726; DATE:2020-05-17; Object val = null;
: (Object)String_.Mid(src_str, grp.Bgn(), grp.End()); if (anyposExists) {
// emptyCapture ("anypos" or `()`) must pass integer position; must normalize to base-1 b/c lua callbacks expect base-1 arguments, not base-0; ISSUE#:726; DATE:2020-05-17;
val = (Object)(grp.Bgn() + List_adp_.Base1);
}
else {
// standardCapture must pass string match
val = grp.Val();
}
luacbk_args[i] = Keyval_.int_(i + Scrib_core.Base_1, val); luacbk_args[i] = Keyval_.int_(i + Scrib_core.Base_1, val);
} }
} }
@ -203,3 +230,30 @@ public class Scrib_lib_ustring_gsub_mgr { // THREAD.UNSAFE:LOCAL_VALUES
private static final byte Repl_tid_null = 0, Repl_tid_string = 1, Repl_tid_table = 2, Repl_tid_luacbk = 3; private static final byte Repl_tid_null = 0, Repl_tid_string = 1, Repl_tid_table = 2, Repl_tid_luacbk = 3;
public static final Scrib_lib_ustring_gsub_mgr[] Ary_empty = new Scrib_lib_ustring_gsub_mgr[0]; public static final Scrib_lib_ustring_gsub_mgr[] Ary_empty = new Scrib_lib_ustring_gsub_mgr[0];
} }
/*
== FOOTNOTE:CAPTURES [ISSUE#:726; DATE:2020-05-17] ==
There are two types of captures:
* '''basicCaptures''': EX: given `abcd`, `a(bc)d` captureValues will be 1, 3 b/c `(bc)` captures the start / end of the match
* '''emptyCaptures''': EX: given `abcd`, `()bcd` captureValues will be 1, 2 b/c `()` captures the position of the match
The above captureValues are base0 b/c Str_find_mgr__xowa uses base0
* Keep in mind that XOWA is base0 b/c it works directly with byte arrays and needs base0 to index into these 0-based arrays
In contrast, Lua is base1. However, this base1-ness is not exposed anywhere, except in gsub's FunctionCallback.
Even then, it is only exposed for emptyCaptures, not basicCaptures due to how Lua passes parameters
For example, consider this code:
```
function p.test_726_anypos()
mw.ustring.gsub("abcd", "a(bc)d", function(arg1)
mw.log('basic', arg1); -- arg1 is the matched string or "bc"
end)
mw.ustring.gsub("abcd", "()bcd", function(arg1)
mw.log('empty', arg1); -- arg1 is the position of the empty capture or "2"
end)
end
```
SEE:FOOTNOTE:REGX_GROUP
*/

@ -122,15 +122,35 @@ class Scrib_pattern_matcher__xowa extends Scrib_pattern_matcher {
for (int i = 0; i < captures_len; i += 2) { for (int i = 0; i < captures_len; i += 2) {
int capture_bgn = captures[i]; int capture_bgn = captures[i];
int capture_end = captures[i + 1]; int capture_end = captures[i + 1];
// ISSUE#:726; DATE:2020-05-17; // FOOTNOTE:REGX_GROUP
// NOTE: capture values are base-0 and are added by any pattern captures, including: int bgn_in_chars = src_ucs.Map_data_to_char(capture_bgn);
// * standard captures EX: `a(bc)d` for `abcd` will have 1, 3 int end_in_chars = src_ucs.Map_data_to_char(capture_end);
// * empty captures EX: `()bc` for `abcd` will have 1, 2 String val = String_.Mid(src_ucs.Src(), bgn_in_chars, end_in_chars);
// Note that empty captures will be normalized to base-1 in Scrib_lib_ustring_gsub_mgr inside the any_pos code groups[i / 2] = new Regx_group(true, capture_bgn, capture_end, val);
capture_bgn = src_ucs.Map_data_to_char(capture_bgn);
capture_end = src_ucs.Map_data_to_char(capture_end);
groups[i / 2] = new Regx_group(true, capture_bgn, capture_end, String_.Mid(src_ucs.Src(), capture_bgn, capture_end));
} }
return groups; return groups;
} }
} }
/*
== FOOTNOTE:REGX_GROUP [ISSUE#:726; DATE:2020-05-17] ==
The XOWA Regx_group is a quasi-adapter for java.util.regex.Matcher and its group-related methods.
Consider a Regx_group with varName `grp` and a Matcher with varName `match`
* `grp.Bgn()` <- `match.start()`
* `grp.End()` <- `match.end()`
* `grp.Val()` <- `match.group(i)`
Note that all callers of `grp` would be expecting REGEX convention (not LUA pattern convention). As such:
* '''base0''': `grp.Bgn()` and `grp.End()` must be base0 not base1 (REGEX is base0)
** Fortunately, Str_find_mgr__xowa uses base0, so there is no need to convert from base1 to base0
** However, Scrib_lib_ustring_gsub_mgr will convert base0 to base1 in the gsub FunctionCallback code '''IF''' anypos is present in the pattern
* '''charIndexes''': `grp.Bgn()` and `grp.End()` should represent charIndexes, not byteIndexes (REGEX is chars)
** Str_find_mgr__xowa uses codepointIndexes b/c of Ustring_ucs
** In theory, should convert to charIndexes b/c REGEX uses charIndexes. However:
*** Regx_group.Bgn() is only used by anypos for LuaCallbacks
*** anypos needs codepointIndexes
*** so, be lazy, and don't bother double converting to charIndex only to convert back to codepointIndex
SEE:FOOTNOTE:CAPTURES
*/
Loading…
Cancel
Save