Scribunto: Create match string using char-indexes, not codepointIndexes (to handle UTF-8 strings) [#726]

staging
gnosygnu 4 years ago
parent be072de8d9
commit 35f2027b20

@ -61,6 +61,10 @@ public class Scrib_lib_ustring__gsub__tst {
// Regression test: gsub must not throw when the replacement argument is a double rather than a String / table / callback.
// NOTE(review): Exec_gsub arguments appear to be (text, regx, limit, repl, expd) judging by the other calls in this class -- confirm against the helper
// NOTE(review): the expected value looks like "<result>;<replacement count>" -- here the text is unchanged with 0 replacements
@Test public void Replace__double() { // PURPOSE: do not fail if double is passed in for @replace; PAGE:de.v:Wikivoyage:Wikidata/Test_Modul:Wikidata2 DATE:2016-04-21
Exec_gsub("abcd", 1 , -1, 1.23d , "abcd;0");
}
// Regression test: a "%1" capture replacement must return the whole captured text when the source contains 𠮟.
// Before the fix, the result was truncated to "𠮟;1" (the trailing "a" was cut off) instead of "𠮟a;1".
// NOTE(review): 𠮟 is presumably a supplementary-plane character (2 Java chars, 1 codepoint), which would explain the index mismatch -- confirm
@Test public void Replace__utf8() { // PURPOSE:do not cut off utf8-strings PAGE:en.d:𠮟 DATE:2020-05-31
String regx = "^[\t]*(.-)[\t]*$"; // from mwtext.trim
Exec_gsub("𠮟a", regx, -1, "%1", "𠮟a;1"); // fails with "𠮟;1"
}
// Checks a String replacement when the pattern starts with an empty capture `()` (aka "anypos").
// The whole match `'''a'''` is expected to be replaced by "z", leaving "b" untouched; the third argument is 1 instead of the usual -1.
// NOTE(review): the third argument is presumably the max number of replacements -- confirm against Exec_gsub
@Test public void Replace__anypos() { // PURPOSE:LUAJ_PATTERN_REPLACEMENT; DATE:2019-04-16
Exec_gsub("'''a'''b", "()'''(.-'*)'''", 1, "z", "zb;1");
}
@ -171,6 +175,17 @@ public class Scrib_lib_ustring__gsub__tst {
fxt.Init__cbk(proc);
Exec_gsub(text, regx, -1, proc.To_scrib_lua_proc(), expd);
}
// Regression test for ISSUE#:726: empty captures `()` passed to a Lua callback must be base-1 codepoint positions, not char positions.
// The pattern matches twice in "𤭢 a": once on 𤭢 and once on "a"; each match has two empty captures (before / after the matched character).
// NOTE(review): each Object[] presumably holds {value returned by the callback, expected arg 1, expected arg 2} -- confirm against Mock_proc__verify_args
@Test public void Luacbk__anypos__utf8() { // PURPOSE:handle UTF-8 chars with anypos match ISSUE#:726; DATE:2020-05-29
String text = "𤭢 a";
String regx = "()[𤭢a]()";
String expd = "B C;2";
Mock_proc__verify_args proc = new Mock_proc__verify_args(0
// 1st match (𤭢): the position after 𤭢 must be 2; a char-based index would give 3
, new Object[]{"B", 1, 2} // fails if 3 instead of 2
// 2nd match ("a"): positions 3 and 4
, new Object[]{"C", 3, 4}
);
fxt.Init__cbk(proc);
Exec_gsub(text, regx, -1, proc.To_scrib_lua_proc(), expd);
}
// Mock_proc__verify_args proc = new Mock_proc__verify_args(0, new Object[]{"x", "{{yes2}}"}, new Object[]{"x", "{{flagicon|USA}}"});
// fxt.Init__cbk(proc);
// Exec_gsub("}\n|-\n|28\n|{{yes2}}Win\n|280\n|style=\"text-align:left;\"|{{flagicon|USA}}", "%b{}", -1, proc.To_scrib_lua_proc(), "}axbx;2"); }

@ -1,6 +1,6 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
Copyright (C) 2012-2020 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
@ -13,11 +13,34 @@ The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*;
import gplx.langs.regxs.*;
import gplx.objects.strings.unicodes.*;
import gplx.xowa.xtns.scribunto.libs.patterns.*;
import gplx.xowa.xtns.scribunto.procs.*;
package gplx.xowa.xtns.scribunto.libs;
import gplx.Bool_;
import gplx.Bry_;
import gplx.Bry_bfr;
import gplx.Byte_ascii;
import gplx.Char_;
import gplx.Double_;
import gplx.Err_;
import gplx.Hash_adp;
import gplx.Hash_adp_;
import gplx.Int_;
import gplx.Keyval;
import gplx.Keyval_;
import gplx.List_adp_;
import gplx.Object_;
import gplx.String_;
import gplx.Type_;
import gplx.langs.regxs.Regx_group;
import gplx.langs.regxs.Regx_match;
import gplx.objects.strings.unicodes.Ustring_;
import gplx.xowa.xtns.scribunto.Scrib_core;
import gplx.xowa.xtns.scribunto.Scrib_kv_utl_;
import gplx.xowa.xtns.scribunto.Scrib_lua_proc;
import gplx.xowa.xtns.scribunto.libs.patterns.Scrib_pattern_matcher;
import gplx.xowa.xtns.scribunto.procs.Scrib_proc_args;
import gplx.xowa.xtns.scribunto.procs.Scrib_proc_rslt;
public class Scrib_lib_ustring_gsub_mgr { // THREAD.UNSAFE:LOCAL_VALUES
private final Scrib_core core;
private String src_str;
@ -108,7 +131,7 @@ public class Scrib_lib_ustring_gsub_mgr { // THREAD.UNSAFE:LOCAL_VALUES
// NOTE: > 0 means get from groups if it exists; REF.MW:elseif (isset($m["m$x"])) return $m["m$x"]; PAGE:Wikipedia:Wikipedia_Signpost/Templates/Voter/testcases; DATE:2015-08-02
else if (idx - 1 < match.Groups().length) { // retrieve numbered capture; TODO_OLD: support more than 9 captures
Regx_group grp = match.Groups()[idx - 1];
tmp_bfr.Add_str_u8(String_.Mid(src_str, grp.Bgn(), grp.End())); // NOTE: grp.Bgn() / .End() is for String pos (bry pos will fail for utf8 strings)
tmp_bfr.Add_str_u8(grp.Val()); // NOTE: changed from String_.Mid(src_str, grp.Bgn(), grp.End()); DATE:2020-05-31
}
// NOTE: 1 per MW "Match undocumented Lua String.gsub behavior"; PAGE:en.d:Wiktionary:Scripts ISSUE#:393; DATE:2019-03-20
else if (idx == 1) {
@ -136,18 +159,15 @@ public class Scrib_lib_ustring_gsub_mgr { // THREAD.UNSAFE:LOCAL_VALUES
}
break;
case Repl_tid_table: {
int match_bgn = -1, match_end = -1;
Regx_group[] grps = match.Groups();
String find_str = null;
if (grps.length == 0) {
match_bgn = match.Find_bgn();
match_end = match.Find_end();
find_str = String_.Mid(src_str, match.Find_bgn(), match.Find_end()); // NOTE: rslt.Bgn() / .End() is for String pos (bry pos will fail for utf8 strings)
}
else { // group exists, take first one (logic matches Scribunto); PAGE:en.w:Bannered_routes_of_U.S._Route_60; DATE:2014-08-15
Regx_group grp = grps[0];
match_bgn = grp.Bgn();
match_end = grp.End();
find_str = grp.Val();
}
String find_str = String_.Mid(src_str, match_bgn, match_end); // NOTE: rslt.Bgn() / .End() is for String pos (bry pos will fail for utf8 strings)
Object actl_repl_obj = repl_hash.Get_by(find_str);
if (actl_repl_obj == null) // match found, but no replacement specified; EX:"abc", "[ab]", "a:A"; "b" in regex but not in tbl; EX:d:DVD; DATE:2014-03-31
tmp_bfr.Add_str_u8(find_str);
@ -176,10 +196,17 @@ public class Scrib_lib_ustring_gsub_mgr { // THREAD.UNSAFE:LOCAL_VALUES
for (int i = 0; i < grps_len; i++) {
Regx_group grp = grps[i];
// anypos will create @offset arg; everything else creates a @match arg based on grp
Object val = any_pos && i < capt_ary_len && Bool_.Cast(capt_ary[i].Val())
? (Object)(grp.Bgn() + List_adp_.Base1) // NOTE: must normalize to base-1 b/c lua callbacks expect base-1 arguments, not base-0; ISSUE#:726; DATE:2020-05-17;
: (Object)String_.Mid(src_str, grp.Bgn(), grp.End());
// anypos will create @offset arg; everything else creates a @match arg based on grp; FOOTNOTE:CAPTURES
boolean anyposExists = any_pos && i < capt_ary_len && Bool_.Cast(capt_ary[i].Val());
Object val = null;
if (anyposExists) {
// emptyCapture ("anypos" or `()`) must pass integer position; must normalize to base-1 b/c lua callbacks expect base-1 arguments, not base-0; ISSUE#:726; DATE:2020-05-17;
val = (Object)(grp.Bgn() + List_adp_.Base1);
}
else {
// standardCapture must pass string match
val = grp.Val();
}
luacbk_args[i] = Keyval_.int_(i + Scrib_core.Base_1, val);
}
}
@ -203,3 +230,30 @@ public class Scrib_lib_ustring_gsub_mgr { // THREAD.UNSAFE:LOCAL_VALUES
private static final byte Repl_tid_null = 0, Repl_tid_string = 1, Repl_tid_table = 2, Repl_tid_luacbk = 3;
public static final Scrib_lib_ustring_gsub_mgr[] Ary_empty = new Scrib_lib_ustring_gsub_mgr[0];
}
/*
== FOOTNOTE:CAPTURES [ISSUE#:726; DATE:2020-05-17] ==
There are two types of captures:
* '''basicCaptures''': EX: given `abcd`, `a(bc)d` captureValues will be 1, 3 b/c `(bc)` captures the start / end of the match
* '''emptyCaptures''': EX: given `abcd`, `()bcd` captureValues will be 1, 2 b/c `()` captures the position of the match
The above captureValues are base0 b/c Str_find_mgr__xowa uses base0
* Keep in mind that XOWA is base0 b/c it works directly with byte arrays and needs base0 to index into these 0-based arrays
In contrast, Lua is base1. However, this base1-ness is not exposed anywhere, except in gsub's FunctionCallback.
Even then, it is only exposed for emptyCaptures, not basicCaptures, due to how Lua passes parameters.
For example, consider this code:
```
function p.test_726_anypos()
mw.ustring.gsub("abcd", "a(bc)d", function(arg1)
mw.log('basic', arg1); -- arg1 is the matched string; here, "bc"
end)
mw.ustring.gsub("abcd", "()bcd", function(arg1)
mw.log('empty', arg1); -- arg1 is the base1 position of the empty capture; here, 2
end)
end
```
SEE:FOOTNOTE:REGX_GROUP
*/

@ -122,15 +122,35 @@ class Scrib_pattern_matcher__xowa extends Scrib_pattern_matcher {
for (int i = 0; i < captures_len; i += 2) {
int capture_bgn = captures[i];
int capture_end = captures[i + 1];
// ISSUE#:726; DATE:2020-05-17;
// NOTE: capture values are base-0 and are added by any pattern captures, including:
// * standard captures EX: `a(bc)d` for `abcd` will have 1, 3
// * empty captures EX: `()bc` for `abcd` will have 1, 2
// Note that empty captures will be normalized to base-1 in Scrib_lib_ustring_gsub_mgr inside the any_pos code
capture_bgn = src_ucs.Map_data_to_char(capture_bgn);
capture_end = src_ucs.Map_data_to_char(capture_end);
groups[i / 2] = new Regx_group(true, capture_bgn, capture_end, String_.Mid(src_ucs.Src(), capture_bgn, capture_end));
// FOOTNOTE:REGX_GROUP
int bgn_in_chars = src_ucs.Map_data_to_char(capture_bgn);
int end_in_chars = src_ucs.Map_data_to_char(capture_end);
String val = String_.Mid(src_ucs.Src(), bgn_in_chars, end_in_chars);
groups[i / 2] = new Regx_group(true, capture_bgn, capture_end, val);
}
return groups;
}
}
/*
== FOOTNOTE:REGX_GROUP [ISSUE#:726; DATE:2020-05-17] ==
The XOWA Regx_group is a quasi-adapter for java.util.regex.Matcher and its group-related methods.
Consider a Regx_group with varName `grp` and a Matcher with varName `match`
* `grp.Bgn()` <- `match.start()`
* `grp.End()` <- `match.end()`
* `grp.Val()` <- `match.group(i)`
Note that all callers of `grp` would be expecting REGEX convention (not LUA pattern convention). As such:
* '''base0''': `grp.Bgn()` and `grp.End()` must be base0 not base1 (REGEX is base0)
** Fortunately, Str_find_mgr__xowa uses base0, so there is no need to convert from base1 to base0
** However, Scrib_lib_ustring_gsub_mgr will convert base0 to base1 in the gsub FunctionCallback code '''IF''' anypos is present in the pattern
* '''charIndexes''': `grp.Bgn()` and `grp.End()` should represent charIndexes, not byteIndexes (REGEX is chars)
** Str_find_mgr__xowa uses codepointIndexes b/c of Ustring_ucs
** In theory, should convert to charIndexes b/c REGEX uses charIndexes. However:
*** Regx_group.Bgn() is only used by anypos for LuaCallbacks
*** anypos needs codepointIndexes
*** so, be lazy, and don't bother double converting to charIndex only to convert back to codepointIndex
SEE:FOOTNOTE:CAPTURES
*/
Loading…
Cancel
Save