Scribunto: Create match string using char-indexes, not codepointIndexes (to handle UTF-8 strings) [#726]

2024-10-27 20:34:16 +00:00 · 2020-05-31 01:19:24 -04:00 · 2020-05-31 01:19:24 -04:00 · 35f2027b20
commit 35f2027b20
parent be072de8d9
3 changed files with 114 additions and 25 deletions
--- a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustringgsubtst.java
+++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustringgsubtst.java
@ -61,6 +61,10 @@ public class Scrib_lib_ustring__gsub__tst {
 	// Regression test: gsub is called with a double (1.23d) as the replacement argument and must not fail.
 	// The expected value "abcd;0" leaves the text unchanged.
 	// NOTE(review): the ";0" suffix is presumably the replacement count appended by Exec_gsub -- confirm against the fixture.
 	@Test public void Replace__double() {	// PURPOSE: do not fail if double is passed in for @replace; PAGE:de.v:Wikivoyage:Wikidata/Test_Modul:Wikidata2 DATE:2016-04-21
 		Exec_gsub("abcd", 1	 , -1, 1.23d	, "abcd;0");
 	}
 	// Regression test for truncated captures: the source text begins with a character outside the
 	// Basic Multilingual Plane (two Java chars, one code point), followed by "a".
 	// The "%1" replacement must reproduce the whole captured text, not just the first character.
 	// NOTE(review): the ";1" suffix is presumably the replacement count appended by Exec_gsub -- confirm against the fixture.
 	@Test public void Replace__utf8() {	// PURPOSE:do not cut off utf8-strings PAGE:en.d:𠮟 DATE:2020-05-31
 		String regx = "^[\t]*(.-)[\t]*$"; // from mwtext.trim
 		Exec_gsub("𠮟a", regx, -1, "%1", "𠮟a;1"); // fails with "𠮟;1"
 	}
 	// The pattern starts with an empty capture `()` followed by a standard capture; the replacement is the plain string "z".
 	// NOTE(review): the third argument (1 here, -1 in neighbouring tests) is presumably the replacement limit -- confirm against Exec_gsub.
 	@Test public void Replace__anypos() {	// PURPOSE:LUAJ_PATTERN_REPLACEMENT; DATE:2019-04-16
 		Exec_gsub("'''a'''b", "()'''(.-'*)'''", 1, "z", "zb;1");
 	}
@ -171,6 +175,17 @@ public class Scrib_lib_ustring__gsub__tst {
 		fxt.Init__cbk(proc);
 		Exec_gsub(text, regx, -1, proc.To_scrib_lua_proc(), expd);
 	}
 	// Regression test for ISSUE#:726: empty captures `()` around a character class, matched against text whose
 	// first character lies outside the Basic Multilingual Plane (two Java chars, one code point).
 	// The positions handed to the Lua callback are expected to be 1,2 for the first match and 3,4 for the second;
 	// a first-match end position of 3 is the failure this test guards against.
 	// NOTE(review): each Object[] presumably lists the callback's return value followed by the arguments it must receive -- confirm against Mock_proc__verify_args.
 	@Test public void Luacbk__anypos__utf8() { // PURPOSE:handle UTF-8 chars with anypos match ISSUE#:726; DATE:2020-05-29
 		String text = "𤭢 a";
 		String regx = "()[𤭢a]()";
 		String expd = "B C;2";
 		Mock_proc__verify_args proc = new Mock_proc__verify_args(0
 			, new Object[]{"B", 1, 2} // fails if 3 instead of 2
 			, new Object[]{"C", 3, 4}
 			);
 		fxt.Init__cbk(proc);
 		Exec_gsub(text, regx, -1, proc.To_scrib_lua_proc(), expd);
 	}
 //  Mock_proc__verify_args proc = new Mock_proc__verify_args(0, new Object[]{"x", "{{yes2}}"}, new Object[]{"x", "{{flagicon|USA}}"});
 //  fxt.Init__cbk(proc);
 //  Exec_gsub("}\n|-\n|28\n|{{yes2}}Win\n|28–0\n|style=\"text-align:left;\"|{{flagicon|USA}}", "%b{}", -1, proc.To_scrib_lua_proc(), "}axbx;2");	}
--- a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring_gsub_mgr.java
+++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring_gsub_mgr.java
@ -1,6 +1,6 @@
 /*
 XOWA: the XOWA Offline Wiki Application
-Copyright (C) 2012-2017 gnosygnu@gmail.com
+Copyright (C) 2012-2020 gnosygnu@gmail.com
 XOWA is licensed under the terms of the General Public License (GPL) Version 3,
 or alternatively under the terms of the Apache License Version 2.0.
@ -13,11 +13,34 @@ The terms of each license can be found in the source code repository:
 GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
 Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
 */
-package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*;
+package gplx.xowa.xtns.scribunto.libs;
-import gplx.langs.regxs.*;
+
-import gplx.objects.strings.unicodes.*;
+import gplx.Bool_;
-import gplx.xowa.xtns.scribunto.libs.patterns.*;
+import gplx.Bry_;
-import gplx.xowa.xtns.scribunto.procs.*;
+import gplx.Bry_bfr;
 import gplx.Byte_ascii;
 import gplx.Char_;
 import gplx.Double_;
 import gplx.Err_;
 import gplx.Hash_adp;
 import gplx.Hash_adp_;
 import gplx.Int_;
 import gplx.Keyval;
 import gplx.Keyval_;
 import gplx.List_adp_;
 import gplx.Object_;
 import gplx.String_;
 import gplx.Type_;
 import gplx.langs.regxs.Regx_group;
 import gplx.langs.regxs.Regx_match;
 import gplx.objects.strings.unicodes.Ustring_;
 import gplx.xowa.xtns.scribunto.Scrib_core;
 import gplx.xowa.xtns.scribunto.Scrib_kv_utl_;
 import gplx.xowa.xtns.scribunto.Scrib_lua_proc;
 import gplx.xowa.xtns.scribunto.libs.patterns.Scrib_pattern_matcher;
 import gplx.xowa.xtns.scribunto.procs.Scrib_proc_args;
 import gplx.xowa.xtns.scribunto.procs.Scrib_proc_rslt;
 public class Scrib_lib_ustring_gsub_mgr { // THREAD.UNSAFE:LOCAL_VALUES
 	private final    Scrib_core core;
 	private String src_str;
@ -108,7 +131,7 @@ public class Scrib_lib_ustring_gsub_mgr { // THREAD.UNSAFE:LOCAL_VALUES
 										// NOTE: > 0 means get from groups if it exists; REF.MW:elseif (isset($m["m$x"])) return $m["m$x"]; PAGE:Wikipedia:Wikipedia_Signpost/Templates/Voter/testcases; DATE:2015-08-02
 										else if (idx - 1 < match.Groups().length) {	// retrieve numbered capture; TODO_OLD: support more than 9 captures
 											Regx_group grp = match.Groups()[idx - 1];
-											tmp_bfr.Add_str_u8(String_.Mid(src_str, grp.Bgn(), grp.End()));	// NOTE: grp.Bgn() / .End() is for String pos (bry pos will fail for utf8 strings)
+											tmp_bfr.Add_str_u8(grp.Val());	// NOTE: changed from String_.Mid(src_str, grp.Bgn(), grp.End()); DATE:2020-05-31
 										}
 										// NOTE: 1 per MW "Match undocumented Lua String.gsub behavior"; PAGE:en.d:Wiktionary:Scripts ISSUE#:393; DATE:2019-03-20
 										else if (idx == 1) {
@ -136,18 +159,15 @@ public class Scrib_lib_ustring_gsub_mgr { // THREAD.UNSAFE:LOCAL_VALUES
 				}
 				break;
 			case Repl_tid_table: {
 				int match_bgn = -1, match_end = -1;
 				Regx_group[] grps = match.Groups();
 				String find_str = null;
 				if (grps.length == 0) {
-					match_bgn = match.Find_bgn();
+					find_str = String_.Mid(src_str, match.Find_bgn(), match.Find_end());	// NOTE: rslt.Bgn() / .End() is for String pos (bry pos will fail for utf8 strings)
 					match_end = match.Find_end();
 				}
 				else {	// group exists, take first one (logic matches Scribunto); PAGE:en.w:Bannered_routes_of_U.S._Route_60; DATE:2014-08-15
 					Regx_group grp = grps[0];
-					match_bgn = grp.Bgn();
+					find_str = grp.Val();
 					match_end = grp.End();
 				}
 				String find_str = String_.Mid(src_str, match_bgn, match_end);	// NOTE: rslt.Bgn() / .End() is for String pos (bry pos will fail for utf8 strings)
 				Object actl_repl_obj = repl_hash.Get_by(find_str);
 				if (actl_repl_obj == null)			// match found, but no replacement specified; EX:"abc", "[ab]", "a:A"; "b" in regex but not in tbl; EX:d:DVD; DATE:2014-03-31
 					tmp_bfr.Add_str_u8(find_str);
@ -176,10 +196,17 @@ public class Scrib_lib_ustring_gsub_mgr { // THREAD.UNSAFE:LOCAL_VALUES
 					for (int i = 0; i < grps_len; i++) {
 						Regx_group grp = grps[i];
-						// anypos will create @offset arg; everything else creates a @match arg based on grp
+						// anypos will create @offset arg; everything else creates a @match arg based on grp; FOOTNOTE:CAPTURES
-						Object val = any_pos && i < capt_ary_len && Bool_.Cast(capt_ary[i].Val())
+						boolean anyposExists = any_pos && i < capt_ary_len && Bool_.Cast(capt_ary[i].Val());
-								? (Object)(grp.Bgn() + List_adp_.Base1) // NOTE: must normalize to base-1 b/c lua callbacks expect base-1 arguments, not base-0; ISSUE#:726; DATE:2020-05-17;
+						Object val = null;
-								: (Object)String_.Mid(src_str, grp.Bgn(), grp.End());
+						if (anyposExists) {
 							// emptyCapture ("anypos" or `()`) must pass integer position; must normalize to base-1 b/c lua callbacks expect base-1 arguments, not base-0; ISSUE#:726; DATE:2020-05-17;
 							val = (Object)(grp.Bgn() + List_adp_.Base1);
 						}
 						else {
 							// standardCapture must pass string match
 							val = grp.Val();
 						}
 						luacbk_args[i] = Keyval_.int_(i + Scrib_core.Base_1, val);
 					}
 				}
@ -203,3 +230,30 @@ public class Scrib_lib_ustring_gsub_mgr { // THREAD.UNSAFE:LOCAL_VALUES
 	private static final byte Repl_tid_null = 0, Repl_tid_string = 1, Repl_tid_table = 2, Repl_tid_luacbk = 3;
 	public static final    Scrib_lib_ustring_gsub_mgr[] Ary_empty = new Scrib_lib_ustring_gsub_mgr[0];
 }
 /*
 == FOOTNOTE:CAPTURES [ISSUE#:726; DATE:2020-05-17] ==
 There are two types of captures:
 * '''basicCaptures''': EX: given `abcd`, `a(bc)d` captureValues will be 1, 3 b/c `(bc)` captures the start / end of the match
 * '''emptyCaptures''': EX: given `abcd`, `()bcd`  captureValues will be 1, 2 b/c `()`   captures the position of the match
 The above captureValues are base0 b/c Str_find_mgr__xowa uses base0
 * Keep in mind that XOWA is base0 b/c it works directly with byte arrays and needs base0 to index into these 0-based arrays
 In contrast, Lua is base1. However, this base1-ness is not exposed anywhere, except in gsub's FunctionCallback.
 Even then, it is only exposed for emptyCaptures, not basicCaptures due to how Lua passes parameters
 For example, consider this code:
 ```
 function p.test_726_anypos()
    mw.ustring.gsub("abcd", "a(bc)d", function(arg1)
        mw.log('basic', arg1); -- arg1 is the matched string or "bc"
    end)
    mw.ustring.gsub("abcd", "()bcd", function(arg1)
        mw.log('empty', arg1); -- arg1 is the position of the empty capture or "2"
    end)
 end
 ```
 SEE:FOOTNOTE:REGX_GROUP
 */
--- a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/patterns/Scrib_pattern_matcher__xowa.java
+++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/patterns/Scrib_pattern_matcher__xowa.java
@ -122,15 +122,35 @@ class Scrib_pattern_matcher__xowa extends Scrib_pattern_matcher {
 		for (int i = 0; i < captures_len; i += 2) {
 			int capture_bgn = captures[i];
 			int capture_end = captures[i + 1];
-			// ISSUE#:726; DATE:2020-05-17;
+			// FOOTNOTE:REGX_GROUP
-			// NOTE: capture values are base-0 and are added by any pattern captures, including:
+			int bgn_in_chars = src_ucs.Map_data_to_char(capture_bgn);
-			// * standard captures EX: `a(bc)d` for `abcd` will have 1, 3
+			int end_in_chars = src_ucs.Map_data_to_char(capture_end);
-			// * empty captures EX: `()bc` for `abcd` will have 1, 2
+ 			String val = String_.Mid(src_ucs.Src(), bgn_in_chars, end_in_chars);
-			// Note that empty captures will be normalized to base-1 in Scrib_lib_ustring_gsub_mgr inside the any_pos code
+			groups[i / 2] = new Regx_group(true, capture_bgn, capture_end, val);
 			capture_bgn = src_ucs.Map_data_to_char(capture_bgn);
 			capture_end = src_ucs.Map_data_to_char(capture_end);
 			groups[i / 2] = new Regx_group(true, capture_bgn, capture_end, String_.Mid(src_ucs.Src(), capture_bgn, capture_end));
 		}
 		return groups;
 	}
 }
 /*
 == FOOTNOTE:REGX_GROUP [ISSUE#:726; DATE:2020-05-17] ==
 The XOWA Regx_group is a quasi-adapter for java.util.regex.Matcher and its group-related methods.
 Consider a Regx_group with varName `grp` and a Matcher with varName `match`
 * `grp.Bgn()` <- `match.start()`
 * `grp.End()` <- `match.end()`
 * `grp.Val()` <- `match.group(i)`
 Note that all callers of `grp` would be expecting REGEX convention (not LUA pattern convention). As such:
 * '''base0''': `grp.Bgn()` and `grp.End()` must be base0 not base1 (REGEX is base0)
 ** Fortunately, Str_find_mgr__xowa uses base0, so there is no need to convert from base1 to base0
 ** However, Scrib_lib_ustring_gsub_mgr will convert base0 to base1 in the gsub FunctionCallback code '''IF''' anypos is present in the pattern
 * '''charIndexes''': `grp.Bgn()` and `grp.End()` should represent charIndexes, not byteIndexes (REGEX is chars)
 ** Str_find_mgr__xowa uses codepointIndexes b/c of Ustring_ucs
 ** In theory, should convert to charIndexes b/c REGEX uses charIndexes. However:
 *** Regx_group.Bgn() is only used by anypos for LuaCallbacks
 *** anypos needs codepointIndexes
 *** so, be lazy, and don't bother double converting to charIndex only to convert back to codepointIndex
 SEE:FOOTNOTE:CAPTURES
 */