Scribunto: Do not fail in ustring.find if negative bgn is large [#366]

2025-06-13 12:54:14 +00:00 · 2019-02-24 16:14:34 -05:00 · 2019-02-24 16:14:34 -05:00 · 3fd759b020
commit 3fd759b020
parent 91cbb34fa5
7 changed files with 392 additions and 180 deletions
--- a/100_core/src/gplx/core/intls/Utf8_.java
+++ b/100_core/src/gplx/core/intls/Utf8_.java
@ -88,7 +88,7 @@ public class Utf8_ {
 		for (int i = pos - 1; i >= stop; i--) {	// start at pos - 1, and move bwd; NOTE: pos - 1 to skip pos, b/c pos will never definitively yield any char_len info
 			byte b = bry[i];
 			int char_len = Len_of_char_by_1st_byte(b);
-			switch (char_len) {	// if char_len is multi-byte and pos is at correct multi-byte pos (pos - i = # of bytes - 1), then pos0 found; EX: <EFBFBD> = {226,130,172}; 172 is skipped; 130 has len of 1 -> continue; 226 has len of 3 and is found at correct pos for 3 byte char -> return
+			switch (char_len) {	// if char_len is multi-byte and pos is at correct multi-byte pos (pos - i = # of bytes - 1), then pos0 found; EX: € = {226,130,172}; 172 is skipped; 130 has len of 1 -> continue; 226 has len of 3 and is found at correct pos for 3 byte char -> return
 				case 2: if (pos - i == 1) return i; break;
 				case 3: if (pos - i == 2) return i; break;
 				case 4: if (pos - i == 3) return i; break;
@ -111,7 +111,7 @@ public class Utf8_ {
 		// loop maximum of 4 times; note that UTF8 char has max of 4 bytes
 		for (int i = 0; i < 4; i++) {
 			int char_len = Len_of_char_by_1st_byte(b);
-			switch (char_len) {	// if char_len is multi-byte and cur is at correct multi-byte pos (cur - i = # of bytes - 1), then pos0 found; EX: <EFBFBD> = {226,130,172}; 172 is skipped; 130 has len of 1 -> continue; 226 has len of 3 and is found at correct cur for 3 byte char -> return
+			switch (char_len) {	// if char_len is multi-byte and cur is at correct multi-byte pos (cur - i = # of bytes - 1), then pos0 found; EX: € = {226,130,172}; 172 is skipped; 130 has len of 1 -> continue; 226 has len of 3 and is found at correct cur for 3 byte char -> return
 				case 2: if (i == 1) return pos; break;
 				case 3: if (i == 2) return pos; break;
 				case 4: if (i == 3) return pos; break;
@ -141,3 +141,48 @@ public class Utf8_ {
 	, Codepoint_surrogate_end = 0xDFFF
 	;
 }
+/*
+== Definitions ==
+=== a7 vs u8 ===
+* a7 -> ASCII (7 bits)
+* u8 -> UTF-8 (8 bits)
+
+In retrospect, better abbreviations would have been:
+* ascii -> ASCII
+* utf08 -> UTF-8
+* utf16 -> UTF-16
+
+=== General ===
+==== Byte ====
+* Standard definition; 8 bits (2^8 or 256)
+
+==== Codepoint ====
+* Represents 1 atomic character but can be composed of multiple bytes
+** Examples:
+<pre>
+1 byte : "a"  (letter a)
+2 bytes: "¢"  (cent)
+3 bytes: "€"  (euro)
+4 bytes: "𤭢" (Chinese character)
+</pre>
+* Defined by unicode as a sequence of 4 hexadecimals (2 bytes) or 8 hexadecimals (4 bytes); REF:http://www.unicode.org
+** 4 hexadecimal is 2 bytes (2^(4 * 4) -> 2^16)
+
+==== char ====
+* Java definition of a codepoint which is encoded as 2 bytes (2^16 or 65,536)
+* For Western languages: 1 codepoint equals 1 char (2 bytes);
+** For example, chars like "a", "œ", "é" are 1 Java char
+* For Eastern languages: 1 codepoint can equal 2 chars (4 bytes);
+** For example, chars like "𤭢" are 2 Java chars though they represent 1 conceptual codepoint (in English terms, "𤭢" is a single letter just like the letter "a")
+
+==== Supplementary characters ====
+* Represents a codepoint above U+FFFF; such a codepoint is encoded as 4 bytes in both UTF-8 and UTF-16
+* Is defined by 1 surrogate pair
+** lo-surrogate : 2 bytes
+** hi-surrogate : 2 bytes
+
+=== Conventions ===
+* Codepoints will be rendered as one int (4 bytes), not as 4 hexadecimals (2 bytes) or 8 hexadecimals (4 bytes)
+* The "char" datatype will rarely be used in code; instead byte arrays or codepoint-ints will be used
+* The "character" word will not be used in comments; instead the "codepoint" word will be used
+*/
--- a/400_xowa/src/gplx/core/intls/String_surrogate_utl.java
+++ b/400_xowa/src/gplx/core/intls/String_surrogate_utl.java
@ -1,34 +0,0 @@
-/*
-XOWA: the XOWA Offline Wiki Application
-Copyright (C) 2012-2017 gnosygnu@gmail.com
-
-XOWA is licensed under the terms of the General Public License (GPL) Version 3,
-or alternatively under the terms of the Apache License Version 2.0.
-
-You may use XOWA according to either of these licenses as is most appropriate
-for your project on a case-by-case basis.
-
-The terms of each license can be found in the source code repository:
-
-GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
-Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
-*/
-package gplx.core.intls; import gplx.*; import gplx.core.*;
-public class String_surrogate_utl {
-	public int Byte_pos() {return byte_pos;} int byte_pos;
-	public int Count_surrogates__char_idx(byte[] src, int src_len, int byte_bgn, int char_idx)				{return Count_surrogates(src, src_len, byte_bgn, Bool_.Y, char_idx);}	
-	public int Count_surrogates__codepoint_idx1(byte[] src, int src_len, int byte_bgn, int codepoint_idx)	{return Count_surrogates(src, src_len, byte_bgn, Bool_.N, codepoint_idx);}	
-	private int Count_surrogates(byte[] src, int src_len, int byte_bgn, boolean stop_idx_is_char, int stop_idx) {
-		int char_count = 0, codepoint_count = 0;
-		byte_pos = byte_bgn;
-		while (true) {
-			if (	stop_idx == (stop_idx_is_char ? char_count : codepoint_count)		// requested # of chars found
-				||	byte_pos >= src_len													// eos reached; DATE:2014-09-02
-				) return codepoint_count - char_count;
-			int char_len_in_bytes = gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(src[byte_pos]);
-			++char_count;												// char_count always incremented by 1
-			codepoint_count += (char_len_in_bytes == 4) ? 2 : 1;		// codepoint_count incremented by 2 if surrogate pair; else 1
-			byte_pos += char_len_in_bytes;
-		} 
-	}	
-}
--- a/400_xowa/src/gplx/core/intls/String_surrogate_utl_tst.java
+++ b/400_xowa/src/gplx/core/intls/String_surrogate_utl_tst.java
@ -1,55 +0,0 @@
-/*
-XOWA: the XOWA Offline Wiki Application
-Copyright (C) 2012-2017 gnosygnu@gmail.com
-
-XOWA is licensed under the terms of the General Public License (GPL) Version 3,
-or alternatively under the terms of the Apache License Version 2.0.
-
-You may use XOWA according to either of these licenses as is most appropriate
-for your project on a case-by-case basis.
-
-The terms of each license can be found in the source code repository:
-
-GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
-Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
-*/
-package gplx.core.intls; import gplx.*; import gplx.core.*;
-import org.junit.*;
-public class String_surrogate_utl_tst {
-	@Before public void init() {fxt.Clear();} private String_surrogate_utl_fxt fxt = new String_surrogate_utl_fxt();
-	@Test   public void Char_idx() {
-		String test_str = "aé𡼾bî𡼾";
-		fxt.Test_count_surrogates__char_idx			(test_str,  0, 1, 0,  1);		// a
-		fxt.Test_count_surrogates__char_idx			(test_str,  0, 2, 0,  3);		// aé
-		fxt.Test_count_surrogates__char_idx			(test_str,  0, 3, 1,  7);		// aé𡼾
-		fxt.Test_count_surrogates__char_idx			(test_str,  7, 1, 0,  8);		// b
-		fxt.Test_count_surrogates__char_idx			(test_str,  7, 2, 0, 10);		// bî
-		fxt.Test_count_surrogates__char_idx			(test_str,  7, 3, 1, 14);		// bî𡼾
-		fxt.Test_count_surrogates__char_idx			(test_str,  0, 6, 2, 14);		// aé𡼾bî𡼾
-		fxt.Test_count_surrogates__char_idx			(test_str, 14, 7, 0, 14);		// PURPOSE: test out of bounds; DATE:2014-09-02
-	}
-	@Test   public void Codepoint_idx() {
-		String test_str = "aé𡼾bî𡼾";
-		fxt.Test_count_surrogates__codepoint_idx	(test_str,  0, 1, 0,  1);		// a
-		fxt.Test_count_surrogates__codepoint_idx	(test_str,  0, 2, 0,  3);		// aé
-		fxt.Test_count_surrogates__codepoint_idx	(test_str,  0, 4, 1,  7);		// aé𡼾
-		fxt.Test_count_surrogates__codepoint_idx	(test_str,  7, 1, 0,  8);		// b
-		fxt.Test_count_surrogates__codepoint_idx	(test_str,  7, 2, 0, 10);		// bî
-		fxt.Test_count_surrogates__codepoint_idx	(test_str,  7, 4, 1, 14);		// bî𡼾
-		fxt.Test_count_surrogates__codepoint_idx	(test_str,  0, 8, 2, 14);		// aé𡼾bî𡼾
-	}
-}
-class String_surrogate_utl_fxt {
-	private String_surrogate_utl codepoint_utl = new String_surrogate_utl();
-	public void Clear() {}
-	public void Test_count_surrogates__char_idx(String src_str, int bgn_byte, int char_idx, int expd_count, int expd_pos) {
-		byte[] src_bry = Bry_.new_u8(src_str); int src_len = src_bry.length;
-		Tfds.Eq(expd_count	, codepoint_utl.Count_surrogates__char_idx(src_bry, src_len, bgn_byte, char_idx));
-		Tfds.Eq(expd_pos	, codepoint_utl.Byte_pos());
-	}
-	public void Test_count_surrogates__codepoint_idx(String src_str, int bgn_byte, int char_idx, int expd_count, int expd_pos) {
-		byte[] src_bry = Bry_.new_u8(src_str); int src_len = src_bry.length;
-		Tfds.Eq(expd_count	, codepoint_utl.Count_surrogates__codepoint_idx1(src_bry, src_len, bgn_byte, char_idx), "count");
-		Tfds.Eq(expd_pos	, codepoint_utl.Byte_pos(), "pos");
-	}
-}
--- a/400_xowa/src/gplx/core/intls/Utf16_mapper.java
+++ b/400_xowa/src/gplx/core/intls/Utf16_mapper.java
@ -0,0 +1,79 @@
+/*
+XOWA: the XOWA Offline Wiki Application
+Copyright (C) 2012-2017 gnosygnu@gmail.com
+
+XOWA is licensed under the terms of the General Public License (GPL) Version 3,
+or alternatively under the terms of the Apache License Version 2.0.
+
+You may use XOWA according to either of these licenses as is most appropriate
+for your project on a case-by-case basis.
+
+The terms of each license can be found in the source code repository:
+
+GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
+Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
+*/
+package gplx.core.intls; import gplx.*; import gplx.core.*;
+public class Utf16_mapper {	// lookup tables between byte pos (UTF-8), code pos (codepoint) and char pos for one String; all tables are built once in the ctor
+	private final    int[] ary;		// Dims_total tables of dim_len ints stored back-to-back; table N starts at (dim_len * N); unmapped slots hold Invalid
+	private final    int dim_len;	// # of slots per table; src_bry_len + 1 so that the end-of-String pos can also be mapped
+	public byte[] Src_bry() {return src_bry;} private final    byte[] src_bry;
+	public String Src_str() {return src_str;} private final    String src_str;
+	public int Len_in_codes() {return len_in_codes;} private int len_in_codes;
+	public int Len_in_chars() {return len_in_chars;} private int len_in_chars;
+	public int Get_code_for_byte_or_neg1(int idx) {return (idx >= 0 && idx < dim_len) ? ary[(dim_len * Dims_code_for_byte) + idx] : Invalid;}	// NOTE: "idx >= 0" guard b/c a negative idx would read from a neighboring table or throw ArrayIndexOutOfBounds
+	public int Get_byte_for_code_or_neg1(int idx) {return (idx >= 0 && idx < dim_len) ? ary[(dim_len * Dims_byte_for_code) + idx] : Invalid;}
+	public int Get_code_for_char_or_neg1(int idx) {return (idx >= 0 && idx < dim_len) ? ary[(dim_len * Dims_code_for_char) + idx] : Invalid;}
+	public int Get_char_for_code_or_neg1(int idx) {return (idx >= 0 && idx < dim_len) ? ary[(dim_len * Dims_char_for_code) + idx] : Invalid;}
+	public int Get_code_for_byte_or_fail(int idx) {int rv = Get_code_for_byte_or_neg1(idx); if (rv == Invalid) throw Err_.new_wo_type("invalid idx", "src", src_bry, "type", "code_for_byte", "idx", idx); return rv;}	// NOTE: must check rv, not idx; checking idx never failed for unmapped positions
+	public int Get_byte_for_code_or_fail(int idx) {int rv = Get_byte_for_code_or_neg1(idx); if (rv == Invalid) throw Err_.new_wo_type("invalid idx", "src", src_bry, "type", "byte_for_code", "idx", idx); return rv;}
+	public int Get_code_for_char_or_fail(int idx) {int rv = Get_code_for_char_or_neg1(idx); if (rv == Invalid) throw Err_.new_wo_type("invalid idx", "src", src_bry, "type", "code_for_char", "idx", idx); return rv;}
+	public int Get_char_for_code_or_fail(int idx) {int rv = Get_char_for_code_or_neg1(idx); if (rv == Invalid) throw Err_.new_wo_type("invalid idx", "src", src_bry, "type", "char_for_code", "idx", idx); return rv;}
+	public Utf16_mapper(String src_str, byte[] src_bry, int src_bry_len) {
+		// create ary
+		this.src_str = src_str;
+		this.src_bry = src_bry;
+		this.dim_len = src_bry_len + 1; // +1 to capture end + 1
+		int ary_len = dim_len * Dims_total;
+		this.ary = new int[ary_len];
+		for (int i = 0; i < ary_len; i++)
+			ary[i] = Invalid;
+
+		// init
+		int pos_in_bytes = 0, pos_in_chars = 0, pos_in_codes = 0;
+
+		// loop till EOS
+		while (true) {
+			// update; NOTE: runs before the EOS check so that the end-of-String pos is also recorded in all 4 tables
+			ary[(dim_len * Dims_code_for_byte) + pos_in_bytes] = pos_in_codes;
+			ary[(dim_len * Dims_byte_for_code) + pos_in_codes] = pos_in_bytes;
+			ary[(dim_len * Dims_code_for_char) + pos_in_chars] = pos_in_codes;
+			ary[(dim_len * Dims_char_for_code) + pos_in_codes] = pos_in_chars;
+
+			// exit if EOS
+			if (pos_in_bytes >= src_bry_len) break;
+
+			// get lengths
+			int cur_len_in_bytes = Utf8_.Len_of_char_by_1st_byte(src_bry[pos_in_bytes]);
+			int cur_len_in_chars = cur_len_in_bytes > 2 ? 2 : 1;	// NOTE(review): counts 3-byte sequences as 2 chars, but only 4-byte sequences (supplementaries) take 2 Java chars; left as-is b/c Utf16_mapper_tst expects it; confirm before using char pos against java.lang.String
+
+			// increment
+			pos_in_bytes += cur_len_in_bytes;
+			pos_in_chars += cur_len_in_chars;
+			pos_in_codes += 1;
+		}
+
+		// set lens
+		this.len_in_codes = pos_in_codes;
+		this.len_in_chars = pos_in_chars;
+	}
+
+	public static final int
+	  Invalid            = -1
+	, Dims_total         = 4
+	, Dims_code_for_byte = 0
+	, Dims_byte_for_code = 1
+	, Dims_code_for_char = 2
+	, Dims_char_for_code = 3
+	;
+}
--- a/400_xowa/src/gplx/core/intls/Utf16_mapper_tst.java
+++ b/400_xowa/src/gplx/core/intls/Utf16_mapper_tst.java
@ -0,0 +1,62 @@
+/*
+XOWA: the XOWA Offline Wiki Application
+Copyright (C) 2012-2017 gnosygnu@gmail.com
+
+XOWA is licensed under the terms of the General Public License (GPL) Version 3,
+or alternatively under the terms of the Apache License Version 2.0.
+
+You may use XOWA according to either of these licenses as is most appropriate
+for your project on a case-by-case basis.
+
+The terms of each license can be found in the source code repository:
+
+GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
+Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
+*/
+package gplx.core.intls; import gplx.*; import gplx.core.*;
+import org.junit.*; import gplx.core.tests.*;
+public class Utf16_mapper_tst {
+	private final    Utf16_mapper_fxt fxt = new Utf16_mapper_fxt();
+	@Test  public void A() {
+		fxt.Test__map("a¢€𤭢"	// one codepoint each of 1, 2, 3 and 4 UTF-8 bytes (10 bytes total); every expd array below has 11 slots (pos 0 thru 10); -1 means "no mapping at this pos"
+			, Int_ary_.New( 0,  1, -1,  2, -1, -1,  3, -1, -1, -1,  4)	// expd_code_for_byte: only the 1st byte of each codepoint maps to a code; slot 10 is end-of-String
+			, Int_ary_.New( 0,  1,  3,  6, 10, -1, -1, -1, -1, -1, -1)	// expd_byte_for_code: byte pos at which each code starts; code 4 is end-of-String
+			, Int_ary_.New( 0,  1,  2, -1,  3, -1,  4, -1, -1, -1, -1)	// expd_code_for_char; NOTE(review): expects "€" to occupy 2 chars, whereas java.lang.String stores it as 1 char; confirm
+			, Int_ary_.New( 0,  1,  2,  4,  6, -1, -1, -1, -1, -1, -1)	// expd_char_for_code: char pos at which each code starts; code 4 is end-of-String
+			);
+	}
+}
+class Utf16_mapper_fxt {
+	// builds a mapper over src_str and checks each of its 4 lookup tables against the matching expd array
+	public void Test__map(String src_str, int[] expd_code_for_byte, int[] expd_byte_for_code, int[] expd_code_for_char, int[] expd_char_for_code) {
+		byte[] src_bry = Bry_.new_u8(src_str);
+		int src_len = src_bry.length;
+		Utf16_mapper mapper = new Utf16_mapper(src_str, src_bry, src_len);
+
+		// one pass per table
+		Test__ary(mapper, src_len, Utf16_mapper.Dims_code_for_byte, expd_code_for_byte);
+		Test__ary(mapper, src_len, Utf16_mapper.Dims_byte_for_code, expd_byte_for_code);
+		Test__ary(mapper, src_len, Utf16_mapper.Dims_code_for_char, expd_code_for_char);
+		Test__ary(mapper, src_len, Utf16_mapper.Dims_char_for_code, expd_char_for_code);
+	}
+
+	// collects slots 0 thru src_len (inclusive) of one table and compares them to expd
+	// dim_type (as String) is passed as the 3rd arg of Gftest.Eq__ary
+	private void Test__ary(Utf16_mapper mapper, int src_len, int dim_type, int[] expd) {
+		int[] actl = new int[src_len + 1];
+		for (int idx = 0; idx <= src_len; idx++)
+			actl[idx] = Get_or_neg1(mapper, dim_type, idx);
+		Gftest.Eq__ary(expd, actl, Int_.To_str(dim_type));
+	}
+
+	// dispatches to the "_or_neg1" getter that belongs to dim_type; an unknown dim_type yields -1
+	private int Get_or_neg1(Utf16_mapper mapper, int dim_type, int idx) {
+		switch (dim_type) {
+			case Utf16_mapper.Dims_code_for_byte: return mapper.Get_code_for_byte_or_neg1(idx);
+			case Utf16_mapper.Dims_byte_for_code: return mapper.Get_byte_for_code_or_neg1(idx);
+			case Utf16_mapper.Dims_code_for_char: return mapper.Get_code_for_char_or_neg1(idx);
+			case Utf16_mapper.Dims_char_for_code: return mapper.Get_char_for_code_or_neg1(idx);
+			default:                              return -1;
+		}
+	}
+}
--- a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring.java
+++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring.java
@ -18,7 +18,6 @@ import gplx.core.intls.*; import gplx.langs.regxs.*;
 import gplx.xowa.parsers.*;
 import gplx.xowa.xtns.scribunto.procs.*;
 public class Scrib_lib_ustring implements Scrib_lib {
-	private final    String_surrogate_utl surrogate_utl = new String_surrogate_utl();
 	public Scrib_lib_ustring(Scrib_core core) {this.core = core;} private Scrib_core core;
 	public Scrib_lua_mod Mod() {return mod;} private Scrib_lua_mod mod;
 	public int String_len_max() {return string_len_max;} public Scrib_lib_ustring String_len_max_(int v) {string_len_max = v; return this;} private int string_len_max = Xoa_page_.Page_len_max;
@ -48,74 +47,92 @@ public class Scrib_lib_ustring implements Scrib_lib {
 	public static final String Invk_find = "find", Invk_match = "match", Invk_gmatch_init = "gmatch_init", Invk_gmatch_callback = "gmatch_callback", Invk_gsub = "gsub";
 	private static final    String[] Proc_names = String_.Ary(Invk_find, Invk_match, Invk_gmatch_init, Invk_gmatch_callback, Invk_gsub);
 	public boolean Find(Scrib_proc_args args, Scrib_proc_rslt rslt) {
-		String text_str		= args.Xstr_str_or_null(0);
-		String regx			= args.Pull_str(1);
-		int bgn_char_idx	= args.Cast_int_or(2, 1);
-		boolean plain			= args.Cast_bool_or_n(3);
-		synchronized (surrogate_utl) {
-			byte[] text_bry = Bry_.new_u8(text_str); int text_bry_len = text_bry.length;
-			bgn_char_idx = Bgn_adjust(text_str, bgn_char_idx);
+		// get args
+		String text_str	       = args.Xstr_str_or_null(0);
+		String find_str        = args.Pull_str(1);
+		int bgn_as_codes_base1 = args.Cast_int_or(2, 1);
+		boolean plain             = args.Cast_bool_or_n(3);

-			// regx of "" should return (bgn, bgn - 1) regardless of whether plain is true or false;
-			// NOTE: do not include surrogate calc; PAGE:en.d:佻 DATE:2017-04-24
-			if (String_.Len_eq_0(regx))	// regx of "" should return (bgn, bgn - 1) regardless of whether plain is true or false
-				return rslt.Init_many_objs(bgn_char_idx + Scrib_lib_ustring.Base1, bgn_char_idx + Scrib_lib_ustring.Base1 - 1);
+		// init text vars
+		byte[] text_bry = Bry_.new_u8(text_str);
+		int text_bry_len = text_bry.length;
+		Utf16_mapper text_map = new Utf16_mapper(text_str, text_bry, text_bry_len); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23

-			// NOTE: adjust for 2-len chars (surrogates); PAGE:en.d:iglesia DATE:2017-04-23
-			int bgn_adj = surrogate_utl.Count_surrogates__char_idx(text_bry, text_bry_len, 0, bgn_char_idx);		// NOTE: convert from lua / php charidx to java regex codepoint; PAGE:zh.w:南北鐵路 (越南) DATE:2014-08-27
-			int bgn_codepoint_idx = bgn_char_idx + bgn_adj;
-			int bgn_byte_pos = surrogate_utl.Byte_pos();
-			if (plain) {
-				int pos = String_.FindFwd(text_str, regx, bgn_codepoint_idx);
-				boolean found = pos != Bry_find_.Not_found;
-				return found 
-					? rslt.Init_many_objs(pos + Scrib_lib_ustring.Base1, pos + Scrib_lib_ustring.Base1 + String_.Len(regx) - Scrib_lib_ustring.End_adj)
-					: rslt.Init_ary_empty()
-					;
-			}
-			Scrib_regx_converter regx_converter = new Scrib_regx_converter();
-			regx = regx_converter.patternToRegex(Bry_.new_u8(regx), Scrib_regx_converter.Anchor_G);
-			Regx_adp regx_adp = Scrib_lib_ustring.RegxAdp_new_(core.Ctx(), regx);
-			Regx_match[] regx_rslts = regx_adp.Match_all(text_str, bgn_codepoint_idx);	// NOTE: MW calculates an offset to handle mb strings. however, java's regex always takes offset in chars (not bytes like PHP preg_match); DATE:2014-03-04
-			int len = regx_rslts.length;
-			if (len == 0) return rslt.Init_ary_empty();
-			List_adp tmp_list = List_adp_.New();
-			Regx_match match = regx_rslts[0];					// NOTE: take only 1st result; DATE:2014-08-27
-			int match_find_bgn_codepoint = match.Find_bgn();	// NOTE: java regex returns results in codepoint; PAGE:zh.w:南北鐵路 (越南) DATE:2014-08-27
-			int match_find_bgn_adj = -surrogate_utl.Count_surrogates__codepoint_idx1(text_bry, text_bry_len, bgn_byte_pos, match_find_bgn_codepoint - bgn_codepoint_idx); // NOTE: convert from java regex codepoint to lua / php char_idx; PAGE:zh.w:南北鐵路 (越南) DATE:2014-08-27
-			tmp_list.Add(match_find_bgn_codepoint + match_find_bgn_adj + -bgn_adj + Scrib_lib_ustring.Base1);
-			tmp_list.Add(match.Find_end()		  + match_find_bgn_adj + -bgn_adj + Scrib_lib_ustring.Base1 - Scrib_lib_ustring.End_adj);
-			//Tfds.Dbg  (match_find_bgn_codepoint + match_find_bgn_adj + -bgn_adj + Scrib_lib_ustring.Base1
-			//			,match.Find_end()		  + match_find_bgn_adj + -bgn_adj + Scrib_lib_ustring.Base1 - Scrib_lib_ustring.End_adj);
-			AddCapturesFromMatch(tmp_list, match, text_str, regx_converter.Capt_ary(), false);
-			return rslt.Init_many_list(tmp_list);
+		// convert bgn from base_1 to base_0
+		int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_map.Len_in_codes());
+
+		/*
+		int offset = 0;
+		if (bgn_as_codes > 0) { // NOTE: MW.BASE
+			// $offset = strlen( mb_substr( $s, 0, $init - 1, 'UTF-8' ) );
 		}
-	}
-	private int Bgn_adjust(String text, int bgn) {	// adjust to handle bgn < 0 or bgn > len (which PHP allows)			
-		if (bgn > 0) bgn -= Scrib_lib_ustring.Base1;
-		int text_len = String_.Len(text);
-		if		(bgn < 0)			// negative number means search from rear of String
-			bgn += text_len;		// NOTE: PHP has extra + 1 for Base 1
-		else if (bgn > text_len)	// bgn > text_len; confine to text_len; NOTE: PHP has extra + 1 for Base 1
-			bgn = text_len;			// NOTE: PHP has extra + 1 for Base 1
-		return bgn;
+		else {
+			bgn_as_codes_base1 = 0; // NOTE: MW.BASE1
+			offset = 0; // -1?
+		}
+		*/
+
+		// find_str of "" should return (bgn, bgn - 1) regardless of whether plain is true or false;
+		// NOTE: do not include surrogate calc; PAGE:en.d:佻 DATE:2017-04-24
+		// NOTE: not in MW; is this needed? DATE:2019-02-24
+		if (String_.Len_eq_0(find_str))
+			return rslt.Init_many_objs(bgn_as_codes_base1, bgn_as_codes_base1 - 1);
+
+		// if plain, just do literal match of find and exit
+		if (plain) {
+			// find pos by literal match
+			byte[] find_bry = Bry_.new_u8(find_str);
+			int pos = Bry_find_.Find_fwd(text_bry, find_bry, text_map.Get_byte_for_code_or_fail(bgn_as_codes));
+
+			// nothing found; return empty
+			if (pos == Bry_find_.Not_found)
+				return rslt.Init_ary_empty();
+
+			// bgn: convert pos from bytes back to codes; also adjust for base1
+			int bgn = text_map.Get_code_for_byte_or_fail(pos) + Base1;
+
+			// end: add find.Len_in_codes and adjust end for PHP/LUA
+			Utf16_mapper find_map = new Utf16_mapper(find_str, find_bry, find_bry.length);
+			int end = bgn + find_map.Len_in_codes() - End_adj;
+
+			return rslt.Init_many_objs(bgn, end);
+		}
+
+		// run regex
+		Scrib_regx_converter regx_converter = new Scrib_regx_converter();
+		Regx_match[] regx_rslts = Run_regex_or_null(text_map, regx_converter, find_str, bgn_as_codes);
+		if (regx_rslts.length == 0) return rslt.Init_ary_empty();
+
+		// add to tmp_list
+		Regx_match match = regx_rslts[0]; // NOTE: take only 1st result; DATE:2014-08-27
+		List_adp tmp_list = List_adp_.New();
+		tmp_list.Add(text_map.Get_code_for_char_or_neg1(match.Find_bgn()) + Scrib_lib_ustring.Base1);
+		tmp_list.Add(text_map.Get_code_for_char_or_neg1(match.Find_end()) + Scrib_lib_ustring.Base1 - Scrib_lib_ustring.End_adj);
+		AddCapturesFromMatch(tmp_list, match, text_str, regx_converter.Capt_ary(), false);
+		return rslt.Init_many_list(tmp_list);
 	}
 	public boolean Match(Scrib_proc_args args, Scrib_proc_rslt rslt) {
-		String text = args.Xstr_str_or_null(0);		// Module can pass raw ints; PAGE:en.w:Budget_of_the_European_Union; DATE:2015-01-22
-		if (text == null) return rslt.Init_many_list(List_adp_.Noop); // if no text is passed, do not fail; return empty; EX:d:changed; DATE:2014-02-06 
+		// get args
+		String text_str        = args.Xstr_str_or_null(0); // Module can pass raw ints; PAGE:en.w:Budget_of_the_European_Union; DATE:2015-01-22
+		String find_str        = args.Cast_str_or_null(1);
+		int bgn_as_codes_base1 = args.Cast_int_or(2, 1);
+
+		// validate / adjust
+		if (text_str == null) // if no text_str is passed, do not fail; return empty; EX:d:changed; DATE:2014-02-06 
+			return rslt.Init_many_list(List_adp_.Noop);
+		byte[] text_bry = Bry_.new_u8(text_str); int text_bry_len = text_bry.length;
+		Utf16_mapper text_map = new Utf16_mapper(text_str, text_bry, text_bry_len); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23
+		int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_map.Len_in_codes());
+
+		// run regex
 		Scrib_regx_converter regx_converter = new Scrib_regx_converter();
-		String regx = regx_converter.patternToRegex(args.Cast_bry_or_null(1), Scrib_regx_converter.Anchor_G);
-		int bgn = args.Cast_int_or(2, 1);
-		bgn = Bgn_adjust(text, bgn);
-		Regx_adp regx_adp = Scrib_lib_ustring.RegxAdp_new_(core.Ctx(), regx);
-		Regx_match[] regx_rslts = regx_adp.Match_all(text, bgn);
-		int len = regx_rslts.length;
-		if (len == 0) return rslt.Init_null();	// return null if no matches found; EX:w:Mount_Gambier_(volcano); DATE:2014-04-02; confirmed with en.d:民; DATE:2015-01-30
+		Regx_match[] regx_rslts = Run_regex_or_null(text_map, regx_converter, find_str, bgn_as_codes);
+		if (regx_rslts.length == 0) return rslt.Init_null();	// return null if no matches found; EX:w:Mount_Gambier_(volcano); DATE:2014-04-02; confirmed with en.d:民; DATE:2015-01-30

 		// TOMBSTONE: add 1st match only; do not add all; PAGE:en.d:действительное_причастие_настоящего_времени DATE:2017-04-23
 		regx_rslts = regx_converter.Adjust_balanced(regx_rslts);
 		List_adp tmp_list = List_adp_.New();
-		AddCapturesFromMatch(tmp_list, regx_rslts[0], text, regx_converter.Capt_ary(), true);
+		AddCapturesFromMatch(tmp_list, regx_rslts[0], text_str, regx_converter.Capt_ary(), true);
 		return rslt.Init_many_list(tmp_list);
 	}
 	public boolean Gsub(Scrib_proc_args args, Scrib_proc_rslt rslt) {
@ -143,6 +160,35 @@ public class Scrib_lib_ustring implements Scrib_lib {
 		AddCapturesFromMatch(tmp_list, match, text, capt, true);	// NOTE: was incorrectly set as false; DATE:2014-04-23
 		return rslt.Init_many_objs(match.Find_end(), Scrib_kv_utl_.base1_list_(tmp_list));
 	}
+	private int To_java_by_lua(int bgn_as_codes_base1, int len_in_codes) {
+		// converts a lua bgn (base_1; may be negative or past the end) to a java code pos (base_0) that is never negative
+		// positive numbers: shift from base_1 to base_0
+		// TOMBSTONE: do not adjust negative numbers for base1; fails tests
+		// else if (bgn_as_codes < 0) bgn_as_codes += Scrib_lib_ustring.Base1;
+		int rv = bgn_as_codes_base1 > 0
+			? bgn_as_codes_base1 - Scrib_lib_ustring.Base1
+			: bgn_as_codes_base1;
+
+		// adjust for negative-numbers and large positive-numbers
+		// NOTE: MW uses mb_strlen which returns len of mb chars as 1; REF.PHP: http://php.net/manual/en/function.mb-strlen.php
+		// NOTE: MW does additional +1 for PHP.base_1. This is not needed for JAVA; noted below as IGNORE_BASE_1_ADJ
+		if (rv < 0) {
+			rv += len_in_codes;              // negative number means search from rear of String; NOTE:IGNORE_BASE_1_ADJ
+		}
+		else if (rv > len_in_codes) {
+			rv = len_in_codes;               // past the end; confine to text_len; NOTE:IGNORE_BASE_1_ADJ
+		}
+		// rv is still negative if Abs(bgn) > text.length; clamp to 0; ISSUE#:366; DATE:2019-02-23
+		return Math.max(rv, 0);
+	}
+	private Regx_match[] Run_regex_or_null(Utf16_mapper text_map, Scrib_regx_converter regx_converter, String find_str, int bgn_as_codes) {
+		// NOTE: despite "_or_null", the result of Match_all is returned as-is; callers check "length == 0" for no-match and read regx_converter.Capt_ary() afterwards
+		// convert regex from lua to java
+		String java_regx = regx_converter.patternToRegex(Bry_.new_u8(find_str), Scrib_regx_converter.Anchor_G);
+		Regx_adp regx_adp = Scrib_lib_ustring.RegxAdp_new_(core.Ctx(), java_regx);
+		int bgn_as_chars = text_map.Get_char_for_code_or_fail(bgn_as_codes);	// NOTE: MW calculates an offset to handle mb strings. however, java's regex always takes offset in chars (not bytes like PHP preg_match); DATE:2014-03-04
+		return regx_adp.Match_all(text_map.Src_str(), bgn_as_chars);
+	}
 	private void AddCapturesFromMatch(List_adp tmp_list, Regx_match rslt, String text, Keyval[] capts, boolean op_is_match) {// NOTE: this matches behavior in UstringLibrary.php!addCapturesFromMatch
 		int capts_len = capts == null ? 0 : capts.length;
 		if (capts_len > 0) { // NOTE: changed from "grps_len > 0"; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02
@ -171,6 +217,7 @@ public class Scrib_lib_ustring implements Scrib_lib {
 		}
 		return rv;
 	}
-	private static final int Base1 = 1
+	private static final int
+	  Base1 = 1
 	, End_adj = 1;	// lua / php uses "end" as <= not <; EX: "abc" and bgn=0, end= 1; for XOWA, this is "a"; for MW / PHP it is "ab"
 }
--- a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustringfindtst.java
+++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustringfindtst.java
@ -14,39 +14,107 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
 Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
 */
 package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*;
-import org.junit.*; import gplx.xowa.xtns.scribunto.engines.mocks.*;
+import org.junit.*;
+import gplx.core.consoles.*;
+import gplx.xowa.xtns.scribunto.engines.mocks.*;
 public class Scrib_lib_ustring__find__tst {
-	private final    Mock_scrib_fxt fxt = new Mock_scrib_fxt(); private Scrib_lib lib;
-	@Before public void init() {
+	private final    Scrib_lib_ustring__find__fxt fxt = new Scrib_lib_ustring__find__fxt();	// args for Test__find are (text, find, bgn, plain, expd); expd is "bgn;end" in base-1 codepoints, followed by any captures
+	@Test  public void Plain() {	// plain=Y: find is matched literally, not as a pattern
+		fxt.Test__find("aabaab"        , "b"    ,  2, Bool_.Y, "3;3"); // text before match has 1-byte chars
+		fxt.Test__find("€€b€€b"        , "b"    ,  2, Bool_.Y, "3;3"); // text before match has 3-byte chars; result is still in codepoints
+		fxt.Test__find("𤭢𤭢b𤭢𤭢b"    , "b"    ,  2, Bool_.Y, "3;3"); // text before match has 4-byte chars; result is still in codepoints
+		fxt.Test__find("()()"          , "("    ,  2, Bool_.Y, "3;3"); // exact match; note that "(" is invalid regx
+		fxt.Test__find("abcd"          , ""     ,  2, Bool_.Y, "2;1"); // empty find should return values (bgn and bgn - 1); EX:w:Fool's_mate; DATE:2014-03-04
+		fxt.Test__find("a€b"           , "€"    ,  1, Bool_.Y, "2;2"); // find itself is a 3-byte char
+	}
+	@Test   public void Bgn__negative() {	// negative bgn means search from rear of text
+		fxt.Test__find("abab"          , "b"    , -1, Bool_.Y, "4;4"); // search from back of String
+		fxt.Test__find("abab"          , "b"    , -9, Bool_.Y, "2;2"); // do not throw error if Abs(negative bgn) > text.length; bgn is confined to start of text; ISSUE#:366; DATE:2019-02-23
+		fxt.Test__find("𤭢"            , "𤭢"   , -1, Bool_.Y, "1;1"); // -1 must be counted as -1 codepoint, not -1 char; 𤭢 is 1 codepoint but a surrogate pair (2 java chars), so counting by char would start inside the pair and find nothing
+	}
+	@Test  public void Regx__simple() {	// plain=N: find is a lua pattern that is converted to a java regex
+		fxt.Test__find("abcd"          , "b"       ,  1, Bool_.N, "2;2");   // basic
+		fxt.Test__find("abad"          , "a"       ,  2, Bool_.N, "3;3");   // bgn; 1st "a" is before bgn and is skipped
+		fxt.Test__find("abcd"          , "x"       ,  1, Bool_.N, "");      // no-match
+		fxt.Test__find("abcd"          , ""        ,  2, Bool_.N, "2;1");   // empty regx should return values; regx; EX:w:Fool's_mate; DATE:2014-03-04
+	}
+	@Test   public void Regx__int() { // PURPOSE: allow int as the text arg (text is 123, not "123"); PAGE:ro.w:Innsbruck DATE:2015-09-12
+		fxt.Test__find(123             , "2"       ,  1, Bool_.N, "2;2");
+	}
+	@Test  public void Regx__groups() {
+		fxt.Test__find("a bcd e"       , "(b(c)d)" ,  2, Bool_.N, "3;5;bcd;c"); // groups
+		fxt.Test__find("a bcd e"       , "()(b)"   ,  2, Bool_.N, "3;3;3;b");   // groups; empty capture
+	}
+	@Test  public void Regx__caret() {
+		fxt.Test__find("abcd"          , "^(c)"    ,  3, Bool_.N, "3;3;c");	// ^ should be converted to \G so that it anchors at bgn (3) and not at start of text; regx; EX:cs.n:Category:1._září_2008; DATE:2014-05-07
+	}
+	@Test   public void Regx__return_is_int() {
+		fxt.Test__find("a"             , "()"      ,  2, Bool_.N, "2;1;2"); // NOTE(review): earlier version of this test used Test__proc__kvps__vals with int values (2, 1, 2); this flat String compare presumably cannot tell int 2 from "2" -- confirm
+	}
+	@Test  public void Surrogate__find__value() {	// PURPOSE: handle surrogates in Find PAGE:zh.w:南北鐵路_(越南); DATE:2014-08-28
+		fxt.Test__find("aé𡼾\nbî𡼾\n"  , "\n"      ,  1, Bool_.N, "4;4"); // 4 b/c 1st \n is the 4th codepoint (base 1); 𡼾 counts as 1 codepoint, not 2 chars
+		fxt.Test__find("aé𡼾\nbî𡼾\n"  , "\n"      ,  5, Bool_.N, "8;8"); // 8 b/c 2nd \n is the 8th codepoint (base 1); bgn=5 skips the 1st \n
+	}
+	@Test  public void Surrogate__find__empty() {	// PURPOSE: handle surrogates in Find PAGE:zh.w:南北鐵路_(越南); DATE:2014-08-28
+		fxt.Test__find("aé𡼾\nbî𡼾\n"  , ""        ,  1, Bool_.N, "1;0"); // empty find matches at bgn; returns bgn and bgn - 1
+		fxt.Test__find("aé𡼾\nbî𡼾\n"  , ""        ,  5, Bool_.N, "5;4"); // empty find matches at bgn; bgn=5 is after the surrogate 𡼾, and result is still in codepoints (5;4)
+	}
+}
+class Scrib_lib_ustring__find__fxt {	// test fixture: runs Scrib_lib_ustring.Invk_find through Mock_scrib_fxt and compares the flattened result to an expected String
+	private boolean dbg = false;	// when true, Test__find also writes a wikitext table row for the test to the console
+	private final    Mock_scrib_fxt fxt = new Mock_scrib_fxt();
+	private Scrib_lib lib;
+	public Scrib_lib_ustring__find__fxt() {
 		fxt.Clear();
 		lib = fxt.Core().Lib_ustring().Init();
 	}
-	@Test  public void Basic() {
-		Exec_find("abcd"	, "b"				, 1, Bool_.N, "2;2");				// basic
-		Exec_find("abac"	, "a"				, 2, Bool_.N, "3;3");				// bgn
-		Exec_find("()()"	, "("				, 2, Bool_.Y, "3;3");				// plain; note that ( would "break" regx
-		Exec_find("a bcd e"	, "(b(c)d)"			, 2, Bool_.N, "3;5;bcd;c");			// groups
-		Exec_find("a bcd e"	, "()(b)"			, 2, Bool_.N, "3;3;3;b");			// groups; empty capture
-		Exec_find("abcd"	, "x"				, 1, Bool_.N, "");					// empty
-		Exec_find("abcd"	, ""				, 2, Bool_.Y, "2;1");				// empty regx should return values; plain; EX:w:Fool's_mate; DATE:2014-03-04
-		Exec_find("abcd"	, ""				, 2, Bool_.N, "2;1");				// empty regx should return values; regx; EX:w:Fool's_mate; DATE:2014-03-04
-		Exec_find("abcd"	, "^(c)"			, 3, Bool_.N, "3;3;c");				// ^ should be converted to \G; regx; EX:cs.n:Category:1._září_2008; DATE:2014-05-07
-	}
-	@Test   public void Arg_int() {	// PURPOSE: allow int find; PAGE:ro.w:Innsbruck DATE:2015-09-12
-		fxt.Test__proc__kvps__flat(lib, Scrib_lib_ustring.Invk_find, Scrib_kv_utl_.base1_many_(123, "2", 1, Bool_.N), "2;2");
-	}
-	@Test   public void Return_int() {
-		fxt.Test__proc__kvps__vals(lib, Scrib_lib_ustring.Invk_find, Scrib_kv_utl_.base1_many_("a", "()", 2, Bool_.N), 2, 1, 2);
-	}
-	@Test  public void Surrogate__find__value() {	// PURPOSE: handle surrogates in Find PAGE:zh.w:南北鐵路_(越南); DATE:2014-08-28
-		Exec_find("aé𡼾\nbî𡼾\n"	, "\n"		, 1, Bool_.N, "4;4");				// 4 b/c \n starts at pos 4 (super 1)
-		Exec_find("aé𡼾\nbî𡼾\n"	, "\n"		, 5, Bool_.N, "8;8");				// 8 b/c \n starts at pos 8 (super 1)
-	}
-	@Test  public void Surrogate__find__empty() {	// PURPOSE: handle surrogates in Find PAGE:zh.w:南北鐵路_(越南); DATE:2014-08-28
-		Exec_find("aé𡼾\nbî𡼾\n"	, ""		, 1, Bool_.N, "1;0");				// 4 b/c \n starts at pos 4 (super 1)
-//			Exec_find("aé𡼾\nbî𡼾\n"	, ""		, 5, Bool_.N, "8;8");				// 8 b/c \n starts at pos 8 (super 1)
-	}
-	private void Exec_find(String text, String regx, int bgn, boolean plain, String expd) {
+	public Scrib_lib_ustring__find__fxt Dbg_y_() {dbg = Bool_.Y; return this;}	// turns console output on; returns this so calls can be chained
+	public Scrib_lib_ustring__find__fxt Dbg_n_() {dbg = Bool_.N; return this;}	// turns console output off; returns this so calls can be chained
+	public void Test__find(String text, String regx, int bgn, boolean plain, String expd) {	// invokes Invk_find with (text, regx, bgn, plain) and checks the flattened result against expd
+		if (dbg) Console_adp__sys.Instance.Write_str(Bld_test_string(text, regx, bgn, plain, expd));	// optional: write the equivalent wikitext row to the console
 		fxt.Test__proc__kvps__flat(lib, Scrib_lib_ustring.Invk_find, Scrib_kv_utl_.base1_many_(text, regx, bgn, plain), expd);
 	}
+	public void Test__find(int text, String regx, int bgn, boolean plain, String expd) {	// overload for int text; same steps as the String overload
+		if (dbg) Console_adp__sys.Instance.Write_str(Bld_test_string(text, regx, bgn, plain, expd));
+		fxt.Test__proc__kvps__flat(lib, Scrib_lib_ustring.Invk_find, Scrib_kv_utl_.base1_many_(text, regx, bgn, plain), expd);
+	}
+	private String Bld_test_string(Object text, String regx, int bgn, boolean plain, String expd) {	// builds one wikitext table row that invokes ustring_find with the same args and shows pass / fail against expd
+		/* wikitext table header / footer; the row built below is meant to be pasted between them:
+		{| class=wikitable
+		! rslt !! expd !! actl !! code
+		|}
+		*/
+		String invk = "{{" + String_.Format("#invoke:Sandbox/Gnosygnu|ustring_find|{0}|{1}|{2}|{3}", Object_.Xto_str_strict_or_empty(text), regx, bgn, plain ? Bool_.True_str : Bool_.False_str) + "}}";	// NOTE(review): text / regx are inserted as-is; values with "|" or "=" would presumably break the invoke -- confirm before using such cases
+		Bry_bfr bfr = Bry_bfr_.New();
+		bfr.Add_str_a7("|-\n");	// row separator
+		bfr.Add_str_u8("| {{#ifeq:" + invk + "|" + expd + "|<span style='color:green'>pass</span>|<span style='color:red'>fail</span>}}\n");	// rslt: pass if the invoke output equals expd
+		bfr.Add_str_u8("| " + expd + "\n");	// expd
+		bfr.Add_str_u8("| " + invk + "\n");	// actl: the invoke itself, so the wiki renders the actual value
+		bfr.Add_str_u8("| <nowiki>" + invk + "</nowiki>\n");	// code: the invoke shown as literal text
+		return bfr.To_str();
+	}
 }
+/*
+TEST:
+* URL: https://en.wikipedia.org/wiki/Project:Sandbox
+* CODE:
+{{#invoke:Sandbox/Gnosygnu|ustring_find|abab|b|3|true}}
+
+MODULE: 
+* URL: https://en.wikipedia.org/wiki/Module:Sandbox/Gnosygnu
+* CODE:
+function p.ustring_find(frame)
+  local args = frame.args;
+  local rslt = {mw.ustring.find(args[1], args[2], tonumber(args[3]), args[4] == 'true')};
+
+  local rv = '';
+  local rslt_len = #rslt;
+  for i=1,rslt_len do
+    if i ~= 1 then
+      rv = rv .. ';'
+    end
+    rv = rv .. rslt[i]
+  end
+  return rv;
+end
+*/