From 8ef5854eb7e8518b31dea87edef09ce8eca0c0d8 Mon Sep 17 00:00:00 2001 From: gnosygnu Date: Sat, 16 Mar 2019 23:50:26 -0400 Subject: [PATCH] Scribunto: Iterate regx by codepoint not by bytes [#383] --- 100_core/src/gplx/core/intls/Utf8_.java | 1 + 100_core/src/gplx/core/tests/Gftest.java | 1 + .../src/gplx/core/intls/Unicode_string.java | 49 ++++++++ .../src/gplx/core/intls/Unicode_string_.java | 48 ++++++++ .../gplx/core/intls/Unicode_string_multi.java | 81 +++++++++++++ .../gplx/core/intls/Unicode_string_tst.java | 110 ++++++++++++++++++ .../src/gplx/core/intls/Utf16_mapper.java | 79 ------------- .../src/gplx/core/intls/Utf16_mapper_tst.java | 62 ---------- .../scribunto/libs/Scrib_lib_ustring.java | 37 +++--- .../libs/Scrib_lib_ustring_gsub_mgr.java | 2 +- .../scribunto/libs/Scrib_regx_converter.java | 110 +++++++++++------- .../libs/Scrib_regx_converter_tst.java | 7 +- 12 files changed, 380 insertions(+), 207 deletions(-) create mode 100644 400_xowa/src/gplx/core/intls/Unicode_string.java create mode 100644 400_xowa/src/gplx/core/intls/Unicode_string_.java create mode 100644 400_xowa/src/gplx/core/intls/Unicode_string_multi.java create mode 100644 400_xowa/src/gplx/core/intls/Unicode_string_tst.java delete mode 100644 400_xowa/src/gplx/core/intls/Utf16_mapper.java delete mode 100644 400_xowa/src/gplx/core/intls/Utf16_mapper_tst.java diff --git a/100_core/src/gplx/core/intls/Utf8_.java b/100_core/src/gplx/core/intls/Utf8_.java index 5e4135cd3..66230a89f 100644 --- a/100_core/src/gplx/core/intls/Utf8_.java +++ b/100_core/src/gplx/core/intls/Utf8_.java @@ -52,6 +52,7 @@ public class Utf8_ { default: throw Err_.new_wo_type("invalid initial utf8 byte", "byte", b); } } + public static int Len_of_char_by_bytes_len(int v) {return v == 4 ? 2 : 1;} // 1 to 3 UTF bytes will encode up to 65,536 and fit in 1 char public static byte[] Get_char_at_pos_as_bry(byte[] bry, int pos) { int len = Len_of_char_by_1st_byte(bry[pos]); return Bry_.Mid(bry, pos, pos + len); diff --git a/100_core/src/gplx/core/tests/Gftest.java b/100_core/src/gplx/core/tests/Gftest.java index 31dc41d1a..891d71473 100644 --- a/100_core/src/gplx/core/tests/Gftest.java +++ b/100_core/src/gplx/core/tests/Gftest.java @@ -19,6 +19,7 @@ public class Gftest { private static final Bry_bfr bfr = Bry_bfr_.New(); public static void Eq__ary(Object[] expd, Object[] actl, String msg_fmt, Object... msg_args) {Eq__array(Type_ids_.Id__obj, expd, actl, msg_fmt, msg_args);} public static void Eq__ary(boolean[] expd, boolean[] actl, String msg_fmt, Object... msg_args) {Eq__array(Type_ids_.Id__bool, expd, actl, msg_fmt, msg_args);} + public static void Eq__ary(int[] expd, int[] actl) {Eq__array(Type_ids_.Id__int, expd, actl, "");} public static void Eq__ary(int[] expd, int[] actl, String msg_fmt, Object... msg_args) {Eq__array(Type_ids_.Id__int, expd, actl, msg_fmt, msg_args);} public static void Eq__ary(long[] expd, long[] actl, String msg_fmt, Object... msg_args) {Eq__array(Type_ids_.Id__long, expd, actl, msg_fmt, msg_args);} public static void Eq__ary(byte[] expd, byte[] actl, String msg_fmt, Object... msg_args) {Eq__array(Type_ids_.Id__byte, expd, actl, msg_fmt, msg_args);} diff --git a/400_xowa/src/gplx/core/intls/Unicode_string.java b/400_xowa/src/gplx/core/intls/Unicode_string.java new file mode 100644 index 000000000..423a5c1f4 --- /dev/null +++ b/400_xowa/src/gplx/core/intls/Unicode_string.java @@ -0,0 +1,49 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012-2017 gnosygnu@gmail.com + +XOWA is licensed under the terms of the General Public License (GPL) Version 3, +or alternatively under the terms of the Apache License Version 2.0. + +You may use XOWA according to either of these licenses as is most appropriate +for your project on a case-by-case basis. + +The terms of each license can be found in the source code repository: + +GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt +Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt +*/ +package gplx.core.intls; import gplx.*; import gplx.core.*; +public interface Unicode_string { + boolean Tid_is_single(); + String Src_string(); + byte[] Src_bytes(); + int Len_codes(); + int Len_chars(); + int Len_bytes(); + int Val_codes(int i); + int Pos_codes_to_bytes(int i); + int Pos_codes_to_chars(int i); + int Pos_bytes_to_codes(int i); + int Pos_chars_to_codes(int i); +} +class Unicode_string_single implements Unicode_string { // 1 byte == 1 codepoint + private final int[] codes; + public Unicode_string_single(String src_string, byte[] src_bytes, int[] codes, int codes_len) { + this.src_string = src_string; + this.src_bytes = src_bytes; + this.codes = codes; + this.codes_len = codes_len; + } + public boolean Tid_is_single() {return true;} + public String Src_string() {return src_string;} private final String src_string; + public byte[] Src_bytes() {return src_bytes;} private final byte[] src_bytes; + public int Len_codes() {return codes_len;} private final int codes_len; + public int Len_chars() {return codes_len;} + public int Len_bytes() {return codes_len;} + public int Val_codes(int i) {return codes[i];} + public int Pos_codes_to_bytes(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i); return i;} + public int Pos_codes_to_chars(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i);return i;} + public int Pos_bytes_to_codes(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i);return i;} + public int Pos_chars_to_codes(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i);return i;} +} diff --git a/400_xowa/src/gplx/core/intls/Unicode_string_.java b/400_xowa/src/gplx/core/intls/Unicode_string_.java new file mode 100644 index 000000000..189dd06c1 --- /dev/null +++ b/400_xowa/src/gplx/core/intls/Unicode_string_.java @@ -0,0 +1,48 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012-2017 gnosygnu@gmail.com + +XOWA is licensed under the terms of the General Public License (GPL) Version 3, +or alternatively under the terms of the Apache License Version 2.0. + +You may use XOWA according to either of these licenses as is most appropriate +for your project on a case-by-case basis. + +The terms of each license can be found in the source code repository: + +GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt +Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt +*/ +package gplx.core.intls; import gplx.*; import gplx.core.*; +public class Unicode_string_ { + public static Unicode_string New(String orig) { + // null + if (orig == null) + return new Unicode_string_single(null, null, null, 0); + + // init bytes + byte[] bytes = Bry_.new_u8(orig); + int bytes_len = bytes.length; + + // init codes + int[] codes = new int[bytes_len]; + int codes_len = 0; + + // loop + int bytes_pos = 0; + int chars_pos = 0; + while (bytes_pos < bytes_len) { + // set codes + codes[codes_len] = Utf16_.Decode_to_int(bytes, bytes_pos); + + // increment + int cur_byte_len = Utf8_.Len_of_char_by_1st_byte(bytes[bytes_pos]); + bytes_pos += cur_byte_len; + chars_pos += Utf8_.Len_of_char_by_bytes_len(cur_byte_len); + codes_len += 1; + } + return codes_len == bytes_len + ? (Unicode_string)new Unicode_string_single(orig, bytes, codes, codes_len) + : (Unicode_string)new Unicode_string_multi (orig, bytes, bytes_len, codes, codes_len, chars_pos); + } +} diff --git a/400_xowa/src/gplx/core/intls/Unicode_string_multi.java b/400_xowa/src/gplx/core/intls/Unicode_string_multi.java new file mode 100644 index 000000000..4aca2f80e --- /dev/null +++ b/400_xowa/src/gplx/core/intls/Unicode_string_multi.java @@ -0,0 +1,81 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012-2017 gnosygnu@gmail.com + +XOWA is licensed under the terms of the General Public License (GPL) Version 3, +or alternatively under the terms of the Apache License Version 2.0. + +You may use XOWA according to either of these licenses as is most appropriate +for your project on a case-by-case basis. + +The terms of each license can be found in the source code repository: + +GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt +Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt +*/ +package gplx.core.intls; import gplx.*; import gplx.core.*; +class Unicode_string_multi implements Unicode_string { + private final int[] codes; + private final int[] codes_to_bytes; + private final int[] codes_to_chars; + private final int[] bytes_to_codes; + private final int[] chars_to_codes; + + public Unicode_string_multi(String src, byte[] bytes, int bytes_len, int[] codes, int codes_len, int chars_len) { + // set member vars + this.src = src; + this.bytes = bytes; + this.bytes_len = bytes_len; + this.codes = codes; + this.codes_len = codes_len; + this.chars_len = chars_len; + + // init maps + this.codes_to_bytes = new int[codes_len + Adj_end]; + this.codes_to_chars = new int[codes_len + Adj_end]; + this.bytes_to_codes = New_int_ary(bytes_len); + this.chars_to_codes = New_int_ary(chars_len); + + // init loop + int codes_pos = 0; + int bytes_pos = 0; + int chars_pos = 0; + + // loop till EOS + while (true) { + // update + codes_to_bytes[codes_pos] = bytes_pos; + codes_to_chars[codes_pos] = chars_pos; + bytes_to_codes[bytes_pos] = codes_pos; + chars_to_codes[chars_pos] = codes_pos; + + if (bytes_pos == bytes_len) break; + + // increment + int cur_byte_len = Utf8_.Len_of_char_by_1st_byte(bytes[bytes_pos]); + bytes_pos += cur_byte_len; + chars_pos += Utf8_.Len_of_char_by_bytes_len(cur_byte_len); + codes_pos += 1; + } + } + public boolean Tid_is_single() {return false;} + public String Src_string() {return src;} private final String src; + public byte[] Src_bytes() {return bytes;} private final byte[] bytes; + public int Len_codes() {return codes_len;} private final int codes_len; + public int Len_chars() {return chars_len;} private final int chars_len; + public int Len_bytes() {return bytes_len;} private final int bytes_len; + public int Val_codes(int i) {return codes[i];} + public int Pos_codes_to_bytes(int i) {return codes_to_bytes[i];} + public int Pos_codes_to_chars(int i) {return codes_to_chars[i];} + public int Pos_bytes_to_codes(int i) {int rv = bytes_to_codes[i]; if (rv == Invalid) throw Err_.new_wo_type("invalid i", "src", src, "type", "bytes_to_codes", "i", i); return rv;} + public int Pos_chars_to_codes(int i) {int rv = chars_to_codes[i]; if (rv == Invalid) throw Err_.new_wo_type("invalid i", "src", src, "type", "chars_to_codes", "i", i); return rv;} + + private static final int Invalid = -1, Adj_end = 1; // +1 to store last pos as len of String; needed for regex which returns match.Find_end() which will be len of String; EX: abc -> [0, 1, 2, 3] + private static int[] New_int_ary(int len) { + int rv_len = len + Adj_end; + int[] rv = new int[rv_len]; + for (int i = 0; i < rv_len; i++) + rv[i] = Invalid; + return rv; + } +} diff --git a/400_xowa/src/gplx/core/intls/Unicode_string_tst.java b/400_xowa/src/gplx/core/intls/Unicode_string_tst.java new file mode 100644 index 000000000..f817f0ac5 --- /dev/null +++ b/400_xowa/src/gplx/core/intls/Unicode_string_tst.java @@ -0,0 +1,110 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012-2017 gnosygnu@gmail.com + +XOWA is licensed under the terms of the General Public License (GPL) Version 3, +or alternatively under the terms of the Apache License Version 2.0. + +You may use XOWA according to either of these licenses as is most appropriate +for your project on a case-by-case basis. + +The terms of each license can be found in the source code repository: + +GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt +Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt +*/ +package gplx.core.intls; import gplx.*; import gplx.core.*; +import org.junit.*; import gplx.core.tests.*; +public class Unicode_string_tst { + private final Unicode_string_fxt fxt = new Unicode_string_fxt(); + @Test public void Null() { + fxt.Init(null); + fxt.Test__Len(0, 0, 0); + } + @Test public void Blank() { + fxt.Init(""); + fxt.Test__Len(0, 0, 0); + } + @Test public void Single() { + fxt.Init("Abc"); + fxt.Test__Len(3, 3, 3); + fxt.Test__Val_codes(65, 98, 99); + fxt.Test__Pos_codes_to_bytes(0, 1, 2, 3); + fxt.Test__Pos_codes_to_chars(0, 1, 2, 3); + fxt.Test__Pos_chars_to_codes(0, 1, 2, 3); + fxt.Test__Pos_bytes_to_codes(0, 1, 2, 3); + } + @Test public void Multi() { + fxt.Init("a¢€𤭢"); + fxt.Test__Len(4, 5, 10); + fxt.Test__Val_codes(97, 162, 8364, 150370); + fxt.Test__Pos_codes_to_bytes(0, 1, 3, 6, 10); + fxt.Test__Pos_codes_to_chars(0, 1, 2, 3, 5); + fxt.Test__Pos_chars_to_codes( 0, 1, 2, 3, -1, 4); + fxt.Test__Pos_bytes_to_codes( 0, 1, -1, 2, -1, -1, 3, -1, -1, -1, 4); + } +} +class Unicode_string_fxt { + private Unicode_string under; + public void Init(String src) { + this.under = Unicode_string_.New(src); + } + public void Test__Len(int expd_codes, int expd_chars, int expd_bytes) { + Gftest.Eq__int(expd_codes, under.Len_codes(), "codes"); + Gftest.Eq__int(expd_chars, under.Len_chars(), "chars"); + Gftest.Eq__int(expd_bytes, under.Len_bytes(), "bytes"); + } + public void Test__Val_codes(int... expd) { + int actl_len = under.Len_codes(); + int[] actl = new int[actl_len]; + for (int i = 0; i < actl_len; i++) + actl[i] = under.Val_codes(i); + Gftest.Eq__ary(expd, actl); + } + public void Test__Pos_codes_to_bytes(int... expd) { + int actl_len = under.Len_codes() + 1; + int[] actl = new int[actl_len]; + for (int i = 0; i < actl_len; i++) + actl[i] = under.Pos_codes_to_bytes(i); + Gftest.Eq__ary(expd, actl); + } + public void Test__Pos_codes_to_chars(int... expd) { + int actl_len = under.Len_codes() + 1; + int[] actl = new int[actl_len]; + for (int i = 0; i < actl_len; i++) + actl[i] = under.Pos_codes_to_chars(i); + Gftest.Eq__ary(expd, actl); + } + public void Test__Pos_bytes_to_codes(int... expd) { + int actl_len = under.Len_bytes() + 1; + int[] actl = new int[actl_len]; + for (int i = 0; i < actl_len; i++) { + int val = 0; + try { + val = under.Pos_bytes_to_codes(i); + } + catch (Exception exc) { + val = -1; + Err_.Noop(exc); + } + actl[i] = val; + } + Gftest.Eq__ary(expd, actl); + } + public void Test__Pos_chars_to_codes(int... expd) { + int actl_len = under.Len_chars() + 1; + int[] actl = new int[actl_len]; + for (int i = 0; i < actl_len; i++) { + int val = 0; + try { + val = under.Pos_chars_to_codes(i); + } + catch (Exception exc) { + val = -1; + Err_.Noop(exc); + } + actl[i] = val; + } + Gftest.Eq__ary(expd, actl); + } +} diff --git a/400_xowa/src/gplx/core/intls/Utf16_mapper.java b/400_xowa/src/gplx/core/intls/Utf16_mapper.java deleted file mode 100644 index b38e02fe8..000000000 --- a/400_xowa/src/gplx/core/intls/Utf16_mapper.java +++ /dev/null @@ -1,79 +0,0 @@ -/* -XOWA: the XOWA Offline Wiki Application -Copyright (C) 2012-2017 gnosygnu@gmail.com - -XOWA is licensed under the terms of the General Public License (GPL) Version 3, -or alternatively under the terms of the Apache License Version 2.0. - -You may use XOWA according to either of these licenses as is most appropriate -for your project on a case-by-case basis. - -The terms of each license can be found in the source code repository: - -GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt -Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt -*/ -package gplx.core.intls; import gplx.*; import gplx.core.*; -public class Utf16_mapper { - private final int[] ary; - private final int dim_len; - public byte[] Src_bry() {return src_bry;} private final byte[] src_bry; - public String Src_str() {return src_str;} private final String src_str; - public int Len_in_codes() {return len_in_codes;} private int len_in_codes; - public int Len_in_chars() {return len_in_chars;} private int len_in_chars; - public int Get_code_for_byte_or_neg1(int idx) {return idx < dim_len ? ary[(dim_len * Dims_code_for_byte) + idx] : Invalid;} - public int Get_byte_for_code_or_neg1(int idx) {return idx < dim_len ? ary[(dim_len * Dims_byte_for_code) + idx] : Invalid;} - public int Get_code_for_char_or_neg1(int idx) {return idx < dim_len ? ary[(dim_len * Dims_code_for_char) + idx] : Invalid;} - public int Get_char_for_code_or_neg1(int idx) {return idx < dim_len ? ary[(dim_len * Dims_char_for_code) + idx] : Invalid;} - public int Get_code_for_byte_or_fail(int idx) {int rv = Get_code_for_byte_or_neg1(idx); if (idx == Invalid) throw Err_.new_wo_type("invalid idx", "src", src_bry, "type", "code_for_byte", "idx", idx); return rv;} - public int Get_byte_for_code_or_fail(int idx) {int rv = Get_byte_for_code_or_neg1(idx); if (idx == Invalid) throw Err_.new_wo_type("invalid idx", "src", src_bry, "type", "byte_for_code", "idx", idx); return rv;} - public int Get_code_for_char_or_fail(int idx) {int rv = Get_code_for_char_or_neg1(idx); if (idx == Invalid) throw Err_.new_wo_type("invalid idx", "src", src_bry, "type", "code_for_char", "idx", idx); return rv;} - public int Get_char_for_code_or_fail(int idx) {int rv = Get_char_for_code_or_neg1(idx); if (idx == Invalid) throw Err_.new_wo_type("invalid idx", "src", src_bry, "type", "char_for_code", "idx", idx); return rv;} - public Utf16_mapper(String src_str, byte[] src_bry, int src_bry_len) { - // create ary - this.src_str = src_str; - this.src_bry = src_bry; - this.dim_len = src_bry_len + 1; // +1 to capture end + 1 - int ary_len = dim_len * Dims_total; - this.ary = new int[dim_len * Dims_total]; - for (int i = 0; i < ary_len; i++) - ary[i] = Invalid; - - // init - int pos_in_bytes = 0, pos_in_chars = 0, pos_in_codes = 0; - - // loop till EOS - while (true) { - // update - ary[(dim_len * Dims_code_for_byte) + pos_in_bytes] = pos_in_codes; - ary[(dim_len * Dims_byte_for_code) + pos_in_codes] = pos_in_bytes; - ary[(dim_len * Dims_code_for_char) + pos_in_chars] = pos_in_codes; - ary[(dim_len * Dims_char_for_code) + pos_in_codes] = pos_in_chars; - - // exit if EOS - if (pos_in_bytes >= src_bry_len) break; - - // get lengths - int cur_len_in_bytes = Utf8_.Len_of_char_by_1st_byte(src_bry[pos_in_bytes]); - int cur_len_in_chars = cur_len_in_bytes == 4 ? 2 : 1; // NOTE: 3 bytes represent up to U+FFFF (65,536) which will fit in 1; REF:en.w:UTF-8; ISSUE#:377; DATE:2019-03-04 - - // increment - pos_in_bytes += cur_len_in_bytes; - pos_in_chars += cur_len_in_chars; - pos_in_codes += 1; - } - - // set lens - this.len_in_codes = pos_in_codes; - this.len_in_chars = pos_in_chars; - } - - public static final int - Invalid = -1 - , Dims_total = 4 - , Dims_code_for_byte = 0 - , Dims_byte_for_code = 1 - , Dims_code_for_char = 2 - , Dims_char_for_code = 3 - ; -} diff --git a/400_xowa/src/gplx/core/intls/Utf16_mapper_tst.java b/400_xowa/src/gplx/core/intls/Utf16_mapper_tst.java deleted file mode 100644 index 3f91e541d..000000000 --- a/400_xowa/src/gplx/core/intls/Utf16_mapper_tst.java +++ /dev/null @@ -1,62 +0,0 @@ -/* -XOWA: the XOWA Offline Wiki Application -Copyright (C) 2012-2017 gnosygnu@gmail.com - -XOWA is licensed under the terms of the General Public License (GPL) Version 3, -or alternatively under the terms of the Apache License Version 2.0. - -You may use XOWA according to either of these licenses as is most appropriate -for your project on a case-by-case basis. - -The terms of each license can be found in the source code repository: - -GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt -Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt -*/ -package gplx.core.intls; import gplx.*; import gplx.core.*; -import org.junit.*; import gplx.core.tests.*; -public class Utf16_mapper_tst { - private final Utf16_mapper_fxt fxt = new Utf16_mapper_fxt(); - @Test public void A() { - fxt.Test__map("a¢€𤭢" - , Int_ary_.New( 0, 1, -1, 2, -1, -1, 3, -1, -1, -1, 4) - , Int_ary_.New( 0, 1, 3, 6, 10, -1, -1, -1, -1, -1, -1) - , Int_ary_.New( 0, 1, 2, 3, -1, 4, -1, -1, -1, -1, -1) - , Int_ary_.New( 0, 1, 2, 3, 5, -1, -1, -1, -1, -1, -1) - ); - } -} -class Utf16_mapper_fxt { - public void Test__map(String src_str, int[] expd_code_for_byte, int[] expd_byte_for_code, int[] expd_code_for_char, int[] expd_char_for_code) { - byte[] src_bry = Bry_.new_u8(src_str); - int src_len = src_bry.length; - Utf16_mapper mapper = new Utf16_mapper(src_str, src_bry, src_len); - Test__ary(mapper, src_len, Utf16_mapper.Dims_code_for_byte, expd_code_for_byte); - Test__ary(mapper, src_len, Utf16_mapper.Dims_byte_for_code, expd_byte_for_code); - Test__ary(mapper, src_len, Utf16_mapper.Dims_code_for_char, expd_code_for_char); - Test__ary(mapper, src_len, Utf16_mapper.Dims_char_for_code, expd_char_for_code); - } - private void Test__ary(Utf16_mapper mapper, int src_len, int dim_type, int[] expd) { - int actl_len = src_len + 1; - int[] actl = new int[actl_len]; - for (int i = 0; i < actl_len; i++) { - int v = -1; - switch (dim_type) { - case Utf16_mapper.Dims_code_for_byte: - v = mapper.Get_code_for_byte_or_neg1(i); - break; - case Utf16_mapper.Dims_byte_for_code: - v = mapper.Get_byte_for_code_or_neg1(i); - break; - case Utf16_mapper.Dims_code_for_char: - v = mapper.Get_code_for_char_or_neg1(i); - break; - case Utf16_mapper.Dims_char_for_code: - v = mapper.Get_char_for_code_or_neg1(i); - break; - } - actl[i] = v; - } - Gftest.Eq__ary(expd, actl, Int_.To_str(dim_type)); - } -} diff --git a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring.java b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring.java index 8e8f84f65..517b7700a 100644 --- a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring.java +++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring.java @@ -54,12 +54,10 @@ public class Scrib_lib_ustring implements Scrib_lib { boolean plain = args.Cast_bool_or_n(3); // init text vars - byte[] text_bry = Bry_.new_u8(text_str); - int text_bry_len = text_bry.length; - Utf16_mapper text_map = new Utf16_mapper(text_str, text_bry, text_bry_len); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23 + Unicode_string text_ucs = Unicode_string_.New(text_str); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23 // convert bgn from base_1 to base_0 - int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_map.Len_in_codes()); + int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_ucs.Len_codes()); /* int offset = 0; @@ -81,33 +79,33 @@ public class Scrib_lib_ustring implements Scrib_lib { // if plain, just do literal match of find and exit if (plain) { // find pos by literal match - byte[] find_bry = Bry_.new_u8(find_str); - int pos = Bry_find_.Find_fwd(text_bry, find_bry, text_map.Get_byte_for_code_or_fail(bgn_as_codes)); + Unicode_string find_ucs = Unicode_string_.New(find_str); + byte[] find_bry = find_ucs.Src_bytes(); + int pos = Bry_find_.Find_fwd(text_ucs.Src_bytes(), find_bry, text_ucs.Pos_codes_to_bytes(bgn_as_codes)); // nothing found; return empty if (pos == Bry_find_.Not_found) return rslt.Init_ary_empty(); // bgn: convert pos from bytes back to codes; also adjust for base1 - int bgn = text_map.Get_code_for_byte_or_fail(pos) + Base1; + int bgn = text_ucs.Pos_bytes_to_codes(pos) + Base1; // end: add find.Len_in_codes and adjust end for PHP/LUA - Utf16_mapper find_map = new Utf16_mapper(find_str, find_bry, find_bry.length); - int end = bgn + find_map.Len_in_codes() - End_adj; + int end = bgn + find_ucs.Len_codes() - End_adj; return rslt.Init_many_objs(bgn, end); } // run regex Scrib_regx_converter regx_converter = new Scrib_regx_converter(); - Regx_match[] regx_rslts = Run_regex_or_null(text_map, regx_converter, find_str, bgn_as_codes); + Regx_match[] regx_rslts = Run_regex_or_null(text_ucs, regx_converter, find_str, bgn_as_codes); if (regx_rslts.length == 0) return rslt.Init_ary_empty(); // add to tmp_list Regx_match match = regx_rslts[0]; // NOTE: take only 1st result; DATE:2014-08-27 List_adp tmp_list = List_adp_.New(); - tmp_list.Add(text_map.Get_code_for_char_or_fail(match.Find_bgn()) + Scrib_lib_ustring.Base1); - tmp_list.Add(text_map.Get_code_for_char_or_fail(match.Find_end()) + Scrib_lib_ustring.Base1 - Scrib_lib_ustring.End_adj); + tmp_list.Add(text_ucs.Pos_chars_to_codes(match.Find_bgn()) + Scrib_lib_ustring.Base1); + tmp_list.Add(text_ucs.Pos_chars_to_codes(match.Find_end()) + Scrib_lib_ustring.Base1 - Scrib_lib_ustring.End_adj); AddCapturesFromMatch(tmp_list, match, text_str, regx_converter.Capt_ary(), false); return rslt.Init_many_list(tmp_list); } @@ -120,13 +118,12 @@ public class Scrib_lib_ustring implements Scrib_lib { // validate / adjust if (text_str == null) // if no text_str is passed, do not fail; return empty; EX:d:changed; DATE:2014-02-06 return rslt.Init_many_list(List_adp_.Noop); - byte[] text_bry = Bry_.new_u8(text_str); int text_bry_len = text_bry.length; - Utf16_mapper text_map = new Utf16_mapper(text_str, text_bry, text_bry_len); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23 - int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_map.Len_in_codes()); + Unicode_string text_ucs = Unicode_string_.New(text_str); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23 + int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_ucs.Len_codes()); // run regex Scrib_regx_converter regx_converter = new Scrib_regx_converter(); - Regx_match[] regx_rslts = Run_regex_or_null(text_map, regx_converter, find_str, bgn_as_codes); + Regx_match[] regx_rslts = Run_regex_or_null(text_ucs, regx_converter, find_str, bgn_as_codes); if (regx_rslts.length == 0) return rslt.Init_null(); // return null if no matches found; EX:w:Mount_Gambier_(volcano); DATE:2014-04-02; confirmed with en.d:民; DATE:2015-01-30 // TOMBSTONE: add 1st match only; do not add all; PAGE:en.d:действительное_причастие_настоящего_времени DATE:2017-04-23 @@ -141,7 +138,7 @@ public class Scrib_lib_ustring implements Scrib_lib { } public boolean Gmatch_init(Scrib_proc_args args, Scrib_proc_rslt rslt) { // String text = Scrib_kv_utl_.Val_to_str(values, 0); - byte[] regx = args.Pull_bry(1); + String regx = args.Pull_str(1); Scrib_regx_converter regx_converter = new Scrib_regx_converter(); String pcre = regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_null); return rslt.Init_many_objs(pcre, regx_converter.Capt_ary()); @@ -181,13 +178,13 @@ public class Scrib_lib_ustring implements Scrib_lib { bgn_as_codes = 0; return bgn_as_codes; } - private Regx_match[] Run_regex_or_null(Utf16_mapper text_map, Scrib_regx_converter regx_converter, String find_str, int bgn_as_codes) { + private Regx_match[] Run_regex_or_null(Unicode_string text_ucs, Scrib_regx_converter regx_converter, String find_str, int bgn_as_codes) { // convert regex from lua to java - find_str = regx_converter.patternToRegex(Bry_.new_u8(find_str), Scrib_regx_converter.Anchor_G); + find_str = regx_converter.patternToRegex(find_str, Scrib_regx_converter.Anchor_G); // run regex Regx_adp regx_adp = Scrib_lib_ustring.RegxAdp_new_(core.Ctx(), find_str); - return regx_adp.Match_all(text_map.Src_str(), text_map.Get_char_for_code_or_fail(bgn_as_codes)); // NOTE: MW calculates an offset to handle mb strings. however, java's regex always takes offset in chars (not bytes like PHP preg_match); DATE:2014-03-04 + return regx_adp.Match_all(text_ucs.Src_string(), text_ucs.Pos_codes_to_chars(bgn_as_codes)); // NOTE: MW calculates an offset to handle mb strings. however, java's regex always takes offset in chars (not bytes like PHP preg_match); DATE:2014-03-04 } private void AddCapturesFromMatch(List_adp tmp_list, Regx_match rslt, String text, Keyval[] capts, boolean op_is_match) {// NOTE: this matches behavior in UstringLibrary.php!addCapturesFromMatch int capts_len = capts == null ? 0 : capts.length; diff --git a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring_gsub_mgr.java b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring_gsub_mgr.java index 6377961c9..5d304861d 100644 --- a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring_gsub_mgr.java +++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring_gsub_mgr.java @@ -32,7 +32,7 @@ class Scrib_lib_ustring_gsub_mgr { // get @pattern; NOTE: sometimes int; PAGE:en.d:λύω; DATE:2014-09-02 String regx = args.Xstr_str_or_null(1); - regx = regx_converter.patternToRegex(Bry_.new_u8(regx), Scrib_regx_converter.Anchor_pow); + regx = regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_pow); // get @repl Object repl_obj = args.Cast_obj_or_null(2); diff --git a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_converter.java b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_converter.java index 112ee5c41..cab5f5d1c 100644 --- a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_converter.java +++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_converter.java @@ -14,27 +14,31 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt */ package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*; -import gplx.core.brys.fmtrs.*; +import gplx.core.brys.fmtrs.*; import gplx.core.intls.*; import gplx.langs.regxs.*; public class Scrib_regx_converter { - private final Hash_adp_bry percent_hash = Hash_adp_bry.cs(), brack_hash = Hash_adp_bry.cs(); private final Scrib_regx_grp_mgr grp_mgr = new Scrib_regx_grp_mgr(); private final Bry_bfr bfr = Bry_bfr_.New(); private Bry_bfr tmp_bfr; private Bry_fmtr fmtr_balanced; private Bry_bfr bfr_balanced; - public Scrib_regx_converter() {Init();} + private final Lua_cls_to_regx_map percent_map, brack_map; + public Scrib_regx_converter() { + percent_map = Lua_cls_matcher.Instance.Percent(); + brack_map = Lua_cls_matcher.Instance.Brack(); + } public String Regx() {return regx;} private String regx; public Keyval[] Capt_ary() {return grp_mgr.Capt__to_ary();} public boolean Any_pos() {return any_pos;} private boolean any_pos; public Regx_match[] Adjust_balanced(Regx_match[] rslts) {return grp_mgr.Adjust_balanced(rslts);} - public String patternToRegex(byte[] pat, byte[] anchor) { + public String patternToRegex(String pat_str, byte[] anchor) { + Unicode_string pat_ucs = Unicode_string_.New(pat_str); // TODO.CACHE: if (!$this->patternRegexCache->has($cacheKey)) grp_mgr.Clear(); any_pos = false; boolean q_flag = false; - + // bfr.Add_byte(Byte_ascii.Slash); // TOMBSTONE: do not add PHP "/" at start - int len = pat.length; + int len = pat_ucs.Len_codes(); int grps_len = 0; int bct = 0; @@ -42,7 +46,7 @@ public class Scrib_regx_converter { for (int i = 0; i < len; i++) { int i_end = i + 1; q_flag = false; // must be reset; REF.MW:UstringLibrary.php|patternToRegex; DATE:2014-02-08 - byte cur = pat[i]; + int cur = pat_ucs.Val_codes(i); switch (cur) { case Byte_ascii.Pow: q_flag = i != 0; @@ -59,7 +63,7 @@ public class Scrib_regx_converter { int grp_idx = grp_mgr.Capt__len() + 1; // check for "()"; enables anypos flag - boolean is_empty_capture = pat[i + 1] == Byte_ascii.Paren_end; + boolean is_empty_capture = pat_ucs.Val_codes(i + 1) == Byte_ascii.Paren_end; if (is_empty_capture) any_pos = true; grp_mgr.Capt__add__real(grp_idx, is_empty_capture); @@ -77,19 +81,19 @@ public class Scrib_regx_converter { i++; if (i >= len) throw Err_.new_wo_type("malformed pattern (ends with '%')"); - Object percent_obj = percent_hash.Get_by_mid(pat, i, i + 1); - if (percent_obj != null) { - bfr.Add((byte[])percent_obj); + byte[] percent_bry = percent_map.Get_or_null(pat_ucs.Val_codes(i)); + if (percent_bry != null) { + bfr.Add(percent_bry); q_flag = true; } else { - byte nxt = pat[i]; + int nxt = pat_ucs.Val_codes(i); switch (nxt) { case Byte_ascii.Ltr_b: // EX: "%b()" i += 2; if (i >= len) throw Err_.new_wo_type("malformed pattern (missing arguments to '%b')"); - byte char_0 = pat[i - 1]; - byte char_1 = pat[i]; + int char_0 = pat_ucs.Val_codes(i - 1); + int char_1 = pat_ucs.Val_codes(i); if (char_0 == char_1) { // same char: easier regex; REF.MW: $bfr .= "{$d1}[^$d1]*$d1"; bfr.Add(Bry_bf0_seg_0); Regx_quote(bfr, char_0); @@ -109,18 +113,18 @@ public class Scrib_regx_converter { synchronized (fmtr_balanced) { ++bct; int balanced_idx = grp_mgr.Full__len(); - fmtr_balanced.Bld_bfr(bfr_balanced, Int_.To_bry(bct), Byte_.Ary(char_0), Byte_.Ary(char_1), Int_.To_bry(balanced_idx + 1), Int_.To_bry(balanced_idx + 2)); + fmtr_balanced.Bld_bfr(bfr_balanced, Int_.To_bry(bct), Utf16_.Encode_int_to_bry(char_0), Utf16_.Encode_int_to_bry(char_1), Int_.To_bry(balanced_idx + 1), Int_.To_bry(balanced_idx + 2)); grp_mgr.Capt__add__fake(2); bfr.Add(bfr_balanced.To_bry_and_clear()); } } break; case Byte_ascii.Ltr_f: { // EX: lua frontier pattern; "%f[%a]"; DATE:2015-07-21 - if (i + 1 >= len || pat[++i] != Byte_ascii.Brack_bgn) + if (i + 1 >= len || pat_ucs.Val_codes(++i) != Byte_ascii.Brack_bgn) throw Err_.new_("scribunto", "missing '[' after %f in pattern at pattern character " + Int_.To_str(i_end)); // %f always followed by bracketed term; convert lua bracketed term to regex if (tmp_bfr == null) tmp_bfr = Bry_bfr_.New(); - i = bracketedCharSetToRegex(tmp_bfr, pat, i, len); + i = bracketedCharSetToRegex(tmp_bfr, pat_ucs, i, len); byte[] re2 = tmp_bfr.To_bry_and_clear(); // scrib has following comment: 'Because %f considers the beginning and end of the String to be \0, determine if $re2 matches that and take it into account with "^" and "$".' @@ -148,7 +152,7 @@ public class Scrib_regx_converter { } break; case Byte_ascii.Brack_bgn: - i = bracketedCharSetToRegex(bfr, pat, i, len); + i = bracketedCharSetToRegex(bfr, pat_ucs, i, len); q_flag = true; break; case Byte_ascii.Brack_end: @@ -163,12 +167,12 @@ public class Scrib_regx_converter { break; } if (q_flag && i + 1 < len) { - byte tmp_b = pat[i + 1]; + int tmp_b = pat_ucs.Val_codes(i + 1); switch (tmp_b) { case Byte_ascii.Star: case Byte_ascii.Plus: case Byte_ascii.Question: - bfr.Add_byte(tmp_b); + bfr.Add_byte((byte)tmp_b); ++i; break; case Byte_ascii.Dash: @@ -184,35 +188,35 @@ public class Scrib_regx_converter { regx = bfr.To_str_and_clear(); return regx; } - private int bracketedCharSetToRegex(Bry_bfr bfr, byte[] pat, int i, int len) { + private int bracketedCharSetToRegex(Bry_bfr bfr, Unicode_string pat_ucs, int i, int len) { bfr.Add_byte(Byte_ascii.Brack_bgn); i++; - if (i < len && pat[i] == Byte_ascii.Pow) { // ^ + if (i < len && pat_ucs.Val_codes(i) == Byte_ascii.Pow) { // ^ bfr.Add_byte(Byte_ascii.Pow); i++; } - for (int j = i; i < len && (j == i || pat[i] != Byte_ascii.Brack_end); i++) { - if (pat[i] == Byte_ascii.Percent) { + for (int j = i; i < len && (j == i || pat_ucs.Val_codes(i) != Byte_ascii.Brack_end); i++) { + if (pat_ucs.Val_codes(i) == Byte_ascii.Percent) { i++; if (i >= len) { break; } - Object brack_obj = brack_hash.Get_by_mid(pat, i, i + 1); - if (brack_obj != null) - bfr.Add((byte[])brack_obj); + byte[] brack_bry = brack_map.Get_or_null(pat_ucs.Val_codes(i)); + if (brack_bry != null) + bfr.Add(brack_bry); else - Regx_quote(bfr, pat[i]); + Regx_quote(bfr, pat_ucs.Val_codes(i)); } - else if (i + 2 < len && pat[i + 1] == Byte_ascii.Dash && pat[i + 2] != Byte_ascii.Brack_end && pat[i + 2] != Byte_ascii.Hash) { - if (pat[i] <= pat[i + 2]) { - Regx_quote(bfr, pat[i]); + else if (i + 2 < len && pat_ucs.Val_codes(i + 1) == Byte_ascii.Dash && pat_ucs.Val_codes(i + 2) != Byte_ascii.Brack_end && pat_ucs.Val_codes(i + 2) != Byte_ascii.Hash) { + if (pat_ucs.Val_codes(i) <= pat_ucs.Val_codes(i + 2)) { + Regx_quote(bfr, pat_ucs.Val_codes(i)); bfr.Add_byte(Byte_ascii.Dash); - Regx_quote(bfr, pat[i + 2]); + Regx_quote(bfr, pat_ucs.Val_codes(i + 2)); } i += 2; } else { - Regx_quote(bfr, pat[i]); + Regx_quote(bfr, pat_ucs.Val_codes(i)); } } if (i > len) throw Err_.new_wo_type("Missing close-bracket for character set beginning at pattern character $nxt_pos"); @@ -233,12 +237,12 @@ public class Scrib_regx_converter { } return i; } - private void Regx_quote(Bry_bfr bfr, byte b) { - if (Regx_char(b)) bfr.Add_byte(Byte_ascii.Backslash); - bfr.Add_byte(b); + private void Regx_quote(Bry_bfr bfr, int code) { + if (Regx_char(code)) bfr.Add_byte(Byte_ascii.Backslash); + bfr.Add_u8_int(code); } - private boolean Regx_char(byte b) { - switch (b) { + private boolean Regx_char(int code) { + switch (code) { case Byte_ascii.Dot: case Byte_ascii.Slash: case Byte_ascii.Plus: case Byte_ascii.Star: case Byte_ascii.Question: case Byte_ascii.Pow: case Byte_ascii.Dollar: case Byte_ascii.Eq: case Byte_ascii.Bang: case Byte_ascii.Pipe: case Byte_ascii.Colon: case Byte_ascii.Dash: @@ -257,7 +261,10 @@ public class Scrib_regx_converter { , Bry_star_question = Bry_.new_a7("*?") // was *? ; public static final byte[] Anchor_null = null, Anchor_G = Bry_.new_a7("\\G"), Anchor_pow = Bry_.new_a7("^"); - private void Init() { +} +class Lua_cls_matcher { + public static final Lua_cls_matcher Instance = new Lua_cls_matcher(); + Lua_cls_matcher() { String regx_w = "\\w"; // JRE.7: \w not support in JRE.6; PAGE:en.w:A♯_(musical_note) DATE:2015-06-10 String regx_W = "\\W"; // JRE.7: \w not support in JRE.6; PAGE:en.w:A♯_(musical_note) DATE:2015-06-10 Init_itm(Bool_.Y, "a", "\\p{L}"); @@ -288,15 +295,32 @@ public class Scrib_regx_converter { Init_itm(Bool_.N, "X", "\\x00-\\x2f\\x3a-\\x40\\x47-\\x60\\x67-\\x{ff0f}\\x{ff1a}-\\x{ff20}\\x{ff27}-\\x{ff40}\\x{ff47}-\\x{10ffff}"); Init_itm(Bool_.N, "Z", "\\x01-\\x{10ffff}"); } + public Lua_cls_to_regx_map Percent() {return percent_map;} private final Lua_cls_to_regx_map percent_map = new Lua_cls_to_regx_map(); + public Lua_cls_to_regx_map Brack() {return brack_map;} private final Lua_cls_to_regx_map brack_map = new Lua_cls_to_regx_map(); + private void Init_itm(boolean add_to_percent_hash, String lua, String php) { - byte[] lua_bry = Bry_.new_a7(lua); + int lua_len = String_.Len(lua); + if (lua_len != 1) throw Err_.new_wo_type("lua must be 1 char only", "lua", lua); + int lua_code = (int)String_.CharAt(lua, 0); + if (lua_code < Byte_ascii.Ltr_A || lua_code > Byte_ascii.Ltr_z) throw Err_.new_wo_type("lua must be between A and z", "lua", lua); + byte[] php_bry = Bry_.new_a7(php); if (add_to_percent_hash) { - percent_hash.Add_bry_obj(lua_bry, php_bry); - brack_hash.Add_bry_obj(lua_bry, php_bry); // always add to brack_hash; brack_hash = percent_hash + other characters + percent_map.Set(lua_code, php_bry); + brack_map.Set(lua_code, php_bry); // always add to brack_hash; brack_hash = percent_hash + other characters } else { - brack_hash.Add_if_dupe_use_nth(lua_bry, php_bry); // replace percent_hash definitions + brack_map.Set(lua_code, php_bry); // replace percent_hash definitions } } } +class Lua_cls_to_regx_map { + private static final int MAX = Byte_ascii.Max_7_bit; + private final byte[][] map = new byte[MAX][]; + public byte[] Get_or_null(int code) { + return code < MAX ? map[code] : null; + } + public void Set(int code, byte[] val) { + map[code] = val; + } +} diff --git a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_converter_tst.java b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_converter_tst.java index 7c58f6b52..b914049f1 100644 --- a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_converter_tst.java +++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_converter_tst.java @@ -50,6 +50,9 @@ public class Scrib_regx_converter_tst { @Test public void Mbcs() { // PURPOSE: handle regex for multi-byte chars; PAGE:en.d:どう; DATE:2016-01-22; .NET.REGX:fails fxt.Test_replace("𠀀" , "[𠀀-𯨟]" , "a", "a"); } + @Test public void Invalid_range() {// PURPOSE: if range is invalid, take 1st char only; note range is multi-byte; ISSUE#:383; PAGE:en.d:dictionary DATE:2019-03-16 + fxt.Test_parse("[ড়-য়]" , "[ড়]"); // 2492-2479 + } // @Test public void Brack_empty_all() {fxt.Test_parse("[]" , "(?:(*FAIL))");} // @Test public void Brack_empty_not() {fxt.Test_parse("[^]" , ".");} } @@ -61,11 +64,11 @@ class Scrib_regx_converter_fxt { } } public void Test_parse(String raw, String expd) { - under.patternToRegex(Bry_.new_u8(raw), Scrib_regx_converter.Anchor_G); + under.patternToRegex(raw, Scrib_regx_converter.Anchor_G); Tfds.Eq(expd, under.Regx()); } public void Test_replace(String text, String find, String replace, String expd) { - String regex_str = under.patternToRegex(Bry_.new_u8(find), Scrib_regx_converter.Anchor_G); + String regex_str = under.patternToRegex(find, Scrib_regx_converter.Anchor_G); String actl = Regx_adp_.Replace(text, regex_str, replace); Tfds.Eq(expd, actl); }