diff --git a/100_core/src/gplx/core/intls/Utf8_.java b/100_core/src/gplx/core/intls/Utf8_.java index 15a94bea1..5e4135cd3 100644 --- a/100_core/src/gplx/core/intls/Utf8_.java +++ b/100_core/src/gplx/core/intls/Utf8_.java @@ -88,7 +88,7 @@ public class Utf8_ { for (int i = pos - 1; i >= stop; i--) { // start at pos - 1, and move bwd; NOTE: pos - 1 to skip pos, b/c pos will never definitively yield any char_len info byte b = bry[i]; int char_len = Len_of_char_by_1st_byte(b); - switch (char_len) { // if char_len is multi-byte and pos is at correct multi-byte pos (pos - i = # of bytes - 1), then pos0 found; EX: � = {226,130,172}; 172 is skipped; 130 has len of 1 -> continue; 226 has len of 3 and is found at correct pos for 3 byte char -> return + switch (char_len) { // if char_len is multi-byte and pos is at correct multi-byte pos (pos - i = # of bytes - 1), then pos0 found; EX: € = {226,130,172}; 172 is skipped; 130 has len of 1 -> continue; 226 has len of 3 and is found at correct pos for 3 byte char -> return case 2: if (pos - i == 1) return i; break; case 3: if (pos - i == 2) return i; break; case 4: if (pos - i == 3) return i; break; @@ -111,7 +111,7 @@ public class Utf8_ { // loop maximum of 4 times; note that UTF8 char has max of 4 bytes for (int i = 0; i < 4; i++) { int char_len = Len_of_char_by_1st_byte(b); - switch (char_len) { // if char_len is multi-byte and cur is at correct multi-byte pos (cur - i = # of bytes - 1), then pos0 found; EX: � = {226,130,172}; 172 is skipped; 130 has len of 1 -> continue; 226 has len of 3 and is found at correct cur for 3 byte char -> return + switch (char_len) { // if char_len is multi-byte and cur is at correct multi-byte pos (cur - i = # of bytes - 1), then pos0 found; EX: € = {226,130,172}; 172 is skipped; 130 has len of 1 -> continue; 226 has len of 3 and is found at correct cur for 3 byte char -> return case 2: if (i == 1) return pos; break; case 3: if (i == 2) return pos; break; case 4: if (i == 3) return pos; break; @@ 
-141,3 +141,48 @@ public class Utf8_ { , Codepoint_surrogate_end = 0xDFFF ; } +/* +== Definitions == +=== a7 vs u8 === +* a7 -> ASCII (7 bits) +* u8 -> UTF-8 (8-bit code units; 1 to 4 bytes per codepoint) + +In retrospect, better abbreviations would have been: +* ascii -> ASCII +* utf08 -> UTF-8 +* utf16 -> UTF-16 + +=== General === +==== Byte ==== +* Standard definition; 8 bits (2^8 or 256) + +==== Codepoint ==== +* Represents 1 atomic character but can be composed of multiple bytes +** Examples: +
+1 byte : "a"  (letter a)
+2 bytes: "¢"  (cent)
+3 bytes: "€"  (euro)
+4 bytes: "𤭢" (Chinese character)
+
+* Defined by unicode as a sequence of 4 hexadecimals (2 bytes) or 8 hexadecimals (4 bytes); REF:http://www.unicode.org +** 4 hexadecimal is 2 bytes (2^(4 * 4) -> 2^16) + +==== char ==== +* Java definition of a UTF-16 code unit which is encoded as 2 bytes (2^16 or 65,536); 1 codepoint is either 1 or 2 chars +* For Western languages: 1 codepoint equals 1 char (2 bytes); +** For example, chars like "a", "œ", "é" are 1 Java char +* For some Eastern languages: 1 codepoint can equal 2 chars (4 bytes); +** For example, chars like "𤭢" are 2 Java chars though they represent 1 conceptual codepoint (in English terms, "𤭢" is a single letter just like the letter "a"); note that most CJK chars such as "駣" are 3 bytes in UTF-8 but still only 1 Java char + +==== Supplementary characters ==== +* Represents a codepoint above U+FFFF, which is defined by 4 bytes in UTF-8 (a 3 byte codepoint is not supplementary) +* Is defined in Java / UTF-16 by 1 surrogate pair +** lo-surrogate : 2 bytes +** hi-surrogate : 2 bytes + +=== Conventions === +* Codepoints will be rendered as one int (4 bytes), not as 4 hexadecimals (2 bytes) or 8 hexadecimals (4 bytes) +* The "char" datatype will rarely be used in code; instead byte arrays or codepoint-ints will be used +* The "character" word will not be used in comments; instead the "codepoint" word will be used +*/ diff --git a/400_xowa/src/gplx/core/intls/String_surrogate_utl.java b/400_xowa/src/gplx/core/intls/String_surrogate_utl.java deleted file mode 100644 index 572562b62..000000000 --- a/400_xowa/src/gplx/core/intls/String_surrogate_utl.java +++ /dev/null @@ -1,34 +0,0 @@ -/* -XOWA: the XOWA Offline Wiki Application -Copyright (C) 2012-2017 gnosygnu@gmail.com - -XOWA is licensed under the terms of the General Public License (GPL) Version 3, -or alternatively under the terms of the Apache License Version 2.0. - -You may use XOWA according to either of these licenses as is most appropriate -for your project on a case-by-case basis. 
- -The terms of each license can be found in the source code repository: - -GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt -Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt -*/ -package gplx.core.intls; import gplx.*; import gplx.core.*; -public class String_surrogate_utl { - public int Byte_pos() {return byte_pos;} int byte_pos; - public int Count_surrogates__char_idx(byte[] src, int src_len, int byte_bgn, int char_idx) {return Count_surrogates(src, src_len, byte_bgn, Bool_.Y, char_idx);} - public int Count_surrogates__codepoint_idx1(byte[] src, int src_len, int byte_bgn, int codepoint_idx) {return Count_surrogates(src, src_len, byte_bgn, Bool_.N, codepoint_idx);} - private int Count_surrogates(byte[] src, int src_len, int byte_bgn, boolean stop_idx_is_char, int stop_idx) { - int char_count = 0, codepoint_count = 0; - byte_pos = byte_bgn; - while (true) { - if ( stop_idx == (stop_idx_is_char ? char_count : codepoint_count) // requested # of chars found - || byte_pos >= src_len // eos reached; DATE:2014-09-02 - ) return codepoint_count - char_count; - int char_len_in_bytes = gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(src[byte_pos]); - ++char_count; // char_count always incremented by 1 - codepoint_count += (char_len_in_bytes == 4) ? 2 : 1; // codepoint_count incremented by 2 if surrogate pair; else 1 - byte_pos += char_len_in_bytes; - } - } -} diff --git a/400_xowa/src/gplx/core/intls/String_surrogate_utl_tst.java b/400_xowa/src/gplx/core/intls/String_surrogate_utl_tst.java deleted file mode 100644 index ebccb83cd..000000000 --- a/400_xowa/src/gplx/core/intls/String_surrogate_utl_tst.java +++ /dev/null @@ -1,55 +0,0 @@ -/* -XOWA: the XOWA Offline Wiki Application -Copyright (C) 2012-2017 gnosygnu@gmail.com - -XOWA is licensed under the terms of the General Public License (GPL) Version 3, -or alternatively under the terms of the Apache License Version 2.0. 
- -You may use XOWA according to either of these licenses as is most appropriate -for your project on a case-by-case basis. - -The terms of each license can be found in the source code repository: - -GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt -Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt -*/ -package gplx.core.intls; import gplx.*; import gplx.core.*; -import org.junit.*; -public class String_surrogate_utl_tst { - @Before public void init() {fxt.Clear();} private String_surrogate_utl_fxt fxt = new String_surrogate_utl_fxt(); - @Test public void Char_idx() { - String test_str = "aé𡼾bî𡼾"; - fxt.Test_count_surrogates__char_idx (test_str, 0, 1, 0, 1); // a - fxt.Test_count_surrogates__char_idx (test_str, 0, 2, 0, 3); // aé - fxt.Test_count_surrogates__char_idx (test_str, 0, 3, 1, 7); // aé𡼾 - fxt.Test_count_surrogates__char_idx (test_str, 7, 1, 0, 8); // b - fxt.Test_count_surrogates__char_idx (test_str, 7, 2, 0, 10); // bî - fxt.Test_count_surrogates__char_idx (test_str, 7, 3, 1, 14); // bî𡼾 - fxt.Test_count_surrogates__char_idx (test_str, 0, 6, 2, 14); // aé𡼾bî𡼾 - fxt.Test_count_surrogates__char_idx (test_str, 14, 7, 0, 14); // PURPOSE: test out of bounds; DATE:2014-09-02 - } - @Test public void Codepoint_idx() { - String test_str = "aé𡼾bî𡼾"; - fxt.Test_count_surrogates__codepoint_idx (test_str, 0, 1, 0, 1); // a - fxt.Test_count_surrogates__codepoint_idx (test_str, 0, 2, 0, 3); // aé - fxt.Test_count_surrogates__codepoint_idx (test_str, 0, 4, 1, 7); // aé𡼾 - fxt.Test_count_surrogates__codepoint_idx (test_str, 7, 1, 0, 8); // b - fxt.Test_count_surrogates__codepoint_idx (test_str, 7, 2, 0, 10); // bî - fxt.Test_count_surrogates__codepoint_idx (test_str, 7, 4, 1, 14); // bî𡼾 - fxt.Test_count_surrogates__codepoint_idx (test_str, 0, 8, 2, 14); // aé𡼾bî𡼾 - } -} -class String_surrogate_utl_fxt { - private String_surrogate_utl codepoint_utl = new String_surrogate_utl(); - public void Clear() {} - public 
void Test_count_surrogates__char_idx(String src_str, int bgn_byte, int char_idx, int expd_count, int expd_pos) { - byte[] src_bry = Bry_.new_u8(src_str); int src_len = src_bry.length; - Tfds.Eq(expd_count , codepoint_utl.Count_surrogates__char_idx(src_bry, src_len, bgn_byte, char_idx)); - Tfds.Eq(expd_pos , codepoint_utl.Byte_pos()); - } - public void Test_count_surrogates__codepoint_idx(String src_str, int bgn_byte, int char_idx, int expd_count, int expd_pos) { - byte[] src_bry = Bry_.new_u8(src_str); int src_len = src_bry.length; - Tfds.Eq(expd_count , codepoint_utl.Count_surrogates__codepoint_idx1(src_bry, src_len, bgn_byte, char_idx), "count"); - Tfds.Eq(expd_pos , codepoint_utl.Byte_pos(), "pos"); - } -} diff --git a/400_xowa/src/gplx/core/intls/Utf16_mapper.java b/400_xowa/src/gplx/core/intls/Utf16_mapper.java new file mode 100644 index 000000000..dcad8c59d --- /dev/null +++ b/400_xowa/src/gplx/core/intls/Utf16_mapper.java @@ -0,0 +1,79 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012-2017 gnosygnu@gmail.com + +XOWA is licensed under the terms of the General Public License (GPL) Version 3, +or alternatively under the terms of the Apache License Version 2.0. + +You may use XOWA according to either of these licenses as is most appropriate +for your project on a case-by-case basis. 
+ +The terms of each license can be found in the source code repository: + +GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt +Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt +*/ +package gplx.core.intls; import gplx.*; import gplx.core.*; +public class Utf16_mapper { // maps positions within one String between UTF-8 bytes ("byte"), codepoints ("code") and Java UTF-16 chars ("char") + private final int[] ary; // Dims_total lookup tables stored back-to-back; each table is dim_len long and defaults to Invalid + private final int dim_len; // src_bry_len + 1 so that the end-of-String position can also be mapped + public byte[] Src_bry() {return src_bry;} private final byte[] src_bry; + public String Src_str() {return src_str;} private final String src_str; + public int Len_in_codes() {return len_in_codes;} private int len_in_codes; + public int Len_in_chars() {return len_in_chars;} private int len_in_chars; + public int Get_code_for_byte_or_neg1(int idx) {return idx >= 0 && idx < dim_len ? ary[(dim_len * Dims_code_for_byte) + idx] : Invalid;} // NOTE: idx >= 0 guard; a negative idx would otherwise index outside this table + public int Get_byte_for_code_or_neg1(int idx) {return idx >= 0 && idx < dim_len ? ary[(dim_len * Dims_byte_for_code) + idx] : Invalid;} + public int Get_code_for_char_or_neg1(int idx) {return idx >= 0 && idx < dim_len ? ary[(dim_len * Dims_code_for_char) + idx] : Invalid;} + public int Get_char_for_code_or_neg1(int idx) {return idx >= 0 && idx < dim_len ? 
ary[(dim_len * Dims_char_for_code) + idx] : Invalid;} + public int Get_code_for_byte_or_fail(int idx) {int rv = Get_code_for_byte_or_neg1(idx); if (rv == Invalid) throw Err_.new_wo_type("invalid idx", "src", src_bry, "type", "code_for_byte", "idx", idx); return rv;} // NOTE: must test rv (the mapped value), not idx (the lookup key); else an unmapped position is silently returned as -1 + public int Get_byte_for_code_or_fail(int idx) {int rv = Get_byte_for_code_or_neg1(idx); if (rv == Invalid) throw Err_.new_wo_type("invalid idx", "src", src_bry, "type", "byte_for_code", "idx", idx); return rv;} + public int Get_code_for_char_or_fail(int idx) {int rv = Get_code_for_char_or_neg1(idx); if (rv == Invalid) throw Err_.new_wo_type("invalid idx", "src", src_bry, "type", "code_for_char", "idx", idx); return rv;} + public int Get_char_for_code_or_fail(int idx) {int rv = Get_char_for_code_or_neg1(idx); if (rv == Invalid) throw Err_.new_wo_type("invalid idx", "src", src_bry, "type", "char_for_code", "idx", idx); return rv;} + public Utf16_mapper(String src_str, byte[] src_bry, int src_bry_len) { + // create ary + this.src_str = src_str; + this.src_bry = src_bry; + this.dim_len = src_bry_len + 1; // +1 to capture end + 1 + int ary_len = dim_len * Dims_total; + this.ary = new int[ary_len]; + for (int i = 0; i < ary_len; i++) + ary[i] = Invalid; + + // init + int pos_in_bytes = 0, pos_in_chars = 0, pos_in_codes = 0; + + // loop till EOS + while (true) { + // update + ary[(dim_len * Dims_code_for_byte) + pos_in_bytes] = pos_in_codes; + ary[(dim_len * Dims_byte_for_code) + pos_in_codes] = pos_in_bytes; + ary[(dim_len * Dims_code_for_char) + pos_in_chars] = pos_in_codes; + ary[(dim_len * Dims_char_for_code) + pos_in_codes] = pos_in_chars; + + // exit if EOS + if (pos_in_bytes >= src_bry_len) break; + + // get lengths + int cur_len_in_bytes = Utf8_.Len_of_char_by_1st_byte(src_bry[pos_in_bytes]); + int cur_len_in_chars = cur_len_in_bytes == 4 ? 
2 : 1; // NOTE: only 4-byte UTF-8 sequences (codepoints above U+FFFF) take a surrogate pair in a Java String; 1 to 3 byte sequences are 1 char + + // increment + pos_in_bytes += cur_len_in_bytes; + pos_in_chars += cur_len_in_chars; + pos_in_codes += 1; + } + + // set lens + this.len_in_codes = pos_in_codes; + this.len_in_chars = pos_in_chars; + } + + public static final int + Invalid = -1 + , Dims_total = 4 + , Dims_code_for_byte = 0 + , Dims_byte_for_code = 1 + , Dims_code_for_char = 2 + , Dims_char_for_code = 3 + ; +} diff --git a/400_xowa/src/gplx/core/intls/Utf16_mapper_tst.java b/400_xowa/src/gplx/core/intls/Utf16_mapper_tst.java new file mode 100644 index 000000000..d5bb074d1 --- /dev/null +++ b/400_xowa/src/gplx/core/intls/Utf16_mapper_tst.java @@ -0,0 +1,62 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012-2017 gnosygnu@gmail.com + +XOWA is licensed under the terms of the General Public License (GPL) Version 3, +or alternatively under the terms of the Apache License Version 2.0. + +You may use XOWA according to either of these licenses as is most appropriate +for your project on a case-by-case basis. 
+ +The terms of each license can be found in the source code repository: + +GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt +Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt +*/ +package gplx.core.intls; import gplx.*; import gplx.core.*; +import org.junit.*; import gplx.core.tests.*; +public class Utf16_mapper_tst { + private final Utf16_mapper_fxt fxt = new Utf16_mapper_fxt(); + @Test public void A() { + fxt.Test__map("a¢€𤭢" + , Int_ary_.New( 0, 1, -1, 2, -1, -1, 3, -1, -1, -1, 4) + , Int_ary_.New( 0, 1, 3, 6, 10, -1, -1, -1, -1, -1, -1) + , Int_ary_.New( 0, 1, 2, 3, -1, 4, -1, -1, -1, -1, -1) // "€" is 3 bytes but 1 char; only "𤭢" takes 2 chars (idx 3 and 4); idx 5 is end of String + , Int_ary_.New( 0, 1, 2, 3, 5, -1, -1, -1, -1, -1, -1) // len_in_chars is 5, the same as "a¢€𤭢".length() + ); + } +} +class Utf16_mapper_fxt { + public void Test__map(String src_str, int[] expd_code_for_byte, int[] expd_byte_for_code, int[] expd_code_for_char, int[] expd_char_for_code) { + byte[] src_bry = Bry_.new_u8(src_str); + int src_len = src_bry.length; + Utf16_mapper mapper = new Utf16_mapper(src_str, src_bry, src_len); + Test__ary(mapper, src_len, Utf16_mapper.Dims_code_for_byte, expd_code_for_byte); + Test__ary(mapper, src_len, Utf16_mapper.Dims_byte_for_code, expd_byte_for_code); + Test__ary(mapper, src_len, Utf16_mapper.Dims_code_for_char, expd_code_for_char); + Test__ary(mapper, src_len, Utf16_mapper.Dims_char_for_code, expd_char_for_code); + } + private void Test__ary(Utf16_mapper mapper, int src_len, int dim_type, int[] expd) { + int actl_len = src_len + 1; + int[] actl = new int[actl_len]; + for (int i = 0; i < actl_len; i++) { + int v = -1; + switch (dim_type) { + case Utf16_mapper.Dims_code_for_byte: + v = mapper.Get_code_for_byte_or_neg1(i); + break; + case Utf16_mapper.Dims_byte_for_code: + v = mapper.Get_byte_for_code_or_neg1(i); + break; + case Utf16_mapper.Dims_code_for_char: + v = 
mapper.Get_code_for_char_or_neg1(i); + break; + case Utf16_mapper.Dims_char_for_code: + v = mapper.Get_char_for_code_or_neg1(i); + break; + } + actl[i] = v; + 
} + Gftest.Eq__ary(expd, actl, Int_.To_str(dim_type)); + } +} diff --git a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring.java b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring.java index 5ecaea81e..efadd87d1 100644 --- a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring.java +++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring.java @@ -18,7 +18,6 @@ import gplx.core.intls.*; import gplx.langs.regxs.*; import gplx.xowa.parsers.*; import gplx.xowa.xtns.scribunto.procs.*; public class Scrib_lib_ustring implements Scrib_lib { - private final String_surrogate_utl surrogate_utl = new String_surrogate_utl(); public Scrib_lib_ustring(Scrib_core core) {this.core = core;} private Scrib_core core; public Scrib_lua_mod Mod() {return mod;} private Scrib_lua_mod mod; public int String_len_max() {return string_len_max;} public Scrib_lib_ustring String_len_max_(int v) {string_len_max = v; return this;} private int string_len_max = Xoa_page_.Page_len_max; @@ -48,74 +47,92 @@ public class Scrib_lib_ustring implements Scrib_lib { public static final String Invk_find = "find", Invk_match = "match", Invk_gmatch_init = "gmatch_init", Invk_gmatch_callback = "gmatch_callback", Invk_gsub = "gsub"; private static final String[] Proc_names = String_.Ary(Invk_find, Invk_match, Invk_gmatch_init, Invk_gmatch_callback, Invk_gsub); public boolean Find(Scrib_proc_args args, Scrib_proc_rslt rslt) { - String text_str = args.Xstr_str_or_null(0); - String regx = args.Pull_str(1); - int bgn_char_idx = args.Cast_int_or(2, 1); - boolean plain = args.Cast_bool_or_n(3); - synchronized (surrogate_utl) { - byte[] text_bry = Bry_.new_u8(text_str); int text_bry_len = text_bry.length; - bgn_char_idx = Bgn_adjust(text_str, bgn_char_idx); - - // regx of "" should return (bgn, bgn - 1) regardless of whether plain is true or false; - // NOTE: do not include surrogate calc; PAGE:en.d:佻 DATE:2017-04-24 - if (String_.Len_eq_0(regx)) // regx of "" should return (bgn, 
bgn - 1) regardless of whether plain is true or false - return rslt.Init_many_objs(bgn_char_idx + Scrib_lib_ustring.Base1, bgn_char_idx + Scrib_lib_ustring.Base1 - 1); - - // NOTE: adjust for 2-len chars (surrogates); PAGE:en.d:iglesia DATE:2017-04-23 - int bgn_adj = surrogate_utl.Count_surrogates__char_idx(text_bry, text_bry_len, 0, bgn_char_idx); // NOTE: convert from lua / php charidx to java regex codepoint; PAGE:zh.w:南北鐵路 (越南) DATE:2014-08-27 - int bgn_codepoint_idx = bgn_char_idx + bgn_adj; - int bgn_byte_pos = surrogate_utl.Byte_pos(); - if (plain) { - int pos = String_.FindFwd(text_str, regx, bgn_codepoint_idx); - boolean found = pos != Bry_find_.Not_found; - return found - ? rslt.Init_many_objs(pos + Scrib_lib_ustring.Base1, pos + Scrib_lib_ustring.Base1 + String_.Len(regx) - Scrib_lib_ustring.End_adj) - : rslt.Init_ary_empty() - ; - } - Scrib_regx_converter regx_converter = new Scrib_regx_converter(); - regx = regx_converter.patternToRegex(Bry_.new_u8(regx), Scrib_regx_converter.Anchor_G); - Regx_adp regx_adp = Scrib_lib_ustring.RegxAdp_new_(core.Ctx(), regx); - Regx_match[] regx_rslts = regx_adp.Match_all(text_str, bgn_codepoint_idx); // NOTE: MW calculates an offset to handle mb strings. 
however, java's regex always takes offset in chars (not bytes like PHP preg_match); DATE:2014-03-04 - int len = regx_rslts.length; - if (len == 0) return rslt.Init_ary_empty(); - List_adp tmp_list = List_adp_.New(); - Regx_match match = regx_rslts[0]; // NOTE: take only 1st result; DATE:2014-08-27 - int match_find_bgn_codepoint = match.Find_bgn(); // NOTE: java regex returns results in codepoint; PAGE:zh.w:南北鐵路 (越南) DATE:2014-08-27 - int match_find_bgn_adj = -surrogate_utl.Count_surrogates__codepoint_idx1(text_bry, text_bry_len, bgn_byte_pos, match_find_bgn_codepoint - bgn_codepoint_idx); // NOTE: convert from java regex codepoint to lua / php char_idx; PAGE:zh.w:南北鐵路 (越南) DATE:2014-08-27 - tmp_list.Add(match_find_bgn_codepoint + match_find_bgn_adj + -bgn_adj + Scrib_lib_ustring.Base1); - tmp_list.Add(match.Find_end() + match_find_bgn_adj + -bgn_adj + Scrib_lib_ustring.Base1 - Scrib_lib_ustring.End_adj); - //Tfds.Dbg (match_find_bgn_codepoint + match_find_bgn_adj + -bgn_adj + Scrib_lib_ustring.Base1 - // ,match.Find_end() + match_find_bgn_adj + -bgn_adj + Scrib_lib_ustring.Base1 - Scrib_lib_ustring.End_adj); - AddCapturesFromMatch(tmp_list, match, text_str, regx_converter.Capt_ary(), false); - return rslt.Init_many_list(tmp_list); + // get args + String text_str = args.Xstr_str_or_null(0); + String find_str = args.Pull_str(1); + int bgn_as_codes_base1 = args.Cast_int_or(2, 1); + boolean plain = args.Cast_bool_or_n(3); + + // init text vars + byte[] text_bry = Bry_.new_u8(text_str); + int text_bry_len = text_bry.length; + Utf16_mapper text_map = new Utf16_mapper(text_str, text_bry, text_bry_len); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23 + + // convert bgn from base_1 to base_0 + int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_map.Len_in_codes()); + + /* + int offset = 0; + if (bgn_as_codes > 0) { // NOTE: MW.BASE + // $offset = strlen( mb_substr( $s, 0, $init - 1, 'UTF-8' ) ); } - } - private int Bgn_adjust(String 
text, int bgn) { // adjust to handle bgn < 0 or bgn > len (which PHP allows) - if (bgn > 0) bgn -= Scrib_lib_ustring.Base1; - int text_len = String_.Len(text); - if (bgn < 0) // negative number means search from rear of String - bgn += text_len; // NOTE: PHP has extra + 1 for Base 1 - else if (bgn > text_len) // bgn > text_len; confine to text_len; NOTE: PHP has extra + 1 for Base 1 - bgn = text_len; // NOTE: PHP has extra + 1 for Base 1 - return bgn; + else { + bgn_as_codes_base1 = 0; // NOTE: MW.BASE1 + offset = 0; // -1? + } + */ + + // find_str of "" should return (bgn, bgn - 1) regardless of whether plain is true or false; + // NOTE: do not include surrogate calc; PAGE:en.d:佻 DATE:2017-04-24 + // NOTE: not in MW; is this needed? DATE:2019-02-24 + if (String_.Len_eq_0(find_str)) + return rslt.Init_many_objs(bgn_as_codes_base1, bgn_as_codes_base1 - 1); + + // if plain, just do literal match of find and exit + if (plain) { + // find pos by literal match + byte[] find_bry = Bry_.new_u8(find_str); + int pos = Bry_find_.Find_fwd(text_bry, find_bry, text_map.Get_byte_for_code_or_fail(bgn_as_codes)); + + // nothing found; return empty + if (pos == Bry_find_.Not_found) + return rslt.Init_ary_empty(); + + // bgn: convert pos from bytes back to codes; also adjust for base1 + int bgn = text_map.Get_code_for_byte_or_fail(pos) + Base1; + + // end: add find.Len_in_codes and adjust end for PHP/LUA + Utf16_mapper find_map = new Utf16_mapper(find_str, find_bry, find_bry.length); + int end = bgn + find_map.Len_in_codes() - End_adj; + + return rslt.Init_many_objs(bgn, end); + } + + // run regex + Scrib_regx_converter regx_converter = new Scrib_regx_converter(); + Regx_match[] regx_rslts = Run_regex_or_null(text_map, regx_converter, find_str, bgn_as_codes); + if (regx_rslts.length == 0) return rslt.Init_ary_empty(); + + // add to tmp_list + Regx_match match = regx_rslts[0]; // NOTE: take only 1st result; DATE:2014-08-27 + List_adp tmp_list = List_adp_.New(); + 
tmp_list.Add(text_map.Get_code_for_char_or_neg1(match.Find_bgn()) + Scrib_lib_ustring.Base1); + tmp_list.Add(text_map.Get_code_for_char_or_neg1(match.Find_end()) + Scrib_lib_ustring.Base1 - Scrib_lib_ustring.End_adj); + AddCapturesFromMatch(tmp_list, match, text_str, regx_converter.Capt_ary(), false); + return rslt.Init_many_list(tmp_list); } public boolean Match(Scrib_proc_args args, Scrib_proc_rslt rslt) { - String text = args.Xstr_str_or_null(0); // Module can pass raw ints; PAGE:en.w:Budget_of_the_European_Union; DATE:2015-01-22 - if (text == null) return rslt.Init_many_list(List_adp_.Noop); // if no text is passed, do not fail; return empty; EX:d:changed; DATE:2014-02-06 + // get args + String text_str = args.Xstr_str_or_null(0); // Module can pass raw ints; PAGE:en.w:Budget_of_the_European_Union; DATE:2015-01-22 + String find_str = args.Cast_str_or_null(1); + int bgn_as_codes_base1 = args.Cast_int_or(2, 1); + + // validate / adjust + if (text_str == null) // if no text_str is passed, do not fail; return empty; EX:d:changed; DATE:2014-02-06 + return rslt.Init_many_list(List_adp_.Noop); + byte[] text_bry = Bry_.new_u8(text_str); int text_bry_len = text_bry.length; + Utf16_mapper text_map = new Utf16_mapper(text_str, text_bry, text_bry_len); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23 + int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_map.Len_in_codes()); + + // run regex Scrib_regx_converter regx_converter = new Scrib_regx_converter(); - String regx = regx_converter.patternToRegex(args.Cast_bry_or_null(1), Scrib_regx_converter.Anchor_G); - int bgn = args.Cast_int_or(2, 1); - bgn = Bgn_adjust(text, bgn); - Regx_adp regx_adp = Scrib_lib_ustring.RegxAdp_new_(core.Ctx(), regx); - Regx_match[] regx_rslts = regx_adp.Match_all(text, bgn); - int len = regx_rslts.length; - if (len == 0) return rslt.Init_null(); // return null if no matches found; EX:w:Mount_Gambier_(volcano); DATE:2014-04-02; confirmed with en.d:民; 
DATE:2015-01-30 + Regx_match[] regx_rslts = Run_regex_or_null(text_map, regx_converter, find_str, bgn_as_codes); + if (regx_rslts.length == 0) return rslt.Init_null(); // return null if no matches found; EX:w:Mount_Gambier_(volcano); DATE:2014-04-02; confirmed with en.d:民; DATE:2015-01-30 // TOMBSTONE: add 1st match only; do not add all; PAGE:en.d:действительное_причастие_настоящего_времени DATE:2017-04-23 regx_rslts = regx_converter.Adjust_balanced(regx_rslts); List_adp tmp_list = List_adp_.New(); - AddCapturesFromMatch(tmp_list, regx_rslts[0], text, regx_converter.Capt_ary(), true); + AddCapturesFromMatch(tmp_list, regx_rslts[0], text_str, regx_converter.Capt_ary(), true); return rslt.Init_many_list(tmp_list); } public boolean Gsub(Scrib_proc_args args, Scrib_proc_rslt rslt) { @@ -143,6 +160,35 @@ public class Scrib_lib_ustring implements Scrib_lib { AddCapturesFromMatch(tmp_list, match, text, capt, true); // NOTE: was incorrectly set as false; DATE:2014-04-23 return rslt.Init_many_objs(match.Find_end(), Scrib_kv_utl_.base1_list_(tmp_list)); } + private int To_java_by_lua(int bgn_as_codes_base1, int len_in_codes) { + // convert bgn from base_1 to base_0 + int bgn_as_codes = bgn_as_codes_base1; + if (bgn_as_codes > 0) + bgn_as_codes -= Scrib_lib_ustring.Base1; + // TOMBSTONE: do not adjust negative numbers for base1; fails tests + // else if (bgn_as_codes < 0) bgn_as_codes += Scrib_lib_ustring.Base1; + + // adjust bgn for negative-numbers and large positive-numbers + // NOTE: MW uses mb_strlen which returns len of mb chars as 1; REF.PHP: http://php.net/manual/en/function.mb-strlen.php + // NOTE: MW does additional +1 for PHP.base_1. 
This is not needed for JAVA; noted below as IGNORE_BASE_1_ADJ + if (bgn_as_codes < 0) // negative number means search from rear of String + bgn_as_codes += len_in_codes; // NOTE:IGNORE_BASE_1_ADJ + else if (bgn_as_codes > len_in_codes) // bgn_as_codes > text_len; confine to text_len; NOTE:IGNORE_BASE_1_ADJ + bgn_as_codes = len_in_codes; // NOTE:IGNORE_BASE_1_ADJ + + // will be negative if Abs(bgn_as_codes) > text.length; ISSUE#:366; DATE:2019-02-23 + if (bgn_as_codes < 0) + bgn_as_codes = 0; + return bgn_as_codes; + } + private Regx_match[] Run_regex_or_null(Utf16_mapper text_map, Scrib_regx_converter regx_converter, String find_str, int bgn_as_codes) { // NOTE(review): nothing in this method returns null itself; both callers (Find, Match) read .length on the result without a null check, so presumably Match_all never returns null despite the "_or_null" suffix; confirm + // convert regex from lua to java + find_str = regx_converter.patternToRegex(Bry_.new_u8(find_str), Scrib_regx_converter.Anchor_G); + + // run regex; bgn is converted from a codepoint idx to a Java char idx before being passed as the offset + Regx_adp regx_adp = Scrib_lib_ustring.RegxAdp_new_(core.Ctx(), find_str); + return regx_adp.Match_all(text_map.Src_str(), text_map.Get_char_for_code_or_fail(bgn_as_codes)); // NOTE: MW calculates an offset to handle mb strings. however, java's regex always takes offset in chars (not bytes like PHP preg_match); DATE:2014-03-04 + } private void AddCapturesFromMatch(List_adp tmp_list, Regx_match rslt, String text, Keyval[] capts, boolean op_is_match) {// NOTE: this matches behavior in UstringLibrary.php!addCapturesFromMatch int capts_len = capts == null ? 
0 : capts.length; if (capts_len > 0) { // NOTE: changed from "grps_len > 0"; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02 @@ -171,6 +217,7 @@ public class Scrib_lib_ustring implements Scrib_lib { } return rv; } - private static final int Base1 = 1 + private static final int + Base1 = 1 , End_adj = 1; // lua / php uses "end" as <= not <; EX: "abc" and bgn=0, end= 1; for XOWA, this is "a"; for MW / PHP it is "ab" } diff --git a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__find__tst.java b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__find__tst.java index 76e289b3f..ea8fd048a 100644 --- a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__find__tst.java +++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__find__tst.java @@ -14,39 +14,107 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt */ package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*; -import org.junit.*; import gplx.xowa.xtns.scribunto.engines.mocks.*; +import org.junit.*; +import gplx.core.consoles.*; +import gplx.xowa.xtns.scribunto.engines.mocks.*; public class Scrib_lib_ustring__find__tst { - private final Mock_scrib_fxt fxt = new Mock_scrib_fxt(); private Scrib_lib lib; - @Before public void init() { - fxt.Clear(); - lib = fxt.Core().Lib_ustring().Init(); + private final Scrib_lib_ustring__find__fxt fxt = new Scrib_lib_ustring__find__fxt(); + @Test public void Plain() { + fxt.Test__find("aabaab" , "b" , 2, Bool_.Y, "3;3"); // bytes=1 + fxt.Test__find("€€b€€b" , "b" , 2, Bool_.Y, "3;3"); // bytes=3 + fxt.Test__find("𤭢𤭢b𤭢𤭢b" , "b" , 2, Bool_.Y, "3;3"); // bytes=4 + fxt.Test__find("()()" , "(" , 2, Bool_.Y, "3;3"); // exact match; note that "(" is invalid regx + fxt.Test__find("abcd" , "" , 2, Bool_.Y, "2;1"); // empty find should return values; 
EX:w:Fool's_mate; DATE:2014-03-04 + fxt.Test__find("a€b" , "€" , 1, Bool_.Y, "2;2"); // find is bytes=3 + } + @Test public void Bgn__negative() { + fxt.Test__find("abab" , "b" , -1, Bool_.Y, "4;4"); // search from back of String + fxt.Test__find("abab" , "b" , -9, Bool_.Y, "2;2"); // do not throw error if negative index > text.length; ISSUE#:366; DATE:2019-02-23 + fxt.Test__find("𤭢" , "𤭢" , -1, Bool_.Y, "1;1"); // bgn of -1 must count back 1 codepoint, not 1 Java char; "𤭢" is 2 Java chars } - @Test public void Basic() { - Exec_find("abcd" , "b" , 1, Bool_.N, "2;2"); // basic - Exec_find("abac" , "a" , 2, Bool_.N, "3;3"); // bgn - Exec_find("()()" , "(" , 2, Bool_.Y, "3;3"); // plain; note that ( would "break" regx - Exec_find("a bcd e" , "(b(c)d)" , 2, Bool_.N, "3;5;bcd;c"); // groups - Exec_find("a bcd e" , "()(b)" , 2, Bool_.N, "3;3;3;b"); // groups; empty capture - Exec_find("abcd" , "x" , 1, Bool_.N, ""); // empty - Exec_find("abcd" , "" , 2, Bool_.Y, "2;1"); // empty regx should return values; plain; EX:w:Fool's_mate; DATE:2014-03-04 - Exec_find("abcd" , "" , 2, Bool_.N, "2;1"); // empty regx should return values; regx; EX:w:Fool's_mate; DATE:2014-03-04 - Exec_find("abcd" , "^(c)" , 3, Bool_.N, "3;3;c"); // ^ should be converted to \G; regx; EX:cs.n:Category:1._září_2008; DATE:2014-05-07 + @Test public void Regx__simple() { + fxt.Test__find("abcd" , "b" , 1, Bool_.N, "2;2"); // basic + fxt.Test__find("abad" , "a" , 2, Bool_.N, "3;3"); // bgn + fxt.Test__find("abcd" , "x" , 1, Bool_.N, ""); // no-match + fxt.Test__find("abcd" , "" , 2, Bool_.N, "2;1"); // empty regx should return values; regx; EX:w:Fool's_mate; DATE:2014-03-04 } - @Test public void Arg_int() { // PURPOSE: allow int find; PAGE:ro.w:Innsbruck DATE:2015-09-12 - fxt.Test__proc__kvps__flat(lib, Scrib_lib_ustring.Invk_find, Scrib_kv_utl_.base1_many_(123, "2", 1, Bool_.N), "2;2"); + @Test public void Regx__int() { // PURPOSE: allow int find; PAGE:ro.w:Innsbruck DATE:2015-09-12 + fxt.Test__find(123 , 
"2" , 1, Bool_.N, "2;2"); + } - @Test public void Return_int() { - fxt.Test__proc__kvps__vals(lib, Scrib_lib_ustring.Invk_find, Scrib_kv_utl_.base1_many_("a", "()", 2, Bool_.N), 2, 1, 2); + @Test public void Regx__groups() { + fxt.Test__find("a bcd e" , "(b(c)d)" , 2, Bool_.N, "3;5;bcd;c"); // groups + fxt.Test__find("a bcd e" , "()(b)" , 2, Bool_.N, "3;3;3;b"); // groups; empty capture + } + @Test public void Regx__caret() { + fxt.Test__find("abcd" , "^(c)" , 3, Bool_.N, "3;3;c"); // ^ should be converted to \G; regx; EX:cs.n:Category:1._září_2008; DATE:2014-05-07 + } + @Test public void Regx__return_is_int() { + fxt.Test__find("a" , "()" , 2, Bool_.N, "2;1;2"); // NOTE(review): compared as a flat String; presumably this no longer asserts that the values are ints as the removed Return_int test did; confirm } @Test public void Surrogate__find__value() { // PURPOSE: handle surrogates in Find PAGE:zh.w:南北鐵路_(越南); DATE:2014-08-28 - Exec_find("aé𡼾\nbî𡼾\n" , "\n" , 1, Bool_.N, "4;4"); // 4 b/c \n starts at pos 4 (super 1) - Exec_find("aé𡼾\nbî𡼾\n" , "\n" , 5, Bool_.N, "8;8"); // 8 b/c \n starts at pos 8 (super 1) + fxt.Test__find("aé𡼾\nbî𡼾\n" , "\n" , 1, Bool_.N, "4;4"); // 4 b/c \n starts at pos 4 (super 1) + fxt.Test__find("aé𡼾\nbî𡼾\n" , "\n" , 5, Bool_.N, "8;8"); // 8 b/c \n starts at pos 8 (super 1) } @Test public void Surrogate__find__empty() { // PURPOSE: handle surrogates in Find PAGE:zh.w:南北鐵路_(越南); DATE:2014-08-28 - Exec_find("aé𡼾\nbî𡼾\n" , "" , 1, Bool_.N, "1;0"); // 4 b/c \n starts at pos 4 (super 1) -// Exec_find("aé𡼾\nbî𡼾\n" , "" , 5, Bool_.N, "8;8"); // 8 b/c \n starts at pos 8 (super 1) + fxt.Test__find("aé𡼾\nbî𡼾\n" , "" , 1, Bool_.N, "1;0"); // empty find returns (bgn, bgn - 1); bgn=1 + fxt.Test__find("aé𡼾\nbî𡼾\n" , "" , 5, Bool_.N, "5;4"); // empty find returns (bgn, bgn - 1); bgn=5 is returned as-is even though a surrogate precedes it + } +} +class Scrib_lib_ustring__find__fxt { + private boolean dbg = false; + private final Mock_scrib_fxt fxt = new Mock_scrib_fxt(); + 
private Scrib_lib lib; + public Scrib_lib_ustring__find__fxt() { + fxt.Clear(); + lib = fxt.Core().Lib_ustring().Init(); } - private void Exec_find(String text, String regx, int bgn, boolean plain, 
String expd) { + public Scrib_lib_ustring__find__fxt Dbg_y_() {dbg = Bool_.Y; return this;} + public Scrib_lib_ustring__find__fxt Dbg_n_() {dbg = Bool_.N; return this;} + public void Test__find(String text, String regx, int bgn, boolean plain, String expd) { + if (dbg) Console_adp__sys.Instance.Write_str(Bld_test_string(text, regx, bgn, plain, expd)); fxt.Test__proc__kvps__flat(lib, Scrib_lib_ustring.Invk_find, Scrib_kv_utl_.base1_many_(text, regx, bgn, plain), expd); } + public void Test__find(int text, String regx, int bgn, boolean plain, String expd) { + if (dbg) Console_adp__sys.Instance.Write_str(Bld_test_string(text, regx, bgn, plain, expd)); + fxt.Test__proc__kvps__flat(lib, Scrib_lib_ustring.Invk_find, Scrib_kv_utl_.base1_many_(text, regx, bgn, plain), expd); + } + private String Bld_test_string(Object text, String regx, int bgn, boolean plain, String expd) { + /* + {| class=wikitable + ! rslt !! expd !! actl !! code + |} + */ + String invk = "{{" + String_.Format("#invoke:Sandbox/Gnosygnu|ustring_find|{0}|{1}|{2}|{3}", Object_.Xto_str_strict_or_empty(text), regx, bgn, plain ? Bool_.True_str : Bool_.False_str) + "}}"; + Bry_bfr bfr = Bry_bfr_.New(); + bfr.Add_str_a7("|-\n"); + bfr.Add_str_u8("| {{#ifeq:" + invk + "|" + expd + "|pass|fail}}\n"); + bfr.Add_str_u8("| " + expd + "\n"); + bfr.Add_str_u8("| " + invk + "\n"); + bfr.Add_str_u8("| " + invk + "\n"); + return bfr.To_str(); + } } +/* +TEST: +* URL: https://en.wikipedia.org/wiki/Project:Sandbox +* CODE: +{{#invoke:Sandbox/Gnosygnu|ustring_find|abab|b|3|true}} + +MODULE: +* URL: https://en.wikipedia.org/wiki/Module:Sandbox/Gnosygnu +* CODE: +function p.ustring_find(frame) + local args = frame.args; + local rslt = {mw.ustring.find(args[1], args[2], tonumber(args[3]), args[4] == 'true')}; + + local rv = ''; + local rslt_len = #rslt; + for i=1,rslt_len do + if i ~= 1 then + rv = rv .. ';' + end + rv = rv .. rslt[i] + end + return rv; +end +*/