Scribunto: Do not fail in ustring.find if negative bgn is large [#366]

pull/620/head
gnosygnu 5 years ago
parent 91cbb34fa5
commit 3fd759b020

@ -88,7 +88,7 @@ public class Utf8_ {
for (int i = pos - 1; i >= stop; i--) { // start at pos - 1, and move bwd; NOTE: pos - 1 to skip pos, b/c pos will never definitively yield any char_len info
byte b = bry[i];
int char_len = Len_of_char_by_1st_byte(b);
switch (char_len) { // if char_len is multi-byte and pos is at correct multi-byte pos (pos - i = # of bytes - 1), then pos0 found; EX: <EFBFBD> = {226,130,172}; 172 is skipped; 130 has len of 1 -> continue; 226 has len of 3 and is found at correct pos for 3 byte char -> return
switch (char_len) { // if char_len is multi-byte and pos is at correct multi-byte pos (pos - i = # of bytes - 1), then pos0 found; EX: = {226,130,172}; 172 is skipped; 130 has len of 1 -> continue; 226 has len of 3 and is found at correct pos for 3 byte char -> return
case 2: if (pos - i == 1) return i; break;
case 3: if (pos - i == 2) return i; break;
case 4: if (pos - i == 3) return i; break;
@ -111,7 +111,7 @@ public class Utf8_ {
// loop maximum of 4 times; note that UTF8 char has max of 4 bytes
for (int i = 0; i < 4; i++) {
int char_len = Len_of_char_by_1st_byte(b);
switch (char_len) { // if char_len is multi-byte and cur is at correct multi-byte pos (cur - i = # of bytes - 1), then pos0 found; EX: <EFBFBD> = {226,130,172}; 172 is skipped; 130 has len of 1 -> continue; 226 has len of 3 and is found at correct cur for 3 byte char -> return
switch (char_len) { // if char_len is multi-byte and cur is at correct multi-byte pos (cur - i = # of bytes - 1), then pos0 found; EX: = {226,130,172}; 172 is skipped; 130 has len of 1 -> continue; 226 has len of 3 and is found at correct cur for 3 byte char -> return
case 2: if (i == 1) return pos; break;
case 3: if (i == 2) return pos; break;
case 4: if (i == 3) return pos; break;
@ -141,3 +141,48 @@ public class Utf8_ {
, Codepoint_surrogate_end = 0xDFFF
;
}
/*
== Definitions ==
=== a7 vs u8 ===
* a7 -> ASCII (7 bits)
* u8 -> UTF-8 (8 bits)
In retrospect, better abbreviations would have been:
* ascii -> ASCII
* utf08 -> UTF-8
* utf16 -> UTF-16
=== General ===
==== Byte ====
* Standard definition; 8 bits (2^8 or 256)
==== Codepoint ====
* Represents 1 atomic character but can be composed of multiple bytes
** Examples:
<pre>
1 byte : "a" (letter a)
2 bytes: "¢" (cent)
3 bytes: "€" (euro)
4 bytes: "𤭢" (Chinese character)
</pre>
* Defined by unicode as a sequence of 4 hexadecimals (2 bytes) or 8 hexadecimals (4 bytes); REF:http://www.unicode.org
** 4 hexadecimal is 2 bytes (2^(4 * 4) -> 2^16)
==== char ====
* Java definition of a codepoint which is encoded as 2 bytes (2^16 or 65,536)
* For Western languages: 1 codepoint equals 1 char (2 bytes);
** For example, chars like "a", "œ", "é" are 1 Java char
* For Eastern languages: 1 codepoint can equal 2 chars (4 bytes);
** For example, chars like "駣" are 2 Java chars though they represent 1 conceptual codepoint (in English terms, "駣" is a single letter just like the letter "a")
==== Supplementary characters ====
* Represents a codepoint which is defined by 3 or 4 bytes
* Is defined by 1 surrogate pair
** lo-surrogate : 2 bytes
** hi-surrogate : 2 bytes
=== Conventions ===
* Codepoints will be rendered as one int (4 bytes), not as 4 hexadecimals (2 bytes) or 8 hexadecimals (4 bytes)
* The "char" datatype will rarely be used in code; instead byte arrays or codepoint-ints will be used
* The "character" word will not be used in comments; instead the "codepoint" word will be used
*/

@ -1,34 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.core.intls; import gplx.*; import gplx.core.*;
/**
 * Walks UTF-8 bytes one sequence at a time and reports how far the two running
 * counters kept by this class have drifted apart (they differ only for 4-byte
 * sequences, which are counted as 2 on one side and 1 on the other).
 * The byte offset reached by the last walk is exposed through Byte_pos().
 */
public class String_surrogate_utl {
	int byte_pos;

	/** Byte offset at which the most recent count stopped. */
	public int Byte_pos() {return byte_pos;}

	/** Counts until char_idx sequences have been consumed (or the end of src is reached). */
	public int Count_surrogates__char_idx(byte[] src, int src_len, int byte_bgn, int char_idx) {
		return Count_surrogates(src, src_len, byte_bgn, Bool_.Y, char_idx);
	}

	/** Counts until the 2-per-4-byte-sequence counter reaches codepoint_idx (or the end of src is reached). */
	public int Count_surrogates__codepoint_idx1(byte[] src, int src_len, int byte_bgn, int codepoint_idx) {
		return Count_surrogates(src, src_len, byte_bgn, Bool_.N, codepoint_idx);
	}

	/**
	 * Shared walker.
	 * @param stop_idx_is_char  when true, stop_idx is compared against the 1-per-sequence counter;
	 *                          otherwise against the counter that adds 2 for every 4-byte sequence
	 * @return (2-per-4-byte counter) - (1-per-sequence counter) at the point where the walk stopped
	 */
	private int Count_surrogates(byte[] src, int src_len, int byte_bgn, boolean stop_idx_is_char, int stop_idx) {
		int seqs_seen = 0;		// +1 for every UTF-8 sequence
		int units_seen = 0;		// +2 for a 4-byte sequence; else +1
		byte_pos = byte_bgn;
		while (byte_pos < src_len) {	// end of source reached; DATE:2014-09-02
			int reached = stop_idx_is_char ? seqs_seen : units_seen;
			if (reached == stop_idx) break;	// requested # found
			int seq_len = gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(src[byte_pos]);
			seqs_seen++;
			units_seen += (seq_len == 4) ? 2 : 1;
			byte_pos += seq_len;
		}
		return units_seen - seqs_seen;
	}
}

@ -1,55 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.core.intls; import gplx.*; import gplx.core.*;
import org.junit.*;
/**
 * Unit tests for String_surrogate_utl.
 * Argument order for every fixture call is: (text, bgn_byte, stop_idx, expd_count, expd_byte_pos).
 */
public class String_surrogate_utl_tst {
	// byte layout: a=1, é=2, 𡼾=4 (7 bytes), then b=1, î=2, 𡼾=4 (14 bytes total)
	private static final String Test_str = "aé𡼾bî𡼾";
	private String_surrogate_utl_fxt fxt = new String_surrogate_utl_fxt();

	@Before public void init() {
		fxt.Clear();
	}
	@Test public void Char_idx() {
		String text = Test_str;
		fxt.Test_count_surrogates__char_idx(text,  0, 1, 0,  1);	// a
		fxt.Test_count_surrogates__char_idx(text,  0, 2, 0,  3);	// aé
		fxt.Test_count_surrogates__char_idx(text,  0, 3, 1,  7);	// aé𡼾
		fxt.Test_count_surrogates__char_idx(text,  7, 1, 0,  8);	// b
		fxt.Test_count_surrogates__char_idx(text,  7, 2, 0, 10);	// bî
		fxt.Test_count_surrogates__char_idx(text,  7, 3, 1, 14);	// bî𡼾
		fxt.Test_count_surrogates__char_idx(text,  0, 6, 2, 14);	// aé𡼾bî𡼾
		fxt.Test_count_surrogates__char_idx(text, 14, 7, 0, 14);	// PURPOSE: test out of bounds; DATE:2014-09-02
	}
	@Test public void Codepoint_idx() {
		String text = Test_str;
		fxt.Test_count_surrogates__codepoint_idx(text, 0, 1, 0,  1);	// a
		fxt.Test_count_surrogates__codepoint_idx(text, 0, 2, 0,  3);	// aé
		fxt.Test_count_surrogates__codepoint_idx(text, 0, 4, 1,  7);	// aé𡼾
		fxt.Test_count_surrogates__codepoint_idx(text, 7, 1, 0,  8);	// b
		fxt.Test_count_surrogates__codepoint_idx(text, 7, 2, 0, 10);	// bî
		fxt.Test_count_surrogates__codepoint_idx(text, 7, 4, 1, 14);	// bî𡼾
		fxt.Test_count_surrogates__codepoint_idx(text, 0, 8, 2, 14);	// aé𡼾bî𡼾
	}
}
/** Test fixture: encodes the text as UTF-8, runs one count, then checks both the count and the resulting byte position. */
class String_surrogate_utl_fxt {
	private String_surrogate_utl codepoint_utl = new String_surrogate_utl();

	/** No per-test state to reset. */
	public void Clear() {}

	public void Test_count_surrogates__char_idx(String src_str, int bgn_byte, int char_idx, int expd_count, int expd_pos) {
		byte[] src_bry = Bry_.new_u8(src_str);
		// the count must run first: Byte_pos() reports where that run stopped
		int actl_count = codepoint_utl.Count_surrogates__char_idx(src_bry, src_bry.length, bgn_byte, char_idx);
		Tfds.Eq(expd_count, actl_count);
		Tfds.Eq(expd_pos, codepoint_utl.Byte_pos());
	}

	public void Test_count_surrogates__codepoint_idx(String src_str, int bgn_byte, int char_idx, int expd_count, int expd_pos) {
		byte[] src_bry = Bry_.new_u8(src_str);
		// the count must run first: Byte_pos() reports where that run stopped
		int actl_count = codepoint_utl.Count_surrogates__codepoint_idx1(src_bry, src_bry.length, bgn_byte, char_idx);
		Tfds.Eq(expd_count, actl_count, "count");
		Tfds.Eq(expd_pos, codepoint_utl.Byte_pos(), "pos");
	}
}

@ -0,0 +1,79 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.core.intls; import gplx.*; import gplx.core.*;
/**
 * Precomputed lookup tables that translate positions in one piece of text between three index spaces:
 * <ul>
 * <li>byte: offset into the UTF-8 byte array (src_bry)</li>
 * <li>code: ordinal of the UTF-8 sequence (advances by 1 per sequence)</li>
 * <li>char: offset in "char" units as computed by the constructor (advances by 1 or 2 per sequence)</li>
 * </ul>
 * All four tables are stored back to back in one int array; each table has (src_bry_len + 1) slots so that
 * the end-of-text position can be mapped as well. Slots that do not sit on a sequence boundary hold Invalid.
 */
public class Utf16_mapper {
	private final int[] ary;		// Dims_total tables of dim_len slots each
	private final int dim_len;		// slots per table; src_bry_len + 1
	private final byte[] src_bry;
	private final String src_str;
	private int len_in_codes;
	private int len_in_chars;

	public byte[] Src_bry() {return src_bry;}
	public String Src_str() {return src_str;}
	/** Number of UTF-8 sequences counted by the constructor. */
	public int Len_in_codes() {return len_in_codes;}
	/** Total length in "char" units as counted by the constructor. */
	public int Len_in_chars() {return len_in_chars;}

	// lookups that return Invalid (-1) when idx is out of range or not on a sequence boundary
	public int Get_code_for_byte_or_neg1(int idx) {return Get_or_neg1(Dims_code_for_byte, idx);}
	public int Get_byte_for_code_or_neg1(int idx) {return Get_or_neg1(Dims_byte_for_code, idx);}
	public int Get_code_for_char_or_neg1(int idx) {return Get_or_neg1(Dims_code_for_char, idx);}
	public int Get_char_for_code_or_neg1(int idx) {return Get_or_neg1(Dims_char_for_code, idx);}

	// lookups that throw when the corresponding *_or_neg1 call would have returned Invalid
	public int Get_code_for_byte_or_fail(int idx) {return Get_or_fail(Dims_code_for_byte, "code_for_byte", idx);}
	public int Get_byte_for_code_or_fail(int idx) {return Get_or_fail(Dims_byte_for_code, "byte_for_code", idx);}
	public int Get_code_for_char_or_fail(int idx) {return Get_or_fail(Dims_code_for_char, "code_for_char", idx);}
	public int Get_char_for_code_or_fail(int idx) {return Get_or_fail(Dims_char_for_code, "char_for_code", idx);}

	/**
	 * @param src_str      text as a String; only stored and handed back by Src_str()
	 * @param src_bry      text as bytes; each sequence length is taken from Utf8_.Len_of_char_by_1st_byte
	 * @param src_bry_len  number of bytes of src_bry to map
	 */
	public Utf16_mapper(String src_str, byte[] src_bry, int src_bry_len) {
		// create ary and mark every slot as unmapped
		this.src_str = src_str;
		this.src_bry = src_bry;
		this.dim_len = src_bry_len + 1; // +1 to capture end + 1
		int ary_len = dim_len * Dims_total;
		this.ary = new int[ary_len];
		for (int i = 0; i < ary_len; i++)
			ary[i] = Invalid;

		// init
		int pos_in_bytes = 0, pos_in_chars = 0, pos_in_codes = 0;

		// loop till EOS; the update runs once more at EOS so that the end position is mapped too
		while (true) {
			// update
			ary[(dim_len * Dims_code_for_byte) + pos_in_bytes] = pos_in_codes;
			ary[(dim_len * Dims_byte_for_code) + pos_in_codes] = pos_in_bytes;
			ary[(dim_len * Dims_code_for_char) + pos_in_chars] = pos_in_codes;
			ary[(dim_len * Dims_char_for_code) + pos_in_codes] = pos_in_chars;

			// exit if EOS
			if (pos_in_bytes >= src_bry_len) break;

			// get lengths
			int cur_len_in_bytes = Utf8_.Len_of_char_by_1st_byte(src_bry[pos_in_bytes]);
			// NOTE(review): 3-byte UTF-8 sequences encode BMP codepoints, which occupy 1 Java char; only 4-byte
			// sequences need a surrogate pair. "> 2" therefore looks like it should be "== 4" -- confirm, and
			// change together with the expectations in Utf16_mapper_tst, which currently pin the "> 2" behavior.
			int cur_len_in_chars = cur_len_in_bytes > 2 ? 2 : 1;

			// increment
			pos_in_bytes += cur_len_in_bytes;
			pos_in_chars += cur_len_in_chars;
			pos_in_codes += 1;
		}

		// set lens
		this.len_in_codes = pos_in_codes;
		this.len_in_chars = pos_in_chars;
	}

	/** Reads one slot of table dim; any idx outside [0, dim_len) yields Invalid instead of touching a neighboring table. */
	private int Get_or_neg1(int dim, int idx) {
		return (idx >= 0 && idx < dim_len) ? ary[(dim_len * dim) + idx] : Invalid;
	}

	/** Same as Get_or_neg1, but throws when the looked-up value (not the requested idx) is Invalid. */
	private int Get_or_fail(int dim, String type, int idx) {
		int rv = Get_or_neg1(dim, idx);
		if (rv == Invalid) throw Err_.new_wo_type("invalid idx", "src", src_bry, "type", type, "idx", idx);
		return rv;
	}

	public static final int
	  Invalid = -1
	, Dims_total = 4
	, Dims_code_for_byte = 0
	, Dims_byte_for_code = 1
	, Dims_code_for_char = 2
	, Dims_char_for_code = 3
	;
}

@ -0,0 +1,62 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.core.intls; import gplx.*; import gplx.core.*;
import org.junit.*; import gplx.core.tests.*;
/** Unit test for Utf16_mapper: checks all four lookup tables for one string holding 1-, 2-, 3- and 4-byte sequences. */
public class Utf16_mapper_tst {
	private final Utf16_mapper_fxt fxt = new Utf16_mapper_fxt();

	@Test public void A() {
		// byte layout: a=0; ¢=1-2; €=3-5; 𤭢=6-9; end=10
		int[] expd_code_for_byte = Int_ary_.New( 0,  1, -1,  2, -1, -1,  3, -1, -1, -1,  4);
		int[] expd_byte_for_code = Int_ary_.New( 0,  1,  3,  6, 10, -1, -1, -1, -1, -1, -1);
		// NOTE(review): the two char tables below expect the 3-byte "€" to take 2 chars (end at char 6);
		// in a Java String it takes 1 char -- confirm whether these expectations are intended
		int[] expd_code_for_char = Int_ary_.New( 0,  1,  2, -1,  3, -1,  4, -1, -1, -1, -1);
		int[] expd_char_for_code = Int_ary_.New( 0,  1,  2,  4,  6, -1, -1, -1, -1, -1, -1);

		fxt.Test__map("a¢€𤭢", expd_code_for_byte, expd_byte_for_code, expd_code_for_char, expd_char_for_code);
	}
}
/** Test fixture: builds a Utf16_mapper for a string and compares each of its four tables with the expected array. */
class Utf16_mapper_fxt {
	public void Test__map(String src_str, int[] expd_code_for_byte, int[] expd_byte_for_code, int[] expd_code_for_char, int[] expd_char_for_code) {
		byte[] src_bry = Bry_.new_u8(src_str);
		int src_len = src_bry.length;
		Utf16_mapper mapper = new Utf16_mapper(src_str, src_bry, src_len);

		Test__ary(mapper, src_len, Utf16_mapper.Dims_code_for_byte, expd_code_for_byte);
		Test__ary(mapper, src_len, Utf16_mapper.Dims_byte_for_code, expd_byte_for_code);
		Test__ary(mapper, src_len, Utf16_mapper.Dims_code_for_char, expd_code_for_char);
		Test__ary(mapper, src_len, Utf16_mapper.Dims_char_for_code, expd_char_for_code);
	}

	/** Reads slots 0..src_len (inclusive) of one table through the public *_or_neg1 getter and compares with expd. */
	private void Test__ary(Utf16_mapper mapper, int src_len, int dim_type, int[] expd) {
		int[] actl = new int[src_len + 1];
		for (int idx = 0; idx < actl.length; idx++)
			actl[idx] = Get_or_neg1(mapper, dim_type, idx);
		Gftest.Eq__ary(expd, actl, Int_.To_str(dim_type));
	}

	/** Dispatches to the getter that belongs to dim_type; an unknown dim_type yields -1. */
	private static int Get_or_neg1(Utf16_mapper mapper, int dim_type, int idx) {
		switch (dim_type) {
			case Utf16_mapper.Dims_code_for_byte: return mapper.Get_code_for_byte_or_neg1(idx);
			case Utf16_mapper.Dims_byte_for_code: return mapper.Get_byte_for_code_or_neg1(idx);
			case Utf16_mapper.Dims_code_for_char: return mapper.Get_code_for_char_or_neg1(idx);
			case Utf16_mapper.Dims_char_for_code: return mapper.Get_char_for_code_or_neg1(idx);
			default:                              return -1;
		}
	}
}

@ -18,7 +18,6 @@ import gplx.core.intls.*; import gplx.langs.regxs.*;
import gplx.xowa.parsers.*;
import gplx.xowa.xtns.scribunto.procs.*;
public class Scrib_lib_ustring implements Scrib_lib {
private final String_surrogate_utl surrogate_utl = new String_surrogate_utl();
public Scrib_lib_ustring(Scrib_core core) {this.core = core;} private Scrib_core core;
public Scrib_lua_mod Mod() {return mod;} private Scrib_lua_mod mod;
public int String_len_max() {return string_len_max;} public Scrib_lib_ustring String_len_max_(int v) {string_len_max = v; return this;} private int string_len_max = Xoa_page_.Page_len_max;
@ -48,74 +47,92 @@ public class Scrib_lib_ustring implements Scrib_lib {
public static final String Invk_find = "find", Invk_match = "match", Invk_gmatch_init = "gmatch_init", Invk_gmatch_callback = "gmatch_callback", Invk_gsub = "gsub";
private static final String[] Proc_names = String_.Ary(Invk_find, Invk_match, Invk_gmatch_init, Invk_gmatch_callback, Invk_gsub);
public boolean Find(Scrib_proc_args args, Scrib_proc_rslt rslt) {
String text_str = args.Xstr_str_or_null(0);
String regx = args.Pull_str(1);
int bgn_char_idx = args.Cast_int_or(2, 1);
boolean plain = args.Cast_bool_or_n(3);
synchronized (surrogate_utl) {
byte[] text_bry = Bry_.new_u8(text_str); int text_bry_len = text_bry.length;
bgn_char_idx = Bgn_adjust(text_str, bgn_char_idx);
// regx of "" should return (bgn, bgn - 1) regardless of whether plain is true or false;
// NOTE: do not include surrogate calc; PAGE:en.d:佻 DATE:2017-04-24
if (String_.Len_eq_0(regx)) // regx of "" should return (bgn, bgn - 1) regardless of whether plain is true or false
return rslt.Init_many_objs(bgn_char_idx + Scrib_lib_ustring.Base1, bgn_char_idx + Scrib_lib_ustring.Base1 - 1);
// NOTE: adjust for 2-len chars (surrogates); PAGE:en.d:iglesia DATE:2017-04-23
int bgn_adj = surrogate_utl.Count_surrogates__char_idx(text_bry, text_bry_len, 0, bgn_char_idx); // NOTE: convert from lua / php charidx to java regex codepoint; PAGE:zh.w:南北鐵路 (越南) DATE:2014-08-27
int bgn_codepoint_idx = bgn_char_idx + bgn_adj;
int bgn_byte_pos = surrogate_utl.Byte_pos();
if (plain) {
int pos = String_.FindFwd(text_str, regx, bgn_codepoint_idx);
boolean found = pos != Bry_find_.Not_found;
return found
? rslt.Init_many_objs(pos + Scrib_lib_ustring.Base1, pos + Scrib_lib_ustring.Base1 + String_.Len(regx) - Scrib_lib_ustring.End_adj)
: rslt.Init_ary_empty()
;
}
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
regx = regx_converter.patternToRegex(Bry_.new_u8(regx), Scrib_regx_converter.Anchor_G);
Regx_adp regx_adp = Scrib_lib_ustring.RegxAdp_new_(core.Ctx(), regx);
Regx_match[] regx_rslts = regx_adp.Match_all(text_str, bgn_codepoint_idx); // NOTE: MW calculates an offset to handle mb strings. however, java's regex always takes offset in chars (not bytes like PHP preg_match); DATE:2014-03-04
int len = regx_rslts.length;
if (len == 0) return rslt.Init_ary_empty();
List_adp tmp_list = List_adp_.New();
Regx_match match = regx_rslts[0]; // NOTE: take only 1st result; DATE:2014-08-27
int match_find_bgn_codepoint = match.Find_bgn(); // NOTE: java regex returns results in codepoint; PAGE:zh.w:南北鐵路 (越南) DATE:2014-08-27
int match_find_bgn_adj = -surrogate_utl.Count_surrogates__codepoint_idx1(text_bry, text_bry_len, bgn_byte_pos, match_find_bgn_codepoint - bgn_codepoint_idx); // NOTE: convert from java regex codepoint to lua / php char_idx; PAGE:zh.w:南北鐵路 (越南) DATE:2014-08-27
tmp_list.Add(match_find_bgn_codepoint + match_find_bgn_adj + -bgn_adj + Scrib_lib_ustring.Base1);
tmp_list.Add(match.Find_end() + match_find_bgn_adj + -bgn_adj + Scrib_lib_ustring.Base1 - Scrib_lib_ustring.End_adj);
//Tfds.Dbg (match_find_bgn_codepoint + match_find_bgn_adj + -bgn_adj + Scrib_lib_ustring.Base1
// ,match.Find_end() + match_find_bgn_adj + -bgn_adj + Scrib_lib_ustring.Base1 - Scrib_lib_ustring.End_adj);
AddCapturesFromMatch(tmp_list, match, text_str, regx_converter.Capt_ary(), false);
return rslt.Init_many_list(tmp_list);
// get args
String text_str = args.Xstr_str_or_null(0);
String find_str = args.Pull_str(1);
int bgn_as_codes_base1 = args.Cast_int_or(2, 1);
boolean plain = args.Cast_bool_or_n(3);
// init text vars
byte[] text_bry = Bry_.new_u8(text_str);
int text_bry_len = text_bry.length;
Utf16_mapper text_map = new Utf16_mapper(text_str, text_bry, text_bry_len); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23
// convert bgn from base_1 to base_0
int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_map.Len_in_codes());
/*
int offset = 0;
if (bgn_as_codes > 0) { // NOTE: MW.BASE
// $offset = strlen( mb_substr( $s, 0, $init - 1, 'UTF-8' ) );
}
}
private int Bgn_adjust(String text, int bgn) { // adjust to handle bgn < 0 or bgn > len (which PHP allows)
if (bgn > 0) bgn -= Scrib_lib_ustring.Base1;
int text_len = String_.Len(text);
if (bgn < 0) // negative number means search from rear of String
bgn += text_len; // NOTE: PHP has extra + 1 for Base 1
else if (bgn > text_len) // bgn > text_len; confine to text_len; NOTE: PHP has extra + 1 for Base 1
bgn = text_len; // NOTE: PHP has extra + 1 for Base 1
return bgn;
else {
bgn_as_codes_base1 = 0; // NOTE: MW.BASE1
offset = 0; // -1?
}
*/
// find_str of "" should return (bgn, bgn - 1) regardless of whether plain is true or false;
// NOTE: do not include surrogate calc; PAGE:en.d:佻 DATE:2017-04-24
// NOTE: not in MW; is this needed? DATE:2019-02-24
if (String_.Len_eq_0(find_str))
return rslt.Init_many_objs(bgn_as_codes_base1, bgn_as_codes_base1 - 1);
// if plain, just do literal match of find and exit
if (plain) {
// find pos by literal match
byte[] find_bry = Bry_.new_u8(find_str);
int pos = Bry_find_.Find_fwd(text_bry, find_bry, text_map.Get_byte_for_code_or_fail(bgn_as_codes));
// nothing found; return empty
if (pos == Bry_find_.Not_found)
return rslt.Init_ary_empty();
// bgn: convert pos from bytes back to codes; also adjust for base1
int bgn = text_map.Get_code_for_byte_or_fail(pos) + Base1;
// end: add find.Len_in_codes and adjust end for PHP/LUA
Utf16_mapper find_map = new Utf16_mapper(find_str, find_bry, find_bry.length);
int end = bgn + find_map.Len_in_codes() - End_adj;
return rslt.Init_many_objs(bgn, end);
}
// run regex
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
Regx_match[] regx_rslts = Run_regex_or_null(text_map, regx_converter, find_str, bgn_as_codes);
if (regx_rslts.length == 0) return rslt.Init_ary_empty();
// add to tmp_list
Regx_match match = regx_rslts[0]; // NOTE: take only 1st result; DATE:2014-08-27
List_adp tmp_list = List_adp_.New();
tmp_list.Add(text_map.Get_code_for_char_or_neg1(match.Find_bgn()) + Scrib_lib_ustring.Base1);
tmp_list.Add(text_map.Get_code_for_char_or_neg1(match.Find_end()) + Scrib_lib_ustring.Base1 - Scrib_lib_ustring.End_adj);
AddCapturesFromMatch(tmp_list, match, text_str, regx_converter.Capt_ary(), false);
return rslt.Init_many_list(tmp_list);
}
public boolean Match(Scrib_proc_args args, Scrib_proc_rslt rslt) {
String text = args.Xstr_str_or_null(0); // Module can pass raw ints; PAGE:en.w:Budget_of_the_European_Union; DATE:2015-01-22
if (text == null) return rslt.Init_many_list(List_adp_.Noop); // if no text is passed, do not fail; return empty; EX:d:changed; DATE:2014-02-06
// get args
String text_str = args.Xstr_str_or_null(0); // Module can pass raw ints; PAGE:en.w:Budget_of_the_European_Union; DATE:2015-01-22
String find_str = args.Cast_str_or_null(1);
int bgn_as_codes_base1 = args.Cast_int_or(2, 1);
// validate / adjust
if (text_str == null) // if no text_str is passed, do not fail; return empty; EX:d:changed; DATE:2014-02-06
return rslt.Init_many_list(List_adp_.Noop);
byte[] text_bry = Bry_.new_u8(text_str); int text_bry_len = text_bry.length;
Utf16_mapper text_map = new Utf16_mapper(text_str, text_bry, text_bry_len); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23
int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_map.Len_in_codes());
// run regex
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
String regx = regx_converter.patternToRegex(args.Cast_bry_or_null(1), Scrib_regx_converter.Anchor_G);
int bgn = args.Cast_int_or(2, 1);
bgn = Bgn_adjust(text, bgn);
Regx_adp regx_adp = Scrib_lib_ustring.RegxAdp_new_(core.Ctx(), regx);
Regx_match[] regx_rslts = regx_adp.Match_all(text, bgn);
int len = regx_rslts.length;
if (len == 0) return rslt.Init_null(); // return null if no matches found; EX:w:Mount_Gambier_(volcano); DATE:2014-04-02; confirmed with en.d:民; DATE:2015-01-30
Regx_match[] regx_rslts = Run_regex_or_null(text_map, regx_converter, find_str, bgn_as_codes);
if (regx_rslts.length == 0) return rslt.Init_null(); // return null if no matches found; EX:w:Mount_Gambier_(volcano); DATE:2014-04-02; confirmed with en.d:民; DATE:2015-01-30
// TOMBSTONE: add 1st match only; do not add all; PAGE:en.d:действительное_причастиеастоящегоремени DATE:2017-04-23
regx_rslts = regx_converter.Adjust_balanced(regx_rslts);
List_adp tmp_list = List_adp_.New();
AddCapturesFromMatch(tmp_list, regx_rslts[0], text, regx_converter.Capt_ary(), true);
AddCapturesFromMatch(tmp_list, regx_rslts[0], text_str, regx_converter.Capt_ary(), true);
return rslt.Init_many_list(tmp_list);
}
public boolean Gsub(Scrib_proc_args args, Scrib_proc_rslt rslt) {
@ -143,6 +160,35 @@ public class Scrib_lib_ustring implements Scrib_lib {
AddCapturesFromMatch(tmp_list, match, text, capt, true); // NOTE: was incorrectly set as false; DATE:2014-04-23
return rslt.Init_many_objs(match.Find_end(), Scrib_kv_utl_.base1_list_(tmp_list));
}
/**
 * Converts a Lua-style start index (base-1; negative counts back from the end) into a base-0 index
 * clamped to the range [0, len_in_codes].
 * @param bgn_as_codes_base1  start index as passed by the Lua caller
 * @param len_in_codes        length of the text, in the same units as the index
 */
private int To_java_by_lua(int bgn_as_codes_base1, int len_in_codes) {
	int pos = bgn_as_codes_base1;

	// base_1 -> base_0; only positive values are shifted
	// TOMBSTONE: do not adjust negative numbers for base1; fails tests
	if (pos > 0)
		pos -= Scrib_lib_ustring.Base1;

	// NOTE: MW uses mb_strlen which returns len of mb chars as 1; REF.PHP: http://php.net/manual/en/function.mb-strlen.php
	// NOTE: MW does additional +1 for PHP.base_1. This is not needed for JAVA (IGNORE_BASE_1_ADJ)
	boolean counts_from_end = pos < 0;
	pos = counts_from_end
		? pos + len_in_codes			// negative number means search from rear of String
		: Math.min(pos, len_in_codes);	// index past the end is confined to the end

	// still negative if Abs(bgn) > length; start from 0 instead of failing; ISSUE#:366; DATE:2019-02-23
	return Math.max(pos, 0);
}
/**
 * Converts a Lua pattern to a Java regex via regx_converter and runs it against the mapper's source String.
 * @param bgn_as_codes  start position in codes; translated through text_map to the offset handed to Match_all
 */
private Regx_match[] Run_regex_or_null(Utf16_mapper text_map, Scrib_regx_converter regx_converter, String find_str, int bgn_as_codes) {
	// convert regex from lua to java
	byte[] lua_pattern = Bry_.new_u8(find_str);
	String java_regx = regx_converter.patternToRegex(lua_pattern, Scrib_regx_converter.Anchor_G);

	// NOTE: MW calculates an offset to handle mb strings. however, java's regex always takes offset in chars (not bytes like PHP preg_match); DATE:2014-03-04
	int bgn_as_chars = text_map.Get_char_for_code_or_fail(bgn_as_codes);

	// run regex
	Regx_adp regx_adp = Scrib_lib_ustring.RegxAdp_new_(core.Ctx(), java_regx);
	return regx_adp.Match_all(text_map.Src_str(), bgn_as_chars);
}
private void AddCapturesFromMatch(List_adp tmp_list, Regx_match rslt, String text, Keyval[] capts, boolean op_is_match) {// NOTE: this matches behavior in UstringLibrary.php!addCapturesFromMatch
int capts_len = capts == null ? 0 : capts.length;
if (capts_len > 0) { // NOTE: changed from "grps_len > 0"; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02
@ -171,6 +217,7 @@ public class Scrib_lib_ustring implements Scrib_lib {
}
return rv;
}
private static final int Base1 = 1
private static final int
Base1 = 1
, End_adj = 1; // lua / php uses "end" as <= not <; EX: "abc" and bgn=0, end= 1; for XOWA, this is "a"; for MW / PHP it is "ab"
}

@ -14,39 +14,107 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*;
import org.junit.*; import gplx.xowa.xtns.scribunto.engines.mocks.*;
import org.junit.*;
import gplx.core.consoles.*;
import gplx.xowa.xtns.scribunto.engines.mocks.*;
public class Scrib_lib_ustring__find__tst {
private final Mock_scrib_fxt fxt = new Mock_scrib_fxt(); private Scrib_lib lib;
@Before public void init() {
fxt.Clear();
lib = fxt.Core().Lib_ustring().Init();
private final Scrib_lib_ustring__find__fxt fxt = new Scrib_lib_ustring__find__fxt();
@Test public void Plain() {
fxt.Test__find("aabaab" , "b" , 2, Bool_.Y, "3;3"); // bytes=1
fxt.Test__find("€€b€€b" , "b" , 2, Bool_.Y, "3;3"); // bytes=3
fxt.Test__find("𤭢𤭢b𤭢𤭢b" , "b" , 2, Bool_.Y, "3;3"); // bytes=4
fxt.Test__find("()()" , "(" , 2, Bool_.Y, "3;3"); // exact match; note that "(" is invalid regx
fxt.Test__find("abcd" , "" , 2, Bool_.Y, "2;1"); // empty find should return values; EX:w:Fool's_mate; DATE:2014-03-04
fxt.Test__find("a€b" , "€" , 1, Bool_.Y, "2;2"); // find is bytes=3
}
@Test public void Bgn__negative() {
fxt.Test__find("abab" , "b" , -1, Bool_.Y, "4;4"); // search from back of String
fxt.Test__find("abab" , "b" , -9, Bool_.Y, "2;2"); // do not throw error if negative index > text.length; ISSUE#:366; DATE:2019-02-23
fxt.Test__find("𤭢" , "𤭢" , -1, Bool_.Y, "1;1"); // fails if "" b/c it would have counted -1 as -1 char instead of -1 codepoint
}
@Test public void Basic() {
Exec_find("abcd" , "b" , 1, Bool_.N, "2;2"); // basic
Exec_find("abac" , "a" , 2, Bool_.N, "3;3"); // bgn
Exec_find("()()" , "(" , 2, Bool_.Y, "3;3"); // plain; note that ( would "break" regx
Exec_find("a bcd e" , "(b(c)d)" , 2, Bool_.N, "3;5;bcd;c"); // groups
Exec_find("a bcd e" , "()(b)" , 2, Bool_.N, "3;3;3;b"); // groups; empty capture
Exec_find("abcd" , "x" , 1, Bool_.N, ""); // empty
Exec_find("abcd" , "" , 2, Bool_.Y, "2;1"); // empty regx should return values; plain; EX:w:Fool's_mate; DATE:2014-03-04
Exec_find("abcd" , "" , 2, Bool_.N, "2;1"); // empty regx should return values; regx; EX:w:Fool's_mate; DATE:2014-03-04
Exec_find("abcd" , "^(c)" , 3, Bool_.N, "3;3;c"); // ^ should be converted to \G; regx; EX:cs.n:Category:1._září_2008; DATE:2014-05-07
@Test public void Regx__simple() {
fxt.Test__find("abcd" , "b" , 1, Bool_.N, "2;2"); // basic
fxt.Test__find("abad" , "a" , 2, Bool_.N, "3;3"); // bgn
fxt.Test__find("abcd" , "x" , 1, Bool_.N, ""); // no-match
fxt.Test__find("abcd" , "" , 2, Bool_.N, "2;1"); // empty regx should return values; regx; EX:w:Fool's_mate; DATE:2014-03-04
}
@Test public void Arg_int() { // PURPOSE: allow int find; PAGE:ro.w:Innsbruck DATE:2015-09-12
fxt.Test__proc__kvps__flat(lib, Scrib_lib_ustring.Invk_find, Scrib_kv_utl_.base1_many_(123, "2", 1, Bool_.N), "2;2");
@Test public void Regx__int() { // PURPOSE: allow int find; PAGE:ro.w:Innsbruck DATE:2015-09-12
fxt.Test__find(123 , "2" , 1, Bool_.N, "2;2");
}
@Test public void Return_int() {
fxt.Test__proc__kvps__vals(lib, Scrib_lib_ustring.Invk_find, Scrib_kv_utl_.base1_many_("a", "()", 2, Bool_.N), 2, 1, 2);
@Test public void Regx__groups() {
fxt.Test__find("a bcd e" , "(b(c)d)" , 2, Bool_.N, "3;5;bcd;c"); // groups
fxt.Test__find("a bcd e" , "()(b)" , 2, Bool_.N, "3;3;3;b"); // groups; empty capture
}
@Test public void Regx__caret() {
fxt.Test__find("abcd" , "^(c)" , 3, Bool_.N, "3;3;c"); // ^ should be converted to \G; regx; EX:cs.n:Category:1._září_2008; DATE:2014-05-07
}
@Test public void Regx__return_is_int() {
fxt.Test__find("a" , "()" , 2, Bool_.N, "2;1;2");
}
@Test public void Surrogate__find__value() { // PURPOSE: handle surrogates in Find PAGE:zh.w:南北鐵路_(越南); DATE:2014-08-28
Exec_find("aé𡼾\nbî𡼾\n" , "\n" , 1, Bool_.N, "4;4"); // 4 b/c \n starts at pos 4 (super 1)
Exec_find("aé𡼾\nbî𡼾\n" , "\n" , 5, Bool_.N, "8;8"); // 8 b/c \n starts at pos 8 (super 1)
fxt.Test__find("aé𡼾\nbî𡼾\n" , "\n" , 1, Bool_.N, "4;4"); // 4 b/c \n starts at pos 4 (super 1)
fxt.Test__find("aé𡼾\nbî𡼾\n" , "\n" , 5, Bool_.N, "8;8"); // 8 b/c \n starts at pos 8 (super 1)
}
@Test public void Surrogate__find__empty() { // PURPOSE: handle surrogates in Find PAGE:zh.w:南北鐵路_(越南); DATE:2014-08-28
Exec_find("aé𡼾\nbî𡼾\n" , "" , 1, Bool_.N, "1;0"); // 4 b/c \n starts at pos 4 (super 1)
// Exec_find("aé𡼾\nbî𡼾\n" , "" , 5, Bool_.N, "8;8"); // 8 b/c \n starts at pos 8 (super 1)
fxt.Test__find("aé𡼾\nbî𡼾\n" , "" , 1, Bool_.N, "1;0"); // 4 b/c \n starts at pos 4 (super 1)
fxt.Test__find("aé𡼾\nbî𡼾\n" , "" , 5, Bool_.N, "5;4"); // 8 b/c \n starts at pos 8 (super 1)
}
}
class Scrib_lib_ustring__find__fxt {
private boolean dbg = false;
private final Mock_scrib_fxt fxt = new Mock_scrib_fxt();
private Scrib_lib lib;
public Scrib_lib_ustring__find__fxt() {
fxt.Clear();
lib = fxt.Core().Lib_ustring().Init();
}
private void Exec_find(String text, String regx, int bgn, boolean plain, String expd) {
public Scrib_lib_ustring__find__fxt Dbg_y_() {dbg = Bool_.Y; return this;}
public Scrib_lib_ustring__find__fxt Dbg_n_() {dbg = Bool_.N; return this;}
// Runs the "find" proc with (text, regx, bgn, plain) as base-1 args and compares the flattened result with expd.
// When dbg is on, first writes the wikitext built by Bld_test_string to the console.
public void Test__find(String text, String regx, int bgn, boolean plain, String expd) {
if (dbg) Console_adp__sys.Instance.Write_str(Bld_test_string(text, regx, bgn, plain, expd));
fxt.Test__proc__kvps__flat(lib, Scrib_lib_ustring.Invk_find, Scrib_kv_utl_.base1_many_(text, regx, bgn, plain), expd);
}
// Overload that passes the text argument as an int instead of a String; otherwise identical to the String overload.
// When dbg is on, first writes the wikitext built by Bld_test_string to the console.
public void Test__find(int text, String regx, int bgn, boolean plain, String expd) {
if (dbg) Console_adp__sys.Instance.Write_str(Bld_test_string(text, regx, bgn, plain, expd));
fxt.Test__proc__kvps__flat(lib, Scrib_lib_ustring.Invk_find, Scrib_kv_utl_.base1_many_(text, regx, bgn, plain), expd);
}
/**
 * Builds one wikitable row of wikitext for the given test case: a pass/fail cell driven by #ifeq,
 * the expected value, the #invoke call itself, and the same call wrapped in nowiki.
 * The row is meant to be pasted between the header and footer shown below:
 *   {| class=wikitable
 *   ! rslt !! expd !! actl !! code
 *   |}
 */
private String Bld_test_string(Object text, String regx, int bgn, boolean plain, String expd) {
	// the #invoke call for this test case
	String text_arg = Object_.Xto_str_strict_or_empty(text);
	String plain_arg = plain ? Bool_.True_str : Bool_.False_str;
	String call = String_.Format("#invoke:Sandbox/Gnosygnu|ustring_find|{0}|{1}|{2}|{3}", text_arg, regx, bgn, plain_arg);
	String invk = "{{" + call + "}}";

	// one cell per line
	String rslt_cell = "| {{#ifeq:" + invk + "|" + expd + "|<span style='color:green'>pass</span>|<span style='color:red'>fail</span>}}\n";
	String expd_cell = "| " + expd + "\n";
	String actl_cell = "| " + invk + "\n";
	String code_cell = "| <nowiki>" + invk + "</nowiki>\n";

	Bry_bfr bfr = Bry_bfr_.New();
	bfr.Add_str_a7("|-\n");
	bfr.Add_str_u8(rslt_cell);
	bfr.Add_str_u8(expd_cell);
	bfr.Add_str_u8(actl_cell);
	bfr.Add_str_u8(code_cell);
	return bfr.To_str();
}
}
/*
TEST:
* URL: https://en.wikipedia.org/wiki/Project:Sandbox
* CODE:
{{#invoke:Sandbox/Gnosygnu|ustring_find|abab|b|3|true}}
MODULE:
* URL: https://en.wikipedia.org/wiki/Module:Sandbox/Gnosygnu
* CODE:
function p.ustring_find(frame)
local args = frame.args;
local rslt = {mw.ustring.find(args[1], args[2], tonumber(args[3]), args[4] == 'true')};
local rv = '';
local rslt_len = #rslt;
for i=1,rslt_len do
if i ~= 1 then
rv = rv .. ';'
end
rv = rv .. rslt[i]
end
return rv;
end
*/

Loading…
Cancel
Save