Scribunto: Iterate regx by codepoint not by bytes [#383]

pull/620/head
gnosygnu 5 years ago
parent 4cd23b9827
commit 8ef5854eb7

@ -52,6 +52,7 @@ public class Utf8_ {
default: throw Err_.new_wo_type("invalid initial utf8 byte", "byte", b);
}
}
/**
 * Maps the byte length of one UTF-8 encoded code point to the number of Java chars it occupies.
 * @param v number of UTF-8 bytes in the encoded code point
 * @return 2 when {@code v} is 4; 1 for every other value
 */
public static int Len_of_char_by_bytes_len(int v) {
	if (v == 4)
		return 2;
	return 1; // 1 to 3 UTF bytes will encode up to 65,536 and fit in 1 char
}
public static byte[] Get_char_at_pos_as_bry(byte[] bry, int pos) {
int len = Len_of_char_by_1st_byte(bry[pos]);
return Bry_.Mid(bry, pos, pos + len);

@ -19,6 +19,7 @@ public class Gftest {
private static final Bry_bfr bfr = Bry_bfr_.New();
// Eq__ary overloads: one per array element type. Each forwards to Eq__array, passing the Type_ids_ constant
// that matches the element type together with expd, actl, msg_fmt and msg_args.
public static void Eq__ary(Object[] expd, Object[] actl, String msg_fmt, Object... msg_args) {Eq__array(Type_ids_.Id__obj, expd, actl, msg_fmt, msg_args);}
public static void Eq__ary(boolean[] expd, boolean[] actl, String msg_fmt, Object... msg_args) {Eq__array(Type_ids_.Id__bool, expd, actl, msg_fmt, msg_args);}
// int[] overload without a message: passes "" as msg_fmt and no msg_args
public static void Eq__ary(int[] expd, int[] actl) {Eq__array(Type_ids_.Id__int, expd, actl, "");}
public static void Eq__ary(int[] expd, int[] actl, String msg_fmt, Object... msg_args) {Eq__array(Type_ids_.Id__int, expd, actl, msg_fmt, msg_args);}
public static void Eq__ary(long[] expd, long[] actl, String msg_fmt, Object... msg_args) {Eq__array(Type_ids_.Id__long, expd, actl, msg_fmt, msg_args);}
public static void Eq__ary(byte[] expd, byte[] actl, String msg_fmt, Object... msg_args) {Eq__array(Type_ids_.Id__byte, expd, actl, msg_fmt, msg_args);}

@ -0,0 +1,49 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.core.intls; import gplx.*; import gplx.core.*;
// View of a String that exposes its length and positions in three units:
//   "codes" = code points, "chars" = Java chars, "bytes" = bytes of the encoded form.
// The Pos_* methods convert a position from one unit to another. A position equal to the length
// (end of String) is a valid input; EX: abc -> [0, 1, 2, 3]
public interface Unicode_string {
// true for the implementation where 1 byte == 1 codepoint (all three units coincide); false for the multi-byte implementation
boolean Tid_is_single();
// the String this instance was built from (null if built from null)
String Src_string();
// the byte form of the source String (null if built from null)
byte[] Src_bytes();
// length in code points
int Len_codes();
// length in Java chars
int Len_chars();
// length in bytes
int Len_bytes();
// code point value at code-point index i
int Val_codes(int i);
// converts a code-point index to the byte position where that code point starts
int Pos_codes_to_bytes(int i);
// converts a code-point index to the char position where that code point starts
int Pos_codes_to_chars(int i);
// converts a byte position to a code-point index; the multi-byte implementation throws if the position is not the start of a code point
int Pos_bytes_to_codes(int i);
// converts a char position to a code-point index; the multi-byte implementation throws if the position is not the start of a code point
int Pos_chars_to_codes(int i);
}
/**
 * Unicode_string for text in which 1 byte == 1 codepoint.
 * Code, char and byte positions are all the same number, so every Pos_* conversion only validates
 * the position and hands it back unchanged.
 */
class Unicode_string_single implements Unicode_string { // 1 byte == 1 codepoint
	private final String src_string;
	private final byte[] src_bytes;
	private final int[] codes;
	private final int codes_len;
	public Unicode_string_single(String src_string, byte[] src_bytes, int[] codes, int codes_len) {
		this.src_string = src_string;
		this.src_bytes = src_bytes;
		this.codes = codes;
		this.codes_len = codes_len;
	}
	public boolean Tid_is_single() {return true;}
	public String Src_string() {return src_string;}
	public byte[] Src_bytes() {return src_bytes;}
	// all three lengths are the same value because every code point is one byte and one char
	public int Len_codes() {return codes_len;}
	public int Len_chars() {return codes_len;}
	public int Len_bytes() {return codes_len;}
	public int Val_codes(int i) {return codes[i];}
	public int Pos_codes_to_bytes(int i) {return Same_pos_or_fail(i);}
	public int Pos_codes_to_chars(int i) {return Same_pos_or_fail(i);}
	public int Pos_bytes_to_codes(int i) {return Same_pos_or_fail(i);}
	public int Pos_chars_to_codes(int i) {return Same_pos_or_fail(i);}
	// returns i unchanged when 0 <= i <= codes_len (the length itself is allowed as the end position); throws otherwise
	private int Same_pos_or_fail(int i) {
		if (i >= 0 && i <= codes_len)
			return i;
		throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i);
	}
}

@ -0,0 +1,48 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.core.intls; import gplx.*; import gplx.core.*;
/** Factory for Unicode_string instances. */
public class Unicode_string_ {
	/**
	 * Builds a Unicode_string for orig.
	 * A null String yields an empty single-byte instance whose source and code array are null.
	 * Otherwise the String is converted to bytes and walked one encoded code point at a time;
	 * when the number of code points equals the number of bytes the single-byte implementation is
	 * returned, else the multi-byte implementation.
	 */
	public static Unicode_string New(String orig) {
		if (orig == null)
			return new Unicode_string_single(null, null, null, 0);

		byte[] bytes = Bry_.new_u8(orig);
		int bytes_len = bytes.length;

		// there can never be more code points than bytes, so bytes_len is a safe capacity
		int[] codes = new int[bytes_len];
		int codes_len = 0;
		int chars_len = 0;
		for (int pos = 0; pos < bytes_len; codes_len++) {
			codes[codes_len] = Utf16_.Decode_to_int(bytes, pos);
			int seq_len = Utf8_.Len_of_char_by_1st_byte(bytes[pos]);
			pos += seq_len;
			chars_len += Utf8_.Len_of_char_by_bytes_len(seq_len);
		}

		if (codes_len == bytes_len)
			return new Unicode_string_single(orig, bytes, codes, codes_len);
		return new Unicode_string_multi(orig, bytes, bytes_len, codes, codes_len, chars_len);
	}
}

@ -0,0 +1,81 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.core.intls; import gplx.*; import gplx.core.*;
/**
 * Unicode_string for text containing at least one multi-byte code point.
 * Four lookup tables are built once in the constructor so that positions can be converted between
 * code points, chars and bytes by array access.
 */
class Unicode_string_multi implements Unicode_string {
	private static final int Invalid = -1, Adj_end = 1; // +1 to store last pos as len of String; needed for regex which returns match.Find_end() which will be len of String; EX: abc -> [0, 1, 2, 3]
	private final String src;
	private final byte[] bytes;
	private final int bytes_len;
	private final int[] codes;
	private final int codes_len;
	private final int chars_len;
	private final int[] codes_to_bytes;
	private final int[] codes_to_chars;
	private final int[] bytes_to_codes; // Invalid at every byte that does not start a code point
	private final int[] chars_to_codes; // Invalid at every char that does not start a code point
	public Unicode_string_multi(String src, byte[] bytes, int bytes_len, int[] codes, int codes_len, int chars_len) {
		this.src = src;
		this.bytes = bytes;
		this.bytes_len = bytes_len;
		this.codes = codes;
		this.codes_len = codes_len;
		this.chars_len = chars_len;

		this.codes_to_bytes = new int[codes_len + Adj_end];
		this.codes_to_chars = new int[codes_len + Adj_end];
		this.bytes_to_codes = New_int_ary(bytes_len);
		this.chars_to_codes = New_int_ary(chars_len);

		// record the start position, then one entry after each code point; the final entry is the end-of-String position
		int code_idx = 0, byte_idx = 0, char_idx = 0;
		Map_pos(code_idx, byte_idx, char_idx);
		while (byte_idx != bytes_len) {
			int seq_len = Utf8_.Len_of_char_by_1st_byte(bytes[byte_idx]);
			byte_idx += seq_len;
			char_idx += Utf8_.Len_of_char_by_bytes_len(seq_len);
			code_idx++;
			Map_pos(code_idx, byte_idx, char_idx);
		}
	}
	// stores one (code, byte, char) position triple in all four tables
	private void Map_pos(int code_idx, int byte_idx, int char_idx) {
		codes_to_bytes[code_idx] = byte_idx;
		codes_to_chars[code_idx] = char_idx;
		bytes_to_codes[byte_idx] = code_idx;
		chars_to_codes[char_idx] = code_idx;
	}
	public boolean Tid_is_single() {return false;}
	public String Src_string() {return src;}
	public byte[] Src_bytes() {return bytes;}
	public int Len_codes() {return codes_len;}
	public int Len_chars() {return chars_len;}
	public int Len_bytes() {return bytes_len;}
	public int Val_codes(int i) {return codes[i];}
	public int Pos_codes_to_bytes(int i) {return codes_to_bytes[i];}
	public int Pos_codes_to_chars(int i) {return codes_to_chars[i];}
	public int Pos_bytes_to_codes(int i) {
		int rv = bytes_to_codes[i];
		if (rv != Invalid)
			return rv;
		throw Err_.new_wo_type("invalid i", "src", src, "type", "bytes_to_codes", "i", i);
	}
	public int Pos_chars_to_codes(int i) {
		int rv = chars_to_codes[i];
		if (rv != Invalid)
			return rv;
		throw Err_.new_wo_type("invalid i", "src", src, "type", "chars_to_codes", "i", i);
	}
	// creates an array of len + Adj_end slots, all set to Invalid
	private static int[] New_int_ary(int len) {
		int[] rv = new int[len + Adj_end];
		for (int i = rv.length - 1; i >= 0; i--)
			rv[i] = Invalid;
		return rv;
	}
}

@ -0,0 +1,110 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.core.intls; import gplx.*; import gplx.core.*;
import org.junit.*; import gplx.core.tests.*;
// Tests for Unicode_string_.New: checks lengths, code point values and the four position conversions.
public class Unicode_string_tst {
private final Unicode_string_fxt fxt = new Unicode_string_fxt();
// null source: all three lengths are 0
@Test public void Null() {
fxt.Init(null);
fxt.Test__Len(0, 0, 0);
}
// empty source: all three lengths are 0
@Test public void Blank() {
fxt.Init("");
fxt.Test__Len(0, 0, 0);
}
// every code point is 1 byte: codes, chars and bytes positions are identical; each list has one extra entry for the end position
@Test public void Single() {
fxt.Init("Abc");
fxt.Test__Len(3, 3, 3);
fxt.Test__Val_codes(65, 98, 99);
fxt.Test__Pos_codes_to_bytes(0, 1, 2, 3);
fxt.Test__Pos_codes_to_chars(0, 1, 2, 3);
fxt.Test__Pos_chars_to_codes(0, 1, 2, 3);
fxt.Test__Pos_bytes_to_codes(0, 1, 2, 3);
}
// 4 code points taking 1, 2, 3 and 4 bytes; the last one takes 2 chars
// -1 marks a char / byte position that is not the start of a code point (the fixture maps the thrown error to -1)
@Test public void Multi() {
fxt.Init("a¢€𤭢");
fxt.Test__Len(4, 5, 10);
fxt.Test__Val_codes(97, 162, 8364, 150370);
fxt.Test__Pos_codes_to_bytes(0, 1, 3, 6, 10);
fxt.Test__Pos_codes_to_chars(0, 1, 2, 3, 5);
fxt.Test__Pos_chars_to_codes( 0, 1, 2, 3, -1, 4);
fxt.Test__Pos_bytes_to_codes( 0, 1, -1, 2, -1, -1, 3, -1, -1, -1, 4);
}
}
/**
 * Test fixture around one Unicode_string.
 * Each Test__* method reads every value of one accessor into an int[] and compares it to the expected values.
 */
class Unicode_string_fxt {
	private Unicode_string under;
	public void Init(String src) {
		this.under = Unicode_string_.New(src);
	}
	public void Test__Len(int expd_codes, int expd_chars, int expd_bytes) {
		Gftest.Eq__int(expd_codes, under.Len_codes(), "codes");
		Gftest.Eq__int(expd_chars, under.Len_chars(), "chars");
		Gftest.Eq__int(expd_bytes, under.Len_bytes(), "bytes");
	}
	public void Test__Val_codes(int... expd) {
		int[] actl = new int[under.Len_codes()];
		for (int idx = 0; idx < actl.length; idx++)
			actl[idx] = under.Val_codes(idx);
		Gftest.Eq__ary(expd, actl);
	}
	// NOTE: the Pos tests read one entry past the length because the end position is a valid input
	public void Test__Pos_codes_to_bytes(int... expd) {
		int[] actl = new int[under.Len_codes() + 1];
		for (int idx = 0; idx < actl.length; idx++)
			actl[idx] = under.Pos_codes_to_bytes(idx);
		Gftest.Eq__ary(expd, actl);
	}
	public void Test__Pos_codes_to_chars(int... expd) {
		int[] actl = new int[under.Len_codes() + 1];
		for (int idx = 0; idx < actl.length; idx++)
			actl[idx] = under.Pos_codes_to_chars(idx);
		Gftest.Eq__ary(expd, actl);
	}
	public void Test__Pos_bytes_to_codes(int... expd) {
		int[] actl = new int[under.Len_bytes() + 1];
		for (int idx = 0; idx < actl.length; idx++)
			actl[idx] = Pos_to_codes_or_neg1(true, idx);
		Gftest.Eq__ary(expd, actl);
	}
	public void Test__Pos_chars_to_codes(int... expd) {
		int[] actl = new int[under.Len_chars() + 1];
		for (int idx = 0; idx < actl.length; idx++)
			actl[idx] = Pos_to_codes_or_neg1(false, idx);
		Gftest.Eq__ary(expd, actl);
	}
	// converts a byte (from_bytes = true) or char (from_bytes = false) position to a code index; any exception is reported as -1
	private int Pos_to_codes_or_neg1(boolean from_bytes, int pos) {
		try {
			return from_bytes ? under.Pos_bytes_to_codes(pos) : under.Pos_chars_to_codes(pos);
		}
		catch (Exception exc) {
			Err_.Noop(exc);
			return -1;
		}
	}
}

@ -1,79 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.core.intls; import gplx.*; import gplx.core.*;
/**
 * Maps positions of one String between three units: bytes (of src_bry), chars (of src_str) and code points.
 * Four lookup tables ("dims") are stored back to back in a single int[]; each table is dim_len long and
 * holds Invalid (-1) at every slot that has no mapping.
 * The *_or_neg1 getters return Invalid for an unmapped or out-of-range idx; the *_or_fail getters throw instead.
 */
public class Utf16_mapper {
	private final int[] ary;   // Dims_total tables of dim_len entries each
	private final int dim_len; // src_bry_len + 1
	public byte[] Src_bry() {return src_bry;} private final byte[] src_bry;
	public String Src_str() {return src_str;} private final String src_str;
	public int Len_in_codes() {return len_in_codes;} private final int len_in_codes;
	public int Len_in_chars() {return len_in_chars;} private final int len_in_chars;
	public int Get_code_for_byte_or_neg1(int idx) {return Get_or_neg1(Dims_code_for_byte, idx);}
	public int Get_byte_for_code_or_neg1(int idx) {return Get_or_neg1(Dims_byte_for_code, idx);}
	public int Get_code_for_char_or_neg1(int idx) {return Get_or_neg1(Dims_code_for_char, idx);}
	public int Get_char_for_code_or_neg1(int idx) {return Get_or_neg1(Dims_char_for_code, idx);}
	public int Get_code_for_byte_or_fail(int idx) {return Get_or_fail(Dims_code_for_byte, "code_for_byte", idx);}
	public int Get_byte_for_code_or_fail(int idx) {return Get_or_fail(Dims_byte_for_code, "byte_for_code", idx);}
	public int Get_code_for_char_or_fail(int idx) {return Get_or_fail(Dims_code_for_char, "code_for_char", idx);}
	public int Get_char_for_code_or_fail(int idx) {return Get_or_fail(Dims_char_for_code, "char_for_code", idx);}
	public Utf16_mapper(String src_str, byte[] src_bry, int src_bry_len) {
		// create ary
		this.src_str = src_str;
		this.src_bry = src_bry;
		this.dim_len = src_bry_len + 1; // +1 to capture end + 1
		int ary_len = dim_len * Dims_total;
		this.ary = new int[ary_len];
		for (int i = 0; i < ary_len; i++)
			ary[i] = Invalid;

		// init
		int pos_in_bytes = 0, pos_in_chars = 0, pos_in_codes = 0;

		// loop till EOS
		while (true) {
			// update
			ary[(dim_len * Dims_code_for_byte) + pos_in_bytes] = pos_in_codes;
			ary[(dim_len * Dims_byte_for_code) + pos_in_codes] = pos_in_bytes;
			ary[(dim_len * Dims_code_for_char) + pos_in_chars] = pos_in_codes;
			ary[(dim_len * Dims_char_for_code) + pos_in_codes] = pos_in_chars;

			// exit if EOS
			if (pos_in_bytes >= src_bry_len) break;

			// get lengths
			int cur_len_in_bytes = Utf8_.Len_of_char_by_1st_byte(src_bry[pos_in_bytes]);
			int cur_len_in_chars = cur_len_in_bytes == 4 ? 2 : 1; // NOTE: 3 bytes represent up to U+FFFF (65,536) which will fit in 1; REF:en.w:UTF-8; ISSUE#:377; DATE:2019-03-04

			// increment
			pos_in_bytes += cur_len_in_bytes;
			pos_in_chars += cur_len_in_chars;
			pos_in_codes += 1;
		}

		// set lens
		this.len_in_codes = pos_in_codes;
		this.len_in_chars = pos_in_chars;
	}
	// looks up idx in table dim; returns Invalid when idx is outside 0 .. dim_len - 1
	// NOTE: the idx >= 0 check is required; a negative idx would otherwise read a slot of the preceding table
	private int Get_or_neg1(int dim, int idx) {
		return idx >= 0 && idx < dim_len ? ary[(dim_len * dim) + idx] : Invalid;
	}
	// same as Get_or_neg1 but throws when there is no mapping
	// NOTE: must test the looked-up value (rv), not idx; testing idx meant the method never failed for an unmapped position
	private int Get_or_fail(int dim, String type, int idx) {
		int rv = Get_or_neg1(dim, idx);
		if (rv == Invalid) throw Err_.new_wo_type("invalid idx", "src", src_bry, "type", type, "idx", idx);
		return rv;
	}
	public static final int
	  Invalid = -1
	, Dims_total = 4
	, Dims_code_for_byte = 0
	, Dims_byte_for_code = 1
	, Dims_code_for_char = 2
	, Dims_char_for_code = 3
	;
}

@ -1,62 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.core.intls; import gplx.*; import gplx.core.*;
import org.junit.*; import gplx.core.tests.*;
// Test for Utf16_mapper: builds the mapper for one String and checks all four lookup tables.
public class Utf16_mapper_tst {
private final Utf16_mapper_fxt fxt = new Utf16_mapper_fxt();
// expected arrays are passed in the order code_for_byte, byte_for_code, code_for_char, char_for_code;
// each has one entry per byte position plus one for the end position; -1 means no mapping at that index
@Test public void A() {
fxt.Test__map("a¢€𤭢"
, Int_ary_.New( 0, 1, -1, 2, -1, -1, 3, -1, -1, -1, 4)
, Int_ary_.New( 0, 1, 3, 6, 10, -1, -1, -1, -1, -1, -1)
, Int_ary_.New( 0, 1, 2, 3, -1, 4, -1, -1, -1, -1, -1)
, Int_ary_.New( 0, 1, 2, 3, 5, -1, -1, -1, -1, -1, -1)
);
}
}
/** Test fixture for Utf16_mapper: reads each lookup table through its *_or_neg1 getter and compares it to the expected values. */
class Utf16_mapper_fxt {
	public void Test__map(String src_str, int[] expd_code_for_byte, int[] expd_byte_for_code, int[] expd_code_for_char, int[] expd_char_for_code) {
		byte[] src_bry = Bry_.new_u8(src_str);
		int src_len = src_bry.length;
		Utf16_mapper mapper = new Utf16_mapper(src_str, src_bry, src_len);
		Test__ary(mapper, src_len, Utf16_mapper.Dims_code_for_byte, expd_code_for_byte);
		Test__ary(mapper, src_len, Utf16_mapper.Dims_byte_for_code, expd_byte_for_code);
		Test__ary(mapper, src_len, Utf16_mapper.Dims_code_for_char, expd_code_for_char);
		Test__ary(mapper, src_len, Utf16_mapper.Dims_char_for_code, expd_char_for_code);
	}
	// reads indexes 0 .. src_len (inclusive) of the table identified by dim_type
	private void Test__ary(Utf16_mapper mapper, int src_len, int dim_type, int[] expd) {
		int[] actl = new int[src_len + 1];
		for (int idx = 0; idx < actl.length; idx++)
			actl[idx] = Lookup(mapper, dim_type, idx);
		Gftest.Eq__ary(expd, actl, Int_.To_str(dim_type));
	}
	// dispatches to the getter for dim_type; an unknown dim_type yields -1
	private static int Lookup(Utf16_mapper mapper, int dim_type, int idx) {
		if (dim_type == Utf16_mapper.Dims_code_for_byte) return mapper.Get_code_for_byte_or_neg1(idx);
		if (dim_type == Utf16_mapper.Dims_byte_for_code) return mapper.Get_byte_for_code_or_neg1(idx);
		if (dim_type == Utf16_mapper.Dims_code_for_char) return mapper.Get_code_for_char_or_neg1(idx);
		if (dim_type == Utf16_mapper.Dims_char_for_code) return mapper.Get_char_for_code_or_neg1(idx);
		return -1;
	}
}

@ -54,12 +54,10 @@ public class Scrib_lib_ustring implements Scrib_lib {
boolean plain = args.Cast_bool_or_n(3);
// init text vars
byte[] text_bry = Bry_.new_u8(text_str);
int text_bry_len = text_bry.length;
Utf16_mapper text_map = new Utf16_mapper(text_str, text_bry, text_bry_len); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23
Unicode_string text_ucs = Unicode_string_.New(text_str); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23
// convert bgn from base_1 to base_0
int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_map.Len_in_codes());
int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_ucs.Len_codes());
/*
int offset = 0;
@ -81,33 +79,33 @@ public class Scrib_lib_ustring implements Scrib_lib {
// if plain, just do literal match of find and exit
if (plain) {
// find pos by literal match
byte[] find_bry = Bry_.new_u8(find_str);
int pos = Bry_find_.Find_fwd(text_bry, find_bry, text_map.Get_byte_for_code_or_fail(bgn_as_codes));
Unicode_string find_ucs = Unicode_string_.New(find_str);
byte[] find_bry = find_ucs.Src_bytes();
int pos = Bry_find_.Find_fwd(text_ucs.Src_bytes(), find_bry, text_ucs.Pos_codes_to_bytes(bgn_as_codes));
// nothing found; return empty
if (pos == Bry_find_.Not_found)
return rslt.Init_ary_empty();
// bgn: convert pos from bytes back to codes; also adjust for base1
int bgn = text_map.Get_code_for_byte_or_fail(pos) + Base1;
int bgn = text_ucs.Pos_bytes_to_codes(pos) + Base1;
// end: add find.Len_in_codes and adjust end for PHP/LUA
Utf16_mapper find_map = new Utf16_mapper(find_str, find_bry, find_bry.length);
int end = bgn + find_map.Len_in_codes() - End_adj;
int end = bgn + find_ucs.Len_codes() - End_adj;
return rslt.Init_many_objs(bgn, end);
}
// run regex
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
Regx_match[] regx_rslts = Run_regex_or_null(text_map, regx_converter, find_str, bgn_as_codes);
Regx_match[] regx_rslts = Run_regex_or_null(text_ucs, regx_converter, find_str, bgn_as_codes);
if (regx_rslts.length == 0) return rslt.Init_ary_empty();
// add to tmp_list
Regx_match match = regx_rslts[0]; // NOTE: take only 1st result; DATE:2014-08-27
List_adp tmp_list = List_adp_.New();
tmp_list.Add(text_map.Get_code_for_char_or_fail(match.Find_bgn()) + Scrib_lib_ustring.Base1);
tmp_list.Add(text_map.Get_code_for_char_or_fail(match.Find_end()) + Scrib_lib_ustring.Base1 - Scrib_lib_ustring.End_adj);
tmp_list.Add(text_ucs.Pos_chars_to_codes(match.Find_bgn()) + Scrib_lib_ustring.Base1);
tmp_list.Add(text_ucs.Pos_chars_to_codes(match.Find_end()) + Scrib_lib_ustring.Base1 - Scrib_lib_ustring.End_adj);
AddCapturesFromMatch(tmp_list, match, text_str, regx_converter.Capt_ary(), false);
return rslt.Init_many_list(tmp_list);
}
@ -120,13 +118,12 @@ public class Scrib_lib_ustring implements Scrib_lib {
// validate / adjust
if (text_str == null) // if no text_str is passed, do not fail; return empty; EX:d:changed; DATE:2014-02-06
return rslt.Init_many_list(List_adp_.Noop);
byte[] text_bry = Bry_.new_u8(text_str); int text_bry_len = text_bry.length;
Utf16_mapper text_map = new Utf16_mapper(text_str, text_bry, text_bry_len); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23
int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_map.Len_in_codes());
Unicode_string text_ucs = Unicode_string_.New(text_str); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23
int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_ucs.Len_codes());
// run regex
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
Regx_match[] regx_rslts = Run_regex_or_null(text_map, regx_converter, find_str, bgn_as_codes);
Regx_match[] regx_rslts = Run_regex_or_null(text_ucs, regx_converter, find_str, bgn_as_codes);
if (regx_rslts.length == 0) return rslt.Init_null(); // return null if no matches found; EX:w:Mount_Gambier_(volcano); DATE:2014-04-02; confirmed with en.d:民; DATE:2015-01-30
// TOMBSTONE: add 1st match only; do not add all; PAGE:en.d:действительное_причастиеастоящегоремени DATE:2017-04-23
@ -141,7 +138,7 @@ public class Scrib_lib_ustring implements Scrib_lib {
}
public boolean Gmatch_init(Scrib_proc_args args, Scrib_proc_rslt rslt) {
// String text = Scrib_kv_utl_.Val_to_str(values, 0);
byte[] regx = args.Pull_bry(1);
String regx = args.Pull_str(1);
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
String pcre = regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_null);
return rslt.Init_many_objs(pcre, regx_converter.Capt_ary());
@ -181,13 +178,13 @@ public class Scrib_lib_ustring implements Scrib_lib {
bgn_as_codes = 0;
return bgn_as_codes;
}
private Regx_match[] Run_regex_or_null(Utf16_mapper text_map, Scrib_regx_converter regx_converter, String find_str, int bgn_as_codes) {
private Regx_match[] Run_regex_or_null(Unicode_string text_ucs, Scrib_regx_converter regx_converter, String find_str, int bgn_as_codes) {
// convert regex from lua to java
find_str = regx_converter.patternToRegex(Bry_.new_u8(find_str), Scrib_regx_converter.Anchor_G);
find_str = regx_converter.patternToRegex(find_str, Scrib_regx_converter.Anchor_G);
// run regex
Regx_adp regx_adp = Scrib_lib_ustring.RegxAdp_new_(core.Ctx(), find_str);
return regx_adp.Match_all(text_map.Src_str(), text_map.Get_char_for_code_or_fail(bgn_as_codes)); // NOTE: MW calculates an offset to handle mb strings. however, java's regex always takes offset in chars (not bytes like PHP preg_match); DATE:2014-03-04
return regx_adp.Match_all(text_ucs.Src_string(), text_ucs.Pos_codes_to_chars(bgn_as_codes)); // NOTE: MW calculates an offset to handle mb strings. however, java's regex always takes offset in chars (not bytes like PHP preg_match); DATE:2014-03-04
}
private void AddCapturesFromMatch(List_adp tmp_list, Regx_match rslt, String text, Keyval[] capts, boolean op_is_match) {// NOTE: this matches behavior in UstringLibrary.php!addCapturesFromMatch
int capts_len = capts == null ? 0 : capts.length;

@ -32,7 +32,7 @@ class Scrib_lib_ustring_gsub_mgr {
// get @pattern; NOTE: sometimes int; PAGE:en.d:λύω; DATE:2014-09-02
String regx = args.Xstr_str_or_null(1);
regx = regx_converter.patternToRegex(Bry_.new_u8(regx), Scrib_regx_converter.Anchor_pow);
regx = regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_pow);
// get @repl
Object repl_obj = args.Cast_obj_or_null(2);

@ -14,27 +14,31 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*;
import gplx.core.brys.fmtrs.*;
import gplx.core.brys.fmtrs.*; import gplx.core.intls.*;
import gplx.langs.regxs.*;
public class Scrib_regx_converter {
private final Hash_adp_bry percent_hash = Hash_adp_bry.cs(), brack_hash = Hash_adp_bry.cs();
private final Scrib_regx_grp_mgr grp_mgr = new Scrib_regx_grp_mgr();
private final Bry_bfr bfr = Bry_bfr_.New();
private Bry_bfr tmp_bfr;
private Bry_fmtr fmtr_balanced; private Bry_bfr bfr_balanced;
public Scrib_regx_converter() {Init();}
private final Lua_cls_to_regx_map percent_map, brack_map;
public Scrib_regx_converter() {
percent_map = Lua_cls_matcher.Instance.Percent();
brack_map = Lua_cls_matcher.Instance.Brack();
}
public String Regx() {return regx;} private String regx;
public Keyval[] Capt_ary() {return grp_mgr.Capt__to_ary();}
public boolean Any_pos() {return any_pos;} private boolean any_pos;
public Regx_match[] Adjust_balanced(Regx_match[] rslts) {return grp_mgr.Adjust_balanced(rslts);}
public String patternToRegex(byte[] pat, byte[] anchor) {
public String patternToRegex(String pat_str, byte[] anchor) {
Unicode_string pat_ucs = Unicode_string_.New(pat_str);
// TODO.CACHE: if (!$this->patternRegexCache->has($cacheKey))
grp_mgr.Clear();
any_pos = false;
boolean q_flag = false;
// bfr.Add_byte(Byte_ascii.Slash); // TOMBSTONE: do not add PHP "/" at start
int len = pat.length;
int len = pat_ucs.Len_codes();
int grps_len = 0;
int bct = 0;
@ -42,7 +46,7 @@ public class Scrib_regx_converter {
for (int i = 0; i < len; i++) {
int i_end = i + 1;
q_flag = false; // must be reset; REF.MW:UstringLibrary.php|patternToRegex; DATE:2014-02-08
byte cur = pat[i];
int cur = pat_ucs.Val_codes(i);
switch (cur) {
case Byte_ascii.Pow:
q_flag = i != 0;
@ -59,7 +63,7 @@ public class Scrib_regx_converter {
int grp_idx = grp_mgr.Capt__len() + 1;
// check for "()"; enables anypos flag
boolean is_empty_capture = pat[i + 1] == Byte_ascii.Paren_end;
boolean is_empty_capture = pat_ucs.Val_codes(i + 1) == Byte_ascii.Paren_end;
if (is_empty_capture)
any_pos = true;
grp_mgr.Capt__add__real(grp_idx, is_empty_capture);
@ -77,19 +81,19 @@ public class Scrib_regx_converter {
i++;
if (i >= len)
throw Err_.new_wo_type("malformed pattern (ends with '%')");
Object percent_obj = percent_hash.Get_by_mid(pat, i, i + 1);
if (percent_obj != null) {
bfr.Add((byte[])percent_obj);
byte[] percent_bry = percent_map.Get_or_null(pat_ucs.Val_codes(i));
if (percent_bry != null) {
bfr.Add(percent_bry);
q_flag = true;
}
else {
byte nxt = pat[i];
int nxt = pat_ucs.Val_codes(i);
switch (nxt) {
case Byte_ascii.Ltr_b: // EX: "%b()"
i += 2;
if (i >= len) throw Err_.new_wo_type("malformed pattern (missing arguments to '%b')");
byte char_0 = pat[i - 1];
byte char_1 = pat[i];
int char_0 = pat_ucs.Val_codes(i - 1);
int char_1 = pat_ucs.Val_codes(i);
if (char_0 == char_1) { // same char: easier regex; REF.MW: $bfr .= "{$d1}[^$d1]*$d1";
bfr.Add(Bry_bf0_seg_0);
Regx_quote(bfr, char_0);
@ -109,18 +113,18 @@ public class Scrib_regx_converter {
synchronized (fmtr_balanced) {
++bct;
int balanced_idx = grp_mgr.Full__len();
fmtr_balanced.Bld_bfr(bfr_balanced, Int_.To_bry(bct), Byte_.Ary(char_0), Byte_.Ary(char_1), Int_.To_bry(balanced_idx + 1), Int_.To_bry(balanced_idx + 2));
fmtr_balanced.Bld_bfr(bfr_balanced, Int_.To_bry(bct), Utf16_.Encode_int_to_bry(char_0), Utf16_.Encode_int_to_bry(char_1), Int_.To_bry(balanced_idx + 1), Int_.To_bry(balanced_idx + 2));
grp_mgr.Capt__add__fake(2);
bfr.Add(bfr_balanced.To_bry_and_clear());
}
}
break;
case Byte_ascii.Ltr_f: { // EX: lua frontier pattern; "%f[%a]"; DATE:2015-07-21
if (i + 1 >= len || pat[++i] != Byte_ascii.Brack_bgn)
if (i + 1 >= len || pat_ucs.Val_codes(++i) != Byte_ascii.Brack_bgn)
throw Err_.new_("scribunto", "missing '[' after %f in pattern at pattern character " + Int_.To_str(i_end));
// %f always followed by bracketed term; convert lua bracketed term to regex
if (tmp_bfr == null) tmp_bfr = Bry_bfr_.New();
i = bracketedCharSetToRegex(tmp_bfr, pat, i, len);
i = bracketedCharSetToRegex(tmp_bfr, pat_ucs, i, len);
byte[] re2 = tmp_bfr.To_bry_and_clear();
// scrib has following comment: 'Because %f considers the beginning and end of the String to be \0, determine if $re2 matches that and take it into account with "^" and "$".'
@ -148,7 +152,7 @@ public class Scrib_regx_converter {
}
break;
case Byte_ascii.Brack_bgn:
i = bracketedCharSetToRegex(bfr, pat, i, len);
i = bracketedCharSetToRegex(bfr, pat_ucs, i, len);
q_flag = true;
break;
case Byte_ascii.Brack_end:
@ -163,12 +167,12 @@ public class Scrib_regx_converter {
break;
}
if (q_flag && i + 1 < len) {
byte tmp_b = pat[i + 1];
int tmp_b = pat_ucs.Val_codes(i + 1);
switch (tmp_b) {
case Byte_ascii.Star:
case Byte_ascii.Plus:
case Byte_ascii.Question:
bfr.Add_byte(tmp_b);
bfr.Add_byte((byte)tmp_b);
++i;
break;
case Byte_ascii.Dash:
@ -184,35 +188,35 @@ public class Scrib_regx_converter {
regx = bfr.To_str_and_clear();
return regx;
}
private int bracketedCharSetToRegex(Bry_bfr bfr, byte[] pat, int i, int len) {
private int bracketedCharSetToRegex(Bry_bfr bfr, Unicode_string pat_ucs, int i, int len) {
bfr.Add_byte(Byte_ascii.Brack_bgn);
i++;
if (i < len && pat[i] == Byte_ascii.Pow) { // ^
if (i < len && pat_ucs.Val_codes(i) == Byte_ascii.Pow) { // ^
bfr.Add_byte(Byte_ascii.Pow);
i++;
}
for (int j = i; i < len && (j == i || pat[i] != Byte_ascii.Brack_end); i++) {
if (pat[i] == Byte_ascii.Percent) {
for (int j = i; i < len && (j == i || pat_ucs.Val_codes(i) != Byte_ascii.Brack_end); i++) {
if (pat_ucs.Val_codes(i) == Byte_ascii.Percent) {
i++;
if (i >= len) {
break;
}
Object brack_obj = brack_hash.Get_by_mid(pat, i, i + 1);
if (brack_obj != null)
bfr.Add((byte[])brack_obj);
byte[] brack_bry = brack_map.Get_or_null(pat_ucs.Val_codes(i));
if (brack_bry != null)
bfr.Add(brack_bry);
else
Regx_quote(bfr, pat[i]);
Regx_quote(bfr, pat_ucs.Val_codes(i));
}
else if (i + 2 < len && pat[i + 1] == Byte_ascii.Dash && pat[i + 2] != Byte_ascii.Brack_end && pat[i + 2] != Byte_ascii.Hash) {
if (pat[i] <= pat[i + 2]) {
Regx_quote(bfr, pat[i]);
else if (i + 2 < len && pat_ucs.Val_codes(i + 1) == Byte_ascii.Dash && pat_ucs.Val_codes(i + 2) != Byte_ascii.Brack_end && pat_ucs.Val_codes(i + 2) != Byte_ascii.Hash) {
if (pat_ucs.Val_codes(i) <= pat_ucs.Val_codes(i + 2)) {
Regx_quote(bfr, pat_ucs.Val_codes(i));
bfr.Add_byte(Byte_ascii.Dash);
Regx_quote(bfr, pat[i + 2]);
Regx_quote(bfr, pat_ucs.Val_codes(i + 2));
}
i += 2;
}
else {
Regx_quote(bfr, pat[i]);
Regx_quote(bfr, pat_ucs.Val_codes(i));
}
}
if (i > len) throw Err_.new_wo_type("Missing close-bracket for character set beginning at pattern character $nxt_pos");
@ -233,12 +237,12 @@ public class Scrib_regx_converter {
}
return i;
}
private void Regx_quote(Bry_bfr bfr, byte b) {
if (Regx_char(b)) bfr.Add_byte(Byte_ascii.Backslash);
bfr.Add_byte(b);
private void Regx_quote(Bry_bfr bfr, int code) {
if (Regx_char(code)) bfr.Add_byte(Byte_ascii.Backslash);
bfr.Add_u8_int(code);
}
private boolean Regx_char(byte b) {
switch (b) {
private boolean Regx_char(int code) {
switch (code) {
case Byte_ascii.Dot: case Byte_ascii.Slash: case Byte_ascii.Plus: case Byte_ascii.Star: case Byte_ascii.Question:
case Byte_ascii.Pow: case Byte_ascii.Dollar: case Byte_ascii.Eq: case Byte_ascii.Bang: case Byte_ascii.Pipe:
case Byte_ascii.Colon: case Byte_ascii.Dash:
@ -257,7 +261,10 @@ public class Scrib_regx_converter {
, Bry_star_question = Bry_.new_a7("*?") // was *?
;
public static final byte[] Anchor_null = null, Anchor_G = Bry_.new_a7("\\G"), Anchor_pow = Bry_.new_a7("^");
private void Init() {
}
class Lua_cls_matcher {
public static final Lua_cls_matcher Instance = new Lua_cls_matcher();
Lua_cls_matcher() {
String regx_w = "\\w"; // JRE.7: \w not support in JRE.6; PAGE:en.w:A♯_(musical_note) DATE:2015-06-10
String regx_W = "\\W"; // JRE.7: \w not support in JRE.6; PAGE:en.w:A♯_(musical_note) DATE:2015-06-10
Init_itm(Bool_.Y, "a", "\\p{L}");
@ -288,15 +295,32 @@ public class Scrib_regx_converter {
Init_itm(Bool_.N, "X", "\\x00-\\x2f\\x3a-\\x40\\x47-\\x60\\x67-\\x{ff0f}\\x{ff1a}-\\x{ff20}\\x{ff27}-\\x{ff40}\\x{ff47}-\\x{10ffff}");
Init_itm(Bool_.N, "Z", "\\x01-\\x{10ffff}");
}
public Lua_cls_to_regx_map Percent() {return percent_map;} private final Lua_cls_to_regx_map percent_map = new Lua_cls_to_regx_map();
public Lua_cls_to_regx_map Brack() {return brack_map;} private final Lua_cls_to_regx_map brack_map = new Lua_cls_to_regx_map();
private void Init_itm(boolean add_to_percent_hash, String lua, String php) {
byte[] lua_bry = Bry_.new_a7(lua);
int lua_len = String_.Len(lua);
if (lua_len != 1) throw Err_.new_wo_type("lua must be 1 char only", "lua", lua);
int lua_code = (int)String_.CharAt(lua, 0);
if (lua_code < Byte_ascii.Ltr_A || lua_code > Byte_ascii.Ltr_z) throw Err_.new_wo_type("lua must be between A and z", "lua", lua);
byte[] php_bry = Bry_.new_a7(php);
if (add_to_percent_hash) {
percent_hash.Add_bry_obj(lua_bry, php_bry);
brack_hash.Add_bry_obj(lua_bry, php_bry); // always add to brack_hash; brack_hash = percent_hash + other characters
percent_map.Set(lua_code, php_bry);
brack_map.Set(lua_code, php_bry); // always add to brack_hash; brack_hash = percent_hash + other characters
}
else {
brack_hash.Add_if_dupe_use_nth(lua_bry, php_bry); // replace percent_hash definitions
brack_map.Set(lua_code, php_bry); // replace percent_hash definitions
}
}
}
/**
 * Array-backed lookup from a code point to a byte[] value.
 * Only codes in the range 0 .. MAX - 1 can hold a value; MAX is Byte_ascii.Max_7_bit.
 */
class Lua_cls_to_regx_map {
	private static final int MAX = Byte_ascii.Max_7_bit;
	private final byte[][] map = new byte[MAX][];
	/**
	 * Returns the value stored for code, or null when nothing is stored or code is outside 0 .. MAX - 1.
	 * NOTE: a negative code is treated like any other out-of-range code (null) instead of failing on the array access.
	 */
	public byte[] Get_or_null(int code) {
		return code >= 0 && code < MAX ? map[code] : null;
	}
	/**
	 * Stores val for code, replacing any earlier value.
	 * No range check is done here: code must be within 0 .. MAX - 1 or the array access fails.
	 */
	public void Set(int code, byte[] val) {
		map[code] = val;
	}
}

@ -50,6 +50,9 @@ public class Scrib_regx_converter_tst {
@Test public void Mbcs() { // PURPOSE: handle regex for multi-byte chars; PAGE:en.d:どう; DATE:2016-01-22; .NET.REGX:fails
fxt.Test_replace("𠀀" , "[𠀀-𯨟]" , "a", "a");
}
@Test public void Invalid_range() {// PURPOSE: if range is invalid, take 1st char only; note range is multi-byte; ISSUE#:383; PAGE:en.d:dictionary DATE:2019-03-16
fxt.Test_parse("[ড়-য়]" , "[ড়]"); // 2492-2479
}
// @Test public void Brack_empty_all() {fxt.Test_parse("[]" , "(?:(*FAIL))");}
// @Test public void Brack_empty_not() {fxt.Test_parse("[^]" , ".");}
}
@ -61,11 +64,11 @@ class Scrib_regx_converter_fxt {
}
}
public void Test_parse(String raw, String expd) {
under.patternToRegex(Bry_.new_u8(raw), Scrib_regx_converter.Anchor_G);
under.patternToRegex(raw, Scrib_regx_converter.Anchor_G);
Tfds.Eq(expd, under.Regx());
}
public void Test_replace(String text, String find, String replace, String expd) {
String regex_str = under.patternToRegex(Bry_.new_u8(find), Scrib_regx_converter.Anchor_G);
String regex_str = under.patternToRegex(find, Scrib_regx_converter.Anchor_G);
String actl = Regx_adp_.Replace(text, regex_str, replace);
Tfds.Eq(expd, actl);
}

Loading…
Cancel
Save