mirror of
https://github.com/gnosygnu/xowa.git
synced 2025-06-02 07:24:19 +00:00
Scribunto: Iterate regx by codepoint not by bytes [#383]
This commit is contained in:
parent
4cd23b9827
commit
8ef5854eb7
@ -52,6 +52,7 @@ public class Utf8_ {
|
||||
default: throw Err_.new_wo_type("invalid initial utf8 byte", "byte", b);
|
||||
}
|
||||
}
|
||||
// Number of Java chars needed for one codepoint whose UTF-8 encoding is v bytes long.
// 1 to 3 UTF-8 bytes encode values that fit in 1 char; a 4-byte sequence is reported as 2 chars.
public static int Len_of_char_by_bytes_len(int v) {
	if (v == 4)
		return 2;
	return 1;
}
|
||||
public static byte[] Get_char_at_pos_as_bry(byte[] bry, int pos) {
|
||||
int len = Len_of_char_by_1st_byte(bry[pos]);
|
||||
return Bry_.Mid(bry, pos, pos + len);
|
||||
|
@ -19,6 +19,7 @@ public class Gftest {
|
||||
private static final Bry_bfr bfr = Bry_bfr_.New();
|
||||
// Object[] overload: forwards both arrays, the message format and its args to Eq__array, tagged with Type_ids_.Id__obj.
public static void Eq__ary(Object[] expd, Object[] actl, String msg_fmt, Object... msg_args) {Eq__array(Type_ids_.Id__obj, expd, actl, msg_fmt, msg_args);}
|
||||
// boolean[] overload: forwards both arrays, the message format and its args to Eq__array, tagged with Type_ids_.Id__bool.
public static void Eq__ary(boolean[] expd, boolean[] actl, String msg_fmt, Object... msg_args) {Eq__array(Type_ids_.Id__bool, expd, actl, msg_fmt, msg_args);}
|
||||
// int[] overload without a message: forwards to Eq__array with Type_ids_.Id__int and an empty message format.
public static void Eq__ary(int[] expd, int[] actl) {Eq__array(Type_ids_.Id__int, expd, actl, "");}
|
||||
// int[] overload: forwards both arrays, the message format and its args to Eq__array, tagged with Type_ids_.Id__int.
public static void Eq__ary(int[] expd, int[] actl, String msg_fmt, Object... msg_args) {Eq__array(Type_ids_.Id__int, expd, actl, msg_fmt, msg_args);}
|
||||
// long[] overload: forwards both arrays, the message format and its args to Eq__array, tagged with Type_ids_.Id__long.
public static void Eq__ary(long[] expd, long[] actl, String msg_fmt, Object... msg_args) {Eq__array(Type_ids_.Id__long, expd, actl, msg_fmt, msg_args);}
|
||||
// byte[] overload: forwards both arrays, the message format and its args to Eq__array, tagged with Type_ids_.Id__byte.
public static void Eq__ary(byte[] expd, byte[] actl, String msg_fmt, Object... msg_args) {Eq__array(Type_ids_.Id__byte, expd, actl, msg_fmt, msg_args);}
|
||||
|
49
400_xowa/src/gplx/core/intls/Unicode_string.java
Normal file
49
400_xowa/src/gplx/core/intls/Unicode_string.java
Normal file
@ -0,0 +1,49 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.core.intls; import gplx.*; import gplx.core.*;
|
||||
// View of a String that can be measured and indexed in three units:
//   codes = codepoints, chars = Java chars, bytes = bytes of the source byte array
public interface Unicode_string {
	// true for the single implementation (1 byte == 1 codepoint); false for the multi implementation
	boolean Tid_is_single();

	// source text, as String and as bytes
	String Src_string();
	byte[] Src_bytes();

	// length of the text in each unit
	int Len_codes();
	int Len_chars();
	int Len_bytes();

	// codepoint value at codepoint index i
	int Val_codes(int i);

	// position conversions: from a codepoint position to the other units...
	int Pos_codes_to_bytes(int i);
	int Pos_codes_to_chars(int i);

	// ...and from the other units back to a codepoint position
	int Pos_bytes_to_codes(int i);
	int Pos_chars_to_codes(int i);
}
|
||||
class Unicode_string_single implements Unicode_string { // 1 byte == 1 codepoint
|
||||
private final int[] codes;
|
||||
public Unicode_string_single(String src_string, byte[] src_bytes, int[] codes, int codes_len) {
|
||||
this.src_string = src_string;
|
||||
this.src_bytes = src_bytes;
|
||||
this.codes = codes;
|
||||
this.codes_len = codes_len;
|
||||
}
|
||||
public boolean Tid_is_single() {return true;}
|
||||
public String Src_string() {return src_string;} private final String src_string;
|
||||
public byte[] Src_bytes() {return src_bytes;} private final byte[] src_bytes;
|
||||
public int Len_codes() {return codes_len;} private final int codes_len;
|
||||
public int Len_chars() {return codes_len;}
|
||||
public int Len_bytes() {return codes_len;}
|
||||
public int Val_codes(int i) {return codes[i];}
|
||||
public int Pos_codes_to_bytes(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i); return i;}
|
||||
public int Pos_codes_to_chars(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i);return i;}
|
||||
public int Pos_bytes_to_codes(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i);return i;}
|
||||
public int Pos_chars_to_codes(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i);return i;}
|
||||
}
|
48
400_xowa/src/gplx/core/intls/Unicode_string_.java
Normal file
48
400_xowa/src/gplx/core/intls/Unicode_string_.java
Normal file
@ -0,0 +1,48 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.core.intls; import gplx.*; import gplx.core.*;
|
||||
public class Unicode_string_ {
|
||||
public static Unicode_string New(String orig) {
|
||||
// null
|
||||
if (orig == null)
|
||||
return new Unicode_string_single(null, null, null, 0);
|
||||
|
||||
// init bytes
|
||||
byte[] bytes = Bry_.new_u8(orig);
|
||||
int bytes_len = bytes.length;
|
||||
|
||||
// init codes
|
||||
int[] codes = new int[bytes_len];
|
||||
int codes_len = 0;
|
||||
|
||||
// loop
|
||||
int bytes_pos = 0;
|
||||
int chars_pos = 0;
|
||||
while (bytes_pos < bytes_len) {
|
||||
// set codes
|
||||
codes[codes_len] = Utf16_.Decode_to_int(bytes, bytes_pos);
|
||||
|
||||
// increment
|
||||
int cur_byte_len = Utf8_.Len_of_char_by_1st_byte(bytes[bytes_pos]);
|
||||
bytes_pos += cur_byte_len;
|
||||
chars_pos += Utf8_.Len_of_char_by_bytes_len(cur_byte_len);
|
||||
codes_len += 1;
|
||||
}
|
||||
return codes_len == bytes_len
|
||||
? (Unicode_string)new Unicode_string_single(orig, bytes, codes, codes_len)
|
||||
: (Unicode_string)new Unicode_string_multi (orig, bytes, bytes_len, codes, codes_len, chars_pos);
|
||||
}
|
||||
}
|
81
400_xowa/src/gplx/core/intls/Unicode_string_multi.java
Normal file
81
400_xowa/src/gplx/core/intls/Unicode_string_multi.java
Normal file
@ -0,0 +1,81 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.core.intls; import gplx.*; import gplx.core.*;
|
||||
class Unicode_string_multi implements Unicode_string {
|
||||
private final int[] codes;
|
||||
private final int[] codes_to_bytes;
|
||||
private final int[] codes_to_chars;
|
||||
private final int[] bytes_to_codes;
|
||||
private final int[] chars_to_codes;
|
||||
|
||||
public Unicode_string_multi(String src, byte[] bytes, int bytes_len, int[] codes, int codes_len, int chars_len) {
|
||||
// set member vars
|
||||
this.src = src;
|
||||
this.bytes = bytes;
|
||||
this.bytes_len = bytes_len;
|
||||
this.codes = codes;
|
||||
this.codes_len = codes_len;
|
||||
this.chars_len = chars_len;
|
||||
|
||||
// init maps
|
||||
this.codes_to_bytes = new int[codes_len + Adj_end];
|
||||
this.codes_to_chars = new int[codes_len + Adj_end];
|
||||
this.bytes_to_codes = New_int_ary(bytes_len);
|
||||
this.chars_to_codes = New_int_ary(chars_len);
|
||||
|
||||
// init loop
|
||||
int codes_pos = 0;
|
||||
int bytes_pos = 0;
|
||||
int chars_pos = 0;
|
||||
|
||||
// loop till EOS
|
||||
while (true) {
|
||||
// update
|
||||
codes_to_bytes[codes_pos] = bytes_pos;
|
||||
codes_to_chars[codes_pos] = chars_pos;
|
||||
bytes_to_codes[bytes_pos] = codes_pos;
|
||||
chars_to_codes[chars_pos] = codes_pos;
|
||||
|
||||
if (bytes_pos == bytes_len) break;
|
||||
|
||||
// increment
|
||||
int cur_byte_len = Utf8_.Len_of_char_by_1st_byte(bytes[bytes_pos]);
|
||||
bytes_pos += cur_byte_len;
|
||||
chars_pos += Utf8_.Len_of_char_by_bytes_len(cur_byte_len);
|
||||
codes_pos += 1;
|
||||
}
|
||||
}
|
||||
public boolean Tid_is_single() {return false;}
|
||||
public String Src_string() {return src;} private final String src;
|
||||
public byte[] Src_bytes() {return bytes;} private final byte[] bytes;
|
||||
public int Len_codes() {return codes_len;} private final int codes_len;
|
||||
public int Len_chars() {return chars_len;} private final int chars_len;
|
||||
public int Len_bytes() {return bytes_len;} private final int bytes_len;
|
||||
public int Val_codes(int i) {return codes[i];}
|
||||
public int Pos_codes_to_bytes(int i) {return codes_to_bytes[i];}
|
||||
public int Pos_codes_to_chars(int i) {return codes_to_chars[i];}
|
||||
public int Pos_bytes_to_codes(int i) {int rv = bytes_to_codes[i]; if (rv == Invalid) throw Err_.new_wo_type("invalid i", "src", src, "type", "bytes_to_codes", "i", i); return rv;}
|
||||
public int Pos_chars_to_codes(int i) {int rv = chars_to_codes[i]; if (rv == Invalid) throw Err_.new_wo_type("invalid i", "src", src, "type", "chars_to_codes", "i", i); return rv;}
|
||||
|
||||
private static final int Invalid = -1, Adj_end = 1; // +1 to store last pos as len of String; needed for regex which returns match.Find_end() which will be len of String; EX: abc -> [0, 1, 2, 3]
|
||||
private static int[] New_int_ary(int len) {
|
||||
int rv_len = len + Adj_end;
|
||||
int[] rv = new int[rv_len];
|
||||
for (int i = 0; i < rv_len; i++)
|
||||
rv[i] = Invalid;
|
||||
return rv;
|
||||
}
|
||||
}
|
110
400_xowa/src/gplx/core/intls/Unicode_string_tst.java
Normal file
110
400_xowa/src/gplx/core/intls/Unicode_string_tst.java
Normal file
@ -0,0 +1,110 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.core.intls; import gplx.*; import gplx.core.*;
|
||||
import org.junit.*; import gplx.core.tests.*;
|
||||
public class Unicode_string_tst {
|
||||
private final Unicode_string_fxt fxt = new Unicode_string_fxt();
|
||||
@Test public void Null() {
|
||||
fxt.Init(null);
|
||||
fxt.Test__Len(0, 0, 0);
|
||||
}
|
||||
@Test public void Blank() {
|
||||
fxt.Init("");
|
||||
fxt.Test__Len(0, 0, 0);
|
||||
}
|
||||
@Test public void Single() {
|
||||
fxt.Init("Abc");
|
||||
fxt.Test__Len(3, 3, 3);
|
||||
fxt.Test__Val_codes(65, 98, 99);
|
||||
fxt.Test__Pos_codes_to_bytes(0, 1, 2, 3);
|
||||
fxt.Test__Pos_codes_to_chars(0, 1, 2, 3);
|
||||
fxt.Test__Pos_chars_to_codes(0, 1, 2, 3);
|
||||
fxt.Test__Pos_bytes_to_codes(0, 1, 2, 3);
|
||||
}
|
||||
@Test public void Multi() {
|
||||
fxt.Init("a¢€𤭢");
|
||||
fxt.Test__Len(4, 5, 10);
|
||||
fxt.Test__Val_codes(97, 162, 8364, 150370);
|
||||
fxt.Test__Pos_codes_to_bytes(0, 1, 3, 6, 10);
|
||||
fxt.Test__Pos_codes_to_chars(0, 1, 2, 3, 5);
|
||||
fxt.Test__Pos_chars_to_codes( 0, 1, 2, 3, -1, 4);
|
||||
fxt.Test__Pos_bytes_to_codes( 0, 1, -1, 2, -1, -1, 3, -1, -1, -1, 4);
|
||||
}
|
||||
}
|
||||
class Unicode_string_fxt {
|
||||
private Unicode_string under;
|
||||
public void Init(String src) {
|
||||
this.under = Unicode_string_.New(src);
|
||||
}
|
||||
public void Test__Len(int expd_codes, int expd_chars, int expd_bytes) {
|
||||
Gftest.Eq__int(expd_codes, under.Len_codes(), "codes");
|
||||
Gftest.Eq__int(expd_chars, under.Len_chars(), "chars");
|
||||
Gftest.Eq__int(expd_bytes, under.Len_bytes(), "bytes");
|
||||
}
|
||||
public void Test__Val_codes(int... expd) {
|
||||
int actl_len = under.Len_codes();
|
||||
int[] actl = new int[actl_len];
|
||||
for (int i = 0; i < actl_len; i++)
|
||||
actl[i] = under.Val_codes(i);
|
||||
Gftest.Eq__ary(expd, actl);
|
||||
}
|
||||
public void Test__Pos_codes_to_bytes(int... expd) {
|
||||
int actl_len = under.Len_codes() + 1;
|
||||
int[] actl = new int[actl_len];
|
||||
for (int i = 0; i < actl_len; i++)
|
||||
actl[i] = under.Pos_codes_to_bytes(i);
|
||||
Gftest.Eq__ary(expd, actl);
|
||||
}
|
||||
public void Test__Pos_codes_to_chars(int... expd) {
|
||||
int actl_len = under.Len_codes() + 1;
|
||||
int[] actl = new int[actl_len];
|
||||
for (int i = 0; i < actl_len; i++)
|
||||
actl[i] = under.Pos_codes_to_chars(i);
|
||||
Gftest.Eq__ary(expd, actl);
|
||||
}
|
||||
public void Test__Pos_bytes_to_codes(int... expd) {
|
||||
int actl_len = under.Len_bytes() + 1;
|
||||
int[] actl = new int[actl_len];
|
||||
for (int i = 0; i < actl_len; i++) {
|
||||
int val = 0;
|
||||
try {
|
||||
val = under.Pos_bytes_to_codes(i);
|
||||
}
|
||||
catch (Exception exc) {
|
||||
val = -1;
|
||||
Err_.Noop(exc);
|
||||
}
|
||||
actl[i] = val;
|
||||
}
|
||||
Gftest.Eq__ary(expd, actl);
|
||||
}
|
||||
public void Test__Pos_chars_to_codes(int... expd) {
|
||||
int actl_len = under.Len_chars() + 1;
|
||||
int[] actl = new int[actl_len];
|
||||
for (int i = 0; i < actl_len; i++) {
|
||||
int val = 0;
|
||||
try {
|
||||
val = under.Pos_chars_to_codes(i);
|
||||
}
|
||||
catch (Exception exc) {
|
||||
val = -1;
|
||||
Err_.Noop(exc);
|
||||
}
|
||||
actl[i] = val;
|
||||
}
|
||||
Gftest.Eq__ary(expd, actl);
|
||||
}
|
||||
}
|
@ -1,79 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.core.intls; import gplx.*; import gplx.core.*;
|
||||
public class Utf16_mapper {
|
||||
private final int[] ary;
|
||||
private final int dim_len;
|
||||
public byte[] Src_bry() {return src_bry;} private final byte[] src_bry;
|
||||
public String Src_str() {return src_str;} private final String src_str;
|
||||
public int Len_in_codes() {return len_in_codes;} private int len_in_codes;
|
||||
public int Len_in_chars() {return len_in_chars;} private int len_in_chars;
|
||||
public int Get_code_for_byte_or_neg1(int idx) {return idx < dim_len ? ary[(dim_len * Dims_code_for_byte) + idx] : Invalid;}
|
||||
public int Get_byte_for_code_or_neg1(int idx) {return idx < dim_len ? ary[(dim_len * Dims_byte_for_code) + idx] : Invalid;}
|
||||
public int Get_code_for_char_or_neg1(int idx) {return idx < dim_len ? ary[(dim_len * Dims_code_for_char) + idx] : Invalid;}
|
||||
public int Get_char_for_code_or_neg1(int idx) {return idx < dim_len ? ary[(dim_len * Dims_char_for_code) + idx] : Invalid;}
|
||||
public int Get_code_for_byte_or_fail(int idx) {int rv = Get_code_for_byte_or_neg1(idx); if (idx == Invalid) throw Err_.new_wo_type("invalid idx", "src", src_bry, "type", "code_for_byte", "idx", idx); return rv;}
|
||||
public int Get_byte_for_code_or_fail(int idx) {int rv = Get_byte_for_code_or_neg1(idx); if (idx == Invalid) throw Err_.new_wo_type("invalid idx", "src", src_bry, "type", "byte_for_code", "idx", idx); return rv;}
|
||||
public int Get_code_for_char_or_fail(int idx) {int rv = Get_code_for_char_or_neg1(idx); if (idx == Invalid) throw Err_.new_wo_type("invalid idx", "src", src_bry, "type", "code_for_char", "idx", idx); return rv;}
|
||||
public int Get_char_for_code_or_fail(int idx) {int rv = Get_char_for_code_or_neg1(idx); if (idx == Invalid) throw Err_.new_wo_type("invalid idx", "src", src_bry, "type", "char_for_code", "idx", idx); return rv;}
|
||||
public Utf16_mapper(String src_str, byte[] src_bry, int src_bry_len) {
|
||||
// create ary
|
||||
this.src_str = src_str;
|
||||
this.src_bry = src_bry;
|
||||
this.dim_len = src_bry_len + 1; // +1 to capture end + 1
|
||||
int ary_len = dim_len * Dims_total;
|
||||
this.ary = new int[dim_len * Dims_total];
|
||||
for (int i = 0; i < ary_len; i++)
|
||||
ary[i] = Invalid;
|
||||
|
||||
// init
|
||||
int pos_in_bytes = 0, pos_in_chars = 0, pos_in_codes = 0;
|
||||
|
||||
// loop till EOS
|
||||
while (true) {
|
||||
// update
|
||||
ary[(dim_len * Dims_code_for_byte) + pos_in_bytes] = pos_in_codes;
|
||||
ary[(dim_len * Dims_byte_for_code) + pos_in_codes] = pos_in_bytes;
|
||||
ary[(dim_len * Dims_code_for_char) + pos_in_chars] = pos_in_codes;
|
||||
ary[(dim_len * Dims_char_for_code) + pos_in_codes] = pos_in_chars;
|
||||
|
||||
// exit if EOS
|
||||
if (pos_in_bytes >= src_bry_len) break;
|
||||
|
||||
// get lengths
|
||||
int cur_len_in_bytes = Utf8_.Len_of_char_by_1st_byte(src_bry[pos_in_bytes]);
|
||||
int cur_len_in_chars = cur_len_in_bytes == 4 ? 2 : 1; // NOTE: 3 bytes represent up to U+FFFF (65,536) which will fit in 1; REF:en.w:UTF-8; ISSUE#:377; DATE:2019-03-04
|
||||
|
||||
// increment
|
||||
pos_in_bytes += cur_len_in_bytes;
|
||||
pos_in_chars += cur_len_in_chars;
|
||||
pos_in_codes += 1;
|
||||
}
|
||||
|
||||
// set lens
|
||||
this.len_in_codes = pos_in_codes;
|
||||
this.len_in_chars = pos_in_chars;
|
||||
}
|
||||
|
||||
public static final int
|
||||
Invalid = -1
|
||||
, Dims_total = 4
|
||||
, Dims_code_for_byte = 0
|
||||
, Dims_byte_for_code = 1
|
||||
, Dims_code_for_char = 2
|
||||
, Dims_char_for_code = 3
|
||||
;
|
||||
}
|
@ -1,62 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.core.intls; import gplx.*; import gplx.core.*;
|
||||
import org.junit.*; import gplx.core.tests.*;
|
||||
public class Utf16_mapper_tst {
|
||||
private final Utf16_mapper_fxt fxt = new Utf16_mapper_fxt();
|
||||
@Test public void A() {
|
||||
fxt.Test__map("a¢€𤭢"
|
||||
, Int_ary_.New( 0, 1, -1, 2, -1, -1, 3, -1, -1, -1, 4)
|
||||
, Int_ary_.New( 0, 1, 3, 6, 10, -1, -1, -1, -1, -1, -1)
|
||||
, Int_ary_.New( 0, 1, 2, 3, -1, 4, -1, -1, -1, -1, -1)
|
||||
, Int_ary_.New( 0, 1, 2, 3, 5, -1, -1, -1, -1, -1, -1)
|
||||
);
|
||||
}
|
||||
}
|
||||
class Utf16_mapper_fxt {
|
||||
public void Test__map(String src_str, int[] expd_code_for_byte, int[] expd_byte_for_code, int[] expd_code_for_char, int[] expd_char_for_code) {
|
||||
byte[] src_bry = Bry_.new_u8(src_str);
|
||||
int src_len = src_bry.length;
|
||||
Utf16_mapper mapper = new Utf16_mapper(src_str, src_bry, src_len);
|
||||
Test__ary(mapper, src_len, Utf16_mapper.Dims_code_for_byte, expd_code_for_byte);
|
||||
Test__ary(mapper, src_len, Utf16_mapper.Dims_byte_for_code, expd_byte_for_code);
|
||||
Test__ary(mapper, src_len, Utf16_mapper.Dims_code_for_char, expd_code_for_char);
|
||||
Test__ary(mapper, src_len, Utf16_mapper.Dims_char_for_code, expd_char_for_code);
|
||||
}
|
||||
private void Test__ary(Utf16_mapper mapper, int src_len, int dim_type, int[] expd) {
|
||||
int actl_len = src_len + 1;
|
||||
int[] actl = new int[actl_len];
|
||||
for (int i = 0; i < actl_len; i++) {
|
||||
int v = -1;
|
||||
switch (dim_type) {
|
||||
case Utf16_mapper.Dims_code_for_byte:
|
||||
v = mapper.Get_code_for_byte_or_neg1(i);
|
||||
break;
|
||||
case Utf16_mapper.Dims_byte_for_code:
|
||||
v = mapper.Get_byte_for_code_or_neg1(i);
|
||||
break;
|
||||
case Utf16_mapper.Dims_code_for_char:
|
||||
v = mapper.Get_code_for_char_or_neg1(i);
|
||||
break;
|
||||
case Utf16_mapper.Dims_char_for_code:
|
||||
v = mapper.Get_char_for_code_or_neg1(i);
|
||||
break;
|
||||
}
|
||||
actl[i] = v;
|
||||
}
|
||||
Gftest.Eq__ary(expd, actl, Int_.To_str(dim_type));
|
||||
}
|
||||
}
|
@ -54,12 +54,10 @@ public class Scrib_lib_ustring implements Scrib_lib {
|
||||
boolean plain = args.Cast_bool_or_n(3);
|
||||
|
||||
// init text vars
|
||||
byte[] text_bry = Bry_.new_u8(text_str);
|
||||
int text_bry_len = text_bry.length;
|
||||
Utf16_mapper text_map = new Utf16_mapper(text_str, text_bry, text_bry_len); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23
|
||||
Unicode_string text_ucs = Unicode_string_.New(text_str); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23
|
||||
|
||||
// convert bgn from base_1 to base_0
|
||||
int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_map.Len_in_codes());
|
||||
int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_ucs.Len_codes());
|
||||
|
||||
/*
|
||||
int offset = 0;
|
||||
@ -81,33 +79,33 @@ public class Scrib_lib_ustring implements Scrib_lib {
|
||||
// if plain, just do literal match of find and exit
|
||||
if (plain) {
|
||||
// find pos by literal match
|
||||
byte[] find_bry = Bry_.new_u8(find_str);
|
||||
int pos = Bry_find_.Find_fwd(text_bry, find_bry, text_map.Get_byte_for_code_or_fail(bgn_as_codes));
|
||||
Unicode_string find_ucs = Unicode_string_.New(find_str);
|
||||
byte[] find_bry = find_ucs.Src_bytes();
|
||||
int pos = Bry_find_.Find_fwd(text_ucs.Src_bytes(), find_bry, text_ucs.Pos_codes_to_bytes(bgn_as_codes));
|
||||
|
||||
// nothing found; return empty
|
||||
if (pos == Bry_find_.Not_found)
|
||||
return rslt.Init_ary_empty();
|
||||
|
||||
// bgn: convert pos from bytes back to codes; also adjust for base1
|
||||
int bgn = text_map.Get_code_for_byte_or_fail(pos) + Base1;
|
||||
int bgn = text_ucs.Pos_bytes_to_codes(pos) + Base1;
|
||||
|
||||
// end: add find.Len_in_codes and adjust end for PHP/LUA
|
||||
Utf16_mapper find_map = new Utf16_mapper(find_str, find_bry, find_bry.length);
|
||||
int end = bgn + find_map.Len_in_codes() - End_adj;
|
||||
int end = bgn + find_ucs.Len_codes() - End_adj;
|
||||
|
||||
return rslt.Init_many_objs(bgn, end);
|
||||
}
|
||||
|
||||
// run regex
|
||||
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
|
||||
Regx_match[] regx_rslts = Run_regex_or_null(text_map, regx_converter, find_str, bgn_as_codes);
|
||||
Regx_match[] regx_rslts = Run_regex_or_null(text_ucs, regx_converter, find_str, bgn_as_codes);
|
||||
if (regx_rslts.length == 0) return rslt.Init_ary_empty();
|
||||
|
||||
// add to tmp_list
|
||||
Regx_match match = regx_rslts[0]; // NOTE: take only 1st result; DATE:2014-08-27
|
||||
List_adp tmp_list = List_adp_.New();
|
||||
tmp_list.Add(text_map.Get_code_for_char_or_fail(match.Find_bgn()) + Scrib_lib_ustring.Base1);
|
||||
tmp_list.Add(text_map.Get_code_for_char_or_fail(match.Find_end()) + Scrib_lib_ustring.Base1 - Scrib_lib_ustring.End_adj);
|
||||
tmp_list.Add(text_ucs.Pos_chars_to_codes(match.Find_bgn()) + Scrib_lib_ustring.Base1);
|
||||
tmp_list.Add(text_ucs.Pos_chars_to_codes(match.Find_end()) + Scrib_lib_ustring.Base1 - Scrib_lib_ustring.End_adj);
|
||||
AddCapturesFromMatch(tmp_list, match, text_str, regx_converter.Capt_ary(), false);
|
||||
return rslt.Init_many_list(tmp_list);
|
||||
}
|
||||
@ -120,13 +118,12 @@ public class Scrib_lib_ustring implements Scrib_lib {
|
||||
// validate / adjust
|
||||
if (text_str == null) // if no text_str is passed, do not fail; return empty; EX:d:changed; DATE:2014-02-06
|
||||
return rslt.Init_many_list(List_adp_.Noop);
|
||||
byte[] text_bry = Bry_.new_u8(text_str); int text_bry_len = text_bry.length;
|
||||
Utf16_mapper text_map = new Utf16_mapper(text_str, text_bry, text_bry_len); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23
|
||||
int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_map.Len_in_codes());
|
||||
Unicode_string text_ucs = Unicode_string_.New(text_str); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23
|
||||
int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_ucs.Len_codes());
|
||||
|
||||
// run regex
|
||||
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
|
||||
Regx_match[] regx_rslts = Run_regex_or_null(text_map, regx_converter, find_str, bgn_as_codes);
|
||||
Regx_match[] regx_rslts = Run_regex_or_null(text_ucs, regx_converter, find_str, bgn_as_codes);
|
||||
if (regx_rslts.length == 0) return rslt.Init_null(); // return null if no matches found; EX:w:Mount_Gambier_(volcano); DATE:2014-04-02; confirmed with en.d:民; DATE:2015-01-30
|
||||
|
||||
// TOMBSTONE: add 1st match only; do not add all; PAGE:en.d:действительное_причастие_настоящего_времени DATE:2017-04-23
|
||||
@ -141,7 +138,7 @@ public class Scrib_lib_ustring implements Scrib_lib {
|
||||
}
|
||||
public boolean Gmatch_init(Scrib_proc_args args, Scrib_proc_rslt rslt) {
|
||||
// String text = Scrib_kv_utl_.Val_to_str(values, 0);
|
||||
byte[] regx = args.Pull_bry(1);
|
||||
String regx = args.Pull_str(1);
|
||||
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
|
||||
String pcre = regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_null);
|
||||
return rslt.Init_many_objs(pcre, regx_converter.Capt_ary());
|
||||
@ -181,13 +178,13 @@ public class Scrib_lib_ustring implements Scrib_lib {
|
||||
bgn_as_codes = 0;
|
||||
return bgn_as_codes;
|
||||
}
|
||||
private Regx_match[] Run_regex_or_null(Utf16_mapper text_map, Scrib_regx_converter regx_converter, String find_str, int bgn_as_codes) {
|
||||
private Regx_match[] Run_regex_or_null(Unicode_string text_ucs, Scrib_regx_converter regx_converter, String find_str, int bgn_as_codes) {
|
||||
// convert regex from lua to java
|
||||
find_str = regx_converter.patternToRegex(Bry_.new_u8(find_str), Scrib_regx_converter.Anchor_G);
|
||||
find_str = regx_converter.patternToRegex(find_str, Scrib_regx_converter.Anchor_G);
|
||||
|
||||
// run regex
|
||||
Regx_adp regx_adp = Scrib_lib_ustring.RegxAdp_new_(core.Ctx(), find_str);
|
||||
return regx_adp.Match_all(text_map.Src_str(), text_map.Get_char_for_code_or_fail(bgn_as_codes)); // NOTE: MW calculates an offset to handle mb strings. however, java's regex always takes offset in chars (not bytes like PHP preg_match); DATE:2014-03-04
|
||||
return regx_adp.Match_all(text_ucs.Src_string(), text_ucs.Pos_codes_to_chars(bgn_as_codes)); // NOTE: MW calculates an offset to handle mb strings. however, java's regex always takes offset in chars (not bytes like PHP preg_match); DATE:2014-03-04
|
||||
}
|
||||
private void AddCapturesFromMatch(List_adp tmp_list, Regx_match rslt, String text, Keyval[] capts, boolean op_is_match) {// NOTE: this matches behavior in UstringLibrary.php!addCapturesFromMatch
|
||||
int capts_len = capts == null ? 0 : capts.length;
|
||||
|
@ -32,7 +32,7 @@ class Scrib_lib_ustring_gsub_mgr {
|
||||
|
||||
// get @pattern; NOTE: sometimes int; PAGE:en.d:λύω; DATE:2014-09-02
|
||||
String regx = args.Xstr_str_or_null(1);
|
||||
regx = regx_converter.patternToRegex(Bry_.new_u8(regx), Scrib_regx_converter.Anchor_pow);
|
||||
regx = regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_pow);
|
||||
|
||||
// get @repl
|
||||
Object repl_obj = args.Cast_obj_or_null(2);
|
||||
|
@ -14,27 +14,31 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*;
|
||||
import gplx.core.brys.fmtrs.*;
|
||||
import gplx.core.brys.fmtrs.*; import gplx.core.intls.*;
|
||||
import gplx.langs.regxs.*;
|
||||
public class Scrib_regx_converter {
|
||||
private final Hash_adp_bry percent_hash = Hash_adp_bry.cs(), brack_hash = Hash_adp_bry.cs();
|
||||
private final Scrib_regx_grp_mgr grp_mgr = new Scrib_regx_grp_mgr();
|
||||
private final Bry_bfr bfr = Bry_bfr_.New();
|
||||
private Bry_bfr tmp_bfr;
|
||||
private Bry_fmtr fmtr_balanced; private Bry_bfr bfr_balanced;
|
||||
public Scrib_regx_converter() {Init();}
|
||||
private final Lua_cls_to_regx_map percent_map, brack_map;
|
||||
public Scrib_regx_converter() {
|
||||
percent_map = Lua_cls_matcher.Instance.Percent();
|
||||
brack_map = Lua_cls_matcher.Instance.Brack();
|
||||
}
|
||||
public String Regx() {return regx;} private String regx;
|
||||
public Keyval[] Capt_ary() {return grp_mgr.Capt__to_ary();}
|
||||
public boolean Any_pos() {return any_pos;} private boolean any_pos;
|
||||
public Regx_match[] Adjust_balanced(Regx_match[] rslts) {return grp_mgr.Adjust_balanced(rslts);}
|
||||
public String patternToRegex(byte[] pat, byte[] anchor) {
|
||||
public String patternToRegex(String pat_str, byte[] anchor) {
|
||||
Unicode_string pat_ucs = Unicode_string_.New(pat_str);
|
||||
// TODO.CACHE: if (!$this->patternRegexCache->has($cacheKey))
|
||||
grp_mgr.Clear();
|
||||
any_pos = false;
|
||||
boolean q_flag = false;
|
||||
|
||||
|
||||
// bfr.Add_byte(Byte_ascii.Slash); // TOMBSTONE: do not add PHP "/" at start
|
||||
int len = pat.length;
|
||||
int len = pat_ucs.Len_codes();
|
||||
int grps_len = 0;
|
||||
int bct = 0;
|
||||
|
||||
@ -42,7 +46,7 @@ public class Scrib_regx_converter {
|
||||
for (int i = 0; i < len; i++) {
|
||||
int i_end = i + 1;
|
||||
q_flag = false; // must be reset; REF.MW:UstringLibrary.php|patternToRegex; DATE:2014-02-08
|
||||
byte cur = pat[i];
|
||||
int cur = pat_ucs.Val_codes(i);
|
||||
switch (cur) {
|
||||
case Byte_ascii.Pow:
|
||||
q_flag = i != 0;
|
||||
@ -59,7 +63,7 @@ public class Scrib_regx_converter {
|
||||
int grp_idx = grp_mgr.Capt__len() + 1;
|
||||
|
||||
// check for "()"; enables anypos flag
|
||||
boolean is_empty_capture = pat[i + 1] == Byte_ascii.Paren_end;
|
||||
boolean is_empty_capture = pat_ucs.Val_codes(i + 1) == Byte_ascii.Paren_end;
|
||||
if (is_empty_capture)
|
||||
any_pos = true;
|
||||
grp_mgr.Capt__add__real(grp_idx, is_empty_capture);
|
||||
@ -77,19 +81,19 @@ public class Scrib_regx_converter {
|
||||
i++;
|
||||
if (i >= len)
|
||||
throw Err_.new_wo_type("malformed pattern (ends with '%')");
|
||||
Object percent_obj = percent_hash.Get_by_mid(pat, i, i + 1);
|
||||
if (percent_obj != null) {
|
||||
bfr.Add((byte[])percent_obj);
|
||||
byte[] percent_bry = percent_map.Get_or_null(pat_ucs.Val_codes(i));
|
||||
if (percent_bry != null) {
|
||||
bfr.Add(percent_bry);
|
||||
q_flag = true;
|
||||
}
|
||||
else {
|
||||
byte nxt = pat[i];
|
||||
int nxt = pat_ucs.Val_codes(i);
|
||||
switch (nxt) {
|
||||
case Byte_ascii.Ltr_b: // EX: "%b()"
|
||||
i += 2;
|
||||
if (i >= len) throw Err_.new_wo_type("malformed pattern (missing arguments to '%b')");
|
||||
byte char_0 = pat[i - 1];
|
||||
byte char_1 = pat[i];
|
||||
int char_0 = pat_ucs.Val_codes(i - 1);
|
||||
int char_1 = pat_ucs.Val_codes(i);
|
||||
if (char_0 == char_1) { // same char: easier regex; REF.MW: $bfr .= "{$d1}[^$d1]*$d1";
|
||||
bfr.Add(Bry_bf0_seg_0);
|
||||
Regx_quote(bfr, char_0);
|
||||
@ -109,18 +113,18 @@ public class Scrib_regx_converter {
|
||||
synchronized (fmtr_balanced) {
|
||||
++bct;
|
||||
int balanced_idx = grp_mgr.Full__len();
|
||||
fmtr_balanced.Bld_bfr(bfr_balanced, Int_.To_bry(bct), Byte_.Ary(char_0), Byte_.Ary(char_1), Int_.To_bry(balanced_idx + 1), Int_.To_bry(balanced_idx + 2));
|
||||
fmtr_balanced.Bld_bfr(bfr_balanced, Int_.To_bry(bct), Utf16_.Encode_int_to_bry(char_0), Utf16_.Encode_int_to_bry(char_1), Int_.To_bry(balanced_idx + 1), Int_.To_bry(balanced_idx + 2));
|
||||
grp_mgr.Capt__add__fake(2);
|
||||
bfr.Add(bfr_balanced.To_bry_and_clear());
|
||||
}
|
||||
}
|
||||
break;
|
||||
case Byte_ascii.Ltr_f: { // EX: lua frontier pattern; "%f[%a]"; DATE:2015-07-21
|
||||
if (i + 1 >= len || pat[++i] != Byte_ascii.Brack_bgn)
|
||||
if (i + 1 >= len || pat_ucs.Val_codes(++i) != Byte_ascii.Brack_bgn)
|
||||
throw Err_.new_("scribunto", "missing '[' after %f in pattern at pattern character " + Int_.To_str(i_end));
|
||||
// %f always followed by bracketed term; convert lua bracketed term to regex
|
||||
if (tmp_bfr == null) tmp_bfr = Bry_bfr_.New();
|
||||
i = bracketedCharSetToRegex(tmp_bfr, pat, i, len);
|
||||
i = bracketedCharSetToRegex(tmp_bfr, pat_ucs, i, len);
|
||||
byte[] re2 = tmp_bfr.To_bry_and_clear();
|
||||
|
||||
// scrib has following comment: 'Because %f considers the beginning and end of the String to be \0, determine if $re2 matches that and take it into account with "^" and "$".'
|
||||
@ -148,7 +152,7 @@ public class Scrib_regx_converter {
|
||||
}
|
||||
break;
|
||||
case Byte_ascii.Brack_bgn:
|
||||
i = bracketedCharSetToRegex(bfr, pat, i, len);
|
||||
i = bracketedCharSetToRegex(bfr, pat_ucs, i, len);
|
||||
q_flag = true;
|
||||
break;
|
||||
case Byte_ascii.Brack_end:
|
||||
@ -163,12 +167,12 @@ public class Scrib_regx_converter {
|
||||
break;
|
||||
}
|
||||
if (q_flag && i + 1 < len) {
|
||||
byte tmp_b = pat[i + 1];
|
||||
int tmp_b = pat_ucs.Val_codes(i + 1);
|
||||
switch (tmp_b) {
|
||||
case Byte_ascii.Star:
|
||||
case Byte_ascii.Plus:
|
||||
case Byte_ascii.Question:
|
||||
bfr.Add_byte(tmp_b);
|
||||
bfr.Add_byte((byte)tmp_b);
|
||||
++i;
|
||||
break;
|
||||
case Byte_ascii.Dash:
|
||||
@ -184,35 +188,35 @@ public class Scrib_regx_converter {
|
||||
regx = bfr.To_str_and_clear();
|
||||
return regx;
|
||||
}
|
||||
private int bracketedCharSetToRegex(Bry_bfr bfr, byte[] pat, int i, int len) {
|
||||
private int bracketedCharSetToRegex(Bry_bfr bfr, Unicode_string pat_ucs, int i, int len) {
|
||||
bfr.Add_byte(Byte_ascii.Brack_bgn);
|
||||
i++;
|
||||
if (i < len && pat[i] == Byte_ascii.Pow) { // ^
|
||||
if (i < len && pat_ucs.Val_codes(i) == Byte_ascii.Pow) { // ^
|
||||
bfr.Add_byte(Byte_ascii.Pow);
|
||||
i++;
|
||||
}
|
||||
for (int j = i; i < len && (j == i || pat[i] != Byte_ascii.Brack_end); i++) {
|
||||
if (pat[i] == Byte_ascii.Percent) {
|
||||
for (int j = i; i < len && (j == i || pat_ucs.Val_codes(i) != Byte_ascii.Brack_end); i++) {
|
||||
if (pat_ucs.Val_codes(i) == Byte_ascii.Percent) {
|
||||
i++;
|
||||
if (i >= len) {
|
||||
break;
|
||||
}
|
||||
Object brack_obj = brack_hash.Get_by_mid(pat, i, i + 1);
|
||||
if (brack_obj != null)
|
||||
bfr.Add((byte[])brack_obj);
|
||||
byte[] brack_bry = brack_map.Get_or_null(pat_ucs.Val_codes(i));
|
||||
if (brack_bry != null)
|
||||
bfr.Add(brack_bry);
|
||||
else
|
||||
Regx_quote(bfr, pat[i]);
|
||||
Regx_quote(bfr, pat_ucs.Val_codes(i));
|
||||
}
|
||||
else if (i + 2 < len && pat[i + 1] == Byte_ascii.Dash && pat[i + 2] != Byte_ascii.Brack_end && pat[i + 2] != Byte_ascii.Hash) {
|
||||
if (pat[i] <= pat[i + 2]) {
|
||||
Regx_quote(bfr, pat[i]);
|
||||
else if (i + 2 < len && pat_ucs.Val_codes(i + 1) == Byte_ascii.Dash && pat_ucs.Val_codes(i + 2) != Byte_ascii.Brack_end && pat_ucs.Val_codes(i + 2) != Byte_ascii.Hash) {
|
||||
if (pat_ucs.Val_codes(i) <= pat_ucs.Val_codes(i + 2)) {
|
||||
Regx_quote(bfr, pat_ucs.Val_codes(i));
|
||||
bfr.Add_byte(Byte_ascii.Dash);
|
||||
Regx_quote(bfr, pat[i + 2]);
|
||||
Regx_quote(bfr, pat_ucs.Val_codes(i + 2));
|
||||
}
|
||||
i += 2;
|
||||
}
|
||||
else {
|
||||
Regx_quote(bfr, pat[i]);
|
||||
Regx_quote(bfr, pat_ucs.Val_codes(i));
|
||||
}
|
||||
}
|
||||
if (i > len) throw Err_.new_wo_type("Missing close-bracket for character set beginning at pattern character $nxt_pos");
|
||||
@ -233,12 +237,12 @@ public class Scrib_regx_converter {
|
||||
}
|
||||
return i;
|
||||
}
|
||||
// Appends one pattern character to bfr, writing a backslash first when Regx_char returns true for it.
// NOTE(review): a byte-based and an int-based signature row both appear here and share one closing brace;
// this looks like the before/after rows of a single change -- confirm which rows the real file keeps.
private void Regx_quote(Bry_bfr bfr, byte b) {
|
||||
if (Regx_char(b)) bfr.Add_byte(Byte_ascii.Backslash);
|
||||
bfr.Add_byte(b);
|
||||
private void Regx_quote(Bry_bfr bfr, int code) {
|
||||
// escape first: Regx_char decides whether this code needs a leading backslash
if (Regx_char(code)) bfr.Add_byte(Byte_ascii.Backslash);
|
||||
// presumably writes the code point as UTF-8 bytes -- TODO confirm against Bry_bfr.Add_u8_int
bfr.Add_u8_int(code);
|
||||
}
|
||||
private boolean Regx_char(byte b) {
|
||||
switch (b) {
|
||||
private boolean Regx_char(int code) {
|
||||
switch (code) {
|
||||
case Byte_ascii.Dot: case Byte_ascii.Slash: case Byte_ascii.Plus: case Byte_ascii.Star: case Byte_ascii.Question:
|
||||
case Byte_ascii.Pow: case Byte_ascii.Dollar: case Byte_ascii.Eq: case Byte_ascii.Bang: case Byte_ascii.Pipe:
|
||||
case Byte_ascii.Colon: case Byte_ascii.Dash:
|
||||
@ -257,7 +261,10 @@ public class Scrib_regx_converter {
|
||||
, Bry_star_question = Bry_.new_a7("*?") // was *?
|
||||
;
|
||||
public static final byte[] Anchor_null = null, Anchor_G = Bry_.new_a7("\\G"), Anchor_pow = Bry_.new_a7("^");
|
||||
private void Init() {
|
||||
}
|
||||
class Lua_cls_matcher {
|
||||
public static final Lua_cls_matcher Instance = new Lua_cls_matcher();
|
||||
Lua_cls_matcher() {
|
||||
String regx_w = "\\w"; // JRE.7: \w not support in JRE.6; PAGE:en.w:A♯_(musical_note) DATE:2015-06-10
|
||||
String regx_W = "\\W"; // JRE.7: \w not support in JRE.6; PAGE:en.w:A♯_(musical_note) DATE:2015-06-10
|
||||
Init_itm(Bool_.Y, "a", "\\p{L}");
|
||||
@ -288,15 +295,32 @@ public class Scrib_regx_converter {
|
||||
Init_itm(Bool_.N, "X", "\\x00-\\x2f\\x3a-\\x40\\x47-\\x60\\x67-\\x{ff0f}\\x{ff1a}-\\x{ff20}\\x{ff27}-\\x{ff40}\\x{ff47}-\\x{10ffff}");
|
||||
Init_itm(Bool_.N, "Z", "\\x01-\\x{10ffff}");
|
||||
}
|
||||
// Accessor for percent_map; Init_itm writes to this map only when its add_to_percent_hash argument is true.
public Lua_cls_to_regx_map Percent() {return percent_map;} private final Lua_cls_to_regx_map percent_map = new Lua_cls_to_regx_map();
|
||||
// Accessor for brack_map; Init_itm writes to this map in both of its branches.
public Lua_cls_to_regx_map Brack() {return brack_map;} private final Lua_cls_to_regx_map brack_map = new Lua_cls_to_regx_map();
|
||||
|
||||
// Registers one single-character Lua class letter (lua) together with its regex text (php).
// Throws when lua is not exactly 1 char long, or when its char code lies outside Byte_ascii.Ltr_A .. Byte_ascii.Ltr_z.
// add_to_percent_hash: when true the entry is stored in both the percent and the brack lookups;
// when false only the brack lookup is written.
// NOTE(review): both *_hash rows and *_map rows appear below; this looks like the before/after rows
// of a single change -- confirm which set the real file keeps.
private void Init_itm(boolean add_to_percent_hash, String lua, String php) {
|
||||
byte[] lua_bry = Bry_.new_a7(lua);
|
||||
int lua_len = String_.Len(lua);
|
||||
if (lua_len != 1) throw Err_.new_wo_type("lua must be 1 char only", "lua", lua);
|
||||
// the char code of the single letter is used directly as the index into the maps
int lua_code = (int)String_.CharAt(lua, 0);
|
||||
if (lua_code < Byte_ascii.Ltr_A || lua_code > Byte_ascii.Ltr_z) throw Err_.new_wo_type("lua must be between A and z", "lua", lua);
|
||||
|
||||
byte[] php_bry = Bry_.new_a7(php);
|
||||
if (add_to_percent_hash) {
|
||||
percent_hash.Add_bry_obj(lua_bry, php_bry);
|
||||
brack_hash.Add_bry_obj(lua_bry, php_bry); // always add to brack_hash; brack_hash = percent_hash + other characters
|
||||
percent_map.Set(lua_code, php_bry);
|
||||
brack_map.Set(lua_code, php_bry); // always add to brack_map; brack_map = percent_map + other characters
|
||||
}
|
||||
else {
|
||||
brack_hash.Add_if_dupe_use_nth(lua_bry, php_bry); // replace percent_hash definitions
|
||||
brack_map.Set(lua_code, php_bry); // overwrites any entry already stored for this code (e.g. one added by the percent branch)
|
||||
}
|
||||
}
|
||||
}
|
||||
// Array-backed lookup from a character code (int) to a regex fragment (byte[]); slots never Set stay null.
class Lua_cls_to_regx_map {
|
||||
// Length of the backing array; Get_or_null treats every code >= MAX as having no entry.
private static final int MAX = Byte_ascii.Max_7_bit;
|
||||
private final byte[][] map = new byte[MAX][];
|
||||
// Returns the value stored for code, or null when code >= MAX or nothing was Set for it.
// NOTE(review): there is no lower-bound check, so a negative code would index out of bounds.
public byte[] Get_or_null(int code) {
|
||||
return code < MAX ? map[code] : null;
|
||||
}
|
||||
// Stores val at index code, replacing any earlier value.
// No range check is done here; the Init_itm caller visible in this file rejects codes outside Ltr_A .. Ltr_z before calling.
public void Set(int code, byte[] val) {
|
||||
map[code] = val;
|
||||
}
|
||||
}
|
||||
|
@ -50,6 +50,9 @@ public class Scrib_regx_converter_tst {
|
||||
@Test public void Mbcs() { // PURPOSE: handle regex for multi-byte chars; PAGE:en.d:どう; DATE:2016-01-22; .NET.REGX:fails
|
||||
// args are (text, find, replace, expd): the Lua set "[𠀀-]" holds one char outside the BMP plus a dash; replacing its match in "𠀀" with "a" must yield "a"
fxt.Test_replace("𠀀" , "[𠀀-]" , "a", "a");
|
||||
}
|
||||
@Test public void Invalid_range() {// PURPOSE: if range is invalid, take 1st char only; note range is multi-byte; ISSUE#:383; PAGE:en.d:dictionary DATE:2019-03-16
|
||||
// NOTE(review): "2492-2479" is presumably the decimal code points of the range's start and end (start > end, so the range is dropped) -- confirm
fxt.Test_parse("[ড়-য়]" , "[ড়]"); // 2492-2479
|
||||
}
|
||||
// @Test public void Brack_empty_all() {fxt.Test_parse("[]" , "(?:(*FAIL))");}
|
||||
// @Test public void Brack_empty_not() {fxt.Test_parse("[^]" , ".");}
|
||||
}
|
||||
@ -61,11 +64,11 @@ class Scrib_regx_converter_fxt {
|
||||
}
|
||||
}
|
||||
// Converts the Lua pattern raw with anchor Anchor_G, then asserts that under.Regx() equals expd.
// NOTE(review): two patternToRegex rows appear (byte[] argument and String argument); this looks like
// the before/after rows of a single change -- confirm which row the real file keeps.
public void Test_parse(String raw, String expd) {
|
||||
under.patternToRegex(Bry_.new_u8(raw), Scrib_regx_converter.Anchor_G);
|
||||
under.patternToRegex(raw, Scrib_regx_converter.Anchor_G);
|
||||
Tfds.Eq(expd, under.Regx());
|
||||
}
|
||||
// Converts the Lua pattern find to a regex with anchor Anchor_G, applies Regx_adp_.Replace(text, regex, replace),
// and asserts that the result equals expd.
// NOTE(review): two regex_str declaration rows appear (byte[] argument and String argument); this looks like
// the before/after rows of a single change -- confirm which row the real file keeps.
public void Test_replace(String text, String find, String replace, String expd) {
|
||||
String regex_str = under.patternToRegex(Bry_.new_u8(find), Scrib_regx_converter.Anchor_G);
|
||||
String regex_str = under.patternToRegex(find, Scrib_regx_converter.Anchor_G);
|
||||
String actl = Regx_adp_.Replace(text, regex_str, replace);
|
||||
Tfds.Eq(expd, actl);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user