1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2026-03-02 03:49:30 +00:00

Scribunto: Iterate regx by codepoint not by bytes [#383]

This commit is contained in:
gnosygnu
2019-03-16 23:50:26 -04:00
parent 4cd23b9827
commit 8ef5854eb7
12 changed files with 380 additions and 207 deletions

View File

@@ -0,0 +1,49 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.core.intls; import gplx.*; import gplx.core.*;
/**
 * Read-only view of a String that can be addressed in three index spaces:
 * codepoints ("codes"), Java chars ("chars") and UTF-8 bytes ("bytes").
 * The Len_* methods report the length in each space; the Pos_* methods translate a
 * position from one space to another. Instances are created by Unicode_string_.New.
 */
public interface Unicode_string {
// true for the implementation where 1 byte == 1 codepoint (Unicode_string_single); false for Unicode_string_multi
boolean Tid_is_single();
// the String the instance was built from; null when built from a null String
String Src_string();
// the bytes the factory obtained from Bry_.new_u8 for the source String; null when built from a null String
byte[] Src_bytes();
// number of codepoints
int Len_codes();
// number of Java chars; a 4-byte codepoint counts as 2 (EX: "a¢€𤭢" has 4 codes but 5 chars)
int Len_chars();
// number of bytes
int Len_bytes();
// codepoint value at codepoint index i; EX: 97 for "a"
int Val_codes(int i);
// byte position of codepoint index i; implementations also accept i == Len_codes() to mean end-of-string
int Pos_codes_to_bytes(int i);
// char position of codepoint index i; implementations also accept i == Len_codes() to mean end-of-string
int Pos_codes_to_chars(int i);
// codepoint index for byte position i; Unicode_string_multi throws if i falls inside a multi-byte sequence
int Pos_bytes_to_codes(int i);
// codepoint index for char position i; Unicode_string_multi throws if i does not start a codepoint
int Pos_chars_to_codes(int i);
}
/**
 * Unicode_string for text in which every codepoint occupies exactly 1 byte and 1 char,
 * so positions are identical in all three index spaces and no lookup tables are needed.
 * Also used by the factory to represent a null String (all members null, length 0).
 */
class Unicode_string_single implements Unicode_string { // 1 byte == 1 codepoint
	private final String src_string;
	private final byte[] src_bytes;
	private final int[] codes;
	private final int codes_len;

	public Unicode_string_single(String src_string, byte[] src_bytes, int[] codes, int codes_len) {
		this.src_string = src_string;
		this.src_bytes = src_bytes;
		this.codes = codes;
		this.codes_len = codes_len;
	}

	public boolean Tid_is_single() {return true;}
	public String Src_string()     {return src_string;}
	public byte[] Src_bytes()      {return src_bytes;}

	// codes, chars and bytes all share one length
	public int Len_codes() {return codes_len;}
	public int Len_chars() {return codes_len;}
	public int Len_bytes() {return codes_len;}

	public int Val_codes(int i) {return codes[i];}

	// every conversion is the identity function once the position has been range-checked
	public int Pos_codes_to_bytes(int i) {return Check_pos(i);}
	public int Pos_codes_to_chars(int i) {return Check_pos(i);}
	public int Pos_bytes_to_codes(int i) {return Check_pos(i);}
	public int Pos_chars_to_codes(int i) {return Check_pos(i);}

	// returns pos unchanged if it lies in [0, codes_len]; codes_len itself is allowed because it marks end-of-string
	private int Check_pos(int pos) {
		boolean in_range = pos >= 0 && pos <= codes_len;
		if (!in_range) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", pos);
		return pos;
	}
}

View File

@@ -0,0 +1,48 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.core.intls; import gplx.*; import gplx.core.*;
/** Factory for Unicode_string instances. */
public class Unicode_string_ {
	/**
	 * Builds a Unicode_string for the given String.
	 * Returns an empty Unicode_string_single (null members, length 0) for a null String,
	 * a Unicode_string_single when the number of codepoints equals the number of bytes,
	 * and a Unicode_string_multi otherwise.
	 */
	public static Unicode_string New(String orig) {
		// null String: nothing to decode
		if (orig == null)
			return new Unicode_string_single(null, null, null, 0);

		byte[] src = Bry_.new_u8(orig);
		int src_len = src.length;

		// there can never be more codepoints than bytes, so src_len is a safe upper bound for the array
		int[] code_ary = new int[src_len];
		int code_count = 0;
		int char_count = 0;

		// walk the bytes one codepoint at a time
		for (int src_pos = 0; src_pos < src_len; code_count++) {
			code_ary[code_count] = Utf16_.Decode_to_int(src, src_pos);

			int seq_len = Utf8_.Len_of_char_by_1st_byte(src[src_pos]);
			src_pos += seq_len;
			char_count += Utf8_.Len_of_char_by_bytes_len(seq_len);
		}

		// same number of codepoints as bytes means every codepoint was 1 byte wide
		if (code_count == src_len)
			return new Unicode_string_single(orig, src, code_ary, code_count);
		return new Unicode_string_multi(orig, src, src_len, code_ary, code_count, char_count);
	}
}

View File

@@ -0,0 +1,81 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.core.intls; import gplx.*; import gplx.core.*;
/**
 * Unicode_string for text that contains at least one multi-byte codepoint.
 * Builds four lookup tables in the constructor so that positions can be converted
 * between codepoint, char and byte index spaces by a single array read.
 */
class Unicode_string_multi implements Unicode_string {
	// +1 to store last pos as len of String; needed for regex which returns match.Find_end() which will be len of String; EX: abc -> [0, 1, 2, 3]
	private static final int Invalid = -1, Adj_end = 1;

	private final String src;
	private final byte[] bytes;
	private final int bytes_len;
	private final int[] codes;
	private final int codes_len;
	private final int chars_len;

	private final int[] codes_to_bytes;
	private final int[] codes_to_chars;
	private final int[] bytes_to_codes; // Invalid for positions inside a multi-byte sequence
	private final int[] chars_to_codes; // Invalid for positions that do not start a codepoint

	public Unicode_string_multi(String src, byte[] bytes, int bytes_len, int[] codes, int codes_len, int chars_len) {
		this.src = src;
		this.bytes = bytes;
		this.bytes_len = bytes_len;
		this.codes = codes;
		this.codes_len = codes_len;
		this.chars_len = chars_len;

		// the codes_* tables are fully populated by the loop; the other two start out as Invalid
		this.codes_to_bytes = new int[codes_len + Adj_end];
		this.codes_to_chars = new int[codes_len + Adj_end];
		this.bytes_to_codes = New_int_ary(bytes_len);
		this.chars_to_codes = New_int_ary(chars_len);

		int code_idx = 0, byte_idx = 0, char_idx = 0;
		boolean eos = false;
		while (!eos) {
			// record the current position in all four tables; this also runs once for the end-of-string position
			codes_to_bytes[code_idx] = byte_idx;
			codes_to_chars[code_idx] = char_idx;
			bytes_to_codes[byte_idx] = code_idx;
			chars_to_codes[char_idx] = code_idx;

			eos = byte_idx == bytes_len;
			if (!eos) {
				// advance by one codepoint
				int seq_len = Utf8_.Len_of_char_by_1st_byte(bytes[byte_idx]);
				byte_idx += seq_len;
				char_idx += Utf8_.Len_of_char_by_bytes_len(seq_len);
				code_idx++;
			}
		}
	}

	public boolean Tid_is_single() {return false;}
	public String Src_string()     {return src;}
	public byte[] Src_bytes()      {return bytes;}
	public int Len_codes()         {return codes_len;}
	public int Len_chars()         {return chars_len;}
	public int Len_bytes()         {return bytes_len;}
	public int Val_codes(int i)    {return codes[i];}

	public int Pos_codes_to_bytes(int i) {return codes_to_bytes[i];}
	public int Pos_codes_to_chars(int i) {return codes_to_chars[i];}

	// fails when the byte position is not the first byte of a codepoint
	public int Pos_bytes_to_codes(int i) {
		int rv = bytes_to_codes[i];
		if (rv != Invalid) return rv;
		throw Err_.new_wo_type("invalid i", "src", src, "type", "bytes_to_codes", "i", i);
	}

	// fails when the char position is not the first char of a codepoint
	public int Pos_chars_to_codes(int i) {
		int rv = chars_to_codes[i];
		if (rv != Invalid) return rv;
		throw Err_.new_wo_type("invalid i", "src", src, "type", "chars_to_codes", "i", i);
	}

	// allocates an array of len + Adj_end entries, each preset to Invalid
	private static int[] New_int_ary(int len) {
		int[] rv = new int[len + Adj_end];
		int idx = rv.length;
		while (--idx >= 0)
			rv[idx] = Invalid;
		return rv;
	}
}

View File

@@ -0,0 +1,110 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.core.intls; import gplx.*; import gplx.core.*;
import org.junit.*; import gplx.core.tests.*;
/**
 * Tests for Unicode_string_.New and the two Unicode_string implementations.
 * Test__Len takes its expected values in the order (codes, chars, bytes).
 * Each Test__Pos_* call lists the expected result for every position from 0 through the
 * length (inclusive); -1 marks a position for which the conversion throws.
 */
public class Unicode_string_tst {
private final Unicode_string_fxt fxt = new Unicode_string_fxt();
// a null String yields an instance whose lengths are all 0
@Test public void Null() {
fxt.Init(null);
fxt.Test__Len(0, 0, 0);
}
// an empty String also reports 0 for all lengths
@Test public void Blank() {
fxt.Init("");
fxt.Test__Len(0, 0, 0);
}
// every codepoint is 1 byte, so all conversions are the identity, including the end position 3
@Test public void Single() {
fxt.Init("Abc");
fxt.Test__Len(3, 3, 3);
fxt.Test__Val_codes(65, 98, 99);
fxt.Test__Pos_codes_to_bytes(0, 1, 2, 3);
fxt.Test__Pos_codes_to_chars(0, 1, 2, 3);
fxt.Test__Pos_chars_to_codes(0, 1, 2, 3);
fxt.Test__Pos_bytes_to_codes(0, 1, 2, 3);
}
// codepoints of 1, 2, 3 and 4 bytes; the 4-byte codepoint takes 2 chars, giving 4 codes, 5 chars, 10 bytes
@Test public void Multi() {
fxt.Init("a¢€𤭢");
fxt.Test__Len(4, 5, 10);
fxt.Test__Val_codes(97, 162, 8364, 150370);
fxt.Test__Pos_codes_to_bytes(0, 1, 3, 6, 10);
fxt.Test__Pos_codes_to_chars(0, 1, 2, 3, 5);
// char 4 is the second char of the last codepoint, so it has no codepoint index
fxt.Test__Pos_chars_to_codes( 0, 1, 2, 3, -1, 4);
// bytes 2, 4-5 and 7-9 are continuation bytes, so they have no codepoint index
fxt.Test__Pos_bytes_to_codes( 0, 1, -1, 2, -1, -1, 3, -1, -1, -1, 4);
}
}
/**
 * Test fixture for Unicode_string: builds an instance via Unicode_string_.New and compares
 * its lengths, codepoint values and position conversions against expected arrays.
 */
class Unicode_string_fxt {
	private Unicode_string under;

	// creates the Unicode_string under test
	public void Init(String src) {
		under = Unicode_string_.New(src);
	}

	// checks the three lengths, in the order codes, chars, bytes
	public void Test__Len(int expd_codes, int expd_chars, int expd_bytes) {
		Gftest.Eq__int(expd_codes, under.Len_codes(), "codes");
		Gftest.Eq__int(expd_chars, under.Len_chars(), "chars");
		Gftest.Eq__int(expd_bytes, under.Len_bytes(), "bytes");
	}

	// checks the codepoint value at every codepoint index
	public void Test__Val_codes(int... expd) {
		int[] actl = new int[under.Len_codes()];
		for (int idx = 0; idx < actl.length; idx++)
			actl[idx] = under.Val_codes(idx);
		Gftest.Eq__ary(expd, actl);
	}

	// checks codes -> bytes for every codepoint index, including the end-of-string position
	public void Test__Pos_codes_to_bytes(int... expd) {
		int[] actl = new int[under.Len_codes() + 1];
		for (int idx = 0; idx < actl.length; idx++)
			actl[idx] = under.Pos_codes_to_bytes(idx);
		Gftest.Eq__ary(expd, actl);
	}

	// checks codes -> chars for every codepoint index, including the end-of-string position
	public void Test__Pos_codes_to_chars(int... expd) {
		int[] actl = new int[under.Len_codes() + 1];
		for (int idx = 0; idx < actl.length; idx++)
			actl[idx] = under.Pos_codes_to_chars(idx);
		Gftest.Eq__ary(expd, actl);
	}

	// checks bytes -> codes for every byte position, including end-of-string; a conversion that throws is recorded as -1
	public void Test__Pos_bytes_to_codes(int... expd) {
		int[] actl = new int[under.Len_bytes() + 1];
		for (int idx = 0; idx < actl.length; idx++) {
			try {
				actl[idx] = under.Pos_bytes_to_codes(idx);
			}
			catch (Exception exc) {
				actl[idx] = -1;
				Err_.Noop(exc);
			}
		}
		Gftest.Eq__ary(expd, actl);
	}

	// checks chars -> codes for every char position, including end-of-string; a conversion that throws is recorded as -1
	public void Test__Pos_chars_to_codes(int... expd) {
		int[] actl = new int[under.Len_chars() + 1];
		for (int idx = 0; idx < actl.length; idx++) {
			try {
				actl[idx] = under.Pos_chars_to_codes(idx);
			}
			catch (Exception exc) {
				actl[idx] = -1;
				Err_.Noop(exc);
			}
		}
		Gftest.Eq__ary(expd, actl);
	}
}

View File

@@ -1,79 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.core.intls; import gplx.*; import gplx.core.*;
/**
 * Maps positions of a UTF-8 byte array between three index spaces: bytes, Java chars and codepoints.
 * The constructor walks the bytes once and fills four lookup tables; positions that do not
 * correspond to the start of a codepoint (or that lie beyond the relevant length) hold Invalid.
 * Each table also has an entry for the end-of-string position.
 */
public class Utf16_mapper {
	private final int[] ary;     // the four tables stored back to back, dim_len entries each; table order is given by Dims_*
	private final int dim_len;   // entries per table
	private final byte[] src_bry;
	private final String src_str;
	private int len_in_codes;
	private int len_in_chars;

	/**
	 * @param src_str      source String; only stored and returned by Src_str()
	 * @param src_bry      bytes to map
	 * @param src_bry_len  number of bytes of src_bry to map
	 */
	public Utf16_mapper(String src_str, byte[] src_bry, int src_bry_len) {
		// create ary
		this.src_str = src_str;
		this.src_bry = src_bry;
		this.dim_len = src_bry_len + 1; // +1 to capture end + 1
		int ary_len = dim_len * Dims_total;
		this.ary = new int[ary_len];
		for (int i = 0; i < ary_len; i++)
			ary[i] = Invalid;

		// init
		int pos_in_bytes = 0, pos_in_chars = 0, pos_in_codes = 0;

		// loop till EOS
		while (true) {
			// update
			ary[(dim_len * Dims_code_for_byte) + pos_in_bytes] = pos_in_codes;
			ary[(dim_len * Dims_byte_for_code) + pos_in_codes] = pos_in_bytes;
			ary[(dim_len * Dims_code_for_char) + pos_in_chars] = pos_in_codes;
			ary[(dim_len * Dims_char_for_code) + pos_in_codes] = pos_in_chars;

			// exit if EOS
			if (pos_in_bytes >= src_bry_len) break;

			// get lengths
			int cur_len_in_bytes = Utf8_.Len_of_char_by_1st_byte(src_bry[pos_in_bytes]);
			int cur_len_in_chars = cur_len_in_bytes == 4 ? 2 : 1; // NOTE: 3 bytes represent up to U+FFFF (65,536) which will fit in 1; REF:en.w:UTF-8; ISSUE#:377; DATE:2019-03-04

			// increment
			pos_in_bytes += cur_len_in_bytes;
			pos_in_chars += cur_len_in_chars;
			pos_in_codes += 1;
		}

		// set lens
		this.len_in_codes = pos_in_codes;
		this.len_in_chars = pos_in_chars;
	}

	public byte[] Src_bry()   {return src_bry;}
	public String Src_str()   {return src_str;}
	public int Len_in_codes() {return len_in_codes;}
	public int Len_in_chars() {return len_in_chars;}

	// lookups that return Invalid (-1) when idx is out of range or has no mapping
	public int Get_code_for_byte_or_neg1(int idx) {return Get_or_neg1(Dims_code_for_byte, idx);}
	public int Get_byte_for_code_or_neg1(int idx) {return Get_or_neg1(Dims_byte_for_code, idx);}
	public int Get_code_for_char_or_neg1(int idx) {return Get_or_neg1(Dims_code_for_char, idx);}
	public int Get_char_for_code_or_neg1(int idx) {return Get_or_neg1(Dims_char_for_code, idx);}

	// lookups that throw when idx is out of range or has no mapping
	public int Get_code_for_byte_or_fail(int idx) {return Get_or_fail(Dims_code_for_byte, "code_for_byte", idx);}
	public int Get_byte_for_code_or_fail(int idx) {return Get_or_fail(Dims_byte_for_code, "byte_for_code", idx);}
	public int Get_code_for_char_or_fail(int idx) {return Get_or_fail(Dims_code_for_char, "code_for_char", idx);}
	public int Get_char_for_code_or_fail(int idx) {return Get_or_fail(Dims_char_for_code, "char_for_code", idx);}

	// reads one entry of the table selected by dim
	// the lower-bound check matters: without it a negative idx would index into the preceding table and return unrelated data
	private int Get_or_neg1(int dim, int idx) {
		return idx >= 0 && idx < dim_len ? ary[(dim_len * dim) + idx] : Invalid;
	}

	// same as Get_or_neg1, but throws if the result is Invalid
	// FIX: the check must be on the looked-up value (rv), not on the requested idx; checking idx meant the *_or_fail methods never failed
	private int Get_or_fail(int dim, String type, int idx) {
		int rv = Get_or_neg1(dim, idx);
		if (rv == Invalid) throw Err_.new_wo_type("invalid idx", "src", src_bry, "type", type, "idx", idx);
		return rv;
	}

	public static final int
	  Invalid = -1
	, Dims_total = 4
	, Dims_code_for_byte = 0
	, Dims_byte_for_code = 1
	, Dims_code_for_char = 2
	, Dims_char_for_code = 3
	;
}

View File

@@ -1,62 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.core.intls; import gplx.*; import gplx.core.*;
import org.junit.*; import gplx.core.tests.*;
/**
 * Test for Utf16_mapper using a String with codepoints of 1, 2, 3 and 4 bytes (10 bytes in total).
 * Each expected array has one entry per position from 0 through the byte length (11 entries);
 * -1 marks a position with no mapping.
 */
public class Utf16_mapper_tst {
private final Utf16_mapper_fxt fxt = new Utf16_mapper_fxt();
// expected arrays are passed in the order: code_for_byte, byte_for_code, code_for_char, char_for_code
@Test public void A() {
fxt.Test__map("a¢€𤭢"
, Int_ary_.New( 0, 1, -1, 2, -1, -1, 3, -1, -1, -1, 4)
, Int_ary_.New( 0, 1, 3, 6, 10, -1, -1, -1, -1, -1, -1)
, Int_ary_.New( 0, 1, 2, 3, -1, 4, -1, -1, -1, -1, -1)
, Int_ary_.New( 0, 1, 2, 3, 5, -1, -1, -1, -1, -1, -1)
);
}
}
/**
 * Test fixture for Utf16_mapper: builds a mapper for a String and compares each of its
 * four lookup tables, position by position, against an expected array.
 */
class Utf16_mapper_fxt {
	// builds the mapper from the String's bytes and checks all four tables
	public void Test__map(String src_str, int[] expd_code_for_byte, int[] expd_byte_for_code, int[] expd_code_for_char, int[] expd_char_for_code) {
		byte[] bry = Bry_.new_u8(src_str);
		int bry_len = bry.length;
		Utf16_mapper mapper = new Utf16_mapper(src_str, bry, bry_len);

		Test__ary(mapper, bry_len, Utf16_mapper.Dims_code_for_byte, expd_code_for_byte);
		Test__ary(mapper, bry_len, Utf16_mapper.Dims_byte_for_code, expd_byte_for_code);
		Test__ary(mapper, bry_len, Utf16_mapper.Dims_code_for_char, expd_code_for_char);
		Test__ary(mapper, bry_len, Utf16_mapper.Dims_char_for_code, expd_char_for_code);
	}

	// reads positions 0 through src_len (inclusive) from one table and compares them with expd
	private void Test__ary(Utf16_mapper mapper, int src_len, int dim_type, int[] expd) {
		int[] actl = new int[src_len + 1];
		for (int pos = 0; pos < actl.length; pos++)
			actl[pos] = Get_or_neg1(mapper, dim_type, pos);
		Gftest.Eq__ary(expd, actl, Int_.To_str(dim_type));
	}

	// dispatches to the mapper getter that matches dim_type; an unknown dim_type yields -1
	private static int Get_or_neg1(Utf16_mapper mapper, int dim_type, int pos) {
		if (dim_type == Utf16_mapper.Dims_code_for_byte) return mapper.Get_code_for_byte_or_neg1(pos);
		if (dim_type == Utf16_mapper.Dims_byte_for_code) return mapper.Get_byte_for_code_or_neg1(pos);
		if (dim_type == Utf16_mapper.Dims_code_for_char) return mapper.Get_code_for_char_or_neg1(pos);
		if (dim_type == Utf16_mapper.Dims_char_for_code) return mapper.Get_char_for_code_or_neg1(pos);
		return -1;
	}
}