1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2026-03-02 03:49:30 +00:00

Scribunto: Use Luaj for pattern-matching (instead of Java Regex) [#413]

This commit is contained in:
gnosygnu
2019-04-28 17:31:33 -04:00
parent 4a1b2e25c0
commit f860edf064
51 changed files with 2045 additions and 729 deletions

View File

@@ -1,51 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.core.intls; import gplx.*; import gplx.core.*;
/**
 * Read-only view of a String that can be addressed in three index spaces:
 * "codes" (one entry per decoded code value), "chars" and "bytes".
 * NOTE(review): "chars" presumably means Java (UTF-16) chars and "bytes" the
 * UTF-8 encoding of the String; confirm against Bry_ / Utf8_.
 *
 * The Pos_* methods translate a position in one space to the matching
 * position in another. Implementations in this package accept a position
 * equal to the length (the end-of-String position) as well as 0..len-1.
 */
public interface Unicode_string {
// true if the implementation treats every code as exactly one byte (and one char)
boolean Tid_is_single();
// the original String this instance was built from
String Src_string();
// the encoded bytes of the original String
byte[] Src_bytes();
// number of codes
int Len_codes();
// number of chars
int Len_chars();
// number of bytes
int Len_bytes();
// the code value at code index i
int Val_codes(int i);
// code position -> byte position
int Pos_codes_to_bytes(int i);
// code position -> char position
int Pos_codes_to_chars(int i);
// byte position -> char position; implementations may throw if i is not at the start of a code
int Pos_bytes_to_chars(int i);
// byte position -> code position; implementations may throw if i is not at the start of a code
int Pos_bytes_to_codes(int i);
// char position -> code position; implementations may throw if i is not at the start of a code
int Pos_chars_to_codes(int i);
}
/**
 * Unicode_string for the case where 1 byte == 1 codepoint.
 * Code, char and byte positions all coincide, so every length is codes_len
 * and every Pos_* conversion returns its argument after a range check.
 */
class Unicode_string_single implements Unicode_string { // 1 byte == 1 codepoint
  private final String src_string;
  private final byte[] src_bytes;
  private final int[] codes;
  private final int codes_len;

  /**
   * Stores the given values as-is; nothing is copied or validated here.
   *
   * @param src_string original String (may be null)
   * @param src_bytes  encoded bytes of src_string (may be null)
   * @param codes      code values; only the first codes_len entries are meaningful
   * @param codes_len  number of codes; also reported as the char and byte length
   */
  public Unicode_string_single(String src_string, byte[] src_bytes, int[] codes, int codes_len) {
    this.src_string = src_string;
    this.src_bytes = src_bytes;
    this.codes = codes;
    this.codes_len = codes_len;
  }

  public boolean Tid_is_single() {return true;}
  public String Src_string() {return src_string;}
  public byte[] Src_bytes() {return src_bytes;}

  // all three lengths are the same number
  public int Len_codes() {return codes_len;}
  public int Len_chars() {return codes_len;}
  public int Len_bytes() {return codes_len;}

  // no range check beyond what the array access itself performs
  public int Val_codes(int i) {return codes[i];}

  // every conversion is the identity on a valid position
  public int Pos_codes_to_bytes(int i) {return Same_pos(i);}
  public int Pos_codes_to_chars(int i) {return Same_pos(i);}
  public int Pos_bytes_to_chars(int i) {return Same_pos(i);}
  public int Pos_bytes_to_codes(int i) {return Same_pos(i);}
  public int Pos_chars_to_codes(int i) {return Same_pos(i);}

  /**
   * Returns i unchanged when 0 <= i <= codes_len; otherwise throws.
   * codes_len itself is accepted because it is the end-of-String position.
   */
  private int Same_pos(int i) {
    boolean in_range = i >= 0 && i <= codes_len;
    if (!in_range)
      throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i);
    return i;
  }
}

View File

@@ -1,48 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.core.intls; import gplx.*; import gplx.core.*;
/** Factory for Unicode_string instances. */
public class Unicode_string_ {
  /**
   * Builds a Unicode_string for orig.
   *
   * A null input yields an empty Unicode_string_single whose source String,
   * bytes and codes are all null. Otherwise the String is converted to bytes,
   * decoded into one int per code, and wrapped in Unicode_string_single when
   * the number of codes equals the number of bytes, or Unicode_string_multi
   * when it does not.
   *
   * @param orig String to wrap; may be null
   * @return a Unicode_string over orig
   */
  public static Unicode_string New(String orig) {
    if (orig == null) {
      return new Unicode_string_single(null, null, null, 0);
    }

    byte[] raw = Bry_.new_u8(orig);
    int raw_len = raw.length;

    // sized by byte count; only the first code_count slots end up being used
    int[] code_ary = new int[raw_len];
    int code_count = 0;
    int char_count = 0;

    // each pass consumes one code: decode it, then skip over its bytes
    for (int raw_idx = 0; raw_idx < raw_len; code_count++) {
      code_ary[code_count] = Utf16_.Decode_to_int(raw, raw_idx);
      int seq_len = Utf8_.Len_of_char_by_1st_byte(raw[raw_idx]);
      raw_idx += seq_len;
      char_count += Utf8_.Len_of_char_by_bytes_len(seq_len);
    }

    // same number of codes as bytes means every code was one byte wide
    if (code_count == raw_len) {
      return new Unicode_string_single(orig, raw, code_ary, code_count);
    }
    return new Unicode_string_multi(orig, raw, raw_len, code_ary, code_count, char_count);
  }
}

View File

@@ -1,85 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.core.intls; import gplx.*; import gplx.core.*;
/**
 * Unicode_string backed by precomputed lookup tables between code, char and
 * byte positions. Each table has one extra trailing slot so that the
 * end-of-String position (== length) can also be converted.
 */
class Unicode_string_multi implements Unicode_string {
  // +1 to store last pos as len of String; needed for regex which returns match.Find_end() which will be len of String; EX: abc -> [0, 1, 2, 3]
  private static final int Invalid = -1, Adj_end = 1;

  private final String src;
  private final byte[] bytes;
  private final int bytes_len;
  private final int[] codes;
  private final int codes_len;
  private final int chars_len;

  private final int[] codes_to_bytes;
  private final int[] codes_to_chars;
  private final int[] bytes_to_chars;
  private final int[] bytes_to_codes;
  private final int[] chars_to_codes;

  /**
   * Stores the arguments and fills the five position tables in a single
   * pass over bytes.
   *
   * @param src       original String
   * @param bytes     encoded bytes of src
   * @param bytes_len number of bytes
   * @param codes     code values; only the first codes_len entries are meaningful
   * @param codes_len number of codes
   * @param chars_len number of chars
   */
  public Unicode_string_multi(String src, byte[] bytes, int bytes_len, int[] codes, int codes_len, int chars_len) {
    this.src = src;
    this.bytes = bytes;
    this.bytes_len = bytes_len;
    this.codes = codes;
    this.codes_len = codes_len;
    this.chars_len = chars_len;

    // tables indexed by code are dense; tables indexed by byte / char are
    // sparse, so they start out filled with Invalid
    this.codes_to_bytes = new int[codes_len + Adj_end];
    this.codes_to_chars = new int[codes_len + Adj_end];
    this.bytes_to_codes = New_int_ary(bytes_len);
    this.bytes_to_chars = New_int_ary(bytes_len);
    this.chars_to_codes = New_int_ary(chars_len);

    int code_idx = 0;
    int byte_idx = 0;
    int char_idx = 0;
    boolean at_end;
    do {
      // record the current aligned triple of positions in every table
      codes_to_bytes[code_idx] = byte_idx;
      codes_to_chars[code_idx] = char_idx;
      bytes_to_chars[byte_idx] = char_idx;
      bytes_to_codes[byte_idx] = code_idx;
      chars_to_codes[char_idx] = code_idx;

      // the end-of-String position is recorded too, then the loop stops
      at_end = byte_idx == bytes_len;
      if (!at_end) {
        int seq_len = Utf8_.Len_of_char_by_1st_byte(bytes[byte_idx]);
        byte_idx += seq_len;
        char_idx += Utf8_.Len_of_char_by_bytes_len(seq_len);
        code_idx += 1;
      }
    } while (!at_end);
  }

  public boolean Tid_is_single() {return false;}
  public String Src_string() {return src;}
  public byte[] Src_bytes() {return bytes;}
  public int Len_codes() {return codes_len;}
  public int Len_chars() {return chars_len;}
  public int Len_bytes() {return bytes_len;}
  public int Val_codes(int i) {return codes[i];}

  // code-indexed tables have no Invalid entries, so these are plain lookups
  public int Pos_codes_to_bytes(int i) {return codes_to_bytes[i];}
  public int Pos_codes_to_chars(int i) {return codes_to_chars[i];}

  // byte- / char-indexed tables throw for positions that were never recorded
  public int Pos_bytes_to_chars(int i) {return Lookup(bytes_to_chars, "bytes_to_chars", i);}
  public int Pos_bytes_to_codes(int i) {return Lookup(bytes_to_codes, "bytes_to_codes", i);}
  public int Pos_chars_to_codes(int i) {return Lookup(chars_to_codes, "chars_to_codes", i);}

  /** Returns map[i], or throws if that slot still holds Invalid. */
  private int Lookup(int[] map, String type, int i) {
    int rv = map[i];
    if (rv == Invalid)
      throw Err_.new_wo_type("invalid i", "src", src, "type", type, "i", i);
    return rv;
  }

  /** Creates an int array of len + Adj_end slots, each set to Invalid. */
  private static int[] New_int_ary(int len) {
    int[] rv = new int[len + Adj_end];
    int idx = rv.length;
    while (idx > 0) {
      idx--;
      rv[idx] = Invalid;
    }
    return rv;
  }
}

View File

@@ -1,110 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.core.intls; import gplx.*; import gplx.core.*;
import org.junit.*; import gplx.core.tests.*;
/**
 * Tests for Unicode_string_.New and the position conversions of the
 * Unicode_string it returns. Expected arrays passed to Test__Pos_* always
 * contain one more entry than the corresponding length, because the
 * end-of-String position is checked as well.
 */
public class Unicode_string_tst {
private final Unicode_string_fxt fxt = new Unicode_string_fxt();
// null input: all three lengths are 0
@Test public void Null() {
fxt.Init(null);
fxt.Test__Len(0, 0, 0);
}
// empty input: all three lengths are 0
@Test public void Blank() {
fxt.Init("");
fxt.Test__Len(0, 0, 0);
}
// every code is one byte wide, so all lengths match and every conversion is the identity
@Test public void Single() {
fxt.Init("Abc");
fxt.Test__Len(3, 3, 3);
fxt.Test__Val_codes(65, 98, 99);
fxt.Test__Pos_codes_to_bytes(0, 1, 2, 3);
fxt.Test__Pos_codes_to_chars(0, 1, 2, 3);
fxt.Test__Pos_chars_to_codes(0, 1, 2, 3);
fxt.Test__Pos_bytes_to_codes(0, 1, 2, 3);
}
// four codes that take 1, 2, 3 and 4 bytes respectively; the last one also takes 2 chars
@Test public void Multi() {
fxt.Init("a¢€𤭢");
// 4 codes, 5 chars, 10 bytes
fxt.Test__Len(4, 5, 10);
fxt.Test__Val_codes(97, 162, 8364, 150370);
fxt.Test__Pos_codes_to_bytes(0, 1, 3, 6, 10);
fxt.Test__Pos_codes_to_chars(0, 1, 2, 3, 5);
// -1 marks a position for which the fixture caught an exception (not the start of a code)
fxt.Test__Pos_chars_to_codes( 0, 1, 2, 3, -1, 4);
fxt.Test__Pos_bytes_to_codes( 0, 1, -1, 2, -1, -1, 3, -1, -1, -1, 4);
}
}
/**
 * Test fixture around a single Unicode_string. Init builds the instance;
 * each Test__* method collects actual values from it and compares them to
 * the expected values with Gftest.
 */
class Unicode_string_fxt {
  private Unicode_string under;

  /** Creates the Unicode_string under test from src. */
  public void Init(String src) {
    under = Unicode_string_.New(src);
  }

  /** Checks the code, char and byte lengths, in that order. */
  public void Test__Len(int expd_codes, int expd_chars, int expd_bytes) {
    Gftest.Eq__int(expd_codes, under.Len_codes(), "codes");
    Gftest.Eq__int(expd_chars, under.Len_chars(), "chars");
    Gftest.Eq__int(expd_bytes, under.Len_bytes(), "bytes");
  }

  /** Checks the value of every code. */
  public void Test__Val_codes(int... expd) {
    int[] found = new int[under.Len_codes()];
    for (int idx = 0; idx < found.length; idx++) {
      found[idx] = under.Val_codes(idx);
    }
    Gftest.Eq__ary(expd, found);
  }

  /** Checks code -> byte positions for 0..Len_codes inclusive. */
  public void Test__Pos_codes_to_bytes(int... expd) {
    int[] found = new int[under.Len_codes() + 1];
    for (int idx = 0; idx < found.length; idx++) {
      found[idx] = under.Pos_codes_to_bytes(idx);
    }
    Gftest.Eq__ary(expd, found);
  }

  /** Checks code -> char positions for 0..Len_codes inclusive. */
  public void Test__Pos_codes_to_chars(int... expd) {
    int[] found = new int[under.Len_codes() + 1];
    for (int idx = 0; idx < found.length; idx++) {
      found[idx] = under.Pos_codes_to_chars(idx);
    }
    Gftest.Eq__ary(expd, found);
  }

  /** Checks byte -> code positions for 0..Len_bytes inclusive; a position that throws is reported as -1. */
  public void Test__Pos_bytes_to_codes(int... expd) {
    int[] found = new int[under.Len_bytes() + 1];
    for (int idx = 0; idx < found.length; idx++) {
      found[idx] = Bytes_to_codes_or_neg1(idx);
    }
    Gftest.Eq__ary(expd, found);
  }

  /** Checks char -> code positions for 0..Len_chars inclusive; a position that throws is reported as -1. */
  public void Test__Pos_chars_to_codes(int... expd) {
    int[] found = new int[under.Len_chars() + 1];
    for (int idx = 0; idx < found.length; idx++) {
      found[idx] = Chars_to_codes_or_neg1(idx);
    }
    Gftest.Eq__ary(expd, found);
  }

  // Converts one byte position, mapping any Exception to -1.
  private int Bytes_to_codes_or_neg1(int pos) {
    int val = 0;
    try {
      val = under.Pos_bytes_to_codes(pos);
    }
    catch (Exception exc) {
      val = -1;
      Err_.Noop(exc);
    }
    return val;
  }

  // Converts one char position, mapping any Exception to -1.
  private int Chars_to_codes_or_neg1(int pos) {
    int val = 0;
    try {
      val = under.Pos_chars_to_codes(pos);
    }
    catch (Exception exc) {
      val = -1;
      Err_.Noop(exc);
    }
    return val;
  }
}