mirror of
https://github.com/gnosygnu/xowa.git
synced 2026-03-02 03:49:30 +00:00
Scribunto: Use Luaj for pattern-matching (instead of Java Regex) [#413]
This commit is contained in:
@@ -1,51 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.core.intls; import gplx.*; import gplx.core.*;
|
||||
/**
 * Read-only view of a string that can be addressed in three different units:
 * code points ("codes"), Java chars ("chars") and encoded bytes ("bytes").
 *
 * <p>The {@code Pos_x_to_y} methods translate a position expressed in unit {@code x}
 * into the equivalent position expressed in unit {@code y}. The implementations in this
 * file also accept the end-of-string position (a position equal to the length).
 */
public interface Unicode_string {
	/** Returns true for the implementation in which every code point occupies exactly one byte. */
	boolean Tid_is_single();

	/** Returns the string this instance was built from. */
	String Src_string();

	/** Returns the encoded bytes of the source string. */
	byte[] Src_bytes();

	/** Returns the length of the string counted in code points. */
	int Len_codes();

	/** Returns the length of the string counted in Java chars. */
	int Len_chars();

	/** Returns the length of the string counted in bytes. */
	int Len_bytes();

	/** Returns the code point stored at code-point index {@code i}. */
	int Val_codes(int i);

	/** Converts a code-point position into the matching byte position. */
	int Pos_codes_to_bytes(int i);

	/** Converts a code-point position into the matching char position. */
	int Pos_codes_to_chars(int i);

	/** Converts a byte position into the matching char position. */
	int Pos_bytes_to_chars(int i);

	/** Converts a byte position into the matching code-point position. */
	int Pos_bytes_to_codes(int i);

	/** Converts a char position into the matching code-point position. */
	int Pos_chars_to_codes(int i);
}
|
||||
class Unicode_string_single implements Unicode_string { // 1 byte == 1 codepoint
|
||||
private final int[] codes;
|
||||
public Unicode_string_single(String src_string, byte[] src_bytes, int[] codes, int codes_len) {
|
||||
this.src_string = src_string;
|
||||
this.src_bytes = src_bytes;
|
||||
this.codes = codes;
|
||||
this.codes_len = codes_len;
|
||||
}
|
||||
public boolean Tid_is_single() {return true;}
|
||||
public String Src_string() {return src_string;} private final String src_string;
|
||||
public byte[] Src_bytes() {return src_bytes;} private final byte[] src_bytes;
|
||||
public int Len_codes() {return codes_len;} private final int codes_len;
|
||||
public int Len_chars() {return codes_len;}
|
||||
public int Len_bytes() {return codes_len;}
|
||||
public int Val_codes(int i) {return codes[i];}
|
||||
public int Pos_codes_to_bytes(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i); return i;}
|
||||
public int Pos_codes_to_chars(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i); return i;}
|
||||
public int Pos_bytes_to_chars(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i); return i;}
|
||||
public int Pos_bytes_to_codes(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i); return i;}
|
||||
public int Pos_chars_to_codes(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i); return i;}
|
||||
}
|
||||
@@ -1,48 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.core.intls; import gplx.*; import gplx.core.*;
|
||||
public class Unicode_string_ {
|
||||
public static Unicode_string New(String orig) {
|
||||
// null
|
||||
if (orig == null)
|
||||
return new Unicode_string_single(null, null, null, 0);
|
||||
|
||||
// init bytes
|
||||
byte[] bytes = Bry_.new_u8(orig);
|
||||
int bytes_len = bytes.length;
|
||||
|
||||
// init codes
|
||||
int[] codes = new int[bytes_len];
|
||||
int codes_len = 0;
|
||||
|
||||
// loop
|
||||
int bytes_pos = 0;
|
||||
int chars_pos = 0;
|
||||
while (bytes_pos < bytes_len) {
|
||||
// set codes
|
||||
codes[codes_len] = Utf16_.Decode_to_int(bytes, bytes_pos);
|
||||
|
||||
// increment
|
||||
int cur_byte_len = Utf8_.Len_of_char_by_1st_byte(bytes[bytes_pos]);
|
||||
bytes_pos += cur_byte_len;
|
||||
chars_pos += Utf8_.Len_of_char_by_bytes_len(cur_byte_len);
|
||||
codes_len += 1;
|
||||
}
|
||||
return codes_len == bytes_len
|
||||
? (Unicode_string)new Unicode_string_single(orig, bytes, codes, codes_len)
|
||||
: (Unicode_string)new Unicode_string_multi (orig, bytes, bytes_len, codes, codes_len, chars_pos);
|
||||
}
|
||||
}
|
||||
@@ -1,85 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.core.intls; import gplx.*; import gplx.core.*;
|
||||
class Unicode_string_multi implements Unicode_string {
|
||||
private final int[] codes;
|
||||
private final int[] codes_to_bytes;
|
||||
private final int[] codes_to_chars;
|
||||
private final int[] bytes_to_chars;
|
||||
private final int[] bytes_to_codes;
|
||||
private final int[] chars_to_codes;
|
||||
|
||||
public Unicode_string_multi(String src, byte[] bytes, int bytes_len, int[] codes, int codes_len, int chars_len) {
|
||||
// set member vars
|
||||
this.src = src;
|
||||
this.bytes = bytes;
|
||||
this.bytes_len = bytes_len;
|
||||
this.codes = codes;
|
||||
this.codes_len = codes_len;
|
||||
this.chars_len = chars_len;
|
||||
|
||||
// init maps
|
||||
this.codes_to_bytes = new int[codes_len + Adj_end];
|
||||
this.codes_to_chars = new int[codes_len + Adj_end];
|
||||
this.bytes_to_codes = New_int_ary(bytes_len);
|
||||
this.bytes_to_chars = New_int_ary(bytes_len);
|
||||
this.chars_to_codes = New_int_ary(chars_len);
|
||||
|
||||
// init loop
|
||||
int codes_pos = 0;
|
||||
int bytes_pos = 0;
|
||||
int chars_pos = 0;
|
||||
|
||||
// loop till EOS
|
||||
while (true) {
|
||||
// update
|
||||
codes_to_bytes[codes_pos] = bytes_pos;
|
||||
codes_to_chars[codes_pos] = chars_pos;
|
||||
bytes_to_chars[bytes_pos] = chars_pos;
|
||||
bytes_to_codes[bytes_pos] = codes_pos;
|
||||
chars_to_codes[chars_pos] = codes_pos;
|
||||
|
||||
if (bytes_pos == bytes_len) break;
|
||||
|
||||
// increment
|
||||
int cur_byte_len = Utf8_.Len_of_char_by_1st_byte(bytes[bytes_pos]);
|
||||
bytes_pos += cur_byte_len;
|
||||
chars_pos += Utf8_.Len_of_char_by_bytes_len(cur_byte_len);
|
||||
codes_pos += 1;
|
||||
}
|
||||
}
|
||||
public boolean Tid_is_single() {return false;}
|
||||
public String Src_string() {return src;} private final String src;
|
||||
public byte[] Src_bytes() {return bytes;} private final byte[] bytes;
|
||||
public int Len_codes() {return codes_len;} private final int codes_len;
|
||||
public int Len_chars() {return chars_len;} private final int chars_len;
|
||||
public int Len_bytes() {return bytes_len;} private final int bytes_len;
|
||||
public int Val_codes(int i) {return codes[i];}
|
||||
public int Pos_codes_to_bytes(int i) {return codes_to_bytes[i];}
|
||||
public int Pos_codes_to_chars(int i) {return codes_to_chars[i];}
|
||||
public int Pos_bytes_to_chars(int i) {int rv = bytes_to_chars[i]; if (rv == Invalid) throw Err_.new_wo_type("invalid i", "src", src, "type", "bytes_to_chars", "i", i); return rv;}
|
||||
public int Pos_bytes_to_codes(int i) {int rv = bytes_to_codes[i]; if (rv == Invalid) throw Err_.new_wo_type("invalid i", "src", src, "type", "bytes_to_codes", "i", i); return rv;}
|
||||
public int Pos_chars_to_codes(int i) {int rv = chars_to_codes[i]; if (rv == Invalid) throw Err_.new_wo_type("invalid i", "src", src, "type", "chars_to_codes", "i", i); return rv;}
|
||||
|
||||
private static final int Invalid = -1, Adj_end = 1; // +1 to store last pos as len of String; needed for regex which returns match.Find_end() which will be len of String; EX: abc -> [0, 1, 2, 3]
|
||||
private static int[] New_int_ary(int len) {
|
||||
int rv_len = len + Adj_end;
|
||||
int[] rv = new int[rv_len];
|
||||
for (int i = 0; i < rv_len; i++)
|
||||
rv[i] = Invalid;
|
||||
return rv;
|
||||
}
|
||||
}
|
||||
@@ -1,110 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.core.intls; import gplx.*; import gplx.core.*;
|
||||
import org.junit.*; import gplx.core.tests.*;
|
||||
public class Unicode_string_tst {
|
||||
private final Unicode_string_fxt fxt = new Unicode_string_fxt();
|
||||
@Test public void Null() {
|
||||
fxt.Init(null);
|
||||
fxt.Test__Len(0, 0, 0);
|
||||
}
|
||||
@Test public void Blank() {
|
||||
fxt.Init("");
|
||||
fxt.Test__Len(0, 0, 0);
|
||||
}
|
||||
@Test public void Single() {
|
||||
fxt.Init("Abc");
|
||||
fxt.Test__Len(3, 3, 3);
|
||||
fxt.Test__Val_codes(65, 98, 99);
|
||||
fxt.Test__Pos_codes_to_bytes(0, 1, 2, 3);
|
||||
fxt.Test__Pos_codes_to_chars(0, 1, 2, 3);
|
||||
fxt.Test__Pos_chars_to_codes(0, 1, 2, 3);
|
||||
fxt.Test__Pos_bytes_to_codes(0, 1, 2, 3);
|
||||
}
|
||||
@Test public void Multi() {
|
||||
fxt.Init("a¢€𤭢");
|
||||
fxt.Test__Len(4, 5, 10);
|
||||
fxt.Test__Val_codes(97, 162, 8364, 150370);
|
||||
fxt.Test__Pos_codes_to_bytes(0, 1, 3, 6, 10);
|
||||
fxt.Test__Pos_codes_to_chars(0, 1, 2, 3, 5);
|
||||
fxt.Test__Pos_chars_to_codes( 0, 1, 2, 3, -1, 4);
|
||||
fxt.Test__Pos_bytes_to_codes( 0, 1, -1, 2, -1, -1, 3, -1, -1, -1, 4);
|
||||
}
|
||||
}
|
||||
class Unicode_string_fxt {
|
||||
private Unicode_string under;
|
||||
public void Init(String src) {
|
||||
this.under = Unicode_string_.New(src);
|
||||
}
|
||||
public void Test__Len(int expd_codes, int expd_chars, int expd_bytes) {
|
||||
Gftest.Eq__int(expd_codes, under.Len_codes(), "codes");
|
||||
Gftest.Eq__int(expd_chars, under.Len_chars(), "chars");
|
||||
Gftest.Eq__int(expd_bytes, under.Len_bytes(), "bytes");
|
||||
}
|
||||
public void Test__Val_codes(int... expd) {
|
||||
int actl_len = under.Len_codes();
|
||||
int[] actl = new int[actl_len];
|
||||
for (int i = 0; i < actl_len; i++)
|
||||
actl[i] = under.Val_codes(i);
|
||||
Gftest.Eq__ary(expd, actl);
|
||||
}
|
||||
public void Test__Pos_codes_to_bytes(int... expd) {
|
||||
int actl_len = under.Len_codes() + 1;
|
||||
int[] actl = new int[actl_len];
|
||||
for (int i = 0; i < actl_len; i++)
|
||||
actl[i] = under.Pos_codes_to_bytes(i);
|
||||
Gftest.Eq__ary(expd, actl);
|
||||
}
|
||||
public void Test__Pos_codes_to_chars(int... expd) {
|
||||
int actl_len = under.Len_codes() + 1;
|
||||
int[] actl = new int[actl_len];
|
||||
for (int i = 0; i < actl_len; i++)
|
||||
actl[i] = under.Pos_codes_to_chars(i);
|
||||
Gftest.Eq__ary(expd, actl);
|
||||
}
|
||||
public void Test__Pos_bytes_to_codes(int... expd) {
|
||||
int actl_len = under.Len_bytes() + 1;
|
||||
int[] actl = new int[actl_len];
|
||||
for (int i = 0; i < actl_len; i++) {
|
||||
int val = 0;
|
||||
try {
|
||||
val = under.Pos_bytes_to_codes(i);
|
||||
}
|
||||
catch (Exception exc) {
|
||||
val = -1;
|
||||
Err_.Noop(exc);
|
||||
}
|
||||
actl[i] = val;
|
||||
}
|
||||
Gftest.Eq__ary(expd, actl);
|
||||
}
|
||||
public void Test__Pos_chars_to_codes(int... expd) {
|
||||
int actl_len = under.Len_chars() + 1;
|
||||
int[] actl = new int[actl_len];
|
||||
for (int i = 0; i < actl_len; i++) {
|
||||
int val = 0;
|
||||
try {
|
||||
val = under.Pos_chars_to_codes(i);
|
||||
}
|
||||
catch (Exception exc) {
|
||||
val = -1;
|
||||
Err_.Noop(exc);
|
||||
}
|
||||
actl[i] = val;
|
||||
}
|
||||
Gftest.Eq__ary(expd, actl);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user