mirror of
https://github.com/gnosygnu/xowa.git
synced 2026-03-02 03:49:30 +00:00
v2.9.3.1
This commit is contained in:
@@ -36,6 +36,7 @@ public class Err_ {
|
||||
// Builds a "parse failed" Err for the given class / raw text and appends the exception's message as arg "e".
public static Err new_parse_exc(Exception e, Class<?> c, String raw) {
	String type_name = Type_adp_.FullNameOf_type(c);
	Err rv = new_parse(type_name, raw);
	return rv.Args_add("e", Err_.Message_lang(e));
}
|
||||
// Builds an Err with message "parse failed" and args "type" / "raw".
public static Err new_parse(String type, String raw) {
	Err rv = new Err(Bool_.Y, Trace_null, Type__gplx, "parse failed", "type", type, "raw", raw);
	return rv;
}
|
||||
// Builds an Err with message "null obj" and no extra args.
public static Err new_null() {
	Err rv = new Err(Bool_.Y, Trace_null, Type__gplx, "null obj");
	return rv;
}
|
||||
// Builds an Err with message "null obj" and the offending argument name as arg "arg".
public static Err new_null(String arg) {
	Err rv = new Err(Bool_.Y, Trace_null, Type__gplx, "null obj", "arg", arg);
	return rv;
}
|
||||
// Builds an Err with message "index is out of bounds" and args "idx" / "len".
public static Err new_missing_idx(int idx, int len) {
	Err rv = new Err(Bool_.Y, Trace_null, Type__gplx, "index is out of bounds", "idx", idx, "len", len);
	return rv;
}
|
||||
// Builds an Err with message "key not found" and the missing key as arg "key".
public static Err new_missing_key(String key) {
	Err rv = new Err(Bool_.Y, Trace_null, Type__gplx, "key not found", "key", key);
	return rv;
}
|
||||
// Builds an Err whose message is the caller-supplied text; no extra args.
public static Err new_invalid_op(String msg) {
	Err rv = new Err(Bool_.Y, Trace_null, Type__gplx, msg);
	return rv;
}
|
||||
|
||||
@@ -29,14 +29,14 @@ public class Bry_rdr {
|
||||
// Advances the reader position by exactly one.
public void Pos_add_one() {
	pos += 1;
}
|
||||
// "or" int value with getter / setter; initialized to Int_.Min_value.
private int or_int = Int_.Min_value;
public int Or_int() {
	return this.or_int;
}
public void Or_int_(int v) {
	this.or_int = v;
}
|
||||
// "or" byte[] value with getter / setter; null until set.
private byte[] or_bry;
public byte[] Or_bry() {
	return this.or_bry;
}
public void Or_bry_(byte[] v) {
	this.or_bry = v;
}
|
||||
public int Find_fwd(byte find) {return Bry_finder.Find_fwd(src, find, pos);}
|
||||
public int Find_fwd_ws() {return Bry_finder.Find_fwd_until_ws(src, pos, src_len);}
|
||||
public int Find_fwd(byte find) {return Bry_find_.Find_fwd(src, find, pos);}
|
||||
public int Find_fwd_ws() {return Bry_find_.Find_fwd_until_ws(src, pos, src_len);}
|
||||
// Finds find_bry and positions the reader at the left-hand side (start) of the match.
public int Find_fwd__pos_at_lhs(byte[] find_bry) {
	boolean pos_at_rhs = Bool_.N;
	return Find_fwd__pos_at(find_bry, pos_at_rhs);
}
|
||||
// Finds find_bry and positions the reader at the right-hand side (just past the end) of the match.
public int Find_fwd__pos_at_rhs(byte[] find_bry) {
	boolean pos_at_rhs = Bool_.Y;
	return Find_fwd__pos_at(find_bry, pos_at_rhs);
}
|
||||
public int Find_fwd__pos_at(byte[] find_bry, boolean pos_at_rhs) {
|
||||
int find_pos = Bry_finder.Find_fwd(src, find_bry, pos, src_len);
|
||||
int find_pos = Bry_find_.Find_fwd(src, find_bry, pos, src_len);
|
||||
if (pos_at_rhs) find_pos += find_bry.length;
|
||||
if (find_pos != Bry_finder.Not_found) pos = find_pos;
|
||||
if (find_pos != Bry_find_.Not_found) pos = find_pos;
|
||||
return find_pos;
|
||||
}
|
||||
// Convenience wrapper: reads an int using Byte_ascii.Semic as the terminating byte.
public int Read_int_to_semic() {
	byte to_byte = Byte_ascii.Semic;
	return Read_int_to(to_byte);
}
|
||||
|
||||
@@ -16,7 +16,7 @@ You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.core.btries; import gplx.*; import gplx.core.*;
|
||||
import gplx.intl.*;
|
||||
import gplx.core.intls.*;
|
||||
class Btrie_u8_itm {
|
||||
private Hash_adp_bry nxts;
|
||||
private byte[] asymmetric_bry;
|
||||
@@ -40,8 +40,8 @@ class Btrie_u8_itm {
|
||||
else { // itm has asymmetric_bry; EX: "İ" was added to trie, must match "İ" and "i";
|
||||
if (called_by_match) { // called by mgr.Match
|
||||
return
|
||||
( Bry_.Eq(rv.key, src, c_bgn, c_end) // key matches src; EX: "aİ"
|
||||
|| Bry_.Eq(rv.asymmetric_bry, src, c_bgn, c_end) // asymmetric_bry matches src; EX: "ai"; note that "aI" won't match
|
||||
( Bry_.Eq(src, c_bgn, c_end, rv.key) // key matches src; EX: "aİ"
|
||||
|| Bry_.Eq(src, c_bgn, c_end, rv.asymmetric_bry) // asymmetric_bry matches src; EX: "ai"; note that "aI" won't match
|
||||
)
|
||||
? rv : null;
|
||||
}
|
||||
|
||||
@@ -16,7 +16,7 @@ You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.core.btries; import gplx.*; import gplx.core.*;
|
||||
import gplx.intl.*;
|
||||
import gplx.core.intls.*;
|
||||
public class Btrie_u8_mgr implements Btrie_mgr {
|
||||
private Btrie_u8_itm root; private Gfo_case_mgr case_mgr;
|
||||
Btrie_u8_mgr(Gfo_case_mgr case_mgr) {
|
||||
|
||||
24
100_core/src/gplx/core/intls/Gfo_case_itm.java
Normal file
24
100_core/src/gplx/core/intls/Gfo_case_itm.java
Normal file
@@ -0,0 +1,24 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.core.intls; import gplx.*; import gplx.core.*;
|
||||
/**
 * One entry of a case-mapping table, exposing data about the lower-case form of a character.
 * NOTE(review): exact semantics are defined by the implementations, which are not visible here.
 */
public interface Gfo_case_itm {
	/** NOTE(review): presumably a hash code for the lower-case form; confirm against implementations. */
	int Hashcode_lo();
	/** NOTE(review): presumably the byte length of the lower-case form; confirm against implementations. */
	int Len_lo();
	/** NOTE(review): presumably the alternate bytes for chars whose case mapping is asymmetric (EX: "İ" vs "i"), or null if none; confirm. */
	byte[] Asymmetric_bry();
	int Utf8_id_lo();	// lower-case byte or byte[] as single utf8 int
}
|
||||
22
100_core/src/gplx/core/intls/Gfo_case_mgr.java
Normal file
22
100_core/src/gplx/core/intls/Gfo_case_mgr.java
Normal file
@@ -0,0 +1,22 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.core.intls; import gplx.*; import gplx.core.*;
|
||||
public interface Gfo_case_mgr {
|
||||
byte Tid();
|
||||
Gfo_case_itm Get_or_null(byte bgn_byte, byte[] src, int bgn, int end);
|
||||
}
|
||||
21
100_core/src/gplx/core/intls/Gfo_case_mgr_.java
Normal file
21
100_core/src/gplx/core/intls/Gfo_case_mgr_.java
Normal file
@@ -0,0 +1,21 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.core.intls; import gplx.*; import gplx.core.*;
|
||||
/** Constants holder for Gfo_case_mgr type ids. */
public class Gfo_case_mgr_ {
	// NOTE(review): names suggest a7 = 7-bit ASCII, u8 = UTF-8, custom = caller-supplied table; confirm against implementations.
	public static final byte Tid_a7 = 0, Tid_u8 = 1, Tid_custom = 2;
}
|
||||
137
100_core/src/gplx/core/intls/Utf16_.java
Normal file
137
100_core/src/gplx/core/intls/Utf16_.java
Normal file
@@ -0,0 +1,137 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.core.intls; import gplx.*; import gplx.core.*;
|
||||
import gplx.core.primitives.*;
|
||||
public class Utf16_ {
|
||||
public static int Surrogate_merge(int hi, int lo) { // REF: http://perldoc.perl.org/Encode/Unicode.html
|
||||
return 0x10000 + (hi - 0xD800) * 0x400 + (lo - 0xDC00);
|
||||
}
|
||||
public static void Surrogate_split(int v, Int_obj_ref hi, Int_obj_ref lo) {
|
||||
hi.Val_((v - 0x10000) / 0x400 + 0xD800);
|
||||
lo.Val_((v - 0x10000) % 0x400 + 0xDC00);
|
||||
}
|
||||
public static int Decode_to_int(byte[] ary, int pos) {
|
||||
byte b0 = ary[pos];
|
||||
if ((b0 & 0x80) == 0) {
|
||||
return b0;
|
||||
}
|
||||
else if ((b0 & 0xE0) == 0xC0) {
|
||||
return ( b0 & 0x1f) << 6
|
||||
| ( ary[pos + 1] & 0x3f)
|
||||
;
|
||||
}
|
||||
else if ((b0 & 0xF0) == 0xE0) {
|
||||
return ( b0 & 0x0f) << 12
|
||||
| ((ary[pos + 1] & 0x3f) << 6)
|
||||
| ( ary[pos + 2] & 0x3f)
|
||||
;
|
||||
}
|
||||
else if ((b0 & 0xF8) == 0xF0) {
|
||||
return ( b0 & 0x07) << 18
|
||||
| ((ary[pos + 1] & 0x3f) << 12)
|
||||
| ((ary[pos + 2] & 0x3f) << 6)
|
||||
| ( ary[pos + 3] & 0x3f)
|
||||
;
|
||||
}
|
||||
else throw Err_.new_wo_type("invalid utf8 byte", "byte", b0);
|
||||
}
|
||||
public static byte[] Encode_hex_to_bry(String raw) {return Encode_hex_to_bry(Bry_.new_a7(raw));}
|
||||
public static byte[] Encode_hex_to_bry(byte[] raw) {
|
||||
if (raw == null) return null;
|
||||
int int_val = gplx.texts.HexDecUtl.parse_or(raw, Int_.Min_value);
|
||||
return int_val == Int_.Min_value ? null : Encode_int_to_bry(int_val);
|
||||
}
|
||||
public static byte[] Encode_int_to_bry(int c) {
|
||||
int bry_len = Len_by_int(c);
|
||||
byte[] bry = new byte[bry_len];
|
||||
Encode_int(c, bry, 0);
|
||||
return bry;
|
||||
}
|
||||
public static int Encode_char(int c, char[] c_ary, int c_pos, byte[] b_ary, int b_pos) {
|
||||
if ((c > -1)
|
||||
&& (c < 128)) {
|
||||
b_ary[ b_pos] = (byte)c;
|
||||
return 1;
|
||||
}
|
||||
else if (c < 2048) {
|
||||
b_ary[ b_pos] = (byte)(0xC0 | (c >> 6));
|
||||
b_ary[++b_pos] = (byte)(0x80 | (c & 0x3F));
|
||||
return 1;
|
||||
}
|
||||
else if((c > 55295) // 0xD800
|
||||
&& (c < 56320)) { // 0xDFFF
|
||||
if (c_pos >= c_ary.length) throw Err_.new_wo_type("incomplete surrogate pair at end of String", "char", c);
|
||||
char nxt_char = c_ary[c_pos + 1];
|
||||
int v = Surrogate_merge(c, nxt_char);
|
||||
b_ary[b_pos] = (byte)(0xF0 | (v >> 18));
|
||||
b_ary[++b_pos] = (byte)(0x80 | (v >> 12) & 0x3F);
|
||||
b_ary[++b_pos] = (byte)(0x80 | (v >> 6) & 0x3F);
|
||||
b_ary[++b_pos] = (byte)(0x80 | (v & 0x3F));
|
||||
return 2;
|
||||
}
|
||||
else {
|
||||
b_ary[b_pos] = (byte)(0xE0 | (c >> 12));
|
||||
b_ary[++b_pos] = (byte)(0x80 | (c >> 6) & 0x3F);
|
||||
b_ary[++b_pos] = (byte)(0x80 | (c & 0x3F));
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
public static int Encode_int(int c, byte[] src, int pos) {
|
||||
if ((c > -1)
|
||||
&& (c < 128)) {
|
||||
src[ pos] = (byte)c;
|
||||
return 1;
|
||||
}
|
||||
else if (c < 2048) {
|
||||
src[ pos] = (byte)(0xC0 | (c >> 6));
|
||||
src[++pos] = (byte)(0x80 | (c & 0x3F));
|
||||
return 2;
|
||||
}
|
||||
else if (c < 65536) {
|
||||
src[pos] = (byte)(0xE0 | (c >> 12));
|
||||
src[++pos] = (byte)(0x80 | (c >> 6) & 0x3F);
|
||||
src[++pos] = (byte)(0x80 | (c & 0x3F));
|
||||
return 3;
|
||||
}
|
||||
else if (c < 2097152) {
|
||||
src[pos] = (byte)(0xF0 | (c >> 18));
|
||||
src[++pos] = (byte)(0x80 | (c >> 12) & 0x3F);
|
||||
src[++pos] = (byte)(0x80 | (c >> 6) & 0x3F);
|
||||
src[++pos] = (byte)(0x80 | (c & 0x3F));
|
||||
return 4;
|
||||
}
|
||||
else throw Err_.new_wo_type("UTF-16 int must be between 0 and 2097152", "char", c);
|
||||
}
|
||||
private static int Len_by_int(int c) {
|
||||
if ((c > -1)
|
||||
&& (c < 128)) return 1; // 1 << 7
|
||||
else if (c < 2048) return 2; // 1 << 11
|
||||
else if (c < 65536) return 3; // 1 << 16
|
||||
else if (c < 2097152) return 4;
|
||||
else throw Err_.new_wo_type("UTF-16 int must be between 0 and 2097152", "char", c);
|
||||
}
|
||||
public static int Len_by_char(int c) {
|
||||
if ((c > -1)
|
||||
&& (c < 128)) return 1; // 1 << 7
|
||||
else if (c < 2048) return 2; // 1 << 11
|
||||
else if((c > 55295) // 0xD800
|
||||
&& (c < 56320)) return 4; // 0xDFFF
|
||||
else if (c < 65536) return 3; // 1 << 16
|
||||
else throw Err_.new_wo_type("UTF-16 int must be between 0 and 65536", "char", c);
|
||||
}
|
||||
}
|
||||
59
100_core/src/gplx/core/intls/Utf16__tst.java
Normal file
59
100_core/src/gplx/core/intls/Utf16__tst.java
Normal file
@@ -0,0 +1,59 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.core.intls; import gplx.*; import gplx.core.*;
|
||||
import org.junit.*; import gplx.core.primitives.*;
|
||||
public class Utf16__tst {
|
||||
private Utf16__fxt fxt = new Utf16__fxt();
|
||||
@Test public void Encode_decode() {
|
||||
// fxt.Test_encode_decode(162, 194, 162); // cent
|
||||
// fxt.Test_encode_decode(8364, 226, 130, 172); // euro
|
||||
fxt.Test_encode_decode(150370, 240, 164, 173, 162); // example from [[UTF-8]]; should be encoded as two bytes
|
||||
}
|
||||
@Test public void Encode_as_bry_by_hex() {
|
||||
fxt.Test_Encode_hex_to_bry("00", 0);
|
||||
fxt.Test_Encode_hex_to_bry("41", 65);
|
||||
fxt.Test_Encode_hex_to_bry("0041", 65);
|
||||
fxt.Test_Encode_hex_to_bry("00C0", 195, 128);
|
||||
}
|
||||
@Test public void Surrogate() {
|
||||
fxt.Test_surrogate(0x64321, 0xD950, 0xDF21); // example from w:UTF-16
|
||||
fxt.Test_surrogate(66643, 55297, 56403); // example from d:Boomerang
|
||||
}
|
||||
}
|
||||
class Utf16__fxt {
|
||||
private Int_obj_ref hi_ref = Int_obj_ref.neg1_(), lo_ref = Int_obj_ref.neg1_();
|
||||
public void Test_encode_decode(int expd_c_int, int... expd_int) {
|
||||
byte[] expd = Bry_.new_ints(expd_int);
|
||||
byte[] bfr = new byte[10];
|
||||
int bfr_len = Utf16_.Encode_int(expd_c_int, bfr, 0);
|
||||
byte[] actl = Bry_.Mid_by_len(bfr, 0, bfr_len);
|
||||
Tfds.Eq_ary(expd, actl);
|
||||
int actl_c_int = Utf16_.Decode_to_int(bfr, 0);
|
||||
Tfds.Eq(expd_c_int, actl_c_int);
|
||||
}
|
||||
public void Test_surrogate(int v, int hi, int lo) {
|
||||
Tfds.Eq(v, Utf16_.Surrogate_merge((char)hi, (char)lo));
|
||||
Utf16_.Surrogate_split(v, hi_ref, lo_ref);
|
||||
Tfds.Eq(hi, hi_ref.Val());
|
||||
Tfds.Eq(lo, lo_ref.Val());
|
||||
}
|
||||
public void Test_Encode_hex_to_bry(String raw, int... expd) {
|
||||
byte[] actl = Utf16_.Encode_hex_to_bry(raw);
|
||||
Tfds.Eq_ary(Byte_.Ary_by_ints(expd), actl);
|
||||
}
|
||||
}
|
||||
117
100_core/src/gplx/core/intls/Utf8_.java
Normal file
117
100_core/src/gplx/core/intls/Utf8_.java
Normal file
@@ -0,0 +1,117 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.core.intls; import gplx.*; import gplx.core.*;
|
||||
public class Utf8_ {
|
||||
public static int Len_of_bry(byte[] ary) {
|
||||
if (ary == null) return 0;
|
||||
int rv = 0;
|
||||
int pos = 0, len = ary.length;
|
||||
while (pos < len) {
|
||||
int char_len = Len_of_char_by_1st_byte(ary[pos]);
|
||||
++rv;
|
||||
pos += char_len;
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
public static int Len_of_char_by_1st_byte(byte b) {// SEE:w:UTF-8
|
||||
int i = b & 0xff; // PATCH.JAVA:need to convert to unsigned byte
|
||||
switch (i) {
|
||||
case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15:
|
||||
case 16: case 17: case 18: case 19: case 20: case 21: case 22: case 23: case 24: case 25: case 26: case 27: case 28: case 29: case 30: case 31:
|
||||
case 32: case 33: case 34: case 35: case 36: case 37: case 38: case 39: case 40: case 41: case 42: case 43: case 44: case 45: case 46: case 47:
|
||||
case 48: case 49: case 50: case 51: case 52: case 53: case 54: case 55: case 56: case 57: case 58: case 59: case 60: case 61: case 62: case 63:
|
||||
case 64: case 65: case 66: case 67: case 68: case 69: case 70: case 71: case 72: case 73: case 74: case 75: case 76: case 77: case 78: case 79:
|
||||
case 80: case 81: case 82: case 83: case 84: case 85: case 86: case 87: case 88: case 89: case 90: case 91: case 92: case 93: case 94: case 95:
|
||||
case 96: case 97: case 98: case 99: case 100: case 101: case 102: case 103: case 104: case 105: case 106: case 107: case 108: case 109: case 110: case 111:
|
||||
case 112: case 113: case 114: case 115: case 116: case 117: case 118: case 119: case 120: case 121: case 122: case 123: case 124: case 125: case 126: case 127:
|
||||
case 128: case 129: case 130: case 131: case 132: case 133: case 134: case 135: case 136: case 137: case 138: case 139: case 140: case 141: case 142: case 143:
|
||||
case 144: case 145: case 146: case 147: case 148: case 149: case 150: case 151: case 152: case 153: case 154: case 155: case 156: case 157: case 158: case 159:
|
||||
case 160: case 161: case 162: case 163: case 164: case 165: case 166: case 167: case 168: case 169: case 170: case 171: case 172: case 173: case 174: case 175:
|
||||
case 176: case 177: case 178: case 179: case 180: case 181: case 182: case 183: case 184: case 185: case 186: case 187: case 188: case 189: case 190: case 191:
|
||||
return 1;
|
||||
case 192: case 193: case 194: case 195: case 196: case 197: case 198: case 199: case 200: case 201: case 202: case 203: case 204: case 205: case 206: case 207:
|
||||
case 208: case 209: case 210: case 211: case 212: case 213: case 214: case 215: case 216: case 217: case 218: case 219: case 220: case 221: case 222: case 223:
|
||||
return 2;
|
||||
case 224: case 225: case 226: case 227: case 228: case 229: case 230: case 231: case 232: case 233: case 234: case 235: case 236: case 237: case 238: case 239:
|
||||
return 3;
|
||||
case 240: case 241: case 242: case 243: case 244: case 245: case 246: case 247:
|
||||
return 4;
|
||||
default: throw Err_.new_wo_type("invalid initial utf8 byte", "byte", b);
|
||||
}
|
||||
}
|
||||
public static byte[] Get_char_at_pos_as_bry(byte[] bry, int pos) {
|
||||
int len = Len_of_char_by_1st_byte(bry[pos]);
|
||||
return Bry_.Mid(bry, pos, pos + len);
|
||||
}
|
||||
public static byte[] Increment_char_at_last_pos(byte[] bry) { // EX: abc -> abd; complexity is for multi-byte chars
|
||||
int bry_len = bry.length; if (bry_len == 0) return bry;
|
||||
int pos = bry_len - 1;
|
||||
while (true) { // loop bwds
|
||||
int cur_char_pos0 = Get_pos0_of_char_bwd(bry, pos); // get byte0 of char
|
||||
int cur_char_len = (pos - cur_char_pos0) + 1; // calc len of char
|
||||
int nxt_char = Codepoint_max;
|
||||
if (cur_char_len == 1) { // len=1; just change 1 byte
|
||||
nxt_char = Increment_char(bry[cur_char_pos0]); // get next char
|
||||
if (nxt_char < 128) { // single-byte char; just change pos
|
||||
bry = Bry_.Copy(bry); // always return new bry; never reuse existing
|
||||
bry[cur_char_pos0] = (byte)nxt_char;
|
||||
return bry;
|
||||
}
|
||||
}
|
||||
int cur_char = Utf16_.Decode_to_int(bry, cur_char_pos0);
|
||||
nxt_char = Increment_char(cur_char);
|
||||
if (nxt_char != Int_.Min_value) {
|
||||
byte[] nxt_char_as_bry = Utf16_.Encode_int_to_bry(nxt_char);
|
||||
bry = Bry_.Add(Bry_.Mid(bry, 0, cur_char_pos0), nxt_char_as_bry);
|
||||
return bry;
|
||||
}
|
||||
pos = cur_char_pos0 - 1;
|
||||
if (pos < 0) return null;
|
||||
}
|
||||
}
|
||||
public static int Get_pos0_of_char_bwd(byte[] bry, int pos) { // find pos0 of char while moving bwd through bry; see test
|
||||
int stop = pos - 4; // UTF8 char has max of 4 bytes
|
||||
if (stop < 0) stop = 0; // if at pos 0 - 3, stop at 0
|
||||
for (int i = pos - 1; i >= stop; i--) { // start at pos - 1, and move bwd; NOTE: pos - 1 to skip pos, b/c pos will never definitively yield any char_len info
|
||||
byte b = bry[i];
|
||||
int char_len = Len_of_char_by_1st_byte(b);
|
||||
switch (char_len) { // if char_len is multi-byte and pos is at correct multi-byte pos (pos - i = # of bytes - 1), then pos0 found; EX: <20> = {226,130,172}; 172 is skipped; 130 has len of 1 -> continue; 226 has len of 3 and is found at correct pos for 3 byte char -> return
|
||||
case 2: if (pos - i == 1) return i; break;
|
||||
case 3: if (pos - i == 2) return i; break;
|
||||
case 4: if (pos - i == 3) return i; break;
|
||||
}
|
||||
}
|
||||
return pos; // no mult-byte char found; return pos
|
||||
}
|
||||
@gplx.Internal protected static int Increment_char(int cur) {
|
||||
while (cur++ < Codepoint_max) {
|
||||
if (cur == Codepoint_surrogate_bgn) cur = Codepoint_surrogate_end + 1; // skip over surrogate range
|
||||
if (!Codepoint_valid(cur)) continue;
|
||||
return cur;
|
||||
}
|
||||
return Int_.Min_value;
|
||||
}
|
||||
private static boolean Codepoint_valid(int v) {
|
||||
return Character.isDefined(v);
|
||||
}
|
||||
public static final int
|
||||
Codepoint_max = 0x10FFFF //see http://unicode.org/glossary/
|
||||
, Codepoint_surrogate_bgn = 0xD800
|
||||
, Codepoint_surrogate_end = 0xDFFF
|
||||
;
|
||||
}
|
||||
69
100_core/src/gplx/core/intls/Utf8__tst.java
Normal file
69
100_core/src/gplx/core/intls/Utf8__tst.java
Normal file
@@ -0,0 +1,69 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.core.intls; import gplx.*; import gplx.core.*;
|
||||
import org.junit.*;
|
||||
public class Utf8__tst {
|
||||
private Utf8__fxt fxt = new Utf8__fxt();
|
||||
@Test public void Get_pos0_of_char_bwd() {
|
||||
fxt.Test_Get_pos0_of_char_bwd("abcd", 3); // len=1; (note that bry.len = 4)
|
||||
fxt.Test_Get_pos0_of_char_bwd("a", 0); // len=1; short-String
|
||||
fxt.Test_Get_pos0_of_char_bwd("abc¢", 3); // len=2; (note that bry.len = 5)
|
||||
fxt.Test_Get_pos0_of_char_bwd("abc€", 3); // len=3; (note that bry.len = 6)
|
||||
fxt.Test_Get_pos0_of_char_bwd("abc" + String_.new_u8(Byte_.Ary_by_ints(240, 164, 173, 162)), 3); // len=4; (note that bry.len = 7)
|
||||
}
|
||||
@Test public void Increment_char_at_last_pos() {
|
||||
fxt.Test_Increment_char_at_last_pos("a", "b");
|
||||
fxt.Test_Increment_char_at_last_pos("abc", "abd");
|
||||
fxt.Test_Increment_char_at_last_pos("É", "Ê"); // len=2
|
||||
fxt.Test_Increment_char_at_last_pos("€", "₭"); // len=3
|
||||
}
|
||||
// @Test public void Increment_char_at_last_pos_exhaustive_check() { // check all values; commented for perf
|
||||
// Bry_bfr bfr = Bry_bfr.new_();
|
||||
// int bgn = 32;
|
||||
// while (true) {
|
||||
// byte[] bgn_bry = Utf16_.Encode_int_to_bry(bgn);
|
||||
// int end = Utf8_.Increment_char(bgn);
|
||||
// if (end == Utf8_.Codepoint_max) break;
|
||||
//// if (bgn > 1024 * 1024) break;
|
||||
// byte[] end_by_codepoint_next = Utf16_.Encode_int_to_bry(end);
|
||||
// byte[] end_by_increment_char = Utf8_.Increment_char_at_last_pos(bgn_bry);
|
||||
// if (!Bry_.Eq(end_by_codepoint_next, end_by_increment_char)) {
|
||||
// Tfds.Write(bgn);
|
||||
// }
|
||||
//// bfr .Add_int_variable(bgn).Add_byte(Byte_ascii.Tab)
|
||||
//// .Add(bgn_bry).Add_byte(Byte_ascii.Tab)
|
||||
//// .Add(end_by_codepoint_next).Add_byte(Byte_ascii.Tab)
|
||||
//// .Add(end_by_increment_char).Add_byte(Byte_ascii.Tab)
|
||||
//// .Add_byte_nl()
|
||||
//// ;
|
||||
// bgn = end;
|
||||
// bgn_bry = end_by_codepoint_next;
|
||||
// }
|
||||
// Tfds.WriteText(bfr.Xto_str_and_clear());
|
||||
// }
|
||||
}
|
||||
class Utf8__fxt {
|
||||
public void Test_Get_pos0_of_char_bwd(String str, int expd) {
|
||||
byte[] bry = Bry_.new_u8(str);
|
||||
int pos = bry.length - 1; // always start from last char
|
||||
Tfds.Eq(expd, Utf8_.Get_pos0_of_char_bwd(bry, pos));
|
||||
}
|
||||
public void Test_Increment_char_at_last_pos(String str, String expd) {
|
||||
Tfds.Eq(expd, String_.new_u8(Utf8_.Increment_char_at_last_pos(Bry_.new_u8(str))));
|
||||
}
|
||||
}
|
||||
@@ -15,7 +15,7 @@ GNU Affero General Public License for more details.
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.core.regxs; import gplx.*; import gplx.core.*;
|
||||
package gplx.langs.regxs; import gplx.*; import gplx.langs.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
public class Regx_adp {
|
||||
@@ -15,7 +15,7 @@ GNU Affero General Public License for more details.
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.core.regxs; import gplx.*; import gplx.core.*;
|
||||
package gplx.langs.regxs; import gplx.*; import gplx.langs.*;
|
||||
public class Regx_adp_ {
|
||||
// Factory: wraps the Regx_adp constructor for the given pattern.
public static Regx_adp new_(String pattern) {
	Regx_adp rv = new Regx_adp(pattern);
	return rv;
}
|
||||
public static List_adp Find_all(String input, String find) {
|
||||
@@ -15,7 +15,7 @@ GNU Affero General Public License for more details.
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.core.regxs; import gplx.*; import gplx.core.*;
|
||||
package gplx.langs.regxs; import gplx.*; import gplx.langs.*;
|
||||
import org.junit.*;
|
||||
public class Regx_adp__tst implements TfdsEqListItmStr {
|
||||
@Test public void Match() {
|
||||
@@ -15,7 +15,7 @@ GNU Affero General Public License for more details.
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.core.regxs; import gplx.*; import gplx.core.*;
|
||||
package gplx.langs.regxs; import gplx.*; import gplx.langs.*;
|
||||
import gplx.core.strings.*;
|
||||
public class Regx_bldr {
|
||||
// Wraps the given characters between Tkn_CharSetBegin and Tkn_CharSetEnd.
public static String Includes(String characters) {
	String rv = String_.Concat_any(Regx_bldr.Tkn_CharSetBegin, characters, Regx_bldr.Tkn_CharSetEnd);
	return rv;
}
|
||||
@@ -15,7 +15,7 @@ GNU Affero General Public License for more details.
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.core.regxs; import gplx.*; import gplx.core.*;
|
||||
package gplx.langs.regxs; import gplx.*; import gplx.langs.*;
|
||||
public class Regx_group {
|
||||
// Stores the four constructor arguments in the fields of the same name.
public Regx_group(boolean rslt, int bgn, int end, String val) {
	this.rslt = rslt;
	this.bgn = bgn;
	this.end = end;
	this.val = val;
}
|
||||
// rslt flag as passed to the constructor, with its getter.
private boolean rslt;
public boolean Rslt() {
	return this.rslt;
}
|
||||
@@ -15,7 +15,7 @@ GNU Affero General Public License for more details.
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.core.regxs; import gplx.*; import gplx.core.*;
|
||||
package gplx.langs.regxs; import gplx.*; import gplx.langs.*;
|
||||
public class Regx_match {
|
||||
// Stores the four constructor arguments in the fields of the same name.
public Regx_match(boolean rslt, int find_bgn, int find_end, Regx_group[] groups) {
	this.rslt = rslt;
	this.find_bgn = find_bgn;
	this.find_end = find_end;
	this.groups = groups;
}
|
||||
// rslt flag as passed to the constructor, with its getter.
private boolean rslt;
public boolean Rslt() {
	return this.rslt;
}
|
||||
Reference in New Issue
Block a user