1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2026-03-02 03:49:30 +00:00

Search: Add more punctuation support

This commit is contained in:
gnosygnu
2017-03-01 16:37:47 -05:00
parent 9301973825
commit 8de3cf0cc6
10 changed files with 233 additions and 26 deletions

View File

@@ -60,7 +60,7 @@ public class Utf8_ {
int bry_len = bry.length; if (bry_len == 0) return bry;
int pos = bry_len - 1;
while (true) { // loop bwds
int cur_char_pos0 = Get_pos0_of_char_bwd(bry, pos); // get byte0 of char
int cur_char_pos0 = Get_prv_char_pos0_old(bry, pos); // get byte0 of char
int cur_char_len = (pos - cur_char_pos0) + 1; // calc len of char
int nxt_char = Codepoint_max;
if (cur_char_len == 1) { // len=1; just change 1 byte
@@ -82,7 +82,7 @@ public class Utf8_ {
if (pos < 0) return null;
}
}
public static int Get_pos0_of_char_bwd(byte[] bry, int pos) { // find pos0 of char while moving bwd through bry; see test
public static int Get_prv_char_pos0_old(byte[] bry, int pos) { // find pos0 of char while moving bwd through bry; see test
int stop = pos - 4; // UTF8 char has max of 4 bytes
if (stop < 0) stop = 0; // if at pos 0 - 3, stop at 0
for (int i = pos - 1; i >= stop; i--) { // start at pos - 1, and move bwd; NOTE: pos - 1 to skip pos, b/c pos will never definitively yield any char_len info
@@ -96,6 +96,34 @@ public class Utf8_ {
}
return pos; // no mult-byte char found; return pos
}
// Finds pos0 (index of the 1st byte) of the UTF-8 char that ends just before "cur" while moving bwd through src; see test
//   src: UTF-8 encoded bytes
//   cur: exclusive end index of the char to locate; valid range is 0 to src.length (inclusive)
// Returns -1 if cur is 0 (no previous char); else the index of the previous char's 1st byte
// Throws the Err_ "invalid index" error if cur is out of range, and the Err_ "could not get prv_char" error
// if no lead byte is found at the matching distance within 4 bytes (malformed or truncated UTF-8)
public static int Get_prv_char_pos0(byte[] src, int cur) {
	// do bounds checks
	if (cur == 0) return -1;
	if (cur <= -1 || cur > src.length) throw Err_.new_wo_type("invalid index for get_prv_char_pos0", "src", src, "cur", cur);
	// start at cur - 1; note bounds checks above
	int pos = cur - 1;
	// get 1st byte and check if ASCII for (a) error-checking (ASCII can only be in 1st byte); (b) performance
	byte b = src[pos];
	if (b >= 0 && b <= Byte_.Max_value_127) return pos;
	// loop maximum of 4 times; note that UTF8 char has max of 4 bytes
	for (int i = 0; i < 4; i++) {
		int char_len = Len_of_char_by_1st_byte(b);
		switch (char_len) { // if char_len is multi-byte and pos is at the matching distance from the last byte (i = # of bytes - 1), then pos0 found; EX: € = {226,130,172}; 172 is skipped; 130 has len of 1 -> continue; 226 has len of 3 and is found at correct distance for 3 byte char -> return
			case 2: if (i == 1) return pos; break;
			case 3: if (i == 2) return pos; break;
			case 4: if (i == 3) return pos; break;
		}
		// stop before stepping past the start of src or past the 4-byte window;
		// otherwise malformed input (EX: a lone continuation byte at index 0) would read src[-1]
		// and fail with a raw ArrayIndexOutOfBoundsException instead of the descriptive error below
		if (pos == 0 || i == 3) break;
		// decrement and set byte
		pos--;
		b = src[pos];
	}
	throw Err_.new_wo_type("could not get prv_char", "src", src, "cur", cur);
}
@gplx.Internal protected static int Increment_char(int cur) {
while (cur++ < Codepoint_max) {
if (cur == Codepoint_surrogate_bgn) cur = Codepoint_surrogate_end + 1; // skip over surrogate range
@@ -107,7 +135,7 @@ public class Utf8_ {
// Reports whether the code point "v" is defined according to java.lang.Character
private static boolean Codepoint_valid(int v) {
	boolean is_defined = Character.isDefined(v); // delegate to the JDK's Unicode tables
	return is_defined;
}
public static final int
public static final int
Codepoint_max = 0x10FFFF //see http://unicode.org/glossary/
, Codepoint_surrogate_bgn = 0xD800
, Codepoint_surrogate_end = 0xDFFF

View File

@@ -17,12 +17,12 @@ package gplx.core.intls; import gplx.*; import gplx.core.*;
import org.junit.*;
public class Utf8__tst {
private Utf8__fxt fxt = new Utf8__fxt();
@Test public void Get_pos0_of_char_bwd() {
fxt.Test_Get_pos0_of_char_bwd("abcd", 3); // len=1; (note that bry.len = 4)
fxt.Test_Get_pos0_of_char_bwd("a", 0); // len=1; short-String
fxt.Test_Get_pos0_of_char_bwd("abc¢", 3); // len=2; (note that bry.len = 5)
fxt.Test_Get_pos0_of_char_bwd("abc€", 3); // len=3; (note that bry.len = 6)
fxt.Test_Get_pos0_of_char_bwd("abc" + String_.new_u8(Byte_.Ary_by_ints(240, 164, 173, 162)), 3); // len=4; (note that bry.len = 7)
// Checks that pos0 of the last char is found for chars of 1, 2, 3 and 4 UTF-8 bytes
@Test public void Get_prv_char_pos0() {
	// 1-byte chars
	fxt.Test__Get_prv_char_pos0("abcd", 3);	// bry.len = 4
	fxt.Test__Get_prv_char_pos0("a", 0);	// short-String
	// 2-byte char
	fxt.Test__Get_prv_char_pos0("abc¢", 3);	// bry.len = 5
	// 3-byte char
	fxt.Test__Get_prv_char_pos0("abc€", 3);	// bry.len = 6
	// 4-byte char, built from raw bytes
	String four_byte_char = String_.new_u8(Byte_.Ary_by_ints(240, 164, 173, 162));
	fxt.Test__Get_prv_char_pos0("abc" + four_byte_char, 3);	// bry.len = 7
}
@Test public void Increment_char_at_last_pos() {
fxt.Test_Increment_char_at_last_pos("a", "b");
@@ -56,10 +56,10 @@ public class Utf8__tst {
// }
}
class Utf8__fxt {
public void Test_Get_pos0_of_char_bwd(String str, int expd) {
byte[] bry = Bry_.new_u8(str);
int pos = bry.length - 1; // always start from last char
Tfds.Eq(expd, Utf8_.Get_pos0_of_char_bwd(bry, pos));
// Encodes src_str as UTF-8 and asserts that both the new and the old lookup return expd for the last char
public void Test__Get_prv_char_pos0(String src_str, int expd) {
	byte[] utf8 = Bry_.new_u8(src_str);
	int len = utf8.length;
	// new API takes the exclusive end index of the char
	int actl_new = Utf8_.Get_prv_char_pos0(utf8, len);
	Tfds.Eq(expd, actl_new);
	// old API takes the index of the char's last byte
	int actl_old = Utf8_.Get_prv_char_pos0_old(utf8, len - 1);
	Tfds.Eq(expd, actl_old);
}
public void Test_Increment_char_at_last_pos(String str, String expd) {
Tfds.Eq(expd, String_.new_u8(Utf8_.Increment_char_at_last_pos(Bry_.new_u8(str))));