Search: Add more punctuation support

2026-03-02 03:49:30 +00:00 · 2017-03-01 16:37:47 -05:00
parent 9301973825
commit 8de3cf0cc6
10 changed files with 233 additions and 26 deletions
--- a/100_core/src/gplx/core/intls/Utf8_.java
+++ b/100_core/src/gplx/core/intls/Utf8_.java
@@ -60,7 +60,7 @@ public class Utf8_ {
 		int bry_len = bry.length; if (bry_len == 0) return bry;
 		int pos = bry_len - 1;
 		while (true) {												// loop bwds
-			int cur_char_pos0 = Get_pos0_of_char_bwd(bry, pos);		// get byte0 of char
+			int cur_char_pos0 = Get_prv_char_pos0_old(bry, pos);		// get byte0 of char
 			int cur_char_len = (pos - cur_char_pos0) + 1;			// calc len of char
 			int nxt_char = Codepoint_max;
 			if (cur_char_len == 1) {								// len=1; just change 1 byte
@@ -82,7 +82,7 @@ public class Utf8_ {
 			if (pos < 0) return null;
 		}
 	}
-	public static int Get_pos0_of_char_bwd(byte[] bry, int pos) {	// find pos0 of char while moving bwd through bry; see test
+	public static int Get_prv_char_pos0_old(byte[] bry, int pos) {	// find pos0 of char while moving bwd through bry; see test
 		int stop = pos - 4;						// UTF8 char has max of 4 bytes
 		if (stop < 0) stop = 0;					// if at pos 0 - 3, stop at 0
 		for (int i = pos - 1; i >= stop; i--) {	// start at pos - 1, and move bwd; NOTE: pos - 1 to skip pos, b/c pos will never definitively yield any char_len info
@@ -96,6 +96,34 @@ public class Utf8_ {
 		}
 		return pos;	// no mult-byte char found; return pos
 	}
+	public static int Get_prv_char_pos0(byte[] src, int cur) {	// find pos0 (1st byte) of char which ends just before cur, moving bwd through src; returns -1 if cur is 0; see test
+		// do bounds checks; cur == 0 means there is no prv char; any other cur must be between 1 and src.length (inclusive), else Err_ is thrown
+		if (cur == 0) return -1;
+		if (cur <= -1 || cur > src.length) throw Err_.new_wo_type("invalid index for get_prv_char_pos0", "src", src, "cur", cur);
+
+		// start at cur - 1; note bounds checks above
+		int pos = cur - 1;
+
+		// get 1st byte and check if ASCII for (a) error-checking (ASCII can only be in 1st byte); (b) performance
+		byte b = src[pos];
+		if (b >= 0 && b <= Byte_.Max_value_127) return pos;
+
+		// loop maximum of 4 times; note that UTF8 char has max of 4 bytes
+		for (int i = 0; i < 4; i++) {
+			int char_len = Len_of_char_by_1st_byte(b);
+			switch (char_len) {	// if char_len is multi-byte and pos is at correct multi-byte pos (i = # of bytes - 1), then pos0 found; EX: € = {226,130,172}; 172 and 130 are continuation bytes, so no case matches -> continue; 226 has len of 3 and is found at i = 2, which is correct for 3 byte char -> return
+				case 2: if (i == 1) return pos; break;
+				case 3: if (i == 2) return pos; break;
+				case 4: if (i == 3) return pos; break;
+			}
+
+			// decrement and set byte; if bgn of src is passed (malformed UTF8), stop and fall through to Err_ below instead of reading src[-1]
+			if (--pos < 0) break;
+			b = src[pos];
+		}
+
+		throw Err_.new_wo_type("could not get prv_char", "src", src, "cur", cur);
+	}
 	@gplx.Internal protected static int Increment_char(int cur) {
 		while (cur++ < Codepoint_max) {
 			if (cur == Codepoint_surrogate_bgn) cur = Codepoint_surrogate_end + 1;	// skip over surrogate range
@@ -107,7 +135,7 @@ public class Utf8_ {
 	private static boolean Codepoint_valid(int v) {
 				return Character.isDefined(v);
 			}
-	public static final int 
+	public static final    int 
 	  Codepoint_max = 0x10FFFF //see http://unicode.org/glossary/
 	, Codepoint_surrogate_bgn = 0xD800
 	, Codepoint_surrogate_end = 0xDFFF
--- a/100_core/src/gplx/core/intls/Utf8__tst.java
+++ b/100_core/src/gplx/core/intls/Utf8__tst.java
@@ -17,12 +17,12 @@ package gplx.core.intls; import gplx.*; import gplx.core.*;
 import org.junit.*;
 public class Utf8__tst {
 	private Utf8__fxt fxt = new Utf8__fxt();
-	@Test  public void Get_pos0_of_char_bwd() {
-		fxt.Test_Get_pos0_of_char_bwd("abcd", 3);		// len=1; (note that bry.len = 4)
-		fxt.Test_Get_pos0_of_char_bwd("a", 0);			// len=1; short-String
-		fxt.Test_Get_pos0_of_char_bwd("abc¢", 3);		// len=2; (note that bry.len = 5)
-		fxt.Test_Get_pos0_of_char_bwd("abc€", 3);		// len=3; (note that bry.len = 6)
-		fxt.Test_Get_pos0_of_char_bwd("abc" + String_.new_u8(Byte_.Ary_by_ints(240, 164, 173, 162)), 3);		// len=4; (note that bry.len = 7)
+	@Test  public void Get_prv_char_pos0() {	// each case: expd is pos0 (1st byte) of the last char in the String
+		fxt.Test__Get_prv_char_pos0("abcd", 3);        // last char has len=1; (note that bry.len = 4)
+		fxt.Test__Get_prv_char_pos0("a", 0);           // last char has len=1; short-String; only char is at pos 0
+		fxt.Test__Get_prv_char_pos0("abc¢", 3);        // last char has len=2; (note that bry.len = 5)
+		fxt.Test__Get_prv_char_pos0("abc€", 3);        // last char has len=3; (note that bry.len = 6)
+		fxt.Test__Get_prv_char_pos0("abc" + String_.new_u8(Byte_.Ary_by_ints(240, 164, 173, 162)), 3);		// last char has len=4; (note that bry.len = 7)
 	}
 	@Test  public void Increment_char_at_last_pos() {
 		fxt.Test_Increment_char_at_last_pos("a", "b");
@@ -56,10 +56,10 @@ public class Utf8__tst {
 //		}
 }
 class Utf8__fxt {
-	public void Test_Get_pos0_of_char_bwd(String str, int expd) {
-		byte[] bry = Bry_.new_u8(str);
-		int pos = bry.length - 1;	// always start from last char
-		Tfds.Eq(expd, Utf8_.Get_pos0_of_char_bwd(bry, pos));
+	public void Test__Get_prv_char_pos0(String src_str, int expd) {	// asserts that both the new and the old method return expd when started from the end of src_str
+		byte[] src_bry = Bry_.new_u8(src_str);
+		Tfds.Eq(expd, Utf8_.Get_prv_char_pos0    (src_bry, src_bry.length));	// new method is passed the pos just after the last byte
+		Tfds.Eq(expd, Utf8_.Get_prv_char_pos0_old(src_bry, src_bry.length - 1));	// old method is passed the pos of the last byte
 	}
 	public void Test_Increment_char_at_last_pos(String str, String expd) {
 		Tfds.Eq(expd, String_.new_u8(Utf8_.Increment_char_at_last_pos(Bry_.new_u8(str))));