Search: Add more punctuation support

2025-06-13 12:54:14 +00:00 · 2017-03-01 16:37:47 -05:00 · 2017-03-01 16:37:47 -05:00 · 8de3cf0cc6
commit 8de3cf0cc6
parent 9301973825
10 changed files with 233 additions and 26 deletions
--- a/100_core/src/gplx/core/intls/Utf8_.java
+++ b/100_core/src/gplx/core/intls/Utf8_.java
@ -60,7 +60,7 @@ public class Utf8_ {
 		int bry_len = bry.length; if (bry_len == 0) return bry;
 		int pos = bry_len - 1;
 		while (true) {												// loop bwds
-			int cur_char_pos0 = Get_pos0_of_char_bwd(bry, pos);		// get byte0 of char
+			int cur_char_pos0 = Get_prv_char_pos0_old(bry, pos);		// get byte0 of char
 			int cur_char_len = (pos - cur_char_pos0) + 1;			// calc len of char
 			int nxt_char = Codepoint_max;
 			if (cur_char_len == 1) {								// len=1; just change 1 byte
@ -82,7 +82,7 @@ public class Utf8_ {
 			if (pos < 0) return null;
 		}
 	}
-	public static int Get_pos0_of_char_bwd(byte[] bry, int pos) {	// find pos0 of char while moving bwd through bry; see test
+	public static int Get_prv_char_pos0_old(byte[] bry, int pos) {	// find pos0 of char while moving bwd through bry; see test
 		int stop = pos - 4;						// UTF8 char has max of 4 bytes
 		if (stop < 0) stop = 0;					// if at pos 0 - 3, stop at 0
 		for (int i = pos - 1; i >= stop; i--) {	// start at pos - 1, and move bwd; NOTE: pos - 1 to skip pos, b/c pos will never definitively yield any char_len info
@ -96,6 +96,34 @@ public class Utf8_ {
 		}
 		return pos;	// no mult-byte char found; return pos
 	}
+	// Finds pos0 (the 1st byte) of the UTF8 char that ends just before cur, while moving bwd through src; see test
+	//   src: UTF8 bytes
+	//   cur: pos after the char to find (exclusive end); valid range is 0 to src.length
+	//   returns pos0 of the previous char, or -1 if cur is 0 (BOS)
+	//   throws Err_ if cur is out of range, or if no valid 1st byte is found in the 4 bytes before cur
+	public static int Get_prv_char_pos0(byte[] src, int cur) {
+		// do bounds checks
+		if (cur == 0) return -1;
+		if (cur <= -1 || cur > src.length) throw Err_.new_wo_type("invalid index for get_prv_char_pos0", "src", src, "cur", cur);
+
+		// start at cur - 1; note bounds checks above
+		int pos = cur - 1; 
+
+		// get 1st byte and check if ASCII for (a) error-checking (ASCII can only be in 1st byte); (b) performance
+		byte b = src[pos];
+		if (b >= 0 && b <= Byte_.Max_value_127) return pos;
+
+		// loop maximum of 4 times; note that UTF8 char has max of 4 bytes
+		for (int i = 0; i < 4; i++) {
+			int char_len = Len_of_char_by_1st_byte(b);
+			switch (char_len) {	// if char_len is multi-byte and pos is at correct multi-byte pos (i = # of bytes - 1), then pos0 found; EX: € = {226,130,172}; 172 is skipped; 130 has len of 1 -> continue; 226 has len of 3 and is found at correct pos for 3 byte char -> return
+				case 2: if (i == 1) return pos; break;
+				case 3: if (i == 2) return pos; break;
+				case 4: if (i == 3) return pos; break;
+			}
+
+			// move bwd one byte; stop at BOS so that malformed / truncated input reaches the Err_ below instead of failing with an ArrayIndexOutOfBoundsException
+			if (--pos < 0) break;
+			b = src[pos];
+		}
+
+		throw Err_.new_wo_type("could not get prv_char", "src", src, "cur", cur);
+	}
 	@gplx.Internal protected static int Increment_char(int cur) {
 		while (cur++ < Codepoint_max) {
 			if (cur == Codepoint_surrogate_bgn) cur = Codepoint_surrogate_end + 1;	// skip over surrogate range
@ -107,7 +135,7 @@ public class Utf8_ {
 	private static boolean Codepoint_valid(int v) {
 				return Character.isDefined(v);
 			}
-	public static final int 
+	public static final    int 
 	  Codepoint_max = 0x10FFFF //see http://unicode.org/glossary/
 	, Codepoint_surrogate_bgn = 0xD800
 	, Codepoint_surrogate_end = 0xDFFF
--- a/100_core/src/gplx/core/intls/Utf8__tst.java
+++ b/100_core/src/gplx/core/intls/Utf8__tst.java
@ -17,12 +17,12 @@ package gplx.core.intls; import gplx.*; import gplx.core.*;
 import org.junit.*;
 public class Utf8__tst {
 	private Utf8__fxt fxt = new Utf8__fxt();
-	@Test  public void Get_pos0_of_char_bwd() {
-		fxt.Test_Get_pos0_of_char_bwd("abcd", 3);		// len=1; (note that bry.len = 4)
-		fxt.Test_Get_pos0_of_char_bwd("a", 0);			// len=1; short-String
-		fxt.Test_Get_pos0_of_char_bwd("abc¢", 3);		// len=2; (note that bry.len = 5)
-		fxt.Test_Get_pos0_of_char_bwd("abc€", 3);		// len=3; (note that bry.len = 6)
-		fxt.Test_Get_pos0_of_char_bwd("abc" + String_.new_u8(Byte_.Ary_by_ints(240, 164, 173, 162)), 3);		// len=4; (note that bry.len = 7)
+	@Test  public void Get_prv_char_pos0() {	// 2nd arg is the expected pos0 of the last char in the String; fxt checks both Get_prv_char_pos0 and Get_prv_char_pos0_old
+		fxt.Test__Get_prv_char_pos0("abcd", 3);        // len=1; (note that bry.len = 4)
+		fxt.Test__Get_prv_char_pos0("a", 0);           // len=1; short-String
+		fxt.Test__Get_prv_char_pos0("abc¢", 3);        // len=2; (note that bry.len = 5)
+		fxt.Test__Get_prv_char_pos0("abc€", 3);        // len=3; (note that bry.len = 6)
+		fxt.Test__Get_prv_char_pos0("abc" + String_.new_u8(Byte_.Ary_by_ints(240, 164, 173, 162)), 3);		// len=4; (note that bry.len = 7)
 	}
 	@Test  public void Increment_char_at_last_pos() {
 		fxt.Test_Increment_char_at_last_pos("a", "b");
@ -56,10 +56,10 @@ public class Utf8__tst {
 //		}
 }
 class Utf8__fxt {
-	public void Test_Get_pos0_of_char_bwd(String str, int expd) {
-		byte[] bry = Bry_.new_u8(str);
-		int pos = bry.length - 1;	// always start from last char
-		Tfds.Eq(expd, Utf8_.Get_pos0_of_char_bwd(bry, pos));
+	public void Test__Get_prv_char_pos0(String src_str, int expd) {	// asserts that the new and old funcs both return expd as pos0 of the last char in src_str
+		byte[] src_bry = Bry_.new_u8(src_str);
+		Tfds.Eq(expd, Utf8_.Get_prv_char_pos0    (src_bry, src_bry.length));	// new func: arg is the pos after the char; it starts scanning at cur - 1
+		Tfds.Eq(expd, Utf8_.Get_prv_char_pos0_old(src_bry, src_bry.length - 1));	// old func: arg is the pos of the char's last byte
 	}
 	public void Test_Increment_char_at_last_pos(String str, String expd) {
 		Tfds.Eq(expd, String_.new_u8(Utf8_.Increment_char_at_last_pos(Bry_.new_u8(str))));
--- a/400_xowa/src/gplx/xowa/addons/wikis/searchs/fulltexts/finders/Xosearch_finder_cbkevaltst.java
+++ b/400_xowa/src/gplx/xowa/addons/wikis/searchs/fulltexts/finders/Xosearch_finder_cbkevaltst.java
@ -58,6 +58,40 @@ public class Xosearch_finder_cbk__eval__tst {
 		// n: char exists
 		fxt.Test__eval_n("a");
 	}
+	@Test   public void Trim_end() {	// punct after the search word; "!" is in the punct list of Xosearch_word_lang
+		fxt.Init__search("a");
+		// y: single
+		fxt.Test__eval_y("a!");
+		// y: many
+		fxt.Test__eval_y("a!!!");
+	}
+	@Test   public void Trim_bgn() {	// punct before the search word; "!" is in the punct list of Xosearch_word_lang
+		fxt.Init__search("a");
+		// y: single
+		fxt.Test__eval_y("!a");
+		// y: many
+		fxt.Test__eval_y("!!!a");
+	}
+	@Test   public void Trim_both() {	// punct on both sides of the search word; "'" is in the punct list of Xosearch_word_lang
+		fxt.Init__search("a");
+		// y: single
+		fxt.Test__eval_y("'a'");
+		// y: many
+		fxt.Test__eval_y("'''a'''");
+	}
+	@Test   public void Slash() {	// "/" is in the ws list of Xosearch_word_lang, so it separates words
+		fxt.Init__search("a");
+		// y: search word at bgn, middle and end of a slash-separated String
+		fxt.Test__eval_y("a/b/c", "b/a/c", "b/c/a");
+	}
+	// .
+	// ...
+	// -
+	// a'b
+	// https://site/page
+	// ()
+	// []
+	// <>
 }
 class Xosearch_finder_cbk__eval__fxt {
 	private boolean case_match = false;
@ -76,7 +110,7 @@ class Xosearch_finder_cbk__eval__fxt {
 			byte[] text_bry = Bry_.new_u8(text);
 			cbk.found = false;
 			finder.Match(text_bry, 0, text_bry.length, cbk);
-			Gftest.Eq__bool(expd, cbk.found, "query={0}, text={1}", finder.Query(), text);
+			Gftest.Eq__bool(expd, cbk.found, "query={0} text={1}", finder.Query(), text);
 		}
 	}
 }
--- a/400_xowa/src/gplx/xowa/addons/wikis/searchs/fulltexts/finders/Xosearch_finder_mgr.java
+++ b/400_xowa/src/gplx/xowa/addons/wikis/searchs/fulltexts/finders/Xosearch_finder_mgr.java
@ -22,6 +22,8 @@ public class Xosearch_finder_mgr {
 	private Xosearch_word_node tree_root;
 	private final    Srch_crt_parser parser = new Srch_crt_parser(Srch_crt_scanner_syms.Dflt);
 	private final    Btrie_rv trv = new Btrie_rv();
+	private final    Xosearch_word_lang lang = new Xosearch_word_lang();
+	private final    Xosearch_word_bounds word_bounds = new Xosearch_word_bounds();

 	public byte[] Query() {return query;} private byte[] query;
 	public void Init(byte[] query, boolean case_match, boolean auto_wildcard, byte wildchar_byte, byte not_byte) {
@ -51,16 +53,17 @@ public class Xosearch_finder_mgr {

 			// current byte matches a hook; get hook and hook_end
 			Xosearch_word_node hook = (Xosearch_word_node)hook_obj;
+			int hook_bgn = cur;
 			int hook_end = cur + hook.word_hook.length;

-			// get current word bounds by finding flanking ws
-			int word_bgn = Bry_find_.Find_bwd_ws(src, cur, 0) + 1;
-			int word_end = Bry_find_.Find_fwd_until_ws(src, hook_end, src_end);
-			if (word_end == -1) word_end = src_end;	// WORKAROUND: no match returns -1 instead of src_end
+			// get word_bounds
+			lang.Get_word_bounds(word_bounds, trv, src, src_end, hook_bgn, hook_end);
+			int word_bgn = word_bounds.word_bgn;
+			int word_end = word_bounds.word_end;

 			// check if current word matches criteria-word
-			if (hook.Match_word(src, cur, hook_end, word_bgn, word_end)) {
-				cbk.Process_item_found(src, cur, hook_end, word_bgn, word_end, hook);
+			if (hook.Match_word(lang, src, hook_bgn, hook_end, word_bgn, word_end)) {
+				cbk.Process_item_found(src, hook_bgn, hook_end, word_bgn, word_end, hook);
 			}

 			// update position to word_end
--- a/400_xowa/src/gplx/xowa/addons/wikis/searchs/fulltexts/finders/Xosearch_word_bounds.java
+++ b/400_xowa/src/gplx/xowa/addons/wikis/searchs/fulltexts/finders/Xosearch_word_bounds.java
@ -0,0 +1,24 @@
+/*
+XOWA: the XOWA Offline Wiki Application
+Copyright (C) 2012-2017 gnosygnu@gmail.com
+
+XOWA is licensed under the terms of the General Public License (GPL) Version 3,
+or alternatively under the terms of the Apache License Version 2.0.
+
+You may use XOWA according to either of these licenses as is most appropriate
+for your project on a case-by-case basis.
+
+The terms of each license can be found in the source code repository:
+
+GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
+Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
+*/
+package gplx.xowa.addons.wikis.searchs.fulltexts.finders; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.searchs.*; import gplx.xowa.addons.wikis.searchs.fulltexts.*;
+public class Xosearch_word_bounds {	// mutable holder for the bgn / end positions of a word; populated by Xosearch_word_lang.Get_word_bounds via Init
+	public int word_bgn;	// as set by Get_word_bounds: pos of 1st byte of word
+	public int word_end;	// as set by Get_word_bounds: pos after last byte of word
+	public void Init(int word_bgn, int word_end) {	// overwrites both positions
+		this.word_bgn = word_bgn;
+		this.word_end = word_end;
+	}
+}
--- a/400_xowa/src/gplx/xowa/addons/wikis/searchs/fulltexts/finders/Xosearch_word_lang.java
+++ b/400_xowa/src/gplx/xowa/addons/wikis/searchs/fulltexts/finders/Xosearch_word_lang.java
@ -0,0 +1,119 @@
+/*
+XOWA: the XOWA Offline Wiki Application
+Copyright (C) 2012-2017 gnosygnu@gmail.com
+
+XOWA is licensed under the terms of the General Public License (GPL) Version 3,
+or alternatively under the terms of the Apache License Version 2.0.
+
+You may use XOWA according to either of these licenses as is most appropriate
+for your project on a case-by-case basis.
+
+The terms of each license can be found in the source code repository:
+
+GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
+Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
+*/
+package gplx.xowa.addons.wikis.searchs.fulltexts.finders; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.searchs.*; import gplx.xowa.addons.wikis.searchs.fulltexts.*;
+import gplx.core.btries.*;
+import gplx.core.intls.*;
+public class Xosearch_word_lang {	// calculates the bounds of the word surrounding a search hook, using tries of ws and punct
+	private final    Btrie_slim_mgr ws_bgn = Btrie_slim_mgr.cs()
+		.Add_many_str("\t", "\n", "\r", " ", "/");	// strings which delimit a word; NOTE: "/" is treated like ws
+	private final    Btrie_slim_mgr ws_end;	// set to ws_bgn in ctor
+	private final    Btrie_slim_mgr punct_bgn = Btrie_slim_mgr.cs()
+		.Add_many_str(".", ",", "?", "!", ":", ";", "'", "\"")
+		;	// strings which are trimmed from the edges of a word
+	private final    Btrie_slim_mgr punct_end;	// set to punct_bgn in ctor
+	public Xosearch_word_lang() {	// end tries reuse the bgn tries
+		this.ws_end = ws_bgn;
+		this.punct_end = punct_bgn;
+	}
+	public void Get_word_bounds(Xosearch_word_bounds word_bounds, Btrie_rv trv, byte[] src, int src_end, int hook_bgn, int hook_end) {	// expands from hook to flanking ws / BOS / EOS; then trims punct outside the hook; result is stored in word_bounds
+		int tmp_pos = -1;
+		Object tmp_obj = null;
+
+		// find word_bgn by scanning bwd from hook_bgn
+		int word_bgn = hook_bgn;
+		tmp_pos = word_bgn;
+		while (true) {
+			// stop if BOS
+			if (tmp_pos == 0) break;
+
+			// move back one char
+			tmp_pos = Utf8_.Get_prv_char_pos0(src, tmp_pos);
+
+			// check if char is ws
+			tmp_obj = ws_bgn.Match_at(trv, src, tmp_pos, hook_end);
+
+			// char is ws -> stop
+			if (tmp_obj != null) break;
+
+			// char is not ws -> update word_bgn
+			word_bgn = tmp_pos;
+		}
+
+		// find word_end by scanning fwd from hook_end
+		int word_end = hook_end;
+		tmp_pos = word_end;
+		while (true) {
+			// stop if at or passed EOS
+			if (tmp_pos >= src_end) break;
+
+			// check if char is ws
+			tmp_obj = ws_end.Match_at(trv, src, tmp_pos, src_end);
+
+			// stop if ws
+			if (tmp_obj != null) break;
+
+			// increment first, so that word_end is the pos after the non-ws byte
+			tmp_pos++;
+
+			// update word_end
+			word_end = tmp_pos;
+		}
+
+		// trim punct at bgn; EX: "'abc" -> "abc"
+		if (word_bgn < hook_bgn) {
+			tmp_pos = word_bgn;
+			while (true) {
+				// stop if reached hook_bgn; never trim into the hook
+				if (tmp_pos >= hook_bgn) break;
+
+				// check if char is punct
+				tmp_obj = punct_bgn.Match_at(trv, src, tmp_pos, word_end);
+
+				// stop if not a punct
+				if (tmp_obj == null) break;
+
+				// increment first, so that word_bgn is the pos after the punct byte
+				tmp_pos++;
+
+				// update word_bgn
+				word_bgn = tmp_pos;
+			}
+		}
+
+		// trim punct at end; EX: "abc." -> "abc"
+		if (word_end > hook_end) {
+			tmp_pos = word_end;
+			while (true) {
+				// scan bwd one char
+				tmp_pos = Utf8_.Get_prv_char_pos0(src, tmp_pos);
+
+				// stop if moved before hook_end; never trim into the hook
+				if (tmp_pos < hook_end) break;
+
+				// check if char is punct
+				tmp_obj = punct_end.Match_at(trv, src, tmp_pos, word_end);
+
+				// stop if not a punct
+				if (tmp_obj == null) break;
+
+				// update word_end
+				word_end = tmp_pos;
+			}
+		}
+
+		word_bounds.Init(word_bgn, word_end);
+	}
+}
--- a/400_xowa/src/gplx/xowa/addons/wikis/searchs/fulltexts/finders/Xosearch_word_node.java
+++ b/400_xowa/src/gplx/xowa/addons/wikis/searchs/fulltexts/finders/Xosearch_word_node.java
@ -14,6 +14,7 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
 Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
 */
 package gplx.xowa.addons.wikis.searchs.fulltexts.finders; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.searchs.*; import gplx.xowa.addons.wikis.searchs.fulltexts.*;
+import gplx.core.btries.*;
 import gplx.xowa.addons.wikis.searchs.searchers.crts.*;
 public class Xosearch_word_node {
 	public int tid;
@ -24,9 +25,7 @@ public class Xosearch_word_node {
 	public boolean wildcard_at_end;
 	public boolean found;

-	public boolean Match_word(byte[] src, int hook_bgn, int hook_end, int word_bgn, int word_end) {
-		// TODO.XO: handle punctuation
-
+	public boolean Match_word(Xosearch_word_lang ctx, byte[] src, int hook_bgn, int hook_end, int word_bgn, int word_end) {
 		// if no wildcard at bgn, hook_bgn must match word_bgn
 		if (   !wildcard_at_bgn
 			&& hook_bgn != word_bgn)
--- a/400_xowa/src/gplx/xowa/parsers/lnkes/Xop_lnke_wkr.java
+++ b/400_xowa/src/gplx/xowa/parsers/lnkes/Xop_lnke_wkr.java
@ -280,7 +280,7 @@ public class Xop_lnke_wkr implements Xop_ctx_wkr {
 				return false;	// alpha-numerical is invalid; EX: "titel:" should not generate a lnke for "tel:"
 		}
 		if (prv_byte >= Byte_ascii.Ascii_min && prv_byte <= Byte_ascii.Ascii_max) return true;	// consider all other ASCII chars as true; EX: \t\n !, etc; 
-		prv_pos = gplx.core.intls.Utf8_.Get_pos0_of_char_bwd(src, prv_pos);
+		prv_pos = gplx.core.intls.Utf8_.Get_prv_char_pos0_old(src, prv_pos);
 		prv_byte = src[prv_pos];
 		boolean prv_char_is_letter = ctx.Lang().Case_mgr().Match_any_exists(prv_byte, src, prv_pos, bgn_pos);
 		return !prv_char_is_letter;
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/Xomw_regex_boundary.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/Xomw_regex_boundary.java
@ -29,7 +29,7 @@ public class Xomw_regex_boundary {	// THREAD.SAFE: trv is only for consistent in
 	}
 	public boolean Is_boundary_prv(byte[] src, int pos) {	// true if at BOS, or if the char just before pos matches the trie
 		if (pos == 0) return true; // BOS is true
-		int bgn = gplx.core.intls.Utf8_.Get_pos0_of_char_bwd(src, pos - 1);
+		int bgn = gplx.core.intls.Utf8_.Get_prv_char_pos0(src, pos);	// Get_prv_char_pos0 starts scanning at cur - 1, so pass pos (not pos - 1)
 		byte b = src[bgn];
 		Object o = trie.Match_at_w_b0(trv, b, src, bgn, pos);
 		return o != null;
 	}
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/magiclinks/Xomw_magiclinks_wkr.java
@ -285,7 +285,7 @@ class Xomw_regex_html_entity {
 		int numbers = 0;
 		int letters = 0;
 		while (cur >= src_end) {
-			int b_bgn = gplx.core.intls.Utf8_.Get_pos0_of_char_bwd(src, cur);
+			int b_bgn = gplx.core.intls.Utf8_.Get_prv_char_pos0_old(src, cur);
 			switch (src[b_bgn]) {
 				case Byte_ascii.Ltr_A: case Byte_ascii.Ltr_B: case Byte_ascii.Ltr_C: case Byte_ascii.Ltr_D: case Byte_ascii.Ltr_E:
 				case Byte_ascii.Ltr_F: case Byte_ascii.Ltr_G: case Byte_ascii.Ltr_H: case Byte_ascii.Ltr_I: case Byte_ascii.Ltr_J: