1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2024-09-28 14:30:51 +00:00

Search: Add more punctuation support

This commit is contained in:
gnosygnu 2017-03-01 16:37:47 -05:00
parent 9301973825
commit 8de3cf0cc6
10 changed files with 233 additions and 26 deletions

View File

@ -60,7 +60,7 @@ public class Utf8_ {
int bry_len = bry.length; if (bry_len == 0) return bry;
int pos = bry_len - 1;
while (true) { // loop bwds
int cur_char_pos0 = Get_pos0_of_char_bwd(bry, pos); // get byte0 of char
int cur_char_pos0 = Get_prv_char_pos0_old(bry, pos); // get byte0 of char
int cur_char_len = (pos - cur_char_pos0) + 1; // calc len of char
int nxt_char = Codepoint_max;
if (cur_char_len == 1) { // len=1; just change 1 byte
@ -82,7 +82,7 @@ public class Utf8_ {
if (pos < 0) return null;
}
}
public static int Get_pos0_of_char_bwd(byte[] bry, int pos) { // find pos0 of char while moving bwd through bry; see test
public static int Get_prv_char_pos0_old(byte[] bry, int pos) { // find pos0 of char while moving bwd through bry; see test
int stop = pos - 4; // UTF8 char has max of 4 bytes
if (stop < 0) stop = 0; // if at pos 0 - 3, stop at 0
for (int i = pos - 1; i >= stop; i--) { // start at pos - 1, and move bwd; NOTE: pos - 1 to skip pos, b/c pos will never definitively yield any char_len info
@ -96,6 +96,34 @@ public class Utf8_ {
}
return pos; // no mult-byte char found; return pos
}
// Finds pos0 (position of the 1st byte) of the UTF-8 char that ends just before cur, moving bwd through src; see test
// - src: bytes to scan
// - cur: position immediately after the char to find; EX: src.length for the last char in src
// - returns: pos0 of the previous char, or -1 if cur is 0 (there is no previous char)
// - throws: Err_ if cur is < 0 or > src.length, or if no multi-byte lead byte is found at the expected distance within 4 bytes
public static int Get_prv_char_pos0(byte[] src, int cur) {
	// do bounds checks
	if (cur == 0) return -1;
	if (cur <= -1 || cur > src.length) throw Err_.new_wo_type("invalid index for get_prv_char_pos0", "src", src, "cur", cur);

	// start at cur - 1; note bounds checks above
	int pos = cur - 1;

	// get 1st byte and check if ASCII for (a) error-checking (ASCII can only be in 1st byte); (b) performance
	byte b = src[pos];
	if (b >= 0 && b <= Byte_.Max_value_127) return pos;

	// loop maximum of 4 times; note that UTF8 char has max of 4 bytes
	for (int i = 0; i < 4; i++) {
		int char_len = Len_of_char_by_1st_byte(b);
		// if char_len is multi-byte and byte is at the matching distance from cur (i = # of bytes - 1), then pos0 found
		// EX: euro sign (U+20AC) = {226,130,172}; 172 is skipped; 130 has len of 1 -> continue; 226 has len of 3 and is found at i = 2 -> return
		switch (char_len) {
			case 2: if (i == 1) return pos; break;
			case 3: if (i == 2) return pos; break;
			case 4: if (i == 3) return pos; break;
		}

		// stop if already at BOS or if 4 bytes have been checked; NOTE: prevents reading src[-1] (ArrayIndexOutOfBoundsException) on truncated / malformed input, and avoids reading a 5th byte
		if (pos == 0 || i == 3) break;

		// decrement and set byte
		pos--;
		b = src[pos];
	}
	throw Err_.new_wo_type("could not get prv_char", "src", src, "cur", cur);
}
@gplx.Internal protected static int Increment_char(int cur) {
while (cur++ < Codepoint_max) {
if (cur == Codepoint_surrogate_bgn) cur = Codepoint_surrogate_end + 1; // skip over surrogate range
@ -107,7 +135,7 @@ public class Utf8_ {
// returns true if v is a defined code point according to java.lang.Character.isDefined; no other validation is done here
private static boolean Codepoint_valid(int v) {
return Character.isDefined(v);
}
public static final int
public static final int
Codepoint_max = 0x10FFFF //see http://unicode.org/glossary/
, Codepoint_surrogate_bgn = 0xD800
, Codepoint_surrogate_end = 0xDFFF

View File

@ -17,12 +17,12 @@ package gplx.core.intls; import gplx.*; import gplx.core.*;
import org.junit.*;
public class Utf8__tst {
private Utf8__fxt fxt = new Utf8__fxt();
@Test public void Get_pos0_of_char_bwd() {
fxt.Test_Get_pos0_of_char_bwd("abcd", 3); // len=1; (note that bry.len = 4)
fxt.Test_Get_pos0_of_char_bwd("a", 0); // len=1; short-String
fxt.Test_Get_pos0_of_char_bwd("abc¢", 3); // len=2; (note that bry.len = 5)
fxt.Test_Get_pos0_of_char_bwd("abc€", 3); // len=3; (note that bry.len = 6)
fxt.Test_Get_pos0_of_char_bwd("abc" + String_.new_u8(Byte_.Ary_by_ints(240, 164, 173, 162)), 3); // len=4; (note that bry.len = 7)
@Test public void Get_prv_char_pos0() {
	// 4-byte char is built from its raw UTF-8 bytes
	String char_len_4 = String_.new_u8(Byte_.Ary_by_ints(240, 164, 173, 162));

	// each src ends with the char being tested; expd is pos0 of that last char
	String[] srcs = new String[]
	{ "abcd"				// len=1; (note that bry.len = 4)
	, "a"					// len=1; short-String
	, "abc¢"				// len=2; (note that bry.len = 5)
	, "abc€"				// len=3; (note that bry.len = 6)
	, "abc" + char_len_4	// len=4; (note that bry.len = 7)
	};
	int[] expds = new int[] {3, 0, 3, 3, 3};

	for (int i = 0; i < srcs.length; i++) {
		fxt.Test__Get_prv_char_pos0(srcs[i], expds[i]);
	}
}
@Test public void Increment_char_at_last_pos() {
fxt.Test_Increment_char_at_last_pos("a", "b");
@ -56,10 +56,10 @@ public class Utf8__tst {
// }
}
class Utf8__fxt {
public void Test_Get_pos0_of_char_bwd(String str, int expd) {
byte[] bry = Bry_.new_u8(str);
int pos = bry.length - 1; // always start from last char
Tfds.Eq(expd, Utf8_.Get_pos0_of_char_bwd(bry, pos));
// Checks that both the new and the old bwd-scan return expd as pos0 of the last char in src_str
public void Test__Get_prv_char_pos0(String src_str, int expd) {
	byte[] src_bry = Bry_.new_u8(src_str);
	int src_len = src_bry.length;

	// new method: cur is the position just after the last char
	int actl = Utf8_.Get_prv_char_pos0(src_bry, src_len);
	Tfds.Eq(expd, actl);

	// old method: pos is the position of the last byte
	actl = Utf8_.Get_prv_char_pos0_old(src_bry, src_len - 1);
	Tfds.Eq(expd, actl);
}
public void Test_Increment_char_at_last_pos(String str, String expd) {
Tfds.Eq(expd, String_.new_u8(Utf8_.Increment_char_at_last_pos(Bry_.new_u8(str))));

View File

@ -58,6 +58,40 @@ public class Xosearch_finder_cbk__eval__tst {
// n: char exists
fxt.Test__eval_n("a");
}
// search word followed by punctuation ("!"); each text below is expected to evaluate as "y" (presumably: a match)
@Test public void Trim_end() {
fxt.Init__search("a");
// y: single trailing punctuation char
fxt.Test__eval_y("a!");
// y: many trailing punctuation chars
fxt.Test__eval_y("a!!!");
}
// search word preceded by punctuation ("!"); each text below is expected to evaluate as "y" (presumably: a match)
@Test public void Trim_bgn() {
fxt.Init__search("a");
// y: single leading punctuation char
fxt.Test__eval_y("!a");
// y: many leading punctuation chars
fxt.Test__eval_y("!!!a");
}
// search word wrapped in punctuation ("'") on both sides; each text below is expected to evaluate as "y" (presumably: a match)
@Test public void Trim_both() {
fxt.Init__search("a");
// y: single punctuation char on each side
fxt.Test__eval_y("'a'");
// y: many punctuation chars on each side
fxt.Test__eval_y("'''a'''");
}
// search word separated from other words by "/"; covers the word at the start, middle and end of the text
@Test public void Slash() {
fxt.Init__search("a");
// y: slash after ("a/b/c"), before and after ("b/a/c"), before ("b/c/a")
fxt.Test__eval_y("a/b/c", "b/a/c", "b/c/a");
}
// .
// ...
// -
// a'b
// https://site/page
// ()
// []
// <>
}
class Xosearch_finder_cbk__eval__fxt {
private boolean case_match = false;
@ -76,7 +110,7 @@ class Xosearch_finder_cbk__eval__fxt {
byte[] text_bry = Bry_.new_u8(text);
cbk.found = false;
finder.Match(text_bry, 0, text_bry.length, cbk);
Gftest.Eq__bool(expd, cbk.found, "query={0}, text={1}", finder.Query(), text);
Gftest.Eq__bool(expd, cbk.found, "query={0} text={1}", finder.Query(), text);
}
}
}

View File

@ -22,6 +22,8 @@ public class Xosearch_finder_mgr {
private Xosearch_word_node tree_root;
private final Srch_crt_parser parser = new Srch_crt_parser(Srch_crt_scanner_syms.Dflt);
private final Btrie_rv trv = new Btrie_rv();
private final Xosearch_word_lang lang = new Xosearch_word_lang();
private final Xosearch_word_bounds word_bounds = new Xosearch_word_bounds();
public byte[] Query() {return query;} private byte[] query;
public void Init(byte[] query, boolean case_match, boolean auto_wildcard, byte wildchar_byte, byte not_byte) {
@ -51,16 +53,17 @@ public class Xosearch_finder_mgr {
// current byte matches a hook; get hook and hook_end
Xosearch_word_node hook = (Xosearch_word_node)hook_obj;
int hook_bgn = cur;
int hook_end = cur + hook.word_hook.length;
// get current word bounds by finding flanking ws
int word_bgn = Bry_find_.Find_bwd_ws(src, cur, 0) + 1;
int word_end = Bry_find_.Find_fwd_until_ws(src, hook_end, src_end);
if (word_end == -1) word_end = src_end; // WORKAROUND: no match returns -1 instead of src_end
// get word_bounds
lang.Get_word_bounds(word_bounds, trv, src, src_end, hook_bgn, hook_end);
int word_bgn = word_bounds.word_bgn;
int word_end = word_bounds.word_end;
// check if current word matches criteria-word
if (hook.Match_word(src, cur, hook_end, word_bgn, word_end)) {
cbk.Process_item_found(src, cur, hook_end, word_bgn, word_end, hook);
if (hook.Match_word(lang, src, hook_bgn, hook_end, word_bgn, word_end)) {
cbk.Process_item_found(src, hook_bgn, hook_end, word_bgn, word_end, hook);
}
// update position to word_end

View File

@ -0,0 +1,24 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.addons.wikis.searchs.fulltexts.finders; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.searchs.*; import gplx.xowa.addons.wikis.searchs.fulltexts.*;
// Mutable holder for a pair of int positions (word_bgn, word_end); both are 0 until Init is called
public class Xosearch_word_bounds {
	public int word_bgn, word_end;

	// Overwrites both positions with the given values
	public void Init(int word_bgn, int word_end) {
		this.word_end = word_end;
		this.word_bgn = word_bgn;
	}
}

View File

@ -0,0 +1,119 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.addons.wikis.searchs.fulltexts.finders; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.searchs.*; import gplx.xowa.addons.wikis.searchs.fulltexts.*;
import gplx.core.btries.*;
import gplx.core.intls.*;
// Calculates the bounds of the word around a search hook: expands outward from the hook until a word-separator, then trims punctuation from both edges
public class Xosearch_word_lang {
	// word-separators; NOTE: "/" is treated as a separator along with the whitespace chars
	private final Btrie_slim_mgr ws_bgn = Btrie_slim_mgr.cs().Add_many_str("\t", "\n", "\r", " ", "/");
	private final Btrie_slim_mgr ws_end;
	// punctuation which is trimmed from the edges of a word
	private final Btrie_slim_mgr punct_bgn = Btrie_slim_mgr.cs().Add_many_str(".", ",", "?", "!", ":", ";", "'", "\"");
	private final Btrie_slim_mgr punct_end;

	public Xosearch_word_lang() {
		// the same tries are used for both the bgn and the end of a word
		this.punct_end = punct_bgn;
		this.ws_end = ws_bgn;
	}

	// Sets word_bounds to the bounds of the word containing the hook
	// - word_bounds: receives the result via Init(word_bgn, word_end)
	// - trv: passed through to every Match_at call
	// - src / src_end: text being searched, and the position at which the fwd scan stops
	// - hook_bgn / hook_end: positions of the hook inside src; the result never shrinks past these
	public void Get_word_bounds(Xosearch_word_bounds word_bounds, Btrie_rv trv, byte[] src, int src_end, int hook_bgn, int hook_end) {
		// expand to the flanking separators
		int word_bgn = Find_word_bgn(trv, src, hook_bgn, hook_end);
		int word_end = Find_word_end(trv, src, src_end, hook_end);

		// trim punct; NOTE: bgn is trimmed before end, and both use the current word_end as the Match_at limit
		word_bgn = Trim_punct_bgn(trv, src, word_bgn, word_end, hook_bgn);
		word_end = Trim_punct_end(trv, src, word_end, hook_end);

		word_bounds.Init(word_bgn, word_end);
	}

	// Moves bwd from hook_bgn one char at a time; returns pos0 of the earliest char which is not a separator
	private int Find_word_bgn(Btrie_rv trv, byte[] src, int hook_bgn, int hook_end) {
		int rv = hook_bgn;
		while (rv != 0) {	// stop if BOS
			// move back one char
			int prv = Utf8_.Get_prv_char_pos0(src, rv);
			// char is separator -> stop
			if (ws_bgn.Match_at(trv, src, prv, hook_end) != null) break;
			// char is not separator -> update word_bgn
			rv = prv;
		}
		return rv;
	}

	// Moves fwd from hook_end one byte at a time; returns the position of the 1st separator, or src_end if none
	private int Find_word_end(Btrie_rv trv, byte[] src, int src_end, int hook_end) {
		int rv = hook_end;
		while (rv < src_end && ws_end.Match_at(trv, src, rv, src_end) == null) {
			rv++;
		}
		return rv;
	}

	// Trims punct at bgn; EX: "'abc" -> "abc"; never moves past hook_bgn
	private int Trim_punct_bgn(Btrie_rv trv, byte[] src, int word_bgn, int word_end, int hook_bgn) {
		int rv = word_bgn;
		while (rv < hook_bgn && punct_bgn.Match_at(trv, src, rv, word_end) != null) {
			rv++;
		}
		return rv;
	}

	// Trims punct at end; EX: "abc." -> "abc"; never moves before hook_end
	private int Trim_punct_end(Btrie_rv trv, byte[] src, int word_end, int hook_end) {
		int rv = word_end;
		// nothing after the hook -> nothing to trim
		if (rv <= hook_end) return rv;
		while (true) {
			// scan bwd one char
			int prv = Utf8_.Get_prv_char_pos0(src, rv);
			// stop if char is inside hook, or if char is not a punct
			if (prv < hook_end || punct_end.Match_at(trv, src, prv, rv) == null) break;
			// char is punct -> exclude it from the word
			rv = prv;
		}
		return rv;
	}
}

View File

@ -14,6 +14,7 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.addons.wikis.searchs.fulltexts.finders; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.searchs.*; import gplx.xowa.addons.wikis.searchs.fulltexts.*;
import gplx.core.btries.*;
import gplx.xowa.addons.wikis.searchs.searchers.crts.*;
public class Xosearch_word_node {
public int tid;
@ -24,9 +25,7 @@ public class Xosearch_word_node {
public boolean wildcard_at_end;
public boolean found;
public boolean Match_word(byte[] src, int hook_bgn, int hook_end, int word_bgn, int word_end) {
// TODO.XO: handle punctuation
public boolean Match_word(Xosearch_word_lang ctx, byte[] src, int hook_bgn, int hook_end, int word_bgn, int word_end) {
// if no wildcard at bgn, hook_bgn must match word_bgn
if ( !wildcard_at_bgn
&& hook_bgn != word_bgn)

View File

@ -280,7 +280,7 @@ public class Xop_lnke_wkr implements Xop_ctx_wkr {
return false; // alpha-numerical is invalid; EX: "titel:" should not generate a lnke for "tel:"
}
if (prv_byte >= Byte_ascii.Ascii_min && prv_byte <= Byte_ascii.Ascii_max) return true; // consider all other ASCII chars as true; EX: \t\n !, etc;
prv_pos = gplx.core.intls.Utf8_.Get_pos0_of_char_bwd(src, prv_pos);
prv_pos = gplx.core.intls.Utf8_.Get_prv_char_pos0_old(src, prv_pos);
prv_byte = src[prv_pos];
boolean prv_char_is_letter = ctx.Lang().Case_mgr().Match_any_exists(prv_byte, src, prv_pos, bgn_pos);
return !prv_char_is_letter;

View File

@ -29,7 +29,7 @@ public class Xomw_regex_boundary { // THREAD.SAFE: trv is only for consistent in
}
public boolean Is_boundary_prv(byte[] src, int pos) {
if (pos == 0) return true; // BOS is true
int bgn = gplx.core.intls.Utf8_.Get_pos0_of_char_bwd(src, pos - 1);
int bgn = gplx.core.intls.Utf8_.Get_prv_char_pos0(src, pos);
byte b = src[bgn];
Object o = trie.Match_at_w_b0(trv, b, src, bgn, pos);
return o != null;

View File

@ -285,7 +285,7 @@ class Xomw_regex_html_entity {
int numbers = 0;
int letters = 0;
while (cur >= src_end) {
int b_bgn = gplx.core.intls.Utf8_.Get_pos0_of_char_bwd(src, cur);
int b_bgn = gplx.core.intls.Utf8_.Get_prv_char_pos0_old(src, cur);
switch (src[b_bgn]) {
case Byte_ascii.Ltr_A: case Byte_ascii.Ltr_B: case Byte_ascii.Ltr_C: case Byte_ascii.Ltr_D: case Byte_ascii.Ltr_E:
case Byte_ascii.Ltr_F: case Byte_ascii.Ltr_G: case Byte_ascii.Ltr_H: case Byte_ascii.Ltr_I: case Byte_ascii.Ltr_J: