mirror of
https://github.com/gnosygnu/xowa.git
synced 2024-10-27 20:34:16 +00:00
Search: Add more punctuation support
This commit is contained in:
parent
9301973825
commit
8de3cf0cc6
@ -60,7 +60,7 @@ public class Utf8_ {
|
||||
int bry_len = bry.length; if (bry_len == 0) return bry;
|
||||
int pos = bry_len - 1;
|
||||
while (true) { // loop bwds
|
||||
int cur_char_pos0 = Get_pos0_of_char_bwd(bry, pos); // get byte0 of char
|
||||
int cur_char_pos0 = Get_prv_char_pos0_old(bry, pos); // get byte0 of char
|
||||
int cur_char_len = (pos - cur_char_pos0) + 1; // calc len of char
|
||||
int nxt_char = Codepoint_max;
|
||||
if (cur_char_len == 1) { // len=1; just change 1 byte
|
||||
@ -82,7 +82,7 @@ public class Utf8_ {
|
||||
if (pos < 0) return null;
|
||||
}
|
||||
}
|
||||
public static int Get_pos0_of_char_bwd(byte[] bry, int pos) { // find pos0 of char while moving bwd through bry; see test
|
||||
public static int Get_prv_char_pos0_old(byte[] bry, int pos) { // find pos0 of char while moving bwd through bry; see test
|
||||
int stop = pos - 4; // UTF8 char has max of 4 bytes
|
||||
if (stop < 0) stop = 0; // if at pos 0 - 3, stop at 0
|
||||
for (int i = pos - 1; i >= stop; i--) { // start at pos - 1, and move bwd; NOTE: pos - 1 to skip pos, b/c pos will never definitively yield any char_len info
|
||||
@ -96,6 +96,34 @@ public class Utf8_ {
|
||||
}
|
||||
return pos; // no mult-byte char found; return pos
|
||||
}
|
||||
public static int Get_prv_char_pos0(byte[] src, int cur) { // find pos0 of char while moving bwd through src; see test
|
||||
// do bounds checks
|
||||
if (cur == 0) return -1;
|
||||
if (cur <= -1 || cur > src.length) throw Err_.new_wo_type("invalid index for get_prv_char_pos0", "src", src, "cur", cur);
|
||||
|
||||
// start at cur - 1; note bounds checks above
|
||||
int pos = cur - 1;
|
||||
|
||||
// get 1st byte and check if ASCII for (a) error-checking (ASCII can only be in 1st byte); (b) performance
|
||||
byte b = src[pos];
|
||||
if (b >= 0 && b <= Byte_.Max_value_127) return pos;
|
||||
|
||||
// loop maximum of 4 times; note that UTF8 char has max of 4 bytes
|
||||
for (int i = 0; i < 4; i++) {
|
||||
int char_len = Len_of_char_by_1st_byte(b);
|
||||
switch (char_len) { // if char_len is multi-byte and cur is at correct multi-byte pos (cur - i = # of bytes - 1), then pos0 found; EX: <EFBFBD> = {226,130,172}; 172 is skipped; 130 has len of 1 -> continue; 226 has len of 3 and is found at correct cur for 3 byte char -> return
|
||||
case 2: if (i == 1) return pos; break;
|
||||
case 3: if (i == 2) return pos; break;
|
||||
case 4: if (i == 3) return pos; break;
|
||||
}
|
||||
|
||||
// decrement and set byte
|
||||
pos--;
|
||||
b = src[pos];
|
||||
}
|
||||
|
||||
throw Err_.new_wo_type("could not get prv_char", "src", src, "cur", cur);
|
||||
}
|
||||
@gplx.Internal protected static int Increment_char(int cur) {
|
||||
while (cur++ < Codepoint_max) {
|
||||
if (cur == Codepoint_surrogate_bgn) cur = Codepoint_surrogate_end + 1; // skip over surrogate range
|
||||
@ -107,7 +135,7 @@ public class Utf8_ {
|
||||
private static boolean Codepoint_valid(int v) {
|
||||
return Character.isDefined(v);
|
||||
}
|
||||
public static final int
|
||||
public static final int
|
||||
Codepoint_max = 0x10FFFF //see http://unicode.org/glossary/
|
||||
, Codepoint_surrogate_bgn = 0xD800
|
||||
, Codepoint_surrogate_end = 0xDFFF
|
||||
|
@ -17,12 +17,12 @@ package gplx.core.intls; import gplx.*; import gplx.core.*;
|
||||
import org.junit.*;
|
||||
public class Utf8__tst {
|
||||
private Utf8__fxt fxt = new Utf8__fxt();
|
||||
@Test public void Get_pos0_of_char_bwd() {
|
||||
fxt.Test_Get_pos0_of_char_bwd("abcd", 3); // len=1; (note that bry.len = 4)
|
||||
fxt.Test_Get_pos0_of_char_bwd("a", 0); // len=1; short-String
|
||||
fxt.Test_Get_pos0_of_char_bwd("abc¢", 3); // len=2; (note that bry.len = 5)
|
||||
fxt.Test_Get_pos0_of_char_bwd("abc€", 3); // len=3; (note that bry.len = 6)
|
||||
fxt.Test_Get_pos0_of_char_bwd("abc" + String_.new_u8(Byte_.Ary_by_ints(240, 164, 173, 162)), 3); // len=4; (note that bry.len = 7)
|
||||
@Test public void Get_prv_char_pos0() {
|
||||
fxt.Test__Get_prv_char_pos0("abcd", 3); // len=1; (note that bry.len = 4)
|
||||
fxt.Test__Get_prv_char_pos0("a", 0); // len=1; short-String
|
||||
fxt.Test__Get_prv_char_pos0("abc¢", 3); // len=2; (note that bry.len = 5)
|
||||
fxt.Test__Get_prv_char_pos0("abc€", 3); // len=3; (note that bry.len = 6)
|
||||
fxt.Test__Get_prv_char_pos0("abc" + String_.new_u8(Byte_.Ary_by_ints(240, 164, 173, 162)), 3); // len=4; (note that bry.len = 7)
|
||||
}
|
||||
@Test public void Increment_char_at_last_pos() {
|
||||
fxt.Test_Increment_char_at_last_pos("a", "b");
|
||||
@ -56,10 +56,10 @@ public class Utf8__tst {
|
||||
// }
|
||||
}
|
||||
class Utf8__fxt {
|
||||
public void Test_Get_pos0_of_char_bwd(String str, int expd) {
|
||||
byte[] bry = Bry_.new_u8(str);
|
||||
int pos = bry.length - 1; // always start from last char
|
||||
Tfds.Eq(expd, Utf8_.Get_pos0_of_char_bwd(bry, pos));
|
||||
public void Test__Get_prv_char_pos0(String src_str, int expd) {
|
||||
byte[] src_bry = Bry_.new_u8(src_str);
|
||||
Tfds.Eq(expd, Utf8_.Get_prv_char_pos0 (src_bry, src_bry.length));
|
||||
Tfds.Eq(expd, Utf8_.Get_prv_char_pos0_old(src_bry, src_bry.length - 1));
|
||||
}
|
||||
public void Test_Increment_char_at_last_pos(String str, String expd) {
|
||||
Tfds.Eq(expd, String_.new_u8(Utf8_.Increment_char_at_last_pos(Bry_.new_u8(str))));
|
||||
|
@ -58,6 +58,40 @@ public class Xosearch_finder_cbk__eval__tst {
|
||||
// n: char exists
|
||||
fxt.Test__eval_n("a");
|
||||
}
|
||||
@Test public void Trim_end() {
|
||||
fxt.Init__search("a");
|
||||
// y: single
|
||||
fxt.Test__eval_y("a!");
|
||||
// y: many
|
||||
fxt.Test__eval_y("a!!!");
|
||||
}
|
||||
@Test public void Trim_bgn() {
|
||||
fxt.Init__search("a");
|
||||
// y: single
|
||||
fxt.Test__eval_y("!a");
|
||||
// y: many
|
||||
fxt.Test__eval_y("!!!a");
|
||||
}
|
||||
@Test public void Trim_both() {
|
||||
fxt.Init__search("a");
|
||||
// y: single
|
||||
fxt.Test__eval_y("'a'");
|
||||
// y: many
|
||||
fxt.Test__eval_y("'''a'''");
|
||||
}
|
||||
@Test public void Slash() {
|
||||
fxt.Init__search("a");
|
||||
// y: slash before, after
|
||||
fxt.Test__eval_y("a/b/c", "b/a/c", "b/c/a");
|
||||
}
|
||||
// .
|
||||
// ...
|
||||
// -
|
||||
// a'b
|
||||
// https://site/page
|
||||
// ()
|
||||
// []
|
||||
// <>
|
||||
}
|
||||
class Xosearch_finder_cbk__eval__fxt {
|
||||
private boolean case_match = false;
|
||||
@ -76,7 +110,7 @@ class Xosearch_finder_cbk__eval__fxt {
|
||||
byte[] text_bry = Bry_.new_u8(text);
|
||||
cbk.found = false;
|
||||
finder.Match(text_bry, 0, text_bry.length, cbk);
|
||||
Gftest.Eq__bool(expd, cbk.found, "query={0}, text={1}", finder.Query(), text);
|
||||
Gftest.Eq__bool(expd, cbk.found, "query={0} text={1}", finder.Query(), text);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -22,6 +22,8 @@ public class Xosearch_finder_mgr {
|
||||
private Xosearch_word_node tree_root;
|
||||
private final Srch_crt_parser parser = new Srch_crt_parser(Srch_crt_scanner_syms.Dflt);
|
||||
private final Btrie_rv trv = new Btrie_rv();
|
||||
private final Xosearch_word_lang lang = new Xosearch_word_lang();
|
||||
private final Xosearch_word_bounds word_bounds = new Xosearch_word_bounds();
|
||||
|
||||
public byte[] Query() {return query;} private byte[] query;
|
||||
public void Init(byte[] query, boolean case_match, boolean auto_wildcard, byte wildchar_byte, byte not_byte) {
|
||||
@ -51,16 +53,17 @@ public class Xosearch_finder_mgr {
|
||||
|
||||
// current byte matches a hook; get hook and hook_end
|
||||
Xosearch_word_node hook = (Xosearch_word_node)hook_obj;
|
||||
int hook_bgn = cur;
|
||||
int hook_end = cur + hook.word_hook.length;
|
||||
|
||||
// get current word bounds by finding flanking ws
|
||||
int word_bgn = Bry_find_.Find_bwd_ws(src, cur, 0) + 1;
|
||||
int word_end = Bry_find_.Find_fwd_until_ws(src, hook_end, src_end);
|
||||
if (word_end == -1) word_end = src_end; // WORKAROUND: no match returns -1 instead of src_end
|
||||
// get word_bounds
|
||||
lang.Get_word_bounds(word_bounds, trv, src, src_end, hook_bgn, hook_end);
|
||||
int word_bgn = word_bounds.word_bgn;
|
||||
int word_end = word_bounds.word_end;
|
||||
|
||||
// check if current word matches criteria-word
|
||||
if (hook.Match_word(src, cur, hook_end, word_bgn, word_end)) {
|
||||
cbk.Process_item_found(src, cur, hook_end, word_bgn, word_end, hook);
|
||||
if (hook.Match_word(lang, src, hook_bgn, hook_end, word_bgn, word_end)) {
|
||||
cbk.Process_item_found(src, hook_bgn, hook_end, word_bgn, word_end, hook);
|
||||
}
|
||||
|
||||
// update position to word_end
|
||||
|
@ -0,0 +1,24 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.xowa.addons.wikis.searchs.fulltexts.finders; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.searchs.*; import gplx.xowa.addons.wikis.searchs.fulltexts.*;
|
||||
/**
 * Mutable holder for a pair of word positions (bgn / end).
 * Fields are public and are overwritten together by {@link #Init(int, int)}.
 */
public class Xosearch_word_bounds {
  public int word_bgn, word_end;

  /** Stores both positions as given; no validation is performed. */
  public void Init(int word_bgn, int word_end) {
    this.word_end = word_end;
    this.word_bgn = word_bgn;
  }
}
|
@ -0,0 +1,119 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.xowa.addons.wikis.searchs.fulltexts.finders; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.searchs.*; import gplx.xowa.addons.wikis.searchs.fulltexts.*;
|
||||
import gplx.core.btries.*;
|
||||
import gplx.core.intls.*;
|
||||
public class Xosearch_word_lang {
|
||||
private final Btrie_slim_mgr ws_bgn = Btrie_slim_mgr.cs()
|
||||
.Add_many_str("\t", "\n", "\r", " ", "/");
|
||||
private final Btrie_slim_mgr ws_end;
|
||||
private final Btrie_slim_mgr punct_bgn = Btrie_slim_mgr.cs()
|
||||
.Add_many_str(".", ",", "?", "!", ":", ";", "'", "\"")
|
||||
;
|
||||
private final Btrie_slim_mgr punct_end;
|
||||
public Xosearch_word_lang() {
|
||||
this.ws_end = ws_bgn;
|
||||
this.punct_end = punct_bgn;
|
||||
}
|
||||
public void Get_word_bounds(Xosearch_word_bounds word_bounds, Btrie_rv trv, byte[] src, int src_end, int hook_bgn, int hook_end) {
|
||||
int tmp_pos = -1;
|
||||
Object tmp_obj = null;
|
||||
|
||||
// find word_bgn
|
||||
int word_bgn = hook_bgn;
|
||||
tmp_pos = word_bgn;
|
||||
while (true) {
|
||||
// stop if BOS
|
||||
if (tmp_pos == 0) break;
|
||||
|
||||
// move back one char
|
||||
tmp_pos = Utf8_.Get_prv_char_pos0(src, tmp_pos);
|
||||
|
||||
// check if char is ws
|
||||
tmp_obj = ws_bgn.Match_at(trv, src, tmp_pos, hook_end);
|
||||
|
||||
// char is ws -> stop
|
||||
if (tmp_obj != null) break;
|
||||
|
||||
// char is not ws -> update word_end
|
||||
word_bgn = tmp_pos;
|
||||
}
|
||||
|
||||
// find word_end
|
||||
int word_end = hook_end;
|
||||
tmp_pos = word_end;
|
||||
while (true) {
|
||||
// stop if passed EOS
|
||||
if (tmp_pos >= src_end) break;
|
||||
|
||||
// check if char is ws
|
||||
tmp_obj = ws_end.Match_at(trv, src, tmp_pos, src_end);
|
||||
|
||||
// stop if ws
|
||||
if (tmp_obj != null) break;
|
||||
|
||||
// increment before
|
||||
tmp_pos++;
|
||||
|
||||
// update word_end
|
||||
word_end = tmp_pos;
|
||||
}
|
||||
|
||||
// trim punct at bgn; EX: "'abc" -> "abc"
|
||||
if (word_bgn < hook_bgn) {
|
||||
tmp_pos = word_bgn;
|
||||
while (true) {
|
||||
// stop if passed hook-end
|
||||
if (tmp_pos >= hook_bgn) break;
|
||||
|
||||
// check if char is punct
|
||||
tmp_obj = punct_bgn.Match_at(trv, src, tmp_pos, word_end);
|
||||
|
||||
// stop if not a punct
|
||||
if (tmp_obj == null) break;
|
||||
|
||||
// increment before
|
||||
tmp_pos++;
|
||||
|
||||
// update word_end
|
||||
word_bgn = tmp_pos;
|
||||
}
|
||||
}
|
||||
|
||||
// trim punct at end; EX: "abc." -> "abc"
|
||||
if (word_end > hook_end) {
|
||||
tmp_pos = word_end;
|
||||
while (true) {
|
||||
// scan bwd one char
|
||||
tmp_pos = Utf8_.Get_prv_char_pos0(src, tmp_pos);
|
||||
|
||||
// stop if passed hook-end
|
||||
if (tmp_pos < hook_end) break;
|
||||
|
||||
// check if char is punct
|
||||
tmp_obj = punct_end.Match_at(trv, src, tmp_pos, word_end);
|
||||
|
||||
// stop if not a punct
|
||||
if (tmp_obj == null) break;
|
||||
|
||||
// update word_end
|
||||
word_end = tmp_pos;
|
||||
}
|
||||
}
|
||||
|
||||
word_bounds.Init(word_bgn, word_end);
|
||||
}
|
||||
}
|
@ -14,6 +14,7 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.xowa.addons.wikis.searchs.fulltexts.finders; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.searchs.*; import gplx.xowa.addons.wikis.searchs.fulltexts.*;
|
||||
import gplx.core.btries.*;
|
||||
import gplx.xowa.addons.wikis.searchs.searchers.crts.*;
|
||||
public class Xosearch_word_node {
|
||||
public int tid;
|
||||
@ -24,9 +25,7 @@ public class Xosearch_word_node {
|
||||
public boolean wildcard_at_end;
|
||||
public boolean found;
|
||||
|
||||
public boolean Match_word(byte[] src, int hook_bgn, int hook_end, int word_bgn, int word_end) {
|
||||
// TODO.XO: handle punctuation
|
||||
|
||||
public boolean Match_word(Xosearch_word_lang ctx, byte[] src, int hook_bgn, int hook_end, int word_bgn, int word_end) {
|
||||
// if no wildcard at bgn, hook_bgn must match word_bgn
|
||||
if ( !wildcard_at_bgn
|
||||
&& hook_bgn != word_bgn)
|
||||
|
@ -280,7 +280,7 @@ public class Xop_lnke_wkr implements Xop_ctx_wkr {
|
||||
return false; // alpha-numerical is invalid; EX: "titel:" should not generate a lnke for "tel:"
|
||||
}
|
||||
if (prv_byte >= Byte_ascii.Ascii_min && prv_byte <= Byte_ascii.Ascii_max) return true; // consider all other ASCII chars as true; EX: \t\n !, etc;
|
||||
prv_pos = gplx.core.intls.Utf8_.Get_pos0_of_char_bwd(src, prv_pos);
|
||||
prv_pos = gplx.core.intls.Utf8_.Get_prv_char_pos0_old(src, prv_pos);
|
||||
prv_byte = src[prv_pos];
|
||||
boolean prv_char_is_letter = ctx.Lang().Case_mgr().Match_any_exists(prv_byte, src, prv_pos, bgn_pos);
|
||||
return !prv_char_is_letter;
|
||||
|
@ -29,7 +29,7 @@ public class Xomw_regex_boundary { // THREAD.SAFE: trv is only for consistent in
|
||||
}
|
||||
public boolean Is_boundary_prv(byte[] src, int pos) {
|
||||
if (pos == 0) return true; // BOS is true
|
||||
int bgn = gplx.core.intls.Utf8_.Get_pos0_of_char_bwd(src, pos - 1);
|
||||
int bgn = gplx.core.intls.Utf8_.Get_prv_char_pos0(src, pos);
|
||||
byte b = src[bgn];
|
||||
Object o = trie.Match_at_w_b0(trv, b, src, bgn, pos);
|
||||
return o != null;
|
||||
|
@ -285,7 +285,7 @@ class Xomw_regex_html_entity {
|
||||
int numbers = 0;
|
||||
int letters = 0;
|
||||
while (cur >= src_end) {
|
||||
int b_bgn = gplx.core.intls.Utf8_.Get_pos0_of_char_bwd(src, cur);
|
||||
int b_bgn = gplx.core.intls.Utf8_.Get_prv_char_pos0_old(src, cur);
|
||||
switch (src[b_bgn]) {
|
||||
case Byte_ascii.Ltr_A: case Byte_ascii.Ltr_B: case Byte_ascii.Ltr_C: case Byte_ascii.Ltr_D: case Byte_ascii.Ltr_E:
|
||||
case Byte_ascii.Ltr_F: case Byte_ascii.Ltr_G: case Byte_ascii.Ltr_H: case Byte_ascii.Ltr_I: case Byte_ascii.Ltr_J:
|
||||
|
Loading…
Reference in New Issue
Block a user