mirror of
https://github.com/gnosygnu/xowa.git
synced 2026-03-02 03:49:30 +00:00
Search: Add more punctuation support
This commit is contained in:
@@ -58,6 +58,40 @@ public class Xosearch_finder_cbk__eval__tst {
|
||||
// n: char exists
|
||||
fxt.Test__eval_n("a");
|
||||
}
|
||||
// Query "a" against text where punctuation follows the word; each text is passed to Test__eval_y
// (presumably "expect a match" -- the fixture body is not fully visible here).
@Test public void Trim_end() {
|
||||
fxt.Init__search("a");
|
||||
// y: single trailing punctuation char
|
||||
fxt.Test__eval_y("a!");
|
||||
// y: many trailing punctuation chars
|
||||
fxt.Test__eval_y("a!!!");
|
||||
}
|
||||
// Query "a" against text where punctuation precedes the word; each text is passed to Test__eval_y
// (presumably "expect a match" -- the fixture body is not fully visible here).
@Test public void Trim_bgn() {
|
||||
fxt.Init__search("a");
|
||||
// y: single leading punctuation char
|
||||
fxt.Test__eval_y("!a");
|
||||
// y: many leading punctuation chars
|
||||
fxt.Test__eval_y("!!!a");
|
||||
}
|
||||
// Query "a" against text where the word is wrapped in apostrophes on both sides; each text is passed
// to Test__eval_y (presumably "expect a match" -- the fixture body is not fully visible here).
@Test public void Trim_both() {
|
||||
fxt.Init__search("a");
|
||||
// y: single apostrophe on each side
|
||||
fxt.Test__eval_y("'a'");
|
||||
// y: many apostrophes on each side
|
||||
fxt.Test__eval_y("'''a'''");
|
||||
}
|
||||
// Query "a" against slash-separated text with "a" in the first, middle and last segment; all three
// texts are passed to Test__eval_y (presumably "expect a match" -- the fixture body is not fully visible here).
@Test public void Slash() {
|
||||
fxt.Init__search("a");
|
||||
// y: slash after the word, on both sides of it, and before it
|
||||
fxt.Test__eval_y("a/b/c", "b/a/c", "b/c/a");
|
||||
}
|
||||
// .
|
||||
// ...
|
||||
// -
|
||||
// a'b
|
||||
// https://site/page
|
||||
// ()
|
||||
// []
|
||||
// <>
|
||||
}
|
||||
class Xosearch_finder_cbk__eval__fxt {
|
||||
private boolean case_match = false;
|
||||
@@ -76,7 +110,7 @@ class Xosearch_finder_cbk__eval__fxt {
|
||||
byte[] text_bry = Bry_.new_u8(text);
|
||||
cbk.found = false;
|
||||
finder.Match(text_bry, 0, text_bry.length, cbk);
|
||||
Gftest.Eq__bool(expd, cbk.found, "query={0}, text={1}", finder.Query(), text);
|
||||
Gftest.Eq__bool(expd, cbk.found, "query={0} text={1}", finder.Query(), text);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -22,6 +22,8 @@ public class Xosearch_finder_mgr {
|
||||
private Xosearch_word_node tree_root;
|
||||
private final Srch_crt_parser parser = new Srch_crt_parser(Srch_crt_scanner_syms.Dflt);
|
||||
private final Btrie_rv trv = new Btrie_rv();
|
||||
private final Xosearch_word_lang lang = new Xosearch_word_lang();
|
||||
private final Xosearch_word_bounds word_bounds = new Xosearch_word_bounds();
|
||||
|
||||
public byte[] Query() {return query;} private byte[] query;
|
||||
public void Init(byte[] query, boolean case_match, boolean auto_wildcard, byte wildchar_byte, byte not_byte) {
|
||||
@@ -51,16 +53,17 @@ public class Xosearch_finder_mgr {
|
||||
|
||||
// current byte matches a hook; get hook and hook_end
|
||||
Xosearch_word_node hook = (Xosearch_word_node)hook_obj;
|
||||
int hook_bgn = cur;
|
||||
int hook_end = cur + hook.word_hook.length;
|
||||
|
||||
// get current word bounds by finding flanking ws
|
||||
int word_bgn = Bry_find_.Find_bwd_ws(src, cur, 0) + 1;
|
||||
int word_end = Bry_find_.Find_fwd_until_ws(src, hook_end, src_end);
|
||||
if (word_end == -1) word_end = src_end; // WORKAROUND: no match returns -1 instead of src_end
|
||||
// get word_bounds
|
||||
lang.Get_word_bounds(word_bounds, trv, src, src_end, hook_bgn, hook_end);
|
||||
int word_bgn = word_bounds.word_bgn;
|
||||
int word_end = word_bounds.word_end;
|
||||
|
||||
// check if current word matches criteria-word
|
||||
if (hook.Match_word(src, cur, hook_end, word_bgn, word_end)) {
|
||||
cbk.Process_item_found(src, cur, hook_end, word_bgn, word_end, hook);
|
||||
if (hook.Match_word(lang, src, hook_bgn, hook_end, word_bgn, word_end)) {
|
||||
cbk.Process_item_found(src, hook_bgn, hook_end, word_bgn, word_end, hook);
|
||||
}
|
||||
|
||||
// update position to word_end
|
||||
|
||||
@@ -0,0 +1,24 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.xowa.addons.wikis.searchs.fulltexts.finders; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.searchs.*; import gplx.xowa.addons.wikis.searchs.fulltexts.*;
|
||||
// Mutable holder for a pair of int positions describing a word: where it begins and where it ends.
// Instances are reusable: Init overwrites both fields each time it is called.
public class Xosearch_word_bounds {
|
||||
// position at which the word begins
public int word_bgn;
|
||||
// position at which the word ends
public int word_end;
|
||||
// Stores both positions as given; no validation or ordering check is performed.
public void Init(int word_bgn, int word_end) {
|
||||
this.word_bgn = word_bgn;
|
||||
this.word_end = word_end;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,119 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.xowa.addons.wikis.searchs.fulltexts.finders; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.searchs.*; import gplx.xowa.addons.wikis.searchs.fulltexts.*;
|
||||
import gplx.core.btries.*;
|
||||
import gplx.core.intls.*;
|
||||
// Computes the bounds of the word that surrounds a matched "hook" span inside a byte array.
// A word is delimited by the entries of the ws tries (tab, newline, carriage return, space and "/");
// punctuation entries (. , ? ! : ; ' ") found outside the hook at either edge of the word are trimmed off.
public class Xosearch_word_lang {
|
||||
// delimiters checked when scanning backward from the hook; note that "/" is treated as a delimiter too
private final Btrie_slim_mgr ws_bgn = Btrie_slim_mgr.cs()
|
||||
.Add_many_str("\t", "\n", "\r", " ", "/");
|
||||
// delimiters checked when scanning forward from the hook; assigned in the constructor
private final Btrie_slim_mgr ws_end;
|
||||
// punctuation trimmed from the start of the word
private final Btrie_slim_mgr punct_bgn = Btrie_slim_mgr.cs()
|
||||
.Add_many_str(".", ",", "?", "!", ":", ";", "'", "\"")
|
||||
;
|
||||
// punctuation trimmed from the end of the word; assigned in the constructor
private final Btrie_slim_mgr punct_end;
|
||||
// The "end" tries are the same instances as the "bgn" tries, so both directions use identical sets.
public Xosearch_word_lang() {
|
||||
this.ws_end = ws_bgn;
|
||||
this.punct_end = punct_bgn;
|
||||
}
|
||||
// Finds the word around src[hook_bgn..hook_end) and stores the result via word_bounds.Init(word_bgn, word_end).
//   word_bounds: receives the computed positions
//   trv:         passed through to every Btrie_slim_mgr.Match_at call
//   src:         bytes being searched
//   src_end:     upper limit for the forward scan
//   hook_bgn:    start of the matched hook; the initial value of word_bgn
//   hook_end:    end of the matched hook; the initial value of word_end
// The resulting word_end is exclusive: it is either hook_end or a position one past a non-delimiter byte.
// NOTE(review): Match_at presumably returns non-null when a trie entry matches at the given position,
// and Utf8_.Get_prv_char_pos0 presumably returns the start of the previous UTF-8 char -- neither is visible here.
public void Get_word_bounds(Xosearch_word_bounds word_bounds, Btrie_rv trv, byte[] src, int src_end, int hook_bgn, int hook_end) {
|
||||
int tmp_pos = -1;
|
||||
Object tmp_obj = null;
|
||||
|
||||
// find word_bgn: scan backward from hook_bgn until a ws entry or the start of src
|
||||
int word_bgn = hook_bgn;
|
||||
tmp_pos = word_bgn;
|
||||
while (true) {
|
||||
// stop if BOS
|
||||
if (tmp_pos == 0) break;
|
||||
|
||||
// move back one char
|
||||
tmp_pos = Utf8_.Get_prv_char_pos0(src, tmp_pos);
|
||||
|
||||
// check if char is ws
|
||||
tmp_obj = ws_bgn.Match_at(trv, src, tmp_pos, hook_end);
|
||||
|
||||
// char is ws -> stop
|
||||
if (tmp_obj != null) break;
|
||||
|
||||
// char is not ws -> update word_bgn
|
||||
word_bgn = tmp_pos;
|
||||
}
|
||||
|
||||
// find word_end: scan forward from hook_end until a ws entry or src_end
|
||||
int word_end = hook_end;
|
||||
tmp_pos = word_end;
|
||||
while (true) {
|
||||
// stop if passed EOS
|
||||
if (tmp_pos >= src_end) break;
|
||||
|
||||
// check if char is ws
|
||||
tmp_obj = ws_end.Match_at(trv, src, tmp_pos, src_end);
|
||||
|
||||
// stop if ws
|
||||
if (tmp_obj != null) break;
|
||||
|
||||
// advance one byte before recording, so that word_end stays exclusive
|
||||
tmp_pos++;
|
||||
|
||||
// update word_end
|
||||
word_end = tmp_pos;
|
||||
}
|
||||
|
||||
// trim punct at bgn; EX: "'abc" -> "abc"; only runs when the word starts before the hook
|
||||
if (word_bgn < hook_bgn) {
|
||||
tmp_pos = word_bgn;
|
||||
while (true) {
|
||||
// stop if reached hook-bgn; bytes inside the hook are never trimmed
|
||||
if (tmp_pos >= hook_bgn) break;
|
||||
|
||||
// check if char is punct
|
||||
tmp_obj = punct_bgn.Match_at(trv, src, tmp_pos, word_end);
|
||||
|
||||
// stop if not a punct
|
||||
if (tmp_obj == null) break;
|
||||
|
||||
// advance one byte past the punct
|
||||
tmp_pos++;
|
||||
|
||||
// update word_bgn
|
||||
word_bgn = tmp_pos;
|
||||
}
|
||||
}
|
||||
|
||||
// trim punct at end; EX: "abc." -> "abc"; only runs when the word ends after the hook
|
||||
if (word_end > hook_end) {
|
||||
tmp_pos = word_end;
|
||||
while (true) {
|
||||
// scan bwd one char
|
||||
tmp_pos = Utf8_.Get_prv_char_pos0(src, tmp_pos);
|
||||
|
||||
// stop if moved before hook-end; bytes inside the hook are never trimmed
|
||||
if (tmp_pos < hook_end) break;
|
||||
|
||||
// check if char is punct
|
||||
tmp_obj = punct_end.Match_at(trv, src, tmp_pos, word_end);
|
||||
|
||||
// stop if not a punct
|
||||
if (tmp_obj == null) break;
|
||||
|
||||
// update word_end
|
||||
word_end = tmp_pos;
|
||||
}
|
||||
}
|
||||
|
||||
word_bounds.Init(word_bgn, word_end);
|
||||
}
|
||||
}
|
||||
@@ -14,6 +14,7 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.xowa.addons.wikis.searchs.fulltexts.finders; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.searchs.*; import gplx.xowa.addons.wikis.searchs.fulltexts.*;
|
||||
import gplx.core.btries.*;
|
||||
import gplx.xowa.addons.wikis.searchs.searchers.crts.*;
|
||||
public class Xosearch_word_node {
|
||||
public int tid;
|
||||
@@ -24,9 +25,7 @@ public class Xosearch_word_node {
|
||||
public boolean wildcard_at_end;
|
||||
public boolean found;
|
||||
|
||||
public boolean Match_word(byte[] src, int hook_bgn, int hook_end, int word_bgn, int word_end) {
|
||||
// TODO.XO: handle punctuation
|
||||
|
||||
public boolean Match_word(Xosearch_word_lang ctx, byte[] src, int hook_bgn, int hook_end, int word_bgn, int word_end) {
|
||||
// if no wildcard at bgn, hook_bgn must match word_bgn
|
||||
if ( !wildcard_at_bgn
|
||||
&& hook_bgn != word_bgn)
|
||||
|
||||
@@ -280,7 +280,7 @@ public class Xop_lnke_wkr implements Xop_ctx_wkr {
|
||||
return false; // alpha-numerical is invalid; EX: "titel:" should not generate a lnke for "tel:"
|
||||
}
|
||||
if (prv_byte >= Byte_ascii.Ascii_min && prv_byte <= Byte_ascii.Ascii_max) return true; // consider all other ASCII chars as true; EX: \t\n !, etc;
|
||||
prv_pos = gplx.core.intls.Utf8_.Get_pos0_of_char_bwd(src, prv_pos);
|
||||
prv_pos = gplx.core.intls.Utf8_.Get_prv_char_pos0_old(src, prv_pos);
|
||||
prv_byte = src[prv_pos];
|
||||
boolean prv_char_is_letter = ctx.Lang().Case_mgr().Match_any_exists(prv_byte, src, prv_pos, bgn_pos);
|
||||
return !prv_char_is_letter;
|
||||
|
||||
Reference in New Issue
Block a user