1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2026-03-02 03:49:30 +00:00

Search: Add more punctuation support

This commit is contained in:
gnosygnu
2017-03-01 16:37:47 -05:00
parent 9301973825
commit 8de3cf0cc6
10 changed files with 233 additions and 26 deletions

View File

@@ -58,6 +58,40 @@ public class Xosearch_finder_cbk__eval__tst {
// n: char exists
fxt.Test__eval_n("a");
}
// Trailing punctuation: searches for "a" and evaluates texts where "a" is followed by one or more "!" chars.
// NOTE(review): Test__eval_y presumably asserts that the text is reported as found; fixture body is not fully visible here.
@Test public void Trim_end() {
fxt.Init__search("a");
// y: single trailing punctuation char
fxt.Test__eval_y("a!");
// y: many trailing punctuation chars
fxt.Test__eval_y("a!!!");
}
// Leading punctuation: searches for "a" and evaluates texts where "a" is preceded by one or more "!" chars.
// NOTE(review): Test__eval_y presumably asserts that the text is reported as found; fixture body is not fully visible here.
@Test public void Trim_bgn() {
fxt.Init__search("a");
// y: single leading punctuation char
fxt.Test__eval_y("!a");
// y: many leading punctuation chars
fxt.Test__eval_y("!!!a");
}
// Punctuation on both sides: searches for "a" and evaluates texts where "a" is wrapped in one or more apostrophes.
// NOTE(review): Test__eval_y presumably asserts that the text is reported as found; fixture body is not fully visible here.
@Test public void Trim_both() {
fxt.Init__search("a");
// y: single apostrophe on each side
fxt.Test__eval_y("'a'");
// y: many apostrophes on each side
fxt.Test__eval_y("'''a'''");
}
// Slash-separated text: searches for "a" and evaluates texts where "a" is the first, middle, or last "/"-delimited segment.
// NOTE(review): Test__eval_y presumably asserts that each text is reported as found; fixture body is not fully visible here.
@Test public void Slash() {
fxt.Init__search("a");
// y: slash after ("a/b/c"), slash before and after ("b/a/c"), slash before ("b/c/a")
fxt.Test__eval_y("a/b/c", "b/a/c", "b/c/a");
}
// .
// ...
// -
// a'b
// https://site/page
// ()
// []
// <>
}
class Xosearch_finder_cbk__eval__fxt {
private boolean case_match = false;
@@ -76,7 +110,7 @@ class Xosearch_finder_cbk__eval__fxt {
byte[] text_bry = Bry_.new_u8(text);
cbk.found = false;
finder.Match(text_bry, 0, text_bry.length, cbk);
Gftest.Eq__bool(expd, cbk.found, "query={0}, text={1}", finder.Query(), text);
Gftest.Eq__bool(expd, cbk.found, "query={0} text={1}", finder.Query(), text);
}
}
}

View File

@@ -22,6 +22,8 @@ public class Xosearch_finder_mgr {
private Xosearch_word_node tree_root;
private final Srch_crt_parser parser = new Srch_crt_parser(Srch_crt_scanner_syms.Dflt);
private final Btrie_rv trv = new Btrie_rv();
private final Xosearch_word_lang lang = new Xosearch_word_lang();
private final Xosearch_word_bounds word_bounds = new Xosearch_word_bounds();
public byte[] Query() {return query;} private byte[] query;
public void Init(byte[] query, boolean case_match, boolean auto_wildcard, byte wildchar_byte, byte not_byte) {
@@ -51,16 +53,17 @@ public class Xosearch_finder_mgr {
// current byte matches a hook; get hook and hook_end
Xosearch_word_node hook = (Xosearch_word_node)hook_obj;
int hook_bgn = cur;
int hook_end = cur + hook.word_hook.length;
// get current word bounds by finding flanking ws
int word_bgn = Bry_find_.Find_bwd_ws(src, cur, 0) + 1;
int word_end = Bry_find_.Find_fwd_until_ws(src, hook_end, src_end);
if (word_end == -1) word_end = src_end; // WORKAROUND: no match returns -1 instead of src_end
// get word_bounds
lang.Get_word_bounds(word_bounds, trv, src, src_end, hook_bgn, hook_end);
int word_bgn = word_bounds.word_bgn;
int word_end = word_bounds.word_end;
// check if current word matches criteria-word
if (hook.Match_word(src, cur, hook_end, word_bgn, word_end)) {
cbk.Process_item_found(src, cur, hook_end, word_bgn, word_end, hook);
if (hook.Match_word(lang, src, hook_bgn, hook_end, word_bgn, word_end)) {
cbk.Process_item_found(src, hook_bgn, hook_end, word_bgn, word_end, hook);
}
// update position to word_end

View File

@@ -0,0 +1,24 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.addons.wikis.searchs.fulltexts.finders; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.searchs.*; import gplx.xowa.addons.wikis.searchs.fulltexts.*;
// Mutable holder for a pair of word positions (begin / end); both fields are overwritten together by Init.
public class Xosearch_word_bounds {
  // positions of the word; public so callers can read them directly after Init
  public int word_bgn, word_end;

  // Stores both positions, replacing whatever values were held before.
  public void Init(int word_bgn, int word_end) {
    this.word_end = word_end;
    this.word_bgn = word_bgn;
  }
}

View File

@@ -0,0 +1,119 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.addons.wikis.searchs.fulltexts.finders; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.searchs.*; import gplx.xowa.addons.wikis.searchs.fulltexts.*;
import gplx.core.btries.*;
import gplx.core.intls.*;
// Computes the bounds of the word surrounding a search hook.
// A word is delimited by "whitespace" (tab, newline, carriage-return, space and also "/")
// and then has flanking punctuation (. , ? ! : ; ' ") trimmed off, but never past the hook itself.
public class Xosearch_word_lang {
  // delimiters that separate words; note that "/" is treated as a delimiter
  private final Btrie_slim_mgr ws_bgn = Btrie_slim_mgr.cs()
    .Add_many_str("\t", "\n", "\r", " ", "/");
  // same trie instance as ws_bgn (assigned in the constructor)
  private final Btrie_slim_mgr ws_end;
  // punctuation that is trimmed from the edges of a word
  private final Btrie_slim_mgr punct_bgn = Btrie_slim_mgr.cs()
    .Add_many_str(".", ",", "?", "!", ":", ";", "'", "\"");
  // same trie instance as punct_bgn (assigned in the constructor)
  private final Btrie_slim_mgr punct_end;

  public Xosearch_word_lang() {
    this.punct_end = punct_bgn;
    this.ws_end = ws_bgn;
  }

  // Finds the word around src[hook_bgn..hook_end) and stores the result in word_bounds via Init.
  //   word_bounds: receives the computed word_bgn / word_end
  //   trv:         scratch object handed to every trie lookup
  //   src:         bytes being searched
  //   src_end:     upper limit for the forward scan
  //   hook_bgn:    position where the hook starts; word_bgn never moves past it
  //   hook_end:    position where the hook ends; word_end never moves before it
  public void Get_word_bounds(Xosearch_word_bounds word_bounds, Btrie_rv trv, byte[] src, int src_end, int hook_bgn, int hook_end) {
    // scan bwd from the hook: each non-delimiter char extends the word to the left
    int word_bgn = hook_bgn;
    int scan_pos = hook_bgn;
    while (scan_pos != 0) { // pos 0 is BOS; nothing left to step back to
      scan_pos = Utf8_.Get_prv_char_pos0(src, scan_pos);
      if (ws_bgn.Match_at(trv, src, scan_pos, hook_end) != null) break; // hit a delimiter; word starts after it
      word_bgn = scan_pos;
    }

    // scan fwd from the hook, one byte at a time, until EOS or a delimiter
    int word_end = hook_end;
    while (word_end < src_end
      && ws_end.Match_at(trv, src, word_end, src_end) == null) {
      word_end++;
    }

    // trim punct at bgn; EX: "'abc" -> "abc"
    // advances word_bgn over leading punct, but stops once it reaches hook_bgn
    while (word_bgn < hook_bgn
      && punct_bgn.Match_at(trv, src, word_bgn, word_end) != null) {
      word_bgn++;
    }

    // trim punct at end; EX: "abc." -> "abc"
    // walks bwd from word_end; each trailing punct char becomes the new word_end, but never before hook_end
    if (word_end > hook_end) {
      int prv_pos = Utf8_.Get_prv_char_pos0(src, word_end);
      while (prv_pos >= hook_end
        && punct_end.Match_at(trv, src, prv_pos, word_end) != null) {
        word_end = prv_pos;
        prv_pos = Utf8_.Get_prv_char_pos0(src, prv_pos);
      }
    }

    word_bounds.Init(word_bgn, word_end);
  }
}

View File

@@ -14,6 +14,7 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.addons.wikis.searchs.fulltexts.finders; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.searchs.*; import gplx.xowa.addons.wikis.searchs.fulltexts.*;
import gplx.core.btries.*;
import gplx.xowa.addons.wikis.searchs.searchers.crts.*;
public class Xosearch_word_node {
public int tid;
@@ -24,9 +25,7 @@ public class Xosearch_word_node {
public boolean wildcard_at_end;
public boolean found;
public boolean Match_word(byte[] src, int hook_bgn, int hook_end, int word_bgn, int word_end) {
// TODO.XO: handle punctuation
public boolean Match_word(Xosearch_word_lang ctx, byte[] src, int hook_bgn, int hook_end, int word_bgn, int word_end) {
// if no wildcard at bgn, hook_bgn must match word_bgn
if ( !wildcard_at_bgn
&& hook_bgn != word_bgn)

View File

@@ -280,7 +280,7 @@ public class Xop_lnke_wkr implements Xop_ctx_wkr {
return false; // alpha-numerical is invalid; EX: "titel:" should not generate a lnke for "tel:"
}
if (prv_byte >= Byte_ascii.Ascii_min && prv_byte <= Byte_ascii.Ascii_max) return true; // consider all other ASCII chars as true; EX: \t\n !, etc;
prv_pos = gplx.core.intls.Utf8_.Get_pos0_of_char_bwd(src, prv_pos);
prv_pos = gplx.core.intls.Utf8_.Get_prv_char_pos0_old(src, prv_pos);
prv_byte = src[prv_pos];
boolean prv_char_is_letter = ctx.Lang().Case_mgr().Match_any_exists(prv_byte, src, prv_pos, bgn_pos);
return !prv_char_is_letter;