Search: Refactor search to handle multiple boolean searches

pull/620/head
gnosygnu 8 years ago
parent 7f04fc5f74
commit c10cf2ca15

@ -14,124 +14,73 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.addons.wikis.searchs.fulltexts.cbks; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.searchs.*; import gplx.xowa.addons.wikis.searchs.fulltexts.*;
import gplx.core.btries.*;
import gplx.langs.jsons.*;
import gplx.dbs.*;
import gplx.dbs.*; import gplx.xowa.wikis.data.tbls.*;
import gplx.xowa.addons.wikis.searchs.fulltexts.specials.*;
import gplx.xowa.wikis.data.tbls.*;
import gplx.xowa.addons.wikis.searchs.fulltexts.finders.*;
import gplx.xowa.addons.wikis.searchs.searchers.crts.*;
import gplx.xowa.addons.wikis.searchs.searchers.crts.visitors.*;
// Full-text search service: runs the "query" from the JSON args against each wiki listed in "wikis"
// and reports wikis, pages and snippet lines to the GUI through app.Gui__cbk_mgr().Send_json on cbk_trg.
// NOTE(review): this region reads like a diff whose +/- markers were stripped -- removed and added lines
// are interleaved (cbk_trg is declared twice, Search_wiki has two headers, "page_count" is added twice,
// there are two finally blocks). Confirm against the committed file before relying on any single line.
class Xosearch_fulltext_svc {
private gplx.xowa.guis.cbks.Xog_cbk_trg cbk_trg = gplx.xowa.guis.cbks.Xog_cbk_trg.New(Xosearch_fulltext_special.Prototype.Special__meta().Ttl_bry());
private final Xoa_app app;
private final Bry_bfr tmp_bfr = Bry_bfr_.New();
private final gplx.xowa.guis.cbks.Xog_cbk_trg cbk_trg = gplx.xowa.guis.cbks.Xog_cbk_trg.New(Xosearch_fulltext_special.Prototype.Special__meta().Ttl_bry());
private final Xosearch_finder_mgr finder = new Xosearch_finder_mgr();
private final Xosearch_finder_cbk__eval cbk_eval = new Xosearch_finder_cbk__eval();
private final Xosearch_finder_cbk__highlight cbk_highlight;
public Xosearch_fulltext_svc(Xoa_app app) {
this.app = app;
cbk_highlight = new Xosearch_finder_cbk__highlight(app, cbk_trg);
}
// Entry point: reads "wikis" (split on "|") and "query" from args, then calls Search_wiki once per wiki domain.
public void Search(Json_nde args) {
String wikis = args.Get_as_str("wikis");
byte[] wildcard = Bry_.new_a7("%");
byte[] query_raw = args.Get_as_bry("query");
byte[] query_sql = Bry_.Add(wildcard, query_raw, wildcard);
byte[] query_mcase = args.Get_as_bry("query");
String[] wikis_ary = String_.Split(wikis, "|");
for (String wiki_domain : wikis_ary) {
Xow_wiki wiki = app.Wiki_mgri().Get_by_or_make_init_y(Bry_.new_u8(wiki_domain));
Search_wiki(wiki, query_raw, query_sql);
byte[] query_lcase = wiki.Case_mgr().Case_build_lower(query_mcase);
Search_wiki(wiki, query_lcase);
}
}
// Scans one wiki: selects namespace-0 rows from the page table ordered by page_score DESC, loads each page's text,
// and sends "results__wiki__add", "results__wiki__update" and "results__page__add" notifications as matches are found.
// The reader is released in a finally block.
private void Search_wiki(Xow_wiki wiki, byte[] query_raw, byte[] query_sql) {
private void Search_wiki(Xow_wiki wiki, byte[] query_lcase) {
Db_conn page_conn = wiki.Data__core_mgr().Tbl__page().Conn();
Db_rdr page_rdr = page_conn.Stmt_sql("SELECT * FROM page WHERE page_namespace IN (0) ORDER BY page_score DESC").Exec_select__rls_auto();
app.Gui__cbk_mgr().Send_json(cbk_trg, "xo.search_fulltext.results__wiki__add__recv", gplx.core.gfobjs.Gfobj_nde.New()
.Add_bry("wiki", wiki.Domain_bry())
.Add_long("page_count", wiki.Stats().Num_pages())
.Add_long("page_count", 0)
);
finder.Init(query_lcase, false, false, Byte_ascii.Star);
try {
int found = 0;
while (page_rdr.Move_next()) {
int page_id = page_rdr.Read_int("page_id");
int text_db_id = page_rdr.Read_int("page_text_db_id");
byte[] text = wiki.Data__core_mgr().Dbs__get_by_id_or_fail(text_db_id).Tbl__text().Select(page_id);
if (Bry_.Has(text, query_raw)) {
Xowd_text_row text_row = new Xowd_text_row(page_id, text);
byte[] text_mcase = wiki.Data__core_mgr().Dbs__get_by_id_or_fail(text_db_id).Tbl__text().Select(page_id);
cbk_eval.found = false;
finder.Match(text_mcase, 0, text_mcase.length, cbk_eval);
if (cbk_eval.found) {
int ns_id = page_rdr.Read_int("page_namespace");
byte[] ttl_bry = page_rdr.Read_bry_by_str("page_title");
app.Gui__cbk_mgr().Send_json(cbk_trg, "xo.search_fulltext.results__wiki__update__recv", gplx.core.gfobjs.Gfobj_nde.New()
.Add_bry("wiki", wiki.Domain_bry())
.Add_int("found", ++found)
);
Write(wiki, query_raw, wiki.Ttl_parse(ns_id, ttl_bry), text_row);
}
}
} finally {
page_rdr.Rls();
}
}
private void Write(Xow_wiki wiki, byte[] query_raw, Xoa_ttl ttl, Xowd_text_row text_row) {
Xoa_ttl ttl = wiki.Ttl_parse(ns_id, ttl_bry);
cbk_highlight.Init(wiki, ttl);
app.Gui__cbk_mgr().Send_json(cbk_trg, "xo.search_fulltext.results__page__add__recv", gplx.core.gfobjs.Gfobj_nde.New()
.Add_bry("wiki", wiki.Domain_bry())
.Add_bry("page", ttl.Full_db())
.Add_int("found", 0)
);
byte[] text_orig = text_row.text;
byte[] text_lcase = wiki.Lang().Case_mgr().Case_build_lower(text_orig);
int pos = 0;
int found = 0;
while (true) {
int find_bgn = Bry_find_.Find_fwd(text_lcase, query_raw, pos);
if (find_bgn == Bry_find_.Not_found)
break;
int snip_bgn = find_bgn - 50;
if (snip_bgn < 0)
snip_bgn = 0;
else {
snip_bgn = Bry_find_.Find_bwd_ws(text_orig, snip_bgn, 0) + 1;
finder.Match(text_mcase, 0, text_mcase.length, cbk_highlight);
}
int find_end = find_bgn + query_raw.length;
int snip_end = find_end + 50;
if (snip_end >= text_lcase.length)
snip_end = text_lcase.length;
else
snip_end = Bry_find_.Find_fwd_until_ws(text_orig, snip_end, text_orig.length);
Add_snip(tmp_bfr, text_orig, text_lcase, snip_bgn, snip_end, query_raw);
app.Gui__cbk_mgr().Send_json(cbk_trg, "xo.search_fulltext.results__line__add__recv", gplx.core.gfobjs.Gfobj_nde.New()
.Add_bry("wiki", wiki.Domain_bry())
.Add_bry("page", ttl.Full_db())
.Add_int("line", ++found)
.Add_bry("html", tmp_bfr.To_bry_and_clear())
);
pos = snip_end;
app.Gui__cbk_mgr().Send_json(cbk_trg, "xo.search_fulltext.results__page__update__recv", gplx.core.gfobjs.Gfobj_nde.New()
.Add_bry("wiki", wiki.Domain_bry())
.Add_bry("page", ttl.Full_db())
.Add_int("found", found)
);
}
}
private void Add_snip(Bry_bfr bfr, byte[] src_orig, byte[] src_lcase, int bgn, int end, byte[] qry) {
for (int i = bgn; i < end; i++) {
byte b = src_orig[i];
if (b == Byte_ascii.Nl)
bfr.Add(gplx.langs.htmls.Gfh_tag_.Br_inl);
else {
int qry_end = i + qry.length;
if (Bry_.Eq(src_lcase, i, qry_end, qry)) {
bfr.Add_str_a7("<span class='snip_highlight'>");
bfr.Add_mid(src_orig, i, qry_end);
bfr.Add_str_a7("</span>");
i = qry_end - 1;
}
else
bfr.Add_byte(b);
}
} finally {
page_rdr.Rls();
}
}
}
// class Xosearch_result_wiki {
// public final byte[] wiki;
// public final byte[] page_db;
// public byte
// }

@ -0,0 +1,21 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.addons.wikis.searchs.fulltexts.finders; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.searchs.*; import gplx.xowa.addons.wikis.searchs.fulltexts.*;
import gplx.xowa.guis.cbks.*;
/**
 * Callback driven by Xosearch_finder_mgr.Match while it scans one page's text.
 */
public interface Xosearch_finder_cbk {
// Called for each hook whose surrounding word satisfies the term's wildcard rules.
// hook_bgn/hook_end bound the matched hook in src; word_bgn/word_end bound the enclosing word; term is the criteria node that matched.
void Process_item_found(byte[] src, int hook_bgn, int hook_end, int word_bgn, int word_end, Xosearch_word_node term);
// Called once after the scan loop finishes, with the root of the criteria tree.
void Process_page_done(byte[] src, Xosearch_word_node tree_root);
}

@ -0,0 +1,25 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.addons.wikis.searchs.fulltexts.finders; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.searchs.*; import gplx.xowa.addons.wikis.searchs.fulltexts.*;
/**
 * Finder callback that decides whether a page satisfies the whole query.
 * Each matched term is flagged as it is reported; when the page is done the
 * criteria tree is evaluated and the verdict is stored in {@link #found}.
 */
public class Xosearch_finder_cbk__eval implements Xosearch_finder_cbk {
	/** Verdict for the most recently completed page; callers reset it before each Match call. */
	public boolean found;

	/** Evaluates the criteria tree and records the outcome for the page that just finished. */
	public void Process_page_done(byte[] src, Xosearch_word_node root) {
		boolean page_matches = root.Eval();
		this.found = page_matches;
	}

	/** Flags the matched term so that the later tree evaluation sees it as present. */
	public void Process_item_found(byte[] src, int hook_bgn, int hook_end, int word_bgn, int word_end, Xosearch_word_node term) {
		term.found = true;
	}
}

@ -0,0 +1,82 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.addons.wikis.searchs.fulltexts.finders; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.searchs.*; import gplx.xowa.addons.wikis.searchs.fulltexts.*;
import gplx.xowa.guis.cbks.*;
/**
 * Finder callback that renders every matched term as an HTML snippet and sends it to the GUI.
 * Call {@link #Init} before scanning each page; {@link #found} counts the snippets sent for that page.
 */
public class Xosearch_finder_cbk__highlight implements Xosearch_finder_cbk {
	/** Number of bytes of context requested on each side of the hook before the bounds are widened. */
	private static final int Snip_pad = 50;
	private final Xog_cbk_trg cbk_trg;
	private final Xoa_app app;
	private Xow_wiki wiki;
	private Xoa_ttl ttl;
	private final Bry_bfr tmp_bfr = Bry_bfr_.New();
	public int found;
	public Xosearch_finder_cbk__highlight(Xoa_app app, Xog_cbk_trg cbk_trg) {
		this.app = app;
		this.cbk_trg = cbk_trg;
	}
	/** Binds the callback to the page about to be scanned and resets the snippet counter. */
	public void Init(Xow_wiki wiki, Xoa_ttl ttl) {
		this.wiki = wiki;
		this.ttl = ttl;
		found = 0;
	}
	/**
	 * Builds one snippet around the hook, wraps the hook in a highlight span, and sends a
	 * "results__line__add" notification followed by a "results__page__update" with the running count.
	 */
	public void Process_item_found(byte[] src, int hook_bgn, int hook_end, int word_bgn, int word_end, Xosearch_word_node term) {
		// get snip bounds by finding flanking chars and then expanding to word-bounds
		int snip_bgn = hook_bgn - Snip_pad;
		if (snip_bgn < 0)
			snip_bgn = 0;
		else {
			snip_bgn = Bry_find_.Find_bwd_ws(src, snip_bgn, 0) + 1;
		}
		int snip_end = hook_end + Snip_pad;
		if (snip_end >= src.length)
			snip_end = src.length;
		else {
			snip_end = Bry_find_.Find_fwd_until_ws(src, snip_end, src.length);
			if (snip_end == Bry_find_.Not_found) { // no ws before end of src; run the snip to the end
				snip_end = src.length;
			}
		}

		// build snip: text before hook, highlighted hook, text after hook
		Add_snip(tmp_bfr, src, snip_bgn, hook_bgn);
		tmp_bfr.Add_str_a7("<span class='snip_highlight'>");
		Add_snip(tmp_bfr, src, hook_bgn, hook_end);
		tmp_bfr.Add_str_a7("</span>");
		Add_snip(tmp_bfr, src, hook_end, snip_end);

		// send notification
		app.Gui__cbk_mgr().Send_json(cbk_trg, "xo.search_fulltext.results__line__add__recv", gplx.core.gfobjs.Gfobj_nde.New()
			.Add_bry("wiki", wiki.Domain_bry())
			.Add_bry("page", ttl.Full_db())
			.Add_int("line", ++found)
			.Add_bry("html", tmp_bfr.To_bry_and_clear())
			);
		app.Gui__cbk_mgr().Send_json(cbk_trg, "xo.search_fulltext.results__page__update__recv", gplx.core.gfobjs.Gfobj_nde.New()
			.Add_bry("wiki", wiki.Domain_bry())
			.Add_bry("page", ttl.Full_db())
			.Add_int("found", found)
			);
	}
	/**
	 * Copies src[bgn..end) into bfr as HTML text: newlines become an inline br tag, and the
	 * markup-significant bytes '<', '>' and '&' are written as entities so that page text
	 * cannot inject or break markup in the "html" payload.
	 */
	private void Add_snip(Bry_bfr bfr, byte[] src, int bgn, int end) {
		for (int i = bgn; i < end; i++) {
			byte b = src[i];
			if (b == Byte_ascii.Nl)
				bfr.Add(gplx.langs.htmls.Gfh_tag_.Br_inl);
			else if (b == '<')
				bfr.Add_str_a7("&lt;");
			else if (b == '>')
				bfr.Add_str_a7("&gt;");
			else if (b == '&')
				bfr.Add_str_a7("&amp;");
			else
				bfr.Add_byte(b);
		}
	}
	/** Nothing to do at page end; snippets are sent as they are found. */
	public void Process_page_done(byte[] src, Xosearch_word_node tree_root) {}
}

@ -0,0 +1,71 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.addons.wikis.searchs.fulltexts.finders; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.searchs.*; import gplx.xowa.addons.wikis.searchs.fulltexts.*;
import gplx.xowa.guis.cbks.*;
import gplx.core.btries.*;
import gplx.xowa.addons.wikis.searchs.searchers.crts.*;
/**
 * Scans page text for the words of a parsed query.
 * Init parses the query into a criteria tree and registers each word's hook in a trie;
 * Match walks the text, reports qualifying hits to a callback, and finally hands the tree to the callback.
 */
public class Xosearch_finder_mgr {
	private Btrie_slim_mgr hook_trie;
	private Xosearch_word_node tree_root;
	private final Srch_crt_parser parser = new Srch_crt_parser(Srch_crt_scanner_syms.Dflt);
	private final Btrie_rv trv = new Btrie_rv();
	/**
	 * Prepares the finder for a new query. Must be called before Match.
	 * @param query_mcase   raw query text handed to the criteria parser
	 * @param case_match    true to build a case-sensitive hook trie; false for the case-insensitive one
	 * @param auto_wildcard passed through to the criteria parser
	 * @param wildchar_byte byte treated as the wildcard marker at the start / end of a word
	 */
	public void Init(byte[] query_mcase, boolean case_match, boolean auto_wildcard, byte wildchar_byte) {
		// create a new hook_trie based on case_match
		this.hook_trie = case_match ? Btrie_slim_mgr.cs() : Btrie_slim_mgr.ci_u8();

		// create a new tree_root for eval
		this.tree_root = Xosearch_word_node_.New_root(parser.Parse_or_invalid(query_mcase, auto_wildcard).Root, hook_trie, wildchar_byte);
	}
	/**
	 * Scans src[src_bgn..src_end) and calls cbk.Process_item_found for every hook whose word passes
	 * Match_word, then calls cbk.Process_page_done exactly once with the criteria tree.
	 */
	public void Match(byte[] src, int src_bgn, int src_end, Xosearch_finder_cbk cbk) {
		// init and clear; start at the caller-supplied offset rather than always at 0
		int cur = src_bgn;
		tree_root.Clear();

		// scan through text one-byte at a time
		// NOTE: skipping ahead to word-start instead of going byte-by-byte may seem more performant, but will still need to do substring analysis b/c of wildcards and punctuation; EX: "abc" and " 'abc' "; "*abc" and " xyzabc. "
		while (cur < src_end) { // strictly less-than: nothing can start at src_end, and probing there risks reading past the range
			// check each byte against hook_trie
			Object hook_obj = hook_trie.Match_at(trv, src, cur, src_end);

			// current byte matches no hooks; go to next byte
			if (hook_obj == null) {
				cur++;
				continue;
			}

			// current byte matches a hook; get hook and hook_end
			Xosearch_word_node hook = (Xosearch_word_node)hook_obj;
			int hook_end = cur + hook.word_hook.length;

			// get current word bounds by finding flanking ws
			int word_bgn = Bry_find_.Find_bwd_ws(src, cur, 0) + 1;
			int word_end = Bry_find_.Find_fwd_until_ws(src, hook_end, src_end);
			if (word_end == Bry_find_.Not_found) word_end = src_end; // WORKAROUND: no match returns Not_found instead of src_end

			// check if current word matches criteria-word
			if (hook.Match_word(src, cur, hook_end, word_bgn, word_end)) {
				cbk.Process_item_found(src, cur, hook_end, word_bgn, word_end, hook);
			}

			// update position to word_end; always advance at least one byte so a zero-length hook cannot stall the loop
			cur = word_end > cur ? word_end : cur + 1;
		}

		// mark page done
		cbk.Process_page_done(src, tree_root);
	}
}

@ -0,0 +1,70 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.addons.wikis.searchs.fulltexts.finders; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.searchs.*; import gplx.xowa.addons.wikis.searchs.fulltexts.*;
import gplx.xowa.addons.wikis.searchs.searchers.crts.*;
/**
 * One node of the search-criteria tree: either a join (and / or / not) over subs,
 * or a word leaf carrying its hook and wildcard flags plus a per-page "found" marker.
 */
public class Xosearch_word_node {
	public int tid;
	public Xosearch_word_node[] subs;
	public byte[] word_orig;
	public byte[] word_hook;
	public boolean wildcard_at_bgn;
	public boolean wildcard_at_end;
	public boolean found;
	/**
	 * Checks whether a hook hit at [hook_bgn, hook_end) inside the word [word_bgn, word_end)
	 * satisfies this term's wildcard rules.
	 * @return true when the hook is anchored to each word edge that has no wildcard
	 */
	public boolean Match_word(byte[] src, int hook_bgn, int hook_end, int word_bgn, int word_end) {
		// TODO.XO: handle punctuation
		// if no wildcard at bgn, hook_bgn must match word_bgn
		if (	!wildcard_at_bgn
			&&	hook_bgn != word_bgn)
			return false;

		// if no wildcard at end, hook_end must match word_end
		// (compare the END of the hook; comparing hook_bgn here rejected every non-empty exact word)
		if (	!wildcard_at_end
			&&	hook_end != word_end)
			return false;

		return true;
	}
	/** Resets the found marker on this node and, recursively, on all subs. */
	public void Clear() {
		found = false;
		for (Xosearch_word_node sub : subs)
			sub.Clear();
	}
	/**
	 * Evaluates the criteria tree rooted at this node against the found markers set during the scan.
	 * @return true when the page satisfies this node
	 */
	public boolean Eval() {
		switch (tid) {
			case Srch_crt_itm.Tid__and: {
				for (Xosearch_word_node sub : subs)
					if (!sub.Eval())
						return false;
				return true;
			}
			case Srch_crt_itm.Tid__or: {
				for (Xosearch_word_node sub : subs)
					if (sub.Eval())
						return true;
				return false;
			}
			case Srch_crt_itm.Tid__word:
			case Srch_crt_itm.Tid__word_quote:
				return found;
			case Srch_crt_itm.Tid__not: {
				// only word leaves are registered in the hook trie, so a NOT node's own "found" is never set by the scan;
				// negate the operand(s) instead. Falls back to the node's own flag when there are no subs.
				if (subs == null || subs.length == 0)
					return !found;
				for (Xosearch_word_node sub : subs)
					if (sub.Eval())
						return false;
				return true;
			}
			case Srch_crt_itm.Tid__invalid: return false; // should not happen
			default: throw Err_.new_unhandled_default(tid);
		}
	}
}

@ -0,0 +1,64 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.addons.wikis.searchs.fulltexts.finders; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.searchs.*; import gplx.xowa.addons.wikis.searchs.fulltexts.*;
import gplx.core.btries.*;
import gplx.xowa.addons.wikis.searchs.searchers.crts.*;
/**
 * Factory that mirrors a parsed criteria tree (Srch_crt_itm) as a tree of Xosearch_word_node
 * and registers every word's hook in the supplied trie.
 */
public class Xosearch_word_node_ {
	/**
	 * Converts src and, recursively, all of its subs.
	 * @param src           criteria item to convert
	 * @param word_trie     trie that receives each word hook (first registration of a hook wins)
	 * @param wildchar_byte byte that marks a wildcard when it is the first / last byte of a multi-byte word
	 * @return the node mirroring src
	 */
	public static Xosearch_word_node New_root(Srch_crt_itm src, Btrie_slim_mgr word_trie, byte wildchar_byte) {
		Xosearch_word_node node = new Xosearch_word_node();
		node.tid = src.Tid;

		// word leaves carry the raw word, its wildcard flags and the wildcard-stripped hook
		boolean is_word = node.tid == Srch_crt_itm.Tid__word || node.tid == Srch_crt_itm.Tid__word_quote;
		if (is_word) {
			byte[] raw = src.Raw;	// EX: "abc*"
			int raw_len = raw.length;

			// a lone wildcard byte is treated as a literal, hence the length check
			boolean star_at_bgn = raw_len > 1 && raw[0] == wildchar_byte;
			boolean star_at_end = raw_len > 1 && raw[raw_len - 1] == wildchar_byte;

			// strip the wildcard bytes to get the hook; reuse the raw array when there is nothing to strip
			byte[] hook = raw;
			if (star_at_bgn || star_at_end) {
				int cut_bgn = star_at_bgn ? 1 : 0;
				int cut_end = star_at_end ? raw_len - 1 : raw_len;
				hook = Bry_.Mid(raw, cut_bgn, cut_end);
			}

			node.word_orig = raw;
			node.word_hook = hook;
			node.wildcard_at_bgn = star_at_bgn;
			node.wildcard_at_end = star_at_end;

			// register the hook unless an identical one is already present
			if (word_trie.Match_exact(hook) == null)
				word_trie.Add_obj(hook, node);
		}

		// mirror the child items
		Srch_crt_itm[] src_subs = src.Subs;
		int subs_len = src_subs.length;
		node.subs = new Xosearch_word_node[subs_len];
		for (int idx = 0; idx < subs_len; idx++)
			node.subs[idx] = New_root(src_subs[idx], word_trie, wildchar_byte);
		return node;
	}
}

@ -172,6 +172,7 @@ public class Gallery_parser {
cur_itm.Ttl_end_(fld_end);
byte[] ttl_bry = Bry_.Mid(src, cur_itm.Ttl_bgn(), fld_end);
ttl_bry = gplx.langs.htmls.encoders.Gfo_url_encoder_.Http_url_ttl.Decode(ttl_bry); // NOTE: must decode url-encoded entries; EX: "A%28b%29.png" -> "A(b).png"; DATE:2014-01-01
if (gplx.core.envs.Env_.Mode_testing() && wiki == null) return; // TEST: else one test will throw benign null ref exception; DATE:2017-03-01
Xoa_ttl ttl = Xoa_ttl.Parse(wiki, ttl_bry);
if ( ttl == null // invalid ttl; EX: "<invalid>"
|| ttl.Anch_bgn() == 1 // anchor-only ttl; EX: "#invalid"; DATE:2014-03-18

Loading…
Cancel
Save