1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2024-10-27 20:34:16 +00:00

Full-text search: Ignore HTML tags

This commit is contained in:
gnosygnu 2017-03-16 10:32:51 -04:00
parent 2270a35c83
commit a214575391
6 changed files with 152 additions and 8 deletions

View File

@ -0,0 +1,74 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.addons.wikis.fulltexts.core; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*;
import gplx.core.btries.*;
import gplx.xowa.parsers.htmls.*;
/**
 * Reduces an HTML document to its text content for full-text search.
 *
 * Acts as the Mwh_doc_wkr callback for Mwh_doc_parser: node heads, node tails, comments,
 * entities and attributes are ignored (their handlers are empty) and only text runs are
 * appended to an internal buffer, separated by single spaces.
 *
 * One buffer and one trie-result object are reused across calls, so a single instance
 * should not be shared by concurrent callers.
 */
public class Xofulltext_extractor implements Mwh_doc_wkr {
  private final Mwh_doc_parser doc_parser = new Mwh_doc_parser();
  private final Bry_bfr bfr = Bry_bfr_.New();                      // accumulates extracted text for the current Extract call
  private final Btrie_slim_mgr punct_trie = Btrie_slim_mgr.cs();   // strings which, when leading a text run, suppress the separator space
  private final Btrie_rv trv = new Btrie_rv();                     // reusable result holder passed to punct_trie.Match_at
  public Xofulltext_extractor() {
    punct_trie.Add_many_str(Xofulltext_punct_.Punct_bgn_ary);
    // closing / joining symbols that should also attach to the preceding text
    // NOTE(review): the last literal looks like a mis-encoded character in this view; confirm the intended symbol against the repository
    punct_trie.Add_many_str("/", ")", "]", ">", "<EFBFBD>");
  }
  public Hash_adp_bry Nde_regy() {return nde_regy;} private final Hash_adp_bry nde_regy = Mwh_doc_wkr_.Nde_regy__mw();

  // markup callbacks: intentionally empty b/c only text content is extracted
  public void On_nde_head_bgn (Mwh_doc_parser mgr, byte[] src, int nde_tid, int key_bgn, int key_end) {}
  public void On_nde_head_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end, boolean inline) {}
  public void On_nde_tail_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {}
  public void On_comment_end  (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {}
  public void On_entity_end   (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {}
  public void On_atr_each     (Mwh_atr_parser mgr, byte[] src, int nde_tid, boolean valid, boolean repeated, boolean key_exists, byte[] key_bry, byte[] val_bry_manual, int[] itm_ary, int itm_idx) {}

  /**
   * Appends one text run (src[itm_bgn..itm_end)) to the buffer.
   *
   * Flanking whitespace is trimmed. A single space is inserted before the run unless it is
   * the first entry or it starts with punctuation; the latter prevents building strings
   * like "a <i>b</i>. c d" -> "a b . c d".
   */
  public void On_txt_end      (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {
    // trim flanking ws
    itm_bgn = Bry_find_.Find_fwd_while_ws(src, itm_bgn, itm_end);
    itm_end = Bry_find_.Find_bwd__skip_ws(src, itm_end, itm_bgn);

    // nothing left after trimming (EX: the " " in "</b> <i>"); exit now, else a separator is added
    // for the empty run and another one for the next run, which doubles the space
    if (itm_bgn >= itm_end) return;

    // add ws between entries, but not before the 1st entry and not before leading punct
    if (bfr.Len_gt_0() && !Starts_with_punct(src, itm_bgn, itm_end))
      bfr.Add_byte_space();

    // add to bfr
    bfr.Add_mid(src, itm_bgn, itm_end);
  }

  // Returns true if a registered punct String matches at bgn; only the start of the run decides whether a separator is needed
  private boolean Starts_with_punct(byte[] src, int bgn, int end) {
    return punct_trie.Match_at(trv, src, bgn, end) != null;
  }

  /**
   * Parses src with this instance as the callback and returns the collected text.
   * The internal buffer is emptied as part of returning the result.
   */
  public byte[] Extract(byte[] src) {
    doc_parser.Parse(this, src, 0, src.length);
    return bfr.To_bry_and_clear();
  }
}

View File

@ -0,0 +1,45 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.addons.wikis.fulltexts.core; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*;
import org.junit.*; import gplx.core.tests.*;
/** Tests that Xofulltext_extractor drops HTML markup and spaces the remaining text sensibly. */
public class Xofulltext_extractor__tst {
  private final Xofulltext_extractor__fxt fxt = new Xofulltext_extractor__fxt();
  @Test public void Basic() {
    // each row is {html source, expected extracted text}; rows are checked in the order listed
    String[][] rows = new String[][]
    { {"a <i>b</i> c"                                , "a b c"}     // simple node
    , {"a <a href='b.html' caption='c d e'>f</a> g"  , "a f g"}     // node with attributes
    , {"a <b>b <i>c</i> d</b> e"                     , "a b c d e"} // nested nodes
    , {"a <b>b</b>. c d"                             , "a b. c d"}  // period directly after a node
    , {"(a <b>b</b>)"                                , "(a b)"}     // closing paren directly after a node
    , {"<b>a</b> (b)"                                , "a (b)"}     // opening paren after a node and a space
    };
    for (int i = 0; i < rows.length; i++) {
      String[] row = rows[i];
      fxt.Test__extract(row[0], row[1]);
    }
  }
}
/** Test fixture: owns one Xofulltext_extractor and compares its output against an expected String. */
class Xofulltext_extractor__fxt {
  private final Xofulltext_extractor extractor = new Xofulltext_extractor();
  /**
   * Converts src to bytes with Bry_.new_u8, runs it through the extractor, and checks the
   * result against expd with Gftest.Eq__str.
   */
  public void Test__extract(String src, String expd) {
    byte[] src_bry = Bry_.new_u8(src);
    byte[] actl_bry = extractor.Extract(src_bry);
    Gftest.Eq__str(expd, actl_bry);
  }
}

View File

@ -0,0 +1,20 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.addons.wikis.fulltexts.core; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*;
/**
 * Shared String tables for the full-text code, so that the searcher's word logic
 * (Xofulltext_word_lang) and the HTML text extractor (Xofulltext_extractor) load the same symbols.
 * The arrays are handed out directly, not copied; callers are expected to treat them as read-only.
 */
public class Xofulltext_punct_ {
  /** Whitespace-like separators: tab, newline, carriage return, space, plus slash, parens, brackets and angle brackets. */
  public static final String[] Ws_bgn_ary =
  { "\t", "\n", "\r", " "
  , "/", "(", ")", "[", "]", "<", ">"
  };
  /** Punctuation symbols: period, comma, question mark, exclamation mark, colon, semicolon, apostrophe, double quote, hyphen. */
  public static final String[] Punct_bgn_ary =
  { ".", ",", "?", "!", ":", ";"
  , "'", "\"", "-"
  };
}

View File

@ -16,8 +16,10 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
package gplx.xowa.addons.wikis.fulltexts.indexers.bldrs; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*; import gplx.xowa.addons.wikis.fulltexts.indexers.*;
import gplx.gflucene.core.*;
import gplx.gflucene.indexers.*;
import gplx.xowa.addons.wikis.fulltexts.core.*;
public class Xofulltext_indexer_wkr {
private final Gflucene_indexer_mgr index_wtr = new Gflucene_indexer_mgr();
private final Xofulltext_extractor extractor = new Xofulltext_extractor();
public void Init(Xow_wiki wiki) {
// delete existing dir
Io_url index_dir = Xosearch_fulltext_addon.Get_index_dir(wiki);
@ -29,8 +31,9 @@ public class Xofulltext_indexer_wkr {
, index_dir.Xto_api()));
}
// Indexes one page: reads the page id, score, title text and stored HTML from wpg
// and forwards them to Index(int, int, byte[], byte[]).
// NOTE(review): this listing is a commit diff whose +/- markers are not shown; the first Index(...) call
// below (passing the raw Html_bry) appears to be the removed line, and the two statements after it
// appear to be its replacement -- confirm against the repository before treating both calls as live code.
public void Index(Xoae_page wpg) {
// TODO: skip if not main_ns
Index(wpg.Db().Page().Id(), wpg.Db().Page().Score(), wpg.Ttl().Page_txt(), wpg.Db().Html().Html_bry());
// run the stored HTML through the extractor so that the text passed on has its tags removed
byte[] html = extractor.Extract(wpg.Db().Html().Html_bry());
Index(wpg.Db().Page().Id(), wpg.Db().Page().Score(), wpg.Ttl().Page_txt(), html);
}
public void Index(int page_id, int score, byte[] ttl, byte[] html) {
Gflucene_doc_data data = new Gflucene_doc_data(page_id, score, String_.new_u8(ttl), String_.new_u8(html));

View File

@ -16,13 +16,11 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
package gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.brutes.finders; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*; import gplx.xowa.addons.wikis.fulltexts.searchers.*; import gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.*; import gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.brutes.*;
import gplx.core.btries.*;
import gplx.core.intls.*;
import gplx.xowa.addons.wikis.fulltexts.core.*;
public class Xofulltext_word_lang {
private final Btrie_slim_mgr ws_bgn = Btrie_slim_mgr.cs()
.Add_many_str("\t", "\n", "\r", " ", "/", "(", ")", "[", "]", "<", ">");
private final Btrie_slim_mgr ws_bgn = Btrie_slim_mgr.cs().Add_many_str(Xofulltext_punct_.Ws_bgn_ary);
private final Btrie_slim_mgr ws_end;
private final Btrie_slim_mgr punct_bgn = Btrie_slim_mgr.cs()
.Add_many_str(".", ",", "?", "!", ":", ";", "'", "\"", "-")
;
private final Btrie_slim_mgr punct_bgn = Btrie_slim_mgr.cs().Add_many_str(Xofulltext_punct_.Punct_bgn_ary);
private final Btrie_slim_mgr punct_end;
public Xofulltext_word_lang() {
this.ws_end = ws_bgn;

View File

@ -20,6 +20,7 @@ import gplx.gflucene.searchers.*;
import gplx.xowa.htmls.*;
import gplx.xowa.wikis.data.tbls.*;
import gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.uis.*;
import gplx.xowa.addons.wikis.fulltexts.core.*;
class Xofulltext_highlighter_mgr implements Gfo_invk {
private final Xofulltext_searcher_ui ui;
private final Xow_wiki wiki;
@ -30,6 +31,7 @@ class Xofulltext_highlighter_mgr implements Gfo_invk {
private final Xoh_page hpg = new Xoh_page();
private final Xowd_page_itm tmp_page_row = new Xowd_page_itm();
private final List_adp list;
private final Xofulltext_extractor extractor = new Xofulltext_extractor();
public Xofulltext_highlighter_mgr(Xofulltext_searcher_ui ui, Xow_wiki wiki, Xofulltext_searcher_args searcher_args, Gflucene_analyzer_data analyzer_data, Gflucene_searcher_qry searcher_data, List_adp list) {
this.ui = ui;
this.wiki = wiki;
@ -70,7 +72,9 @@ class Xofulltext_highlighter_mgr implements Gfo_invk {
// load db.html.html
wiki.Html__hdump_mgr().Load_mgr().Load_by_xowh(hpg, page_ttl, false); // don't load categories for perf reasons
item.body = String_.new_u8(hpg.Db().Html().Html_bry());
byte[] html = hpg.Db().Html().Html_bry();
html = extractor.Extract(html);
item.body = String_.new_u8(html);
// loop pages
int page_id = item.page_id;