mirror of
https://github.com/gnosygnu/xowa.git
synced 2024-10-27 20:34:16 +00:00
Full-text search: Ignore HTML tags
This commit is contained in:
parent
2270a35c83
commit
a214575391
@ -0,0 +1,74 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.xowa.addons.wikis.fulltexts.core; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*;
|
||||
import gplx.core.btries.*;
|
||||
import gplx.xowa.parsers.htmls.*;
|
||||
public class Xofulltext_extractor implements Mwh_doc_wkr {
|
||||
private final Mwh_doc_parser doc_parser = new Mwh_doc_parser();
|
||||
private final Bry_bfr bfr = Bry_bfr_.New();
|
||||
private final Btrie_slim_mgr punct_trie = Btrie_slim_mgr.cs();
|
||||
private final Btrie_rv trv = new Btrie_rv();
|
||||
public Xofulltext_extractor() {
|
||||
punct_trie.Add_many_str(Xofulltext_punct_.Punct_bgn_ary);
|
||||
punct_trie.Add_many_str("/", ")", "]", ">", "<EFBFBD>");
|
||||
}
|
||||
public Hash_adp_bry Nde_regy() {return nde_regy;} private final Hash_adp_bry nde_regy = Mwh_doc_wkr_.Nde_regy__mw();
|
||||
public void On_nde_head_bgn (Mwh_doc_parser mgr, byte[] src, int nde_tid, int key_bgn, int key_end) {}
|
||||
public void On_nde_head_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end, boolean inline) {}
|
||||
public void On_nde_tail_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {}
|
||||
public void On_comment_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {}
|
||||
public void On_entity_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {}
|
||||
public void On_atr_each (Mwh_atr_parser mgr, byte[] src, int nde_tid, boolean valid, boolean repeated, boolean key_exists, byte[] key_bry, byte[] val_bry_manual, int[] itm_ary, int itm_idx) {}
|
||||
public void On_txt_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {
|
||||
// trim flanking ws
|
||||
itm_bgn = Bry_find_.Find_fwd_while_ws(src, itm_bgn, itm_end);
|
||||
itm_end = Bry_find_.Find_bwd__skip_ws(src, itm_end, itm_bgn);
|
||||
|
||||
// add ws between entries
|
||||
if (bfr.Len_gt_0()) { // ignore if 1st entry
|
||||
// identify punct at start of String
|
||||
int punct_end = itm_bgn;
|
||||
while (true) {
|
||||
// exit if at end
|
||||
if (punct_end >= itm_end) break;
|
||||
|
||||
// check if punct
|
||||
Object o = punct_trie.Match_at(trv, src, punct_end, itm_end);
|
||||
|
||||
// b is not punct; exit
|
||||
if (o == null) {
|
||||
break;
|
||||
}
|
||||
// b is punct; keep going
|
||||
else {
|
||||
punct_end++;
|
||||
}
|
||||
}
|
||||
|
||||
// only add space if no punct at start; prevents building strings like "a <i>b</i>. c d" -> "a b . c d"
|
||||
if (itm_bgn == punct_end)
|
||||
bfr.Add_byte_space();
|
||||
}
|
||||
|
||||
// add to bfr
|
||||
bfr.Add_mid(src, itm_bgn, itm_end);
|
||||
}
|
||||
|
||||
public byte[] Extract(byte[] src) {
|
||||
doc_parser.Parse(this, src, 0, src.length);
|
||||
return bfr.To_bry_and_clear();
|
||||
}
|
||||
}
|
@ -0,0 +1,45 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.xowa.addons.wikis.fulltexts.core; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*;
|
||||
import org.junit.*; import gplx.core.tests.*;
|
||||
public class Xofulltext_extractor__tst {
|
||||
private final Xofulltext_extractor__fxt fxt = new Xofulltext_extractor__fxt();
|
||||
@Test public void Basic() {
|
||||
// simple node
|
||||
fxt.Test__extract("a <i>b</i> c", "a b c");
|
||||
|
||||
// node with attributes
|
||||
fxt.Test__extract("a <a href='b.html' caption='c d e'>f</a> g", "a f g");
|
||||
|
||||
// nested nodes
|
||||
fxt.Test__extract("a <b>b <i>c</i> d</b> e", "a b c d e");
|
||||
|
||||
// periods
|
||||
fxt.Test__extract("a <b>b</b>. c d", "a b. c d");
|
||||
|
||||
// parens
|
||||
fxt.Test__extract("(a <b>b</b>)", "(a b)");
|
||||
|
||||
// parens
|
||||
fxt.Test__extract("<b>a</b> (b)", "a (b)");
|
||||
}
|
||||
}
|
||||
class Xofulltext_extractor__fxt {
|
||||
private final Xofulltext_extractor extractor = new Xofulltext_extractor();
|
||||
public void Test__extract(String src, String expd) {
|
||||
Gftest.Eq__str(expd, extractor.Extract(Bry_.new_u8(src)));
|
||||
}
|
||||
}
|
@ -0,0 +1,20 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.xowa.addons.wikis.fulltexts.core; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*;
|
||||
/**
 * String tables shared by the full-text classes, so that the word splitter and the text extractor
 * register the same characters in their tries.
 */
public class Xofulltext_punct_ {
	/** Strings registered in the ws_bgn trie of Xofulltext_word_lang: whitespace chars, slash and brackets. */
	public static final String[] Ws_bgn_ary = new String[]
	{ "\t", "\n", "\r", " "
	, "/"
	, "(", ")", "[", "]", "<", ">"
	};

	/** Strings registered in the punct tries of Xofulltext_word_lang and Xofulltext_extractor. */
	public static final String[] Punct_bgn_ary = new String[]
	{ ".", ",", "?", "!", ":", ";"
	, "'", "\""
	, "-"
	};
}
|
@ -16,8 +16,10 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
package gplx.xowa.addons.wikis.fulltexts.indexers.bldrs; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*; import gplx.xowa.addons.wikis.fulltexts.indexers.*;
|
||||
import gplx.gflucene.core.*;
|
||||
import gplx.gflucene.indexers.*;
|
||||
import gplx.xowa.addons.wikis.fulltexts.core.*;
|
||||
public class Xofulltext_indexer_wkr {
|
||||
private final Gflucene_indexer_mgr index_wtr = new Gflucene_indexer_mgr();
|
||||
private final Xofulltext_extractor extractor = new Xofulltext_extractor();
|
||||
public void Init(Xow_wiki wiki) {
|
||||
// delete existing dir
|
||||
Io_url index_dir = Xosearch_fulltext_addon.Get_index_dir(wiki);
|
||||
@ -29,8 +31,9 @@ public class Xofulltext_indexer_wkr {
|
||||
, index_dir.Xto_api()));
|
||||
}
|
||||
public void Index(Xoae_page wpg) {
|
||||
// TODO: skip if not main_ns
|
||||
Index(wpg.Db().Page().Id(), wpg.Db().Page().Score(), wpg.Ttl().Page_txt(), wpg.Db().Html().Html_bry());
|
||||
byte[] html = extractor.Extract(wpg.Db().Html().Html_bry());
|
||||
|
||||
Index(wpg.Db().Page().Id(), wpg.Db().Page().Score(), wpg.Ttl().Page_txt(), html);
|
||||
}
|
||||
public void Index(int page_id, int score, byte[] ttl, byte[] html) {
|
||||
Gflucene_doc_data data = new Gflucene_doc_data(page_id, score, String_.new_u8(ttl), String_.new_u8(html));
|
||||
|
@ -16,13 +16,11 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
package gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.brutes.finders; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*; import gplx.xowa.addons.wikis.fulltexts.searchers.*; import gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.*; import gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.brutes.*;
|
||||
import gplx.core.btries.*;
|
||||
import gplx.core.intls.*;
|
||||
import gplx.xowa.addons.wikis.fulltexts.core.*;
|
||||
public class Xofulltext_word_lang {
|
||||
private final Btrie_slim_mgr ws_bgn = Btrie_slim_mgr.cs()
|
||||
.Add_many_str("\t", "\n", "\r", " ", "/", "(", ")", "[", "]", "<", ">");
|
||||
private final Btrie_slim_mgr ws_bgn = Btrie_slim_mgr.cs().Add_many_str(Xofulltext_punct_.Ws_bgn_ary);
|
||||
private final Btrie_slim_mgr ws_end;
|
||||
private final Btrie_slim_mgr punct_bgn = Btrie_slim_mgr.cs()
|
||||
.Add_many_str(".", ",", "?", "!", ":", ";", "'", "\"", "-")
|
||||
;
|
||||
private final Btrie_slim_mgr punct_bgn = Btrie_slim_mgr.cs().Add_many_str(Xofulltext_punct_.Punct_bgn_ary);
|
||||
private final Btrie_slim_mgr punct_end;
|
||||
public Xofulltext_word_lang() {
|
||||
this.ws_end = ws_bgn;
|
||||
|
@ -20,6 +20,7 @@ import gplx.gflucene.searchers.*;
|
||||
import gplx.xowa.htmls.*;
|
||||
import gplx.xowa.wikis.data.tbls.*;
|
||||
import gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.uis.*;
|
||||
import gplx.xowa.addons.wikis.fulltexts.core.*;
|
||||
class Xofulltext_highlighter_mgr implements Gfo_invk {
|
||||
private final Xofulltext_searcher_ui ui;
|
||||
private final Xow_wiki wiki;
|
||||
@ -30,6 +31,7 @@ class Xofulltext_highlighter_mgr implements Gfo_invk {
|
||||
private final Xoh_page hpg = new Xoh_page();
|
||||
private final Xowd_page_itm tmp_page_row = new Xowd_page_itm();
|
||||
private final List_adp list;
|
||||
private final Xofulltext_extractor extractor = new Xofulltext_extractor();
|
||||
public Xofulltext_highlighter_mgr(Xofulltext_searcher_ui ui, Xow_wiki wiki, Xofulltext_searcher_args searcher_args, Gflucene_analyzer_data analyzer_data, Gflucene_searcher_qry searcher_data, List_adp list) {
|
||||
this.ui = ui;
|
||||
this.wiki = wiki;
|
||||
@ -70,7 +72,9 @@ class Xofulltext_highlighter_mgr implements Gfo_invk {
|
||||
|
||||
// load db.html.html
|
||||
wiki.Html__hdump_mgr().Load_mgr().Load_by_xowh(hpg, page_ttl, false); // don't load categories for perf reasons
|
||||
item.body = String_.new_u8(hpg.Db().Html().Html_bry());
|
||||
byte[] html = hpg.Db().Html().Html_bry();
|
||||
html = extractor.Extract(html);
|
||||
item.body = String_.new_u8(html);
|
||||
|
||||
// loop pages
|
||||
int page_id = item.page_id;
|
||||
|
Loading…
Reference in New Issue
Block a user