From a21457539159a2e988ae77037ca4554c2beecd5d Mon Sep 17 00:00:00 2001 From: gnosygnu Date: Thu, 16 Mar 2017 10:32:51 -0400 Subject: [PATCH] Full-text search: Ignore HTML tags --- .../fulltexts/core/Xofulltext_extractor.java | 74 +++++++++++++++++++ .../core/Xofulltext_extractor__tst.java | 45 +++++++++++ .../fulltexts/core/Xofulltext_punct_.java | 20 +++++ .../bldrs/Xofulltext_indexer_wkr.java | 7 +- .../brutes/finders/Xofulltext_word_lang.java | 8 +- .../gflucenes/Xofulltext_highlighter_mgr.java | 6 +- 6 files changed, 152 insertions(+), 8 deletions(-) create mode 100644 400_xowa/src/gplx/xowa/addons/wikis/fulltexts/core/Xofulltext_extractor.java create mode 100644 400_xowa/src/gplx/xowa/addons/wikis/fulltexts/core/Xofulltext_extractor__tst.java create mode 100644 400_xowa/src/gplx/xowa/addons/wikis/fulltexts/core/Xofulltext_punct_.java diff --git a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/core/Xofulltext_extractor.java b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/core/Xofulltext_extractor.java new file mode 100644 index 000000000..a9b019b70 --- /dev/null +++ b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/core/Xofulltext_extractor.java @@ -0,0 +1,74 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012-2017 gnosygnu@gmail.com + +XOWA is licensed under the terms of the General Public License (GPL) Version 3, +or alternatively under the terms of the Apache License Version 2.0. + +You may use XOWA according to either of these licenses as is most appropriate +for your project on a case-by-case basis. 
+
+The terms of each license can be found in the source code repository:
+
+GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
+Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
+*/
+package gplx.xowa.addons.wikis.fulltexts.core; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*;
+import gplx.core.btries.*;
+import gplx.xowa.parsers.htmls.*;
+// Extracts the text of an HTML doc by joining its text nodes with single spaces.
+// Tags, attributes, comments and entities are dropped: their callbacks below are empty.
+// NOTE: an instance is stateful; text accumulates in bfr until Extract returns it.
+public class Xofulltext_extractor implements Mwh_doc_wkr {
+	private final Mwh_doc_parser doc_parser = new Mwh_doc_parser();
+	private final Bry_bfr bfr = Bry_bfr_.New();
+	private final Btrie_slim_mgr punct_trie = Btrie_slim_mgr.cs();
+	private final Btrie_rv trv = new Btrie_rv();
+	public Xofulltext_extractor() {
+		// punct which attaches to the preceding text; no space is added in front of it
+		punct_trie.Add_many_str(Xofulltext_punct_.Punct_bgn_ary);
+		punct_trie.Add_many_str("/", ")", "]", ">", "�");
+	}
+	public Hash_adp_bry Nde_regy() {return nde_regy;} private final Hash_adp_bry nde_regy = Mwh_doc_wkr_.Nde_regy__mw();
+
+	// only text is extracted; all other parser events are ignored
+	public void On_nde_head_bgn (Mwh_doc_parser mgr, byte[] src, int nde_tid, int key_bgn, int key_end) {}
+	public void On_nde_head_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end, boolean inline) {}
+	public void On_nde_tail_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {}
+	public void On_comment_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {}
+	public void On_entity_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {}
+	public void On_atr_each (Mwh_atr_parser mgr, byte[] src, int nde_tid, boolean valid, boolean repeated, boolean key_exists, byte[] key_bry, byte[] val_bry_manual, int[] itm_ary, int itm_idx) {}
+
+	// Appends the text at src[itm_bgn..itm_end) to bfr.
+	// A single space separates it from earlier text, unless it starts with punct.
+	public void On_txt_end (Mwh_doc_parser mgr, byte[] src, int nde_tid, int itm_bgn, int itm_end) {
+		// trim flanking ws
+		itm_bgn = Bry_find_.Find_fwd_while_ws(src, itm_bgn, itm_end);
+		itm_end = Bry_find_.Find_bwd__skip_ws(src, itm_end, itm_bgn);
+
+		// skip empty / ws-only text; else it still adds a space, and the next entry adds another: "a  b"
+		if (itm_bgn >= itm_end) return;
+
+		// add space between entries, except
+		// * before the 1st entry
+		// * before punct; prevents building strings like "a b. c d" -> "a b . c d"
+		if (bfr.Len_gt_0() && !Starts_with_punct(src, itm_bgn, itm_end))
+			bfr.Add_byte_space();
+
+		// add to bfr
+		bfr.Add_mid(src, itm_bgn, itm_end);
+	}
+
+	// true if text starts with punct; only the 1st position matters, so the rest of the punct run is not scanned
+	private boolean Starts_with_punct(byte[] src, int bgn, int end) {
+		return punct_trie.Match_at(trv, src, bgn, end) != null;
+	}
+
+	// Parses src as HTML and returns the extracted text; bfr is emptied for the next call.
+	// NOTE(review): if Parse throws, bfr keeps the partial text and it will lead the next
+	// result; consider discarding the instance (or clearing bfr) on the caller's error path.
+	public byte[] Extract(byte[] src) {
+		doc_parser.Parse(this, src, 0, src.length);
+		return bfr.To_bry_and_clear();
+	}
+}
diff --git a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/core/Xofulltext_extractor__tst.java b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/core/Xofulltext_extractor__tst.java
new file mode 100644
index 000000000..2bca75e26
--- /dev/null
+++ b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/core/Xofulltext_extractor__tst.java
@@ -0,0 +1,45 @@
+/*
+XOWA: the XOWA Offline Wiki Application
+Copyright (C) 2012-2017 gnosygnu@gmail.com
+
+XOWA is licensed under the terms of the General Public License (GPL) Version 3,
+or alternatively under the terms of the Apache License Version 2.0.
+
+You may use XOWA according to either of these licenses as is most appropriate
+for your project on a case-by-case basis.
+
+The terms of each license can be found in the source code repository:
+
+GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
+Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
+*/
+package gplx.xowa.addons.wikis.fulltexts.core; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*;
+import org.junit.*; import gplx.core.tests.*;
+public class Xofulltext_extractor__tst {
+	private final Xofulltext_extractor__fxt fxt = new Xofulltext_extractor__fxt();
+	@Test  public void Basic() {
+		// NOTE(review): markup below assumes <i>, <b> and <div> are nodes known to Nde_regy__mw;
+		// confirm against Mwh_doc_parser
+		// simple node
+		fxt.Test__extract("a <i>b</i> c", "a b c");
+
+		// node with attributes
+		fxt.Test__extract("a <div id=\"b\" class=\"c d e\">f</div> g", "a f g");
+
+		// nested nodes
+		fxt.Test__extract("a <b>b <i>c</i> d</b> e", "a b c d e");
+
+		// periods: no space is added before text which starts with punct
+		fxt.Test__extract("a <i>b</i>. c d", "a b. c d");
+
+		// parens: ")" attaches to the preceding text; "(" is separated like any other text
+		fxt.Test__extract("(a <i>b</i>)", "(a b)");
+		fxt.Test__extract("<i>a</i> (b)", "a (b)");
+	}
+}
+// test fixture: runs the extractor on a String and compares the result to the expected text
+class Xofulltext_extractor__fxt {
+	private final Xofulltext_extractor extractor = new Xofulltext_extractor();
+	public void Test__extract(String src, String expd) {
+		Gftest.Eq__str(expd, extractor.Extract(Bry_.new_u8(src)));
+	}
+}
diff --git a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/core/Xofulltext_punct_.java b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/core/Xofulltext_punct_.java
new file mode 100644
index 000000000..327188abf
--- /dev/null
+++ b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/core/Xofulltext_punct_.java
@@ -0,0 +1,20 @@
+/*
+XOWA: the XOWA Offline Wiki Application
+Copyright (C) 2012-2017 gnosygnu@gmail.com
+
+XOWA is licensed under the terms of the General Public License (GPL) Version 3,
+or alternatively under the terms of the Apache License Version 2.0.
+
+You may use XOWA according to either of these licenses as is most appropriate
+for your project on a case-by-case basis.
+
+The terms of each license can be found in the source code repository:
+
+GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
+Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
+*/
+package gplx.xowa.addons.wikis.fulltexts.core; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*;
+public class Xofulltext_punct_ {	// char tables shared by the fulltext word finder and the text extractor
+	public static final String[] Ws_bgn_ary = {"\t", "\n", "\r", " ", "/", "(", ")", "[", "]", "<", ">"};	// loaded into the ws trie of Xofulltext_word_lang
+	public static final String[] Punct_bgn_ary = {".", ",", "?", "!", ":", ";", "'", "\"", "-"};	// loaded into the punct tries of Xofulltext_word_lang and Xofulltext_extractor
+}
diff --git a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/bldrs/Xofulltext_indexer_wkr.java b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/bldrs/Xofulltext_indexer_wkr.java
index b64847767..ca73b3448 100644
--- a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/bldrs/Xofulltext_indexer_wkr.java
+++ b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/bldrs/Xofulltext_indexer_wkr.java
@@ -16,8 +16,10 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
 package gplx.xowa.addons.wikis.fulltexts.indexers.bldrs; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*; import gplx.xowa.addons.wikis.fulltexts.indexers.*;
 import gplx.gflucene.core.*;
 import gplx.gflucene.indexers.*;
+import gplx.xowa.addons.wikis.fulltexts.core.*;
 public class Xofulltext_indexer_wkr {
 	private final Gflucene_indexer_mgr index_wtr = new Gflucene_indexer_mgr();
+	private final Xofulltext_extractor extractor = new Xofulltext_extractor();
 	public void Init(Xow_wiki wiki) {
 		// delete existing dir
 		Io_url index_dir = Xosearch_fulltext_addon.Get_index_dir(wiki);
@@ -29,8 +31,9 @@ public class Xofulltext_indexer_wkr {
 			, index_dir.Xto_api()));
 	}
 	public void Index(Xoae_page wpg) {
-		// 
TODO: skip if not main_ns - Index(wpg.Db().Page().Id(), wpg.Db().Page().Score(), wpg.Ttl().Page_txt(), wpg.Db().Html().Html_bry()); + byte[] html = extractor.Extract(wpg.Db().Html().Html_bry()); + + Index(wpg.Db().Page().Id(), wpg.Db().Page().Score(), wpg.Ttl().Page_txt(), html); } public void Index(int page_id, int score, byte[] ttl, byte[] html) { Gflucene_doc_data data = new Gflucene_doc_data(page_id, score, String_.new_u8(ttl), String_.new_u8(html)); diff --git a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/searchers/mgrs/brutes/finders/Xofulltext_word_lang.java b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/searchers/mgrs/brutes/finders/Xofulltext_word_lang.java index 89d185f07..830289eb3 100644 --- a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/searchers/mgrs/brutes/finders/Xofulltext_word_lang.java +++ b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/searchers/mgrs/brutes/finders/Xofulltext_word_lang.java @@ -16,13 +16,11 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt package gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.brutes.finders; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*; import gplx.xowa.addons.wikis.fulltexts.searchers.*; import gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.*; import gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.brutes.*; import gplx.core.btries.*; import gplx.core.intls.*; +import gplx.xowa.addons.wikis.fulltexts.core.*; public class Xofulltext_word_lang { - private final Btrie_slim_mgr ws_bgn = Btrie_slim_mgr.cs() - .Add_many_str("\t", "\n", "\r", " ", "/", "(", ")", "[", "]", "<", ">"); + private final Btrie_slim_mgr ws_bgn = Btrie_slim_mgr.cs().Add_many_str(Xofulltext_punct_.Ws_bgn_ary); private final Btrie_slim_mgr ws_end; - private final Btrie_slim_mgr punct_bgn = Btrie_slim_mgr.cs() - .Add_many_str(".", ",", "?", "!", ":", ";", "'", "\"", "-") - ; + private final Btrie_slim_mgr punct_bgn 
= Btrie_slim_mgr.cs().Add_many_str(Xofulltext_punct_.Punct_bgn_ary); private final Btrie_slim_mgr punct_end; public Xofulltext_word_lang() { this.ws_end = ws_bgn; diff --git a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/searchers/mgrs/gflucenes/Xofulltext_highlighter_mgr.java b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/searchers/mgrs/gflucenes/Xofulltext_highlighter_mgr.java index a27a2ad20..b084f843a 100644 --- a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/searchers/mgrs/gflucenes/Xofulltext_highlighter_mgr.java +++ b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/searchers/mgrs/gflucenes/Xofulltext_highlighter_mgr.java @@ -20,6 +20,7 @@ import gplx.gflucene.searchers.*; import gplx.xowa.htmls.*; import gplx.xowa.wikis.data.tbls.*; import gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.uis.*; +import gplx.xowa.addons.wikis.fulltexts.core.*; class Xofulltext_highlighter_mgr implements Gfo_invk { private final Xofulltext_searcher_ui ui; private final Xow_wiki wiki; @@ -30,6 +31,7 @@ class Xofulltext_highlighter_mgr implements Gfo_invk { private final Xoh_page hpg = new Xoh_page(); private final Xowd_page_itm tmp_page_row = new Xowd_page_itm(); private final List_adp list; + private final Xofulltext_extractor extractor = new Xofulltext_extractor(); public Xofulltext_highlighter_mgr(Xofulltext_searcher_ui ui, Xow_wiki wiki, Xofulltext_searcher_args searcher_args, Gflucene_analyzer_data analyzer_data, Gflucene_searcher_qry searcher_data, List_adp list) { this.ui = ui; this.wiki = wiki; @@ -70,7 +72,9 @@ class Xofulltext_highlighter_mgr implements Gfo_invk { // load db.html.html wiki.Html__hdump_mgr().Load_mgr().Load_by_xowh(hpg, page_ttl, false); // don't load categories for perf reasons - item.body = String_.new_u8(hpg.Db().Html().Html_bry()); + byte[] html = hpg.Db().Html().Html_bry(); + html = extractor.Extract(html); + item.body = String_.new_u8(html); // loop pages int page_id = item.page_id;