From 8f8e414c8072749cb723bdeb4c24b4eb8077715a Mon Sep 17 00:00:00 2001
From: gnosygnu
Date: Wed, 15 Mar 2017 13:11:09 -0400
Subject: [PATCH] Full-text search: Add highlighter

---
Review notes (free text after the "---" separator; outside every hunk):
 * Xofulltext_searcher__lucene.Search now keeps the analyzer data and the
   query object in locals and hands the same instances, together with the
   result list, to a new Xofulltext_highlighter_mgr. That object is started
   on a thread keyed "highlighter" after all pages have been sent to the ui.
 * Xofulltext_highlighter_mgr loads each hit's html, stores it in item.body,
   and asks Gflucene_highlighter_mgr.Exec for fragments (10 is passed as the
   fragment limit); each fragment is sent through ui.Send_line_add.
 * Xofulltext_searcher_svc: the two Get_*_app_or calls are replaced with
   Set_int_app / Set_str_app, matching the Set_bool_app calls above them.
 * Gflucene_highlighter_mgr.Exec has two catch blocks with identical bodies,
   and many of the file's imports are not referenced by the class body.
 * NOTE(review): in this copy the "From:" address, both SimpleHTMLFormatter
   arguments, and every line of the .classpath hunk are empty. Text written
   in angle brackets appears to have been stripped; confirm against the
   repository before applying.
 * NOTE(review): indentation is shown as tabs and blank lines were placed
   from the hunk line counts; confirm whitespace against the target files.

 .../gflucenes/Xofulltext_highlighter_mgr.java | 87 ++++++++++++++++++
 .../Xofulltext_searcher__lucene.java | 19 ++--
 .../svcs/Xofulltext_searcher_svc.java | 4 +-
 gplx.gflucene/.classpath | 10 +--
 .../Gflucene_highlighter_item.java | 24 +++++
 .../Gflucene_highlighter_mgr.java | 90 +++++++++++++++++++
 6 files changed, 221 insertions(+), 13 deletions(-)
 create mode 100644 400_xowa/src/gplx/xowa/addons/wikis/fulltexts/searchers/mgrs/gflucenes/Xofulltext_highlighter_mgr.java
 create mode 100644 gplx.gflucene/src/gplx/gflucene/highlighters/Gflucene_highlighter_item.java
 create mode 100644 gplx.gflucene/src/gplx/gflucene/highlighters/Gflucene_highlighter_mgr.java

diff --git a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/searchers/mgrs/gflucenes/Xofulltext_highlighter_mgr.java b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/searchers/mgrs/gflucenes/Xofulltext_highlighter_mgr.java
new file mode 100644
index 000000000..735ff140b
--- /dev/null
+++ b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/searchers/mgrs/gflucenes/Xofulltext_highlighter_mgr.java
@@ -0,0 +1,87 @@
+/*
+XOWA: the XOWA Offline Wiki Application
+Copyright (C) 2012-2017 gnosygnu@gmail.com
+
+XOWA is licensed under the terms of the General Public License (GPL) Version 3,
+or alternatively under the terms of the Apache License Version 2.0.
+
+You may use XOWA according to either of these licenses as is most appropriate
+for your project on a case-by-case basis.
+
+The terms of each license can be found in the source code repository:
+
+GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
+Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
+*/
+package gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.gflucenes; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*; import gplx.xowa.addons.wikis.fulltexts.searchers.*; import gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.*;
+import gplx.gflucene.core.*;
+import gplx.gflucene.highlighters.*;
+import gplx.gflucene.searchers.*;
+import gplx.xowa.htmls.*;
+import gplx.xowa.wikis.data.tbls.*;
+import gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.uis.*;
+class Xofulltext_highlighter_mgr implements Gfo_invk { // highlights the search hits in list and sends the resulting lines to the ui; entry point is Invk with Invk__highlight
+	private final Xofulltext_searcher_ui ui;
+	private final Xow_wiki wiki;
+	private final String wiki_domain;
+	private final List_adp list;
+	private final Gflucene_analyzer_data analyzer_data;
+	private final Gflucene_searcher_qry searcher_data;
+	private final Gflucene_highlighter_mgr highlighter_mgr = new Gflucene_highlighter_mgr();
+	private final Xoh_page hpg = new Xoh_page(); // one instance reused for every item; Highlight_item calls Ctor_by_hview on it each time
+	private final Xowd_page_itm tmp_page_row = new Xowd_page_itm(); // one instance reused for every item; passed to Select_by_id in Highlight_item
+	public Xofulltext_highlighter_mgr(Xofulltext_searcher_ui ui, Xow_wiki wiki, Gflucene_analyzer_data analyzer_data, Gflucene_searcher_qry searcher_data, List_adp list) {
+		this.ui = ui;
+		this.wiki = wiki;
+		this.wiki_domain = wiki.Domain_str();
+		this.analyzer_data = analyzer_data;
+		this.searcher_data = searcher_data;
+		this.list = list;
+	}
+	private void Highlight_list() { // highlights every item in list; an exception on one item is logged as a warning and the loop moves on to the next item
+		// init highlighter
+		highlighter_mgr.Init(new Gflucene_index_data(analyzer_data, "")); // NOTE: index_dir not needed for highlighter
+
+		// loop items
+		int len = list.Len();
+		for (int i = 0; i < len; i++) {
+			Gflucene_doc_data item = (Gflucene_doc_data)list.Get_at(i);
+			try {
+				Highlight_item(item);
+			} catch (Exception e) {
+				Gfo_usr_dlg_.Instance.Warn_many("", "", "search.highlight: failed to highlight lines in page; page=~{0} err=~{1}", item.page_id, Err_.Message_gplx_log(e));
+			}
+		}
+
+		// term highlighter
+		highlighter_mgr.Term();
+	}
+	private void Highlight_item(Gflucene_doc_data item) { // loads the page row and html for item, stores the html text in item.body, then sends one line per highlighter fragment to the ui
+		// load db.core.page
+		if (!wiki.Data__core_mgr().Db__core().Tbl__page().Select_by_id(tmp_page_row, item.page_id)) {
+			Gfo_usr_dlg_.Instance.Warn_many("", "", "search.highlight: could not find page; page_id=~{0}", item.page_id);
+			return;
+		}
+
+		// init hpg
+		Xoa_ttl page_ttl = wiki.Ttl_parse(tmp_page_row.Ns_id(), tmp_page_row.Ttl_page_db());
+		Xoa_url page_url = wiki.Utl__url_parser().Parse(page_ttl.Page_db());
+		hpg.Ctor_by_hview(wiki, page_url, page_ttl, item.page_id);
+
+		// load db.html.html
+		wiki.Html__hdump_mgr().Load_mgr().Load_by_xowh(hpg, page_ttl, false); // don't load categories for perf reasons
+		item.body = String_.new_u8(hpg.Db().Html().Html_bry());
+
+		// loop highlighted lines and send each one to the ui
+		int page_id = item.page_id;
+		Gflucene_highlighter_item[] lines = highlighter_mgr.Exec(searcher_data, item);
+		for (Gflucene_highlighter_item line : lines) {
+			ui.Send_line_add(new Xofulltext_searcher_line(wiki_domain, page_id, line.num, line.text));
+		}
+	}
+	public Object Invk(GfsCtx ctx, int ikey, String k, GfoMsg m) { // only Invk__highlight is handled; any other key returns Gfo_invk_.Rv_unhandled
+		if (ctx.Match(k, Invk__highlight)) this.Highlight_list();
+		else return Gfo_invk_.Rv_unhandled;
+		return this;
+	} public static final String Invk__highlight = "highlight";
+}
diff --git a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/searchers/mgrs/gflucenes/Xofulltext_searcher__lucene.java b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/searchers/mgrs/gflucenes/Xofulltext_searcher__lucene.java
index 522f5d0da..cad1bc857 100644
--- a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/searchers/mgrs/gflucenes/Xofulltext_searcher__lucene.java
+++ b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/searchers/mgrs/gflucenes/Xofulltext_searcher__lucene.java
@@ -18,32 +18,39 @@ import gplx.gflucene.*;
 import gplx.gflucene.core.*;
 import gplx.gflucene.indexers.*;
 import gplx.gflucene.searchers.*;
+import gplx.gflucene.highlighters.*;
 import gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.uis.*;
 public class Xofulltext_searcher__lucene implements Xofulltext_searcher {
 	private final Gflucene_searcher_mgr searcher = new Gflucene_searcher_mgr();
-	public void Search(Xofulltext_searcher_ui cbk, Xow_wiki wiki, Xofulltext_searcher_args args) {
+	public void Search(Xofulltext_searcher_ui ui, Xow_wiki wiki, Xofulltext_searcher_args args) {
 		// create list
 		List_adp list = List_adp_.New();
 
 		// init searcher with wiki
+		Gflucene_analyzer_data analyzer_data = Gflucene_analyzer_data.New_data_from_locale(wiki.Lang().Key_str()); // kept in a local so the same instance is also passed to the highlighter below
 		searcher.Init(new Gflucene_index_data
-			( Gflucene_analyzer_data.New_data_from_locale(wiki.Lang().Key_str())
+			( analyzer_data
 			, wiki.Fsys_mgr().Root_dir().GenSubDir_nest("data", "search").Xto_api()));
 
 		// exec search
-		searcher.Exec(list, new Gflucene_searcher_qry(String_.new_u8(args.query), args.max_pages_per_wiki));
+		Gflucene_searcher_qry searcher_data = new Gflucene_searcher_qry(String_.new_u8(args.query), args.max_pages_per_wiki); // kept in a local so the same query is also passed to the highlighter below
+		searcher.Exec(list, searcher_data);
 
 		// term
 		searcher.Term();
 
-		// loop list
+		// loop list and send pages
 		int len = list.Len();
 		for (int i = 0; i < len; i++) {
 			Gflucene_doc_data found = (Gflucene_doc_data)list.Get_at(i);
 
 			// call page found
-			Xofulltext_searcher_page page = new Xofulltext_searcher_page(args.query_id, wiki.Domain_str(), found.page_id, found.title, false);
-			cbk.Send_page_add(page);
+			Xofulltext_searcher_page page = new Xofulltext_searcher_page(args.query_id, wiki.Domain_str(), found.page_id, found.title, args.expand_matches_section);
+			ui.Send_page_add(page);
 		}
+
+		// create highlighter thread and launch it
+		Xofulltext_highlighter_mgr highlighter_mgr = new Xofulltext_highlighter_mgr(ui, wiki, analyzer_data, searcher_data, list);
+		gplx.core.threads.Thread_adp_.Start_by_key("highlighter", Cancelable_.Never, highlighter_mgr, Xofulltext_highlighter_mgr.Invk__highlight);
 	}
 }
diff --git a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/searchers/svcs/Xofulltext_searcher_svc.java b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/searchers/svcs/Xofulltext_searcher_svc.java
index 22c2374ae..d9b9ef542 100644
--- a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/searchers/svcs/Xofulltext_searcher_svc.java
+++ b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/searchers/svcs/Xofulltext_searcher_svc.java
@@ -52,8 +52,8 @@ class Xofulltext_searcher_svc implements Gfo_invk {
 			cfg_mgr.Set_bool_app("xowa.addon.search.fulltext.special.auto_wildcard_end", search_args.auto_wildcard_end);
 			cfg_mgr.Set_bool_app("xowa.addon.search.fulltext.special.expand_matches_section", search_args.expand_matches_section);
 			cfg_mgr.Set_bool_app("xowa.addon.search.fulltext.special.show_all_matches", search_args.show_all_matches);
-			cfg_mgr.Get_int_app_or ("xowa.addon.search.fulltext.special.max_pages_per_wiki", search_args.max_pages_per_wiki);
-			cfg_mgr.Get_str_app_or ("xowa.addon.search.fulltext.special.namespaces", search_args.namespaces);
+			cfg_mgr.Set_int_app ("xowa.addon.search.fulltext.special.max_pages_per_wiki", search_args.max_pages_per_wiki);
+			cfg_mgr.Set_str_app ("xowa.addon.search.fulltext.special.namespaces", search_args.namespaces);
 		}
 
 		// launch thread
diff --git a/gplx.gflucene/.classpath b/gplx.gflucene/.classpath
index 0748ab713..4d90c328f 100644
--- a/gplx.gflucene/.classpath
+++ b/gplx.gflucene/.classpath
@@ -3,10 +3,10 @@
-
-
-
-
-
+
+
+
+
+
diff --git a/gplx.gflucene/src/gplx/gflucene/highlighters/Gflucene_highlighter_item.java b/gplx.gflucene/src/gplx/gflucene/highlighters/Gflucene_highlighter_item.java
new file mode 100644
index 000000000..20598300d
--- /dev/null
+++ b/gplx.gflucene/src/gplx/gflucene/highlighters/Gflucene_highlighter_item.java
@@ -0,0 +1,24 @@
+/*
+XOWA: the XOWA Offline Wiki Application
+Copyright (C) 2012-2017 gnosygnu@gmail.com
+
+XOWA is licensed under the terms of the General Public License (GPL) Version 3,
+or alternatively under the terms of the Apache License Version 2.0.
+
+You may use XOWA according to either of these licenses as is most appropriate
+for your project on a case-by-case basis.
+
+The terms of each license can be found in the source code repository:
+
+GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
+Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
+*/
+package gplx.gflucene.highlighters; import gplx.*; import gplx.gflucene.*;
+public class Gflucene_highlighter_item { // one highlighted fragment: num is the index assigned by Gflucene_highlighter_mgr.Exec, text is the fragment string
+	public int num;
+	public String text;
+	public Gflucene_highlighter_item(int num, String text) {
+		this.num = num;
+		this.text = text;
+	}
+}
diff --git a/gplx.gflucene/src/gplx/gflucene/highlighters/Gflucene_highlighter_mgr.java b/gplx.gflucene/src/gplx/gflucene/highlighters/Gflucene_highlighter_mgr.java
new file mode 100644
index 000000000..d11149392
--- /dev/null
+++ b/gplx.gflucene/src/gplx/gflucene/highlighters/Gflucene_highlighter_mgr.java
@@ -0,0 +1,90 @@
+/*
+XOWA: the XOWA Offline Wiki Application
+Copyright (C) 2012-2017 gnosygnu@gmail.com
+
+XOWA is licensed under the terms of the General Public License (GPL) Version 3,
+or alternatively under the terms of the Apache License Version 2.0.
+
+You may use XOWA according to either of these licenses as is most appropriate
+for your project on a case-by-case basis.
+
+The terms of each license can be found in the source code repository:
+
+GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
+Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
+*/
+package gplx.gflucene.highlighters; import gplx.*; import gplx.gflucene.*;
+import gplx.gflucene.core.*;
+import gplx.gflucene.analyzers.*;
+import gplx.gflucene.searchers.*;
+import java.io.IOException;
+import java.nio.file.Paths;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.classic.QueryParser;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.highlight.Formatter;
+import org.apache.lucene.search.highlight.Highlighter;
+import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
+import org.apache.lucene.search.highlight.QueryScorer;
+import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
+import org.apache.lucene.search.highlight.TextFragment;
+import org.apache.lucene.search.highlight.TokenSources;
+import org.apache.lucene.store.FSDirectory;
+public class Gflucene_highlighter_mgr { // wraps Lucene's Highlighter: Init sets the analyzer, Exec returns the fragments for one document
+	private Analyzer analyzer; // assigned in Init; read by Exec
+
+	public Gflucene_highlighter_mgr() {
+	}
+
+	public void Init(Gflucene_index_data idx_data) { // creates the analyzer from idx_data.analyzer_data.key
+		this.analyzer = Gflucene_analyzer_mgr_.New_analyzer(idx_data.analyzer_data.key);
+	}
+	public Gflucene_highlighter_item[] Exec(Gflucene_searcher_qry qry_data, Gflucene_doc_data doc_data) { // parses qry_data.query against field "body" and returns the highlighter's fragments of doc_data.body; parse and highlight failures are rethrown via Err_.new_exc
+		// create query
+		QueryParser parser = new QueryParser("body", analyzer);
+		Query query = null;
+		try {
+			query = parser.parse(qry_data.query);
+		} catch (ParseException e) {
+			throw Err_.new_exc(e, "lucene_index", "failed to parse", "query", qry_data.query);
+		}
+
+		// create highlighter
+		SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter("", ""); // NOTE(review): both tag arguments are empty strings in this copy; confirm the intended pre / post tags
+		Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));
+
+		// get token stream
+		String text = doc_data.body;
+		TokenStream tokenStream = analyzer.tokenStream("body", text);
+
+		// get fragments from stream
+		String[] frags;
+		try {
+			frags = highlighter.getBestFragments(tokenStream, text, 10); // 10 is passed as the fragment limit
+		} catch (IOException e) {
+			throw Err_.new_exc(e, "lucene_index", "failed to get best", "query", qry_data.query);
+		} catch (InvalidTokenOffsetsException e) {
+			throw Err_.new_exc(e, "lucene_index", "failed to get best", "query", qry_data.query);
+		}
+
+		// convert fragments to highlighter items
+		int frags_len = frags.length;
+		Gflucene_highlighter_item[] array = new Gflucene_highlighter_item[frags_len];
+		for (int i = 0; i < frags_len; i++) {
+			String frag = frags[i];
+			array[i] = new Gflucene_highlighter_item(i, frag); // num is the fragment's position in frags
+		}
+		return array;
+	}
+	public void Term() { // currently empty
+	}
+}