Full-text search: Add highlighter

pull/620/head
gnosygnu 7 years ago
parent d53fe9628f
commit 8f8e414c80

@ -0,0 +1,87 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.gflucenes; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*; import gplx.xowa.addons.wikis.fulltexts.searchers.*; import gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.*;
import gplx.gflucene.core.*;
import gplx.gflucene.highlighters.*;
import gplx.gflucene.searchers.*;
import gplx.xowa.htmls.*;
import gplx.xowa.wikis.data.tbls.*;
import gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.uis.*;
// Gfo_invk task which highlights every found document in a list and sends each highlighted line to the ui.
// Invoke with Invk__highlight to run the whole list.
class Xofulltext_highlighter_mgr implements Gfo_invk {
	// supplied by caller
	private final Xofulltext_searcher_ui ui;
	private final Xow_wiki wiki;
	private final String wiki_domain;
	private final List_adp list;
	private final Gflucene_analyzer_data analyzer_data;
	private final Gflucene_searcher_qry searcher_data;
	// work objects owned by this instance and reused for every item
	private final Gflucene_highlighter_mgr highlighter_mgr = new Gflucene_highlighter_mgr();
	private final Xoh_page hpg = new Xoh_page();
	private final Xowd_page_itm tmp_page_row = new Xowd_page_itm();
	public Xofulltext_highlighter_mgr(Xofulltext_searcher_ui ui, Xow_wiki wiki, Gflucene_analyzer_data analyzer_data, Gflucene_searcher_qry searcher_data, List_adp list) {
		this.ui = ui;
		this.list = list;
		this.analyzer_data = analyzer_data;
		this.searcher_data = searcher_data;
		this.wiki = wiki;
		this.wiki_domain = wiki.Domain_str();
	}
	public static final String Invk__highlight = "highlight";
	// Dispatches Invk__highlight to Highlight_list; any other key is reported as unhandled.
	public Object Invk(GfsCtx ctx, int ikey, String k, GfoMsg m) {
		if (!ctx.Match(k, Invk__highlight)) return Gfo_invk_.Rv_unhandled;
		this.Highlight_list();
		return this;
	}
	// Inits the highlighter, highlights each item in the list, then terms the highlighter.
	// A failure in one item is logged as a warning and does not stop the remaining items.
	private void Highlight_list() {
		// NOTE: index_dir not needed for highlighter, so an empty String is passed
		Gflucene_index_data index_data = new Gflucene_index_data(analyzer_data, "");
		highlighter_mgr.Init(index_data);

		int total = list.Len();
		for (int idx = 0; idx < total; idx++) {
			Gflucene_doc_data doc = (Gflucene_doc_data)list.Get_at(idx);
			try {
				Highlight_item(doc);
			}
			catch (Exception exc) {
				Gfo_usr_dlg_.Instance.Warn_many("", "", "search.highlight: failed to highlight lines in page; page=~{0} err=~{1}", doc.page_id, Err_.Message_gplx_log(exc));
			}
		}

		highlighter_mgr.Term();
	}
	// Loads the item's html, runs the highlighter over it, and sends one line to the ui per highlighted fragment.
	private void Highlight_item(Gflucene_doc_data item) {
		if (!Load_body(item)) return;

		// send each highlighted fragment as a line
		int page_id = item.page_id;
		Gflucene_highlighter_item[] found = highlighter_mgr.Exec(searcher_data, item);
		for (int i = 0; i < found.length; i++) {
			Gflucene_highlighter_item cur = found[i];
			ui.Send_line_add(new Xofulltext_searcher_line(wiki_domain, page_id, cur.num, cur.text));
		}
	}
	// Fills item.body with the page's html; returns false (after logging a warning) when the page row does not exist.
	private boolean Load_body(Gflucene_doc_data item) {
		// load db.core.page
		boolean row_exists = wiki.Data__core_mgr().Db__core().Tbl__page().Select_by_id(tmp_page_row, item.page_id);
		if (!row_exists) {
			Gfo_usr_dlg_.Instance.Warn_many("", "", "search.highlight: could not find page; page_id=~{0}", item.page_id);
			return false;
		}

		// init hpg
		Xoa_ttl page_ttl = wiki.Ttl_parse(tmp_page_row.Ns_id(), tmp_page_row.Ttl_page_db());
		Xoa_url page_url = wiki.Utl__url_parser().Parse(page_ttl.Page_db());
		hpg.Ctor_by_hview(wiki, page_url, page_ttl, item.page_id);

		// load db.html.html; categories are skipped for perf reasons
		wiki.Html__hdump_mgr().Load_mgr().Load_by_xowh(hpg, page_ttl, false);
		byte[] html_bry = hpg.Db().Html().Html_bry();
		item.body = String_.new_u8(html_bry);
		return true;
	}
}

@ -18,32 +18,39 @@ import gplx.gflucene.*;
import gplx.gflucene.core.*;
import gplx.gflucene.indexers.*;
import gplx.gflucene.searchers.*;
import gplx.gflucene.highlighters.*;
import gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.uis.*;
// Lucene-backed Xofulltext_searcher: runs the query against the wiki's search index, reports each found page
// to the ui, then starts a separate "highlighter" thread over the same result list.
// NOTE(review): this span looks like a rendered diff hunk with removed and added lines interleaved and no +/- markers
// (for example, two consecutive Search signatures). The notes below flag the pairs that appear to be old/new sides;
// confirm against the committed file before relying on them.
public class Xofulltext_searcher__lucene implements Xofulltext_searcher {
private final Gflucene_searcher_mgr searcher = new Gflucene_searcher_mgr();
// NOTE(review): the next two lines differ only in the callback parameter name (cbk vs ui); presumably old then new.
public void Search(Xofulltext_searcher_ui cbk, Xow_wiki wiki, Xofulltext_searcher_args args) {
public void Search(Xofulltext_searcher_ui ui, Xow_wiki wiki, Xofulltext_searcher_args args) {
// create list
List_adp list = List_adp_.New();
// init searcher with wiki
// analyzer_data is held in a local so it can also be passed to the highlighter at the end of Search
Gflucene_analyzer_data analyzer_data = Gflucene_analyzer_data.New_data_from_locale(wiki.Lang().Key_str());
searcher.Init(new Gflucene_index_data
// NOTE(review): the next two lines both supply the first constructor argument; presumably old (inline call) then new (local).
( Gflucene_analyzer_data.New_data_from_locale(wiki.Lang().Key_str())
( analyzer_data
, wiki.Fsys_mgr().Root_dir().GenSubDir_nest("data", "search").Xto_api()));
// exec search
// NOTE(review): the next line and the two after it both execute the query; presumably old (inline qry) then new (qry kept in searcher_data).
searcher.Exec(list, new Gflucene_searcher_qry(String_.new_u8(args.query), args.max_pages_per_wiki));
Gflucene_searcher_qry searcher_data = new Gflucene_searcher_qry(String_.new_u8(args.query), args.max_pages_per_wiki);
searcher.Exec(list, searcher_data);
// term
searcher.Term();
// loop list
// loop list and send pages
int len = list.Len();
for (int i = 0; i < len; i++) {
Gflucene_doc_data found = (Gflucene_doc_data)list.Get_at(i);
// call page found
// NOTE(review): two page-construction / send pairs follow; the first passes a literal false and calls cbk,
// the second passes args.expand_matches_section and calls ui. Presumably old then new.
Xofulltext_searcher_page page = new Xofulltext_searcher_page(args.query_id, wiki.Domain_str(), found.page_id, found.title, false);
cbk.Send_page_add(page);
Xofulltext_searcher_page page = new Xofulltext_searcher_page(args.query_id, wiki.Domain_str(), found.page_id, found.title, args.expand_matches_section);
ui.Send_page_add(page);
}
// create highlighter thread and launch it
// the highlighter receives the same list that was just looped over, plus the analyzer and query data used for the search
Xofulltext_highlighter_mgr highlighter_mgr = new Xofulltext_highlighter_mgr(ui, wiki, analyzer_data, searcher_data, list);
gplx.core.threads.Thread_adp_.Start_by_key("highlighter", Cancelable_.Never, highlighter_mgr, Xofulltext_highlighter_mgr.Invk__highlight);
}
}

@ -52,8 +52,8 @@ class Xofulltext_searcher_svc implements Gfo_invk {
cfg_mgr.Set_bool_app("xowa.addon.search.fulltext.special.auto_wildcard_end", search_args.auto_wildcard_end);
cfg_mgr.Set_bool_app("xowa.addon.search.fulltext.special.expand_matches_section", search_args.expand_matches_section);
cfg_mgr.Set_bool_app("xowa.addon.search.fulltext.special.show_all_matches", search_args.show_all_matches);
cfg_mgr.Get_int_app_or ("xowa.addon.search.fulltext.special.max_pages_per_wiki", search_args.max_pages_per_wiki);
cfg_mgr.Get_str_app_or ("xowa.addon.search.fulltext.special.namespaces", search_args.namespaces);
cfg_mgr.Set_int_app ("xowa.addon.search.fulltext.special.max_pages_per_wiki", search_args.max_pages_per_wiki);
cfg_mgr.Set_str_app ("xowa.addon.search.fulltext.special.namespaces", search_args.namespaces);
}
// launch thread

@ -3,10 +3,10 @@
<classpathentry kind="src" path="src"/>
<classpathentry combineaccessrules="false" kind="src" path="/100_core"/>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<classpathentry kind="lib" path="lib/lucene-analyzers-common-5.5.4.jar"/>
<classpathentry kind="lib" path="lib/lucene-core-5.5.4.jar"/>
<classpathentry kind="lib" path="lib/lucene-highlighter-5.5.4.jar"/>
<classpathentry kind="lib" path="lib/lucene-memory-5.5.4.jar"/>
<classpathentry kind="lib" path="lib/lucene-queryparser-5.5.4.jar"/>
<classpathentry kind="lib" path="lib/lucene-analyzers-common-6.4.2.jar"/>
<classpathentry kind="lib" path="lib/lucene-core-6.4.2.jar"/>
<classpathentry kind="lib" path="lib/lucene-highlighter-6.4.2.jar"/>
<classpathentry kind="lib" path="lib/lucene-memory-6.4.2.jar"/>
<classpathentry kind="lib" path="lib/lucene-queryparser-6.4.2.jar"/>
<classpathentry kind="output" path="bin"/>
</classpath>

@ -0,0 +1,24 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.gflucene.highlighters; import gplx.*; import gplx.gflucene.*;
// Mutable holder for one highlighted fragment: a number plus the fragment's text.
public class Gflucene_highlighter_item {
	// fragment text, stored exactly as passed to the constructor
	public String text;
	// number identifying the fragment, stored exactly as passed to the constructor
	public int num;

	// Stores both values as given; no validation or copying is done.
	public Gflucene_highlighter_item(int num, String text) {
		this.text = text;
		this.num = num;
	}
}

@ -0,0 +1,90 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.gflucene.highlighters; import gplx.*; import gplx.gflucene.*;
import gplx.gflucene.core.*;
import gplx.gflucene.analyzers.*;
import gplx.gflucene.searchers.*;
import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.store.FSDirectory;
// Wraps the Lucene highlighter: given a query and a document body, returns the best matching fragments
// with each matched term wrapped in highlight markup.
// Lifecycle: Init (creates the analyzer) -> Exec (once per document) -> Term (releases the analyzer).
public class Gflucene_highlighter_mgr {
	private static final String Field__body = "body";	// field name used for both query parsing and tokenizing
	private static final int Max_frags = 10;			// max number of fragments returned per document
	private static final String Html__bgn = "<span class='snip_highlight'>", Html__end = "</span>";
	private Analyzer analyzer;
	public Gflucene_highlighter_mgr() {
	}
	// Creates the analyzer for idx_data's analyzer key. An analyzer left over from an earlier Init is released first.
	public void Init(Gflucene_index_data idx_data) {
		this.Term();	// release any analyzer from a previous Init so it is not dropped while still open
		this.analyzer = Gflucene_analyzer_mgr_.New_analyzer(idx_data.analyzer_data.key);
	}
	// Highlights doc_data.body against qry_data.query.
	// Returns one item per fragment; each item's num is the fragment's index in the result array.
	// Throws IllegalStateException if called before Init or after Term;
	// throws the Err_.new_exc error if the query cannot be parsed or the highlighter fails.
	public Gflucene_highlighter_item[] Exec(Gflucene_searcher_qry qry_data, Gflucene_doc_data doc_data) {
		if (analyzer == null) throw new IllegalStateException("highlighter not initialized; call Init before Exec");

		// create query
		Query query;
		try {
			query = new QueryParser(Field__body, analyzer).parse(qry_data.query);
		} catch (ParseException e) {
			throw Err_.new_exc(e, "lucene_index", "failed to parse", "query", qry_data.query);
		}

		// create highlighter
		SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(Html__bgn, Html__end);
		Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));

		// get token stream
		String text = doc_data.body;
		TokenStream tokenStream = analyzer.tokenStream(Field__body, text);

		// get fragments from stream
		String[] frags;
		try {
			frags = highlighter.getBestFragments(tokenStream, text, Max_frags);
		} catch (IOException | InvalidTokenOffsetsException e) {
			throw Err_.new_exc(e, "lucene_index", "failed to get best", "query", qry_data.query);
		}

		// convert fragments to highlighter items
		int frags_len = frags.length;
		Gflucene_highlighter_item[] array = new Gflucene_highlighter_item[frags_len];
		for (int i = 0; i < frags_len; i++) {
			array[i] = new Gflucene_highlighter_item(i, frags[i]);
		}
		return array;
	}
	// Releases the analyzer created by Init. Safe to call repeatedly or without a prior Init.
	public void Term() {
		if (analyzer != null) {
			// NOTE(review): assumes New_analyzer hands out a fresh Analyzer per call (not a shared instance) -- confirm
			analyzer.close();
			analyzer = null;
		}
	}
}
Loading…
Cancel
Save