mirror of
https://github.com/gnosygnu/xowa.git
synced 2024-10-27 20:34:16 +00:00
Full-text search: Add highlighter
This commit is contained in:
parent
d53fe9628f
commit
8f8e414c80
@ -0,0 +1,87 @@
|
|||||||
|
/*
|
||||||
|
XOWA: the XOWA Offline Wiki Application
|
||||||
|
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||||
|
|
||||||
|
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||||
|
or alternatively under the terms of the Apache License Version 2.0.
|
||||||
|
|
||||||
|
You may use XOWA according to either of these licenses as is most appropriate
|
||||||
|
for your project on a case-by-case basis.
|
||||||
|
|
||||||
|
The terms of each license can be found in the source code repository:
|
||||||
|
|
||||||
|
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||||
|
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||||
|
*/
|
||||||
|
package gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.gflucenes; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*; import gplx.xowa.addons.wikis.fulltexts.searchers.*; import gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.*;
|
||||||
|
import gplx.gflucene.core.*;
|
||||||
|
import gplx.gflucene.highlighters.*;
|
||||||
|
import gplx.gflucene.searchers.*;
|
||||||
|
import gplx.xowa.htmls.*;
|
||||||
|
import gplx.xowa.wikis.data.tbls.*;
|
||||||
|
import gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.uis.*;
|
||||||
|
class Xofulltext_highlighter_mgr implements Gfo_invk {
|
||||||
|
private final Xofulltext_searcher_ui ui;
|
||||||
|
private final Xow_wiki wiki;
|
||||||
|
private final String wiki_domain;
|
||||||
|
private final List_adp list;
|
||||||
|
private final Gflucene_analyzer_data analyzer_data;
|
||||||
|
private final Gflucene_searcher_qry searcher_data;
|
||||||
|
private final Gflucene_highlighter_mgr highlighter_mgr = new Gflucene_highlighter_mgr();
|
||||||
|
private final Xoh_page hpg = new Xoh_page();
|
||||||
|
private final Xowd_page_itm tmp_page_row = new Xowd_page_itm();
|
||||||
|
public Xofulltext_highlighter_mgr(Xofulltext_searcher_ui ui, Xow_wiki wiki, Gflucene_analyzer_data analyzer_data, Gflucene_searcher_qry searcher_data, List_adp list) {
|
||||||
|
this.ui = ui;
|
||||||
|
this.wiki = wiki;
|
||||||
|
this.wiki_domain = wiki.Domain_str();
|
||||||
|
this.analyzer_data = analyzer_data;
|
||||||
|
this.searcher_data = searcher_data;
|
||||||
|
this.list = list;
|
||||||
|
}
|
||||||
|
private void Highlight_list() {
|
||||||
|
// init highlighter
|
||||||
|
highlighter_mgr.Init(new Gflucene_index_data(analyzer_data, "")); // NOTE: index_dir not needed for highlighter
|
||||||
|
|
||||||
|
// loop items
|
||||||
|
int len = list.Len();
|
||||||
|
for (int i = 0; i < len; i++) {
|
||||||
|
Gflucene_doc_data item = (Gflucene_doc_data)list.Get_at(i);
|
||||||
|
try {
|
||||||
|
Highlight_item(item);
|
||||||
|
} catch (Exception e) {
|
||||||
|
Gfo_usr_dlg_.Instance.Warn_many("", "", "search.highlight: failed to highlight lines in page; page=~{0} err=~{1}", item.page_id, Err_.Message_gplx_log(e));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// term highlighter
|
||||||
|
highlighter_mgr.Term();
|
||||||
|
}
|
||||||
|
private void Highlight_item(Gflucene_doc_data item) {
|
||||||
|
// load db.core.page
|
||||||
|
if (!wiki.Data__core_mgr().Db__core().Tbl__page().Select_by_id(tmp_page_row, item.page_id)) {
|
||||||
|
Gfo_usr_dlg_.Instance.Warn_many("", "", "search.highlight: could not find page; page_id=~{0}", item.page_id);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// init hpg
|
||||||
|
Xoa_ttl page_ttl = wiki.Ttl_parse(tmp_page_row.Ns_id(), tmp_page_row.Ttl_page_db());
|
||||||
|
Xoa_url page_url = wiki.Utl__url_parser().Parse(page_ttl.Page_db());
|
||||||
|
hpg.Ctor_by_hview(wiki, page_url, page_ttl, item.page_id);
|
||||||
|
|
||||||
|
// load db.html.html
|
||||||
|
wiki.Html__hdump_mgr().Load_mgr().Load_by_xowh(hpg, page_ttl, false); // don't load categories for perf reasons
|
||||||
|
item.body = String_.new_u8(hpg.Db().Html().Html_bry());
|
||||||
|
|
||||||
|
// loop pages
|
||||||
|
int page_id = item.page_id;
|
||||||
|
Gflucene_highlighter_item[] lines = highlighter_mgr.Exec(searcher_data, item);
|
||||||
|
for (Gflucene_highlighter_item line : lines) {
|
||||||
|
ui.Send_line_add(new Xofulltext_searcher_line(wiki_domain, page_id, line.num, line.text));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
public Object Invk(GfsCtx ctx, int ikey, String k, GfoMsg m) {
|
||||||
|
if (ctx.Match(k, Invk__highlight)) this.Highlight_list();
|
||||||
|
else return Gfo_invk_.Rv_unhandled;
|
||||||
|
return this;
|
||||||
|
} public static final String Invk__highlight = "highlight";
|
||||||
|
}
|
@ -18,32 +18,39 @@ import gplx.gflucene.*;
|
|||||||
import gplx.gflucene.core.*;
|
import gplx.gflucene.core.*;
|
||||||
import gplx.gflucene.indexers.*;
|
import gplx.gflucene.indexers.*;
|
||||||
import gplx.gflucene.searchers.*;
|
import gplx.gflucene.searchers.*;
|
||||||
|
import gplx.gflucene.highlighters.*;
|
||||||
import gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.uis.*;
|
import gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.uis.*;
|
||||||
public class Xofulltext_searcher__lucene implements Xofulltext_searcher {
|
public class Xofulltext_searcher__lucene implements Xofulltext_searcher {
|
||||||
private final Gflucene_searcher_mgr searcher = new Gflucene_searcher_mgr();
|
private final Gflucene_searcher_mgr searcher = new Gflucene_searcher_mgr();
|
||||||
public void Search(Xofulltext_searcher_ui cbk, Xow_wiki wiki, Xofulltext_searcher_args args) {
|
public void Search(Xofulltext_searcher_ui ui, Xow_wiki wiki, Xofulltext_searcher_args args) {
|
||||||
// create list
|
// create list
|
||||||
List_adp list = List_adp_.New();
|
List_adp list = List_adp_.New();
|
||||||
|
|
||||||
// init searcher with wiki
|
// init searcher with wiki
|
||||||
|
Gflucene_analyzer_data analyzer_data = Gflucene_analyzer_data.New_data_from_locale(wiki.Lang().Key_str());
|
||||||
searcher.Init(new Gflucene_index_data
|
searcher.Init(new Gflucene_index_data
|
||||||
( Gflucene_analyzer_data.New_data_from_locale(wiki.Lang().Key_str())
|
( analyzer_data
|
||||||
, wiki.Fsys_mgr().Root_dir().GenSubDir_nest("data", "search").Xto_api()));
|
, wiki.Fsys_mgr().Root_dir().GenSubDir_nest("data", "search").Xto_api()));
|
||||||
|
|
||||||
// exec search
|
// exec search
|
||||||
searcher.Exec(list, new Gflucene_searcher_qry(String_.new_u8(args.query), args.max_pages_per_wiki));
|
Gflucene_searcher_qry searcher_data = new Gflucene_searcher_qry(String_.new_u8(args.query), args.max_pages_per_wiki);
|
||||||
|
searcher.Exec(list, searcher_data);
|
||||||
|
|
||||||
// term
|
// term
|
||||||
searcher.Term();
|
searcher.Term();
|
||||||
|
|
||||||
// loop list
|
// loop list and send pages
|
||||||
int len = list.Len();
|
int len = list.Len();
|
||||||
for (int i = 0; i < len; i++) {
|
for (int i = 0; i < len; i++) {
|
||||||
Gflucene_doc_data found = (Gflucene_doc_data)list.Get_at(i);
|
Gflucene_doc_data found = (Gflucene_doc_data)list.Get_at(i);
|
||||||
|
|
||||||
// call page found
|
// call page found
|
||||||
Xofulltext_searcher_page page = new Xofulltext_searcher_page(args.query_id, wiki.Domain_str(), found.page_id, found.title, false);
|
Xofulltext_searcher_page page = new Xofulltext_searcher_page(args.query_id, wiki.Domain_str(), found.page_id, found.title, args.expand_matches_section);
|
||||||
cbk.Send_page_add(page);
|
ui.Send_page_add(page);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// create highlighter thread and launch it
|
||||||
|
Xofulltext_highlighter_mgr highlighter_mgr = new Xofulltext_highlighter_mgr(ui, wiki, analyzer_data, searcher_data, list);
|
||||||
|
gplx.core.threads.Thread_adp_.Start_by_key("highlighter", Cancelable_.Never, highlighter_mgr, Xofulltext_highlighter_mgr.Invk__highlight);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -52,8 +52,8 @@ class Xofulltext_searcher_svc implements Gfo_invk {
|
|||||||
cfg_mgr.Set_bool_app("xowa.addon.search.fulltext.special.auto_wildcard_end", search_args.auto_wildcard_end);
|
cfg_mgr.Set_bool_app("xowa.addon.search.fulltext.special.auto_wildcard_end", search_args.auto_wildcard_end);
|
||||||
cfg_mgr.Set_bool_app("xowa.addon.search.fulltext.special.expand_matches_section", search_args.expand_matches_section);
|
cfg_mgr.Set_bool_app("xowa.addon.search.fulltext.special.expand_matches_section", search_args.expand_matches_section);
|
||||||
cfg_mgr.Set_bool_app("xowa.addon.search.fulltext.special.show_all_matches", search_args.show_all_matches);
|
cfg_mgr.Set_bool_app("xowa.addon.search.fulltext.special.show_all_matches", search_args.show_all_matches);
|
||||||
cfg_mgr.Get_int_app_or ("xowa.addon.search.fulltext.special.max_pages_per_wiki", search_args.max_pages_per_wiki);
|
cfg_mgr.Set_int_app ("xowa.addon.search.fulltext.special.max_pages_per_wiki", search_args.max_pages_per_wiki);
|
||||||
cfg_mgr.Get_str_app_or ("xowa.addon.search.fulltext.special.namespaces", search_args.namespaces);
|
cfg_mgr.Set_str_app ("xowa.addon.search.fulltext.special.namespaces", search_args.namespaces);
|
||||||
}
|
}
|
||||||
|
|
||||||
// launch thread
|
// launch thread
|
||||||
|
@ -3,10 +3,10 @@
|
|||||||
<classpathentry kind="src" path="src"/>
|
<classpathentry kind="src" path="src"/>
|
||||||
<classpathentry combineaccessrules="false" kind="src" path="/100_core"/>
|
<classpathentry combineaccessrules="false" kind="src" path="/100_core"/>
|
||||||
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
|
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
|
||||||
<classpathentry kind="lib" path="lib/lucene-analyzers-common-5.5.4.jar"/>
|
<classpathentry kind="lib" path="lib/lucene-analyzers-common-6.4.2.jar"/>
|
||||||
<classpathentry kind="lib" path="lib/lucene-core-5.5.4.jar"/>
|
<classpathentry kind="lib" path="lib/lucene-core-6.4.2.jar"/>
|
||||||
<classpathentry kind="lib" path="lib/lucene-highlighter-5.5.4.jar"/>
|
<classpathentry kind="lib" path="lib/lucene-highlighter-6.4.2.jar"/>
|
||||||
<classpathentry kind="lib" path="lib/lucene-memory-5.5.4.jar"/>
|
<classpathentry kind="lib" path="lib/lucene-memory-6.4.2.jar"/>
|
||||||
<classpathentry kind="lib" path="lib/lucene-queryparser-5.5.4.jar"/>
|
<classpathentry kind="lib" path="lib/lucene-queryparser-6.4.2.jar"/>
|
||||||
<classpathentry kind="output" path="bin"/>
|
<classpathentry kind="output" path="bin"/>
|
||||||
</classpath>
|
</classpath>
|
||||||
|
@ -0,0 +1,24 @@
|
|||||||
|
/*
|
||||||
|
XOWA: the XOWA Offline Wiki Application
|
||||||
|
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||||
|
|
||||||
|
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||||
|
or alternatively under the terms of the Apache License Version 2.0.
|
||||||
|
|
||||||
|
You may use XOWA according to either of these licenses as is most appropriate
|
||||||
|
for your project on a case-by-case basis.
|
||||||
|
|
||||||
|
The terms of each license can be found in the source code repository:
|
||||||
|
|
||||||
|
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||||
|
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||||
|
*/
|
||||||
|
package gplx.gflucene.highlighters; import gplx.*; import gplx.gflucene.*;
|
||||||
|
/**
 * One highlighted fragment returned by the highlighter.
 *
 * Instances are plain value holders: both fields are assigned once in the constructor
 * and never change afterwards.
 */
public class Gflucene_highlighter_item {
	/** Number identifying this fragment; the highlighter passes the fragment's 0-based position in its result array. */
	public final int num;

	/** Fragment text, including whatever highlight markup the highlighter inserted. */
	public final String text;

	/**
	 * @param num  number identifying the fragment
	 * @param text fragment text; stored as-is
	 */
	public Gflucene_highlighter_item(int num, String text) {
		this.num = num;
		this.text = text;
	}
}
|
@ -0,0 +1,90 @@
|
|||||||
|
/*
|
||||||
|
XOWA: the XOWA Offline Wiki Application
|
||||||
|
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||||
|
|
||||||
|
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||||
|
or alternatively under the terms of the Apache License Version 2.0.
|
||||||
|
|
||||||
|
You may use XOWA according to either of these licenses as is most appropriate
|
||||||
|
for your project on a case-by-case basis.
|
||||||
|
|
||||||
|
The terms of each license can be found in the source code repository:
|
||||||
|
|
||||||
|
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||||
|
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||||
|
*/
|
||||||
|
package gplx.gflucene.highlighters; import gplx.*; import gplx.gflucene.*;
|
||||||
|
import gplx.gflucene.core.*;
|
||||||
|
import gplx.gflucene.analyzers.*;
|
||||||
|
import gplx.gflucene.searchers.*;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.index.DirectoryReader;
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import org.apache.lucene.queryparser.classic.ParseException;
|
||||||
|
import org.apache.lucene.queryparser.classic.QueryParser;
|
||||||
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
|
import org.apache.lucene.search.Query;
|
||||||
|
import org.apache.lucene.search.TopDocs;
|
||||||
|
import org.apache.lucene.search.highlight.Formatter;
|
||||||
|
import org.apache.lucene.search.highlight.Highlighter;
|
||||||
|
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
|
||||||
|
import org.apache.lucene.search.highlight.QueryScorer;
|
||||||
|
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
|
||||||
|
import org.apache.lucene.search.highlight.TextFragment;
|
||||||
|
import org.apache.lucene.search.highlight.TokenSources;
|
||||||
|
import org.apache.lucene.store.FSDirectory;
|
||||||
|
public class Gflucene_highlighter_mgr {
|
||||||
|
private Analyzer analyzer;
|
||||||
|
|
||||||
|
public Gflucene_highlighter_mgr() {
|
||||||
|
}
|
||||||
|
|
||||||
|
public void Init(Gflucene_index_data idx_data) {
|
||||||
|
this.analyzer = Gflucene_analyzer_mgr_.New_analyzer(idx_data.analyzer_data.key);
|
||||||
|
}
|
||||||
|
public Gflucene_highlighter_item[] Exec(Gflucene_searcher_qry qry_data, Gflucene_doc_data doc_data) {
|
||||||
|
// create query
|
||||||
|
QueryParser parser = new QueryParser("body", analyzer);
|
||||||
|
Query query = null;
|
||||||
|
try {
|
||||||
|
query = parser.parse(qry_data.query);
|
||||||
|
} catch (ParseException e) {
|
||||||
|
throw Err_.new_exc(e, "lucene_index", "failed to parse", "query", qry_data.query);
|
||||||
|
}
|
||||||
|
|
||||||
|
// create highlighter
|
||||||
|
SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter("<span class='snip_highlight'>", "</span>");
|
||||||
|
Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));
|
||||||
|
|
||||||
|
// get token stream
|
||||||
|
String text = doc_data.body;
|
||||||
|
TokenStream tokenStream = analyzer.tokenStream("body", text);
|
||||||
|
|
||||||
|
// get fragments from stream
|
||||||
|
String[] frags;
|
||||||
|
try {
|
||||||
|
frags = highlighter.getBestFragments(tokenStream, text, 10);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw Err_.new_exc(e, "lucene_index", "failed to get best", "query", qry_data.query);
|
||||||
|
} catch (InvalidTokenOffsetsException e) {
|
||||||
|
throw Err_.new_exc(e, "lucene_index", "failed to get best", "query", qry_data.query);
|
||||||
|
}
|
||||||
|
|
||||||
|
// convert fragments to highlighter items
|
||||||
|
int frags_len = frags.length;
|
||||||
|
Gflucene_highlighter_item[] array = new Gflucene_highlighter_item[frags_len];
|
||||||
|
for (int i = 0; i < frags_len; i++) {
|
||||||
|
String frag = frags[i];
|
||||||
|
array[i] = new Gflucene_highlighter_item(i, frag);
|
||||||
|
}
|
||||||
|
return array;
|
||||||
|
}
|
||||||
|
public void Term() {
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user