1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2026-03-02 03:49:30 +00:00

Full-text search: Add highlighter

This commit is contained in:
gnosygnu
2017-03-15 13:11:09 -04:00
parent d53fe9628f
commit 8f8e414c80
6 changed files with 221 additions and 13 deletions

View File

@@ -3,10 +3,10 @@
<classpathentry kind="src" path="src"/>
<classpathentry combineaccessrules="false" kind="src" path="/100_core"/>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<classpathentry kind="lib" path="lib/lucene-analyzers-common-5.5.4.jar"/>
<classpathentry kind="lib" path="lib/lucene-core-5.5.4.jar"/>
<classpathentry kind="lib" path="lib/lucene-highlighter-5.5.4.jar"/>
<classpathentry kind="lib" path="lib/lucene-memory-5.5.4.jar"/>
<classpathentry kind="lib" path="lib/lucene-queryparser-5.5.4.jar"/>
<classpathentry kind="lib" path="lib/lucene-analyzers-common-6.4.2.jar"/>
<classpathentry kind="lib" path="lib/lucene-core-6.4.2.jar"/>
<classpathentry kind="lib" path="lib/lucene-highlighter-6.4.2.jar"/>
<classpathentry kind="lib" path="lib/lucene-memory-6.4.2.jar"/>
<classpathentry kind="lib" path="lib/lucene-queryparser-6.4.2.jar"/>
<classpathentry kind="output" path="bin"/>
</classpath>

View File

@@ -0,0 +1,24 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.gflucene.highlighters; import gplx.*; import gplx.gflucene.*;
public class Gflucene_highlighter_item {
public int num;
public String text;
public Gflucene_highlighter_item(int num, String text) {
this.num = num;
this.text = text;
}
}

View File

@@ -0,0 +1,90 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.gflucene.highlighters; import gplx.*; import gplx.gflucene.*;
import gplx.gflucene.core.*;
import gplx.gflucene.analyzers.*;
import gplx.gflucene.searchers.*;
import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.store.FSDirectory;
public class Gflucene_highlighter_mgr {
private Analyzer analyzer;
public Gflucene_highlighter_mgr() {
}
public void Init(Gflucene_index_data idx_data) {
this.analyzer = Gflucene_analyzer_mgr_.New_analyzer(idx_data.analyzer_data.key);
}
public Gflucene_highlighter_item[] Exec(Gflucene_searcher_qry qry_data, Gflucene_doc_data doc_data) {
// create query
QueryParser parser = new QueryParser("body", analyzer);
Query query = null;
try {
query = parser.parse(qry_data.query);
} catch (ParseException e) {
throw Err_.new_exc(e, "lucene_index", "failed to parse", "query", qry_data.query);
}
// create highlighter
SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter("<span class='snip_highlight'>", "</span>");
Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));
// get token stream
String text = doc_data.body;
TokenStream tokenStream = analyzer.tokenStream("body", text);
// get fragments from stream
String[] frags;
try {
frags = highlighter.getBestFragments(tokenStream, text, 10);
} catch (IOException e) {
throw Err_.new_exc(e, "lucene_index", "failed to get best", "query", qry_data.query);
} catch (InvalidTokenOffsetsException e) {
throw Err_.new_exc(e, "lucene_index", "failed to get best", "query", qry_data.query);
}
// convert fragments to highlighter items
int frags_len = frags.length;
Gflucene_highlighter_item[] array = new Gflucene_highlighter_item[frags_len];
for (int i = 0; i < frags_len; i++) {
String frag = frags[i];
array[i] = new Gflucene_highlighter_item(i, frag);
}
return array;
}
public void Term() {
}
}