2017-03-15 17:11:09 +00:00
|
|
|
/*
|
|
|
|
XOWA: the XOWA Offline Wiki Application
|
|
|
|
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
|
|
|
|
|
|
|
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
|
|
|
or alternatively under the terms of the Apache License Version 2.0.
|
|
|
|
|
|
|
|
You may use XOWA according to either of these licenses as is most appropriate
|
|
|
|
for your project on a case-by-case basis.
|
|
|
|
|
|
|
|
The terms of each license can be found in the source code repository:
|
|
|
|
|
|
|
|
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
|
|
|
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
|
|
|
*/
|
2017-10-24 00:50:50 +00:00
|
|
|
package gplx.gflucene.highlighters; import gplx.*; import gplx.gflucene.*;
|
|
|
|
import gplx.gflucene.core.*;
|
|
|
|
import gplx.gflucene.analyzers.*;
|
|
|
|
import gplx.gflucene.searchers.*;
|
|
|
|
import java.io.IOException;
|
|
|
|
import java.nio.file.Paths;
|
|
|
|
|
|
|
|
import org.apache.lucene.analysis.Analyzer;
|
|
|
|
import org.apache.lucene.analysis.TokenStream;
|
|
|
|
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
|
|
|
import org.apache.lucene.document.Document;
|
|
|
|
import org.apache.lucene.index.DirectoryReader;
|
|
|
|
import org.apache.lucene.index.IndexReader;
|
|
|
|
import org.apache.lucene.queryparser.classic.ParseException;
|
|
|
|
import org.apache.lucene.queryparser.classic.QueryParser;
|
|
|
|
import org.apache.lucene.search.IndexSearcher;
|
|
|
|
import org.apache.lucene.search.Query;
|
|
|
|
import org.apache.lucene.search.TopDocs;
|
|
|
|
import org.apache.lucene.search.highlight.Formatter;
|
|
|
|
import org.apache.lucene.search.highlight.Fragmenter;
|
|
|
|
import org.apache.lucene.search.highlight.Highlighter;
|
|
|
|
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
|
|
|
|
import org.apache.lucene.search.highlight.QueryScorer;
|
|
|
|
import org.apache.lucene.search.highlight.SimpleFragmenter;
|
|
|
|
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
|
|
|
|
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
|
|
|
|
import org.apache.lucene.search.highlight.TextFragment;
|
|
|
|
import org.apache.lucene.search.highlight.TokenSources;
|
|
|
|
import org.apache.lucene.store.FSDirectory;
|
|
|
|
public class Gflucene_highlighter_mgr {
|
|
|
|
private Analyzer analyzer;
|
|
|
|
|
|
|
|
public Gflucene_highlighter_mgr() {
|
|
|
|
}
|
|
|
|
|
|
|
|
public void Init(Gflucene_index_data idx_data) {
|
|
|
|
this.analyzer = Gflucene_analyzer_mgr_.New_analyzer(idx_data.analyzer_data.key);
|
|
|
|
}
|
|
|
|
public Gflucene_highlighter_item[] Exec(Gflucene_searcher_qry qry_data, Gflucene_doc_data doc_data) {
|
|
|
|
// create query
|
|
|
|
QueryParser parser = new QueryParser("body", analyzer);
|
|
|
|
Query query = null;
|
|
|
|
try {
|
|
|
|
query = parser.parse(qry_data.query);
|
|
|
|
} catch (ParseException e) {
|
|
|
|
throw Err_.new_exc(e, "lucene_index", "failed to parse", "query", qry_data.query);
|
|
|
|
}
|
|
|
|
|
|
|
|
// create highlighter
|
|
|
|
SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter("<span class='snip_highlight'>", "</span>");
|
|
|
|
QueryScorer scorer = new QueryScorer(query);
|
|
|
|
scorer.setExpandMultiTermQuery(false);
|
|
|
|
Highlighter highlighter = new Highlighter(htmlFormatter, scorer);
|
|
|
|
SimpleFragmenter fragmenter = new SimpleFragmenter(100);
|
|
|
|
highlighter.setTextFragmenter(fragmenter);
|
|
|
|
|
|
|
|
// get token stream
|
|
|
|
String text = doc_data.body;
|
|
|
|
TokenStream tokenStream = null;
|
|
|
|
try {
|
|
|
|
tokenStream = analyzer.tokenStream("body", text);
|
|
|
|
} catch (IOException e) {
|
|
|
|
throw Err_.new_exc(e, "lucene_index", "failed to get stream", "query", qry_data.query);
|
|
|
|
}
|
|
|
|
|
|
|
|
// get fragments from stream
|
|
|
|
TextFragment[] frags;
|
|
|
|
try {
|
|
|
|
// frags = highlighter.getBestTextFragments(tokenStream, text, false, 1000);
|
|
|
|
frags = highlighter.getBestTextFragments(tokenStream, text, true, 10);
|
|
|
|
} catch (IOException e) {
|
|
|
|
throw Err_.new_exc(e, "lucene_index", "failed to get best", "query", qry_data.query);
|
|
|
|
} catch (InvalidTokenOffsetsException e) {
|
|
|
|
throw Err_.new_exc(e, "lucene_index", "failed to get best", "query", qry_data.query);
|
|
|
|
}
|
|
|
|
|
|
|
|
// convert fragments to highlighter items
|
|
|
|
int frags_len = frags.length;
|
|
|
|
Gflucene_highlighter_item[] array = new Gflucene_highlighter_item[frags_len];
|
|
|
|
for (int i = 0; i < frags_len; i++) {
|
|
|
|
String frag = frags[i].toString();
|
|
|
|
array[i] = new Gflucene_highlighter_item(i, frag);
|
|
|
|
}
|
|
|
|
return array;
|
|
|
|
}
|
|
|
|
public void Term() {
|
|
|
|
}
|
|
|
|
}
|