1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2026-03-02 03:49:30 +00:00

Full-text search: Add lucene indexer

This commit is contained in:
gnosygnu
2017-03-12 22:57:42 -04:00
parent ae9d0fccd3
commit 77de7215ce
47 changed files with 688 additions and 196 deletions

View File

@@ -0,0 +1,91 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.gflucene; import gplx.*;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Writes pages into a Lucene index stored in a file-system directory.
 *
 * Usage is Init (open directory and writer), Exec once per page, then Term
 * (close writer and directory).
 */
public class Gflucene_index_bldr {
	private final StandardAnalyzer analyzer = new StandardAnalyzer();
	private Directory index;
	private IndexWriter wtr;
	private FieldType body_fld;
	public Gflucene_index_bldr() {
	}
	/**
	 * Opens the index directory, creates the writer and prepares the field type used for page bodies.
	 *
	 * @param index_dir file-system path of the Lucene index directory
	 */
	public void Init(String index_dir) {
		// create index
		Path path = Paths.get(index_dir);
		try {
			this.index = FSDirectory.open(path);
		} catch (IOException e) {
			throw Err_.new_exc(e, "lucene_index", "failed to open lucene index", "path", path);
		}

		// create writer; a config instance is bound to the writer it is handed to,
		// so build a fresh one per Init instead of reusing one across writers
		try {
			wtr = new IndexWriter(index, new IndexWriterConfig(analyzer));
		} catch (IOException e) {
			// do not leak the directory that was just opened
			try {
				index.close();
			} catch (IOException close_exc) {
				e.addSuppressed(close_exc);
			}
			throw Err_.new_exc(e, "lucene_index", "failed to create writer");
		}

		// create field for body: tokenized and indexed with docs, freqs and positions
		this.body_fld = new FieldType();
		body_fld.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
		// body_fld.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
		// body_fld.setStored(true);
		body_fld.setTokenized(true);
		// body_fld.setStoreTermVectors(true);
		// body_fld.setStoreTermVectorOffsets(true);
	}
	/**
	 * Adds one page to the index as a single Lucene document.
	 *
	 * @param data page values; score and page_id go into stored fields, title into a
	 *             stored text field, and body into the "body" field built in Init
	 */
	public void Exec(Gflucene_index_data data) {
		Document doc = new Document();
		// doc.add(new SortedNumericDocValuesField("page_score", data.score));
		doc.add(new StoredField("page_score", data.score));
		doc.add(new StoredField("page_id", data.page_id));
		doc.add(new TextField("title", data.title, Field.Store.YES));
		doc.add(new Field("body", data.body, body_fld));
		try {
			wtr.addDocument(doc);
		} catch (IOException e) {
			throw Err_.new_exc(e, "lucene_index", "failed to add document", "title", data.title);
		}
	}
	/**
	 * Closes the writer and then the directory.
	 *
	 * The directory is closed even when closing the writer fails; the writer failure
	 * is the one reported, with any directory failure attached as suppressed.
	 */
	public void Term() {
		IOException wtr_exc = null;
		try {
			if (wtr != null) wtr.close();
		} catch (IOException e) {
			wtr_exc = e;	// remember, but still attempt to close the directory
		}
		try {
			if (index != null) index.close();
		} catch (IOException e) {
			if (wtr_exc == null)
				throw Err_.new_exc(e, "lucene_index", "failed to close index");
			wtr_exc.addSuppressed(e);
		}
		if (wtr_exc != null)
			throw Err_.new_exc(wtr_exc, "lucene_index", "failed to close writer");
	}
}

View File

@@ -0,0 +1,29 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.gflucene; import gplx.*;
/**
 * Mutable value holder for one page: its id, score, title and body text,
 * plus a Lucene relevance score that starts at 0.
 */
public class Gflucene_index_data {
	// identity and ranking
	public int page_id;
	public int score;
	// text
	public String title;
	public String body;
	// not set by the constructor; remains 0 until assigned
	public float lucene_score = 0f;

	/**
	 * @param page_id id of the page
	 * @param score   score of the page
	 * @param title   page title
	 * @param body    page body text
	 */
	public Gflucene_index_data(int page_id, int score, String title, String body) {
		this.title = title;
		this.body = body;
		this.page_id = page_id;
		this.score = score;
	}
}

View File

@@ -0,0 +1,73 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.gflucene; import gplx.*;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Runs queries against a Lucene index stored in a file-system directory.
 *
 * Usage is Init (open directory), Exec per query, then Term (close directory).
 */
public class Gflucene_searcher {
	private final StandardAnalyzer analyzer = new StandardAnalyzer();
	private Directory index;
	public Gflucene_searcher() {
	}
	/**
	 * Opens the index directory.
	 *
	 * @param index_dir file-system path of the Lucene index directory
	 */
	public void Init(String index_dir) {
		Path path = Paths.get(index_dir);
		try {
			this.index = FSDirectory.open(path);
		} catch (IOException e) {
			throw Err_.new_exc(e, "lucene_index", "failed to init searcher", "dir", index_dir);
		}
	}
	/**
	 * Parses data.query against the "body" field and appends one Gflucene_index_data per hit to list.
	 *
	 * Each result carries page_id, page_score and title read from the stored fields,
	 * an empty body, and the hit's Lucene score. At most data.match_max hits are requested.
	 *
	 * @param list receives the results, in the order returned by the search
	 * @param data query text and maximum number of matches
	 */
	public void Exec(List_adp list, Gflucene_searcher_data data) {
		// try-with-resources so the reader is closed even when parsing or searching fails
		try (IndexReader reader = DirectoryReader.open(index)) {
			IndexSearcher searcher = new IndexSearcher(reader);
			Query query = new QueryParser("body", analyzer).parse(data.query);
			// TopDocs docs = searcher.search(query, reader.maxDoc());
			TopDocs docs = searcher.search(query, data.match_max);
			ScoreDoc[] hits = docs.scoreDocs;
			for (int i = 0; i < hits.length; i++) {
				ScoreDoc hit = hits[i];
				Document d = searcher.doc(hit.doc);
				Gflucene_index_data doc = new Gflucene_index_data(Integer.parseInt(d.get("page_id")), Integer.parseInt(d.get("page_score")), d.get("title"), "");
				doc.lucene_score = hit.score;
				list.Add(doc);
			}
		} catch (Exception e) {
			throw Err_.new_exc(e, "lucene_index", "failed to exec search", "query", data.query);
		}
	}
	/**
	 * Closes the directory opened by Init; does nothing if Init was never called.
	 */
	public void Term() {
		if (index == null) return;
		try {
			index.close();
		} catch (IOException e) {
			throw Err_.new_exc(e, "lucene_index", "failed to close index");
		}
	}
}

View File

@@ -0,0 +1,24 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.gflucene; import gplx.*;
/**
 * Mutable value holder for one search request: the query text and the
 * maximum number of matches to ask for.
 */
public class Gflucene_searcher_data {
	public int match_max;
	public String query;

	/**
	 * @param query     query text
	 * @param match_max maximum number of matches
	 */
	public Gflucene_searcher_data(String query, int match_max) {
		this.match_max = match_max;
		this.query = query;
	}
}