mirror of
https://github.com/gnosygnu/xowa.git
synced 2024-10-27 20:34:16 +00:00
Full-text search: Add analyzers for non-English languages
This commit is contained in:
parent
77de7215ce
commit
3b6cc45084
@ -14,20 +14,24 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.xowa.addons.wikis.fulltexts.indexers.bldrs; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*; import gplx.xowa.addons.wikis.fulltexts.indexers.*;
|
||||
import gplx.gflucene.*;
|
||||
import gplx.gflucene.core.*;
|
||||
import gplx.gflucene.indexers.*;
|
||||
public class Xofulltext_indexer_wkr {
|
||||
private final Gflucene_index_bldr index_wtr = new Gflucene_index_bldr();
|
||||
private final Gflucene_indexer_mgr index_wtr = new Gflucene_indexer_mgr();
|
||||
public void Init(Xow_wiki wiki) {
|
||||
Io_url search_dir = wiki.Fsys_mgr().Root_dir().GenSubDir_nest("data", "search");
|
||||
Io_mgr.Instance.DeleteDirDeep(search_dir);
|
||||
index_wtr.Init(search_dir.Xto_api());
|
||||
;
|
||||
index_wtr.Init(new Gflucene_index_data
|
||||
( Gflucene_analyzer_data.New_data_from_locale(wiki.Lang().Key_str())
|
||||
, search_dir.Xto_api()));
|
||||
}
|
||||
public void Index(Xoae_page wpg) {
|
||||
// TODO: skip if not main_ns
|
||||
Index(wpg.Db().Page().Id(), wpg.Db().Page().Score(), wpg.Ttl().Page_txt(), wpg.Db().Html().Html_bry());
|
||||
}
|
||||
public void Index(int page_id, int score, byte[] ttl, byte[] html) {
|
||||
Gflucene_index_data data = new Gflucene_index_data(page_id, score, String_.new_u8(ttl), String_.new_u8(html));
|
||||
Gflucene_doc_data data = new Gflucene_doc_data(page_id, score, String_.new_u8(ttl), String_.new_u8(html));
|
||||
index_wtr.Exec(data);
|
||||
}
|
||||
public void Term() {
|
||||
|
@ -15,18 +15,23 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.gflucenes; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*; import gplx.xowa.addons.wikis.fulltexts.searchers.*; import gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.*;
|
||||
import gplx.gflucene.*;
|
||||
import gplx.gflucene.core.*;
|
||||
import gplx.gflucene.indexers.*;
|
||||
import gplx.gflucene.searchers.*;
|
||||
import gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.uis.*;
|
||||
public class Xofulltext_searcher__lucene implements Xofulltext_searcher {
|
||||
private final Gflucene_searcher searcher = new Gflucene_searcher();
|
||||
private final Gflucene_searcher_mgr searcher = new Gflucene_searcher_mgr();
|
||||
public void Search(Xofulltext_searcher_ui cbk, Xow_wiki wiki, Xofulltext_searcher_args args) {
|
||||
// create list
|
||||
List_adp list = List_adp_.New();
|
||||
|
||||
// init searcher with wiki
|
||||
searcher.Init(wiki.Fsys_mgr().Root_dir().GenSubDir_nest("data", "search").Xto_api());
|
||||
searcher.Init(new Gflucene_index_data
|
||||
( Gflucene_analyzer_data.New_data_from_locale(wiki.Lang().Key_str())
|
||||
, wiki.Fsys_mgr().Root_dir().GenSubDir_nest("data", "search").Xto_api()));
|
||||
|
||||
// exec search
|
||||
searcher.Exec(list, new Gflucene_searcher_data(String_.new_u8(args.query), args.max_pages_per_wiki));
|
||||
searcher.Exec(list, new Gflucene_searcher_qry(String_.new_u8(args.query), args.max_pages_per_wiki));
|
||||
|
||||
// term
|
||||
searcher.Term();
|
||||
@ -34,7 +39,7 @@ public class Xofulltext_searcher__lucene implements Xofulltext_searcher {
|
||||
// loop list
|
||||
int len = list.Len();
|
||||
for (int i = 0; i < len; i++) {
|
||||
Gflucene_index_data found = (Gflucene_index_data)list.Get_at(i);
|
||||
Gflucene_doc_data found = (Gflucene_doc_data)list.Get_at(i);
|
||||
|
||||
// call page found
|
||||
Xofulltext_searcher_page page = new Xofulltext_searcher_page(args.query_id, wiki.Domain_str(), found.page_id, found.title, false);
|
||||
|
@ -7,5 +7,6 @@
|
||||
<classpathentry kind="lib" path="lib/lucene-memory-6.4.2.jar"/>
|
||||
<classpathentry kind="lib" path="lib/lucene-highlighter-6.4.2.jar"/>
|
||||
<classpathentry kind="lib" path="lib/lucene-queryparser-6.4.2.jar"/>
|
||||
<classpathentry kind="lib" path="C:/000/200_dev/110_java/gplx.gflucene/lib/lucene-analyzers-common-6.4.2.jar"/>
|
||||
<classpathentry kind="output" path="bin"/>
|
||||
</classpath>
|
||||
|
@ -0,0 +1,58 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.gflucene.analyzers; import gplx.*; import gplx.gflucene.*;
|
||||
import gplx.gflucene.core.*;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
public class Gflucene_analyzer_mgr_ {
|
||||
public static Analyzer New_analyzer(String key) {
|
||||
if (String_.Eq(key, "standard")) return new org.apache.lucene.analysis.standard.StandardAnalyzer();
|
||||
else if (String_.Eq(key, "ar")) return new org.apache.lucene.analysis.ar.ArabicAnalyzer();
|
||||
else if (String_.Eq(key, "bg")) return new org.apache.lucene.analysis.bg.BulgarianAnalyzer();
|
||||
// else if (String_.Eq(key, "br")) return new org.apache.lucene.analysis.br.BrazilianAnalyzer();
|
||||
else if (String_.Eq(key, "ca")) return new org.apache.lucene.analysis.ca.CatalanAnalyzer();
|
||||
else if (String_.Eq(key, "cjk")) return new org.apache.lucene.analysis.cjk.CJKAnalyzer();
|
||||
else if (String_.Eq(key, "ckb")) return new org.apache.lucene.analysis.ckb.SoraniAnalyzer();
|
||||
else if (String_.Eq(key, "cz")) return new org.apache.lucene.analysis.cz.CzechAnalyzer();
|
||||
else if (String_.Eq(key, "da")) return new org.apache.lucene.analysis.da.DanishAnalyzer();
|
||||
else if (String_.Eq(key, "de")) return new org.apache.lucene.analysis.de.GermanAnalyzer();
|
||||
else if (String_.Eq(key, "el")) return new org.apache.lucene.analysis.el.GreekAnalyzer();
|
||||
else if (String_.Eq(key, "en")) return new org.apache.lucene.analysis.en.EnglishAnalyzer();
|
||||
else if (String_.Eq(key, "es")) return new org.apache.lucene.analysis.es.SpanishAnalyzer();
|
||||
else if (String_.Eq(key, "eu")) return new org.apache.lucene.analysis.eu.BasqueAnalyzer();
|
||||
else if (String_.Eq(key, "fa")) return new org.apache.lucene.analysis.fa.PersianAnalyzer();
|
||||
else if (String_.Eq(key, "fi")) return new org.apache.lucene.analysis.fi.FinnishAnalyzer();
|
||||
else if (String_.Eq(key, "fr")) return new org.apache.lucene.analysis.fr.FrenchAnalyzer();
|
||||
else if (String_.Eq(key, "ga")) return new org.apache.lucene.analysis.ga.IrishAnalyzer();
|
||||
else if (String_.Eq(key, "gl")) return new org.apache.lucene.analysis.gl.GalicianAnalyzer();
|
||||
else if (String_.Eq(key, "hi")) return new org.apache.lucene.analysis.hi.HindiAnalyzer();
|
||||
else if (String_.Eq(key, "hu")) return new org.apache.lucene.analysis.hu.HungarianAnalyzer();
|
||||
else if (String_.Eq(key, "hy")) return new org.apache.lucene.analysis.hy.ArmenianAnalyzer();
|
||||
else if (String_.Eq(key, "id")) return new org.apache.lucene.analysis.id.IndonesianAnalyzer();
|
||||
else if (String_.Eq(key, "it")) return new org.apache.lucene.analysis.it.ItalianAnalyzer();
|
||||
else if (String_.Eq(key, "lt")) return new org.apache.lucene.analysis.lt.LithuanianAnalyzer();
|
||||
else if (String_.Eq(key, "lv")) return new org.apache.lucene.analysis.lv.LatvianAnalyzer();
|
||||
else if (String_.Eq(key, "nl")) return new org.apache.lucene.analysis.nl.DutchAnalyzer();
|
||||
else if (String_.Eq(key, "no")) return new org.apache.lucene.analysis.no.NorwegianAnalyzer();
|
||||
else if (String_.Eq(key, "pt")) return new org.apache.lucene.analysis.pt.PortugueseAnalyzer();
|
||||
else if (String_.Eq(key, "ro")) return new org.apache.lucene.analysis.ro.RomanianAnalyzer();
|
||||
else if (String_.Eq(key, "ru")) return new org.apache.lucene.analysis.ru.RussianAnalyzer();
|
||||
else if (String_.Eq(key, "sv")) return new org.apache.lucene.analysis.sv.SwedishAnalyzer();
|
||||
else if (String_.Eq(key, "th")) return new org.apache.lucene.analysis.th.ThaiAnalyzer();
|
||||
else if (String_.Eq(key, "tr")) return new org.apache.lucene.analysis.tr.TurkishAnalyzer();
|
||||
else throw Err_.new_unhandled_default(key);
|
||||
}
|
||||
}
|
@ -0,0 +1,35 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.gflucene.core; import gplx.*; import gplx.gflucene.*;
|
||||
public class Gflucene_analyzer_data {
|
||||
public final String key;
|
||||
public Gflucene_analyzer_data(String key) {
|
||||
this.key = key;
|
||||
}
|
||||
public static Gflucene_analyzer_data New_data_from_locale(String locale) {
|
||||
String key = null;
|
||||
if (String_.Eq(locale, "en")) key = "standard"; // NOTE: en exists but use standard for now
|
||||
else if (String_.EqAny(locale
|
||||
, "ar", "bg", "ca", "ckb", "cz", "da", "de", "el", "es", "eu", "fa", "fi", "fr", "ga", "gl", "hi"
|
||||
, "hu", "hy", "id", "it", "lt", "lv", "nl", "no", "pt", "ro", "ru", "sv", "th", "tr")
|
||||
) key = locale;
|
||||
else if (String_.EqAny(locale
|
||||
, "zh", "ja", "ko")
|
||||
) key = "cjk";
|
||||
else key = "standard";
|
||||
return new Gflucene_analyzer_data(key);
|
||||
}
|
||||
}
|
@ -13,14 +13,14 @@ The terms of each license can be found in the source code repository:
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.gflucene; import gplx.*;
|
||||
public class Gflucene_index_data {
|
||||
package gplx.gflucene.core; import gplx.*; import gplx.gflucene.*;
|
||||
public class Gflucene_doc_data {
|
||||
public String title;
|
||||
public String body;
|
||||
public int page_id;
|
||||
public int score;
|
||||
public float lucene_score = 0;
|
||||
public Gflucene_index_data(int page_id, int score, String title, String body) {
|
||||
public Gflucene_doc_data(int page_id, int score, String title, String body) {
|
||||
this.page_id = page_id;
|
||||
this.score = score;
|
||||
this.title = title;
|
@ -0,0 +1,25 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.gflucene.core; import gplx.*; import gplx.gflucene.*;
|
||||
import gplx.gflucene.analyzers.*;
|
||||
public class Gflucene_index_data {
|
||||
public final Gflucene_analyzer_data analyzer_data;
|
||||
public final String index_dir;
|
||||
public Gflucene_index_data(Gflucene_analyzer_data analyzer_data, String index_dir) {
|
||||
this.analyzer_data = analyzer_data;
|
||||
this.index_dir = index_dir;
|
||||
}
|
||||
}
|
@ -13,11 +13,13 @@ The terms of each license can be found in the source code repository:
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.gflucene; import gplx.*;
|
||||
package gplx.gflucene.indexers; import gplx.*; import gplx.gflucene.*;
|
||||
import gplx.gflucene.core.*;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.document.*;
|
||||
import org.apache.lucene.index.IndexOptions;
|
||||
@ -25,21 +27,25 @@ import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.FSDirectory;
|
||||
public class Gflucene_index_bldr {
|
||||
private final StandardAnalyzer analyzer = new StandardAnalyzer();
|
||||
private final IndexWriterConfig config;
|
||||
|
||||
import gplx.gflucene.analyzers.*;;
|
||||
public class Gflucene_indexer_mgr {
|
||||
private Analyzer analyzer;
|
||||
private IndexWriterConfig config;
|
||||
private Directory index;
|
||||
private IndexWriter wtr;
|
||||
private FieldType body_fld;
|
||||
|
||||
public Gflucene_index_bldr() {
|
||||
this.config = new IndexWriterConfig(analyzer);
|
||||
public Gflucene_indexer_mgr() {
|
||||
}
|
||||
|
||||
public void Init(String index_dir) {
|
||||
|
||||
public void Init(Gflucene_index_data idx_data) {
|
||||
// create analyzer
|
||||
this.analyzer = Gflucene_analyzer_mgr_.New_analyzer(idx_data.analyzer_data.key);
|
||||
this.config = new IndexWriterConfig(analyzer);
|
||||
|
||||
// create index
|
||||
Path path = Paths.get(index_dir);
|
||||
Path path = Paths.get(idx_data.index_dir);
|
||||
try {
|
||||
this.index = FSDirectory.open(path);
|
||||
} catch (IOException e) {
|
||||
@ -62,18 +68,18 @@ public class Gflucene_index_bldr {
|
||||
// body_fld.setStoreTermVectors(true);
|
||||
// body_fld.setStoreTermVectorOffsets(true);
|
||||
}
|
||||
public void Exec(Gflucene_index_data data) {
|
||||
public void Exec(Gflucene_doc_data doc_data) {
|
||||
// org.apache.lucene.document.
|
||||
Document doc = new Document();
|
||||
// doc.add(new SortedNumericDocValuesField("page_score", data.score));
|
||||
doc.add(new StoredField("page_score", data.score));
|
||||
doc.add(new StoredField("page_id", data.page_id));
|
||||
doc.add(new TextField("title", data.title, Field.Store.YES));
|
||||
doc.add(new Field("body", data.body, body_fld));
|
||||
doc.add(new StoredField("page_score", doc_data.score));
|
||||
doc.add(new StoredField("page_id", doc_data.page_id));
|
||||
doc.add(new TextField("title", doc_data.title, Field.Store.YES));
|
||||
doc.add(new Field("body", doc_data.body, body_fld));
|
||||
try {
|
||||
wtr.addDocument(doc);
|
||||
} catch (IOException e) {
|
||||
throw Err_.new_exc(e, "lucene_index", "failed to add document", "title", data.title);
|
||||
throw Err_.new_exc(e, "lucene_index", "failed to add document", "title", doc_data.title);
|
||||
}
|
||||
}
|
||||
public void Term() {
|
@ -13,12 +13,14 @@ The terms of each license can be found in the source code repository:
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.gflucene; import gplx.*;
|
||||
package gplx.gflucene.searchers; import gplx.*; import gplx.gflucene.*;
|
||||
import gplx.gflucene.core.*;
|
||||
import gplx.gflucene.analyzers.*;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
@ -29,22 +31,28 @@ import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.FSDirectory;
|
||||
public class Gflucene_searcher {
|
||||
private final StandardAnalyzer analyzer = new StandardAnalyzer();
|
||||
|
||||
import gplx.gflucene.indexers.*;
|
||||
public class Gflucene_searcher_mgr {
|
||||
private Analyzer analyzer;
|
||||
private Directory index;
|
||||
|
||||
public Gflucene_searcher() {
|
||||
public Gflucene_searcher_mgr() {
|
||||
}
|
||||
|
||||
public void Init(String index_dir) {
|
||||
Path path = Paths.get(index_dir);
|
||||
public void Init(Gflucene_index_data idx_data) {
|
||||
// create analyzer
|
||||
this.analyzer = Gflucene_analyzer_mgr_.New_analyzer(idx_data.analyzer_data.key);
|
||||
|
||||
// get index
|
||||
Path path = Paths.get(idx_data.index_dir);
|
||||
try {
|
||||
this.index = FSDirectory.open(path);
|
||||
} catch (IOException e) {
|
||||
throw Err_.new_exc(e, "lucene_index", "failed to init searcher", "dir", index_dir);
|
||||
throw Err_.new_exc(e, "lucene_index", "failed to init searcher", "dir", idx_data.index_dir);
|
||||
}
|
||||
}
|
||||
public void Exec(List_adp list, Gflucene_searcher_data data) {
|
||||
public void Exec(List_adp list, Gflucene_searcher_qry data) {
|
||||
try {
|
||||
IndexReader reader = DirectoryReader.open(index);
|
||||
IndexSearcher searcher = new IndexSearcher(reader);
|
||||
@ -58,7 +66,7 @@ public class Gflucene_searcher {
|
||||
for(int i = 0; i < hits.length; i++) {
|
||||
int docId = hits[i].doc;
|
||||
Document d = searcher.doc(docId);
|
||||
Gflucene_index_data doc = new Gflucene_index_data(Integer.parseInt(d.get("page_id")), Integer.parseInt(d.get("page_score")), d.get("title"), "");
|
||||
Gflucene_doc_data doc = new Gflucene_doc_data(Integer.parseInt(d.get("page_id")), Integer.parseInt(d.get("page_score")), d.get("title"), "");
|
||||
doc.lucene_score = hits[i].score;
|
||||
list.Add(doc);
|
||||
}
|
@ -13,11 +13,11 @@ The terms of each license can be found in the source code repository:
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.gflucene; import gplx.*;
|
||||
public class Gflucene_searcher_data {
|
||||
package gplx.gflucene.searchers; import gplx.*; import gplx.gflucene.*;
|
||||
public class Gflucene_searcher_qry {
|
||||
public String query;
|
||||
public int match_max;
|
||||
public Gflucene_searcher_data(String query, int match_max) {
|
||||
public Gflucene_searcher_qry(String query, int match_max) {
|
||||
this.query = query;
|
||||
this.match_max = match_max;
|
||||
}
|
Loading…
Reference in New Issue
Block a user