diff --git a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/bldrs/Xofulltext_indexer_wkr.java b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/bldrs/Xofulltext_indexer_wkr.java
index b16fe823b..15a5d4017 100644
--- a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/bldrs/Xofulltext_indexer_wkr.java
+++ b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/bldrs/Xofulltext_indexer_wkr.java
@@ -14,20 +14,24 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
 Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
 */
 package gplx.xowa.addons.wikis.fulltexts.indexers.bldrs; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*; import gplx.xowa.addons.wikis.fulltexts.indexers.*;
-import gplx.gflucene.*;
+import gplx.gflucene.core.*;
+import gplx.gflucene.indexers.*;
 public class Xofulltext_indexer_wkr {
-	private final Gflucene_index_bldr index_wtr = new Gflucene_index_bldr();
+	private final Gflucene_indexer_mgr index_wtr = new Gflucene_indexer_mgr();	// lucene index writer; initialized in Init
 	public void Init(Xow_wiki wiki) {
 		Io_url search_dir = wiki.Fsys_mgr().Root_dir().GenSubDir_nest("data", "search");
 		Io_mgr.Instance.DeleteDirDeep(search_dir);
-		index_wtr.Init(search_dir.Xto_api());
+		// init the writer on the emptied search dir, w/ an analyzer chosen from the wiki's language key
+		index_wtr.Init(new Gflucene_index_data
+			( Gflucene_analyzer_data.New_data_from_locale(wiki.Lang().Key_str())
+			, search_dir.Xto_api()));
 	}
 	public void Index(Xoae_page wpg) {
 		// TODO: skip if not main_ns
 		Index(wpg.Db().Page().Id(), wpg.Db().Page().Score(), wpg.Ttl().Page_txt(), wpg.Db().Html().Html_bry());
 	}
 	public void Index(int page_id, int score, byte[] ttl, byte[] html) {
-		Gflucene_index_data data = new Gflucene_index_data(page_id, score, String_.new_u8(ttl), String_.new_u8(html));
+		Gflucene_doc_data data = new Gflucene_doc_data(page_id, score, String_.new_u8(ttl), String_.new_u8(html));	// title and html are converted from utf8 bytes to String
 		index_wtr.Exec(data);
 	}
 	public void Term() {
diff --git 
a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/searchers/mgrs/gflucenes/Xofulltext_searcher__lucene.java b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/searchers/mgrs/gflucenes/Xofulltext_searcher__lucene.java index 740515865..522f5d0da 100644 --- a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/searchers/mgrs/gflucenes/Xofulltext_searcher__lucene.java +++ b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/searchers/mgrs/gflucenes/Xofulltext_searcher__lucene.java @@ -15,18 +15,23 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt */ package gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.gflucenes; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*; import gplx.xowa.addons.wikis.fulltexts.searchers.*; import gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.*; import gplx.gflucene.*; +import gplx.gflucene.core.*; +import gplx.gflucene.indexers.*; +import gplx.gflucene.searchers.*; import gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.uis.*; public class Xofulltext_searcher__lucene implements Xofulltext_searcher { - private final Gflucene_searcher searcher = new Gflucene_searcher(); + private final Gflucene_searcher_mgr searcher = new Gflucene_searcher_mgr(); public void Search(Xofulltext_searcher_ui cbk, Xow_wiki wiki, Xofulltext_searcher_args args) { // create list List_adp list = List_adp_.New(); // init searcher with wiki - searcher.Init(wiki.Fsys_mgr().Root_dir().GenSubDir_nest("data", "search").Xto_api()); + searcher.Init(new Gflucene_index_data + ( Gflucene_analyzer_data.New_data_from_locale(wiki.Lang().Key_str()) + , wiki.Fsys_mgr().Root_dir().GenSubDir_nest("data", "search").Xto_api())); // exec search - searcher.Exec(list, new Gflucene_searcher_data(String_.new_u8(args.query), args.max_pages_per_wiki)); + searcher.Exec(list, new Gflucene_searcher_qry(String_.new_u8(args.query), args.max_pages_per_wiki)); // term searcher.Term(); @@ -34,7 
+39,7 @@ public class Xofulltext_searcher__lucene implements Xofulltext_searcher { // loop list int len = list.Len(); for (int i = 0; i < len; i++) { - Gflucene_index_data found = (Gflucene_index_data)list.Get_at(i); + Gflucene_doc_data found = (Gflucene_doc_data)list.Get_at(i); // call page found Xofulltext_searcher_page page = new Xofulltext_searcher_page(args.query_id, wiki.Domain_str(), found.page_id, found.title, false); diff --git a/gplx.gflucene/.classpath b/gplx.gflucene/.classpath index 76ff4b329..f8007e6e1 100644 --- a/gplx.gflucene/.classpath +++ b/gplx.gflucene/.classpath @@ -7,5 +7,6 @@ + diff --git a/gplx.gflucene/src/gplx/gflucene/analyzers/Gflucene_analyzer_mgr_.java b/gplx.gflucene/src/gplx/gflucene/analyzers/Gflucene_analyzer_mgr_.java new file mode 100644 index 000000000..dfa9db99c --- /dev/null +++ b/gplx.gflucene/src/gplx/gflucene/analyzers/Gflucene_analyzer_mgr_.java @@ -0,0 +1,58 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012-2017 gnosygnu@gmail.com + +XOWA is licensed under the terms of the General Public License (GPL) Version 3, +or alternatively under the terms of the Apache License Version 2.0. + +You may use XOWA according to either of these licenses as is most appropriate +for your project on a case-by-case basis. 
+
+The terms of each license can be found in the source code repository:
+
+GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
+Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
+*/
+package gplx.gflucene.analyzers; import gplx.*; import gplx.gflucene.*;
+import gplx.gflucene.core.*;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+public class Gflucene_analyzer_mgr_ {	// static factory: creates a new Lucene Analyzer for an analyzer key
+	public static Analyzer New_analyzer(String key) {	// key is "standard", "cjk" or one of the language keys below; any other key throws
+		if      (String_.Eq(key, "standard"))		return new org.apache.lucene.analysis.standard.StandardAnalyzer();
+		else if (String_.Eq(key, "ar"))				return new org.apache.lucene.analysis.ar.ArabicAnalyzer();
+		else if (String_.Eq(key, "bg"))				return new org.apache.lucene.analysis.bg.BulgarianAnalyzer();
+//		else if (String_.Eq(key, "br"))				return new org.apache.lucene.analysis.br.BrazilianAnalyzer();	// NOTE(review): disabled; presumably because "br" is the language code for Breton, not Brazilian Portuguese - confirm
+		else if (String_.Eq(key, "ca"))				return new org.apache.lucene.analysis.ca.CatalanAnalyzer();
+		else if (String_.Eq(key, "cjk"))			return new org.apache.lucene.analysis.cjk.CJKAnalyzer();
+		else if (String_.Eq(key, "ckb"))			return new org.apache.lucene.analysis.ckb.SoraniAnalyzer();
+		else if (String_.Eq(key, "cz") || String_.Eq(key, "cs"))	return new org.apache.lucene.analysis.cz.CzechAnalyzer();	// "cz" is Lucene's package name; "cs" is the language code for Czech, so accept both
+		else if (String_.Eq(key, "da"))				return new org.apache.lucene.analysis.da.DanishAnalyzer();
+		else if (String_.Eq(key, "de"))				return new org.apache.lucene.analysis.de.GermanAnalyzer();
+		else if (String_.Eq(key, "el"))				return new org.apache.lucene.analysis.el.GreekAnalyzer();
+		else if (String_.Eq(key, "en"))				return new org.apache.lucene.analysis.en.EnglishAnalyzer();
+		else if (String_.Eq(key, "es"))				return new org.apache.lucene.analysis.es.SpanishAnalyzer();
+		else if (String_.Eq(key, "eu"))				return new org.apache.lucene.analysis.eu.BasqueAnalyzer();
+		else if (String_.Eq(key, "fa"))				return new org.apache.lucene.analysis.fa.PersianAnalyzer();
+		else if (String_.Eq(key, "fi"))				return new org.apache.lucene.analysis.fi.FinnishAnalyzer();
+		else if (String_.Eq(key, "fr"))				return new org.apache.lucene.analysis.fr.FrenchAnalyzer();
+		else if (String_.Eq(key, "ga"))				return new org.apache.lucene.analysis.ga.IrishAnalyzer();
+		else if (String_.Eq(key, "gl"))				return new org.apache.lucene.analysis.gl.GalicianAnalyzer();
+		else if (String_.Eq(key, "hi"))				return new org.apache.lucene.analysis.hi.HindiAnalyzer();
+		else if (String_.Eq(key, "hu"))				return new org.apache.lucene.analysis.hu.HungarianAnalyzer();
+		else if (String_.Eq(key, "hy"))				return new org.apache.lucene.analysis.hy.ArmenianAnalyzer();
+		else if (String_.Eq(key, "id"))				return new org.apache.lucene.analysis.id.IndonesianAnalyzer();
+		else if (String_.Eq(key, "it"))				return new org.apache.lucene.analysis.it.ItalianAnalyzer();
+		else if (String_.Eq(key, "lt"))				return new org.apache.lucene.analysis.lt.LithuanianAnalyzer();
+		else if (String_.Eq(key, "lv"))				return new org.apache.lucene.analysis.lv.LatvianAnalyzer();
+		else if (String_.Eq(key, "nl"))				return new org.apache.lucene.analysis.nl.DutchAnalyzer();
+		else if (String_.Eq(key, "no"))				return new org.apache.lucene.analysis.no.NorwegianAnalyzer();
+		else if (String_.Eq(key, "pt"))				return new org.apache.lucene.analysis.pt.PortugueseAnalyzer();
+		else if (String_.Eq(key, "ro"))				return new org.apache.lucene.analysis.ro.RomanianAnalyzer();
+		else if (String_.Eq(key, "ru"))				return new org.apache.lucene.analysis.ru.RussianAnalyzer();
+		else if (String_.Eq(key, "sv"))				return new org.apache.lucene.analysis.sv.SwedishAnalyzer();
+		else if (String_.Eq(key, "th"))				return new org.apache.lucene.analysis.th.ThaiAnalyzer();
+		else if (String_.Eq(key, "tr"))				return new org.apache.lucene.analysis.tr.TurkishAnalyzer();
+		else										throw Err_.new_unhandled_default(key);	// fail loudly rather than silently index with the wrong analyzer
+	}
+}
diff --git a/gplx.gflucene/src/gplx/gflucene/core/Gflucene_analyzer_data.java b/gplx.gflucene/src/gplx/gflucene/core/Gflucene_analyzer_data.java
new file mode 100644
index 000000000..ce0bfc3fe
--- 
/dev/null
+++ b/gplx.gflucene/src/gplx/gflucene/core/Gflucene_analyzer_data.java
@@ -0,0 +1,42 @@
+/*
+XOWA: the XOWA Offline Wiki Application
+Copyright (C) 2012-2017 gnosygnu@gmail.com
+
+XOWA is licensed under the terms of the General Public License (GPL) Version 3,
+or alternatively under the terms of the Apache License Version 2.0.
+
+You may use XOWA according to either of these licenses as is most appropriate
+for your project on a case-by-case basis.
+
+The terms of each license can be found in the source code repository:
+
+GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
+Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
+*/
+package gplx.gflucene.core; import gplx.*; import gplx.gflucene.*;
+/** Immutable holder for the key that identifies which analyzer an index uses. */
+public class Gflucene_analyzer_data {
+	public final String key;	// analyzer key; EX: "standard", "cjk", "de"
+	public Gflucene_analyzer_data(String key) {
+		this.key = key;
+	}
+	/**
+	 * Chooses the analyzer key for a language code.
+	 * "en" and unrecognized codes use "standard"; "zh", "ja" and "ko" share "cjk";
+	 * Czech ("cs") maps to "cz"; every other listed code is used as the key unchanged.
+	 */
+	public static Gflucene_analyzer_data New_data_from_locale(String locale) {
+		String key = null;
+		if      (String_.Eq(locale, "en")) key = "standard"; // NOTE: en exists but use standard for now
+		else if (String_.Eq(locale, "cs")) key = "cz";       // NOTE: language code for Czech is "cs", but the analyzer key is "cz"; w/o this, Czech falls through to "standard"
+		else if (String_.EqAny(locale
+			, "ar", "bg", "ca", "ckb", "cz", "da", "de", "el", "es", "eu", "fa", "fi", "fr", "ga", "gl", "hi"
+			, "hu", "hy", "id", "it", "lt", "lv", "nl", "no", "pt", "ro", "ru", "sv", "th", "tr")
+			) key = locale;
+		else if (String_.EqAny(locale
+			, "zh", "ja", "ko")
+			) key = "cjk";
+		else key = "standard";
+		return new Gflucene_analyzer_data(key);
+	}
+}
diff --git a/gplx.gflucene/src/gplx/gflucene/Gflucene_index_data.java b/gplx.gflucene/src/gplx/gflucene/core/Gflucene_doc_data.java
similarity index 80%
rename from gplx.gflucene/src/gplx/gflucene/Gflucene_index_data.java
rename to gplx.gflucene/src/gplx/gflucene/core/Gflucene_doc_data.java
index f17bdd08a..59694df2a 100644
--- a/gplx.gflucene/src/gplx/gflucene/Gflucene_index_data.java
+++ b/gplx.gflucene/src/gplx/gflucene/core/Gflucene_doc_data.java
@@ -13,14 +13,14 @@ The terms of each license can be found in 
the source code repository: GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt */ -package gplx.gflucene; import gplx.*; -public class Gflucene_index_data { +package gplx.gflucene.core; import gplx.*; import gplx.gflucene.*; +public class Gflucene_doc_data { public String title; public String body; public int page_id; public int score; public float lucene_score = 0; - public Gflucene_index_data(int page_id, int score, String title, String body) { + public Gflucene_doc_data(int page_id, int score, String title, String body) { this.page_id = page_id; this.score = score; this.title = title; diff --git a/gplx.gflucene/src/gplx/gflucene/core/Gflucene_index_data.java b/gplx.gflucene/src/gplx/gflucene/core/Gflucene_index_data.java new file mode 100644 index 000000000..41e987dfd --- /dev/null +++ b/gplx.gflucene/src/gplx/gflucene/core/Gflucene_index_data.java @@ -0,0 +1,25 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012-2017 gnosygnu@gmail.com + +XOWA is licensed under the terms of the General Public License (GPL) Version 3, +or alternatively under the terms of the Apache License Version 2.0. + +You may use XOWA according to either of these licenses as is most appropriate +for your project on a case-by-case basis. 
+
+The terms of each license can be found in the source code repository:
+
+GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
+Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
+*/
+package gplx.gflucene.core; import gplx.*; import gplx.gflucene.*;
+import gplx.gflucene.analyzers.*;
+public class Gflucene_index_data {	// immutable settings for one index: which analyzer to use and where the index lives
+	public final Gflucene_analyzer_data analyzer_data;	// holds the analyzer key
+	public final String index_dir;	// directory of the index, as a String path
+	public Gflucene_index_data(Gflucene_analyzer_data analyzer_data, String index_dir) {	// stores both args as-is; no validation
+		this.analyzer_data = analyzer_data;
+		this.index_dir = index_dir;
+	}
+}
diff --git a/gplx.gflucene/src/gplx/gflucene/Gflucene_index_bldr.java b/gplx.gflucene/src/gplx/gflucene/indexers/Gflucene_indexer_mgr.java
similarity index 72%
rename from gplx.gflucene/src/gplx/gflucene/Gflucene_index_bldr.java
rename to gplx.gflucene/src/gplx/gflucene/indexers/Gflucene_indexer_mgr.java
index ed4c4f69a..79530626f 100644
--- a/gplx.gflucene/src/gplx/gflucene/Gflucene_index_bldr.java
+++ b/gplx.gflucene/src/gplx/gflucene/indexers/Gflucene_indexer_mgr.java
@@ -13,11 +13,13 @@ The terms of each license can be found in the source code repository:
 GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
 Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
 */
-package gplx.gflucene; import gplx.*;
+package gplx.gflucene.indexers; import gplx.*; import gplx.gflucene.*;
+import gplx.gflucene.core.*;
 import java.io.IOException;
 import java.nio.file.Path;
 import java.nio.file.Paths;
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.*;
 import org.apache.lucene.index.IndexOptions;
@@ -25,21 +27,25 @@ import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
-public class 
Gflucene_index_bldr {
-	private final StandardAnalyzer analyzer = new StandardAnalyzer();
-	private final IndexWriterConfig config;
+
+import gplx.gflucene.analyzers.*;
+public class Gflucene_indexer_mgr {	// writes documents to a lucene index; call Init, then Exec per document
+	private Analyzer analyzer;	// created in Init from the index data's analyzer key
+	private IndexWriterConfig config;	// created in Init; wraps analyzer
 	private Directory index;
 	private IndexWriter wtr;
 	private FieldType body_fld;
-	public Gflucene_index_bldr() {
-		this.config = new IndexWriterConfig(analyzer);
+	public Gflucene_indexer_mgr() {
 	}
-	public void Init(String index_dir) {
-
+	public void Init(Gflucene_index_data idx_data) {
+		// create analyzer
+		this.analyzer = Gflucene_analyzer_mgr_.New_analyzer(idx_data.analyzer_data.key);
+		this.config = new IndexWriterConfig(analyzer);
+
 		// create index
-		Path path = Paths.get(index_dir);
+		Path path = Paths.get(idx_data.index_dir);
 		try {
 			this.index = FSDirectory.open(path);
 		} catch (IOException e) {
@@ -62,18 +68,18 @@ public class Gflucene_index_bldr {
 //		body_fld.setStoreTermVectors(true);
 //		body_fld.setStoreTermVectorOffsets(true);
 	}
-	public void Exec(Gflucene_index_data data) {
+	public void Exec(Gflucene_doc_data doc_data) {	// adds one document w/ fields page_score, page_id, title, body
 		// org.apache.lucene.document.
 		Document doc = new Document();
 //		doc.add(new SortedNumericDocValuesField("page_score", data.score));
-		doc.add(new StoredField("page_score", data.score));
-		doc.add(new StoredField("page_id", data.page_id));
-		doc.add(new TextField("title", data.title, Field.Store.YES));
-		doc.add(new Field("body", data.body, body_fld));
+		doc.add(new StoredField("page_score", doc_data.score));
+		doc.add(new StoredField("page_id", doc_data.page_id));
+		doc.add(new TextField("title", doc_data.title, Field.Store.YES));
+		doc.add(new Field("body", doc_data.body, body_fld));
 		try {
 			wtr.addDocument(doc);
 		} catch (IOException e) {
-			throw Err_.new_exc(e, "lucene_index", "failed to add document", "title", data.title);
+			throw Err_.new_exc(e, "lucene_index", "failed to add document", "title", doc_data.title);
 		}
 	}
 	public void Term() {
diff --git a/gplx.gflucene/src/gplx/gflucene/Gflucene_searcher.java b/gplx.gflucene/src/gplx/gflucene/searchers/Gflucene_searcher_mgr.java
similarity index 70%
rename from gplx.gflucene/src/gplx/gflucene/Gflucene_searcher.java
rename to gplx.gflucene/src/gplx/gflucene/searchers/Gflucene_searcher_mgr.java
index beea71bfe..f778af859 100644
--- a/gplx.gflucene/src/gplx/gflucene/Gflucene_searcher.java
+++ b/gplx.gflucene/src/gplx/gflucene/searchers/Gflucene_searcher_mgr.java
@@ -13,12 +13,14 @@ The terms of each license can be found in the source code repository:
 GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
 Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
 */
-package gplx.gflucene; import gplx.*;
+package gplx.gflucene.searchers; import gplx.*; import gplx.gflucene.*;
+import gplx.gflucene.core.*;
+import gplx.gflucene.analyzers.*;
 import java.io.IOException;
 import java.nio.file.Path;
 import java.nio.file.Paths;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.document.Document;
 import 
org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; @@ -29,22 +31,28 @@ import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; -public class Gflucene_searcher { - private final StandardAnalyzer analyzer = new StandardAnalyzer(); + +import gplx.gflucene.indexers.*; +public class Gflucene_searcher_mgr { + private Analyzer analyzer; private Directory index; - public Gflucene_searcher() { + public Gflucene_searcher_mgr() { } - public void Init(String index_dir) { - Path path = Paths.get(index_dir); + public void Init(Gflucene_index_data idx_data) { + // create analyzer + this.analyzer = Gflucene_analyzer_mgr_.New_analyzer(idx_data.analyzer_data.key); + + // get index + Path path = Paths.get(idx_data.index_dir); try { this.index = FSDirectory.open(path); } catch (IOException e) { - throw Err_.new_exc(e, "lucene_index", "failed to init searcher", "dir", index_dir); + throw Err_.new_exc(e, "lucene_index", "failed to init searcher", "dir", idx_data.index_dir); } } - public void Exec(List_adp list, Gflucene_searcher_data data) { + public void Exec(List_adp list, Gflucene_searcher_qry data) { try { IndexReader reader = DirectoryReader.open(index); IndexSearcher searcher = new IndexSearcher(reader); @@ -58,7 +66,7 @@ public class Gflucene_searcher { for(int i = 0; i < hits.length; i++) { int docId = hits[i].doc; Document d = searcher.doc(docId); - Gflucene_index_data doc = new Gflucene_index_data(Integer.parseInt(d.get("page_id")), Integer.parseInt(d.get("page_score")), d.get("title"), ""); + Gflucene_doc_data doc = new Gflucene_doc_data(Integer.parseInt(d.get("page_id")), Integer.parseInt(d.get("page_score")), d.get("title"), ""); doc.lucene_score = hits[i].score; list.Add(doc); } diff --git a/gplx.gflucene/src/gplx/gflucene/Gflucene_searcher_data.java b/gplx.gflucene/src/gplx/gflucene/searchers/Gflucene_searcher_qry.java 
similarity index 79% rename from gplx.gflucene/src/gplx/gflucene/Gflucene_searcher_data.java rename to gplx.gflucene/src/gplx/gflucene/searchers/Gflucene_searcher_qry.java index 61734f841..a10783b7e 100644 --- a/gplx.gflucene/src/gplx/gflucene/Gflucene_searcher_data.java +++ b/gplx.gflucene/src/gplx/gflucene/searchers/Gflucene_searcher_qry.java @@ -13,11 +13,11 @@ The terms of each license can be found in the source code repository: GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt */ -package gplx.gflucene; import gplx.*; -public class Gflucene_searcher_data { +package gplx.gflucene.searchers; import gplx.*; import gplx.gflucene.*; +public class Gflucene_searcher_qry { public String query; public int match_max; - public Gflucene_searcher_data(String query, int match_max) { + public Gflucene_searcher_qry(String query, int match_max) { this.query = query; this.match_max = match_max; }