Full-text search: Add analyzers for non-English languages

pull/620/head
gnosygnu 7 years ago
parent 77de7215ce
commit 3b6cc45084

@ -14,20 +14,24 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.addons.wikis.fulltexts.indexers.bldrs; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*; import gplx.xowa.addons.wikis.fulltexts.indexers.*;
import gplx.gflucene.*;
import gplx.gflucene.core.*;
import gplx.gflucene.indexers.*;
public class Xofulltext_indexer_wkr {
private final Gflucene_index_bldr index_wtr = new Gflucene_index_bldr();
private final Gflucene_indexer_mgr index_wtr = new Gflucene_indexer_mgr();
public void Init(Xow_wiki wiki) {
Io_url search_dir = wiki.Fsys_mgr().Root_dir().GenSubDir_nest("data", "search");
Io_mgr.Instance.DeleteDirDeep(search_dir);
index_wtr.Init(search_dir.Xto_api());
;
index_wtr.Init(new Gflucene_index_data
( Gflucene_analyzer_data.New_data_from_locale(wiki.Lang().Key_str())
, search_dir.Xto_api()));
}
public void Index(Xoae_page wpg) {
// TODO: skip if not main_ns
Index(wpg.Db().Page().Id(), wpg.Db().Page().Score(), wpg.Ttl().Page_txt(), wpg.Db().Html().Html_bry());
}
public void Index(int page_id, int score, byte[] ttl, byte[] html) {
Gflucene_index_data data = new Gflucene_index_data(page_id, score, String_.new_u8(ttl), String_.new_u8(html));
Gflucene_doc_data data = new Gflucene_doc_data(page_id, score, String_.new_u8(ttl), String_.new_u8(html));
index_wtr.Exec(data);
}
public void Term() {

@ -15,18 +15,23 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.gflucenes; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*; import gplx.xowa.addons.wikis.fulltexts.searchers.*; import gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.*;
import gplx.gflucene.*;
import gplx.gflucene.core.*;
import gplx.gflucene.indexers.*;
import gplx.gflucene.searchers.*;
import gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.uis.*;
public class Xofulltext_searcher__lucene implements Xofulltext_searcher {
private final Gflucene_searcher searcher = new Gflucene_searcher();
private final Gflucene_searcher_mgr searcher = new Gflucene_searcher_mgr();
public void Search(Xofulltext_searcher_ui cbk, Xow_wiki wiki, Xofulltext_searcher_args args) {
// create list
List_adp list = List_adp_.New();
// init searcher with wiki
searcher.Init(wiki.Fsys_mgr().Root_dir().GenSubDir_nest("data", "search").Xto_api());
searcher.Init(new Gflucene_index_data
( Gflucene_analyzer_data.New_data_from_locale(wiki.Lang().Key_str())
, wiki.Fsys_mgr().Root_dir().GenSubDir_nest("data", "search").Xto_api()));
// exec search
searcher.Exec(list, new Gflucene_searcher_data(String_.new_u8(args.query), args.max_pages_per_wiki));
searcher.Exec(list, new Gflucene_searcher_qry(String_.new_u8(args.query), args.max_pages_per_wiki));
// term
searcher.Term();
@ -34,7 +39,7 @@ public class Xofulltext_searcher__lucene implements Xofulltext_searcher {
// loop list
int len = list.Len();
for (int i = 0; i < len; i++) {
Gflucene_index_data found = (Gflucene_index_data)list.Get_at(i);
Gflucene_doc_data found = (Gflucene_doc_data)list.Get_at(i);
// call page found
Xofulltext_searcher_page page = new Xofulltext_searcher_page(args.query_id, wiki.Domain_str(), found.page_id, found.title, false);

@ -7,5 +7,6 @@
<classpathentry kind="lib" path="lib/lucene-memory-6.4.2.jar"/>
<classpathentry kind="lib" path="lib/lucene-highlighter-6.4.2.jar"/>
<classpathentry kind="lib" path="lib/lucene-queryparser-6.4.2.jar"/>
<!-- project-relative like the sibling lucene jars; an absolute C:/ path only resolves on one developer's machine -->
<classpathentry kind="lib" path="lib/lucene-analyzers-common-6.4.2.jar"/>
<classpathentry kind="output" path="bin"/>
</classpath>

@ -0,0 +1,58 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.gflucene.analyzers; import gplx.*; import gplx.gflucene.*;
import gplx.gflucene.core.*;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
public class Gflucene_analyzer_mgr_ {
public static Analyzer New_analyzer(String key) {
if (String_.Eq(key, "standard")) return new org.apache.lucene.analysis.standard.StandardAnalyzer();
else if (String_.Eq(key, "ar")) return new org.apache.lucene.analysis.ar.ArabicAnalyzer();
else if (String_.Eq(key, "bg")) return new org.apache.lucene.analysis.bg.BulgarianAnalyzer();
// else if (String_.Eq(key, "br")) return new org.apache.lucene.analysis.br.BrazilianAnalyzer();
else if (String_.Eq(key, "ca")) return new org.apache.lucene.analysis.ca.CatalanAnalyzer();
else if (String_.Eq(key, "cjk")) return new org.apache.lucene.analysis.cjk.CJKAnalyzer();
else if (String_.Eq(key, "ckb")) return new org.apache.lucene.analysis.ckb.SoraniAnalyzer();
else if (String_.Eq(key, "cz")) return new org.apache.lucene.analysis.cz.CzechAnalyzer();
else if (String_.Eq(key, "da")) return new org.apache.lucene.analysis.da.DanishAnalyzer();
else if (String_.Eq(key, "de")) return new org.apache.lucene.analysis.de.GermanAnalyzer();
else if (String_.Eq(key, "el")) return new org.apache.lucene.analysis.el.GreekAnalyzer();
else if (String_.Eq(key, "en")) return new org.apache.lucene.analysis.en.EnglishAnalyzer();
else if (String_.Eq(key, "es")) return new org.apache.lucene.analysis.es.SpanishAnalyzer();
else if (String_.Eq(key, "eu")) return new org.apache.lucene.analysis.eu.BasqueAnalyzer();
else if (String_.Eq(key, "fa")) return new org.apache.lucene.analysis.fa.PersianAnalyzer();
else if (String_.Eq(key, "fi")) return new org.apache.lucene.analysis.fi.FinnishAnalyzer();
else if (String_.Eq(key, "fr")) return new org.apache.lucene.analysis.fr.FrenchAnalyzer();
else if (String_.Eq(key, "ga")) return new org.apache.lucene.analysis.ga.IrishAnalyzer();
else if (String_.Eq(key, "gl")) return new org.apache.lucene.analysis.gl.GalicianAnalyzer();
else if (String_.Eq(key, "hi")) return new org.apache.lucene.analysis.hi.HindiAnalyzer();
else if (String_.Eq(key, "hu")) return new org.apache.lucene.analysis.hu.HungarianAnalyzer();
else if (String_.Eq(key, "hy")) return new org.apache.lucene.analysis.hy.ArmenianAnalyzer();
else if (String_.Eq(key, "id")) return new org.apache.lucene.analysis.id.IndonesianAnalyzer();
else if (String_.Eq(key, "it")) return new org.apache.lucene.analysis.it.ItalianAnalyzer();
else if (String_.Eq(key, "lt")) return new org.apache.lucene.analysis.lt.LithuanianAnalyzer();
else if (String_.Eq(key, "lv")) return new org.apache.lucene.analysis.lv.LatvianAnalyzer();
else if (String_.Eq(key, "nl")) return new org.apache.lucene.analysis.nl.DutchAnalyzer();
else if (String_.Eq(key, "no")) return new org.apache.lucene.analysis.no.NorwegianAnalyzer();
else if (String_.Eq(key, "pt")) return new org.apache.lucene.analysis.pt.PortugueseAnalyzer();
else if (String_.Eq(key, "ro")) return new org.apache.lucene.analysis.ro.RomanianAnalyzer();
else if (String_.Eq(key, "ru")) return new org.apache.lucene.analysis.ru.RussianAnalyzer();
else if (String_.Eq(key, "sv")) return new org.apache.lucene.analysis.sv.SwedishAnalyzer();
else if (String_.Eq(key, "th")) return new org.apache.lucene.analysis.th.ThaiAnalyzer();
else if (String_.Eq(key, "tr")) return new org.apache.lucene.analysis.tr.TurkishAnalyzer();
else throw Err_.new_unhandled_default(key);
}
}

@ -0,0 +1,35 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.gflucene.core; import gplx.*; import gplx.gflucene.*;
/**
 * Names the analyzer that an index is built and searched with.
 * The key is the string handed to Gflucene_analyzer_mgr_.New_analyzer by the indexer and searcher.
 */
public class Gflucene_analyzer_data {
	/** Analyzer key: "standard", "cjk", or a language-specific key such as "de". */
	public final String key;
	public Gflucene_analyzer_data(String key) {
		this.key = key;
	}
	/**
	 * Picks the analyzer key for a wiki language code.
	 * "en" and any language without a dedicated analyzer map to "standard";
	 * "zh", "ja" and "ko" share "cjk"; the listed languages map to their own key.
	 *
	 * @param locale wiki language code (callers pass wiki.Lang().Key_str())
	 * @return analyzer data whose key is never null
	 */
	public static Gflucene_analyzer_data New_data_from_locale(String locale) {
		String key = null;
		if      (String_.Eq(locale, "en")) key = "standard"; // NOTE: en exists but use standard for now
		// Czech's language code is "cs" (ISO 639-1), but the analyzer is registered under "cz"
		// (Lucene's package name); without this mapping Czech wikis silently fall back to "standard".
		else if (String_.Eq(locale, "cs")) key = "cz";
		else if (String_.EqAny(locale
			, "ar", "bg", "ca", "ckb", "cz", "da", "de", "el", "es", "eu", "fa", "fi", "fr", "ga", "gl", "hi"
			, "hu", "hy", "id", "it", "lt", "lv", "nl", "no", "pt", "ro", "ru", "sv", "th", "tr")
			) key = locale; // "cz" kept so existing callers passing it still resolve
		else if (String_.EqAny(locale
			, "zh", "ja", "ko")
			) key = "cjk";
		else key = "standard"; // no dedicated analyzer for this language
		return new Gflucene_analyzer_data(key);
	}
}

@ -13,14 +13,14 @@ The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.gflucene; import gplx.*;
public class Gflucene_index_data {
package gplx.gflucene.core; import gplx.*; import gplx.gflucene.*;
public class Gflucene_doc_data {
public String title;
public String body;
public int page_id;
public int score;
public float lucene_score = 0;
public Gflucene_index_data(int page_id, int score, String title, String body) {
public Gflucene_doc_data(int page_id, int score, String title, String body) {
this.page_id = page_id;
this.score = score;
this.title = title;

@ -0,0 +1,25 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.gflucene.core; import gplx.*; import gplx.gflucene.*;
import gplx.gflucene.analyzers.*;
/**
 * Immutable settings for opening one Lucene index: which analyzer to use and where the index lives.
 * Passed to the Init methods of the indexer and searcher managers.
 */
public class Gflucene_index_data {
	/** Analyzer selection; its key is resolved through Gflucene_analyzer_mgr_.New_analyzer. */
	public final Gflucene_analyzer_data analyzer_data;
	/** Path of the index directory; consumers convert it with Paths.get. */
	public final String index_dir;

	public Gflucene_index_data(Gflucene_analyzer_data analyzer_data, String index_dir) {
		this.index_dir = index_dir;
		this.analyzer_data = analyzer_data;
	}
}

@ -13,11 +13,13 @@ The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.gflucene; import gplx.*;
package gplx.gflucene.indexers; import gplx.*; import gplx.gflucene.*;
import gplx.gflucene.core.*;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.IndexOptions;
@ -25,21 +27,25 @@ import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
public class Gflucene_index_bldr {
private final StandardAnalyzer analyzer = new StandardAnalyzer();
private final IndexWriterConfig config;
import gplx.gflucene.analyzers.*;;
public class Gflucene_indexer_mgr {
private Analyzer analyzer;
private IndexWriterConfig config;
private Directory index;
private IndexWriter wtr;
private FieldType body_fld;
public Gflucene_index_bldr() {
this.config = new IndexWriterConfig(analyzer);
public Gflucene_indexer_mgr() {
}
public void Init(String index_dir) {
public void Init(Gflucene_index_data idx_data) {
// create analyzer
this.analyzer = Gflucene_analyzer_mgr_.New_analyzer(idx_data.analyzer_data.key);
this.config = new IndexWriterConfig(analyzer);
// create index
Path path = Paths.get(index_dir);
Path path = Paths.get(idx_data.index_dir);
try {
this.index = FSDirectory.open(path);
} catch (IOException e) {
@ -62,18 +68,18 @@ public class Gflucene_index_bldr {
// body_fld.setStoreTermVectors(true);
// body_fld.setStoreTermVectorOffsets(true);
}
public void Exec(Gflucene_index_data data) {
public void Exec(Gflucene_doc_data doc_data) {
// org.apache.lucene.document.
Document doc = new Document();
// doc.add(new SortedNumericDocValuesField("page_score", data.score));
doc.add(new StoredField("page_score", data.score));
doc.add(new StoredField("page_id", data.page_id));
doc.add(new TextField("title", data.title, Field.Store.YES));
doc.add(new Field("body", data.body, body_fld));
doc.add(new StoredField("page_score", doc_data.score));
doc.add(new StoredField("page_id", doc_data.page_id));
doc.add(new TextField("title", doc_data.title, Field.Store.YES));
doc.add(new Field("body", doc_data.body, body_fld));
try {
wtr.addDocument(doc);
} catch (IOException e) {
throw Err_.new_exc(e, "lucene_index", "failed to add document", "title", data.title);
throw Err_.new_exc(e, "lucene_index", "failed to add document", "title", doc_data.title);
}
}
public void Term() {

@ -13,12 +13,14 @@ The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.gflucene; import gplx.*;
package gplx.gflucene.searchers; import gplx.*; import gplx.gflucene.*;
import gplx.gflucene.core.*;
import gplx.gflucene.analyzers.*;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
@ -29,22 +31,28 @@ import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
public class Gflucene_searcher {
private final StandardAnalyzer analyzer = new StandardAnalyzer();
import gplx.gflucene.indexers.*;
public class Gflucene_searcher_mgr {
private Analyzer analyzer;
private Directory index;
public Gflucene_searcher() {
public Gflucene_searcher_mgr() {
}
public void Init(String index_dir) {
Path path = Paths.get(index_dir);
public void Init(Gflucene_index_data idx_data) {
// create analyzer
this.analyzer = Gflucene_analyzer_mgr_.New_analyzer(idx_data.analyzer_data.key);
// get index
Path path = Paths.get(idx_data.index_dir);
try {
this.index = FSDirectory.open(path);
} catch (IOException e) {
throw Err_.new_exc(e, "lucene_index", "failed to init searcher", "dir", index_dir);
throw Err_.new_exc(e, "lucene_index", "failed to init searcher", "dir", idx_data.index_dir);
}
}
public void Exec(List_adp list, Gflucene_searcher_data data) {
public void Exec(List_adp list, Gflucene_searcher_qry data) {
try {
IndexReader reader = DirectoryReader.open(index);
IndexSearcher searcher = new IndexSearcher(reader);
@ -58,7 +66,7 @@ public class Gflucene_searcher {
for(int i = 0; i < hits.length; i++) {
int docId = hits[i].doc;
Document d = searcher.doc(docId);
Gflucene_index_data doc = new Gflucene_index_data(Integer.parseInt(d.get("page_id")), Integer.parseInt(d.get("page_score")), d.get("title"), "");
Gflucene_doc_data doc = new Gflucene_doc_data(Integer.parseInt(d.get("page_id")), Integer.parseInt(d.get("page_score")), d.get("title"), "");
doc.lucene_score = hits[i].score;
list.Add(doc);
}

@ -13,11 +13,11 @@ The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.gflucene; import gplx.*;
public class Gflucene_searcher_data {
package gplx.gflucene.searchers; import gplx.*; import gplx.gflucene.*;
public class Gflucene_searcher_qry {
public String query;
public int match_max;
public Gflucene_searcher_data(String query, int match_max) {
public Gflucene_searcher_qry(String query, int match_max) {
this.query = query;
this.match_max = match_max;
}
Loading…
Cancel
Save