Full-text search: Add IndexOptions to Indexer

pull/620/head
gnosygnu 7 years ago
parent 49924110f4
commit a9afa7a827

@ -61,7 +61,7 @@ public class Xomp_parse_mgr {
// init indexer
Xofulltext_indexer_wkr indexer = cfg.Indexer_enabled() ? new Xofulltext_indexer_wkr() : null;
if (indexer != null) indexer.Init(wiki);
if (indexer != null) indexer.Init(wiki, cfg.Indexer_opt());
// init parse_wkrs
for (int i = 0; i < wkr_len; ++i) {

@ -37,6 +37,7 @@ public class Xomp_parse_mgr_cfg implements Gfo_invk {
public String Wkr_machine_name() {return wkr_machine_name;} private String wkr_machine_name;
public boolean Show_msg__fetched_pool() {return show_msg__fetched_pool;} private boolean show_msg__fetched_pool;
public boolean Indexer_enabled() {return indexer_enabled;} private boolean indexer_enabled;
public String Indexer_opt() {return indexer_opt;} private String indexer_opt = gplx.gflucene.indexers.Gflucene_idx_opt.Docs_and_freqs.Key();
public void Init(Xowe_wiki wiki) {
if (num_wkrs == -1) num_wkrs = gplx.core.envs.Runtime_.Cpu_count();
if (num_pages_in_pool == -1) num_pages_in_pool = num_wkrs * 1000;
@ -66,6 +67,7 @@ public class Xomp_parse_mgr_cfg implements Gfo_invk {
else if (ctx.Match(k, Invk__hdump_catboxes_)) hdump_catboxs = m.ReadYn("v");
else if (ctx.Match(k, Invk__log_math_)) log_math = m.ReadYn("v");
else if (ctx.Match(k, "indexer_enabled_")) indexer_enabled = m.ReadYn("v");
else if (ctx.Match(k, "indexer_opt_")) indexer_opt = m.ReadStr("v");
else return Gfo_invk_.Rv_unhandled;
return this;
}

@ -15,9 +15,11 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.addons.wikis.fulltexts.indexers.bldrs; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*; import gplx.xowa.addons.wikis.fulltexts.indexers.*;
import gplx.xowa.wikis.nss.*;
import gplx.gflucene.indexers.*;
public class Xofulltext_indexer_args implements Gfo_invk {
public byte[] wikis;
public String ns_ids;
public String idx_opt;
public void Init_by_wiki(Xowe_wiki wiki) {
// wikis: null
if (wikis == null)
@ -34,15 +36,21 @@ public class Xofulltext_indexer_args implements Gfo_invk {
Xow_ns ns = ns_ary[i];
int ns_id = ns.Id();
if (ns_id < 0) continue; // ignore media, special
if (i != 0) bfr.Add_byte(Byte_ascii.Pipe);
if (i != 0) bfr.Add_byte(Byte_ascii.Comma);
bfr.Add_int_variable(ns_id);
}
ns_ids = bfr.To_str_and_clear();
}
// idx_opt
if (idx_opt == null) {
idx_opt = Gflucene_idx_opt.Docs_and_freqs.Key();
}
}
/**
 * Dispatches a keyed message to the matching field of this args object.
 * Recognized keys are "wikis_", "ns_ids" and "idx_opt"; each reads the "v" value from the message
 * (null when absent) into the corresponding field and returns this.
 * Any other key returns Gfo_invk_.Rv_unhandled.
 */
public Object Invk(GfsCtx ctx, int ikey, String k, GfoMsg m) {
	if (ctx.Match(k, "wikis_")) {
		this.wikis = m.ReadBryOr("v", null);
		return this;
	}
	if (ctx.Match(k, "ns_ids")) {
		this.ns_ids = m.ReadStrOr("v", null);
		return this;
	}
	if (ctx.Match(k, "idx_opt")) {
		this.idx_opt = m.ReadStrOr("v", null);
		return this;
	}
	return Gfo_invk_.Rv_unhandled;
}
@ -50,6 +58,7 @@ public class Xofulltext_indexer_args implements Gfo_invk {
Xofulltext_indexer_args rv = new Xofulltext_indexer_args();
rv.wikis = args.Get_as_bry("wikis");
rv.ns_ids = args.Get_as_str("ns_ids");
rv.idx_opt = args.Get_as_str("idx_opt");
return rv;
}
}

@ -23,7 +23,7 @@ public class Xofulltext_indexer_mgr {
public void Exec(Xowe_wiki wiki, Xofulltext_indexer_ui ui, Xofulltext_indexer_args args) {
// init indexer
Xofulltext_indexer_wkr indexer = new Xofulltext_indexer_wkr();
indexer.Init(wiki);
indexer.Init(wiki, args.idx_opt);
// get page tbl
Xow_db_file core_db = wiki.Data__core_mgr().Db__core();
@ -39,7 +39,7 @@ public class Xofulltext_indexer_mgr {
Db_rdr rdr = conn.Exec_rdr(Db_sql_.Make_by_fmt(String_.Ary
( "SELECT page_id, page_score, page_namespace, page_title, page_html_db_id"
, "FROM page"
, "WHERE page_namespace IN ({0});"), String_.Replace(args.ns_ids, "|", ",")));
, "WHERE page_namespace IN ({0});"), args.ns_ids));
while (rdr.Move_next()) {
// read vars
int page_namespace = rdr.Read_int("page_namespace");

@ -20,7 +20,7 @@ import gplx.xowa.addons.wikis.fulltexts.core.*;
public class Xofulltext_indexer_wkr {
private final Gflucene_indexer_mgr index_wtr = new Gflucene_indexer_mgr();
private final Xofulltext_extractor extractor = new Xofulltext_extractor();
public void Init(Xow_wiki wiki) {
public void Init(Xow_wiki wiki, String idx_opt) {
// delete existing dir
Io_url index_dir = Xosearch_fulltext_addon.Get_index_dir(wiki);
Io_mgr.Instance.DeleteDirDeep(index_dir);
@ -28,7 +28,9 @@ public class Xofulltext_indexer_wkr {
// init index_dir
index_wtr.Init(new Gflucene_index_data
( Gflucene_analyzer_data.New_data_from_locale(wiki.Lang().Key_str())
, index_dir.Xto_api()));
, index_dir.Xto_api())
, idx_opt
);
}
public void Index(Xoae_page wpg) {
byte[] html = extractor.Extract(wpg.Db().Html().Html_bry());

@ -1,36 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.addons.wikis.fulltexts.indexers.specials; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*; import gplx.xowa.addons.wikis.fulltexts.indexers.*;
import gplx.langs.mustaches.*;
/**
 * Mustache document item for the full-text indexer page.
 * Holds the two strings the template can ask for: "wikis" and "ns_ids".
 */
public class Xofulltext_indexer_doc implements Mustache_doc_itm {
	private final String wikis_bry;
	private final String ns_ids;
	public Xofulltext_indexer_doc(String wikis_bry, String ns_ids) {
		this.wikis_bry = wikis_bry;
		this.ns_ids = ns_ids;
	}
	/** Writes the value for a known key into bfr and returns true; returns false for any other key. */
	public boolean Mustache__write(String key, Mustache_bfr bfr) {
		if (String_.Eq(key, "wikis")) {
			bfr.Add_str_u8(wikis_bry);
			return true;
		}
		if (String_.Eq(key, "ns_ids")) {
			bfr.Add_str_u8(ns_ids);
			return true;
		}
		return false;
	}
	/** This item has no nested sections, so every key yields the shared empty array. */
	public Mustache_doc_itm[] Mustache__subs(String key) {
		return Mustache_doc_itm_.Ary__empty;
	}
}

@ -16,16 +16,28 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
package gplx.xowa.addons.wikis.fulltexts.indexers.specials; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*; import gplx.xowa.addons.wikis.fulltexts.indexers.*;
import gplx.xowa.specials.*; import gplx.langs.mustaches.*; import gplx.xowa.wikis.pages.*; import gplx.xowa.wikis.pages.tags.*;
import gplx.dbs.*;
class Xofulltext_indexer_html extends Xow_special_wtr__base {
private final String wikis_bry, ns_ids;
public Xofulltext_indexer_html(String wikis_bry, String ns_ids) {
class Xofulltext_indexer_html extends Xow_special_wtr__base implements Mustache_doc_itm {
private final String wikis_bry, ns_ids, idx_opt;
public Xofulltext_indexer_html(String wikis_bry, String ns_ids, String idx_opt) {
this.wikis_bry = wikis_bry;
this.ns_ids = ns_ids;
this.idx_opt = idx_opt;
}
/**
 * Writes the template value for key into bfr.
 * Known keys are "wikis", "ns_ids" and "idx_opt"; returns true after writing one of them,
 * and false (writing nothing) for any other key.
 */
public boolean Mustache__write(String key, Mustache_bfr bfr) {
	String val;
	if (String_.Eq(key, "wikis")) {
		val = wikis_bry;
	}
	else if (String_.Eq(key, "ns_ids")) {
		val = ns_ids;
	}
	else if (String_.Eq(key, "idx_opt")) {
		val = idx_opt;
	}
	else {
		return false;
	}
	bfr.Add_str_u8(val);
	return true;
}
public Mustache_doc_itm[] Mustache__subs(String key) {
return Mustache_doc_itm_.Ary__empty;
}
@Override protected Io_url Get_addon_dir(Xoa_app app) {return Addon_dir(app);}
@Override protected Io_url Get_mustache_fil(Io_url addon_dir) {return addon_dir.GenSubFil_nest("bin", "xofulltext_indexer.template.html");}
@Override protected Mustache_doc_itm Bld_mustache_root(Xoa_app app) {
return new Xofulltext_indexer_doc(wikis_bry, ns_ids);
return this;
}
@Override protected void Bld_tags(Xoa_app app, Io_url addon_dir, Xopage_html_data page_data) {
Xopg_tag_mgr head_tags = page_data.Head_tags();

@ -26,6 +26,7 @@ public class Xofulltext_indexer_special implements Xow_special_page {
new Xofulltext_indexer_html
( url_args.Read_str_or("wikis", wiki.Domain_str())
, url_args.Read_str_or("ns_ids", "0")
, url_args.Read_str_or("idx_opt", gplx.gflucene.indexers.Gflucene_idx_opt.Docs_and_freqs.Key())
).Bld_page_by_mustache(wiki.App(), page, this);
}
Xofulltext_indexer_special(Xow_special_meta special__meta) {this.special__meta = special__meta;}

@ -28,10 +28,17 @@ public class Xofulltext_args_qry {
public boolean auto_wildcard_end;
public boolean expand_matches_section;
public boolean show_all_matches;
private boolean canceled;
/**
 * Builds the key for this query from the given wiki, the given ns_ids and this query's search_text,
 * combined via Bry_.Add_w_dlm with a newline byte as the delimiter argument.
 */
public byte[] Qry_key(byte[] wiki, byte[] ns_ids) {
	// EX: "en.wikipedia.org\n0|4\nearth"
	byte[] qry_key = Bry_.Add_w_dlm(Byte_ascii.Nl, wiki, ns_ids, search_text);
	return qry_key;
}
/**
 * Marks this query as canceled so that code polling Canceled() can stop early.
 * The flag is written while holding this object's monitor.
 */
public void Cancel() {
	synchronized (this) {
		canceled = true;
	}
}
/**
 * Returns true once Cancel() has been called on this query.
 * The read takes the same monitor as the write in Cancel(): the flag is set by one thread and
 * polled by another, and without a matching lock (or a volatile field) the Java memory model
 * does not guarantee that the polling thread ever observes the update.
 */
public boolean Canceled() {
	synchronized (this) {
		return canceled;
	}
}
public static Xofulltext_args_qry New_by_json(Json_nde args) {
Xofulltext_args_qry rv = new Xofulltext_args_qry();

@ -46,6 +46,7 @@ class Xofulltext_highlighter_mgr implements Gfo_invk {
// loop items
int len = list.Len();
for (int i = 0; i < len; i++) {
if (searcher_args.Canceled()) return;
Gflucene_doc_data item = (Gflucene_doc_data)list.Get_at(i);
try {
Highlight_item(item);

@ -48,10 +48,12 @@ public class Xofulltext_searcher__lucene implements Xofulltext_searcher {
int found = 0;
Gflucene_searcher_qry searcher_data = new Gflucene_searcher_qry(String_.new_u8(args.search_text), 100);
while (found < needed_len) {
if (args.Canceled()) return;
searcher.Exec(temp_list, searcher_data);
int temp_list_len = temp_list.Len();
for (int i = 0; i < temp_list_len; i++) {
if (args.Canceled()) return;
Gflucene_doc_data doc_data = (Gflucene_doc_data)temp_list.Get_at(i);
if (!page_list.Has(doc_data.page_id)) {
// load page

@ -21,7 +21,8 @@ import gplx.xowa.addons.apps.cfgs.*;
class Xofulltext_searcher_html extends Xow_special_wtr__base implements Mustache_doc_itm {
private final boolean case_match, auto_wildcard_bgn, auto_wildcard_end, expand_matches_section, show_all_matches;
private final Hash_adp props = Hash_adp_.New();
public Xofulltext_searcher_html(Xocfg_mgr cfg_mgr, Gfo_qarg_mgr url_args, Xow_wiki wiki) {
public Xofulltext_searcher_html(Xocfg_mgr cfg_mgr, Gfo_qarg_mgr url_args, Xow_wiki wiki, Guid_adp page_guid) {
props.Add("page_guid", page_guid.To_str());
props.Add("cur_wiki", wiki.Domain_str());
props.Add("search", url_args.Read_str_or("search", ""));
props_Add(cfg_mgr, url_args, "wikis" , wiki.Domain_str());

@ -23,7 +23,7 @@ public class Xofulltext_searcher_special implements Xow_special_page {
Xocfg_mgr cfg_mgr = wiki.App().Cfg();
// create page
Xofulltext_searcher_html html = new Xofulltext_searcher_html(cfg_mgr, url_args, wiki);
Xofulltext_searcher_html html = new Xofulltext_searcher_html(cfg_mgr, url_args, wiki, page.Page_guid());
html.Bld_page_by_mustache(wiki.App(), page, this);
}
Xofulltext_searcher_special(Xow_special_meta special__meta) {this.special__meta = special__meta;}

@ -26,15 +26,17 @@ public class Xofulltext_searcher_bridge implements Bridge_cmd_itm {
Json_nde args = data.Get_kv(Bridge_cmd_mgr.Msg__args).Val_as_nde();
switch (proc_id) {
case Proc__search: svc.Search(args); break;
case Proc__cancel: svc.Cancel(args); break;
case Proc__get_lines_rest: svc.Get_lines_rest(args); break;
default: throw Err_.new_unhandled_default(proc_id);
}
return "";
}
private static final byte Proc__search = 0, Proc__get_lines_rest = 1;
private static final byte Proc__search = 0, Proc__cancel = 1, Proc__get_lines_rest = 2;
private static final Hash_adp_bry proc_hash = Hash_adp_bry.cs()
.Add_str_byte("search" , Proc__search)
.Add_str_byte("cancel" , Proc__cancel)
.Add_str_byte("get_lines_rest" , Proc__get_lines_rest)
;

@ -29,13 +29,23 @@ import gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.gflucenes.*;
import gplx.xowa.addons.wikis.fulltexts.searchers.mgrs.brutes.*;
class Xofulltext_searcher_svc implements Gfo_invk {
private final Xoa_app app;
private final Hash_adp hash = Hash_adp_.New();
public Xofulltext_searcher_svc(Xoa_app app) {
this.app = app;
}
/** Reads "page_guid" from the JSON args and cancels the query stored under it, if any. */
public void Cancel(Json_nde args) {
	String page_guid = args.Get_as_str("page_guid");
	this.Cancel(page_guid);
}
/** Cancels the query stored in hash under page_guid; does nothing when no entry is found. */
private void Cancel(String page_guid) {
	Xofulltext_args_qry tracked = (Xofulltext_args_qry)hash.Get_by(page_guid);
	if (tracked == null) return; // nothing registered for this page
	tracked.Cancel();
}
public void Search(Json_nde args) {
// get search_args
Xofulltext_args_qry search_args = Xofulltext_args_qry.New_by_json(args);
search_args.cache_mgr = this.Cache_mgr();
this.Cancel(search_args.page_guid);
// autosave any changes if enabled
Xocfg_mgr cfg_mgr = app.Cfg();
@ -49,6 +59,8 @@ class Xofulltext_searcher_svc implements Gfo_invk {
// cfg_mgr.Set_bry_app ("xowa.addon.search.fulltext.special.namespaces", search_args.namespaces);
}
hash.Add(search_args.page_guid, search_args);
// launch thread
gplx.core.threads.Thread_adp_.Start_by_val("search", Cancelable_.Never, this, Invk__search, search_args);
}

@ -18,8 +18,8 @@ import gplx.gflucene.analyzers.*;
public class Gflucene_index_data {
public final Gflucene_analyzer_data analyzer_data;
public final String index_dir;
public final float max_merged_segments = 1500; // "limits" maximum file size
public final boolean positional_enabled = false;
public Gflucene_index_data(Gflucene_analyzer_data analyzer_data, String index_dir) {
this.analyzer_data = analyzer_data;
this.index_dir = index_dir;

@ -0,0 +1,49 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.gflucene.indexers; import gplx.*; import gplx.gflucene.*;
/**
 * One selectable index option, identified by an int uid and a short String key, with a longer name.
 * The four predefined options are cumulative: docs, then +freqs, then +positions, then +offsets.
 * Each predefined option is registered by key so that Parse can resolve a key back to its instance.
 */
public class Gflucene_idx_opt {
// NOTE(review): this public constructor does not register the instance in parse_hash; only New() does
public Gflucene_idx_opt(int uid, String key, String name) {
this.uid = uid;
this.key = key;
this.name = name;
}
// int id; matches one of the Uid_* constants below for the predefined options
public int Uid() {return uid;} private final int uid;
// short String id used for lookup in Parse; EX: "df"
public String Key() {return key;} private final String key;
// longer label; EX: "Documents / Frequencies" (presumably for display; confirm against callers)
public String Name() {return name;} private final String name;
// uid values for the predefined options; declared as int constants so they can be used as switch-case labels
public static final int
Uid_docs = 0 // basic inverted word index; number of words is always 1 per doc
, Uid_docs_and_freqs = 1 // freqs needed for number of words per doc
, Uid_docs_and_freqs_and_positions = 2 // positions needed for proximity queries
, Uid_docs_and_freqs_and_positions_and_offsets = 3 // offsets needed for highlighter
;
// key -> option lookup filled by New(). Must stay declared ABOVE the option constants:
// static initializers run in textual order, and New() adds to this hash while the constants are being built
private static final Hash_adp parse_hash = Hash_adp_.New();
// predefined options; each call to New() also registers the option under its key
public static final Gflucene_idx_opt
Docs = New(Uid_docs, "d", "Documents")
, Docs_and_freqs = New(Uid_docs_and_freqs, "df", "Documents / Frequencies")
, Docs_and_freqs_and_positions = New(Uid_docs_and_freqs_and_positions, "dfp", "Documents / Frequencies / Positions")
, Docs_and_freqs_and_positions_and_offsets = New(Uid_docs_and_freqs_and_positions_and_offsets, "dfpo", "Documents / Frequencies / Positions / Offsets")
;
// creates an option and registers it in parse_hash under its key
private static Gflucene_idx_opt New(int uid, String key, String name) {
Gflucene_idx_opt rv = new Gflucene_idx_opt(uid, key, name);
parse_hash.Add(key, rv);
return rv;
}
// returns the option registered under key; EX: "df" -> Docs_and_freqs
// NOTE(review): lookup goes through Get_by_or_fail, so an unregistered key presumably raises an error instead of returning null; confirm in Hash_adp
public static Gflucene_idx_opt Parse(String key) {
return (Gflucene_idx_opt)parse_hash.Get_by_or_fail(key);
}
}

@ -40,7 +40,7 @@ public class Gflucene_indexer_mgr {
public Gflucene_indexer_mgr() {
}
public void Init(Gflucene_index_data idx_data) {
public void Init(Gflucene_index_data idx_data, String idx_opt) {
// create analyzer
this.analyzer = Gflucene_analyzer_mgr_.New_analyzer(idx_data.analyzer_data.key);
this.config = new IndexWriterConfig(analyzer);
@ -67,7 +67,7 @@ public class Gflucene_indexer_mgr {
// create field for body
this.body_fld_type = new FieldType();
IndexOptions index_options = idx_data.positional_enabled ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS : IndexOptions.DOCS_AND_FREQS;
IndexOptions index_options = To_index_options(idx_opt);
body_fld_type.setIndexOptions(index_options);
body_fld_type.setTokenized(true);
body_fld_type.setStored(false);
@ -110,4 +110,15 @@ public class Gflucene_indexer_mgr {
throw Err_.new_exc(e, "lucene_index", "failed to close writer");
}
}
}
/**
 * Converts an index-option key into the Lucene IndexOptions value with the same meaning.
 * The key is resolved through Gflucene_idx_opt.Parse and its uid is matched against the four known uids;
 * any other uid is reported through Err_.new_unhandled_default.
 */
private static IndexOptions To_index_options(String key) {
	int uid = Gflucene_idx_opt.Parse(key).Uid();
	if (uid == Gflucene_idx_opt.Uid_docs)
		return IndexOptions.DOCS;
	if (uid == Gflucene_idx_opt.Uid_docs_and_freqs)
		return IndexOptions.DOCS_AND_FREQS;
	if (uid == Gflucene_idx_opt.Uid_docs_and_freqs_and_positions)
		return IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
	if (uid == Gflucene_idx_opt.Uid_docs_and_freqs_and_positions_and_offsets)
		return IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
	throw Err_.new_unhandled_default(uid);
}
}

Loading…
Cancel
Save