From a4380b6d48010b61aef0cd7c9f2a21c4efdaca53 Mon Sep 17 00:00:00 2001 From: gnosygnu Date: Fri, 17 Mar 2017 09:59:58 -0400 Subject: [PATCH] Full-text search: Specify namespaces for indexing --- 140_dbs/src/gplx/dbs/Db_conn.java | 2 +- .../bldrs/Xofulltext_indexer_args.java | 55 +++++++++++++++++++ .../bldrs/Xofulltext_indexer_cmd.java | 7 ++- .../bldrs/Xofulltext_indexer_mgr.java | 29 +++++++--- .../specials/Xofulltext_indexer_doc.java | 10 ++-- .../specials/Xofulltext_indexer_html.java | 8 +-- .../specials/Xofulltext_indexer_special.java | 4 +- .../indexers/svcs/Xofulltext_indexer_svc.java | 15 ++--- .../searchers/Gflucene_searcher_mgr.java | 8 +-- 9 files changed, 105 insertions(+), 33 deletions(-) create mode 100644 400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/bldrs/Xofulltext_indexer_args.java diff --git a/140_dbs/src/gplx/dbs/Db_conn.java b/140_dbs/src/gplx/dbs/Db_conn.java index d829c47e1..d1b20219f 100644 --- a/140_dbs/src/gplx/dbs/Db_conn.java +++ b/140_dbs/src/gplx/dbs/Db_conn.java @@ -105,7 +105,7 @@ public class Db_conn { public int Exec_sql_args(String sql, Object... 
args) {return this.Exec_qry(Db_qry_sql.dml_(String_.Format(sql, args)));} public int Exec_sql_plog_ntx(String msg, String sql) {return Exec_sql_plog(Bool_.N, msg, sql);} public int Exec_sql_plog_txn(String msg, String sql) {return Exec_sql_plog(Bool_.Y, msg, sql);} - public int Exec_sql_plog(boolean txn, String msg, String sql) { + public int Exec_sql_plog(boolean txn, String msg, String sql) { Gfo_usr_dlg_.Instance.Plog_many("", "", msg); if (txn) this.Txn_bgn(msg); int rv = Exec_sql(sql); diff --git a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/bldrs/Xofulltext_indexer_args.java b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/bldrs/Xofulltext_indexer_args.java new file mode 100644 index 000000000..a49f7fbcd --- /dev/null +++ b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/bldrs/Xofulltext_indexer_args.java @@ -0,0 +1,55 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012-2017 gnosygnu@gmail.com + +XOWA is licensed under the terms of the General Public License (GPL) Version 3, +or alternatively under the terms of the Apache License Version 2.0. + +You may use XOWA according to either of these licenses as is most appropriate +for your project on a case-by-case basis. 
+
+The terms of each license can be found in the source code repository:
+
+GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
+Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
+*/
+package gplx.xowa.addons.wikis.fulltexts.indexers.bldrs; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*; import gplx.xowa.addons.wikis.fulltexts.indexers.*;
+import gplx.xowa.wikis.nss.*;
+public class Xofulltext_indexer_args implements Gfo_invk { // options for one full-text indexing run: which wikis, which namespaces
+  public byte[] wikis;   // wikis to index; when null, Init_by_wiki substitutes the current wiki's domain
+  public String ns_ids;  // pipe-delimited namespace ids (EX: "0|4"); null means "0"; "*" means every non-negative id of the wiki
+  public void Init_by_wiki(Xowe_wiki wiki) { // fills in defaults from wiki and expands the "*" wildcard; call before reading the fields
+    // wikis: null -> use the domain of the wiki being indexed
+    if (wikis == null)
+      wikis = wiki.Domain_bry();
+
+    // ns: null -> main namespace only; "*" -> all namespaces whose id is >= 0
+    if (ns_ids == null)
+      ns_ids = "0";
+    else if (String_.Eq(ns_ids, "*")) {
+      Xow_ns[] ns_ary = wiki.Ns_mgr().Ords_ary();
+      Bry_bfr bfr = Bry_bfr_.New();
+      boolean id_added = false; // true once at least one id has been written to bfr
+      for (int i = 0; i < ns_ary.length; i++) {
+        int ns_id = ns_ary[i].Id();
+        if (ns_id < 0) continue; // ignore media, special
+        if (id_added) bfr.Add_byte(Byte_ascii.Pipe); // not "i != 0": if index 0 is skipped, that test emits a leading pipe, which the indexer turns into "IN (,0,...)"
+        bfr.Add_int_variable(ns_id);
+        id_added = true;
+      }
+      ns_ids = bfr.To_str_and_clear();
+    }
+  }
+  public Object Invk(GfsCtx ctx, int ikey, String k, GfoMsg m) { // Gfo_invk entry point: assigns a field from msg arg "v" (null fallback) and returns this
+    if      (ctx.Match(k, "wikis_")) this.wikis = m.ReadBryOr("v", null);
+    else if (ctx.Match(k, "ns_ids")) this.ns_ids = m.ReadStrOr("v", null);
+    else return Gfo_invk_.Rv_unhandled; // any other key is reported as unhandled
+    return this;
+  }
+  public static Xofulltext_indexer_args New_by_json(gplx.langs.jsons.Json_nde args) { // builds args from the "wikis" and "ns_ids" members of a json node
+    Xofulltext_indexer_args rv = new Xofulltext_indexer_args();
+    rv.wikis = args.Get_as_bry("wikis");
+    rv.ns_ids = args.Get_as_str("ns_ids"); // NOTE(review): value is later spliced into SQL by Xofulltext_indexer_mgr without validation -- confirm callers are trusted
+    return rv;
+  }
+}
diff --git a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/bldrs/Xofulltext_indexer_cmd.java b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/bldrs/Xofulltext_indexer_cmd.java
index 0c7dc555a..02d107560 100644
--- 
a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/bldrs/Xofulltext_indexer_cmd.java +++ b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/bldrs/Xofulltext_indexer_cmd.java @@ -16,10 +16,15 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt package gplx.xowa.addons.wikis.fulltexts.indexers.bldrs; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*; import gplx.xowa.addons.wikis.fulltexts.indexers.*; import gplx.xowa.bldrs.*; import gplx.xowa.bldrs.wkrs.*; public class Xofulltext_indexer_cmd extends Xob_cmd__base { + private final Xofulltext_indexer_args args = new Xofulltext_indexer_args(); public Xofulltext_indexer_cmd(Xob_bldr bldr, Xowe_wiki wiki) {super(bldr, wiki);} @Override public void Cmd_run() { wiki.Init_assert(); - new Xofulltext_indexer_mgr().Exec(wiki, null); + new Xofulltext_indexer_mgr().Exec(wiki, null, args); + } + @Override public Object Invk(GfsCtx ctx, int ikey, String k, GfoMsg m) { + if (ctx.Match(k, "args")) return args; + else return Gfo_invk_.Rv_unhandled; } @Override public String Cmd_key() {return "search.index";} diff --git a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/bldrs/Xofulltext_indexer_mgr.java b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/bldrs/Xofulltext_indexer_mgr.java index d1892815b..c8771e09d 100644 --- a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/bldrs/Xofulltext_indexer_mgr.java +++ b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/bldrs/Xofulltext_indexer_mgr.java @@ -20,19 +20,28 @@ import gplx.xowa.wikis.data.*; import gplx.xowa.htmls.core.dbs.*; import gplx.xowa.addons.wikis.fulltexts.indexers.svcs.*; public class Xofulltext_indexer_mgr { - public void Exec(Xowe_wiki wiki, Xofulltext_indexer_ui ui) { - Xow_db_file core_db = wiki.Data__core_mgr().Db__core(); - gplx.xowa.wikis.data.tbls.Xowd_page_tbl page_tbl = core_db.Tbl__page(); - - Xoh_page hpg = 
new Xoh_page(); - + public void Exec(Xowe_wiki wiki, Xofulltext_indexer_ui ui, Xofulltext_indexer_args args) { + // init indexer Xofulltext_indexer_wkr indexer = new Xofulltext_indexer_wkr(); indexer.Init(wiki); - Db_conn conn = page_tbl.Conn(); - Db_rdr rdr = conn.Exec_rdr("SELECT page_id, page_score, page_namespace, page_title, page_html_db_id FROM page WHERE page_namespace = 0;"); + // get page tbl + Xow_db_file core_db = wiki.Data__core_mgr().Db__core(); + gplx.xowa.wikis.data.tbls.Xowd_page_tbl page_tbl = core_db.Tbl__page(); + + // init args + args.Init_by_wiki(wiki); int count = 0; + Xoh_page hpg = new Xoh_page(); + + // get rdr and loop + Db_conn conn = page_tbl.Conn(); + Db_rdr rdr = conn.Exec_rdr(Db_sql_.Make_by_fmt(String_.Ary + ( "SELECT page_id, page_score, page_namespace, page_title, page_html_db_id" + , "FROM page" + , "WHERE page_namespace IN ({0});"), String_.Replace(args.ns_ids, "|", ","))); while (rdr.Move_next()) { + // read vars int page_namespace = rdr.Read_int("page_namespace"); byte[] page_ttl_bry = rdr.Read_bry_by_str("page_title"); int page_id = rdr.Read_int("page_id"); @@ -52,7 +61,10 @@ public class Xofulltext_indexer_mgr { continue; byte[] html_text = wiki.Html__hdump_mgr().Load_mgr().Parse(hpg, hpg.Db().Html().Zip_tid(), hpg.Db().Html().Hzip_tid(), hpg.Db().Html().Html_bry()); + // run index indexer.Index(page_id, page_score, page_ttl.Page_txt(), html_text); + + // notify if ((++count % 10000) == 0) { Gfo_usr_dlg_.Instance.Prog_many("", "", "indexing page: ~{0}", count); if (ui != null) @@ -63,6 +75,7 @@ public class Xofulltext_indexer_mgr { } } + // term indexer indexer.Term(); } } diff --git a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/specials/Xofulltext_indexer_doc.java b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/specials/Xofulltext_indexer_doc.java index 01f933354..269167cfa 100644 --- a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/specials/Xofulltext_indexer_doc.java +++ 
b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/specials/Xofulltext_indexer_doc.java @@ -16,14 +16,16 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt package gplx.xowa.addons.wikis.fulltexts.indexers.specials; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*; import gplx.xowa.addons.wikis.fulltexts.indexers.*; import gplx.langs.mustaches.*; public class Xofulltext_indexer_doc implements Mustache_doc_itm { - private final byte[] wikis_bry; - public Xofulltext_indexer_doc - ( byte[] wikis_bry) { + private final String wikis_bry, ns_ids; + public Xofulltext_indexer_doc(String wikis_bry, String ns_ids) { this.wikis_bry = wikis_bry; + this.ns_ids = ns_ids; } public boolean Mustache__write(String key, Mustache_bfr bfr) { if (String_.Eq(key, "wikis")) - bfr.Add_bry(wikis_bry); + bfr.Add_str_u8(wikis_bry); + else if (String_.Eq(key, "ns_ids")) + bfr.Add_str_u8(ns_ids); else return false; return true; diff --git a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/specials/Xofulltext_indexer_html.java b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/specials/Xofulltext_indexer_html.java index b0136696c..003eb696e 100644 --- a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/specials/Xofulltext_indexer_html.java +++ b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/specials/Xofulltext_indexer_html.java @@ -17,15 +17,15 @@ package gplx.xowa.addons.wikis.fulltexts.indexers.specials; import gplx.*; impor import gplx.xowa.specials.*; import gplx.langs.mustaches.*; import gplx.xowa.wikis.pages.*; import gplx.xowa.wikis.pages.tags.*; import gplx.dbs.*; class Xofulltext_indexer_html extends Xow_special_wtr__base { - private final byte[] wikis_bry; - public Xofulltext_indexer_html - ( byte[] wikis_bry) { + private final String wikis_bry, ns_ids; + public Xofulltext_indexer_html(String wikis_bry, String ns_ids) { this.wikis_bry 
= wikis_bry; + this.ns_ids = ns_ids; } @Override protected Io_url Get_addon_dir(Xoa_app app) {return Addon_dir(app);} @Override protected Io_url Get_mustache_fil(Io_url addon_dir) {return addon_dir.GenSubFil_nest("bin", "xofulltext_indexer.template.html");} @Override protected Mustache_doc_itm Bld_mustache_root(Xoa_app app) { - return new Xofulltext_indexer_doc(wikis_bry); + return new Xofulltext_indexer_doc(wikis_bry, ns_ids); } @Override protected void Bld_tags(Xoa_app app, Io_url addon_dir, Xopage_html_data page_data) { Xopg_tag_mgr head_tags = page_data.Head_tags(); diff --git a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/specials/Xofulltext_indexer_special.java b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/specials/Xofulltext_indexer_special.java index 9cc395f60..05ed7f6b8 100644 --- a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/specials/Xofulltext_indexer_special.java +++ b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/specials/Xofulltext_indexer_special.java @@ -20,12 +20,12 @@ public class Xofulltext_indexer_special implements Xow_special_page { public void Special__gen(Xow_wiki wiki, Xoa_page page, Xoa_url url, Xoa_ttl ttl) { // get qry if any Gfo_qarg_mgr url_args = new Gfo_qarg_mgr().Init(url.Qargs_ary()); - byte[] wikis_bry = url_args.Read_bry_or("wikis", Bry_.Empty); // get options and create page // Xocfg_mgr cfg_mgr = wiki.App().Cfg(); new Xofulltext_indexer_html - ( wikis_bry + ( url_args.Read_str_or("wikis", wiki.Domain_str()) + , url_args.Read_str_or("ns_ids", "0") ).Bld_page_by_mustache(wiki.App(), page, this); } Xofulltext_indexer_special(Xow_special_meta special__meta) {this.special__meta = special__meta;} diff --git a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/svcs/Xofulltext_indexer_svc.java b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/svcs/Xofulltext_indexer_svc.java index 46ea66f45..fed6b3920 100644 --- 
a/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/svcs/Xofulltext_indexer_svc.java +++ b/400_xowa/src/gplx/xowa/addons/wikis/fulltexts/indexers/svcs/Xofulltext_indexer_svc.java @@ -29,8 +29,7 @@ class Xofulltext_indexer_svc implements Gfo_invk { } public void Index(Json_nde args) { // create args - byte[] wikis_bry = args.Get_as_bry("wikis"); - Xofulltext_indexer_args indexer_args = new Xofulltext_indexer_args(wikis_bry); + Xofulltext_indexer_args indexer_args = Xofulltext_indexer_args.New_by_json(args); // launch thread gplx.core.threads.Thread_adp_.Start_by_val("index", Cancelable_.Never, this, Invk__index, indexer_args); @@ -47,6 +46,7 @@ class Xofulltext_indexer_svc implements Gfo_invk { continue; } + // check if dir exists wiki.Init_by_wiki(); Io_url search_dir = Xosearch_fulltext_addon.Get_index_dir(wiki); if (Io_mgr.Instance.ExistsDir(search_dir)) { @@ -55,11 +55,14 @@ class Xofulltext_indexer_svc implements Gfo_invk { continue; } + // notify bgn app.Gui__cbk_mgr().Send_json(cbk_trg, "xo.fulltext_indexer.status__note__recv", gplx.core.gfobjs.Gfobj_nde.New() .Add_str("note", Datetime_now.Get().XtoStr_fmt_yyyy_MM_dd_HH_mm_ss() + ": wiki index started: " + String_.new_u8(domain))); - new Xofulltext_indexer_mgr().Exec((Xowe_wiki)wiki, new Xofulltext_indexer_ui(app.Gui__cbk_mgr(), cbk_trg)); + // run index + new Xofulltext_indexer_mgr().Exec((Xowe_wiki)wiki, new Xofulltext_indexer_ui(app.Gui__cbk_mgr(), cbk_trg), args); + // notify end app.Gui__cbk_mgr().Send_json(cbk_trg, "xo.fulltext_indexer.status__note__recv", gplx.core.gfobjs.Gfobj_nde.New() .Add_str("note", Datetime_now.Get().XtoStr_fmt_yyyy_MM_dd_HH_mm_ss() + ": wiki index ended: " + String_.new_u8(domain))); } @@ -72,9 +75,3 @@ class Xofulltext_indexer_svc implements Gfo_invk { } private static final String Invk__index = "index"; } -class Xofulltext_indexer_args { - public byte[] wikis; - public Xofulltext_indexer_args(byte[] wikis) { - this.wikis = wikis; - } -} diff --git 
a/gplx.gflucene/src/gplx/gflucene/searchers/Gflucene_searcher_mgr.java b/gplx.gflucene/src/gplx/gflucene/searchers/Gflucene_searcher_mgr.java index 974c561cb..7943c1617 100644 --- a/gplx.gflucene/src/gplx/gflucene/searchers/Gflucene_searcher_mgr.java +++ b/gplx.gflucene/src/gplx/gflucene/searchers/Gflucene_searcher_mgr.java @@ -63,13 +63,13 @@ public class Gflucene_searcher_mgr { IndexReader reader = DirectoryReader.open(index); IndexSearcher searcher = new IndexSearcher(reader); - Query query = new QueryParser("body", analyzer).parse(data.query); -// Query multi_query = MultiFieldQueryParser.parse(data.query, new String[] {"body"}, new BooleanClause.Occur []{BooleanClause.Occur.SHOULD}, analyzer); +// Query query = new QueryParser("body", analyzer).parse(data.query); + Query multi_query = MultiFieldQueryParser.parse(data.query, new String[] {"body"}, new BooleanClause.Occur []{BooleanClause.Occur.SHOULD}, analyzer); // Query body_query = new QueryParser("body", analyzer).parse(data.query); // Query title_query = new QueryParser("title", analyzer).parse(data.query); -// FunctionQuery boost_query = new FunctionQuery(new LongFieldSource("page_score")); -// CustomScoreQuery query = new CustomScoreQuery(multi_query, boost_query); + FunctionQuery boost_query = new FunctionQuery(new LongFieldSource("page_score")); + CustomScoreQuery query = new CustomScoreQuery(multi_query, boost_query); // TopDocs docs = searcher.search(query, reader.maxDoc()); TopDocs docs = searcher.search(query, data.match_max);