From c91416801b57ed65823f2a9a2d18147dc29281d6 Mon Sep 17 00:00:00 2001 From: gnosygnu Date: Fri, 10 Mar 2017 13:24:41 -0500 Subject: [PATCH] Full-text search: Add lucene index generation --- 400_xowa/.classpath | 1 + .../mass_parses/parses/Xow_wiki_utl_.java | 3 ++ .../parses/mgrs/Xomp_parse_mgr.java | 8 ++++- .../parses/mgrs/Xomp_parse_mgr_cfg.java | 2 ++ .../parses/wkrs/Xomp_parse_wkr.java | 12 ++++++- .../fulltexts/indexers/Xosearch_indexer.java | 34 +++++++++++++++++++ .../gplx/xowa/wikis/metas/Xow_sys_cfg.java | 10 ++++-- gplx.gflucene/.classpath | 11 ++++++ 8 files changed, 76 insertions(+), 5 deletions(-) create mode 100644 400_xowa/src/gplx/xowa/addons/wikis/searchs/fulltexts/indexers/Xosearch_indexer.java create mode 100644 gplx.gflucene/.classpath diff --git a/400_xowa/.classpath b/400_xowa/.classpath index 30426ce39..51d3ee90a 100644 --- a/400_xowa/.classpath +++ b/400_xowa/.classpath @@ -9,5 +9,6 @@ + diff --git a/400_xowa/src/gplx/xowa/addons/bldrs/mass_parses/parses/Xow_wiki_utl_.java b/400_xowa/src/gplx/xowa/addons/bldrs/mass_parses/parses/Xow_wiki_utl_.java index 9cdd0ffda..71ebdf5d8 100644 --- a/400_xowa/src/gplx/xowa/addons/bldrs/mass_parses/parses/Xow_wiki_utl_.java +++ b/400_xowa/src/gplx/xowa/addons/bldrs/mass_parses/parses/Xow_wiki_utl_.java @@ -25,6 +25,9 @@ public class Xow_wiki_utl_ { rv.File_mgr().Repo_mgr().Clone(wiki.File_mgr().Repo_mgr()); rv.File__fsdb_mode().Tid__v2__bld__y_(); + // copy other members + rv.Sys_cfg().Copy(wiki.Sys_cfg()); + Clone_repos(wiki); return rv; } diff --git a/400_xowa/src/gplx/xowa/addons/bldrs/mass_parses/parses/mgrs/Xomp_parse_mgr.java b/400_xowa/src/gplx/xowa/addons/bldrs/mass_parses/parses/mgrs/Xomp_parse_mgr.java index 4bd5cb24d..b06c6d978 100644 --- a/400_xowa/src/gplx/xowa/addons/bldrs/mass_parses/parses/mgrs/Xomp_parse_mgr.java +++ b/400_xowa/src/gplx/xowa/addons/bldrs/mass_parses/parses/mgrs/Xomp_parse_mgr.java @@ -18,6 +18,7 @@ import gplx.core.threads.*; import gplx.core.threads.utils.*; import gplx.core.caches.*; import gplx.xowa.wikis.caches.*; import gplx.xowa.addons.bldrs.mass_parses.parses.wkrs.*; import gplx.xowa.addons.bldrs.mass_parses.dbs.*; import gplx.xowa.addons.bldrs.mass_parses.parses.pools.*; import gplx.xowa.addons.bldrs.mass_parses.parses.utls.*; import gplx.xowa.addons.bldrs.wmdumps.imglinks.*; +import gplx.xowa.addons.wikis.searchs.fulltexts.indexers.*; public class Xomp_parse_mgr { private Gfo_countdown_latch latch; public Xomp_parse_mgr_cfg Cfg() {return cfg;} private final Xomp_parse_mgr_cfg cfg = new Xomp_parse_mgr_cfg(); @@ -58,6 +59,10 @@ public class Xomp_parse_mgr { // init ns_ord_mgr Xomp_ns_ord_mgr ns_ord_mgr = new Xomp_ns_ord_mgr(Int_.Ary_parse(mgr_db.Tbl__cfg().Select_str("", Xomp_parse_wkr.Cfg__ns_ids), "|")); + // init indexer + Xosearch_indexer indexer = cfg.Indexer_enabled() ? new Xosearch_indexer() : null; + if (indexer != null) indexer.Init(wiki); + // init parse_wkrs for (int i = 0; i < wkr_len; ++i) { // make wiki @@ -65,7 +70,7 @@ public class Xomp_parse_mgr { wkr_wiki.Cache_mgr().Page_cache_(page_cache).Commons_cache_(commons_cache).Ifexist_cache_(ifexist_cache); // make wkr - Xomp_parse_wkr wkr = new Xomp_parse_wkr(this, cfg, mgr_db, page_pool, prog_mgr, file_orig_wkr, ns_ord_mgr, wkr_wiki, i + wkr_uid_bgn); + Xomp_parse_wkr wkr = new Xomp_parse_wkr(this, cfg, mgr_db, page_pool, prog_mgr, file_orig_wkr, ns_ord_mgr, wkr_wiki, indexer, i + wkr_uid_bgn); wkrs[i] = wkr; } @@ -78,6 +83,7 @@ public class Xomp_parse_mgr { // wait until wkrs are done latch.Await(); page_pool.Rls(); + if (indexer != null) indexer.Term(); // print stats Bry_bfr bfr = Bry_bfr_.New(); diff --git a/400_xowa/src/gplx/xowa/addons/bldrs/mass_parses/parses/mgrs/Xomp_parse_mgr_cfg.java b/400_xowa/src/gplx/xowa/addons/bldrs/mass_parses/parses/mgrs/Xomp_parse_mgr_cfg.java index 104e00fe4..f402f3183 100644 --- a/400_xowa/src/gplx/xowa/addons/bldrs/mass_parses/parses/mgrs/Xomp_parse_mgr_cfg.java +++ b/400_xowa/src/gplx/xowa/addons/bldrs/mass_parses/parses/mgrs/Xomp_parse_mgr_cfg.java @@ -36,6 +36,7 @@ public class Xomp_parse_mgr_cfg implements Gfo_invk { public Io_url Mgr_url() {return mgr_url;} private Io_url mgr_url; public String Wkr_machine_name() {return wkr_machine_name;} private String wkr_machine_name; public boolean Show_msg__fetched_pool() {return show_msg__fetched_pool;} private boolean show_msg__fetched_pool; + public boolean Indexer_enabled() {return indexer_enabled;} private boolean indexer_enabled; public void Init(Xowe_wiki wiki) { if (num_wkrs == -1) num_wkrs = gplx.core.envs.Runtime_.Cpu_count(); if (num_pages_in_pool == -1) num_pages_in_pool = num_wkrs * 1000; @@ -64,6 +65,7 @@ public class Xomp_parse_mgr_cfg implements Gfo_invk { else if (ctx.Match(k, Invk__show_msg__fetched_pool_)) show_msg__fetched_pool = m.ReadYn("v"); else if (ctx.Match(k, Invk__hdump_catboxes_)) hdump_catboxs = m.ReadYn("v"); else if (ctx.Match(k, Invk__log_math_)) log_math = m.ReadYn("v"); + else if (ctx.Match(k, "indexer_enabled_")) indexer_enabled = m.ReadYn("v"); else return Gfo_invk_.Rv_unhandled; return this; } diff --git a/400_xowa/src/gplx/xowa/addons/bldrs/mass_parses/parses/wkrs/Xomp_parse_wkr.java b/400_xowa/src/gplx/xowa/addons/bldrs/mass_parses/parses/wkrs/Xomp_parse_wkr.java index 8bc4dd82c..d6358f7d1 100644 --- a/400_xowa/src/gplx/xowa/addons/bldrs/mass_parses/parses/wkrs/Xomp_parse_wkr.java +++ b/400_xowa/src/gplx/xowa/addons/bldrs/mass_parses/parses/wkrs/Xomp_parse_wkr.java @@ -19,6 +19,7 @@ import gplx.xowa.files.origs.*; import gplx.xowa.htmls.core.bldrs.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.logs.*; import gplx.xowa.addons.bldrs.mass_parses.parses.mgrs.*; import gplx.xowa.addons.bldrs.mass_parses.parses.utls.*; import gplx.xowa.addons.bldrs.mass_parses.parses.*; import gplx.xowa.addons.bldrs.mass_parses.parses.pools.*; +import gplx.xowa.addons.wikis.searchs.fulltexts.indexers.*; public class Xomp_parse_wkr implements Gfo_invk { // mgr vars private final Xomp_parse_mgr mgr; @@ -39,13 +40,19 @@ public class Xomp_parse_wkr implements Gfo_invk { private final int uid; private Xomp_wkr_db wkr_db; + private final Xosearch_indexer indexer; + private final List_adp list = List_adp_.New(); private int list_idx = 0, list_len = 0; private int done_count; private long done_time; - public Xomp_parse_wkr(Xomp_parse_mgr mgr, Xomp_parse_mgr_cfg cfg, Xomp_mgr_db mgr_db, Xomp_page_pool page_pool, Xomp_prog_mgr prog_mgr, Xof_orig_wkr file_orig_wkr, Xomp_ns_ord_mgr ns_ord_mgr, Xowe_wiki wiki, int uid) { + public Xomp_parse_wkr(Xomp_parse_mgr mgr, Xomp_parse_mgr_cfg cfg + , Xomp_mgr_db mgr_db, Xomp_page_pool page_pool + , Xomp_prog_mgr prog_mgr, Xof_orig_wkr file_orig_wkr, Xomp_ns_ord_mgr ns_ord_mgr + , Xowe_wiki wiki, Xosearch_indexer indexer, int uid) { // mgr vars this.mgr = mgr; this.mgr_db = mgr_db; this.page_pool = page_pool; this.prog_mgr = prog_mgr; this.file_orig_wkr = file_orig_wkr; this.ns_ord_mgr = ns_ord_mgr; + this.indexer = indexer; // cfg vars this.cfg = cfg; @@ -125,6 +132,9 @@ public class Xomp_parse_wkr implements Gfo_invk { // gen_html hdump_bldr.Insert(pctx, wpg); + // index + if (indexer != null) indexer.Index(wpg); + // mark done for sake of progress prog_mgr.Mark_done(ppg.Id()); diff --git a/400_xowa/src/gplx/xowa/addons/wikis/searchs/fulltexts/indexers/Xosearch_indexer.java b/400_xowa/src/gplx/xowa/addons/wikis/searchs/fulltexts/indexers/Xosearch_indexer.java new file mode 100644 index 000000000..787b8c7cf --- /dev/null +++ b/400_xowa/src/gplx/xowa/addons/wikis/searchs/fulltexts/indexers/Xosearch_indexer.java @@ -0,0 +1,34 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012-2017 gnosygnu@gmail.com + +XOWA is licensed under the terms of the General Public License (GPL) Version 3, +or alternatively under the terms of the Apache License Version 2.0. + +You may use XOWA according to either of these licenses as is most appropriate +for your project on a case-by-case basis. + +The terms of each license can be found in the source code repository: + +GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt +Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt +*/ +package gplx.xowa.addons.wikis.searchs.fulltexts.indexers; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.searchs.*; import gplx.xowa.addons.wikis.searchs.fulltexts.*; +import gplx.gflucene.*; +public class Xosearch_indexer { + private final Gflucene_index_bldr index_wtr = new Gflucene_index_bldr(); + public void Init(Xow_wiki wiki) { + Io_url search_dir = wiki.Fsys_mgr().Root_dir().GenSubDir_nest("data", "search"); + Io_mgr.Instance.DeleteDirDeep(search_dir); + index_wtr.Init(search_dir.Xto_api()); + } + public void Index(Xoae_page wpg) { + // TODO: skip if not main_ns + byte[] html = wpg.Db().Html().Html_bry(); + + index_wtr.Exec(String_.new_u8(wpg.Ttl().Page_db()), String_.new_u8(html)); + } + public void Term() { + index_wtr.Term(); + } +} diff --git a/400_xowa/src/gplx/xowa/wikis/metas/Xow_sys_cfg.java b/400_xowa/src/gplx/xowa/wikis/metas/Xow_sys_cfg.java index b53e9a59f..509045b66 100644 --- a/400_xowa/src/gplx/xowa/wikis/metas/Xow_sys_cfg.java +++ b/400_xowa/src/gplx/xowa/wikis/metas/Xow_sys_cfg.java @@ -18,10 +18,14 @@ public class Xow_sys_cfg implements Gfo_invk { public Xow_sys_cfg(Xowe_wiki wiki) {} public boolean Xowa_cmd_enabled() {return xowa_cmd_enabled;} public Xow_sys_cfg Xowa_cmd_enabled_(boolean v) {xowa_cmd_enabled = v; return this;} private boolean xowa_cmd_enabled; public boolean Xowa_proto_enabled() {return xowa_proto_enabled;} public Xow_sys_cfg Xowa_proto_enabled_(boolean v) {xowa_proto_enabled = v; return this;} private boolean xowa_proto_enabled; + public void Copy(Xow_sys_cfg src) { + this.xowa_cmd_enabled = src.xowa_cmd_enabled; + this.xowa_proto_enabled = src.xowa_proto_enabled; + } public Object Invk(GfsCtx ctx, int ikey, String k, GfoMsg m) { - if (ctx.Match(k, Invk_xowa_cmd_enabled_)) xowa_cmd_enabled = m.ReadYn("v"); - else if (ctx.Match(k, Invk_xowa_cmd_enabled_)) xowa_proto_enabled = m.ReadYn("v"); + if (ctx.Match(k, "xowa_cmd_enabled_")) xowa_cmd_enabled = m.ReadYn("v"); + else if (ctx.Match(k, "xowa_proto_enabled_")) xowa_proto_enabled = m.ReadYn("v"); else return Gfo_invk_.Rv_unhandled; return this; - } private static final String Invk_xowa_cmd_enabled_ = "xowa_cmd_enabled_"; + } } diff --git a/gplx.gflucene/.classpath b/gplx.gflucene/.classpath new file mode 100644 index 000000000..f3d4d6d08 --- /dev/null +++ b/gplx.gflucene/.classpath @@ -0,0 +1,11 @@ + + + + + + + + + + +