Full-text search: Add Lucene index generation

pull/620/head
gnosygnu 7 years ago
parent 062d958ead
commit c91416801b

@ -9,5 +9,6 @@
<classpathentry exported="true" kind="lib" path="lib/jtidy_xowa.jar"/>
<classpathentry exported="true" kind="lib" path="lib/icu4j-4_8.jar"/>
<classpathentry exported="true" kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
<classpathentry combineaccessrules="false" kind="src" path="/gplx.gflucene"/>
<classpathentry kind="output" path="bin"/>
</classpath>

@ -25,6 +25,9 @@ public class Xow_wiki_utl_ {
rv.File_mgr().Repo_mgr().Clone(wiki.File_mgr().Repo_mgr());
rv.File__fsdb_mode().Tid__v2__bld__y_();
// copy other members
rv.Sys_cfg().Copy(wiki.Sys_cfg());
Clone_repos(wiki);
return rv;
}

@ -18,6 +18,7 @@ import gplx.core.threads.*; import gplx.core.threads.utils.*;
import gplx.core.caches.*; import gplx.xowa.wikis.caches.*;
import gplx.xowa.addons.bldrs.mass_parses.parses.wkrs.*; import gplx.xowa.addons.bldrs.mass_parses.dbs.*; import gplx.xowa.addons.bldrs.mass_parses.parses.pools.*; import gplx.xowa.addons.bldrs.mass_parses.parses.utls.*;
import gplx.xowa.addons.bldrs.wmdumps.imglinks.*;
import gplx.xowa.addons.wikis.searchs.fulltexts.indexers.*;
public class Xomp_parse_mgr {
private Gfo_countdown_latch latch;
public Xomp_parse_mgr_cfg Cfg() {return cfg;} private final Xomp_parse_mgr_cfg cfg = new Xomp_parse_mgr_cfg();
@ -58,6 +59,10 @@ public class Xomp_parse_mgr {
// init ns_ord_mgr
Xomp_ns_ord_mgr ns_ord_mgr = new Xomp_ns_ord_mgr(Int_.Ary_parse(mgr_db.Tbl__cfg().Select_str("", Xomp_parse_wkr.Cfg__ns_ids), "|"));
// init indexer
Xosearch_indexer indexer = cfg.Indexer_enabled() ? new Xosearch_indexer() : null;
if (indexer != null) indexer.Init(wiki);
// init parse_wkrs
for (int i = 0; i < wkr_len; ++i) {
// make wiki
@ -65,7 +70,7 @@ public class Xomp_parse_mgr {
wkr_wiki.Cache_mgr().Page_cache_(page_cache).Commons_cache_(commons_cache).Ifexist_cache_(ifexist_cache);
// make wkr
Xomp_parse_wkr wkr = new Xomp_parse_wkr(this, cfg, mgr_db, page_pool, prog_mgr, file_orig_wkr, ns_ord_mgr, wkr_wiki, i + wkr_uid_bgn);
Xomp_parse_wkr wkr = new Xomp_parse_wkr(this, cfg, mgr_db, page_pool, prog_mgr, file_orig_wkr, ns_ord_mgr, wkr_wiki, indexer, i + wkr_uid_bgn);
wkrs[i] = wkr;
}
@ -78,6 +83,7 @@ public class Xomp_parse_mgr {
// wait until wkrs are done
latch.Await();
page_pool.Rls();
if (indexer != null) indexer.Term();
// print stats
Bry_bfr bfr = Bry_bfr_.New();

@ -36,6 +36,7 @@ public class Xomp_parse_mgr_cfg implements Gfo_invk {
// url of the mgr; read-only accessor over mgr_url
public Io_url Mgr_url() {return mgr_url;} private Io_url mgr_url;
// machine name for this wkr; read-only accessor over wkr_machine_name
public String Wkr_machine_name() {return wkr_machine_name;} private String wkr_machine_name;
// flag assigned by Invk from the yes/no arg "v" of the show_msg__fetched_pool_ message
public boolean Show_msg__fetched_pool() {return show_msg__fetched_pool;} private boolean show_msg__fetched_pool;
// flag assigned by Invk from the yes/no arg "v" of the "indexer_enabled_" message; Xomp_parse_mgr only creates a Xosearch_indexer when this is true
public boolean Indexer_enabled() {return indexer_enabled;} private boolean indexer_enabled;
public void Init(Xowe_wiki wiki) {
if (num_wkrs == -1) num_wkrs = gplx.core.envs.Runtime_.Cpu_count();
if (num_pages_in_pool == -1) num_pages_in_pool = num_wkrs * 1000;
@ -64,6 +65,7 @@ public class Xomp_parse_mgr_cfg implements Gfo_invk {
else if (ctx.Match(k, Invk__show_msg__fetched_pool_)) show_msg__fetched_pool = m.ReadYn("v");
else if (ctx.Match(k, Invk__hdump_catboxes_)) hdump_catboxs = m.ReadYn("v");
else if (ctx.Match(k, Invk__log_math_)) log_math = m.ReadYn("v");
else if (ctx.Match(k, "indexer_enabled_")) indexer_enabled = m.ReadYn("v");
else return Gfo_invk_.Rv_unhandled;
return this;
}

@ -19,6 +19,7 @@ import gplx.xowa.files.origs.*;
import gplx.xowa.htmls.core.bldrs.*;
import gplx.xowa.parsers.*; import gplx.xowa.parsers.logs.*;
import gplx.xowa.addons.bldrs.mass_parses.parses.mgrs.*; import gplx.xowa.addons.bldrs.mass_parses.parses.utls.*; import gplx.xowa.addons.bldrs.mass_parses.parses.*; import gplx.xowa.addons.bldrs.mass_parses.parses.pools.*;
import gplx.xowa.addons.wikis.searchs.fulltexts.indexers.*;
public class Xomp_parse_wkr implements Gfo_invk {
// mgr vars
private final Xomp_parse_mgr mgr;
@ -39,13 +40,19 @@ public class Xomp_parse_wkr implements Gfo_invk {
private final int uid;
private Xomp_wkr_db wkr_db;
private final Xosearch_indexer indexer;
private final List_adp list = List_adp_.New(); private int list_idx = 0, list_len = 0;
private int done_count; private long done_time;
public Xomp_parse_wkr(Xomp_parse_mgr mgr, Xomp_parse_mgr_cfg cfg, Xomp_mgr_db mgr_db, Xomp_page_pool page_pool, Xomp_prog_mgr prog_mgr, Xof_orig_wkr file_orig_wkr, Xomp_ns_ord_mgr ns_ord_mgr, Xowe_wiki wiki, int uid) {
public Xomp_parse_wkr(Xomp_parse_mgr mgr, Xomp_parse_mgr_cfg cfg
, Xomp_mgr_db mgr_db, Xomp_page_pool page_pool
, Xomp_prog_mgr prog_mgr, Xof_orig_wkr file_orig_wkr, Xomp_ns_ord_mgr ns_ord_mgr
, Xowe_wiki wiki, Xosearch_indexer indexer, int uid) {
// mgr vars
this.mgr = mgr; this.mgr_db = mgr_db;
this.page_pool = page_pool; this.prog_mgr = prog_mgr; this.file_orig_wkr = file_orig_wkr;
this.ns_ord_mgr = ns_ord_mgr;
this.indexer = indexer;
// cfg vars
this.cfg = cfg;
@ -125,6 +132,9 @@ public class Xomp_parse_wkr implements Gfo_invk {
// gen_html
hdump_bldr.Insert(pctx, wpg);
// index
if (indexer != null) indexer.Index(wpg);
// mark done for sake of progress
prog_mgr.Mark_done(ppg.Id());

@ -0,0 +1,34 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.addons.wikis.searchs.fulltexts.indexers; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.searchs.*; import gplx.xowa.addons.wikis.searchs.fulltexts.*;
import gplx.gflucene.*;
/**
 * Feeds parsed pages into a {@link Gflucene_index_bldr} to generate the full-text search index of a wiki.
 * Expected call order: {@link #Init} once, {@link #Index} once per page, {@link #Term} once at the end.
 * NOTE(review): one instance may be shared by several parse workers; whether Gflucene_index_bldr.Exec tolerates concurrent calls is not visible here -- confirm.
 */
public class Xosearch_indexer {
	private final Gflucene_index_bldr bldr = new Gflucene_index_bldr();

	/**
	 * Prepares the index directory of the wiki and opens the index builder on it.
	 * @param wiki wiki whose root dir hosts the "data/search" index directory
	 */
	public void Init(Xow_wiki wiki) {
		Io_url index_dir = wiki.Fsys_mgr().Root_dir().GenSubDir_nest("data", "search");
		// always rebuild from scratch: whatever is already in the directory is deleted first
		Io_mgr.Instance.DeleteDirDeep(index_dir);
		bldr.Init(index_dir.Xto_api());
	}

	/**
	 * Adds one page to the index, using its db title and its generated html.
	 * @param wpg page whose html has already been generated
	 */
	public void Index(Xoae_page wpg) {
		// TODO: skip if not main_ns
		byte[] html_bry = wpg.Db().Html().Html_bry();
		String page_ttl = String_.new_u8(wpg.Ttl().Page_db());
		String page_html = String_.new_u8(html_bry);
		bldr.Exec(page_ttl, page_html);
	}

	/** Shuts down the underlying index builder; call after the last page has been indexed. */
	public void Term() {
		bldr.Term();
	}
}

@ -18,10 +18,14 @@ public class Xow_sys_cfg implements Gfo_invk {
// ctor; the wiki arg is accepted but not used
public Xow_sys_cfg(Xowe_wiki wiki) {}
// xowa_cmd_enabled flag: getter, plus a setter that returns this so calls can be chained
public boolean Xowa_cmd_enabled() {return xowa_cmd_enabled;} public Xow_sys_cfg Xowa_cmd_enabled_(boolean v) {xowa_cmd_enabled = v; return this;} private boolean xowa_cmd_enabled;
// xowa_proto_enabled flag: getter, plus a setter that returns this so calls can be chained
public boolean Xowa_proto_enabled() {return xowa_proto_enabled;} public Xow_sys_cfg Xowa_proto_enabled_(boolean v) {xowa_proto_enabled = v; return this;} private boolean xowa_proto_enabled;
/**
 * Overwrites both flags of this cfg with the values held by another cfg.
 * @param src cfg to copy xowa_cmd_enabled and xowa_proto_enabled from
 */
public void Copy(Xow_sys_cfg src) {
	xowa_proto_enabled = src.xowa_proto_enabled;
	xowa_cmd_enabled = src.xowa_cmd_enabled;
}
/**
 * Dispatches a cfg message to the matching flag.
 * "xowa_cmd_enabled_" assigns xowa_cmd_enabled and "xowa_proto_enabled_" assigns xowa_proto_enabled, each from the yes/no arg "v".
 * @param ctx  used to match the message key
 * @param ikey not used by this method
 * @param k    message key
 * @param m    message holding the "v" arg
 * @return this when the key was handled; Gfo_invk_.Rv_unhandled otherwise
 */
public Object Invk(GfsCtx ctx, int ikey, String k, GfoMsg m) {
	// single chain only: the earlier duplicate chain matched the cmd key twice (its proto branch was unreachable) and read "v" a second time for the cmd key
	if (ctx.Match(k, "xowa_cmd_enabled_")) xowa_cmd_enabled = m.ReadYn("v");
	else if (ctx.Match(k, "xowa_proto_enabled_")) xowa_proto_enabled = m.ReadYn("v");
	else return Gfo_invk_.Rv_unhandled;
	return this;
} private static final String Invk_xowa_cmd_enabled_ = "xowa_cmd_enabled_";
}
}

@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry kind="src" path="src"/>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8"/>
<classpathentry combineaccessrules="false" kind="src" path="/100_core"/>
<!-- Lucene jars are addressed relative to the project root (its lib folder) rather than by an absolute drive path, so the project resolves them from any checkout location -->
<classpathentry kind="lib" path="lib/lucene-core-6.4.2.jar"/>
<classpathentry kind="lib" path="lib/lucene-memory-6.4.2.jar"/>
<classpathentry kind="lib" path="lib/lucene-highlighter-6.4.2.jar"/>
<classpathentry kind="lib" path="lib/lucene-queryparser-6.4.2.jar"/>
<classpathentry kind="output" path="bin"/>
</classpath>
Loading…
Cancel
Save