mirror of
https://github.com/gnosygnu/xowa.git
synced 2024-10-27 20:34:16 +00:00
Full-text search: Add lucene index generation
This commit is contained in:
parent
062d958ead
commit
c91416801b
@ -9,5 +9,6 @@
|
|||||||
<classpathentry exported="true" kind="lib" path="lib/jtidy_xowa.jar"/>
|
<classpathentry exported="true" kind="lib" path="lib/jtidy_xowa.jar"/>
|
||||||
<classpathentry exported="true" kind="lib" path="lib/icu4j-4_8.jar"/>
|
<classpathentry exported="true" kind="lib" path="lib/icu4j-4_8.jar"/>
|
||||||
<classpathentry exported="true" kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
|
<classpathentry exported="true" kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
|
||||||
|
<classpathentry combineaccessrules="false" kind="src" path="/gplx.gflucene"/>
|
||||||
<classpathentry kind="output" path="bin"/>
|
<classpathentry kind="output" path="bin"/>
|
||||||
</classpath>
|
</classpath>
|
||||||
|
@ -25,6 +25,9 @@ public class Xow_wiki_utl_ {
|
|||||||
rv.File_mgr().Repo_mgr().Clone(wiki.File_mgr().Repo_mgr());
|
rv.File_mgr().Repo_mgr().Clone(wiki.File_mgr().Repo_mgr());
|
||||||
rv.File__fsdb_mode().Tid__v2__bld__y_();
|
rv.File__fsdb_mode().Tid__v2__bld__y_();
|
||||||
|
|
||||||
|
// copy other members
|
||||||
|
rv.Sys_cfg().Copy(wiki.Sys_cfg());
|
||||||
|
|
||||||
Clone_repos(wiki);
|
Clone_repos(wiki);
|
||||||
return rv;
|
return rv;
|
||||||
}
|
}
|
||||||
|
@ -18,6 +18,7 @@ import gplx.core.threads.*; import gplx.core.threads.utils.*;
|
|||||||
import gplx.core.caches.*; import gplx.xowa.wikis.caches.*;
|
import gplx.core.caches.*; import gplx.xowa.wikis.caches.*;
|
||||||
import gplx.xowa.addons.bldrs.mass_parses.parses.wkrs.*; import gplx.xowa.addons.bldrs.mass_parses.dbs.*; import gplx.xowa.addons.bldrs.mass_parses.parses.pools.*; import gplx.xowa.addons.bldrs.mass_parses.parses.utls.*;
|
import gplx.xowa.addons.bldrs.mass_parses.parses.wkrs.*; import gplx.xowa.addons.bldrs.mass_parses.dbs.*; import gplx.xowa.addons.bldrs.mass_parses.parses.pools.*; import gplx.xowa.addons.bldrs.mass_parses.parses.utls.*;
|
||||||
import gplx.xowa.addons.bldrs.wmdumps.imglinks.*;
|
import gplx.xowa.addons.bldrs.wmdumps.imglinks.*;
|
||||||
|
import gplx.xowa.addons.wikis.searchs.fulltexts.indexers.*;
|
||||||
public class Xomp_parse_mgr {
|
public class Xomp_parse_mgr {
|
||||||
private Gfo_countdown_latch latch;
|
private Gfo_countdown_latch latch;
|
||||||
public Xomp_parse_mgr_cfg Cfg() {return cfg;} private final Xomp_parse_mgr_cfg cfg = new Xomp_parse_mgr_cfg();
|
public Xomp_parse_mgr_cfg Cfg() {return cfg;} private final Xomp_parse_mgr_cfg cfg = new Xomp_parse_mgr_cfg();
|
||||||
@ -58,6 +59,10 @@ public class Xomp_parse_mgr {
|
|||||||
// init ns_ord_mgr
|
// init ns_ord_mgr
|
||||||
Xomp_ns_ord_mgr ns_ord_mgr = new Xomp_ns_ord_mgr(Int_.Ary_parse(mgr_db.Tbl__cfg().Select_str("", Xomp_parse_wkr.Cfg__ns_ids), "|"));
|
Xomp_ns_ord_mgr ns_ord_mgr = new Xomp_ns_ord_mgr(Int_.Ary_parse(mgr_db.Tbl__cfg().Select_str("", Xomp_parse_wkr.Cfg__ns_ids), "|"));
|
||||||
|
|
||||||
|
// init indexer
|
||||||
|
Xosearch_indexer indexer = cfg.Indexer_enabled() ? new Xosearch_indexer() : null;
|
||||||
|
if (indexer != null) indexer.Init(wiki);
|
||||||
|
|
||||||
// init parse_wkrs
|
// init parse_wkrs
|
||||||
for (int i = 0; i < wkr_len; ++i) {
|
for (int i = 0; i < wkr_len; ++i) {
|
||||||
// make wiki
|
// make wiki
|
||||||
@ -65,7 +70,7 @@ public class Xomp_parse_mgr {
|
|||||||
wkr_wiki.Cache_mgr().Page_cache_(page_cache).Commons_cache_(commons_cache).Ifexist_cache_(ifexist_cache);
|
wkr_wiki.Cache_mgr().Page_cache_(page_cache).Commons_cache_(commons_cache).Ifexist_cache_(ifexist_cache);
|
||||||
|
|
||||||
// make wkr
|
// make wkr
|
||||||
Xomp_parse_wkr wkr = new Xomp_parse_wkr(this, cfg, mgr_db, page_pool, prog_mgr, file_orig_wkr, ns_ord_mgr, wkr_wiki, i + wkr_uid_bgn);
|
Xomp_parse_wkr wkr = new Xomp_parse_wkr(this, cfg, mgr_db, page_pool, prog_mgr, file_orig_wkr, ns_ord_mgr, wkr_wiki, indexer, i + wkr_uid_bgn);
|
||||||
wkrs[i] = wkr;
|
wkrs[i] = wkr;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -78,6 +83,7 @@ public class Xomp_parse_mgr {
|
|||||||
// wait until wkrs are done
|
// wait until wkrs are done
|
||||||
latch.Await();
|
latch.Await();
|
||||||
page_pool.Rls();
|
page_pool.Rls();
|
||||||
|
if (indexer != null) indexer.Term();
|
||||||
|
|
||||||
// print stats
|
// print stats
|
||||||
Bry_bfr bfr = Bry_bfr_.New();
|
Bry_bfr bfr = Bry_bfr_.New();
|
||||||
|
@ -36,6 +36,7 @@ public class Xomp_parse_mgr_cfg implements Gfo_invk {
|
|||||||
public Io_url Mgr_url() {return mgr_url;} private Io_url mgr_url;
|
public Io_url Mgr_url() {return mgr_url;} private Io_url mgr_url;
|
||||||
public String Wkr_machine_name() {return wkr_machine_name;} private String wkr_machine_name;
|
public String Wkr_machine_name() {return wkr_machine_name;} private String wkr_machine_name;
|
||||||
public boolean Show_msg__fetched_pool() {return show_msg__fetched_pool;} private boolean show_msg__fetched_pool;
|
public boolean Show_msg__fetched_pool() {return show_msg__fetched_pool;} private boolean show_msg__fetched_pool;
|
||||||
|
public boolean Indexer_enabled() {return indexer_enabled;} private boolean indexer_enabled;
|
||||||
public void Init(Xowe_wiki wiki) {
|
public void Init(Xowe_wiki wiki) {
|
||||||
if (num_wkrs == -1) num_wkrs = gplx.core.envs.Runtime_.Cpu_count();
|
if (num_wkrs == -1) num_wkrs = gplx.core.envs.Runtime_.Cpu_count();
|
||||||
if (num_pages_in_pool == -1) num_pages_in_pool = num_wkrs * 1000;
|
if (num_pages_in_pool == -1) num_pages_in_pool = num_wkrs * 1000;
|
||||||
@ -64,6 +65,7 @@ public class Xomp_parse_mgr_cfg implements Gfo_invk {
|
|||||||
else if (ctx.Match(k, Invk__show_msg__fetched_pool_)) show_msg__fetched_pool = m.ReadYn("v");
|
else if (ctx.Match(k, Invk__show_msg__fetched_pool_)) show_msg__fetched_pool = m.ReadYn("v");
|
||||||
else if (ctx.Match(k, Invk__hdump_catboxes_)) hdump_catboxs = m.ReadYn("v");
|
else if (ctx.Match(k, Invk__hdump_catboxes_)) hdump_catboxs = m.ReadYn("v");
|
||||||
else if (ctx.Match(k, Invk__log_math_)) log_math = m.ReadYn("v");
|
else if (ctx.Match(k, Invk__log_math_)) log_math = m.ReadYn("v");
|
||||||
|
else if (ctx.Match(k, "indexer_enabled_")) indexer_enabled = m.ReadYn("v");
|
||||||
else return Gfo_invk_.Rv_unhandled;
|
else return Gfo_invk_.Rv_unhandled;
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
@ -19,6 +19,7 @@ import gplx.xowa.files.origs.*;
|
|||||||
import gplx.xowa.htmls.core.bldrs.*;
|
import gplx.xowa.htmls.core.bldrs.*;
|
||||||
import gplx.xowa.parsers.*; import gplx.xowa.parsers.logs.*;
|
import gplx.xowa.parsers.*; import gplx.xowa.parsers.logs.*;
|
||||||
import gplx.xowa.addons.bldrs.mass_parses.parses.mgrs.*; import gplx.xowa.addons.bldrs.mass_parses.parses.utls.*; import gplx.xowa.addons.bldrs.mass_parses.parses.*; import gplx.xowa.addons.bldrs.mass_parses.parses.pools.*;
|
import gplx.xowa.addons.bldrs.mass_parses.parses.mgrs.*; import gplx.xowa.addons.bldrs.mass_parses.parses.utls.*; import gplx.xowa.addons.bldrs.mass_parses.parses.*; import gplx.xowa.addons.bldrs.mass_parses.parses.pools.*;
|
||||||
|
import gplx.xowa.addons.wikis.searchs.fulltexts.indexers.*;
|
||||||
public class Xomp_parse_wkr implements Gfo_invk {
|
public class Xomp_parse_wkr implements Gfo_invk {
|
||||||
// mgr vars
|
// mgr vars
|
||||||
private final Xomp_parse_mgr mgr;
|
private final Xomp_parse_mgr mgr;
|
||||||
@ -39,13 +40,19 @@ public class Xomp_parse_wkr implements Gfo_invk {
|
|||||||
private final int uid;
|
private final int uid;
|
||||||
private Xomp_wkr_db wkr_db;
|
private Xomp_wkr_db wkr_db;
|
||||||
|
|
||||||
|
private final Xosearch_indexer indexer;
|
||||||
|
|
||||||
private final List_adp list = List_adp_.New(); private int list_idx = 0, list_len = 0;
|
private final List_adp list = List_adp_.New(); private int list_idx = 0, list_len = 0;
|
||||||
private int done_count; private long done_time;
|
private int done_count; private long done_time;
|
||||||
public Xomp_parse_wkr(Xomp_parse_mgr mgr, Xomp_parse_mgr_cfg cfg, Xomp_mgr_db mgr_db, Xomp_page_pool page_pool, Xomp_prog_mgr prog_mgr, Xof_orig_wkr file_orig_wkr, Xomp_ns_ord_mgr ns_ord_mgr, Xowe_wiki wiki, int uid) {
|
public Xomp_parse_wkr(Xomp_parse_mgr mgr, Xomp_parse_mgr_cfg cfg
|
||||||
|
, Xomp_mgr_db mgr_db, Xomp_page_pool page_pool
|
||||||
|
, Xomp_prog_mgr prog_mgr, Xof_orig_wkr file_orig_wkr, Xomp_ns_ord_mgr ns_ord_mgr
|
||||||
|
, Xowe_wiki wiki, Xosearch_indexer indexer, int uid) {
|
||||||
// mgr vars
|
// mgr vars
|
||||||
this.mgr = mgr; this.mgr_db = mgr_db;
|
this.mgr = mgr; this.mgr_db = mgr_db;
|
||||||
this.page_pool = page_pool; this.prog_mgr = prog_mgr; this.file_orig_wkr = file_orig_wkr;
|
this.page_pool = page_pool; this.prog_mgr = prog_mgr; this.file_orig_wkr = file_orig_wkr;
|
||||||
this.ns_ord_mgr = ns_ord_mgr;
|
this.ns_ord_mgr = ns_ord_mgr;
|
||||||
|
this.indexer = indexer;
|
||||||
|
|
||||||
// cfg vars
|
// cfg vars
|
||||||
this.cfg = cfg;
|
this.cfg = cfg;
|
||||||
@ -125,6 +132,9 @@ public class Xomp_parse_wkr implements Gfo_invk {
|
|||||||
// gen_html
|
// gen_html
|
||||||
hdump_bldr.Insert(pctx, wpg);
|
hdump_bldr.Insert(pctx, wpg);
|
||||||
|
|
||||||
|
// index
|
||||||
|
if (indexer != null) indexer.Index(wpg);
|
||||||
|
|
||||||
// mark done for sake of progress
|
// mark done for sake of progress
|
||||||
prog_mgr.Mark_done(ppg.Id());
|
prog_mgr.Mark_done(ppg.Id());
|
||||||
|
|
||||||
|
@ -0,0 +1,34 @@
|
|||||||
|
/*
|
||||||
|
XOWA: the XOWA Offline Wiki Application
|
||||||
|
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||||
|
|
||||||
|
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||||
|
or alternatively under the terms of the Apache License Version 2.0.
|
||||||
|
|
||||||
|
You may use XOWA according to either of these licenses as is most appropriate
|
||||||
|
for your project on a case-by-case basis.
|
||||||
|
|
||||||
|
The terms of each license can be found in the source code repository:
|
||||||
|
|
||||||
|
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||||
|
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||||
|
*/
|
||||||
|
package gplx.xowa.addons.wikis.searchs.fulltexts.indexers; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.searchs.*; import gplx.xowa.addons.wikis.searchs.fulltexts.*;
|
||||||
|
import gplx.gflucene.*;
|
||||||
|
public class Xosearch_indexer {
|
||||||
|
private final Gflucene_index_bldr index_wtr = new Gflucene_index_bldr();
|
||||||
|
public void Init(Xow_wiki wiki) {
|
||||||
|
Io_url search_dir = wiki.Fsys_mgr().Root_dir().GenSubDir_nest("data", "search");
|
||||||
|
Io_mgr.Instance.DeleteDirDeep(search_dir);
|
||||||
|
index_wtr.Init(search_dir.Xto_api());
|
||||||
|
}
|
||||||
|
public void Index(Xoae_page wpg) {
|
||||||
|
// TODO: skip if not main_ns
|
||||||
|
byte[] html = wpg.Db().Html().Html_bry();
|
||||||
|
|
||||||
|
index_wtr.Exec(String_.new_u8(wpg.Ttl().Page_db()), String_.new_u8(html));
|
||||||
|
}
|
||||||
|
public void Term() {
|
||||||
|
index_wtr.Term();
|
||||||
|
}
|
||||||
|
}
|
@ -18,10 +18,14 @@ public class Xow_sys_cfg implements Gfo_invk {
|
|||||||
public Xow_sys_cfg(Xowe_wiki wiki) {}
|
public Xow_sys_cfg(Xowe_wiki wiki) {}
|
||||||
public boolean Xowa_cmd_enabled() {return xowa_cmd_enabled;} public Xow_sys_cfg Xowa_cmd_enabled_(boolean v) {xowa_cmd_enabled = v; return this;} private boolean xowa_cmd_enabled;
|
public boolean Xowa_cmd_enabled() {return xowa_cmd_enabled;} public Xow_sys_cfg Xowa_cmd_enabled_(boolean v) {xowa_cmd_enabled = v; return this;} private boolean xowa_cmd_enabled;
|
||||||
public boolean Xowa_proto_enabled() {return xowa_proto_enabled;} public Xow_sys_cfg Xowa_proto_enabled_(boolean v) {xowa_proto_enabled = v; return this;} private boolean xowa_proto_enabled;
|
public boolean Xowa_proto_enabled() {return xowa_proto_enabled;} public Xow_sys_cfg Xowa_proto_enabled_(boolean v) {xowa_proto_enabled = v; return this;} private boolean xowa_proto_enabled;
|
||||||
|
public void Copy(Xow_sys_cfg src) {
|
||||||
|
this.xowa_cmd_enabled = src.xowa_cmd_enabled;
|
||||||
|
this.xowa_proto_enabled = src.xowa_proto_enabled;
|
||||||
|
}
|
||||||
public Object Invk(GfsCtx ctx, int ikey, String k, GfoMsg m) {
|
public Object Invk(GfsCtx ctx, int ikey, String k, GfoMsg m) {
|
||||||
if (ctx.Match(k, Invk_xowa_cmd_enabled_)) xowa_cmd_enabled = m.ReadYn("v");
|
if (ctx.Match(k, "xowa_cmd_enabled_")) xowa_cmd_enabled = m.ReadYn("v");
|
||||||
else if (ctx.Match(k, Invk_xowa_cmd_enabled_)) xowa_proto_enabled = m.ReadYn("v");
|
else if (ctx.Match(k, "xowa_proto_enabled_")) xowa_proto_enabled = m.ReadYn("v");
|
||||||
else return Gfo_invk_.Rv_unhandled;
|
else return Gfo_invk_.Rv_unhandled;
|
||||||
return this;
|
return this;
|
||||||
} private static final String Invk_xowa_cmd_enabled_ = "xowa_cmd_enabled_";
|
}
|
||||||
}
|
}
|
||||||
|
11
gplx.gflucene/.classpath
Normal file
11
gplx.gflucene/.classpath
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
<!--
	Eclipse build path for the gplx.gflucene project.
	The Lucene jars are referenced relative to the project root (lib/...) rather than by an
	absolute drive path, so the project resolves its libraries wherever it is checked out.
-->
<classpath>
	<classpathentry kind="src" path="src"/>
	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8"/>
	<classpathentry combineaccessrules="false" kind="src" path="/100_core"/>
	<classpathentry kind="lib" path="lib/lucene-core-6.4.2.jar"/>
	<classpathentry kind="lib" path="lib/lucene-memory-6.4.2.jar"/>
	<classpathentry kind="lib" path="lib/lucene-highlighter-6.4.2.jar"/>
	<classpathentry kind="lib" path="lib/lucene-queryparser-6.4.2.jar"/>
	<classpathentry kind="output" path="bin"/>
</classpath>
|
Loading…
Reference in New Issue
Block a user