mirror of
https://github.com/gnosygnu/xowa.git
synced 2024-10-27 20:34:16 +00:00
Full-text search: Add pack_lucene
This commit is contained in:
parent
f2520ef81f
commit
a43c0d17ba
@ -28,5 +28,6 @@ public class Xobc_import_type {
|
||||
, Tid__wiki__ctg = 9
|
||||
, Tid__misc = 10
|
||||
, Tid__wiki__wbase = 11
|
||||
, Tid__wiki__lucene = 12
|
||||
;
|
||||
}
|
||||
|
@ -20,6 +20,8 @@ public class Pack_file_cfg implements Gfo_invk {
|
||||
public boolean Pack_text() {return pack_text;} private boolean pack_text = false;
|
||||
public boolean Pack_html() {return pack_html;} private boolean pack_html = true;
|
||||
public boolean Pack_file() {return pack_file;} private boolean pack_file = true;
|
||||
public boolean Pack_lucene() {return pack_lucene;} private boolean pack_lucene;
|
||||
public long Lucene_max() {return lucene_max;} private long lucene_max = Io_mgr.Len_mb * 1500;
|
||||
public boolean Pack_fsdb_delete() {return pack_fsdb_delete;} private boolean pack_fsdb_delete;
|
||||
public boolean Pack_custom() {return pack_custom_files != null;}
|
||||
public String Pack_custom_files() {return pack_custom_files;} private String pack_custom_files;
|
||||
@ -30,22 +32,18 @@ public class Pack_file_cfg implements Gfo_invk {
|
||||
} private String wiki_date = null;
|
||||
|
||||
public Object Invk(GfsCtx ctx, int ikey, String k, GfoMsg m) {
|
||||
if (ctx.Match(k, Invk__deploy_dir_)) deploy_dir = m.ReadIoUrl("v");
|
||||
else if (ctx.Match(k, Invk__pack_text_)) pack_text = m.ReadYn("v");
|
||||
else if (ctx.Match(k, Invk__pack_html_)) pack_html = m.ReadYn("v");
|
||||
else if (ctx.Match(k, Invk__pack_file_)) pack_file = m.ReadYn("v");
|
||||
else if (ctx.Match(k, Invk__pack_file_cutoff_)) pack_file_cutoff = m.ReadDate("v");
|
||||
else if (ctx.Match(k, Invk__pack_fsdb_delete_)) pack_fsdb_delete = m.ReadYn("v");
|
||||
else if (ctx.Match(k, Invk__pack_custom_name_)) pack_custom_name = m.ReadStr("v");
|
||||
else if (ctx.Match(k, Invk__pack_custom_files_)) pack_custom_files = m.ReadStr("v"); // pack_custom {files='en.wikipedia.org-core.xowa|en.wikipedia.org-html-ns.008.xowa'}}
|
||||
else if (ctx.Match(k, Invk__wiki_date_)) wiki_date = m.ReadStr("v");
|
||||
if (ctx.Match(k, "deploy_dir_")) deploy_dir = m.ReadIoUrl("v");
|
||||
else if (ctx.Match(k, "pack_text_")) pack_text = m.ReadYn("v");
|
||||
else if (ctx.Match(k, "pack_html_")) pack_html = m.ReadYn("v");
|
||||
else if (ctx.Match(k, "pack_file_")) pack_file = m.ReadYn("v");
|
||||
else if (ctx.Match(k, "pack_file_cutoff_")) pack_file_cutoff = m.ReadDate("v");
|
||||
else if (ctx.Match(k, "pack_fsdb_delete_")) pack_fsdb_delete = m.ReadYn("v");
|
||||
else if (ctx.Match(k, "pack_custom_name_")) pack_custom_name = m.ReadStr("v");
|
||||
else if (ctx.Match(k, "pack_custom_files_")) pack_custom_files = m.ReadStr("v"); // pack_custom {files='en.wikipedia.org-core.xowa|en.wikipedia.org-html-ns.008.xowa'}}
|
||||
else if (ctx.Match(k, "wiki_date_")) wiki_date = m.ReadStr("v");
|
||||
else if (ctx.Match(k, "pack_lucene_")) pack_lucene = m.ReadYn("v");
|
||||
else if (ctx.Match(k, "lucene_max_")) lucene_max = m.ReadLong("v") * Io_mgr.Len_mb;
|
||||
else return Gfo_invk_.Rv_unhandled;
|
||||
return this;
|
||||
}
|
||||
private static final String Invk__deploy_dir_ = "deploy_dir_"
|
||||
, Invk__pack_text_ = "pack_text_", Invk__pack_html_ = "pack_html_", Invk__pack_file_ = "pack_file_", Invk__pack_file_cutoff_ = "pack_file_cutoff_"
|
||||
, Invk__pack_fsdb_delete_ = "pack_fsdb_delete_"
|
||||
, Invk__pack_custom_name_ = "pack_custom_name_", Invk__pack_custom_files_ = "pack_custom_files_"
|
||||
, Invk__wiki_date_ = "wiki_date_"
|
||||
;
|
||||
}
|
||||
|
@ -19,14 +19,16 @@ class Pack_hash {
|
||||
// number of entries currently held in the underlying hash
public int Len() {
	return hash.Len();
}
|
||||
// positional lookup; the hash entry at index i is cast to Pack_list
public Pack_list Get_at(int i) {
	Object entry = hash.Get_at(i);
	return (Pack_list)entry;
}
|
||||
// keyed lookup by list tid; the hash entry is cast to Pack_list
public Pack_list Get_by(int tid) {
	Object entry = hash.Get_by(tid);
	return (Pack_list)entry;
}
|
||||
public void Add(Pack_zip_name_bldr bldr, int list_tid, Io_url file_url) {
|
||||
public Pack_itm Add(Pack_zip_name_bldr bldr, int list_tid, Io_url file_url) {return Add(list_tid, bldr.Bld(file_url), file_url);}
|
||||
public Pack_itm Add(int list_tid, Io_url pack_url, Io_url... raw_urls) {
|
||||
Pack_list list = (Pack_list)hash.Get_by(list_tid);
|
||||
if (list == null) {
|
||||
list = new Pack_list(list_tid);
|
||||
hash.Add(list_tid, list);
|
||||
}
|
||||
Pack_itm itm = new Pack_itm(list_tid, bldr.Bld(file_url), file_url);
|
||||
Pack_itm itm = new Pack_itm(list_tid, pack_url, raw_urls);
|
||||
list.Add(itm);
|
||||
return itm;
|
||||
}
|
||||
public void Consolidate(int... tids) { // merge n itms into 1 itm; needed for search-core + search-link -> search
|
||||
int tids_len = tids.length;
|
||||
|
@ -14,6 +14,7 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.xowa.addons.bldrs.exports.packs.files; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.bldrs.*; import gplx.xowa.addons.bldrs.exports.*; import gplx.xowa.addons.bldrs.exports.packs.*;
|
||||
import gplx.core.ios.*;
|
||||
import gplx.fsdb.meta.*;
|
||||
import gplx.xowa.wikis.data.*;
|
||||
import gplx.xowa.addons.bldrs.centrals.dbs.datas.imports.*;
|
||||
@ -51,6 +52,11 @@ class Pack_hash_bldr {
|
||||
}
|
||||
}
|
||||
|
||||
// bld lucene pack
|
||||
if (cfg.Pack_lucene()) {
|
||||
Pack_lucene(rv, wiki, zip_name_bldr, cfg);
|
||||
}
|
||||
|
||||
// bld file pack
|
||||
if (cfg.Pack_file()) {
|
||||
Fsm_mnt_itm mnt_itm = wiki.File__mnt_mgr().Mnts__get_at(Fsm_mnt_mgr.Mnt_idx_main);
|
||||
@ -80,6 +86,51 @@ class Pack_hash_bldr {
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
private static void Pack_lucene(Pack_hash rv, Xow_wiki wiki, Pack_zip_name_bldr zip_name_bldr, Pack_file_cfg cfg) {
|
||||
// read files from lucene_dir
|
||||
Io_url lucene_dir = gplx.xowa.addons.wikis.fulltexts.Xosearch_fulltext_addon.Get_index_dir(wiki);
|
||||
IoItmHash fils = Io_mgr.Instance.QueryDir_args(lucene_dir).ExecAsItmHash();
|
||||
|
||||
// init vars
|
||||
int pack_num = 0;
|
||||
long size_cur = 0;
|
||||
long size_max = cfg.Lucene_max();
|
||||
List_adp url_list = List_adp_.New();
|
||||
int fil_idx = 0;
|
||||
int fils_len = fils.Len();
|
||||
|
||||
// loop over each file
|
||||
while (fil_idx < fils_len) {
|
||||
IoItmFil fil = (IoItmFil)fils.Get_at(fil_idx);
|
||||
|
||||
// calc size_new
|
||||
long size_new = size_cur + fil.Size();
|
||||
|
||||
// if last file, set size_new to max
|
||||
boolean add_file = true;
|
||||
if (fil_idx == fils_len - 1) {
|
||||
size_new = size_max;
|
||||
url_list.Add(fil.Url());
|
||||
add_file = false;
|
||||
}
|
||||
|
||||
// size exceeded; make new pack
|
||||
if (size_new >= size_max) {
|
||||
rv.Add(Xobc_import_type.Tid__wiki__lucene, zip_name_bldr.Bld_by_suffix("xtn.fulltext_search", pack_num), (Io_url[])url_list.To_ary_and_clear(Io_url.class));
|
||||
pack_num++;
|
||||
size_cur = 0;
|
||||
}
|
||||
// size too small; just update
|
||||
else {
|
||||
size_cur = size_new;
|
||||
}
|
||||
|
||||
// add file to list
|
||||
if (add_file)
|
||||
url_list.Add(fil.Url());
|
||||
fil_idx++;
|
||||
}
|
||||
}
|
||||
private static Pack_hash Bld_custom_files(Pack_hash rv, Xow_wiki wiki, Io_url wiki_dir, Pack_zip_name_bldr zip_name_bldr, String custom_files_blob) {
|
||||
String[] custom_files = String_.Split(custom_files_blob, "|");
|
||||
int len = custom_files.length;
|
||||
|
@ -26,12 +26,22 @@ public class Pack_zip_name_bldr { // en.wikipedia.org-file-ns.000-db.001.xowa ->
|
||||
this.zip_name_prefix = Bry_.new_u8("Xowa_" + wiki_abrv + "_" + zip_name_suffix);
|
||||
}
|
||||
public Io_url Bld(Io_url orig_url) {
|
||||
String orig_str = orig_url.NameOnly() + ".zip";
|
||||
byte[] orig_bry = Bry_.new_u8(orig_str);
|
||||
// get name and add .zip; EX: "en.wikipedia.org-file-core.xowa" -> "en.wikipedia.org-file-core.zip"
|
||||
byte[] orig_bry = Bry_.new_u8(orig_url.NameOnly() + ".zip");
|
||||
|
||||
// swap dashes with unders; EX: "en.wikipedia.org-file-core.xowa" -> "en.wikipedia.org_file_core.zip"
|
||||
orig_bry = Bry_.Replace(orig_bry, Byte_ascii.Dash, Byte_ascii.Underline);
|
||||
|
||||
// swap domain with xobc-style-prefix; EX: "en.wikipedia.org_file_core.zip" -> "Xowa_enwiki_2017-03_file_core.zip"
|
||||
orig_bry = Bry_.Replace(orig_bry, wiki_domain, zip_name_prefix);
|
||||
|
||||
return pack_dir.GenSubFil(String_.new_u8(orig_bry));
|
||||
}
|
||||
public Io_url Bld_by_suffix(String suffix, int pack_num) {
|
||||
// make fil_name EX: "Xowa_enwiki_2017-03" + "_" + "xtn.fulltext_search.001" + .zip
|
||||
String fil_name = String_.new_u8(zip_name_prefix) + "_" + suffix + "." + Int_.To_str_pad_bgn_zero(pack_num, 3) + ".zip";
|
||||
return pack_dir.GenSubFil(fil_name);
|
||||
}
|
||||
public static Io_url To_wiki_url(Io_url wiki_dir, Io_url zip_dir) {
|
||||
// get wiki_url based on wiki_dir and xobc_zip_fil; EX: "/wiki/en.wikipedia.org/", "/wiki/tmp/Xowa_enwiki_2016-09_file_core_deletion_2016-09/" -> "/wiki/en.wikipedia.org/en.wikipedia.org-file-core-deletion-2016.09.xowa"
|
||||
String name_str = zip_dir.NameOnly() + ".xowa";
|
||||
|
@ -20,9 +20,20 @@ public class Pack_zip_name_bldr__tst {
|
||||
@Test public void Basic() {
|
||||
fxt.Test__to_wiki_url("mem/wiki/en.wikipedia.org/", "mem/wiki/en.wikipedia.org/tmp/Xowa_enwiki_2016-09_file_deletion_2016.09/", "mem/wiki/en.wikipedia.org/en.wikipedia.org-file-deletion-2016.09.xowa");
|
||||
}
|
||||
@Test public void Bld_by_suffix() {
|
||||
Pack_zip_name_bldr bldr = fxt.Make__bldr("mem/wiki/en.wikipedia.org/tmp/pack/", "en.wikipedia.org", "enwiki", "2017-03", null);
|
||||
fxt.Test__bld_by_suffix(bldr, "xtn.fulltext_search", 1, "mem/wiki/en.wikipedia.org/tmp/pack/Xowa_enwiki_2017-03_xtn.fulltext_search.001.zip");
|
||||
}
|
||||
}
|
||||
class Pack_zip_name_bldr__fxt {
|
||||
public void Test__to_wiki_url(String wiki_dir, String zip_fil, String expd) {
|
||||
Gftest.Eq__str(expd, Pack_zip_name_bldr.To_wiki_url(Io_url_.mem_fil_(wiki_dir), Io_url_.mem_dir_(zip_fil)).Raw(), "wiki_url");
|
||||
}
|
||||
|
||||
public Pack_zip_name_bldr Make__bldr(String wiki_dir, String domain, String wiki_abrv, String wiki_date, String custom_name) {
|
||||
return new Pack_zip_name_bldr(Io_url_.new_dir_(wiki_dir), domain, wiki_abrv, wiki_date, custom_name);
|
||||
}
|
||||
public void Test__bld_by_suffix(Pack_zip_name_bldr bldr, String suffix, int pack_num, String expd) {
|
||||
Gftest.Eq__str(expd, bldr.Bld_by_suffix(suffix, pack_num).Xto_api());
|
||||
}
|
||||
}
|
||||
|
@ -19,6 +19,7 @@ import gplx.dbs.*;
|
||||
import gplx.xowa.wikis.data.*;
|
||||
import gplx.xowa.addons.bldrs.centrals.dbs.*; import gplx.xowa.addons.bldrs.centrals.dbs.datas.imports.*; import gplx.xowa.addons.bldrs.centrals.steps.*;
|
||||
import gplx.xowa.addons.bldrs.exports.splits.mgrs.*; import gplx.xowa.addons.bldrs.exports.splits.rslts.*;
|
||||
// NOTE: used for experimental pack / split approach (html,file,search in one db)
|
||||
class Pack_mgr {
|
||||
public void Exec(Xowe_wiki wiki, long pack_size_max) {
|
||||
// init
|
||||
|
Loading…
Reference in New Issue
Block a user