1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2024-10-27 20:34:16 +00:00

Full-text search: Add pack_lucene

This commit is contained in:
gnosygnu 2017-03-26 20:14:18 -04:00
parent f2520ef81f
commit a43c0d17ba
7 changed files with 93 additions and 19 deletions

View File

@ -28,5 +28,6 @@ public class Xobc_import_type {
, Tid__wiki__ctg = 9
, Tid__misc = 10
, Tid__wiki__wbase = 11
, Tid__wiki__lucene = 12
;
}

View File

@ -20,6 +20,8 @@ public class Pack_file_cfg implements Gfo_invk {
// Packing switches and limits; each getter sits on the same line as its backing field.
// text packs: off by default
public boolean Pack_text() {return pack_text;} private boolean pack_text = false;
// html packs: on by default
public boolean Pack_html() {return pack_html;} private boolean pack_html = true;
// file packs: on by default
public boolean Pack_file() {return pack_file;} private boolean pack_file = true;
// lucene packs: no initializer, so false until set through Invk ("pack_lucene_")
public boolean Pack_lucene() {return pack_lucene;} private boolean pack_lucene;
// size ceiling for one lucene pack; default is 1500 * Io_mgr.Len_mb; Invk ("lucene_max_") stores the passed value multiplied by Io_mgr.Len_mb
public long Lucene_max() {return lucene_max;} private long lucene_max = Io_mgr.Len_mb * 1500;
// no initializer, so false until set through Invk ("pack_fsdb_delete_")
public boolean Pack_fsdb_delete() {return pack_fsdb_delete;} private boolean pack_fsdb_delete;
// custom packing is considered enabled whenever a custom file list has been set (non-null)
public boolean Pack_custom() {return pack_custom_files != null;}
// raw custom file list as passed to Invk ("pack_custom_files_"); null until set
public String Pack_custom_files() {return pack_custom_files;} private String pack_custom_files;
@ -30,22 +32,18 @@ public class Pack_file_cfg implements Gfo_invk {
} private String wiki_date = null;
/**
 * Applies one config command to this instance.
 * Every recognized key stores the message arg "v" into the like-named field;
 * "lucene_max_" is multiplied by Io_mgr.Len_mb before it is stored.
 * Keys are dispatched through a single chain so each key is matched, and "v" is read, exactly once.
 *
 * @param ctx  used to match k against the known keys
 * @param ikey not used by this method
 * @param k    command key
 * @param m    message holding the "v" arg
 * @return this when the key was recognized; Gfo_invk_.Rv_unhandled otherwise
 */
public Object Invk(GfsCtx ctx, int ikey, String k, GfoMsg m) {
	if      (ctx.Match(k, Invk__deploy_dir_))          deploy_dir = m.ReadIoUrl("v");
	else if (ctx.Match(k, Invk__pack_text_))           pack_text = m.ReadYn("v");
	else if (ctx.Match(k, Invk__pack_html_))           pack_html = m.ReadYn("v");
	else if (ctx.Match(k, Invk__pack_file_))           pack_file = m.ReadYn("v");
	else if (ctx.Match(k, Invk__pack_file_cutoff_))    pack_file_cutoff = m.ReadDate("v");
	else if (ctx.Match(k, Invk__pack_fsdb_delete_))    pack_fsdb_delete = m.ReadYn("v");
	else if (ctx.Match(k, Invk__pack_custom_name_))    pack_custom_name = m.ReadStr("v");
	else if (ctx.Match(k, Invk__pack_custom_files_))   pack_custom_files = m.ReadStr("v");	// pack_custom {files='en.wikipedia.org-core.xowa|en.wikipedia.org-html-ns.008.xowa'}}
	else if (ctx.Match(k, Invk__wiki_date_))           wiki_date = m.ReadStr("v");
	else if (ctx.Match(k, Invk__pack_lucene_))         pack_lucene = m.ReadYn("v");
	else if (ctx.Match(k, Invk__lucene_max_))          lucene_max = m.ReadLong("v") * Io_mgr.Len_mb;	// value is scaled by Len_mb before storing
	else return Gfo_invk_.Rv_unhandled;
	return this;
}
// command keys recognized by Invk; one per settable property
private static final String Invk__deploy_dir_ = "deploy_dir_"
, Invk__pack_text_ = "pack_text_", Invk__pack_html_ = "pack_html_", Invk__pack_file_ = "pack_file_", Invk__pack_file_cutoff_ = "pack_file_cutoff_"
, Invk__pack_fsdb_delete_ = "pack_fsdb_delete_"
, Invk__pack_custom_name_ = "pack_custom_name_", Invk__pack_custom_files_ = "pack_custom_files_"
, Invk__wiki_date_ = "wiki_date_"
, Invk__pack_lucene_ = "pack_lucene_", Invk__lucene_max_ = "lucene_max_"
;
}

View File

@ -19,14 +19,16 @@ class Pack_hash {
// number of lists currently held in the hash
public int Len() {
	return hash.Len();
}
// list at the given position in the hash
public Pack_list Get_at(int i) {
	Object found = hash.Get_at(i);
	return (Pack_list)found;
}
// list registered under the given type id
public Pack_list Get_by(int tid) {
	Object found = hash.Get_by(tid);
	return (Pack_list)found;
}
public void Add(Pack_zip_name_bldr bldr, int list_tid, Io_url file_url) {
public Pack_itm Add(Pack_zip_name_bldr bldr, int list_tid, Io_url file_url) {return Add(list_tid, bldr.Bld(file_url), file_url);}
public Pack_itm Add(int list_tid, Io_url pack_url, Io_url... raw_urls) {
Pack_list list = (Pack_list)hash.Get_by(list_tid);
if (list == null) {
list = new Pack_list(list_tid);
hash.Add(list_tid, list);
}
Pack_itm itm = new Pack_itm(list_tid, bldr.Bld(file_url), file_url);
Pack_itm itm = new Pack_itm(list_tid, pack_url, raw_urls);
list.Add(itm);
return itm;
}
public void Consolidate(int... tids) { // merge n itms into 1 itm; needed for search-core + search-link -> search
int tids_len = tids.length;

View File

@ -14,6 +14,7 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.addons.bldrs.exports.packs.files; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.bldrs.*; import gplx.xowa.addons.bldrs.exports.*; import gplx.xowa.addons.bldrs.exports.packs.*;
import gplx.core.ios.*;
import gplx.fsdb.meta.*;
import gplx.xowa.wikis.data.*;
import gplx.xowa.addons.bldrs.centrals.dbs.datas.imports.*;
@ -51,6 +52,11 @@ class Pack_hash_bldr {
}
}
// bld lucene pack
if (cfg.Pack_lucene()) {
Pack_lucene(rv, wiki, zip_name_bldr, cfg);
}
// bld file pack
if (cfg.Pack_file()) {
Fsm_mnt_itm mnt_itm = wiki.File__mnt_mgr().Mnts__get_at(Fsm_mnt_mgr.Mnt_idx_main);
@ -80,6 +86,51 @@ class Pack_hash_bldr {
}
return rv;
}
/**
 * Groups the files of the wiki's lucene index dir into packs and adds one item per pack to rv.
 * Files are taken in directory-listing order. A pack is closed as soon as adding the next file
 * would bring its accumulated size to cfg.Lucene_max() or beyond; that file then starts the next pack.
 * A single file that by itself reaches the max still gets a pack of its own, and no pack is ever
 * created without files. Packs are numbered from 0.
 *
 * @param rv            hash that receives the new pack items
 * @param wiki          wiki whose index dir is read
 * @param zip_name_bldr builds the url of each pack from the suffix "xtn.fulltext_search" and the pack number
 * @param cfg           supplies the per-pack size ceiling (Lucene_max)
 */
private static void Pack_lucene(Pack_hash rv, Xow_wiki wiki, Pack_zip_name_bldr zip_name_bldr, Pack_file_cfg cfg) {
	// read files from lucene_dir
	Io_url lucene_dir = gplx.xowa.addons.wikis.fulltexts.Xosearch_fulltext_addon.Get_index_dir(wiki);
	IoItmHash fils = Io_mgr.Instance.QueryDir_args(lucene_dir).ExecAsItmHash();

	// init vars
	int pack_num = 0;
	long size_cur = 0;		// accumulated size of the files waiting in url_list
	int pending = 0;		// number of files waiting in url_list
	long size_max = cfg.Lucene_max();
	List_adp url_list = List_adp_.New();
	int fils_len = fils.Len();

	// loop over each file
	for (int fil_idx = 0; fil_idx < fils_len; fil_idx++) {
		IoItmFil fil = (IoItmFil)fils.Get_at(fil_idx);
		long fil_size = fil.Size();

		// adding this file would reach the max; close the current pack first
		// (skipped when nothing is pending, so an oversized file never yields an empty pack)
		if (pending > 0 && size_cur + fil_size >= size_max) {
			rv.Add(Xobc_import_type.Tid__wiki__lucene, zip_name_bldr.Bld_by_suffix("xtn.fulltext_search", pack_num), (Io_url[])url_list.To_ary_and_clear(Io_url.class));
			pack_num++;
			size_cur = 0;
			pending = 0;
		}

		// queue file and count its size toward the pack it now belongs to
		url_list.Add(fil.Url());
		size_cur += fil_size;
		pending++;
	}

	// flush whatever is left as the final pack
	if (pending > 0) {
		rv.Add(Xobc_import_type.Tid__wiki__lucene, zip_name_bldr.Bld_by_suffix("xtn.fulltext_search", pack_num), (Io_url[])url_list.To_ary_and_clear(Io_url.class));
	}
}
private static Pack_hash Bld_custom_files(Pack_hash rv, Xow_wiki wiki, Io_url wiki_dir, Pack_zip_name_bldr zip_name_bldr, String custom_files_blob) {
String[] custom_files = String_.Split(custom_files_blob, "|");
int len = custom_files.length;

View File

@ -26,12 +26,22 @@ public class Pack_zip_name_bldr { // en.wikipedia.org-file-ns.000-db.001.xowa ->
this.zip_name_prefix = Bry_.new_u8("Xowa_" + wiki_abrv + "_" + zip_name_suffix);
}
/**
 * Builds the zip url inside pack_dir for a wiki file.
 *
 * @param orig_url url of the wiki file; only its NameOnly() part is used
 * @return pack_dir sub-file whose name is the transformed file name; EX: "Xowa_enwiki_2017-03_file_core.zip"
 */
public Io_url Bld(Io_url orig_url) {
	// get name and add .zip; EX: "en.wikipedia.org-file-core.xowa" -> "en.wikipedia.org-file-core.zip"
	byte[] orig_bry = Bry_.new_u8(orig_url.NameOnly() + ".zip");

	// swap dashes with unders; EX: "en.wikipedia.org-file-core.zip" -> "en.wikipedia.org_file_core.zip"
	orig_bry = Bry_.Replace(orig_bry, Byte_ascii.Dash, Byte_ascii.Underline);

	// swap domain with xobc-style-prefix; EX: "en.wikipedia.org_file_core.zip" -> "Xowa_enwiki_2017-03_file_core.zip"
	// NOTE(review): dashes are swapped before the domain is matched, so this presumably relies on wiki_domain holding no dash (or being stored already swapped) -- confirm for domains such as "zh-min-nan.wikipedia.org"
	orig_bry = Bry_.Replace(orig_bry, wiki_domain, zip_name_prefix);

	return pack_dir.GenSubFil(String_.new_u8(orig_bry));
}
/**
 * Builds a numbered zip url inside pack_dir from the name prefix, a suffix and a pack number.
 * EX: prefix "Xowa_enwiki_2017-03", suffix "xtn.fulltext_search", pack_num 1 -> "Xowa_enwiki_2017-03_xtn.fulltext_search.001.zip"
 */
public Io_url Bld_by_suffix(String suffix, int pack_num) {
	String prefix_str = String_.new_u8(zip_name_prefix);
	// pack number is left-padded with zeros to 3 places
	String pack_num_str = Int_.To_str_pad_bgn_zero(pack_num, 3);
	return pack_dir.GenSubFil(prefix_str + "_" + suffix + "." + pack_num_str + ".zip");
}
public static Io_url To_wiki_url(Io_url wiki_dir, Io_url zip_dir) {
// get wiki_url based on wiki_dir and xobc_zip_fil; EX: "/wiki/en.wikipedia.org/", "/wiki/tmp/Xowa_enwiki_2016-09_file_core_deletion_2016-09/" -> "/wiki/en.wikipedia.org-file-core-deletion-2016.09.zip"
String name_str = zip_dir.NameOnly() + ".xowa";

View File

@ -20,9 +20,20 @@ public class Pack_zip_name_bldr__tst {
// zip dir under the wiki's tmp dir maps back to a .xowa url directly under the wiki dir
@Test public void Basic() {
	String wiki_dir = "mem/wiki/en.wikipedia.org/";
	String zip_dir = "mem/wiki/en.wikipedia.org/tmp/Xowa_enwiki_2016-09_file_deletion_2016.09/";
	String expd = "mem/wiki/en.wikipedia.org/en.wikipedia.org-file-deletion-2016.09.xowa";
	fxt.Test__to_wiki_url(wiki_dir, zip_dir, expd);
}
// suffix and pack number 1 are appended to the prefix as "_xtn.fulltext_search.001.zip"
@Test public void Bld_by_suffix() {
	String pack_dir = "mem/wiki/en.wikipedia.org/tmp/pack/";
	Pack_zip_name_bldr bldr = fxt.Make__bldr(pack_dir, "en.wikipedia.org", "enwiki", "2017-03", null);
	String expd = "mem/wiki/en.wikipedia.org/tmp/pack/Xowa_enwiki_2017-03_xtn.fulltext_search.001.zip";
	fxt.Test__bld_by_suffix(bldr, "xtn.fulltext_search", 1, expd);
}
}
// Test fixture for Pack_zip_name_bldr: builds the inputs and compares results as strings.
class Pack_zip_name_bldr__fxt {
	// checks the Raw() form of To_wiki_url(wiki_dir, zip_fil) against expd
	public void Test__to_wiki_url(String wiki_dir, String zip_fil, String expd) {
		Io_url wiki_url = Io_url_.mem_fil_(wiki_dir);
		Io_url zip_url = Io_url_.mem_dir_(zip_fil);
		Gftest.Eq__str(expd, Pack_zip_name_bldr.To_wiki_url(wiki_url, zip_url).Raw(), "wiki_url");
	}
	// creates a builder; wiki_dir is passed to the constructor as a dir url
	public Pack_zip_name_bldr Make__bldr(String wiki_dir, String domain, String wiki_abrv, String wiki_date, String custom_name) {
		Io_url dir_url = Io_url_.new_dir_(wiki_dir);
		return new Pack_zip_name_bldr(dir_url, domain, wiki_abrv, wiki_date, custom_name);
	}
	// checks the Xto_api() form of bldr.Bld_by_suffix(suffix, pack_num) against expd
	public void Test__bld_by_suffix(Pack_zip_name_bldr bldr, String suffix, int pack_num, String expd) {
		Io_url actl_url = bldr.Bld_by_suffix(suffix, pack_num);
		Gftest.Eq__str(expd, actl_url.Xto_api());
	}
}

View File

@ -19,6 +19,7 @@ import gplx.dbs.*;
import gplx.xowa.wikis.data.*;
import gplx.xowa.addons.bldrs.centrals.dbs.*; import gplx.xowa.addons.bldrs.centrals.dbs.datas.imports.*; import gplx.xowa.addons.bldrs.centrals.steps.*;
import gplx.xowa.addons.bldrs.exports.splits.mgrs.*; import gplx.xowa.addons.bldrs.exports.splits.rslts.*;
// NOTE: used for experimental pack / split approach (html,file,search in one db)
class Pack_mgr {
public void Exec(Xowe_wiki wiki, long pack_size_max) {
// init