From a43c0d17ba94f2e39f0be82faaae34aa4ce4470e Mon Sep 17 00:00:00 2001 From: gnosygnu Date: Sun, 26 Mar 2017 20:14:18 -0400 Subject: [PATCH] Full-text search: Add pack_lucene --- .../dbs/datas/imports/Xobc_import_type.java | 1 + .../exports/packs/files/Pack_file_cfg.java | 28 +++++----- .../bldrs/exports/packs/files/Pack_hash.java | 6 ++- .../exports/packs/files/Pack_hash_bldr.java | 51 +++++++++++++++++++ .../packs/files/Pack_zip_name_bldr.java | 14 ++++- .../packs/files/Pack_zip_name_bldr__tst.java | 11 ++++ .../bldrs/exports/packs/splits/Pack_mgr.java | 1 + 7 files changed, 93 insertions(+), 19 deletions(-) diff --git a/400_xowa/src/gplx/xowa/addons/bldrs/centrals/dbs/datas/imports/Xobc_import_type.java b/400_xowa/src/gplx/xowa/addons/bldrs/centrals/dbs/datas/imports/Xobc_import_type.java index 6b62cfc90..e237decf6 100644 --- a/400_xowa/src/gplx/xowa/addons/bldrs/centrals/dbs/datas/imports/Xobc_import_type.java +++ b/400_xowa/src/gplx/xowa/addons/bldrs/centrals/dbs/datas/imports/Xobc_import_type.java @@ -28,5 +28,6 @@ public class Xobc_import_type { , Tid__wiki__ctg = 9 , Tid__misc = 10 , Tid__wiki__wbase = 11 + , Tid__wiki__lucene = 12 ; } diff --git a/400_xowa/src/gplx/xowa/addons/bldrs/exports/packs/files/Pack_file_cfg.java b/400_xowa/src/gplx/xowa/addons/bldrs/exports/packs/files/Pack_file_cfg.java index d78623b6d..2c1b7e43e 100644 --- a/400_xowa/src/gplx/xowa/addons/bldrs/exports/packs/files/Pack_file_cfg.java +++ b/400_xowa/src/gplx/xowa/addons/bldrs/exports/packs/files/Pack_file_cfg.java @@ -20,6 +20,8 @@ public class Pack_file_cfg implements Gfo_invk { public boolean Pack_text() {return pack_text;} private boolean pack_text = false; public boolean Pack_html() {return pack_html;} private boolean pack_html = true; public boolean Pack_file() {return pack_file;} private boolean pack_file = true; + public boolean Pack_lucene() {return pack_lucene;} private boolean pack_lucene; + public long Lucene_max() {return lucene_max;} private long lucene_max = Io_mgr.Len_mb * 1500; public boolean Pack_fsdb_delete() {return pack_fsdb_delete;} private boolean pack_fsdb_delete; public boolean Pack_custom() {return pack_custom_files != null;} public String Pack_custom_files() {return pack_custom_files;} private String pack_custom_files; @@ -30,22 +32,18 @@ public class Pack_file_cfg implements Gfo_invk { } private String wiki_date = null; public Object Invk(GfsCtx ctx, int ikey, String k, GfoMsg m) { - if (ctx.Match(k, Invk__deploy_dir_)) deploy_dir = m.ReadIoUrl("v"); - else if (ctx.Match(k, Invk__pack_text_)) pack_text = m.ReadYn("v"); - else if (ctx.Match(k, Invk__pack_html_)) pack_html = m.ReadYn("v"); - else if (ctx.Match(k, Invk__pack_file_)) pack_file = m.ReadYn("v"); - else if (ctx.Match(k, Invk__pack_file_cutoff_)) pack_file_cutoff = m.ReadDate("v"); - else if (ctx.Match(k, Invk__pack_fsdb_delete_)) pack_fsdb_delete = m.ReadYn("v"); - else if (ctx.Match(k, Invk__pack_custom_name_)) pack_custom_name = m.ReadStr("v"); - else if (ctx.Match(k, Invk__pack_custom_files_)) pack_custom_files = m.ReadStr("v"); // pack_custom {files='en.wikipedia.org-core.xowa|en.wikipedia.org-html-ns.008.xowa'}} - else if (ctx.Match(k, Invk__wiki_date_)) wiki_date = m.ReadStr("v"); + if (ctx.Match(k, "deploy_dir_")) deploy_dir = m.ReadIoUrl("v"); + else if (ctx.Match(k, "pack_text_")) pack_text = m.ReadYn("v"); + else if (ctx.Match(k, "pack_html_")) pack_html = m.ReadYn("v"); + else if (ctx.Match(k, "pack_file_")) pack_file = m.ReadYn("v"); + else if (ctx.Match(k, "pack_file_cutoff_")) pack_file_cutoff = m.ReadDate("v"); + else if (ctx.Match(k, "pack_fsdb_delete_")) pack_fsdb_delete = m.ReadYn("v"); + else if (ctx.Match(k, "pack_custom_name_")) pack_custom_name = m.ReadStr("v"); + else if (ctx.Match(k, "pack_custom_files_")) pack_custom_files = m.ReadStr("v"); // pack_custom {files='en.wikipedia.org-core.xowa|en.wikipedia.org-html-ns.008.xowa'}} + else if (ctx.Match(k, "wiki_date_")) wiki_date = m.ReadStr("v"); + else if (ctx.Match(k, "pack_lucene_")) pack_lucene = m.ReadYn("v"); + else if (ctx.Match(k, "lucene_max_")) lucene_max = m.ReadLong("v") * Io_mgr.Len_mb; else return Gfo_invk_.Rv_unhandled; return this; } - private static final String Invk__deploy_dir_ = "deploy_dir_" - , Invk__pack_text_ = "pack_text_", Invk__pack_html_ = "pack_html_", Invk__pack_file_ = "pack_file_", Invk__pack_file_cutoff_ = "pack_file_cutoff_" - , Invk__pack_fsdb_delete_ = "pack_fsdb_delete_" - , Invk__pack_custom_name_ = "pack_custom_name_", Invk__pack_custom_files_ = "pack_custom_files_" - , Invk__wiki_date_ = "wiki_date_" - ; } diff --git a/400_xowa/src/gplx/xowa/addons/bldrs/exports/packs/files/Pack_hash.java b/400_xowa/src/gplx/xowa/addons/bldrs/exports/packs/files/Pack_hash.java index 9cd658264..951f825d5 100644 --- a/400_xowa/src/gplx/xowa/addons/bldrs/exports/packs/files/Pack_hash.java +++ b/400_xowa/src/gplx/xowa/addons/bldrs/exports/packs/files/Pack_hash.java @@ -19,14 +19,16 @@ class Pack_hash { public int Len() {return hash.Len();} public Pack_list Get_at(int i) {return (Pack_list)hash.Get_at(i);} public Pack_list Get_by(int tid) {return (Pack_list)hash.Get_by(tid);} - public void Add(Pack_zip_name_bldr bldr, int list_tid, Io_url file_url) { + public Pack_itm Add(Pack_zip_name_bldr bldr, int list_tid, Io_url file_url) {return Add(list_tid, bldr.Bld(file_url), file_url);} + public Pack_itm Add(int list_tid, Io_url pack_url, Io_url... raw_urls) { Pack_list list = (Pack_list)hash.Get_by(list_tid); if (list == null) { list = new Pack_list(list_tid); hash.Add(list_tid, list); } - Pack_itm itm = new Pack_itm(list_tid, bldr.Bld(file_url), file_url); + Pack_itm itm = new Pack_itm(list_tid, pack_url, raw_urls); list.Add(itm); + return itm; } public void Consolidate(int... tids) { // merge n itms into 1 itm; needed for search-core + search-link -> search int tids_len = tids.length; diff --git a/400_xowa/src/gplx/xowa/addons/bldrs/exports/packs/files/Pack_hash_bldr.java b/400_xowa/src/gplx/xowa/addons/bldrs/exports/packs/files/Pack_hash_bldr.java index ff229a761..914194688 100644 --- a/400_xowa/src/gplx/xowa/addons/bldrs/exports/packs/files/Pack_hash_bldr.java +++ b/400_xowa/src/gplx/xowa/addons/bldrs/exports/packs/files/Pack_hash_bldr.java @@ -14,6 +14,7 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt */ package gplx.xowa.addons.bldrs.exports.packs.files; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.bldrs.*; import gplx.xowa.addons.bldrs.exports.*; import gplx.xowa.addons.bldrs.exports.packs.*; +import gplx.core.ios.*; import gplx.fsdb.meta.*; import gplx.xowa.wikis.data.*; import gplx.xowa.addons.bldrs.centrals.dbs.datas.imports.*; @@ -51,6 +52,11 @@ class Pack_hash_bldr { } } + // bld lucene pack + if (cfg.Pack_lucene()) { + Pack_lucene(rv, wiki, zip_name_bldr, cfg); + } + // bld file pack if (cfg.Pack_file()) { Fsm_mnt_itm mnt_itm = wiki.File__mnt_mgr().Mnts__get_at(Fsm_mnt_mgr.Mnt_idx_main); @@ -80,6 +86,51 @@ class Pack_hash_bldr { } return rv; } + private static void Pack_lucene(Pack_hash rv, Xow_wiki wiki, Pack_zip_name_bldr zip_name_bldr, Pack_file_cfg cfg) { + // read files from lucene_dir + Io_url lucene_dir = gplx.xowa.addons.wikis.fulltexts.Xosearch_fulltext_addon.Get_index_dir(wiki); + IoItmHash fils = Io_mgr.Instance.QueryDir_args(lucene_dir).ExecAsItmHash(); + + // init vars + int pack_num = 0; + long size_cur = 0; + long size_max = cfg.Lucene_max(); + List_adp url_list = List_adp_.New(); + int fil_idx = 0; + int fils_len = fils.Len(); + + // loop over each file + while (fil_idx < fils_len) { + IoItmFil fil = (IoItmFil)fils.Get_at(fil_idx); + + // calc size_new + long size_new = size_cur + fil.Size(); + + // if last file, set size_new to max + boolean add_file = true; + if (fil_idx == fils_len - 1) { + size_new = size_max; + url_list.Add(fil.Url()); + add_file = false; + } + + // size exceeded; make new pack + if (size_new >= size_max) { + rv.Add(Xobc_import_type.Tid__wiki__lucene, zip_name_bldr.Bld_by_suffix("xtn.fulltext_search", pack_num), (Io_url[])url_list.To_ary_and_clear(Io_url.class)); + pack_num++; + size_cur = 0; + } + // size too small; just update + else { + size_cur = size_new; + } + + // add file to list + if (add_file) + url_list.Add(fil.Url()); + fil_idx++; + } + } private static Pack_hash Bld_custom_files(Pack_hash rv, Xow_wiki wiki, Io_url wiki_dir, Pack_zip_name_bldr zip_name_bldr, String custom_files_blob) { String[] custom_files = String_.Split(custom_files_blob, "|"); int len = custom_files.length; diff --git a/400_xowa/src/gplx/xowa/addons/bldrs/exports/packs/files/Pack_zip_name_bldr.java b/400_xowa/src/gplx/xowa/addons/bldrs/exports/packs/files/Pack_zip_name_bldr.java index 59186ca8a..b2a888704 100644 --- a/400_xowa/src/gplx/xowa/addons/bldrs/exports/packs/files/Pack_zip_name_bldr.java +++ b/400_xowa/src/gplx/xowa/addons/bldrs/exports/packs/files/Pack_zip_name_bldr.java @@ -26,12 +26,22 @@ public class Pack_zip_name_bldr { // en.wikipedia.org-file-ns.000-db.001.xowa -> this.zip_name_prefix = Bry_.new_u8("Xowa_" + wiki_abrv + "_" + zip_name_suffix); } public Io_url Bld(Io_url orig_url) { - String orig_str = orig_url.NameOnly() + ".zip"; - byte[] orig_bry = Bry_.new_u8(orig_str); + // get name and add .zip; EX: "en.wikipedia.org-file-core.xowa" -> "en.wikipedia.org-file-core.zip" + byte[] orig_bry = Bry_.new_u8(orig_url.NameOnly() + ".zip"); + + // swap dashes with unders; EX: "en.wikipedia.org-file-core.xowa" -> "en.wikipedia.org_file_core.zip" orig_bry = Bry_.Replace(orig_bry, Byte_ascii.Dash, Byte_ascii.Underline); + + // swap domain with xobc-style-prefix; EX: "en.wikipedia.org_file_core.zip" -> "Xowa_enwiki_2017-03_file_core.zip" orig_bry = Bry_.Replace(orig_bry, wiki_domain, zip_name_prefix); + return pack_dir.GenSubFil(String_.new_u8(orig_bry)); } + public Io_url Bld_by_suffix(String suffix, int pack_num) { + // make fil_name EX: "Xowa_enwiki_2017-03" + "_" + "xtn.fulltext_search.001" + .zip + String fil_name = String_.new_u8(zip_name_prefix) + "_" + suffix + "." + Int_.To_str_pad_bgn_zero(pack_num, 3) + ".zip"; + return pack_dir.GenSubFil(fil_name); + } public static Io_url To_wiki_url(Io_url wiki_dir, Io_url zip_dir) { // get wiki_url based on wiki_dir and xobc_zip_fil; EX: "/wiki/en.wikipedia.org/", "/wiki/tmp/Xowa_enwiki_2016-09_file_core_deletion_2016-09/" -> "/wiki/en.wikipedia.org-file-core-deletion-2016.09.zip" String name_str = zip_dir.NameOnly() + ".xowa"; diff --git a/400_xowa/src/gplx/xowa/addons/bldrs/exports/packs/files/Pack_zip_name_bldr__tst.java b/400_xowa/src/gplx/xowa/addons/bldrs/exports/packs/files/Pack_zip_name_bldr__tst.java index ca888de7a..8f33a6be7 100644 --- a/400_xowa/src/gplx/xowa/addons/bldrs/exports/packs/files/Pack_zip_name_bldr__tst.java +++ b/400_xowa/src/gplx/xowa/addons/bldrs/exports/packs/files/Pack_zip_name_bldr__tst.java @@ -20,9 +20,20 @@ public class Pack_zip_name_bldr__tst { @Test public void Basic() { fxt.Test__to_wiki_url("mem/wiki/en.wikipedia.org/", "mem/wiki/en.wikipedia.org/tmp/Xowa_enwiki_2016-09_file_deletion_2016.09/", "mem/wiki/en.wikipedia.org/en.wikipedia.org-file-deletion-2016.09.xowa"); } + @Test public void Bld_by_suffix() { + Pack_zip_name_bldr bldr = fxt.Make__bldr("mem/wiki/en.wikipedia.org/tmp/pack/", "en.wikipedia.org", "enwiki", "2017-03", null); + fxt.Test__bld_by_suffix(bldr, "xtn.fulltext_search", 1, "mem/wiki/en.wikipedia.org/tmp/pack/Xowa_enwiki_2017-03_xtn.fulltext_search.001.zip"); + } } class Pack_zip_name_bldr__fxt { public void Test__to_wiki_url(String wiki_dir, String zip_fil, String expd) { Gftest.Eq__str(expd, Pack_zip_name_bldr.To_wiki_url(Io_url_.mem_fil_(wiki_dir), Io_url_.mem_dir_(zip_fil)).Raw(), "wiki_url"); } + + public Pack_zip_name_bldr Make__bldr(String wiki_dir, String domain, String wiki_abrv, String wiki_date, String custom_name) { + return new Pack_zip_name_bldr(Io_url_.new_dir_(wiki_dir), domain, wiki_abrv, wiki_date, custom_name); + } + public void Test__bld_by_suffix(Pack_zip_name_bldr bldr, String suffix, int pack_num, String expd) { + Gftest.Eq__str(expd, bldr.Bld_by_suffix(suffix, pack_num).Xto_api()); + } } diff --git a/400_xowa/src/gplx/xowa/addons/bldrs/exports/packs/splits/Pack_mgr.java b/400_xowa/src/gplx/xowa/addons/bldrs/exports/packs/splits/Pack_mgr.java index c95cfac5b..d7c0fafe4 100644 --- a/400_xowa/src/gplx/xowa/addons/bldrs/exports/packs/splits/Pack_mgr.java +++ b/400_xowa/src/gplx/xowa/addons/bldrs/exports/packs/splits/Pack_mgr.java @@ -19,6 +19,7 @@ import gplx.dbs.*; import gplx.xowa.wikis.data.*; import gplx.xowa.addons.bldrs.centrals.dbs.*; import gplx.xowa.addons.bldrs.centrals.dbs.datas.imports.*; import gplx.xowa.addons.bldrs.centrals.steps.*; import gplx.xowa.addons.bldrs.exports.splits.mgrs.*; import gplx.xowa.addons.bldrs.exports.splits.rslts.*; +// NOTE: used for experimental pack / split approach (html,file,search in one db) class Pack_mgr { public void Exec(Xowe_wiki wiki, long pack_size_max) { // init