1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2026-03-02 03:49:30 +00:00

Mass_parse: Refactor ifexist_mgr into separate class; Load more titles into ifexist_mgr

This commit is contained in:
gnosygnu
2016-12-16 13:23:48 -05:00
parent bfc5c5b68e
commit 5b42a90cd5
8 changed files with 207 additions and 102 deletions

View File

@@ -35,21 +35,23 @@ public class Xomp_parse_mgr {
Xomp_prog_mgr prog_mgr = new Xomp_prog_mgr();
prog_mgr.Init(page_pool_loader.Get_pending_count(), cfg.Progress_interval());
// cache: disable general settings
wiki.App().User().User_db_mgr().Cache_mgr().Enabled_n_(); // disable db lookups of cache
Gfo_cache_mgr commons_cache = new Gfo_cache_mgr().Max_size_(Int_.Max_value).Reduce_by_(Int_.Max_value);
Gfo_cache_mgr ifexist_cache = new Gfo_cache_mgr().Max_size_(Int_.Max_value).Reduce_by_(Int_.Max_value);
// cache: preload tmpls and imglinks
Xow_page_cache page_cache = Xomp_tmpl_cache_bldr.New(wiki, cfg.Load_all_templates());
wiki.App().User().User_db_mgr().Cache_mgr().Enabled_n_(); // disable db lookups of user cache
Gfo_cache_mgr commons_cache = new Gfo_cache_mgr().Max_size_(Int_.Max_value).Reduce_by_(Int_.Max_value);
Xow_ifexist_cache ifexist_cache = new Xow_ifexist_cache(wiki, page_cache).Cache_sizes_(Int_.Max_value, Int_.Max_value);
if (cfg.Load_ifexists_ns() != null) Load_ifexists_ns(wiki, ifexist_cache, cfg.Load_ifexists_ns());
Xof_orig_wkr__img_links file_orig_wkr = new Xof_orig_wkr__img_links(wiki);
if (cfg.Load_all_imglnks()) Xof_orig_wkr__img_links_.Load_all(file_orig_wkr);
if (cfg.Load_all_imglinks()) Xof_orig_wkr__img_links_.Load_all(file_orig_wkr);
// load_wkr: init and start
// Xomp_load_wkr load_wkr = new Xomp_load_wkr(wiki, db_mgr.Mgr_db().Conn(), cfg.Num_pages_in_pool(), cfg.Num_wkrs());
// Thread_adp_.Start_by_key("xomp.load", Cancelable_.Never, load_wkr, Xomp_load_wkr.Invk__exec);
// assert wkr_tbl
Gfo_usr_dlg_.Instance.Prog_many("", "", "initing wkrs");
int wkr_len = cfg.Num_wkrs();
int wkr_uid_bgn = mgr_db.Tbl__wkr().Init_wkrs(cfg.Wkr_machine_name(), wkr_len);
latch = new Gfo_countdown_latch(wkr_len);
@@ -86,4 +88,46 @@ public class Xomp_parse_mgr {
}
Gfo_usr_dlg_.Instance.Note_many("", "", bfr.To_str_and_clear());
}
/**
 * Preloads the ifexist cache with the title of every page in the requested namespaces,
 * then marks those namespaces as loaded on the cache.
 *
 * @param wiki    supplies the namespace manager (used to expand "*"), the page table, and title parsing
 * @param cache   receives one Add call per row read, followed by a single Add_ns_loaded call
 * @param ns_list comma-separated namespace ids, or "*" for every namespace whose id is >= 0
 */
private static void Load_ifexists_ns(Xow_wiki wiki, Xow_ifexist_cache cache, String ns_list) {
	// expand "*" to all
	if (String_.Eq(ns_list, "*")) {
		Bry_bfr all_bfr = Bry_bfr_.New();
		gplx.xowa.wikis.nss.Xow_ns_mgr ns_mgr = wiki.Ns_mgr();
		int len = ns_mgr.Ids_len();
		for (int i = 0; i < len; i++) {
			gplx.xowa.wikis.nss.Xow_ns ns = ns_mgr.Ids_get_at(i);
			if (ns.Id() >= 0) {	// skip Media / Special
				if (all_bfr.Len() != 0) all_bfr.Add_byte_comma();
				all_bfr.Add_int_variable(ns.Id());
			}
		}
		ns_list = all_bfr.To_str_and_clear();
	}

	// parse the ids BEFORE touching the db: a malformed list now fails here instead of after
	// the whole page table has been scanned
	int[] ns_ids = Int_.Ary_parse(ns_list, ",");

	// rebuild the IN-list from the parsed ints so that only numeric text is spliced into the SQL
	// and the namespaces queried are exactly the ones marked as loaded below
	Bry_bfr in_bfr = Bry_bfr_.New();
	for (int i = 0; i < ns_ids.length; i++) {
		if (i != 0) in_bfr.Add_byte_comma();
		in_bfr.Add_int_variable(ns_ids[i]);
	}
	String ns_in_list = in_bfr.To_str_and_clear();

	// load all titles
	gplx.xowa.wikis.data.tbls.Xowd_page_tbl page_tbl = wiki.Data__core_mgr().Db__core().Tbl__page();
	String sql = gplx.dbs.Db_sql_.Make_by_fmt(String_.Ary
	( "SELECT {0}, {1}"
	, "FROM {2}"
	, "WHERE {0} IN ({3})"
	), page_tbl.Fld_page_ns(), page_tbl.Fld_page_title()
	, page_tbl.Tbl_name()
	, ns_in_list
	);
	gplx.dbs.Db_rdr rdr = page_tbl.Conn().Stmt_sql(sql).Exec_select__rls_auto();
	try {
		int counter = 0;
		while (rdr.Move_next()) {
			int ns = rdr.Read_int(page_tbl.Fld_page_ns());
			byte[] page_db = rdr.Read_bry_by_str(page_tbl.Fld_page_title());
			Xoa_ttl ttl = wiki.Ttl_parse(ns, page_db);
			cache.Add(ttl);
			// progress note every 100,000 rows (including row 0)
			if (counter % 100000 == 0) Gfo_usr_dlg_.Instance.Prog_many("", "", "loading ifexists: " + counter);
			counter++;
		}
	} finally {rdr.Rls();}

	// mark ns
	cache.Add_ns_loaded(ns_ids);
}
}

View File

@@ -18,23 +18,24 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
package gplx.xowa.addons.bldrs.mass_parses.parses.mgrs; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.bldrs.*; import gplx.xowa.addons.bldrs.mass_parses.*; import gplx.xowa.addons.bldrs.mass_parses.parses.*;
import gplx.core.ios.streams.*;
public class Xomp_parse_mgr_cfg implements Gfo_invk {
public int Num_wkrs() {return num_wkrs;} private int num_wkrs = -1;
public int Num_pages_in_pool() {return num_pages_in_pool;} private int num_pages_in_pool = -1;
public int Num_pages_per_wkr() {return num_pages_per_wkr;} private int num_pages_per_wkr = 1000;
public int Progress_interval() {return progress_interval;} private int progress_interval = 1000;
public int Commit_interval() {return commit_interval;} private int commit_interval = 10000;
public int Cleanup_interval() {return cleanup_interval;} private int cleanup_interval = 50; // setting at 1000 uses lots of memory
public boolean Hdump_enabled() {return hdump_enabled;} private boolean hdump_enabled = true;
public boolean Hdump_catboxs() {return hdump_catboxs;} private boolean hdump_catboxs = false;
public boolean Hzip_enabled() {return hzip_enabled;} private boolean hzip_enabled = true;
public boolean Hdiff_enabled() {return hdiff_enabled;} private boolean hdiff_enabled = true;
public boolean Log_file_lnkis() {return log_file_lnkis;} private boolean log_file_lnkis = true;
public boolean Load_all_templates() {return load_all_templates;} private boolean load_all_templates = true;
public boolean Load_all_imglnks() {return load_all_imglnks;} private boolean load_all_imglnks = true;
public byte Zip_tid() {return zip_tid;} private byte zip_tid = Io_stream_tid_.Tid__gzip;
public Io_url Mgr_url() {return mgr_url;} private Io_url mgr_url;
public String Wkr_machine_name() {return wkr_machine_name;} private String wkr_machine_name;
public boolean Show_msg__fetched_pool() {return show_msg__fetched_pool;} private boolean show_msg__fetched_pool;
public int Num_wkrs() {return num_wkrs;} private int num_wkrs = -1;
public int Num_pages_in_pool() {return num_pages_in_pool;} private int num_pages_in_pool = -1;
public int Num_pages_per_wkr() {return num_pages_per_wkr;} private int num_pages_per_wkr = 1000;
public int Progress_interval() {return progress_interval;} private int progress_interval = 1000;
public int Commit_interval() {return commit_interval;} private int commit_interval = 10000;
public int Cleanup_interval() {return cleanup_interval;} private int cleanup_interval = 50; // setting at 1000 uses lots of memory
public boolean Hdump_enabled() {return hdump_enabled;} private boolean hdump_enabled = true;
public boolean Hdump_catboxs() {return hdump_catboxs;} private boolean hdump_catboxs = false;
public boolean Hzip_enabled() {return hzip_enabled;} private boolean hzip_enabled = true;
public boolean Hdiff_enabled() {return hdiff_enabled;} private boolean hdiff_enabled = true;
public boolean Log_file_lnkis() {return log_file_lnkis;} private boolean log_file_lnkis = true;
public boolean Load_all_templates() {return load_all_templates;} private boolean load_all_templates = true;
public boolean Load_all_imglinks() {return load_all_imglinks;} private boolean load_all_imglinks = true;
public String Load_ifexists_ns() {return load_ifexists_ns;} private String load_ifexists_ns = null;
public byte Zip_tid() {return zip_tid;} private byte zip_tid = Io_stream_tid_.Tid__gzip;
public Io_url Mgr_url() {return mgr_url;} private Io_url mgr_url;
public String Wkr_machine_name() {return wkr_machine_name;} private String wkr_machine_name;
public boolean Show_msg__fetched_pool() {return show_msg__fetched_pool;} private boolean show_msg__fetched_pool;
public void Init(Xowe_wiki wiki) {
if (num_wkrs == -1) num_wkrs = gplx.core.envs.Runtime_.Cpu_count();
if (num_pages_in_pool == -1) num_pages_in_pool = num_wkrs * 1000;
@@ -54,7 +55,8 @@ public class Xomp_parse_mgr_cfg implements Gfo_invk {
else if (ctx.Match(k, Invk__hdiff_enabled_)) hdiff_enabled = m.ReadYn("v");
else if (ctx.Match(k, Invk__zip_tid_)) zip_tid = m.ReadByte("v");
else if (ctx.Match(k, Invk__load_all_templates_)) load_all_templates = m.ReadYn("v");
else if (ctx.Match(k, Invk__load_all_imglnks_)) load_all_imglnks = m.ReadYn("v");
else if (ctx.Match(k, Invk__load_all_imglinks_)) load_all_imglinks = m.ReadYn("v");
else if (ctx.Match(k, Invk__load_ifexists_ns_)) load_ifexists_ns = m.ReadStr("v");
else if (ctx.Match(k, Invk__manual_now_)) Datetime_now.Manual_and_freeze_(m.ReadDate("v"));
else if (ctx.Match(k, Invk__mgr_url_)) mgr_url = m.ReadIoUrl("v");
else if (ctx.Match(k, Invk__wkr_machine_name_)) wkr_machine_name = m.ReadStr("v");
@@ -67,7 +69,7 @@ public class Xomp_parse_mgr_cfg implements Gfo_invk {
Invk__num_wkrs_ = "num_wkrs_", Invk__num_pages_in_pool_ = "num_pages_in_pool_", Invk__num_pages_per_wkr_ = "num_pages_per_wkr_"
, Invk__progress_interval_ = "progress_interval_", Invk__commit_interval_ = "commit_interval_", Invk__cleanup_interval_ = "cleanup_interval_"
, Invk__hdump_enabled_ = "hdump_enabled_", Invk__hzip_enabled_ = "hzip_enabled_", Invk__hdiff_enabled_ = "hdiff_enabled_", Invk__zip_tid_ = "zip_tid_"
, Invk__load_all_templates_ = "load_all_templates_", Invk__load_all_imglnks_ = "load_all_imglnks_", Invk__manual_now_ = "manual_now_"
, Invk__load_all_templates_ = "load_all_templates_", Invk__load_all_imglinks_ = "load_all_imglinks_", Invk__load_ifexists_ns_ = "load_ifexists_ns_", Invk__manual_now_ = "manual_now_"
, Invk__hdump_catboxes_ = "hdump_catboxes_"
, Invk__mgr_url_ = "mgr_url_", Invk__wkr_machine_name_ = "wkr_machine_name_"
, Invk__show_msg__fetched_pool_ = "show_msg__fetched_pool_"

View File

@@ -32,7 +32,7 @@ public class Xomp_tmpl_cache_bldr {
, ", pp.page_text_db_id"
, ", pp.page_redirect_id"
, "FROM page pp"
, "WHERE pp.page_namespace IN (10, 828)"
, "WHERE pp.page_namespace IN (8, 10, 828)"
);
Xomp_text_db_loader text_db_loader = new Xomp_text_db_loader(wiki);

View File

@@ -27,7 +27,7 @@ public class Xop_mediawiki_wkr {
}
// Wraps `loader` in a Xow_page_cache_wkr__embeddable and registers it as a load worker via wiki.Cache_mgr().
// NOTE(review): the two Load_wkr_ calls below differ only by `.Page_cache()` and look like the removed/added
// sides of a diff hunk. As written, only the first call is guarded by the null check — confirm which call should remain.
public void Loader_(Xop_mediawiki_loader loader) {
if (loader != null)
wiki.Cache_mgr().Page_cache().Load_wkr_(new Xow_page_cache_wkr__embeddable(loader));
wiki.Cache_mgr().Load_wkr_(new Xow_page_cache_wkr__embeddable(loader));
}
public void Free_memory() {
wiki.Cache_mgr().Tmpl_result_cache().Clear();