mirror of
https://github.com/gnosygnu/xowa.git
synced 2024-10-27 20:34:16 +00:00
Make: Rewrite pagelink builder [#396]
This commit is contained in:
parent
14d978e2af
commit
ef0300dfeb
@ -17,35 +17,15 @@ package gplx.xowa.addons.bldrs.wmdumps.pagelinks.bldrs; import gplx.*; import gp
|
||||
import gplx.xowa.bldrs.*; import gplx.xowa.bldrs.wkrs.*; import gplx.xowa.bldrs.sql_dumps.*;
|
||||
import gplx.dbs.*; import gplx.dbs.qrys.*; import gplx.xowa.wikis.data.*; import gplx.xowa.addons.bldrs.wmdumps.pagelinks.dbs.*;
|
||||
public class Pglnk_bldr_cmd extends Xob_sql_dump_base implements Xosql_dump_cbk {
|
||||
private Db_conn conn;
|
||||
private Pglnk_page_link_temp_tbl temp_tbl;
|
||||
private Pglnk_tempdb_mgr tempdb_mgr;
|
||||
private int tmp_src_id, tmp_trg_ns; private byte[] tmp_trg_ttl;
|
||||
private int rows = 0;
|
||||
private int row_max = 200 * 1000 * 1000;
|
||||
public Pglnk_bldr_cmd(Xob_bldr bldr, Xowe_wiki wiki) {this.Cmd_ctor(bldr, wiki); this.make_fil_len = Io_mgr.Len_mb;}
|
||||
@Override public String Sql_file_name() {return Dump_type_key;} public static final String Dump_type_key = "pagelinks";
|
||||
@Override protected Xosql_dump_parser New_parser() {return new Xosql_dump_parser(this, "pl_from", "pl_namespace", "pl_title");}
|
||||
@Override public void Cmd_bgn_hook(Xob_bldr bldr, Xosql_dump_parser parser) {
|
||||
wiki.Init_assert();
|
||||
Xob_db_file page_link_db = Xob_db_file.New__page_link(wiki);
|
||||
this.conn = page_link_db.Conn();
|
||||
this.temp_tbl = new Pglnk_page_link_temp_tbl(conn);
|
||||
conn.Meta_tbl_delete(temp_tbl.Tbl_name());
|
||||
temp_tbl.Create_tbl();
|
||||
temp_tbl.Insert_bgn();
|
||||
}
|
||||
@Override public void Cmd_end() {
|
||||
if (fail) return;
|
||||
temp_tbl.Insert_end();
|
||||
temp_tbl.Create_idx();
|
||||
Pglnk_page_link_tbl actl_tbl = new Pglnk_page_link_tbl(conn);
|
||||
conn.Meta_tbl_delete(actl_tbl.Tbl_name());
|
||||
actl_tbl.Create_tbl();
|
||||
new Db_attach_mgr(conn, new Db_attach_itm("page_db", wiki.Data__core_mgr().Db__core().Conn()))
|
||||
.Exec_sql_w_msg("updating page_link", Sql__page_link__make);
|
||||
conn.Meta_tbl_delete(temp_tbl.Tbl_name());
|
||||
actl_tbl.Create_idx__src_trg();
|
||||
actl_tbl.Create_idx__trg_src();
|
||||
conn.Env_vacuum();
|
||||
tempdb_mgr = new Pglnk_tempdb_mgr(usr_dlg, wiki, row_max);
|
||||
}
|
||||
public void On_fld_done(int fld_idx, byte[] src, int val_bgn, int val_end) {
|
||||
switch (fld_idx) {
|
||||
@ -55,19 +35,20 @@ public class Pglnk_bldr_cmd extends Xob_sql_dump_base implements Xosql_dump_cbk
|
||||
}
|
||||
}
|
||||
public void On_row_done() {
|
||||
temp_tbl.Insert(tmp_src_id, tmp_trg_ns, tmp_trg_ttl);
|
||||
if (++rows % 100000 == 0) usr_dlg.Prog_many("", "", "reading row ~{0}", Int_.To_str_fmt(rows, "#,##0"));
|
||||
tempdb_mgr.Dump__insert_row(tmp_src_id, tmp_trg_ns, tmp_trg_ttl);
|
||||
}
|
||||
@Override public void Cmd_end() {
|
||||
if (fail) return;
|
||||
tempdb_mgr.Live__create();
|
||||
}
|
||||
@Override public Object Invk(GfsCtx ctx, int ikey, String k, GfoMsg m) {
|
||||
if (ctx.Match(k, Invk_row_max_)) row_max = m.ReadInt("v");
|
||||
else return super.Invk(ctx, ikey, k, m);
|
||||
return this;
|
||||
}
|
||||
|
||||
private static final byte Fld__pl_from = 0, Fld__pl_namespace = 1, Fld__pl_title = 2;
|
||||
private static final String Sql__page_link__make = String_.Concat_lines_nl_skip_last
|
||||
( "INSERT INTO page_link (src_id, trg_id, trg_count)"
|
||||
, "SELECT pl.src_id"
|
||||
, ", p.page_id"
|
||||
, ", Count(p.page_id)"
|
||||
, "FROM page_link_temp pl"
|
||||
, " JOIN <page_db>page p ON pl.trg_ns = p.page_namespace AND pl.trg_ttl = p.page_title"
|
||||
, "GROUP BY pl.src_id, p.page_id"
|
||||
);
|
||||
private static final String Invk_row_max_ = "row_max_";
|
||||
|
||||
public static final String BLDR_CMD_KEY = "wiki.page_link";
|
||||
@Override public String Cmd_key() {return BLDR_CMD_KEY;}
|
||||
|
@ -0,0 +1,153 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.xowa.addons.bldrs.wmdumps.pagelinks.bldrs; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.bldrs.*; import gplx.xowa.addons.bldrs.wmdumps.*; import gplx.xowa.addons.bldrs.wmdumps.pagelinks.*;
|
||||
import gplx.core.ios.*; import gplx.dbs.*; import gplx.dbs.qrys.*; import gplx.xowa.wikis.dbs.*; import gplx.dbs.cfgs.*;
|
||||
import gplx.xowa.bldrs.*;
|
||||
import gplx.xowa.addons.bldrs.wmdumps.pagelinks.dbs.*;
|
||||
class Pglnk_tempdb_mgr {
|
||||
private final Gfo_usr_dlg usr_dlg;
|
||||
private final Xow_wiki wiki;
|
||||
private final Db_conn conn;
|
||||
private final int row_max;
|
||||
private int rows;
|
||||
|
||||
private final Dbmeta_fld_list dump_flds;
|
||||
private final String dump_tbl_name = "pagelink_dump", dump_src_id, dump_trg_ns, dump_trg_ttl;
|
||||
private Db_stmt dump_insert;
|
||||
|
||||
private final String temp_tbl_name = "pagelink_temp";
|
||||
|
||||
public Pglnk_tempdb_mgr(Gfo_usr_dlg usr_dlg, Xow_wiki wiki, int row_max) {
|
||||
// init members
|
||||
this.usr_dlg = usr_dlg;
|
||||
this.wiki = wiki;
|
||||
this.row_max = row_max;
|
||||
|
||||
// create conn
|
||||
conn = Xob_db_file.New__page_link(wiki).Conn();
|
||||
|
||||
// create dump_tbl
|
||||
dump_flds = new Dbmeta_fld_list();
|
||||
dump_flds.Add_int_pkey_autonum("uid");
|
||||
dump_src_id = dump_flds.Add_int("src_id");
|
||||
dump_trg_ns = dump_flds.Add_int("trg_ns");
|
||||
dump_trg_ttl = dump_flds.Add_str("trg_ttl", 255);
|
||||
|
||||
// create temp_tbl
|
||||
Dbmeta_fld_list temp_flds = new Dbmeta_fld_list();
|
||||
temp_flds.Add_int("src_id");
|
||||
temp_flds.Add_int("trg_id");
|
||||
temp_flds.Add_int("trg_count");
|
||||
conn.Meta_tbl_create(Dbmeta_tbl_itm.New(temp_tbl_name, temp_flds));
|
||||
}
|
||||
private void Dump__insert_bgn() {
|
||||
conn.Meta_tbl_create(Dbmeta_tbl_itm.New(dump_tbl_name, dump_flds));
|
||||
conn.Txn_bgn("pagelink__dump__insert");
|
||||
dump_insert = conn.Stmt_insert(dump_tbl_name, dump_flds);
|
||||
}
|
||||
public void Dump__insert_row(int src_id, int trg_ns, byte[] trg_ttl) {
|
||||
// move rows from dump to temp every n million
|
||||
if (rows % row_max == 0) {
|
||||
if (rows != 0) Dump__insert_end();
|
||||
Dump__insert_bgn();
|
||||
}
|
||||
|
||||
// do insert; write prog
|
||||
dump_insert.Clear().Val_int(dump_src_id, src_id).Val_int(dump_trg_ns, trg_ns).Val_bry_as_str(dump_trg_ttl, trg_ttl).Exec_insert();
|
||||
if (++rows % 100000 == 0) usr_dlg.Prog_many("", "", "reading row ~{0}", Int_.To_str_fmt(rows, "#,##0"));
|
||||
}
|
||||
private void Dump__insert_end() {
|
||||
// clean-up insert_stmt
|
||||
conn.Txn_end();
|
||||
dump_insert = Db_stmt_.Rls(dump_insert);
|
||||
conn.Meta_idx_create(Gfo_usr_dlg_.Instance, Dbmeta_idx_itm.new_normal_by_tbl(dump_tbl_name, "main", dump_src_id, dump_trg_ns, dump_trg_ttl));
|
||||
|
||||
// move rows from dump to temp
|
||||
new Db_attach_mgr(conn, new Db_attach_itm("page_db", wiki.Data__core_mgr().Db__core().Conn()))
|
||||
.Exec_sql_w_msg
|
||||
( String_.Format("transferring from dump to temp: row={0}", Int_.To_str_fmt(rows, "#,##0"))
|
||||
, String_.Concat_lines_nl_skip_last
|
||||
( "INSERT INTO pagelink_temp (src_id, trg_id, trg_count)"
|
||||
, "SELECT pl.src_id"
|
||||
, ", p.page_id"
|
||||
, ", Count(p.page_id)"
|
||||
, "FROM pagelink_dump pl"
|
||||
, " JOIN <page_db>page p ON pl.trg_ns = p.page_namespace AND pl.trg_ttl = p.page_title"
|
||||
, "GROUP BY pl.src_id, p.page_id"
|
||||
));
|
||||
|
||||
// drop dump_tbl; vaccuum
|
||||
conn.Meta_tbl_delete(dump_tbl_name);
|
||||
conn.Env_vacuum();
|
||||
}
|
||||
public void Live__create() {
|
||||
// end current batch
|
||||
if (rows > 0) this.Dump__insert_end();
|
||||
|
||||
// index temp tbl
|
||||
conn.Meta_idx_create(Gfo_usr_dlg_.Instance, Dbmeta_idx_itm.new_normal_by_tbl(temp_tbl_name, "main", "src_id", "trg_id"));
|
||||
|
||||
// create live_tbl
|
||||
Pglnk_page_link_tbl live_tbl = new Pglnk_page_link_tbl(conn);
|
||||
live_tbl.Create_tbl();
|
||||
|
||||
// move rows from temp to live; drop temp tbl
|
||||
conn.Exec_sql_concat_w_msg
|
||||
( String_.Format("creating live tbl: row={0}", Int_.To_str_fmt(rows, "#,##0"))
|
||||
, "INSERT INTO page_link (src_id, trg_id, trg_count)"
|
||||
, "SELECT pl.src_id"
|
||||
, ", pl.trg_id"
|
||||
, ", Sum(pl.trg_count)"
|
||||
, "FROM pagelink_temp pl"
|
||||
, "GROUP BY pl.src_id, pl.trg_id"
|
||||
);
|
||||
conn.Meta_tbl_delete(temp_tbl_name);
|
||||
conn.Env_vacuum(); // NOTE: do not VACCUUM after indexes
|
||||
|
||||
// create idxs
|
||||
live_tbl.Create_idx__src_trg();
|
||||
live_tbl.Create_idx__trg_src();
|
||||
}
|
||||
}
|
||||
/*
|
||||
"A fatal error has been detected by the Java Runtime Environment" ISSUE#:396 DATE:2019-03-23
|
||||
|
||||
== Background ==
|
||||
* Occurred on Linux openSUSE 13.2 (desb42 reported completion (Windows OS?))
|
||||
* Size of xowa.wiki.pagelinks.sqlite3 at time of crash was 42 GB
|
||||
* Number of rows in dump table was 1,215,821,988
|
||||
|
||||
== Original attempt ==
|
||||
* Insert rows into dump table
|
||||
* Fatal error occurred during CREATE INDEX on dump table
|
||||
** Possibly b/c of INDEX on 1.2 billion varchar fields?
|
||||
|
||||
== Second attempt ==
|
||||
* Insert rows into dump table N
|
||||
* After 500 million, INDEX old dump table N and create a new dump table N + 1
|
||||
** INDEX now on 500 million varchar fields
|
||||
* After EOS, loop each dump table and transfer into live table
|
||||
* Fatal error occurred during final VACUUM (possibly b/c of space from the 1.2 billion varchar fields?)
|
||||
|
||||
== Third attempt ==
|
||||
* Insert rows into dump table
|
||||
* After 200 million rows, INDEX dump table, transfer into temp table, and recreate dump table
|
||||
** INDEX now on 200 million varchar fields
|
||||
* After EOS, drop dump table, INDEX temp table and transfer into live table
|
||||
** INDEX now on 1.2 billion int,int fields
|
||||
** No varchar fields during final transfer and VACUUM; db roughly 55 GB
|
||||
* NOTE: VACUUM after the live indexes are created still causes a failure, but it is a Java error (not a core dump)
|
||||
*/
|
@ -16,7 +16,7 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
package gplx.xowa.addons.bldrs.wmdumps.pagelinks.dbs; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.bldrs.*; import gplx.xowa.addons.bldrs.wmdumps.*; import gplx.xowa.addons.bldrs.wmdumps.pagelinks.*;
|
||||
import gplx.core.ios.*; import gplx.dbs.*; import gplx.dbs.qrys.*; import gplx.xowa.wikis.dbs.*; import gplx.dbs.cfgs.*;
|
||||
public class Pglnk_page_link_tbl implements Rls_able {
|
||||
private final String tbl_name = "page_link"; private final Dbmeta_fld_list flds = new Dbmeta_fld_list();
|
||||
private static final String tbl_name = "page_link"; private final Dbmeta_fld_list flds = new Dbmeta_fld_list();
|
||||
private final String fld_src_id, fld_trg_id;
|
||||
private final Db_conn conn;
|
||||
public Pglnk_page_link_tbl(Db_conn conn) {
|
||||
|
@ -1,43 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.xowa.addons.bldrs.wmdumps.pagelinks.dbs; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.bldrs.*; import gplx.xowa.addons.bldrs.wmdumps.*; import gplx.xowa.addons.bldrs.wmdumps.pagelinks.*;
|
||||
import gplx.core.ios.*; import gplx.dbs.*; import gplx.dbs.qrys.*; import gplx.xowa.wikis.dbs.*; import gplx.dbs.cfgs.*;
|
||||
public class Pglnk_page_link_temp_tbl implements Rls_able {
|
||||
private final String tbl_name = "page_link_temp"; private final Dbmeta_fld_list flds = new Dbmeta_fld_list();
|
||||
private final String fld_src_id, fld_trg_ns, fld_trg_ttl;
|
||||
private final Db_conn conn; private Db_stmt stmt_insert;
|
||||
public Pglnk_page_link_temp_tbl(Db_conn conn) {
|
||||
this.conn = conn;
|
||||
flds.Add_int_pkey_autonum("uid");
|
||||
fld_src_id = flds.Add_int("src_id");
|
||||
fld_trg_ns = flds.Add_int("trg_ns");
|
||||
fld_trg_ttl = flds.Add_str("trg_ttl", 255);
|
||||
conn.Rls_reg(this);
|
||||
}
|
||||
public Db_conn Conn() {return conn;}
|
||||
public String Tbl_name() {return tbl_name;}
|
||||
public void Create_tbl() {conn.Meta_tbl_create(Dbmeta_tbl_itm.New(tbl_name, flds));}
|
||||
public void Create_idx() {conn.Meta_idx_create(Gfo_usr_dlg_.Instance, Dbmeta_idx_itm.new_normal_by_tbl(tbl_name, "main", fld_src_id, fld_trg_ns, fld_trg_ttl));}
|
||||
public void Insert_bgn() {conn.Txn_bgn("page_link__insert_bulk"); stmt_insert = conn.Stmt_insert(tbl_name, flds);}
|
||||
public void Insert_end() {conn.Txn_end(); stmt_insert = Db_stmt_.Rls(stmt_insert);}
|
||||
public void Insert(int src_id, int trg_ns, byte[] trg_ttl) {
|
||||
if (stmt_insert == null) stmt_insert = conn.Stmt_insert(tbl_name, flds);
|
||||
stmt_insert.Clear().Val_int(fld_src_id, src_id).Val_int(fld_trg_ns, trg_ns).Val_bry_as_str(fld_trg_ttl, trg_ttl).Exec_insert();
|
||||
}
|
||||
public void Rls() {
|
||||
stmt_insert = Db_stmt_.Rls(stmt_insert);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user