1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2024-10-27 20:34:16 +00:00

Make: Rewrite pagelink builder [#396]

This commit is contained in:
gnosygnu 2019-03-24 07:54:33 -04:00
parent 14d978e2af
commit ef0300dfeb
4 changed files with 169 additions and 78 deletions

View File

@ -17,35 +17,15 @@ package gplx.xowa.addons.bldrs.wmdumps.pagelinks.bldrs; import gplx.*; import gp
import gplx.xowa.bldrs.*; import gplx.xowa.bldrs.wkrs.*; import gplx.xowa.bldrs.sql_dumps.*;
import gplx.dbs.*; import gplx.dbs.qrys.*; import gplx.xowa.wikis.data.*; import gplx.xowa.addons.bldrs.wmdumps.pagelinks.dbs.*;
public class Pglnk_bldr_cmd extends Xob_sql_dump_base implements Xosql_dump_cbk {
private Db_conn conn;
private Pglnk_page_link_temp_tbl temp_tbl;
private Pglnk_tempdb_mgr tempdb_mgr;
private int tmp_src_id, tmp_trg_ns; private byte[] tmp_trg_ttl;
private int rows = 0;
private int row_max = 200 * 1000 * 1000;
public Pglnk_bldr_cmd(Xob_bldr bldr, Xowe_wiki wiki) {this.Cmd_ctor(bldr, wiki); this.make_fil_len = Io_mgr.Len_mb;}
@Override public String Sql_file_name() {return Dump_type_key;} public static final String Dump_type_key = "pagelinks";
@Override protected Xosql_dump_parser New_parser() {return new Xosql_dump_parser(this, "pl_from", "pl_namespace", "pl_title");}
@Override public void Cmd_bgn_hook(Xob_bldr bldr, Xosql_dump_parser parser) {
wiki.Init_assert();
Xob_db_file page_link_db = Xob_db_file.New__page_link(wiki);
this.conn = page_link_db.Conn();
this.temp_tbl = new Pglnk_page_link_temp_tbl(conn);
conn.Meta_tbl_delete(temp_tbl.Tbl_name());
temp_tbl.Create_tbl();
temp_tbl.Insert_bgn();
}
@Override public void Cmd_end() {
if (fail) return;
temp_tbl.Insert_end();
temp_tbl.Create_idx();
Pglnk_page_link_tbl actl_tbl = new Pglnk_page_link_tbl(conn);
conn.Meta_tbl_delete(actl_tbl.Tbl_name());
actl_tbl.Create_tbl();
new Db_attach_mgr(conn, new Db_attach_itm("page_db", wiki.Data__core_mgr().Db__core().Conn()))
.Exec_sql_w_msg("updating page_link", Sql__page_link__make);
conn.Meta_tbl_delete(temp_tbl.Tbl_name());
actl_tbl.Create_idx__src_trg();
actl_tbl.Create_idx__trg_src();
conn.Env_vacuum();
tempdb_mgr = new Pglnk_tempdb_mgr(usr_dlg, wiki, row_max);
}
public void On_fld_done(int fld_idx, byte[] src, int val_bgn, int val_end) {
switch (fld_idx) {
@ -55,19 +35,20 @@ public class Pglnk_bldr_cmd extends Xob_sql_dump_base implements Xosql_dump_cbk
}
}
public void On_row_done() {
temp_tbl.Insert(tmp_src_id, tmp_trg_ns, tmp_trg_ttl);
if (++rows % 100000 == 0) usr_dlg.Prog_many("", "", "reading row ~{0}", Int_.To_str_fmt(rows, "#,##0"));
tempdb_mgr.Dump__insert_row(tmp_src_id, tmp_trg_ns, tmp_trg_ttl);
}
@Override public void Cmd_end() {
if (fail) return;
tempdb_mgr.Live__create();
}
@Override public Object Invk(GfsCtx ctx, int ikey, String k, GfoMsg m) {
if (ctx.Match(k, Invk_row_max_)) row_max = m.ReadInt("v");
else return super.Invk(ctx, ikey, k, m);
return this;
}
private static final byte Fld__pl_from = 0, Fld__pl_namespace = 1, Fld__pl_title = 2;
private static final String Sql__page_link__make = String_.Concat_lines_nl_skip_last
( "INSERT INTO page_link (src_id, trg_id, trg_count)"
, "SELECT pl.src_id"
, ", p.page_id"
, ", Count(p.page_id)"
, "FROM page_link_temp pl"
, " JOIN <page_db>page p ON pl.trg_ns = p.page_namespace AND pl.trg_ttl = p.page_title"
, "GROUP BY pl.src_id, p.page_id"
);
private static final String Invk_row_max_ = "row_max_";
public static final String BLDR_CMD_KEY = "wiki.page_link";
@Override public String Cmd_key() {return BLDR_CMD_KEY;}

View File

@ -0,0 +1,153 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.addons.bldrs.wmdumps.pagelinks.bldrs; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.bldrs.*; import gplx.xowa.addons.bldrs.wmdumps.*; import gplx.xowa.addons.bldrs.wmdumps.pagelinks.*;
import gplx.core.ios.*; import gplx.dbs.*; import gplx.dbs.qrys.*; import gplx.xowa.wikis.dbs.*; import gplx.dbs.cfgs.*;
import gplx.xowa.bldrs.*;
import gplx.xowa.addons.bldrs.wmdumps.pagelinks.dbs.*;
class Pglnk_tempdb_mgr {
private final Gfo_usr_dlg usr_dlg;
private final Xow_wiki wiki;
private final Db_conn conn;
private final int row_max;
private int rows;
private final Dbmeta_fld_list dump_flds;
private final String dump_tbl_name = "pagelink_dump", dump_src_id, dump_trg_ns, dump_trg_ttl;
private Db_stmt dump_insert;
private final String temp_tbl_name = "pagelink_temp";
public Pglnk_tempdb_mgr(Gfo_usr_dlg usr_dlg, Xow_wiki wiki, int row_max) {
// init members
this.usr_dlg = usr_dlg;
this.wiki = wiki;
this.row_max = row_max;
// create conn
conn = Xob_db_file.New__page_link(wiki).Conn();
// create dump_tbl
dump_flds = new Dbmeta_fld_list();
dump_flds.Add_int_pkey_autonum("uid");
dump_src_id = dump_flds.Add_int("src_id");
dump_trg_ns = dump_flds.Add_int("trg_ns");
dump_trg_ttl = dump_flds.Add_str("trg_ttl", 255);
// create temp_tbl
Dbmeta_fld_list temp_flds = new Dbmeta_fld_list();
temp_flds.Add_int("src_id");
temp_flds.Add_int("trg_id");
temp_flds.Add_int("trg_count");
conn.Meta_tbl_create(Dbmeta_tbl_itm.New(temp_tbl_name, temp_flds));
}
private void Dump__insert_bgn() {
conn.Meta_tbl_create(Dbmeta_tbl_itm.New(dump_tbl_name, dump_flds));
conn.Txn_bgn("pagelink__dump__insert");
dump_insert = conn.Stmt_insert(dump_tbl_name, dump_flds);
}
public void Dump__insert_row(int src_id, int trg_ns, byte[] trg_ttl) {
// move rows from dump to temp every n million
if (rows % row_max == 0) {
if (rows != 0) Dump__insert_end();
Dump__insert_bgn();
}
// do insert; write prog
dump_insert.Clear().Val_int(dump_src_id, src_id).Val_int(dump_trg_ns, trg_ns).Val_bry_as_str(dump_trg_ttl, trg_ttl).Exec_insert();
if (++rows % 100000 == 0) usr_dlg.Prog_many("", "", "reading row ~{0}", Int_.To_str_fmt(rows, "#,##0"));
}
private void Dump__insert_end() {
// clean-up insert_stmt
conn.Txn_end();
dump_insert = Db_stmt_.Rls(dump_insert);
conn.Meta_idx_create(Gfo_usr_dlg_.Instance, Dbmeta_idx_itm.new_normal_by_tbl(dump_tbl_name, "main", dump_src_id, dump_trg_ns, dump_trg_ttl));
// move rows from dump to temp
new Db_attach_mgr(conn, new Db_attach_itm("page_db", wiki.Data__core_mgr().Db__core().Conn()))
.Exec_sql_w_msg
( String_.Format("transferring from dump to temp: row={0}", Int_.To_str_fmt(rows, "#,##0"))
, String_.Concat_lines_nl_skip_last
( "INSERT INTO pagelink_temp (src_id, trg_id, trg_count)"
, "SELECT pl.src_id"
, ", p.page_id"
, ", Count(p.page_id)"
, "FROM pagelink_dump pl"
, " JOIN <page_db>page p ON pl.trg_ns = p.page_namespace AND pl.trg_ttl = p.page_title"
, "GROUP BY pl.src_id, p.page_id"
));
// drop dump_tbl; vaccuum
conn.Meta_tbl_delete(dump_tbl_name);
conn.Env_vacuum();
}
public void Live__create() {
// end current batch
if (rows > 0) this.Dump__insert_end();
// index temp tbl
conn.Meta_idx_create(Gfo_usr_dlg_.Instance, Dbmeta_idx_itm.new_normal_by_tbl(temp_tbl_name, "main", "src_id", "trg_id"));
// create live_tbl
Pglnk_page_link_tbl live_tbl = new Pglnk_page_link_tbl(conn);
live_tbl.Create_tbl();
// move rows from temp to live; drop temp tbl
conn.Exec_sql_concat_w_msg
( String_.Format("creating live tbl: row={0}", Int_.To_str_fmt(rows, "#,##0"))
, "INSERT INTO page_link (src_id, trg_id, trg_count)"
, "SELECT pl.src_id"
, ", pl.trg_id"
, ", Sum(pl.trg_count)"
, "FROM pagelink_temp pl"
, "GROUP BY pl.src_id, pl.trg_id"
);
conn.Meta_tbl_delete(temp_tbl_name);
conn.Env_vacuum(); // NOTE: do not VACCUUM after indexes
// create idxs
live_tbl.Create_idx__src_trg();
live_tbl.Create_idx__trg_src();
}
}
/*
"A fatal error has been detected by the Java Runtime Environment" ISSUE#:396 DATE:2019-03-23
== Background ==
* Occurred on Linux openSUSE 13.2 (desb42 reported completion (Windows OS?))
* Size of xowa.wiki.pagelinks.sqlite3 at time of crash was 42 GB
* Number of rows in dump table was 1,215,821,988
== Original attempt ==
* Insert rows into dump table
* Fatal error occurred during CREATE INDEX on dump table
** Possibly b/c of INDEX on 1.2 billion varchar fields?
== Second attempt ==
* Insert rows into dump table N
* After 500 million, INDEX old dump table N and create a new dump table N + 1
** INDEX now on 500 million varchar fields
* After EOS, loop each dump table and transfer into live table
* Fatal error occurred during final VACCUUM (possibly b/c of space from the 1.2 billion varchar fields?)
== Third attempt ==
* Insert rows into dump table
* After 200 million rows, INDEX dump table, transfer into temp table, and recreate dump table
** INDEX now on 200 million varchar fields
* After EOS, drop dump table, INDEX temp table and transfer into live table
** INDEX now on 1.2 billion int,int fields
** No varchar fields during final transfer and VACCUUM; db rougly 55 GB
* NOTE: VACCUM after live index still causes failure but Java one (not core dump)
*/

View File

@ -16,7 +16,7 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
package gplx.xowa.addons.bldrs.wmdumps.pagelinks.dbs; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.bldrs.*; import gplx.xowa.addons.bldrs.wmdumps.*; import gplx.xowa.addons.bldrs.wmdumps.pagelinks.*;
import gplx.core.ios.*; import gplx.dbs.*; import gplx.dbs.qrys.*; import gplx.xowa.wikis.dbs.*; import gplx.dbs.cfgs.*;
public class Pglnk_page_link_tbl implements Rls_able {
private final String tbl_name = "page_link"; private final Dbmeta_fld_list flds = new Dbmeta_fld_list();
private static final String tbl_name = "page_link"; private final Dbmeta_fld_list flds = new Dbmeta_fld_list();
private final String fld_src_id, fld_trg_id;
private final Db_conn conn;
public Pglnk_page_link_tbl(Db_conn conn) {

View File

@ -1,43 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.addons.bldrs.wmdumps.pagelinks.dbs; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.bldrs.*; import gplx.xowa.addons.bldrs.wmdumps.*; import gplx.xowa.addons.bldrs.wmdumps.pagelinks.*;
import gplx.core.ios.*; import gplx.dbs.*; import gplx.dbs.qrys.*; import gplx.xowa.wikis.dbs.*; import gplx.dbs.cfgs.*;
public class Pglnk_page_link_temp_tbl implements Rls_able {
private final String tbl_name = "page_link_temp"; private final Dbmeta_fld_list flds = new Dbmeta_fld_list();
private final String fld_src_id, fld_trg_ns, fld_trg_ttl;
private final Db_conn conn; private Db_stmt stmt_insert;
public Pglnk_page_link_temp_tbl(Db_conn conn) {
this.conn = conn;
flds.Add_int_pkey_autonum("uid");
fld_src_id = flds.Add_int("src_id");
fld_trg_ns = flds.Add_int("trg_ns");
fld_trg_ttl = flds.Add_str("trg_ttl", 255);
conn.Rls_reg(this);
}
public Db_conn Conn() {return conn;}
public String Tbl_name() {return tbl_name;}
public void Create_tbl() {conn.Meta_tbl_create(Dbmeta_tbl_itm.New(tbl_name, flds));}
public void Create_idx() {conn.Meta_idx_create(Gfo_usr_dlg_.Instance, Dbmeta_idx_itm.new_normal_by_tbl(tbl_name, "main", fld_src_id, fld_trg_ns, fld_trg_ttl));}
public void Insert_bgn() {conn.Txn_bgn("page_link__insert_bulk"); stmt_insert = conn.Stmt_insert(tbl_name, flds);}
public void Insert_end() {conn.Txn_end(); stmt_insert = Db_stmt_.Rls(stmt_insert);}
public void Insert(int src_id, int trg_ns, byte[] trg_ttl) {
if (stmt_insert == null) stmt_insert = conn.Stmt_insert(tbl_name, flds);
stmt_insert.Clear().Val_int(fld_src_id, src_id).Val_int(fld_trg_ns, trg_ns).Val_bry_as_str(fld_trg_ttl, trg_ttl).Exec_insert();
}
public void Rls() {
stmt_insert = Db_stmt_.Rls(stmt_insert);
}
}