mirror of https://github.com/gnosygnu/xowa
parent
83cf992f48
commit
b1ad1411e4
@ -1,90 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.xtns.wbases.imports; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.wbases.*;
|
||||
import gplx.core.ios.*; import gplx.core.ios.streams.*;
|
||||
import gplx.xowa.bldrs.*;
|
||||
import gplx.xowa.wikis.data.tbls.*;
|
||||
class Xob_wbase_json_dump_parser {
|
||||
private final Gfo_usr_dlg usr_dlg; private final Xoae_app app; private final Xob_bldr bldr; private final Xowe_wiki wiki;
|
||||
private final Xob_wbase_json_dump_db dump_db;
|
||||
private final Io_stream_unzip_mgr unzip_mgr;
|
||||
public Xob_wbase_json_dump_parser(Xob_bldr bldr, Xowe_wiki wiki) {
|
||||
this.bldr = bldr; this.wiki = wiki;
|
||||
this.app = bldr.App(); this.usr_dlg = app.Usr_dlg();
|
||||
this.dump_db = new Xob_wbase_json_dump_db(bldr, wiki);
|
||||
this.unzip_mgr = new Io_stream_unzip_mgr(app.Setup_mgr().Dump_mgr().Import_bz2_by_stdout(), app.Prog_mgr().App_decompress_bz2_by_stdout(), String_.Ary(".bz2", ".gz", ".zip"));
|
||||
}
|
||||
public void Parse(Io_url src_fil) {
|
||||
byte[] json_bgn = Bry_.new_a7("[\n"), id_bgn = Bry_.new_a7("{\"id\":");
|
||||
String prog_fmt = "reading ~{0} MB: ~{1} ~{2}";
|
||||
Io_stream_rdr stream_rdr = Io_stream_rdr_mgr.Get_rdr_or_null(src_fil, wiki.Fsys_mgr().Root_dir(), unzip_mgr, "*wikidata-*-all.json", "*wikidata-*-all.json.gz");
|
||||
if (stream_rdr == null) {usr_dlg.Warn_many("", "", "wbase.import:file not found: src_dir=~{0}", wiki.Fsys_mgr().Root_dir()); return;}
|
||||
Io_buffer_rdr buffer_rdr = Io_buffer_rdr.new_(stream_rdr, 10 * Io_mgr.Len_mb); long buffer_rdr_len = buffer_rdr.Fil_len();
|
||||
try {
|
||||
Io_url stream_rdr_url = stream_rdr.Url();
|
||||
int page_bgn = Bry_find_.Find_fwd(buffer_rdr.Bfr(), id_bgn);
|
||||
if (page_bgn == Bry_find_.Not_found) {usr_dlg.Warn_many("", "", "wbase.import:initial id not found: url=~{0}", stream_rdr_url.Raw()); return;}
|
||||
if (!Bry_.Match(buffer_rdr.Bfr(), 0, page_bgn, json_bgn)) {usr_dlg.Warn_many("", "", "wbase.import:doc_bgn is not '[\n': url=~{0}", stream_rdr_url.Raw()); return;}
|
||||
Xowd_page_itm page = new Xowd_page_itm();
|
||||
dump_db.Parse_bgn(stream_rdr.Len(), stream_rdr.Url().NameAndExt());
|
||||
while (true) {
|
||||
int cur_pos = Extract_page(page, buffer_rdr, page_bgn);
|
||||
if (cur_pos == -1) break;
|
||||
if (cur_pos < page_bgn)
|
||||
bldr.Print_prog_msg(buffer_rdr.Fil_pos(), buffer_rdr_len, 1, prog_fmt, Int_.To_str_pad_bgn_zero((int)(buffer_rdr.Fil_pos() / Io_mgr.Len_mb), Int_.DigitCount((int)(buffer_rdr.Fil_len() / Io_mgr.Len_mb))), "", page.Ttl_page_db());
|
||||
page_bgn = cur_pos;
|
||||
}
|
||||
dump_db.Parse_end();
|
||||
}
|
||||
catch (Exception e) {
|
||||
String msg = usr_dlg.Warn_many("", "", "dump_rdr:error while reading; url=~{0} err=~{1}", src_fil.Raw(), Err_.Message_lang(e));
|
||||
throw Err_.new_wo_type(msg);
|
||||
}
|
||||
finally {buffer_rdr.Rls();}
|
||||
}
|
||||
private int Extract_page(Xowd_page_itm page, Io_buffer_rdr rdr, int page_bgn) {
|
||||
int pos = page_bgn;
|
||||
byte[] bry = rdr.Bfr();
|
||||
int bry_len = rdr.Bfr_len();
|
||||
while (true) {
|
||||
if (pos == bry_len) {
|
||||
rdr.Bfr_load_from(page_bgn); // refill src from pos;
|
||||
bry_len = rdr.Bfr_len();
|
||||
pos -= page_bgn;
|
||||
page_bgn = 0;
|
||||
}
|
||||
byte b = Byte_.Zero;
|
||||
boolean exit = false;
|
||||
if (pos < bry_len)
|
||||
b = bry[pos];
|
||||
else {
|
||||
b = Byte_ascii.Nl;
|
||||
pos = bry_len;
|
||||
exit = true;
|
||||
}
|
||||
if (b == Byte_ascii.Nl) {
|
||||
byte[] json_bry = Bry_.Mid(bry, page_bgn, pos);
|
||||
if (json_bry.length == 1 && json_bry[0] == Byte_ascii.Brack_end) return -1;
|
||||
if (exit) return -1;
|
||||
dump_db.Parse_cmd(json_bry);
|
||||
return pos + 1;
|
||||
}
|
||||
++pos;
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,29 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.xtns.wbases.imports; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.wbases.*;
|
||||
import gplx.xowa.addons.*;
|
||||
import gplx.xowa.bldrs.wkrs.*;
|
||||
public class Xowb_bldr_addon implements Xoax_addon_itm, Xoax_addon_itm__bldr {
|
||||
public Xob_cmd[] Bldr_cmds() {
|
||||
return new Xob_cmd[]
|
||||
{ gplx.xowa.xtns.wbases.imports.json.Xowb_json_dump_cmd.Prototype
|
||||
};
|
||||
}
|
||||
|
||||
public String Addon__key() {return "xowa.builds.wikibase";}
|
||||
}
|
@ -0,0 +1,87 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.xtns.wbases.imports.json; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.wbases.*; import gplx.xowa.xtns.wbases.imports.*;
|
||||
import gplx.core.ios.*; import gplx.core.ios.streams.*;
|
||||
import gplx.xowa.bldrs.*;
|
||||
import gplx.xowa.wikis.data.tbls.*;
|
||||
class Xowb_json_dump_parser {
|
||||
private final Xob_bldr bldr; private final Xowe_wiki wiki;
|
||||
public Xowb_json_dump_parser(Xob_bldr bldr, Xowe_wiki wiki) {
|
||||
this.bldr = bldr; this.wiki = wiki;
|
||||
}
|
||||
public void Parse(Io_url json_dump_file) {
|
||||
// init
|
||||
Xoae_app app = bldr.App(); Gfo_usr_dlg usr_dlg = app.Usr_dlg();
|
||||
Xowb_json_dump_db dump_db = new Xowb_json_dump_db(bldr, wiki);
|
||||
Io_stream_unzip_mgr unzip_mgr = new Io_stream_unzip_mgr(app.Setup_mgr().Dump_mgr().Import_bz2_by_stdout(), app.Prog_mgr().App_decompress_bz2_by_stdout(), String_.Ary(".bz2", ".gz", ".zip"));
|
||||
|
||||
// open buffer from file
|
||||
Io_stream_rdr stream = Io_stream_rdr_mgr.Get_rdr_or_null(json_dump_file, wiki.Fsys_mgr().Root_dir(), unzip_mgr, "*wikidata-*-all.json", "*wikidata-*-all.json.gz");
|
||||
if (stream == null) {usr_dlg.Warn_many("", "", "wbase.import:file not found: src_dir=~{0}", wiki.Fsys_mgr().Root_dir()); return;}
|
||||
Io_buffer_rdr buffer = Io_buffer_rdr.new_(stream, 10 * Io_mgr.Len_mb);
|
||||
|
||||
try {
|
||||
// set page_bgn
|
||||
if (!Bry_.Match(buffer.Bfr(), 0, 3, Bry_.new_a7("[\n{"))) {usr_dlg.Warn_many("", "", "wbase.import:doc_bgn is not '[\n': url=~{0}", stream.Url().Raw()); return;} // validate file; if schema ever changes this will fail
|
||||
int page_bgn = 2; // 2="[\n"
|
||||
|
||||
// read file and create pages for each json item
|
||||
dump_db.Parse_all_bgn(stream.Len(), stream.Url().NameAndExt());
|
||||
Xowd_page_itm page = new Xowd_page_itm();
|
||||
while (true) {
|
||||
int cur_pos = Parse_doc(dump_db, buffer, page, page_bgn);
|
||||
if (cur_pos == -1) break;
|
||||
if (cur_pos < page_bgn)
|
||||
bldr.Print_prog_msg(buffer.Fil_pos(), buffer.Fil_len(), 1, "reading ~{0} MB: ~{1} ~{2}", Int_.To_str_pad_bgn_zero((int)(buffer.Fil_pos() / Io_mgr.Len_mb), Int_.DigitCount((int)(buffer.Fil_len() / Io_mgr.Len_mb))), "", page.Ttl_page_db());
|
||||
page_bgn = cur_pos;
|
||||
}
|
||||
dump_db.Parse_all_end();
|
||||
}
|
||||
catch (Exception e) {
|
||||
String msg = usr_dlg.Warn_many("", "", "dump_rdr:error while reading; url=~{0} err=~{1}", json_dump_file.Raw(), Err_.Message_lang(e));
|
||||
throw Err_.new_wo_type(msg);
|
||||
}
|
||||
finally {buffer.Rls();}
|
||||
}
|
||||
private int Parse_doc(Xowb_json_dump_db dump_db, Io_buffer_rdr rdr, Xowd_page_itm page, int page_bgn) {
|
||||
// init
|
||||
int pos = page_bgn;
|
||||
byte[] bry = rdr.Bfr();
|
||||
int bry_len = rdr.Bfr_len();
|
||||
|
||||
while (true) {// loop 1 byte at a time until nl
|
||||
if (pos == bry_len) { // refill if at end of 10 MB bfr
|
||||
rdr.Bfr_load_from(page_bgn);
|
||||
bry_len = rdr.Bfr_len();
|
||||
pos -= page_bgn;
|
||||
page_bgn = 0;
|
||||
}
|
||||
|
||||
// read byte; parse if nl; otherwise move to next byte
|
||||
byte b = bry[pos]; // NOTE: should never be out of bounds b/c json doc will end with "]\n"
|
||||
if (b == Byte_ascii.Nl) {
|
||||
if (pos - page_bgn == 1 && bry[page_bgn] == Byte_ascii.Brack_end) // EOF; note that json dump ends with "]\n"
|
||||
return -1;
|
||||
dump_db.Parse_doc(Bry_.Mid(bry, page_bgn, pos));
|
||||
return pos + 1;
|
||||
}
|
||||
else
|
||||
++pos;
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in new issue