diff --git a/400_xowa/src/gplx/xowa/addons/Xoax_addon_mgr.java b/400_xowa/src/gplx/xowa/addons/Xoax_addon_mgr.java index cc3f0632e..5186a1ae3 100644 --- a/400_xowa/src/gplx/xowa/addons/Xoax_addon_mgr.java +++ b/400_xowa/src/gplx/xowa/addons/Xoax_addon_mgr.java @@ -47,6 +47,7 @@ public class Xoax_addon_mgr { , new gplx.xowa.addons.wikis.pages.randoms .Rndm_addon() , new gplx.xowa.addons.bldrs.hdumps.diffs .Dumpdiff_addon() , new gplx.xowa.addons.wikis.ctgs.bldrs .Xoax_ctg_bldr_addon() + , new gplx.xowa.xtns.wbases.imports .Xowb_bldr_addon() // specials , new gplx.xowa.addons.wikis.registrys .Wiki_registry_addon() diff --git a/400_xowa/src/gplx/xowa/bldrs/Xob_cmd_keys.java b/400_xowa/src/gplx/xowa/bldrs/Xob_cmd_keys.java index bb1101587..f1d07bc3a 100644 --- a/400_xowa/src/gplx/xowa/bldrs/Xob_cmd_keys.java +++ b/400_xowa/src/gplx/xowa/bldrs/Xob_cmd_keys.java @@ -30,7 +30,6 @@ public class Xob_cmd_keys { , Key_util_xml_dump = "util.xml_dump" , Key_util_random = "util.random" , Key_util_delete = "util.delete" - , Key_wbase_json_dump = "wbase.json_dump" , Key_wbase_qid = "wbase.qid" // "text.wdata.qid" , Key_wbase_pid = "wbase.pid" // "text.wdata.pid" , Key_wbase_db = "wbase.db" // "wiki.wdata_db" diff --git a/400_xowa/src/gplx/xowa/bldrs/Xob_cmd_mgr.java b/400_xowa/src/gplx/xowa/bldrs/Xob_cmd_mgr.java index 9c8d636aa..f3daed932 100644 --- a/400_xowa/src/gplx/xowa/bldrs/Xob_cmd_mgr.java +++ b/400_xowa/src/gplx/xowa/bldrs/Xob_cmd_mgr.java @@ -50,7 +50,6 @@ public class Xob_cmd_mgr implements Gfo_invk { else if (String_.Eq(cmd_key, Xob_cmd_keys.Key_util_delete)) return Add(new Xob_delete_cmd(bldr, wiki)); else if (String_.Eq(cmd_key, Xob_cmd_keys.Key_util_download)) return Add(new Xob_download_cmd(bldr, wiki)); else if (String_.Eq(cmd_key, Xob_cmd_keys.Key_util_xml_dump)) return Add(new Xob_xml_dumper_cmd(bldr, wiki)); - else if (String_.Eq(cmd_key, Xob_cmd_keys.Key_wbase_json_dump)) return Add(new Xob_wbase_json_dump_cmd(bldr, wiki)); else if (String_.Eq(cmd_key, 
Xob_cmd_keys.Key_wbase_qid)) return Xml_rdr_direct_add(wiki, new Xob_wdata_qid_sql().Ctor(bldr, wiki)); else if (String_.Eq(cmd_key, Xob_cmd_keys.Key_wbase_pid)) return Xml_rdr_direct_add(wiki, new Xob_wdata_pid_sql().Ctor(bldr, wiki)); else if (String_.Eq(cmd_key, Xob_cmd_keys.Key_wbase_db)) return Add(new Xob_wdata_db_cmd(bldr, wiki)); diff --git a/400_xowa/src/gplx/xowa/xtns/wbases/imports/Xob_wbase_json_dump_parser.java b/400_xowa/src/gplx/xowa/xtns/wbases/imports/Xob_wbase_json_dump_parser.java deleted file mode 100644 index 84daa37af..000000000 --- a/400_xowa/src/gplx/xowa/xtns/wbases/imports/Xob_wbase_json_dump_parser.java +++ /dev/null @@ -1,90 +0,0 @@ -/* -XOWA: the XOWA Offline Wiki Application -Copyright (C) 2012 gnosygnu@gmail.com - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as -published by the Free Software Foundation, either version 3 of the -License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see . 
-*/ -package gplx.xowa.xtns.wbases.imports; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.wbases.*; -import gplx.core.ios.*; import gplx.core.ios.streams.*; -import gplx.xowa.bldrs.*; -import gplx.xowa.wikis.data.tbls.*; -class Xob_wbase_json_dump_parser { - private final Gfo_usr_dlg usr_dlg; private final Xoae_app app; private final Xob_bldr bldr; private final Xowe_wiki wiki; - private final Xob_wbase_json_dump_db dump_db; - private final Io_stream_unzip_mgr unzip_mgr; - public Xob_wbase_json_dump_parser(Xob_bldr bldr, Xowe_wiki wiki) { - this.bldr = bldr; this.wiki = wiki; - this.app = bldr.App(); this.usr_dlg = app.Usr_dlg(); - this.dump_db = new Xob_wbase_json_dump_db(bldr, wiki); - this.unzip_mgr = new Io_stream_unzip_mgr(app.Setup_mgr().Dump_mgr().Import_bz2_by_stdout(), app.Prog_mgr().App_decompress_bz2_by_stdout(), String_.Ary(".bz2", ".gz", ".zip")); - } - public void Parse(Io_url src_fil) { - byte[] json_bgn = Bry_.new_a7("[\n"), id_bgn = Bry_.new_a7("{\"id\":"); - String prog_fmt = "reading ~{0} MB: ~{1} ~{2}"; - Io_stream_rdr stream_rdr = Io_stream_rdr_mgr.Get_rdr_or_null(src_fil, wiki.Fsys_mgr().Root_dir(), unzip_mgr, "*wikidata-*-all.json", "*wikidata-*-all.json.gz"); - if (stream_rdr == null) {usr_dlg.Warn_many("", "", "wbase.import:file not found: src_dir=~{0}", wiki.Fsys_mgr().Root_dir()); return;} - Io_buffer_rdr buffer_rdr = Io_buffer_rdr.new_(stream_rdr, 10 * Io_mgr.Len_mb); long buffer_rdr_len = buffer_rdr.Fil_len(); - try { - Io_url stream_rdr_url = stream_rdr.Url(); - int page_bgn = Bry_find_.Find_fwd(buffer_rdr.Bfr(), id_bgn); - if (page_bgn == Bry_find_.Not_found) {usr_dlg.Warn_many("", "", "wbase.import:initial id not found: url=~{0}", stream_rdr_url.Raw()); return;} - if (!Bry_.Match(buffer_rdr.Bfr(), 0, page_bgn, json_bgn)) {usr_dlg.Warn_many("", "", "wbase.import:doc_bgn is not '[\n': url=~{0}", stream_rdr_url.Raw()); return;} - Xowd_page_itm page = new Xowd_page_itm(); - 
dump_db.Parse_bgn(stream_rdr.Len(), stream_rdr.Url().NameAndExt()); - while (true) { - int cur_pos = Extract_page(page, buffer_rdr, page_bgn); - if (cur_pos == -1) break; - if (cur_pos < page_bgn) - bldr.Print_prog_msg(buffer_rdr.Fil_pos(), buffer_rdr_len, 1, prog_fmt, Int_.To_str_pad_bgn_zero((int)(buffer_rdr.Fil_pos() / Io_mgr.Len_mb), Int_.DigitCount((int)(buffer_rdr.Fil_len() / Io_mgr.Len_mb))), "", page.Ttl_page_db()); - page_bgn = cur_pos; - } - dump_db.Parse_end(); - } - catch (Exception e) { - String msg = usr_dlg.Warn_many("", "", "dump_rdr:error while reading; url=~{0} err=~{1}", src_fil.Raw(), Err_.Message_lang(e)); - throw Err_.new_wo_type(msg); - } - finally {buffer_rdr.Rls();} - } - private int Extract_page(Xowd_page_itm page, Io_buffer_rdr rdr, int page_bgn) { - int pos = page_bgn; - byte[] bry = rdr.Bfr(); - int bry_len = rdr.Bfr_len(); - while (true) { - if (pos == bry_len) { - rdr.Bfr_load_from(page_bgn); // refill src from pos; - bry_len = rdr.Bfr_len(); - pos -= page_bgn; - page_bgn = 0; - } - byte b = Byte_.Zero; - boolean exit = false; - if (pos < bry_len) - b = bry[pos]; - else { - b = Byte_ascii.Nl; - pos = bry_len; - exit = true; - } - if (b == Byte_ascii.Nl) { - byte[] json_bry = Bry_.Mid(bry, page_bgn, pos); - if (json_bry.length == 1 && json_bry[0] == Byte_ascii.Brack_end) return -1; - if (exit) return -1; - dump_db.Parse_cmd(json_bry); - return pos + 1; - } - ++pos; - } - } -} diff --git a/400_xowa/src/gplx/xowa/xtns/wbases/imports/Xowb_bldr_addon.java b/400_xowa/src/gplx/xowa/xtns/wbases/imports/Xowb_bldr_addon.java new file mode 100644 index 000000000..7ac33f3f3 --- /dev/null +++ b/400_xowa/src/gplx/xowa/xtns/wbases/imports/Xowb_bldr_addon.java @@ -0,0 +1,29 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012 gnosygnu@gmail.com + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as +published by the Free Software Foundation, either version 
3 of the +License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ +package gplx.xowa.xtns.wbases.imports; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.wbases.*; +import gplx.xowa.addons.*; +import gplx.xowa.bldrs.wkrs.*; +public class Xowb_bldr_addon implements Xoax_addon_itm, Xoax_addon_itm__bldr { + public Xob_cmd[] Bldr_cmds() { + return new Xob_cmd[] + { gplx.xowa.xtns.wbases.imports.json.Xowb_json_dump_cmd.Prototype + }; + } + + public String Addon__key() {return "xowa.builds.wikibase";} +} diff --git a/400_xowa/src/gplx/xowa/xtns/wbases/imports/Io_stream_rdr_mgr.java b/400_xowa/src/gplx/xowa/xtns/wbases/imports/json/Io_stream_rdr_mgr.java similarity index 93% rename from 400_xowa/src/gplx/xowa/xtns/wbases/imports/Io_stream_rdr_mgr.java rename to 400_xowa/src/gplx/xowa/xtns/wbases/imports/json/Io_stream_rdr_mgr.java index f33f0b301..e410275fa 100644 --- a/400_xowa/src/gplx/xowa/xtns/wbases/imports/Io_stream_rdr_mgr.java +++ b/400_xowa/src/gplx/xowa/xtns/wbases/imports/json/Io_stream_rdr_mgr.java @@ -15,7 +15,7 @@ GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . 
*/ -package gplx.xowa.xtns.wbases.imports; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.wbases.*; +package gplx.xowa.xtns.wbases.imports.json; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.wbases.*; import gplx.xowa.xtns.wbases.imports.*; import gplx.core.ios.*; import gplx.core.ios.streams.*; import gplx.core.criterias.*; import gplx.core.envs.*; class Io_stream_rdr_mgr { public static Io_stream_rdr Get_rdr_or_null(Io_url src_fil, Io_url src_dir, Io_stream_unzip_mgr unzip_mgr, String... filter_ary) { diff --git a/400_xowa/src/gplx/xowa/xtns/wbases/imports/Xob_wbase_json_dump_cmd.java b/400_xowa/src/gplx/xowa/xtns/wbases/imports/json/Xowb_json_dump_cmd.java similarity index 51% rename from 400_xowa/src/gplx/xowa/xtns/wbases/imports/Xob_wbase_json_dump_cmd.java rename to 400_xowa/src/gplx/xowa/xtns/wbases/imports/json/Xowb_json_dump_cmd.java index fce46d25c..5d3a854d4 100644 --- a/400_xowa/src/gplx/xowa/xtns/wbases/imports/Xob_wbase_json_dump_cmd.java +++ b/400_xowa/src/gplx/xowa/xtns/wbases/imports/json/Xowb_json_dump_cmd.java @@ -15,24 +15,26 @@ GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . 
*/ -package gplx.xowa.xtns.wbases.imports; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.wbases.*; +package gplx.xowa.xtns.wbases.imports.json; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.wbases.*; import gplx.xowa.xtns.wbases.imports.*; import gplx.xowa.bldrs.*; import gplx.xowa.bldrs.wkrs.*; -public class Xob_wbase_json_dump_cmd implements Xob_cmd { - private final Xob_wbase_json_dump_parser json_dump_parser; +public class Xowb_json_dump_cmd extends Xob_cmd__base { + private final Xowb_json_dump_parser json_dump_parser; private Io_url src_fil; - public Xob_wbase_json_dump_cmd(Xob_bldr bldr, Xowe_wiki wiki) { - this.json_dump_parser = new Xob_wbase_json_dump_parser(bldr, wiki); + public Xowb_json_dump_cmd(Xob_bldr bldr, Xowe_wiki wiki) {super(bldr, wiki); + this.json_dump_parser = new Xowb_json_dump_parser(bldr, wiki); } - public String Cmd_key() {return Xob_cmd_keys.Key_wbase_json_dump;} - public Xob_cmd Cmd_clone(Xob_bldr bldr, Xowe_wiki wiki) {return null;} - public void Cmd_run() {json_dump_parser.Parse(src_fil);} - public void Cmd_init(Xob_bldr bldr) {} - public void Cmd_bgn(Xob_bldr bldr) {} - public void Cmd_end() {} - public void Cmd_term() {} - public Object Invk(GfsCtx ctx, int ikey, String k, GfoMsg m) { + @Override public void Cmd_run() { + json_dump_parser.Parse(src_fil); + } + + @Override public Object Invk(GfsCtx ctx, int ikey, String k, GfoMsg m) { if (ctx.Match(k, Invk_src_fil_)) this.src_fil = m.ReadIoUrl("v"); else return Gfo_invk_.Rv_unhandled; return this; } private static final String Invk_src_fil_ = "src_fil_"; + + public static final String BLDR_CMD_KEY = "wbase.json_dump"; + @Override public String Cmd_key() {return BLDR_CMD_KEY;} + public static final Xob_cmd Prototype = new Xowb_json_dump_cmd(null, null); + @Override public Xob_cmd Cmd_clone(Xob_bldr bldr, Xowe_wiki wiki) {return new Xowb_json_dump_cmd(bldr, wiki);} } diff --git 
a/400_xowa/src/gplx/xowa/xtns/wbases/imports/Xob_wbase_json_dump_db.java b/400_xowa/src/gplx/xowa/xtns/wbases/imports/json/Xowb_json_dump_db.java similarity index 68% rename from 400_xowa/src/gplx/xowa/xtns/wbases/imports/Xob_wbase_json_dump_db.java rename to 400_xowa/src/gplx/xowa/xtns/wbases/imports/json/Xowb_json_dump_db.java index 5cbcce7bd..bdce66c3c 100644 --- a/400_xowa/src/gplx/xowa/xtns/wbases/imports/Xob_wbase_json_dump_db.java +++ b/400_xowa/src/gplx/xowa/xtns/wbases/imports/json/Xowb_json_dump_db.java @@ -15,61 +15,69 @@ GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ -package gplx.xowa.xtns.wbases.imports; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.wbases.*; +package gplx.xowa.xtns.wbases.imports.json; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.wbases.*; import gplx.xowa.xtns.wbases.imports.*; import gplx.core.ios.*; import gplx.langs.jsons.*; -import gplx.xowa.bldrs.*; import gplx.xowa.bldrs.cmds.*; import gplx.xowa.bldrs.cmds.texts.sqls.*; -import gplx.xowa.wikis.nss.*; -import gplx.xowa.wikis.*; import gplx.xowa.wikis.data.*; import gplx.xowa.wikis.data.tbls.*; -import gplx.xowa.apps.apis.xowa.bldrs.imports.*; -import gplx.xowa.xtns.wbases.core.*; import gplx.xowa.xtns.wbases.parsers.*; -class Xob_wbase_json_dump_db { - private final Gfo_usr_dlg usr_dlg; private final Xoae_app app; private final Xowe_wiki wiki; private final Xob_bldr bldr; +import gplx.xowa.bldrs.*; import gplx.xowa.bldrs.cmds.*; import gplx.xowa.bldrs.cmds.texts.sqls.*; import gplx.xowa.apps.apis.xowa.bldrs.imports.*; +import gplx.xowa.wikis.*; import gplx.xowa.wikis.nss.*; import gplx.xowa.wikis.data.*; import gplx.xowa.wikis.data.tbls.*; +import gplx.xowa.xtns.wbases.core.*; import gplx.xowa.xtns.wbases.parsers.*; +class Xowb_json_dump_db { + private final Xoae_app app; private final 
Gfo_usr_dlg usr_dlg; private final Xowe_wiki wiki; private final Xob_bldr bldr; private final Json_parser json_parser; private final Xob_wdata_pid_sql pid_cmd = new Xob_wdata_pid_sql(); private final Xob_wdata_qid_sql qid_cmd = new Xob_wdata_qid_sql(); - private Xowd_page_tbl page_tbl; - private Xob_ns_to_db_mgr ns_to_db_mgr; - private DateAdp page_modified_on; - private Xow_db_mgr db_mgr; - private Xowd_page_tbl page_core_tbl; + private Xow_ns_mgr ns_mgr; private Xow_db_mgr db_mgr; + private Xowd_page_tbl page_tbl; private Xob_ns_to_db_mgr ns_to_db_mgr; private Io_stream_zip_mgr text_zip_mgr; private byte text_zip_tid; - private Xow_ns_mgr ns_mgr; - public Xob_wbase_json_dump_db(Xob_bldr bldr, Xowe_wiki wiki) { + private DateAdp page_modified_on; + private int page_id = 0, page_count_main = 0; + public Xowb_json_dump_db(Xob_bldr bldr, Xowe_wiki wiki) { this.app = bldr.App(); this.usr_dlg = app.Usr_dlg(); this.wiki = wiki; this.bldr = bldr; this.json_parser = bldr.App().Wiki_mgr().Wdata_mgr().Jdoc_parser(); this.ns_mgr = wiki.Ns_mgr(); } - public void Parse_bgn(long src_fil_len, String src_fil_name) { + public void Parse_all_bgn(long src_fil_len, String src_fil_name) { + // load wiki Xowe_wiki_.Create(wiki, src_fil_len, src_fil_name); this.db_mgr = wiki.Data__core_mgr(); this.page_tbl = db_mgr.Tbl__page(); pid_cmd.Cmd_ctor(bldr, wiki); qid_cmd.Cmd_ctor(bldr, wiki); + + // create ns_mgr wiki.Ns_mgr().Add_defaults(); wiki.Ns_mgr().Add_new(Wdata_wiki_mgr.Ns_property, Wdata_wiki_mgr.Ns_property_name); wiki.Ns_mgr().Init(); + + // init ns_map Xoapi_import import_cfg = app.Api_root().Bldr().Wiki().Import(); this.ns_to_db_mgr = new Xob_ns_to_db_mgr(new Xob_ns_to_db_wkr__text(), db_mgr, import_cfg.Text_db_max()); - this.text_zip_mgr = wiki.Utl__zip_mgr(); text_zip_tid = import_cfg.Zip_tid_text(); byte[] ns_file_map = import_cfg.New_ns_file_map(src_fil_len); Xob_ns_file_itm.Init_ns_bldr_data(Xow_db_file_.Tid__text, wiki.Ns_mgr(), ns_file_map); + + // start import + 
this.text_zip_mgr = wiki.Utl__zip_mgr(); this.text_zip_tid = import_cfg.Zip_tid_text(); this.page_modified_on = Datetime_now.Get(); - this.page_core_tbl = db_mgr.Tbl__page(); page_tbl.Insert_bgn(); qid_cmd.Page_wkr__bgn(); pid_cmd.Pid_bgn(); } - private int page_id = 0, page_count_main = 0; - public void Parse_cmd(byte[] json_bry) { + public void Parse_doc(byte[] json_bry) { + // parse to jdoc Json_doc jdoc = json_parser.Parse(json_bry); if (jdoc == null) {usr_dlg.Warn_many("", "", "wbase.json_dump:json is invalid: json=~{0}", json_bry); return;} - byte[] id = jdoc.Get_val_as_bry_or(id_key, null); + + // extract xid + byte[] id = jdoc.Get_val_as_bry_or(Bry__id_key, null); if (id == null) {usr_dlg.Warn_many("", "", "wbase.json_dump:id is invalid: json=~{0}", json_bry); return;} boolean jdoc_is_qid = Bry_.Has_at_bgn(id, Byte_ascii.Ltr_Q, 0); Xow_ns ns = jdoc_is_qid ? ns_mgr.Ns_main() : ns_mgr.Ids_get_or_null(Wdata_wiki_mgr.Ns_property); + + // create page entry int random_int = ns.Count() + 1; ns.Count_(random_int); byte[] json_zip = text_zip_mgr.Zip(text_zip_tid, json_bry); Xow_db_file text_db = ns_to_db_mgr.Get_by_ns(ns.Bldr_data(), json_zip.length); - db_mgr.Create_page(page_core_tbl, text_db.Tbl__text(), ++page_id, ns.Id(), id, Bool_.N, page_modified_on, json_zip, json_bry.length, random_int, text_db.Id(), -1); + db_mgr.Create_page(page_tbl, text_db.Tbl__text(), ++page_id, ns.Id(), id, Bool_.N, page_modified_on, json_zip, json_bry.length, random_int, text_db.Id(), -1); + + // insert text if (jdoc_is_qid) { qid_cmd.Parse_jdoc(jdoc); ++page_count_main; @@ -77,16 +85,18 @@ class Xob_wbase_json_dump_db { else pid_cmd.Parse_jdoc(jdoc); } - public void Parse_end() { + public void Parse_all_end() { page_tbl.Insert_end(); page_tbl.Create_idx(); qid_cmd.Qid_end(); pid_cmd.Pid_end(); ns_to_db_mgr.Rls_all(); + + // cleanup core Xow_db_file db_core = db_mgr.Db__core(); db_core.Tbl__site_stats().Update(page_count_main, page_id, ns_mgr.Ns_file().Count()); // save page stats - 
db_core.Tbl__ns().Insert(ns_mgr); // save ns + db_core.Tbl__ns().Insert(ns_mgr); // save ns db_mgr.Tbl__cfg().Insert_str(Xow_cfg_consts.Grp__wiki_init, Xow_cfg_consts.Key__init__modified_latest, page_modified_on.XtoStr_fmt(DateAdp_.Fmt_iso8561_date_time)); } - private static final byte[] id_key = Bry_.new_a7("id"); + private static final byte[] Bry__id_key = Bry_.new_a7("id"); } diff --git a/400_xowa/src/gplx/xowa/xtns/wbases/imports/json/Xowb_json_dump_parser.java b/400_xowa/src/gplx/xowa/xtns/wbases/imports/json/Xowb_json_dump_parser.java new file mode 100644 index 000000000..9c7ea0af9 --- /dev/null +++ b/400_xowa/src/gplx/xowa/xtns/wbases/imports/json/Xowb_json_dump_parser.java @@ -0,0 +1,87 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012 gnosygnu@gmail.com + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as +published by the Free Software Foundation, either version 3 of the +License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . 
+*/
+package gplx.xowa.xtns.wbases.imports.json; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.wbases.*; import gplx.xowa.xtns.wbases.imports.*;
+import gplx.core.ios.*; import gplx.core.ios.streams.*;
+import gplx.xowa.bldrs.*;
+import gplx.xowa.wikis.data.tbls.*;
+/**
+ * Reads a wikidata json dump through a 10 MB buffer and passes each newline-terminated
+ * line to {@link Xowb_json_dump_db#Parse_doc}.
+ *
+ * The file must start with "[\n{"; a line consisting only of "]" ends the import.
+ */
+class Xowb_json_dump_parser {
+  // required prefix of the dump: array open, newline, then the first doc's opening brace
+  private static final byte[] Bry__doc_bgn = Bry_.new_a7("[\n{");
+  private final Xob_bldr bldr; private final Xowe_wiki wiki;
+  public Xowb_json_dump_parser(Xob_bldr bldr, Xowe_wiki wiki) {
+    this.bldr = bldr; this.wiki = wiki;
+  }
+  /**
+   * Imports every json doc found in the dump.
+   *
+   * Logs a warning and returns without importing when no dump file is found or when the
+   * file does not start with "[\n{". Any exception raised while reading is logged and
+   * rethrown via Err_.new_wo_type; the buffer is released in all cases.
+   *
+   * @param json_dump_file url passed to Io_stream_rdr_mgr.Get_rdr_or_null together with the wiki's root dir
+   */
+  public void Parse(Io_url json_dump_file) {
+    // init
+    Xoae_app app = bldr.App(); Gfo_usr_dlg usr_dlg = app.Usr_dlg();
+    Xowb_json_dump_db dump_db = new Xowb_json_dump_db(bldr, wiki);
+    Io_stream_unzip_mgr unzip_mgr = new Io_stream_unzip_mgr(app.Setup_mgr().Dump_mgr().Import_bz2_by_stdout(), app.Prog_mgr().App_decompress_bz2_by_stdout(), String_.Ary(".bz2", ".gz", ".zip"));
+
+    // open buffer from file
+    Io_stream_rdr stream = Io_stream_rdr_mgr.Get_rdr_or_null(json_dump_file, wiki.Fsys_mgr().Root_dir(), unzip_mgr, "*wikidata-*-all.json", "*wikidata-*-all.json.gz");
+    if (stream == null) {usr_dlg.Warn_many("", "", "wbase.import:file not found: src_dir=~{0}", wiki.Fsys_mgr().Root_dir()); return;}
+    Io_buffer_rdr buffer = Io_buffer_rdr.new_(stream, 10 * Io_mgr.Len_mb);
+
+    try {
+      // validate file; if schema ever changes this will fail
+      if (!Bry_.Match(buffer.Bfr(), 0, Bry__doc_bgn.length, Bry__doc_bgn)) {usr_dlg.Warn_many("", "", "wbase.import:doc_bgn is not '[\n': url=~{0}", stream.Url().Raw()); return;}
+      int page_bgn = Bry__doc_bgn.length - 1; // skip "[\n"; first doc starts at its "{"
+
+      // read file and create pages for each json item
+      dump_db.Parse_all_bgn(stream.Len(), stream.Url().NameAndExt());
+      // NOTE(review): page is never populated in this class; it only feeds the last arg of the progress message -- confirm it is still wanted
+      Xowd_page_itm page = new Xowd_page_itm();
+      while (true) {
+        int cur_pos = Parse_doc(dump_db, buffer, page, page_bgn);
+        if (cur_pos == -1) break;
+        if (cur_pos < page_bgn) // position moved backwards, so buffer was refilled; report progress once per refill
+          bldr.Print_prog_msg(buffer.Fil_pos(), buffer.Fil_len(), 1, "reading ~{0} MB: ~{1} ~{2}", Int_.To_str_pad_bgn_zero((int)(buffer.Fil_pos() / Io_mgr.Len_mb), Int_.DigitCount((int)(buffer.Fil_len() / Io_mgr.Len_mb))), "", page.Ttl_page_db());
+        page_bgn = cur_pos;
+      }
+      dump_db.Parse_all_end();
+    }
+    catch (Exception e) {
+      String msg = usr_dlg.Warn_many("", "", "dump_rdr:error while reading; url=~{0} err=~{1}", json_dump_file.Raw(), Err_.Message_lang(e));
+      throw Err_.new_wo_type(msg);
+    }
+    finally {buffer.Rls();}
+  }
+  /**
+   * Scans forward from page_bgn to the next newline and passes that line to dump_db.
+   *
+   * @param dump_db  receives the bytes of the line (newline excluded)
+   * @param rdr      buffered reader over the dump; refilled from page_bgn when the scan reaches the end of the loaded bytes
+   * @param page     not used by this method
+   * @param page_bgn offset in the buffer where the current line starts
+   * @return offset of the byte after the newline (relative to the buffer as it is on return),
+   *         or -1 when the closing "]" line is found or no newline remains in the input
+   */
+  private int Parse_doc(Xowb_json_dump_db dump_db, Io_buffer_rdr rdr, Xowd_page_itm page, int page_bgn) {
+    // init
+    int pos = page_bgn;
+    byte[] bry = rdr.Bfr();
+    int bry_len = rdr.Bfr_len();
+
+    while (true) { // loop 1 byte at a time until nl
+      if (pos == bry_len) { // refill if at end of loaded bytes
+        rdr.Bfr_load_from(page_bgn);
+        bry = rdr.Bfr(); // re-read in case the reader swapped its backing array; harmless if it did not
+        bry_len = rdr.Bfr_len();
+        pos -= page_bgn;
+        page_bgn = 0;
+
+        // refill produced nothing past the current position: input ended without "]\n"
+        // (or one line exceeds the buffer); stop instead of indexing past the valid bytes
+        if (pos >= bry_len) return -1;
+      }
+
+      // read byte; parse if nl; otherwise move to next byte
+      byte b = bry[pos];
+      if (b == Byte_ascii.Nl) {
+        if (pos - page_bgn == 1 && bry[page_bgn] == Byte_ascii.Brack_end) // EOF; note that json dump ends with "]\n"
+          return -1;
+        dump_db.Parse_doc(Bry_.Mid(bry, page_bgn, pos));
+        return pos + 1;
+      }
+      else
+        ++pos;
+    }
+  }
+}