From e77916a02e812ccfabc41c38c20116b85535214f Mon Sep 17 00:00:00 2001 From: gnosygnu Date: Sun, 5 Mar 2017 08:40:52 -0500 Subject: [PATCH] Bldr: Add more implementation for missing origs --- .../Xobldr_missing_origs_cmd.java | 6 +- .../Xobldr_missing_origs_item.java | 89 +++++++++++++++++ .../Xobldr_missing_origs_wmfapi.java | 99 ++++++++++++------- 3 files changed, 158 insertions(+), 36 deletions(-) create mode 100644 400_xowa/src/gplx/xowa/addons/bldrs/files/missing_origs/Xobldr_missing_origs_item.java diff --git a/400_xowa/src/gplx/xowa/addons/bldrs/files/missing_origs/Xobldr_missing_origs_cmd.java b/400_xowa/src/gplx/xowa/addons/bldrs/files/missing_origs/Xobldr_missing_origs_cmd.java index 9cf9174a3..6095e1c89 100644 --- a/400_xowa/src/gplx/xowa/addons/bldrs/files/missing_origs/Xobldr_missing_origs_cmd.java +++ b/400_xowa/src/gplx/xowa/addons/bldrs/files/missing_origs/Xobldr_missing_origs_cmd.java @@ -55,9 +55,9 @@ public class Xobldr_missing_origs_cmd extends Xob_cmd__base { } finally {rdr.Rls();} Gfo_usr_dlg_.Instance.Note_many("", "", "bldr.find_missing: invalid=~{0}", invalid_count); - // call api with list - Xof_orig_wkr__wmf_api wkr = new Xof_orig_wkr__wmf_api(new Xoapi_orig_wmf(), wiki.App().Wmf_mgr().Download_wkr(), wiki.File__repo_mgr(), wiki.Domain_bry()); - wkr.Find_by_list(null, null); +// call api with list + Xobldr_missing_origs_wmfapi wmf_api = new Xobldr_missing_origs_wmfapi(wiki.App().Wmf_mgr().Download_wkr()); + wmf_api.Find_by_list(null, Byte_.Zero, null, 0); // loop list and update conn.Txn_bgn("bldr.find_missing"); diff --git a/400_xowa/src/gplx/xowa/addons/bldrs/files/missing_origs/Xobldr_missing_origs_item.java b/400_xowa/src/gplx/xowa/addons/bldrs/files/missing_origs/Xobldr_missing_origs_item.java new file mode 100644 index 000000000..8e22743e7 --- /dev/null +++ b/400_xowa/src/gplx/xowa/addons/bldrs/files/missing_origs/Xobldr_missing_origs_item.java @@ -0,0 +1,89 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012-2017 gnosygnu@gmail.com + +XOWA is licensed under the terms of the General Public License (GPL) Version 3, +or alternatively under the terms of the Apache License Version 2.0. + +You may use XOWA according to either of these licenses as is most appropriate +for your project on a case-by-case basis. + +The terms of each license can be found in the source code repository: + +GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt +Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt +*/ +package gplx.xowa.addons.bldrs.files.missing_origs; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.bldrs.*; import gplx.xowa.addons.bldrs.files.*; +import gplx.xowa.files.*; +class Xobldr_missing_origs_item { + public byte[] Lnki_ttl() {return lnki_ttl;} private byte[] lnki_ttl; + public int Page_id() {return page_id;} private int page_id; + public byte Orig_repo() {return orig_repo;} private byte orig_repo; + public int Orig_page_id() {return orig_page_id;} private int orig_page_id; + public byte[] Orig_ttl() {return orig_ttl;} private byte[] orig_ttl; + public byte[] Orig_timestamp() {return orig_timestamp;} private byte[] orig_timestamp; + public long Orig_size() {return orig_size;} private long orig_size; + public int Orig_w() {return orig_w;} private int orig_w; + public int Orig_h() {return orig_h;} private int orig_h; + public byte[] Orig_minor_mime() {return orig_minor_mime;} private byte[] orig_minor_mime; + public byte[] Orig_media_type() {return orig_media_type;} private byte[] orig_media_type; + public byte[] Orig_redirect_ttl() {return orig_redirect_ttl;} private byte[] orig_redirect_ttl; + public int Lnki_ext() {return lnki_ext;} private int lnki_ext; + public int Orig_redirect_ext() {return orig_redirect_ext;} private int orig_redirect_ext; + + public Xobldr_missing_origs_item Init_by_orig_tbl(byte[] lnki_ttl) { + this.lnki_ttl = lnki_ttl; + return this; + } + public Xobldr_missing_origs_item Init_by_api_page(byte orig_repo, int page_id, byte[] orig_ttl, byte[] orig_timestamp, long orig_size, int orig_w, int orig_h, byte[] orig_minor_mime, byte[] orig_media_type) { + this.page_id = page_id; + this.orig_page_id = page_id; + this.orig_repo = orig_repo; + this.orig_ttl = Normalize_ttl(orig_ttl); + this.orig_timestamp = orig_timestamp; + this.orig_size = orig_size; + this.orig_w = orig_w; + this.orig_h = orig_h; + this.orig_minor_mime = orig_minor_mime; + this.orig_media_type = orig_media_type; + return this; + } + public Xobldr_missing_origs_item Init_by_api_redirect(byte[] from, byte[] to) { + this.lnki_ttl = Normalize_ttl(from); + this.orig_redirect_ttl = Normalize_ttl(to); + return this; + } + private byte[] Normalize_ttl(byte[] v) { + // remove "File:" + if (Bry_.Has_at_bgn(v, Xobldr_missing_origs_wmfapi.FILE_NS_PREFIX)) { + v = Bry_.Mid(v, Xobldr_missing_origs_wmfapi.FILE_NS_PREFIX.length); + } + else { + Gfo_usr_dlg_.Instance.Warn_many("", "", "wmf_api does not start with 'File:'; title=~{0}", v); + } + + // convert spaces to unders + v = Xoa_ttl.Replace_spaces(v); + + return v; + } + public void Copy_api_props(Xobldr_missing_origs_item src) { + // page nde + this.page_id = src.page_id; + this.orig_page_id = src.orig_page_id; + this.orig_ttl = src.orig_ttl; + this.orig_timestamp = src.orig_timestamp; + this.orig_size = src.orig_size; + this.orig_w = src.orig_w; + this.orig_h = src.orig_h; + this.orig_minor_mime = src.orig_minor_mime; + this.orig_media_type = src.orig_media_type; + + // revision nde + this.orig_redirect_ttl = src.orig_redirect_ttl; + + // set ext_ids + this.lnki_ext = Xof_ext_.new_by_ttl_(lnki_ttl).Id(); + this.orig_redirect_ext = Xof_ext_.new_by_ttl_(orig_redirect_ttl).Id(); + } +} diff --git a/400_xowa/src/gplx/xowa/addons/bldrs/files/missing_origs/Xobldr_missing_origs_wmfapi.java b/400_xowa/src/gplx/xowa/addons/bldrs/files/missing_origs/Xobldr_missing_origs_wmfapi.java index 723ba1f9e..993ad3abf 100644 --- a/400_xowa/src/gplx/xowa/addons/bldrs/files/missing_origs/Xobldr_missing_origs_wmfapi.java +++ b/400_xowa/src/gplx/xowa/addons/bldrs/files/missing_origs/Xobldr_missing_origs_wmfapi.java @@ -20,76 +20,109 @@ import gplx.xowa.files.repos.*; import gplx.xowa.files.downloads.*; import gplx.xowa.apps.wms.apis.origs.*; public class Xobldr_missing_origs_wmfapi { -// private final Xoapi_orig_base orig_api; -// private final Xof_download_wkr download_wkr; -// private final Xow_repo_mgr repo_mgr; -// private final byte[] wiki_domain; -// private final Xoapi_orig_rslts api_rv = new Xoapi_orig_rslts(); - public Xobldr_missing_origs_wmfapi(Xoapi_orig_base orig_api, Xof_download_wkr download_wkr, Xow_repo_mgr repo_mgr, byte[] wiki_domain) { -// this.orig_api = orig_api; -// this.download_wkr = download_wkr; -// this.repo_mgr = repo_mgr; -// this.wiki_domain = wiki_domain; + private final Xof_download_wkr download_wkr; + private final Ordered_hash temp_hash = Ordered_hash_.New(); + public static final byte[] FILE_NS_PREFIX = Bry_.new_a7("File:"); + public Xobldr_missing_origs_wmfapi(Xof_download_wkr download_wkr) { + this.download_wkr = download_wkr; } - public void Find_by_list(Ordered_hash src, Ordered_hash trg, String api_domain, int idx) { + public void Find_by_list(Ordered_hash src, byte repo_id, String api_domain, int idx) { // fail if web access disabled if (!gplx.core.ios.IoEngine_system.Web_access_enabled) { throw Err_.new_wo_type("web access must be enabled for missing_origs cmd"); } -// Json_parser parser = new Json_parser(); + Json_parser parser = new Json_parser(); Gfo_url_encoder encoder = Gfo_url_encoder_.New__http_url().Make(); Bry_bfr bfr = Bry_bfr_.New(); int len = src.Len(); try { // loop until all titles found while (idx < len) { - // generate super api; EX: https://commons.wikimedia.org/w/api.php?action=query&format=xml&prop=imageinfo&iiprop=size|url|mediatype|mime|bitdepth|timestamp|size|sha1&redirects&iilimit=500&titles= + // generate api: EX: https://commons.wikimedia.org/w/api.php?action=query&format=json&formatversion=2&prop=imageinfo&iiprop=timestamp|size|mediatype|mime&redirects&iilimit=500&titles=File:Different%20Faces%20Neptune.jpg|File:East.svg + // generate everything up to titles bfr.Add_str_a7("https://"); bfr.Add_str_a7(api_domain); bfr.Add_str_a7("/w/api.php?action=query"); bfr.Add_str_a7("&format=json"); // json easier to use than xml bfr.Add_str_a7("&iilimit=1"); // limit to 1 revision history (default will return more); EX:File:Different_Faces_Neptune.jpg bfr.Add_str_a7("&redirects"); // show redirects - bfr.Add_str_a7("&prop=imageinfo&iiprop=size|url|mediatype|mime|bitdepth|timestamp|size|sha1"); // list of props + bfr.Add_str_a7("&prop=imageinfo&iiprop=timestamp|size|mediatype|mime"); // list of props; NOTE: "url" / "sha1" for future; "bitdepth" always 0? bfr.Add_str_a7("&titles="); // add titles; EX: File:A.png|File:B.png| for (int i = idx; i < idx + 500; i++) { Xobldr_missing_origs_item item = (Xobldr_missing_origs_item)src.Get_at(i); - Xoa_ttl ttl = item.Lnki_ttl(); // skip "|" if first if (i != idx) bfr.Add_byte_pipe(); - // make ttl_bry so (a) namespace is present (EX:File:); (b) spaces are present (not underscores) - byte[] ttl_bry = ttl.Full_txt_wo_qarg(); - ttl_bry = encoder.Encode(ttl_bry); + // add ttl_bry + byte[] ttl_bry = item.Lnki_ttl(); + ttl_bry = Bry_.Add(FILE_NS_PREFIX, ttl_bry); // WMF API requires "File:" prefix; EX: "File:A.png" x> "A.png" + ttl_bry = Xoa_ttl.Replace_unders(ttl_bry); // convert to spaces else will get extra "normalize" node + ttl_bry = encoder.Encode(ttl_bry); // encode for good form bfr.Add(ttl_bry); } // call api -// byte[] rslt = download_wkr.Download_xrg().Exec_as_bry(bfr.To_bry_and_clear()); + byte[] rslt = download_wkr.Download_xrg().Exec_as_bry(bfr.To_str_and_clear()); // deserialize -// Json_doc jdoc = parser.Parse(rslt); + Json_doc jdoc = parser.Parse(rslt); - // loop over /query/pages - // for each node, deserialize orig info and add to hash by "title" - // loop over /query/redirects - // for each node, retrieve from hash by "to"; add "from" as prop - // loop over hash - // for each item, retrieve from src; copy props over + // loop over pages + Json_ary pages_ary = (Json_ary)jdoc.Get_grp_many("query", "pages"); + int pages_len = pages_ary.Len(); + for (int i = 0; i < pages_len; i++) { + // get vars from page nde + Json_nde page = pages_ary.Get_at_as_nde(i); + int page_id = page.Get_as_int("page_id"); + byte[] title = page.Get_as_bry("title"); + + // get vars from imageinfo node + Json_ary info_ary = (Json_ary)page.Get_as_ary("imageinfo"); + Json_nde info_nde = (Json_nde)info_ary.Get_as_nde(0); + byte[] timestamp = info_nde.Get_as_bry("timestamp"); + long size = info_nde.Get_as_long("size"); + int width = info_nde.Get_as_int("width"); + int height = info_nde.Get_as_int("height"); + byte[] mime = info_nde.Get_as_bry("mime"); + byte[] mediatype = info_nde.Get_as_bry("mediatype"); + + // add to trg hash + Xobldr_missing_origs_item trg_item = new Xobldr_missing_origs_item().Init_by_api_page(repo_id, page_id, title, timestamp, size, width, height, mime, mediatype); + temp_hash.Add(trg_item.Orig_ttl(), trg_item); + } + + // loop over redirects + Json_ary redirects_ary = (Json_ary)jdoc.Get_grp_many("query", "redirects"); + int redirects_len = pages_ary.Len(); + for (int i = 0; i < redirects_len; i++) { + // get vars from redirect nde + Json_nde redirect = redirects_ary.Get_at_as_nde(i); + byte[] from = redirect.Get_as_bry("from"); + byte[] to = redirect.Get_as_bry("to"); + + // get nde by "to" and copy redirect + Xobldr_missing_origs_item trg_item = (Xobldr_missing_origs_item)temp_hash.Get_by_or_fail(to); + trg_item.Init_by_api_redirect(from, to); + + // update temp_hash key + temp_hash.Del(to); + temp_hash.Add(from, trg_item); + } + + // loop over hash and copy back to src + int temp_hash_len = temp_hash.Len(); + for (int i = 0; i < temp_hash_len; i++) { + Xobldr_missing_origs_item trg_item = (Xobldr_missing_origs_item)temp_hash.Get_at(i); + Xobldr_missing_origs_item src_item = (Xobldr_missing_origs_item)temp_hash.Get_by(trg_item.Lnki_ttl()); + src_item.Copy_api_props(trg_item); + } } } catch (Exception e) { Gfo_usr_dlg_.Instance.Warn_many("", "", "missing_origs:failure while calling wmf_api; domain=~{0} idx=~{1} err=~{2}", api_domain, idx, Err_.Message_gplx_log(e)); } } } -class Xobldr_missing_origs_item { - private final Xoa_ttl lnki_ttl; - public Xobldr_missing_origs_item(Xoa_ttl lnki_ttl) { - this.lnki_ttl = lnki_ttl; - } - public Xoa_ttl Lnki_ttl() {return lnki_ttl;} -}