Bldr: Add more implementation for missing origs

pull/620/head
gnosygnu 8 years ago
parent 98fb49687b
commit e77916a02e

@ -55,9 +55,9 @@ public class Xobldr_missing_origs_cmd extends Xob_cmd__base {
} finally {rdr.Rls();} } finally {rdr.Rls();}
Gfo_usr_dlg_.Instance.Note_many("", "", "bldr.find_missing: invalid=~{0}", invalid_count); Gfo_usr_dlg_.Instance.Note_many("", "", "bldr.find_missing: invalid=~{0}", invalid_count);
// call api with list // call api with list
Xof_orig_wkr__wmf_api wkr = new Xof_orig_wkr__wmf_api(new Xoapi_orig_wmf(), wiki.App().Wmf_mgr().Download_wkr(), wiki.File__repo_mgr(), wiki.Domain_bry()); Xobldr_missing_origs_wmfapi wmf_api = new Xobldr_missing_origs_wmfapi(wiki.App().Wmf_mgr().Download_wkr());
wkr.Find_by_list(null, null); wmf_api.Find_by_list(null, Byte_.Zero, null, 0);
// loop list and update // loop list and update
conn.Txn_bgn("bldr.find_missing"); conn.Txn_bgn("bldr.find_missing");

@ -0,0 +1,89 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.addons.bldrs.files.missing_origs; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.bldrs.*; import gplx.xowa.addons.bldrs.files.*;
import gplx.xowa.files.*;
/**
 * Mutable record for one file handled by the missing-origs builder command.
 *
 * An item carries the link title (lnki_ttl) plus the "orig" metadata that is read from
 * the WMF API response (page id, title, timestamp, size, dimensions, mime, media type,
 * redirect target). Items are filled in stages by the Init_by_* methods and merged with
 * Copy_api_props.
 */
class Xobldr_missing_origs_item {
	// link title; stored without the "File:" prefix when it comes from Init_by_api_redirect
	public byte[] Lnki_ttl()            {return lnki_ttl;}            private byte[] lnki_ttl;
	public int    Page_id()             {return page_id;}             private int page_id;
	// repo id handed to Init_by_api_page by the API worker
	public byte   Orig_repo()           {return orig_repo;}           private byte orig_repo;
	public int    Orig_page_id()        {return orig_page_id;}        private int orig_page_id;
	public byte[] Orig_ttl()            {return orig_ttl;}            private byte[] orig_ttl;
	public byte[] Orig_timestamp()      {return orig_timestamp;}      private byte[] orig_timestamp;
	public long   Orig_size()           {return orig_size;}           private long orig_size;
	public int    Orig_w()              {return orig_w;}              private int orig_w;
	public int    Orig_h()              {return orig_h;}              private int orig_h;
	public byte[] Orig_minor_mime()     {return orig_minor_mime;}     private byte[] orig_minor_mime;
	public byte[] Orig_media_type()     {return orig_media_type;}     private byte[] orig_media_type;
	// only assigned by Init_by_api_redirect; stays null for pages that were not redirected
	public byte[] Orig_redirect_ttl()   {return orig_redirect_ttl;}   private byte[] orig_redirect_ttl;
	// ext ids are derived from the titles in Copy_api_props
	public int    Lnki_ext()            {return lnki_ext;}            private int lnki_ext;
	public int    Orig_redirect_ext()   {return orig_redirect_ext;}   private int orig_redirect_ext;

	/**
	 * Sets only the link title; all other fields keep their defaults.
	 * NOTE(review): the name suggests the title comes from the orig table -- confirm against the caller.
	 *
	 * @param lnki_ttl link title, stored as-is (no normalization)
	 * @return this item, for chaining
	 */
	public Xobldr_missing_origs_item Init_by_orig_tbl(byte[] lnki_ttl) {
		this.lnki_ttl = lnki_ttl;
		return this;
	}

	/**
	 * Fills the item from one page node of the API response.
	 *
	 * @param orig_repo       repo id to record for this orig
	 * @param page_id         page id; stored in both page_id and orig_page_id
	 * @param orig_ttl        title as returned by the API; normalized via Normalize_ttl before storing
	 * @param orig_timestamp  timestamp bytes, stored as-is
	 * @param orig_size       size value, stored as-is
	 * @param orig_w          width
	 * @param orig_h          height
	 * @param orig_minor_mime mime bytes, stored as-is
	 * @param orig_media_type media type bytes, stored as-is
	 * @return this item, for chaining
	 */
	public Xobldr_missing_origs_item Init_by_api_page(byte orig_repo, int page_id, byte[] orig_ttl, byte[] orig_timestamp, long orig_size, int orig_w, int orig_h, byte[] orig_minor_mime, byte[] orig_media_type) {
		this.page_id = page_id;
		this.orig_page_id = page_id;
		this.orig_repo = orig_repo;
		this.orig_ttl = Normalize_ttl(orig_ttl);
		this.orig_timestamp = orig_timestamp;
		this.orig_size = orig_size;
		this.orig_w = orig_w;
		this.orig_h = orig_h;
		this.orig_minor_mime = orig_minor_mime;
		this.orig_media_type = orig_media_type;
		return this;
	}

	/**
	 * Records a redirect: the normalized "from" title becomes the link title and the
	 * normalized "to" title becomes the redirect title.
	 *
	 * @param from title that was requested
	 * @param to   title the request was redirected to
	 * @return this item, for chaining
	 */
	public Xobldr_missing_origs_item Init_by_api_redirect(byte[] from, byte[] to) {
		this.lnki_ttl = Normalize_ttl(from);
		this.orig_redirect_ttl = Normalize_ttl(to);
		return this;
	}

	/**
	 * Strips the leading Xobldr_missing_origs_wmfapi.FILE_NS_PREFIX from a title and then
	 * applies Xoa_ttl.Replace_spaces. A title without the prefix is logged as a warning
	 * and still passed through Replace_spaces.
	 */
	private byte[] Normalize_ttl(byte[] v) {
		// remove "File:"
		if (Bry_.Has_at_bgn(v, Xobldr_missing_origs_wmfapi.FILE_NS_PREFIX)) {
			v = Bry_.Mid(v, Xobldr_missing_origs_wmfapi.FILE_NS_PREFIX.length);
		}
		else {
			Gfo_usr_dlg_.Instance.Warn_many("", "", "wmf_api does not start with 'File:'; title=~{0}", v);
		}
		// convert spaces to unders
		v = Xoa_ttl.Replace_spaces(v);
		return v;
	}

	/**
	 * Copies the API-derived fields of src onto this item and then derives the ext ids.
	 * This item's own lnki_ttl is left untouched and is used for lnki_ext.
	 *
	 * @param src item populated by Init_by_api_page (and optionally Init_by_api_redirect)
	 */
	public void Copy_api_props(Xobldr_missing_origs_item src) {
		// page nde
		this.page_id = src.page_id;
		this.orig_page_id = src.orig_page_id;
		// orig_repo is set together with the other page props in Init_by_api_page, so it must travel with them;
		// without this the receiving item would keep reporting the default repo (0)
		this.orig_repo = src.orig_repo;
		this.orig_ttl = src.orig_ttl;
		this.orig_timestamp = src.orig_timestamp;
		this.orig_size = src.orig_size;
		this.orig_w = src.orig_w;
		this.orig_h = src.orig_h;
		this.orig_minor_mime = src.orig_minor_mime;
		this.orig_media_type = src.orig_media_type;
		// revision nde
		this.orig_redirect_ttl = src.orig_redirect_ttl;
		// set ext_ids
		this.lnki_ext = Xof_ext_.new_by_ttl_(lnki_ttl).Id();
		// pages without a redirect have a null orig_redirect_ttl; do not hand null to the ext lookup
		// NOTE(review): 0 is the int default of the field -- confirm it matches the "unknown" ext id
		this.orig_redirect_ext = orig_redirect_ttl == null ? 0 : Xof_ext_.new_by_ttl_(orig_redirect_ttl).Id();
	}
}

@ -20,76 +20,109 @@ import gplx.xowa.files.repos.*;
import gplx.xowa.files.downloads.*; import gplx.xowa.files.downloads.*;
import gplx.xowa.apps.wms.apis.origs.*; import gplx.xowa.apps.wms.apis.origs.*;
public class Xobldr_missing_origs_wmfapi { public class Xobldr_missing_origs_wmfapi {
// private final Xoapi_orig_base orig_api; private final Xof_download_wkr download_wkr;
// private final Xof_download_wkr download_wkr; private final Ordered_hash temp_hash = Ordered_hash_.New();
// private final Xow_repo_mgr repo_mgr; public static final byte[] FILE_NS_PREFIX = Bry_.new_a7("File:");
// private final byte[] wiki_domain; public Xobldr_missing_origs_wmfapi(Xof_download_wkr download_wkr) {
// private final Xoapi_orig_rslts api_rv = new Xoapi_orig_rslts(); this.download_wkr = download_wkr;
public Xobldr_missing_origs_wmfapi(Xoapi_orig_base orig_api, Xof_download_wkr download_wkr, Xow_repo_mgr repo_mgr, byte[] wiki_domain) {
// this.orig_api = orig_api;
// this.download_wkr = download_wkr;
// this.repo_mgr = repo_mgr;
// this.wiki_domain = wiki_domain;
} }
public void Find_by_list(Ordered_hash src, Ordered_hash trg, String api_domain, int idx) { public void Find_by_list(Ordered_hash src, byte repo_id, String api_domain, int idx) {
// fail if web access disabled // fail if web access disabled
if (!gplx.core.ios.IoEngine_system.Web_access_enabled) { if (!gplx.core.ios.IoEngine_system.Web_access_enabled) {
throw Err_.new_wo_type("web access must be enabled for missing_origs cmd"); throw Err_.new_wo_type("web access must be enabled for missing_origs cmd");
} }
// Json_parser parser = new Json_parser(); Json_parser parser = new Json_parser();
Gfo_url_encoder encoder = Gfo_url_encoder_.New__http_url().Make(); Gfo_url_encoder encoder = Gfo_url_encoder_.New__http_url().Make();
Bry_bfr bfr = Bry_bfr_.New(); Bry_bfr bfr = Bry_bfr_.New();
int len = src.Len(); int len = src.Len();
try { try {
// loop until all titles found // loop until all titles found
while (idx < len) { while (idx < len) {
// generate super api; EX: https://commons.wikimedia.org/w/api.php?action=query&format=xml&prop=imageinfo&iiprop=size|url|mediatype|mime|bitdepth|timestamp|size|sha1&redirects&iilimit=500&titles= // generate api: EX: https://commons.wikimedia.org/w/api.php?action=query&format=json&formatversion=2&prop=imageinfo&iiprop=timestamp|size|mediatype|mime&redirects&iilimit=500&titles=File:Different%20Faces%20Neptune.jpg|File:East.svg
// generate everything up to titles
bfr.Add_str_a7("https://"); bfr.Add_str_a7("https://");
bfr.Add_str_a7(api_domain); bfr.Add_str_a7(api_domain);
bfr.Add_str_a7("/w/api.php?action=query"); bfr.Add_str_a7("/w/api.php?action=query");
bfr.Add_str_a7("&format=json"); // json easier to use than xml bfr.Add_str_a7("&format=json"); // json easier to use than xml
bfr.Add_str_a7("&iilimit=1"); // limit to 1 revision history (default will return more); EX:File:Different_Faces_Neptune.jpg bfr.Add_str_a7("&iilimit=1"); // limit to 1 revision history (default will return more); EX:File:Different_Faces_Neptune.jpg
bfr.Add_str_a7("&redirects"); // show redirects bfr.Add_str_a7("&redirects"); // show redirects
bfr.Add_str_a7("&prop=imageinfo&iiprop=size|url|mediatype|mime|bitdepth|timestamp|size|sha1"); // list of props bfr.Add_str_a7("&prop=imageinfo&iiprop=timestamp|size|mediatype|mime"); // list of props; NOTE: "url" / "sha1" for future; "bitdepth" always 0?
bfr.Add_str_a7("&titles="); bfr.Add_str_a7("&titles=");
// add titles; EX: File:A.png|File:B.png| // add titles; EX: File:A.png|File:B.png|
for (int i = idx; i < idx + 500; i++) { for (int i = idx; i < idx + 500; i++) {
Xobldr_missing_origs_item item = (Xobldr_missing_origs_item)src.Get_at(i); Xobldr_missing_origs_item item = (Xobldr_missing_origs_item)src.Get_at(i);
Xoa_ttl ttl = item.Lnki_ttl();
// skip "|" if first // skip "|" if first
if (i != idx) bfr.Add_byte_pipe(); if (i != idx) bfr.Add_byte_pipe();
// make ttl_bry so (a) namespace is present (EX:File:); (b) spaces are present (not underscores) // add ttl_bry
byte[] ttl_bry = ttl.Full_txt_wo_qarg(); byte[] ttl_bry = item.Lnki_ttl();
ttl_bry = encoder.Encode(ttl_bry); ttl_bry = Bry_.Add(FILE_NS_PREFIX, ttl_bry); // WMF API requires "File:" prefix; EX: "File:A.png" x> "A.png"
ttl_bry = Xoa_ttl.Replace_unders(ttl_bry); // convert to spaces else will get extra "normalize" node
ttl_bry = encoder.Encode(ttl_bry); // encode for good form
bfr.Add(ttl_bry); bfr.Add(ttl_bry);
} }
// call api // call api
// byte[] rslt = download_wkr.Download_xrg().Exec_as_bry(bfr.To_bry_and_clear()); byte[] rslt = download_wkr.Download_xrg().Exec_as_bry(bfr.To_str_and_clear());
// deserialize // deserialize
// Json_doc jdoc = parser.Parse(rslt); Json_doc jdoc = parser.Parse(rslt);
// loop over /query/pages // loop over pages
// for each node, deserialize orig info and add to hash by "title" Json_ary pages_ary = (Json_ary)jdoc.Get_grp_many("query", "pages");
// loop over /query/redirects int pages_len = pages_ary.Len();
// for each node, retrieve from hash by "to"; add "from" as prop for (int i = 0; i < pages_len; i++) {
// loop over hash // get vars from page nde
// for each item, retrieve from src; copy props over Json_nde page = pages_ary.Get_at_as_nde(i);
int page_id = page.Get_as_int("page_id");
byte[] title = page.Get_as_bry("title");
// get vars from imageinfo node
Json_ary info_ary = (Json_ary)page.Get_as_ary("imageinfo");
Json_nde info_nde = (Json_nde)info_ary.Get_as_nde(0);
byte[] timestamp = info_nde.Get_as_bry("timestamp");
long size = info_nde.Get_as_long("size");
int width = info_nde.Get_as_int("width");
int height = info_nde.Get_as_int("height");
byte[] mime = info_nde.Get_as_bry("mime");
byte[] mediatype = info_nde.Get_as_bry("mediatype");
// add to trg hash
Xobldr_missing_origs_item trg_item = new Xobldr_missing_origs_item().Init_by_api_page(repo_id, page_id, title, timestamp, size, width, height, mime, mediatype);
temp_hash.Add(trg_item.Orig_ttl(), trg_item);
}
// loop over redirects
Json_ary redirects_ary = (Json_ary)jdoc.Get_grp_many("query", "redirects");
int redirects_len = pages_ary.Len();
for (int i = 0; i < redirects_len; i++) {
// get vars from redirect nde
Json_nde redirect = redirects_ary.Get_at_as_nde(i);
byte[] from = redirect.Get_as_bry("from");
byte[] to = redirect.Get_as_bry("to");
// get nde by "to" and copy redirect
Xobldr_missing_origs_item trg_item = (Xobldr_missing_origs_item)temp_hash.Get_by_or_fail(to);
trg_item.Init_by_api_redirect(from, to);
// update temp_hash key
temp_hash.Del(to);
temp_hash.Add(from, trg_item);
}
// loop over hash and copy back to src
int temp_hash_len = temp_hash.Len();
for (int i = 0; i < temp_hash_len; i++) {
Xobldr_missing_origs_item trg_item = (Xobldr_missing_origs_item)temp_hash.Get_at(i);
Xobldr_missing_origs_item src_item = (Xobldr_missing_origs_item)temp_hash.Get_by(trg_item.Lnki_ttl());
src_item.Copy_api_props(trg_item);
}
} }
} catch (Exception e) { } catch (Exception e) {
Gfo_usr_dlg_.Instance.Warn_many("", "", "missing_origs:failure while calling wmf_api; domain=~{0} idx=~{1} err=~{2}", api_domain, idx, Err_.Message_gplx_log(e)); Gfo_usr_dlg_.Instance.Warn_many("", "", "missing_origs:failure while calling wmf_api; domain=~{0} idx=~{1} err=~{2}", api_domain, idx, Err_.Message_gplx_log(e));
} }
} }
} }
/**
 * Immutable holder for the link title of one missing-orig entry.
 * The title is supplied at construction and can only be read back afterwards.
 */
class Xobldr_missing_origs_item {
	/** @return the title passed to the constructor */
	public Xoa_ttl Lnki_ttl() {
		return this.lnki_ttl;
	}

	/**
	 * @param lnki_ttl title to wrap; stored as-is
	 */
	public Xobldr_missing_origs_item(Xoa_ttl lnki_ttl) {
		this.lnki_ttl = lnki_ttl;
	}

	// assigned once in the constructor
	private final Xoa_ttl lnki_ttl;
}

Loading…
Cancel
Save