From 7a8228c709a859d5f861f509a5d124aa5ff38b3f Mon Sep 17 00:00:00 2001 From: gnosygnu Date: Mon, 6 Mar 2017 09:01:15 -0500 Subject: [PATCH] Bldr: Update orig_regy with api results from missing origs --- .../Xobldr_missing_origs_cmd.java | 67 ++++++++++----- .../Xobldr_missing_origs_item.java | 81 ++++++++++++------- .../Xobldr_missing_origs_wmfapi.java | 10 ++- 3 files changed, 106 insertions(+), 52 deletions(-) diff --git a/400_xowa/src/gplx/xowa/addons/bldrs/files/missing_origs/Xobldr_missing_origs_cmd.java b/400_xowa/src/gplx/xowa/addons/bldrs/files/missing_origs/Xobldr_missing_origs_cmd.java index 6095e1c89..366afd554 100644 --- a/400_xowa/src/gplx/xowa/addons/bldrs/files/missing_origs/Xobldr_missing_origs_cmd.java +++ b/400_xowa/src/gplx/xowa/addons/bldrs/files/missing_origs/Xobldr_missing_origs_cmd.java @@ -16,7 +16,7 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt package gplx.xowa.addons.bldrs.files.missing_origs; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.bldrs.*; import gplx.xowa.addons.bldrs.files.*; import gplx.dbs.*; import gplx.xowa.bldrs.*; import gplx.xowa.bldrs.wkrs.*; -import gplx.xowa.files.*; import gplx.xowa.files.origs.*; import gplx.xowa.apps.wms.apis.origs.*; +import gplx.xowa.files.*; import gplx.xowa.files.repos.*; import gplx.xowa.files.origs.*; import gplx.xowa.apps.wms.apis.origs.*; import gplx.xowa.addons.bldrs.files.dbs.*; public class Xobldr_missing_origs_cmd extends Xob_cmd__base { private int fail_max = 100000; @@ -32,8 +32,7 @@ public class Xobldr_missing_origs_cmd extends Xob_cmd__base { Gfo_usr_dlg_.Instance.Note_many("", "", "bldr.find_missing: found=~{0}", fail_count); // select into list; ignore any which are invalid titles - List_adp list = List_adp_.New(); - byte[] wiki_abrv = wiki.Domain_itm().Abrv_xo(); + Ordered_hash list = Ordered_hash_.New_bry(); int invalid_count = 0; String sql = "SELECT lnki_ttl FROM orig_regy WHERE orig_page_id IS NULL"; Db_rdr rdr = conn.Stmt_sql(sql).Exec_select__rls_auto(); @@ -48,33 +47,57 @@ public class Xobldr_missing_origs_cmd extends Xob_cmd__base { } // create itm and add to list - Xof_fsdb_itm itm = new Xof_fsdb_itm(); - itm.Init_at_lnki(Xof_exec_tid.Tid_wiki_page, wiki_abrv, lnki_ttl, Byte_.Zero, Xof_img_size.Upright_null, -1, -1, -1, -1, Xof_patch_upright_tid_.Tid_all); - list.Add(itm); + Xobldr_missing_origs_item itm = new Xobldr_missing_origs_item(); + itm.Init_by_orig_tbl(lnki_ttl); + list.Add(itm.Lnki_ttl(), itm); } } finally {rdr.Rls();} Gfo_usr_dlg_.Instance.Note_many("", "", "bldr.find_missing: invalid=~{0}", invalid_count); // call api with list Xobldr_missing_origs_wmfapi wmf_api = new Xobldr_missing_origs_wmfapi(wiki.App().Wmf_mgr().Download_wkr()); - wmf_api.Find_by_list(null, Byte_.Zero, null, 0); + int list_len = list.Len(); + int list_bgn = 0; + // loop until no more entries + while (true) { + int list_end = list_bgn + 500; + if (list_end > list_len) list_end = list_len; - // loop list and update - conn.Txn_bgn("bldr.find_missing"); - Db_stmt update_stmt = conn.Stmt_update("orig_regy", String_.Ary("lnki_ttl") - , "orig_commons_flag", "orig_repo" - , "orig_page_id", "orig_redirect_id", "orig_redirect_ttl" - , "orig_file_id", "orig_file_ttl", "orig_file_ext" - , "orig_size", "orig_w", "orig_h", "orig_bits", "orig_media_type", "orig_minor_mime", "orig_timestamp"); - int len = list.Len(); - for (int i = 0; i < len; i++) { - Xof_fsdb_itm itm = (Xof_fsdb_itm)list.Get_at(i); - update_stmt - .Val_int("orig_w", itm.Orig_w()) - .Val_int("orig_h", itm.Orig_h()) - .Crt_bry_as_str("lnki_ttl", itm.Lnki_ttl()).Exec_update(); + // find items + wmf_api.Find_by_list(list, Xof_repo_tid_.Tid__remote, "commons.wikimedia.org", list_bgn); + + // loop list and update + conn.Txn_bgn("bldr.find_missing"); + Db_stmt update_stmt = conn.Stmt_update("orig_regy", String_.Ary("lnki_ttl") + , "orig_repo" + , "orig_page_id", "orig_redirect_id", "orig_redirect_ttl" + , "orig_file_id", "orig_file_ttl", "orig_file_ext" + , "orig_size", "orig_w", "orig_h", "orig_media_type", "orig_minor_mime", "orig_timestamp"); + // , "orig_bits" + for (int i = list_bgn; i < list_end; i++) { + Xobldr_missing_origs_item itm = (Xobldr_missing_origs_item)list.Get_at(i); + update_stmt + .Val_byte("orig_repo", itm.Orig_repo()) + .Val_int("orig_page_id", itm.Orig_page_id()) + .Val_int("orig_redirect_id", itm.Orig_redirect_id()) + .Val_bry_as_str("orig_redirect_ttl", itm.Orig_redirect_ttl()) + .Val_int("orig_file_id", itm.Orig_file_id()) + .Val_bry_as_str("orig_file_ttl", itm.Orig_file_ttl()) + .Val_int("orig_file_ext", itm.Orig_file_ext()) + .Val_int("orig_size", itm.Orig_size()) + .Val_int("orig_w", itm.Orig_w()) + .Val_int("orig_h", itm.Orig_h()) + .Val_bry_as_str("orig_media_type", itm.Orig_media_type()) + .Val_bry_as_str("orig_minor_mime", itm.Orig_minor_mime()) + .Val_bry_as_str("orig_timestamp", itm.Orig_timestamp()) + .Crt_bry_as_str("lnki_ttl", itm.Lnki_ttl()).Exec_update(); + } + conn.Txn_end(); + + // update bounds + if (list_end == list_len) break; + list_bgn += 500; } - conn.Txn_end(); } @Override public Object Invk(GfsCtx ctx, int ikey, String k, GfoMsg m) { if (ctx.Match(k, Invk__fail_max_)) this.fail_max = m.ReadInt("v"); diff --git a/400_xowa/src/gplx/xowa/addons/bldrs/files/missing_origs/Xobldr_missing_origs_item.java b/400_xowa/src/gplx/xowa/addons/bldrs/files/missing_origs/Xobldr_missing_origs_item.java index 8e22743e7..97f88c0e5 100644 --- a/400_xowa/src/gplx/xowa/addons/bldrs/files/missing_origs/Xobldr_missing_origs_item.java +++ b/400_xowa/src/gplx/xowa/addons/bldrs/files/missing_origs/Xobldr_missing_origs_item.java @@ -17,67 +17,60 @@ package gplx.xowa.addons.bldrs.files.missing_origs; import gplx.*; import gplx.x import gplx.xowa.files.*; class Xobldr_missing_origs_item { public byte[] Lnki_ttl() {return lnki_ttl;} private byte[] lnki_ttl; - public int Page_id() {return page_id;} private int page_id; public byte Orig_repo() {return orig_repo;} private byte orig_repo; public int Orig_page_id() {return orig_page_id;} private int orig_page_id; - public byte[] Orig_ttl() {return orig_ttl;} private byte[] orig_ttl; + public int Orig_file_id() {return orig_page_id;} + public byte[] Orig_file_ttl() {return orig_file_ttl;} private byte[] orig_file_ttl; public byte[] Orig_timestamp() {return orig_timestamp;} private byte[] orig_timestamp; - public long Orig_size() {return orig_size;} private long orig_size; + public int Orig_size() {return orig_size;} private int orig_size; public int Orig_w() {return orig_w;} private int orig_w; public int Orig_h() {return orig_h;} private int orig_h; public byte[] Orig_minor_mime() {return orig_minor_mime;} private byte[] orig_minor_mime; public byte[] Orig_media_type() {return orig_media_type;} private byte[] orig_media_type; public byte[] Orig_redirect_ttl() {return orig_redirect_ttl;} private byte[] orig_redirect_ttl; public int Lnki_ext() {return lnki_ext;} private int lnki_ext; + public int Orig_file_ext() {return orig_file_ext;} private int orig_file_ext; public int Orig_redirect_ext() {return orig_redirect_ext;} private int orig_redirect_ext; + public int Orig_redirect_id() {return orig_redirect_id;} private int orig_redirect_id; public Xobldr_missing_origs_item Init_by_orig_tbl(byte[] lnki_ttl) { this.lnki_ttl = lnki_ttl; return this; } - public Xobldr_missing_origs_item Init_by_api_page(byte orig_repo, int page_id, byte[] orig_ttl, byte[] orig_timestamp, long orig_size, int orig_w, int orig_h, byte[] orig_minor_mime, byte[] orig_media_type) { - this.page_id = page_id; - this.orig_page_id = page_id; + public Xobldr_missing_origs_item Init_by_api_page(byte orig_repo, int orig_page_id, byte[] orig_file_ttl, int orig_size, int orig_w, int orig_h, byte[] orig_media_type, byte[] orig_minor_mime, byte[] orig_timestamp) { this.orig_repo = orig_repo; - this.orig_ttl = Normalize_ttl(orig_ttl); - this.orig_timestamp = orig_timestamp; + this.orig_page_id = orig_page_id; + this.orig_file_ttl = Normalize_ttl(orig_file_ttl); + this.orig_file_ext = Xof_ext_.new_by_ttl_(orig_file_ttl).Id(); this.orig_size = orig_size; this.orig_w = orig_w; this.orig_h = orig_h; - this.orig_minor_mime = orig_minor_mime; this.orig_media_type = orig_media_type; + this.orig_minor_mime = Normalize_minor_mime(orig_minor_mime); + this.orig_timestamp = Normalize_timestamp(orig_timestamp); return this; } public Xobldr_missing_origs_item Init_by_api_redirect(byte[] from, byte[] to) { this.lnki_ttl = Normalize_ttl(from); this.orig_redirect_ttl = Normalize_ttl(to); + // page_id is always redirect_id + this.orig_redirect_id = orig_page_id; + // orig_page_id is unknown; need to make 2nd call; + this.orig_page_id = -987; return this; } - private byte[] Normalize_ttl(byte[] v) { - // remove "File:" - if (Bry_.Has_at_bgn(v, Xobldr_missing_origs_wmfapi.FILE_NS_PREFIX)) { - v = Bry_.Mid(v, Xobldr_missing_origs_wmfapi.FILE_NS_PREFIX.length); - } - else { - Gfo_usr_dlg_.Instance.Warn_many("", "", "wmf_api does not start with 'File:'; title=~{0}", v); - } - - // convert spaces to unders - v = Xoa_ttl.Replace_spaces(v); - - return v; - } public void Copy_api_props(Xobldr_missing_origs_item src) { // page nde - this.page_id = src.page_id; + this.orig_repo = src.orig_repo; this.orig_page_id = src.orig_page_id; - this.orig_ttl = src.orig_ttl; - this.orig_timestamp = src.orig_timestamp; + this.orig_file_ttl = src.orig_file_ttl; + this.orig_file_ext = src.orig_file_ext; this.orig_size = src.orig_size; this.orig_w = src.orig_w; this.orig_h = src.orig_h; - this.orig_minor_mime = src.orig_minor_mime; this.orig_media_type = src.orig_media_type; + this.orig_minor_mime = src.orig_minor_mime; + this.orig_timestamp = src.orig_timestamp; // revision nde this.orig_redirect_ttl = src.orig_redirect_ttl; @@ -86,4 +79,38 @@ class Xobldr_missing_origs_item { this.lnki_ext = Xof_ext_.new_by_ttl_(lnki_ttl).Id(); this.orig_redirect_ext = Xof_ext_.new_by_ttl_(orig_redirect_ttl).Id(); } + private static byte[] Normalize_ttl(byte[] v) { + // remove "File:" + if (Bry_.Has_at_bgn(v, Xobldr_missing_origs_wmfapi.FILE_NS_PREFIX)) { + v = Bry_.Mid(v, Xobldr_missing_origs_wmfapi.FILE_NS_PREFIX.length); + } + else { + throw Err_.new_wo_type("wmf_api does not start with 'File:'", "title", v); + } + + // convert spaces to unders + v = Xoa_ttl.Replace_spaces(v); + + return v; + } + private static byte[] Normalize_minor_mime(byte[] src) { + // convert "image/svg+xml" to "svg+xml" + int src_len = src.length; + int slash_pos = Bry_find_.Find_fwd(src, Byte_ascii.Slash, 0, src_len); + if (slash_pos == Bry_find_.Not_found) { + throw Err_.new_wo_type("wmf_api minor_mime does not have slash;", "minor_mime", src); + } + return Bry_.Mid(src, slash_pos + 1, src_len); + } + private static byte[] Normalize_timestamp(byte[] src) { + // convert 2017-03-06T08:09:10Z to 20170306080910 + byte[] rv = new byte[14]; + int rv_idx = 0; + for (byte b : src) { + if (Byte_ascii.Is_num(b)) { + rv[rv_idx++] = b; + } + } + return rv; + } } diff --git a/400_xowa/src/gplx/xowa/addons/bldrs/files/missing_origs/Xobldr_missing_origs_wmfapi.java b/400_xowa/src/gplx/xowa/addons/bldrs/files/missing_origs/Xobldr_missing_origs_wmfapi.java index 993ad3abf..4f25401ca 100644 --- a/400_xowa/src/gplx/xowa/addons/bldrs/files/missing_origs/Xobldr_missing_origs_wmfapi.java +++ b/400_xowa/src/gplx/xowa/addons/bldrs/files/missing_origs/Xobldr_missing_origs_wmfapi.java @@ -84,15 +84,19 @@ public class Xobldr_missing_origs_wmfapi { Json_ary info_ary = (Json_ary)page.Get_as_ary("imageinfo"); Json_nde info_nde = (Json_nde)info_ary.Get_as_nde(0); byte[] timestamp = info_nde.Get_as_bry("timestamp"); - long size = info_nde.Get_as_long("size"); + int size = info_nde.Get_as_int("size"); int width = info_nde.Get_as_int("width"); int height = info_nde.Get_as_int("height"); byte[] mime = info_nde.Get_as_bry("mime"); byte[] mediatype = info_nde.Get_as_bry("mediatype"); // add to trg hash - Xobldr_missing_origs_item trg_item = new Xobldr_missing_origs_item().Init_by_api_page(repo_id, page_id, title, timestamp, size, width, height, mime, mediatype); - temp_hash.Add(trg_item.Orig_ttl(), trg_item); + try { + Xobldr_missing_origs_item trg_item = new Xobldr_missing_origs_item().Init_by_api_page(repo_id, page_id, title, size, width, height, mediatype, mime, timestamp); + temp_hash.Add(trg_item.Orig_file_ttl(), trg_item); + } catch (Exception e2) { + Gfo_usr_dlg_.Instance.Warn_many("", "", "missing_origs:failed to deserialize api obj; domain=~{0} ttl=~{1} json=~{2} err=~{3}", api_domain, title, page.Print_as_json(), Err_.Message_gplx_log(e2)); + } } // loop over redirects