Bldr: Update orig_regy with api results from missing origs

pull/620/head
gnosygnu 8 years ago
parent e92710483f
commit 7a8228c709

@ -16,7 +16,7 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
package gplx.xowa.addons.bldrs.files.missing_origs; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.bldrs.*; import gplx.xowa.addons.bldrs.files.*;
import gplx.dbs.*;
import gplx.xowa.bldrs.*; import gplx.xowa.bldrs.wkrs.*;
import gplx.xowa.files.*; import gplx.xowa.files.origs.*; import gplx.xowa.apps.wms.apis.origs.*;
import gplx.xowa.files.*; import gplx.xowa.files.repos.*; import gplx.xowa.files.origs.*; import gplx.xowa.apps.wms.apis.origs.*;
import gplx.xowa.addons.bldrs.files.dbs.*;
public class Xobldr_missing_origs_cmd extends Xob_cmd__base {
private int fail_max = 100000;
@ -32,8 +32,7 @@ public class Xobldr_missing_origs_cmd extends Xob_cmd__base {
Gfo_usr_dlg_.Instance.Note_many("", "", "bldr.find_missing: found=~{0}", fail_count);
// select into list; ignore any which are invalid titles
List_adp list = List_adp_.New();
byte[] wiki_abrv = wiki.Domain_itm().Abrv_xo();
Ordered_hash list = Ordered_hash_.New_bry();
int invalid_count = 0;
String sql = "SELECT lnki_ttl FROM orig_regy WHERE orig_page_id IS NULL";
Db_rdr rdr = conn.Stmt_sql(sql).Exec_select__rls_auto();
@ -48,33 +47,57 @@ public class Xobldr_missing_origs_cmd extends Xob_cmd__base {
}
// create itm and add to list
Xof_fsdb_itm itm = new Xof_fsdb_itm();
itm.Init_at_lnki(Xof_exec_tid.Tid_wiki_page, wiki_abrv, lnki_ttl, Byte_.Zero, Xof_img_size.Upright_null, -1, -1, -1, -1, Xof_patch_upright_tid_.Tid_all);
list.Add(itm);
Xobldr_missing_origs_item itm = new Xobldr_missing_origs_item();
itm.Init_by_orig_tbl(lnki_ttl);
list.Add(itm.Lnki_ttl(), itm);
}
} finally {rdr.Rls();}
Gfo_usr_dlg_.Instance.Note_many("", "", "bldr.find_missing: invalid=~{0}", invalid_count);
// call api with list
Xobldr_missing_origs_wmfapi wmf_api = new Xobldr_missing_origs_wmfapi(wiki.App().Wmf_mgr().Download_wkr());
wmf_api.Find_by_list(null, Byte_.Zero, null, 0);
int list_len = list.Len();
int list_bgn = 0;
// loop until no more entries
while (true) {
int list_end = list_bgn + 500;
if (list_end > list_len) list_end = list_len;
// loop list and update
conn.Txn_bgn("bldr.find_missing");
Db_stmt update_stmt = conn.Stmt_update("orig_regy", String_.Ary("lnki_ttl")
, "orig_commons_flag", "orig_repo"
, "orig_page_id", "orig_redirect_id", "orig_redirect_ttl"
, "orig_file_id", "orig_file_ttl", "orig_file_ext"
, "orig_size", "orig_w", "orig_h", "orig_bits", "orig_media_type", "orig_minor_mime", "orig_timestamp");
int len = list.Len();
for (int i = 0; i < len; i++) {
Xof_fsdb_itm itm = (Xof_fsdb_itm)list.Get_at(i);
update_stmt
.Val_int("orig_w", itm.Orig_w())
.Val_int("orig_h", itm.Orig_h())
.Crt_bry_as_str("lnki_ttl", itm.Lnki_ttl()).Exec_update();
// find items
wmf_api.Find_by_list(list, Xof_repo_tid_.Tid__remote, "commons.wikimedia.org", list_bgn);
// loop list and update
conn.Txn_bgn("bldr.find_missing");
Db_stmt update_stmt = conn.Stmt_update("orig_regy", String_.Ary("lnki_ttl")
, "orig_repo"
, "orig_page_id", "orig_redirect_id", "orig_redirect_ttl"
, "orig_file_id", "orig_file_ttl", "orig_file_ext"
, "orig_size", "orig_w", "orig_h", "orig_media_type", "orig_minor_mime", "orig_timestamp");
// , "orig_bits"
for (int i = list_bgn; i < list_end; i++) {
Xobldr_missing_origs_item itm = (Xobldr_missing_origs_item)list.Get_at(i);
update_stmt
.Val_byte("orig_repo", itm.Orig_repo())
.Val_int("orig_page_id", itm.Orig_page_id())
.Val_int("orig_redirect_id", itm.Orig_redirect_id())
.Val_bry_as_str("orig_redirect_ttl", itm.Orig_redirect_ttl())
.Val_int("orig_file_id", itm.Orig_file_id())
.Val_bry_as_str("orig_file_ttl", itm.Orig_file_ttl())
.Val_int("orig_file_ext", itm.Orig_file_ext())
.Val_int("orig_size", itm.Orig_size())
.Val_int("orig_w", itm.Orig_w())
.Val_int("orig_h", itm.Orig_h())
.Val_bry_as_str("orig_media_type", itm.Orig_media_type())
.Val_bry_as_str("orig_minor_mime", itm.Orig_minor_mime())
.Val_bry_as_str("orig_timestamp", itm.Orig_timestamp())
.Crt_bry_as_str("lnki_ttl", itm.Lnki_ttl()).Exec_update();
}
conn.Txn_end();
// update bounds
if (list_end == list_len) break;
list_bgn += 500;
}
conn.Txn_end();
}
@Override public Object Invk(GfsCtx ctx, int ikey, String k, GfoMsg m) {
if (ctx.Match(k, Invk__fail_max_)) this.fail_max = m.ReadInt("v");

@ -17,67 +17,60 @@ package gplx.xowa.addons.bldrs.files.missing_origs; import gplx.*; import gplx.x
import gplx.xowa.files.*;
class Xobldr_missing_origs_item {
public byte[] Lnki_ttl() {return lnki_ttl;} private byte[] lnki_ttl;
public int Page_id() {return page_id;} private int page_id;
public byte Orig_repo() {return orig_repo;} private byte orig_repo;
public int Orig_page_id() {return orig_page_id;} private int orig_page_id;
public byte[] Orig_ttl() {return orig_ttl;} private byte[] orig_ttl;
public int Orig_file_id() {return orig_page_id;}
public byte[] Orig_file_ttl() {return orig_file_ttl;} private byte[] orig_file_ttl;
public byte[] Orig_timestamp() {return orig_timestamp;} private byte[] orig_timestamp;
public long Orig_size() {return orig_size;} private long orig_size;
public int Orig_size() {return orig_size;} private int orig_size;
public int Orig_w() {return orig_w;} private int orig_w;
public int Orig_h() {return orig_h;} private int orig_h;
public byte[] Orig_minor_mime() {return orig_minor_mime;} private byte[] orig_minor_mime;
public byte[] Orig_media_type() {return orig_media_type;} private byte[] orig_media_type;
public byte[] Orig_redirect_ttl() {return orig_redirect_ttl;} private byte[] orig_redirect_ttl;
public int Lnki_ext() {return lnki_ext;} private int lnki_ext;
public int Orig_file_ext() {return orig_file_ext;} private int orig_file_ext;
public int Orig_redirect_ext() {return orig_redirect_ext;} private int orig_redirect_ext;
public int Orig_redirect_id() {return orig_redirect_id;} private int orig_redirect_id;
// Initializes the item with only its lnki_ttl and returns this item for chaining.
// The caller visible in this commit passes the lnki_ttl selected from orig_regy; all other fields are left untouched here.
public Xobldr_missing_origs_item Init_by_orig_tbl(byte[] lnki_ttl) {
this.lnki_ttl = lnki_ttl;
return this;
}
public Xobldr_missing_origs_item Init_by_api_page(byte orig_repo, int page_id, byte[] orig_ttl, byte[] orig_timestamp, long orig_size, int orig_w, int orig_h, byte[] orig_minor_mime, byte[] orig_media_type) {
this.page_id = page_id;
this.orig_page_id = page_id;
public Xobldr_missing_origs_item Init_by_api_page(byte orig_repo, int orig_page_id, byte[] orig_file_ttl, int orig_size, int orig_w, int orig_h, byte[] orig_media_type, byte[] orig_minor_mime, byte[] orig_timestamp) {
this.orig_repo = orig_repo;
this.orig_ttl = Normalize_ttl(orig_ttl);
this.orig_timestamp = orig_timestamp;
this.orig_page_id = orig_page_id;
this.orig_file_ttl = Normalize_ttl(orig_file_ttl);
this.orig_file_ext = Xof_ext_.new_by_ttl_(orig_file_ttl).Id();
this.orig_size = orig_size;
this.orig_w = orig_w;
this.orig_h = orig_h;
this.orig_minor_mime = orig_minor_mime;
this.orig_media_type = orig_media_type;
this.orig_minor_mime = Normalize_minor_mime(orig_minor_mime);
this.orig_timestamp = Normalize_timestamp(orig_timestamp);
return this;
}
public Xobldr_missing_origs_item Init_by_api_redirect(byte[] from, byte[] to) {
	// capture the current page_id before it is overwritten below; page_id is always redirect_id
	int redirect_id = this.orig_page_id;

	// "from" is the title that was requested; "to" is the title it redirects to; both are normalized
	this.lnki_ttl = Normalize_ttl(from);
	this.orig_redirect_ttl = Normalize_ttl(to);
	this.orig_redirect_id = redirect_id;

	// orig_page_id is unknown at this point; a 2nd call is needed to resolve it, so store a placeholder
	this.orig_page_id = -987;
	return this;
}
private byte[] Normalize_ttl(byte[] v) {
	// strip the leading "File:" namespace prefix; if it is absent, warn and keep the title as-is
	boolean has_file_prefix = Bry_.Has_at_bgn(v, Xobldr_missing_origs_wmfapi.FILE_NS_PREFIX);
	if (has_file_prefix) {
		v = Bry_.Mid(v, Xobldr_missing_origs_wmfapi.FILE_NS_PREFIX.length);
	}
	else {
		Gfo_usr_dlg_.Instance.Warn_many("", "", "wmf_api does not start with 'File:'; title=~{0}", v);
	}

	// titles are stored with underscores, not spaces
	return Xoa_ttl.Replace_spaces(v);
}
public void Copy_api_props(Xobldr_missing_origs_item src) {
// page nde
this.page_id = src.page_id;
this.orig_repo = src.orig_repo;
this.orig_page_id = src.orig_page_id;
this.orig_ttl = src.orig_ttl;
this.orig_timestamp = src.orig_timestamp;
this.orig_file_ttl = src.orig_file_ttl;
this.orig_file_ext = src.orig_file_ext;
this.orig_size = src.orig_size;
this.orig_w = src.orig_w;
this.orig_h = src.orig_h;
this.orig_minor_mime = src.orig_minor_mime;
this.orig_media_type = src.orig_media_type;
this.orig_minor_mime = src.orig_minor_mime;
this.orig_timestamp = src.orig_timestamp;
// revision nde
this.orig_redirect_ttl = src.orig_redirect_ttl;
@ -86,4 +79,38 @@ class Xobldr_missing_origs_item {
this.lnki_ext = Xof_ext_.new_by_ttl_(lnki_ttl).Id();
this.orig_redirect_ext = Xof_ext_.new_by_ttl_(orig_redirect_ttl).Id();
}
private static byte[] Normalize_ttl(byte[] v) {
	// guard: every title from the api is expected to start with "File:"; fail otherwise
	if (!Bry_.Has_at_bgn(v, Xobldr_missing_origs_wmfapi.FILE_NS_PREFIX)) {
		throw Err_.new_wo_type("wmf_api does not start with 'File:'", "title", v);
	}

	// drop the "File:" prefix, then convert spaces to unders
	byte[] ttl_wo_ns = Bry_.Mid(v, Xobldr_missing_origs_wmfapi.FILE_NS_PREFIX.length);
	return Xoa_ttl.Replace_spaces(ttl_wo_ns);
}
private static byte[] Normalize_minor_mime(byte[] src) {
	// keep only the part after the first slash; EX: "image/svg+xml" -> "svg+xml"
	int end = src.length;
	int slash = Bry_find_.Find_fwd(src, Byte_ascii.Slash, 0, end);
	if (slash != Bry_find_.Not_found) {
		return Bry_.Mid(src, slash + 1, end);
	}

	// no slash at all: the mime value is not in "major/minor" form
	throw Err_.new_wo_type("wmf_api minor_mime does not have slash;", "minor_mime", src);
}
private static byte[] Normalize_timestamp(byte[] src) {
	// convert 2017-03-06T08:09:10Z to 20170306080910 by keeping only the ASCII digits
	// the result must be exactly 14 digits (yyyyMMddHHmmss); anything else is rejected with a descriptive
	// error instead of overflowing the buffer (too many digits) or returning NUL-padded bytes (too few)
	final int digits_expd = 14;
	byte[] rv = new byte[digits_expd];
	int rv_idx = 0;
	for (byte b : src) {
		if (!Byte_ascii.Is_num(b)) continue;	// skip separators such as '-', 'T', ':' and 'Z'
		if (rv_idx == digits_expd) {
			throw Err_.new_wo_type("wmf_api timestamp has more than 14 digits;", "timestamp", src);
		}
		rv[rv_idx++] = b;
	}
	if (rv_idx != digits_expd) {
		throw Err_.new_wo_type("wmf_api timestamp has fewer than 14 digits;", "timestamp", src);
	}
	return rv;
}
}

@ -84,15 +84,19 @@ public class Xobldr_missing_origs_wmfapi {
Json_ary info_ary = (Json_ary)page.Get_as_ary("imageinfo");
Json_nde info_nde = (Json_nde)info_ary.Get_as_nde(0);
byte[] timestamp = info_nde.Get_as_bry("timestamp");
long size = info_nde.Get_as_long("size");
int size = info_nde.Get_as_int("size");
int width = info_nde.Get_as_int("width");
int height = info_nde.Get_as_int("height");
byte[] mime = info_nde.Get_as_bry("mime");
byte[] mediatype = info_nde.Get_as_bry("mediatype");
// add to trg hash
Xobldr_missing_origs_item trg_item = new Xobldr_missing_origs_item().Init_by_api_page(repo_id, page_id, title, timestamp, size, width, height, mime, mediatype);
temp_hash.Add(trg_item.Orig_ttl(), trg_item);
try {
Xobldr_missing_origs_item trg_item = new Xobldr_missing_origs_item().Init_by_api_page(repo_id, page_id, title, size, width, height, mediatype, mime, timestamp);
temp_hash.Add(trg_item.Orig_file_ttl(), trg_item);
} catch (Exception e2) {
Gfo_usr_dlg_.Instance.Warn_many("", "", "missing_origs:failed to deserialize api obj; domain=~{0} ttl=~{1} json=~{2} err=~{3}", api_domain, title, page.Print_as_json(), Err_.Message_gplx_log(e2));
}
}
// loop over redirects

Loading…
Cancel
Save