mirror of
https://github.com/gnosygnu/xowa.git
synced 2024-10-27 20:34:16 +00:00
Bldr: Update orig_regy with api results from missing origs
This commit is contained in:
parent
e92710483f
commit
7a8228c709
@ -16,7 +16,7 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
package gplx.xowa.addons.bldrs.files.missing_origs; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.bldrs.*; import gplx.xowa.addons.bldrs.files.*;
|
||||
import gplx.dbs.*;
|
||||
import gplx.xowa.bldrs.*; import gplx.xowa.bldrs.wkrs.*;
|
||||
import gplx.xowa.files.*; import gplx.xowa.files.origs.*; import gplx.xowa.apps.wms.apis.origs.*;
|
||||
import gplx.xowa.files.*; import gplx.xowa.files.repos.*; import gplx.xowa.files.origs.*; import gplx.xowa.apps.wms.apis.origs.*;
|
||||
import gplx.xowa.addons.bldrs.files.dbs.*;
|
||||
public class Xobldr_missing_origs_cmd extends Xob_cmd__base {
|
||||
private int fail_max = 100000;
|
||||
@ -32,8 +32,7 @@ public class Xobldr_missing_origs_cmd extends Xob_cmd__base {
|
||||
Gfo_usr_dlg_.Instance.Note_many("", "", "bldr.find_missing: found=~{0}", fail_count);
|
||||
|
||||
// select into list; ignore any which are invalid titles
|
||||
List_adp list = List_adp_.New();
|
||||
byte[] wiki_abrv = wiki.Domain_itm().Abrv_xo();
|
||||
Ordered_hash list = Ordered_hash_.New_bry();
|
||||
int invalid_count = 0;
|
||||
String sql = "SELECT lnki_ttl FROM orig_regy WHERE orig_page_id IS NULL";
|
||||
Db_rdr rdr = conn.Stmt_sql(sql).Exec_select__rls_auto();
|
||||
@ -48,33 +47,57 @@ public class Xobldr_missing_origs_cmd extends Xob_cmd__base {
|
||||
}
|
||||
|
||||
// create itm and add to list
|
||||
Xof_fsdb_itm itm = new Xof_fsdb_itm();
|
||||
itm.Init_at_lnki(Xof_exec_tid.Tid_wiki_page, wiki_abrv, lnki_ttl, Byte_.Zero, Xof_img_size.Upright_null, -1, -1, -1, -1, Xof_patch_upright_tid_.Tid_all);
|
||||
list.Add(itm);
|
||||
Xobldr_missing_origs_item itm = new Xobldr_missing_origs_item();
|
||||
itm.Init_by_orig_tbl(lnki_ttl);
|
||||
list.Add(itm.Lnki_ttl(), itm);
|
||||
}
|
||||
} finally {rdr.Rls();}
|
||||
Gfo_usr_dlg_.Instance.Note_many("", "", "bldr.find_missing: invalid=~{0}", invalid_count);
|
||||
|
||||
// call api with list
|
||||
Xobldr_missing_origs_wmfapi wmf_api = new Xobldr_missing_origs_wmfapi(wiki.App().Wmf_mgr().Download_wkr());
|
||||
wmf_api.Find_by_list(null, Byte_.Zero, null, 0);
|
||||
int list_len = list.Len();
|
||||
int list_bgn = 0;
|
||||
// loop until no more entries
|
||||
while (true) {
|
||||
int list_end = list_bgn + 500;
|
||||
if (list_end > list_len) list_end = list_len;
|
||||
|
||||
// loop list and update
|
||||
conn.Txn_bgn("bldr.find_missing");
|
||||
Db_stmt update_stmt = conn.Stmt_update("orig_regy", String_.Ary("lnki_ttl")
|
||||
, "orig_commons_flag", "orig_repo"
|
||||
, "orig_page_id", "orig_redirect_id", "orig_redirect_ttl"
|
||||
, "orig_file_id", "orig_file_ttl", "orig_file_ext"
|
||||
, "orig_size", "orig_w", "orig_h", "orig_bits", "orig_media_type", "orig_minor_mime", "orig_timestamp");
|
||||
int len = list.Len();
|
||||
for (int i = 0; i < len; i++) {
|
||||
Xof_fsdb_itm itm = (Xof_fsdb_itm)list.Get_at(i);
|
||||
update_stmt
|
||||
.Val_int("orig_w", itm.Orig_w())
|
||||
.Val_int("orig_h", itm.Orig_h())
|
||||
.Crt_bry_as_str("lnki_ttl", itm.Lnki_ttl()).Exec_update();
|
||||
// find items
|
||||
wmf_api.Find_by_list(list, Xof_repo_tid_.Tid__remote, "commons.wikimedia.org", list_bgn);
|
||||
|
||||
// loop list and update
|
||||
conn.Txn_bgn("bldr.find_missing");
|
||||
Db_stmt update_stmt = conn.Stmt_update("orig_regy", String_.Ary("lnki_ttl")
|
||||
, "orig_repo"
|
||||
, "orig_page_id", "orig_redirect_id", "orig_redirect_ttl"
|
||||
, "orig_file_id", "orig_file_ttl", "orig_file_ext"
|
||||
, "orig_size", "orig_w", "orig_h", "orig_media_type", "orig_minor_mime", "orig_timestamp");
|
||||
// , "orig_bits"
|
||||
for (int i = list_bgn; i < list_end; i++) {
|
||||
Xobldr_missing_origs_item itm = (Xobldr_missing_origs_item)list.Get_at(i);
|
||||
update_stmt
|
||||
.Val_byte("orig_repo", itm.Orig_repo())
|
||||
.Val_int("orig_page_id", itm.Orig_page_id())
|
||||
.Val_int("orig_redirect_id", itm.Orig_redirect_id())
|
||||
.Val_bry_as_str("orig_redirect_ttl", itm.Orig_redirect_ttl())
|
||||
.Val_int("orig_file_id", itm.Orig_file_id())
|
||||
.Val_bry_as_str("orig_file_ttl", itm.Orig_file_ttl())
|
||||
.Val_int("orig_file_ext", itm.Orig_file_ext())
|
||||
.Val_int("orig_size", itm.Orig_size())
|
||||
.Val_int("orig_w", itm.Orig_w())
|
||||
.Val_int("orig_h", itm.Orig_h())
|
||||
.Val_bry_as_str("orig_media_type", itm.Orig_media_type())
|
||||
.Val_bry_as_str("orig_minor_mime", itm.Orig_minor_mime())
|
||||
.Val_bry_as_str("orig_timestamp", itm.Orig_timestamp())
|
||||
.Crt_bry_as_str("lnki_ttl", itm.Lnki_ttl()).Exec_update();
|
||||
}
|
||||
conn.Txn_end();
|
||||
|
||||
// update bounds
|
||||
if (list_end == list_len) break;
|
||||
list_bgn += 500;
|
||||
}
|
||||
conn.Txn_end();
|
||||
}
|
||||
@Override public Object Invk(GfsCtx ctx, int ikey, String k, GfoMsg m) {
|
||||
if (ctx.Match(k, Invk__fail_max_)) this.fail_max = m.ReadInt("v");
|
||||
|
@ -17,67 +17,60 @@ package gplx.xowa.addons.bldrs.files.missing_origs; import gplx.*; import gplx.x
|
||||
import gplx.xowa.files.*;
|
||||
class Xobldr_missing_origs_item {
|
||||
public byte[] Lnki_ttl() {return lnki_ttl;} private byte[] lnki_ttl;
|
||||
public int Page_id() {return page_id;} private int page_id;
|
||||
public byte Orig_repo() {return orig_repo;} private byte orig_repo;
|
||||
public int Orig_page_id() {return orig_page_id;} private int orig_page_id;
|
||||
public byte[] Orig_ttl() {return orig_ttl;} private byte[] orig_ttl;
|
||||
// NOTE(review): returns orig_page_id; no separate orig_file_id field is visible in this view -- confirm that file id and page id are intentionally the same value
public int Orig_file_id() {return orig_page_id;}
|
||||
public byte[] Orig_file_ttl() {return orig_file_ttl;} private byte[] orig_file_ttl;
|
||||
public byte[] Orig_timestamp() {return orig_timestamp;} private byte[] orig_timestamp;
|
||||
public long Orig_size() {return orig_size;} private long orig_size;
|
||||
public int Orig_size() {return orig_size;} private int orig_size;
|
||||
public int Orig_w() {return orig_w;} private int orig_w;
|
||||
public int Orig_h() {return orig_h;} private int orig_h;
|
||||
public byte[] Orig_minor_mime() {return orig_minor_mime;} private byte[] orig_minor_mime;
|
||||
public byte[] Orig_media_type() {return orig_media_type;} private byte[] orig_media_type;
|
||||
public byte[] Orig_redirect_ttl() {return orig_redirect_ttl;} private byte[] orig_redirect_ttl;
|
||||
public int Lnki_ext() {return lnki_ext;} private int lnki_ext;
|
||||
public int Orig_file_ext() {return orig_file_ext;} private int orig_file_ext;
|
||||
public int Orig_redirect_ext() {return orig_redirect_ext;} private int orig_redirect_ext;
|
||||
public int Orig_redirect_id() {return orig_redirect_id;} private int orig_redirect_id;
|
||||
|
||||
public Xobldr_missing_origs_item Init_by_orig_tbl(byte[] lnki_ttl) {
|
||||
this.lnki_ttl = lnki_ttl;
|
||||
return this;
|
||||
}
|
||||
public Xobldr_missing_origs_item Init_by_api_page(byte orig_repo, int page_id, byte[] orig_ttl, byte[] orig_timestamp, long orig_size, int orig_w, int orig_h, byte[] orig_minor_mime, byte[] orig_media_type) {
|
||||
this.page_id = page_id;
|
||||
this.orig_page_id = page_id;
|
||||
public Xobldr_missing_origs_item Init_by_api_page(byte orig_repo, int orig_page_id, byte[] orig_file_ttl, int orig_size, int orig_w, int orig_h, byte[] orig_media_type, byte[] orig_minor_mime, byte[] orig_timestamp) {
|
||||
this.orig_repo = orig_repo;
|
||||
this.orig_ttl = Normalize_ttl(orig_ttl);
|
||||
this.orig_timestamp = orig_timestamp;
|
||||
this.orig_page_id = orig_page_id;
|
||||
this.orig_file_ttl = Normalize_ttl(orig_file_ttl);
|
||||
this.orig_file_ext = Xof_ext_.new_by_ttl_(orig_file_ttl).Id();
|
||||
this.orig_size = orig_size;
|
||||
this.orig_w = orig_w;
|
||||
this.orig_h = orig_h;
|
||||
this.orig_minor_mime = orig_minor_mime;
|
||||
this.orig_media_type = orig_media_type;
|
||||
this.orig_minor_mime = Normalize_minor_mime(orig_minor_mime);
|
||||
this.orig_timestamp = Normalize_timestamp(orig_timestamp);
|
||||
return this;
|
||||
}
|
||||
public Xobldr_missing_origs_item Init_by_api_redirect(byte[] from, byte[] to) {
|
||||
this.lnki_ttl = Normalize_ttl(from);
|
||||
this.orig_redirect_ttl = Normalize_ttl(to);
|
||||
// page_id is always redirect_id
|
||||
this.orig_redirect_id = orig_page_id;
|
||||
// orig_page_id is unknown; need to make 2nd call;
|
||||
this.orig_page_id = -987;
|
||||
return this;
|
||||
}
|
||||
private byte[] Normalize_ttl(byte[] v) {
|
||||
// remove "File:"
|
||||
if (Bry_.Has_at_bgn(v, Xobldr_missing_origs_wmfapi.FILE_NS_PREFIX)) {
|
||||
v = Bry_.Mid(v, Xobldr_missing_origs_wmfapi.FILE_NS_PREFIX.length);
|
||||
}
|
||||
else {
|
||||
Gfo_usr_dlg_.Instance.Warn_many("", "", "wmf_api does not start with 'File:'; title=~{0}", v);
|
||||
}
|
||||
|
||||
// convert spaces to unders
|
||||
v = Xoa_ttl.Replace_spaces(v);
|
||||
|
||||
return v;
|
||||
}
|
||||
public void Copy_api_props(Xobldr_missing_origs_item src) {
|
||||
// page nde
|
||||
this.page_id = src.page_id;
|
||||
this.orig_repo = src.orig_repo;
|
||||
this.orig_page_id = src.orig_page_id;
|
||||
this.orig_ttl = src.orig_ttl;
|
||||
this.orig_timestamp = src.orig_timestamp;
|
||||
this.orig_file_ttl = src.orig_file_ttl;
|
||||
this.orig_file_ext = src.orig_file_ext;
|
||||
this.orig_size = src.orig_size;
|
||||
this.orig_w = src.orig_w;
|
||||
this.orig_h = src.orig_h;
|
||||
this.orig_minor_mime = src.orig_minor_mime;
|
||||
this.orig_media_type = src.orig_media_type;
|
||||
this.orig_minor_mime = src.orig_minor_mime;
|
||||
this.orig_timestamp = src.orig_timestamp;
|
||||
|
||||
// revision nde
|
||||
this.orig_redirect_ttl = src.orig_redirect_ttl;
|
||||
@ -86,4 +79,38 @@ class Xobldr_missing_origs_item {
|
||||
this.lnki_ext = Xof_ext_.new_by_ttl_(lnki_ttl).Id();
|
||||
this.orig_redirect_ext = Xof_ext_.new_by_ttl_(orig_redirect_ttl).Id();
|
||||
}
|
||||
private static byte[] Normalize_ttl(byte[] v) {
|
||||
// remove "File:"
|
||||
if (Bry_.Has_at_bgn(v, Xobldr_missing_origs_wmfapi.FILE_NS_PREFIX)) {
|
||||
v = Bry_.Mid(v, Xobldr_missing_origs_wmfapi.FILE_NS_PREFIX.length);
|
||||
}
|
||||
else {
|
||||
throw Err_.new_wo_type("wmf_api does not start with 'File:'", "title", v);
|
||||
}
|
||||
|
||||
// convert spaces to unders
|
||||
v = Xoa_ttl.Replace_spaces(v);
|
||||
|
||||
return v;
|
||||
}
|
||||
private static byte[] Normalize_minor_mime(byte[] src) {
|
||||
// convert "image/svg+xml" to "svg+xml"
|
||||
int src_len = src.length;
|
||||
int slash_pos = Bry_find_.Find_fwd(src, Byte_ascii.Slash, 0, src_len);
|
||||
if (slash_pos == Bry_find_.Not_found) {
|
||||
throw Err_.new_wo_type("wmf_api minor_mime does not have slash;", "minor_mime", src);
|
||||
}
|
||||
return Bry_.Mid(src, slash_pos + 1, src_len);
|
||||
}
|
||||
private static byte[] Normalize_timestamp(byte[] src) {
|
||||
// convert 2017-03-06T08:09:10Z to 20170306080910
|
||||
byte[] rv = new byte[14];
|
||||
int rv_idx = 0;
|
||||
for (byte b : src) {
|
||||
if (Byte_ascii.Is_num(b)) {
|
||||
rv[rv_idx++] = b;
|
||||
}
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
}
|
||||
|
@ -84,15 +84,19 @@ public class Xobldr_missing_origs_wmfapi {
|
||||
Json_ary info_ary = (Json_ary)page.Get_as_ary("imageinfo");
|
||||
Json_nde info_nde = (Json_nde)info_ary.Get_as_nde(0);
|
||||
byte[] timestamp = info_nde.Get_as_bry("timestamp");
|
||||
long size = info_nde.Get_as_long("size");
|
||||
int size = info_nde.Get_as_int("size");
|
||||
int width = info_nde.Get_as_int("width");
|
||||
int height = info_nde.Get_as_int("height");
|
||||
byte[] mime = info_nde.Get_as_bry("mime");
|
||||
byte[] mediatype = info_nde.Get_as_bry("mediatype");
|
||||
|
||||
// add to trg hash
|
||||
Xobldr_missing_origs_item trg_item = new Xobldr_missing_origs_item().Init_by_api_page(repo_id, page_id, title, timestamp, size, width, height, mime, mediatype);
|
||||
temp_hash.Add(trg_item.Orig_ttl(), trg_item);
|
||||
try {
|
||||
Xobldr_missing_origs_item trg_item = new Xobldr_missing_origs_item().Init_by_api_page(repo_id, page_id, title, size, width, height, mediatype, mime, timestamp);
|
||||
temp_hash.Add(trg_item.Orig_file_ttl(), trg_item);
|
||||
} catch (Exception e2) {
|
||||
Gfo_usr_dlg_.Instance.Warn_many("", "", "missing_origs:failed to deserialize api obj; domain=~{0} ttl=~{1} json=~{2} err=~{3}", api_domain, title, page.Print_as_json(), Err_.Message_gplx_log(e2));
|
||||
}
|
||||
}
|
||||
|
||||
// loop over redirects
|
||||
|
Loading…
Reference in New Issue
Block a user