1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2026-03-02 03:49:30 +00:00
This commit is contained in:
gnosygnu
2015-08-24 00:32:13 -04:00
parent df10db140c
commit ed911e3de5
220 changed files with 2618 additions and 1569 deletions

View File

@@ -1,98 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa; import gplx.*;
import gplx.ios.*;
import gplx.xowa.wikis.*;
/**
 * Describes a single dump file of a wiki (EX: "simplewiki-latest-pages-articles.xml.bz2") and probes a dump server for it.
 *
 * Typical use: {@link #new_} (or {@link #Ctor}) to derive the file name, {@link #Server_url_} to derive the full url,
 * then {@link #Connect} to check that the file exists and to capture its length / modified date.
 */
public class Xob_dump_file {
	public Xow_domain	Wiki_type()			{return wiki_domain_itm;}	private Xow_domain wiki_domain_itm;
	// NOTE(review): Dump_date_ only assigns the field; file_name (built in Ctor) and file_url (built in Server_url_) are not recomputed here
	public String		Dump_date()			{return dump_date;}			public void Dump_date_(String v) {dump_date = v;} String dump_date;
	public String		Dump_file_type()	{return dump_file_type;}	private String dump_file_type;
	public String		Server_url()		{return server_url;}		private String server_url;
	public String		File_url()			{return file_url;}			private String file_url;
	public String		File_name()			{return file_name;}			private String file_name;
	public long			File_len()			{return file_len;}			long file_len;
	public DateAdp		File_modified()		{return file_modified;}		DateAdp file_modified;
	public byte[]		Wiki_alias()		{return wiki_alias;}		private byte[] wiki_alias;
	/**
	 * Initializes the item from a wiki domain, a dump date and a dump file type, and builds the dump file name.
	 * The extension is ".sql.gz" for page_props, categorylinks and image; ".xml.bz2" for everything else.
	 * @param wiki_domain		domain of the wiki; parsed with Xow_domain_.parse
	 * @param dump_date			date segment of the dump (Xob_dump_file_.Date_latest is handled specially by Connect)
	 * @param dump_file_type	dump file type key; EX: "pages-articles"
	 * @return this instance, to allow chaining
	 */
	public Xob_dump_file Ctor(String wiki_domain, String dump_date, String dump_file_type) {
		this.dump_date = dump_date; this.dump_file_type = dump_file_type;
		this.wiki_domain_itm = Xow_domain_.parse(Bry_.new_a7(wiki_domain));
		this.wiki_alias = Xow_wiki_alias.Build_alias(wiki_domain_itm);
		byte[] dump_file_bry = Bry_.new_u8(dump_file_type);
		byte dump_file_tid = Xow_wiki_alias.Parse__tid(dump_file_bry);
		byte[] ext = Xob_dump_file_.Ext_xml_bz2;	// default; only the sql table dumps below differ
		switch (dump_file_tid) {
			case Xow_wiki_alias.Tid_page_props: case Xow_wiki_alias.Tid_categorylinks: case Xow_wiki_alias.Tid_image:
				ext = Xob_dump_file_.Ext_sql_gz;
				break;
		}
		this.file_name = String_.new_u8(Xob_dump_file_.Bld_dump_file_name(wiki_alias, Bry_.new_u8(dump_date), dump_file_bry, ext));
		return this;
	}
	/**
	 * Sets the dump server and rebuilds the file url as "{server}{alias}/{date}/{file_name}" using the current dump date.
	 * @param server_url	root url of the dump server; the callers in this file pass values ending in "/"
	 */
	public void Server_url_(String server_url) {
		this.server_url = server_url;
		String dump_dir_url = String_.new_u8(Xob_dump_file_.Bld_dump_dir_url(Bry_.new_u8(server_url), wiki_alias, Bry_.new_u8(dump_date)));
		this.file_url = dump_dir_url + file_name;
	}
	/**
	 * Checks whether the dump file exists at the current file url.
	 * If it does not, and the server is dumps.wikimedia.org, and the requested date is "latest", the dated dump directories
	 * of the wiki are tried from the last array entry to the first; the first one that has the file replaces
	 * dump_date, file_name and file_url.
	 * @return true if a usable dump file was found; false otherwise
	 */
	public boolean Connect() {
		IoEngine_xrg_downloadFil args = Io_mgr.I.DownloadFil_args("", Io_url_.Empty);
		boolean rv = Connect_exec(args, file_url);
		// WMF changed dumping approach to partial dumps; this sometimes causes /latest/ to be missing page_articles; try to get earlier dump; DATE:2015-07-09
		if (	!rv																											// not found
			&&	String_.In(server_url, Xob_dump_file_.Server_wmf_http, Xob_dump_file_.Server_wmf_https)						// server is dumps.wikimedia.org
			&&	String_.Eq(dump_date, Xob_dump_file_.Date_latest)															// request dump was latest
			) {
			Xoa_app_.Usr_dlg().Warn_many("", "", "wmf.dump:latest not found; url=~{0}", file_url);
			byte[] abrv_wm = Xow_wiki_alias.Build_alias(wiki_domain_itm);
			// dump directory names use "_" where the alias uses "-"; normalize like Xob_dump_file_.Bld_dump_dir_url / Bld_dump_file_name do, else the root would not match the file name
			String dump_dir_name = String_.Replace(String_.new_u8(abrv_wm), "-", "_");
			String new_dump_root = Xob_dump_file_.Server_wmf_https + dump_dir_name + "/";	// EX: https://dumps.wikimedia.org/enwiki/
			byte[] wiki_dump_dirs_src = args.Exec_as_bry(new_dump_root);
			if (wiki_dump_dirs_src == null) {Xoa_app_.Usr_dlg().Warn_many("", "", "could not connect to dump server; url=~{0}", new_dump_root); return false;}
			String[] dates = gplx.xowa.wmfs.dump_pages.Xowmf_wiki_dump_dirs_parser.Parse(wiki_domain_itm.Domain_bry(), wiki_dump_dirs_src);
			int dates_len = dates.length;
			for (int i = dates_len - 1; i > -1; --i) {	// iterate backwards; presumably the parser returns dates in ascending order -- TODO confirm
				String new_dump_date = dates[i];
				if (String_.Eq(new_dump_date, Xob_dump_file_.Date_latest)) continue;		// skip latest; assume it is bad
				String new_dump_file = String_.Replace(file_name, Xob_dump_file_.Date_latest, new_dump_date);	// replace "-latest-" with "-20150602-";
				String new_file_url = new_dump_root + new_dump_date + "/" + new_dump_file;
				rv = Connect_exec(args, new_file_url);
				if (rv) {
					Xoa_app_.Usr_dlg().Note_many("", "", "wmf.dump:dump found; url=~{0}", new_file_url);
					dump_date = new_dump_date;
					file_name = new_dump_file;
					file_url  = new_file_url;
					break;
				}
				else
					Xoa_app_.Usr_dlg().Warn_many("", "", "wmf.dump:dump not found; url=~{0}", new_file_url);
			}
		}
		return rv;
	}
	/**
	 * Queries the metadata of one url and, on success, stores the reported length and modified date in file_len / file_modified.
	 * @param args			download args reused across probes
	 * @param cur_file_url	url to probe
	 * @return false if the metadata query failed, or if it reported a modified date with year <= 1970; true otherwise
	 */
	private boolean Connect_exec(IoEngine_xrg_downloadFil args, String cur_file_url) {
		boolean rv = args.Src_last_modified_query_(true).Exec_meta(cur_file_url);
		long tmp_file_len = args.Src_content_length();
		DateAdp tmp_file_modified = args.Src_last_modified();
		Xoa_app_.Usr_dlg().Note_many("", "", "wmf.dump:connect rslts; url=~{0} result=~{1} fil_len=~{2} file_modified=~{3} server_url=~{4} dump_date=~{5}", cur_file_url, rv, tmp_file_len, tmp_file_modified == null ? "<<NULL>>" : tmp_file_modified.XtoStr_fmt_yyyy_MM_dd_HH_mm_ss(), server_url, dump_date);
		if (rv) {
			if (tmp_file_modified != null && tmp_file_modified.Year() <= 1970) return false;	// url has invalid file; note that dumps.wikimedia.org currently returns back an HTML page with "404 not found"; rather than try to download and parse this (since content may change), use the date_modified which always appears to be UnixTime 0; DATE:2015-07-21
			file_len = tmp_file_len;
			file_modified = tmp_file_modified;
		}
		return rv;
	}
	/** Creates a new instance and initializes it through {@link #Ctor}. */
	public static Xob_dump_file new_(String wiki_domain, String dump_date, String dump_type) {return new Xob_dump_file().Ctor(wiki_domain, dump_date, dump_type);}
}

View File

@@ -1,73 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa; import gplx.*;
/**
 * Static helpers and constants for {@link Xob_dump_file}: known dump servers, file extensions,
 * and builders for dump directory urls and dump file names.
 */
public class Xob_dump_file_ {
	/**
	 * Tries each server in order and stops at the first one on which the dump file is found.
	 * For every server, rv is pointed at that server, its dump date is optionally overridden, and rv.Connect() is called.
	 * NOTE(review): Server_url_ runs before Override_dump_date; confirm that the url used by Connect picks up an overridden date.
	 * @param rv			dump file to connect; updated in place
	 * @param server_urls	candidate dump server root urls
	 * @return true if any server had the file; false if none did (including an empty array)
	 */
	public static boolean Connect_first(Xob_dump_file rv, String[] server_urls) {
		int len = server_urls.length;
		for (int i = 0; i < len; ++i) {
			String server_url = server_urls[i];
			rv.Server_url_(server_url);
			Override_dump_date(rv, server_url);
			if (rv.Connect()) return true;
		}
		return false;
	}
	/**
	 * If the requested date is "latest" and the server is the c3sl or masaryk mirror, downloads the mirror's directory page
	 * for the wiki, parses the available dates, and sets rv's dump date to the one chosen by Xoi_mirror_parser.Find_last_lte.
	 * Does nothing if the page is empty or no date is found.
	 */
	private static void Override_dump_date(Xob_dump_file rv, String dump_server) {
		String dump_date = rv.Dump_date();
		if (	String_.Eq(dump_date, Xob_dump_file_.Date_latest)
			&&	(	String_.Eq(dump_server, Xob_dump_file_.Server_c3sl)
				||	String_.Eq(dump_server, Xob_dump_file_.Server_masaryk)
				)
			){
			Xoa_app_.Usr_dlg().Note_many("", "", "wmf.dump:dump date; server_url=~{0} dump_date=~{1}", dump_server, dump_date);
			Xoi_mirror_parser mirror_parser = new Xoi_mirror_parser();
			// use the same "-" -> "_" normalization as Bld_dump_dir_url so that the directory listed here is the directory later downloaded from
			String dump_wiki_url = dump_server + String_.new_a7(Normalize_alias(rv.Wiki_alias())) + "/";
			byte[] dump_url_wiki_html = gplx.ios.IoEngine_xrg_downloadFil.new_("", Io_url_.Empty).Exec_as_bry(dump_wiki_url); if (Bry_.Len_eq_0(dump_url_wiki_html)) return;
			String[] dump_available_dates = mirror_parser.Parse(String_.new_u8(dump_url_wiki_html));
			String dump_dates_latest = Xoi_mirror_parser.Find_last_lte(dump_available_dates, dump_date);
			if (String_.Eq(dump_dates_latest, "")) return;	// nothing found
			rv.Dump_date_(dump_dates_latest);
		}
	}
	/**
	 * Builds the url of a dump directory: "{server_url}{alias with '-' as '_'}/{date}/".
	 * @param server_url	server root; EX: "http://dumps.wikimedia.org/"
	 * @param alias			wiki alias; EX: "simplewiki"
	 * @param date			date segment; EX: "latest"
	 */
	public static byte[] Bld_dump_dir_url(byte[] server_url, byte[] alias, byte[] date) {
		return Bry_.Add
			(	server_url										// "http://dumps.wikimedia.org/"
			,	Normalize_alias(alias), Bry_slash				// "simplewiki/"
			,	date, Bry_slash									// "latest/"
			);
	}
	/**
	 * Builds the name of a dump file: "{alias with '-' as '_'}-{date}-{dump_file_type}{ext}".
	 * @param alias				wiki alias; EX: "simplewiki"
	 * @param date				date segment; EX: "latest"
	 * @param dump_file_type	dump type; EX: "pages-articles"
	 * @param ext				extension including leading dot; EX: ".xml.bz2"
	 */
	public static byte[] Bld_dump_file_name(byte[] alias, byte[] date, byte[] dump_file_type, byte[] ext) {
		return Bry_.Add
			(	Normalize_alias(alias), Bry_dash				// "simplewiki-"
			,	date, Bry_dash									// "latest-"
			,	dump_file_type									// "pages-articles"
			,	ext												// ".xml.bz2"
			);
	}
	/** Returns the alias with every dash replaced by an underline; single point of truth for the alias form used in dump urls / file names. */
	private static byte[] Normalize_alias(byte[] alias) {return Bry_.Replace(alias, Byte_ascii.Dash, Byte_ascii.Underline);}
	private static final byte[] Bry_dash = new byte[] {Byte_ascii.Dash}, Bry_slash = new byte[] {Byte_ascii.Slash};
	public static final byte[] Ext_xml_bz2 = Bry_.new_a7(".xml.bz2");
	public static final byte[] Ext_sql_gz = Bry_.new_a7(".sql.gz");
	public static final String
		Server_wmf_http		= "http://dumps.wikimedia.org/"
	,	Server_wmf_https	= "https://dumps.wikimedia.org/"
	,	Server_your_org		= "http://dumps.wikimedia.your.org/"
	,	Server_c3sl			= "http://wikipedia.c3sl.ufpr.br/"
	,	Server_masaryk		= "http://ftp.fi.muni.cz/pub/wikimedia/"
	,	Date_latest			= "latest"
	;
}

View File

@@ -16,7 +16,9 @@ You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa; import gplx.*;
import gplx.core.threads.*; import gplx.xowa.wikis.*; import gplx.xowa.bldrs.*;
import gplx.core.threads.*; import gplx.xowa.bldrs.*;
import gplx.xowa.wikis.domains.*;
import gplx.xowa.wmfs.dumps.*;
abstract class Xoi_cmd_base implements Gfo_thread_cmd {
public void Ctor(Xoi_setup_mgr install_mgr, String wiki_key) {
this.install_mgr = install_mgr; this.wiki_key = wiki_key;
@@ -68,11 +70,11 @@ abstract class Xoi_cmd_base implements Gfo_thread_cmd {
return this;
} private static final String Invk_process_async = "run_async", Invk_owner = "owner";
}
class Xoi_cmd_category2_page_props extends Xoi_cmd_wiki_download { public Xoi_cmd_category2_page_props(Xoi_setup_mgr install_mgr, String wiki_key, String dump_date) {this.Ctor_download_(install_mgr, wiki_key, dump_date, Xow_wiki_alias.Key_page_props);}
class Xoi_cmd_category2_page_props extends Xoi_cmd_wiki_download { public Xoi_cmd_category2_page_props(Xoi_setup_mgr install_mgr, String wiki_key, String dump_date) {this.Ctor_download_(install_mgr, wiki_key, dump_date, Xowm_dump_type_.Str__page_props);}
@Override public String Download_file_ext() {return ".sql.gz2";}
public static final String KEY_category2 = "wiki.category2.download.page_props";
}
class Xoi_cmd_category2_categorylinks extends Xoi_cmd_wiki_download { public Xoi_cmd_category2_categorylinks(Xoi_setup_mgr install_mgr, String wiki_key, String dump_date) {this.Ctor_download_(install_mgr, wiki_key, dump_date, Xow_wiki_alias.Key_categorylinks);}
class Xoi_cmd_category2_categorylinks extends Xoi_cmd_wiki_download { public Xoi_cmd_category2_categorylinks(Xoi_setup_mgr install_mgr, String wiki_key, String dump_date) {this.Ctor_download_(install_mgr, wiki_key, dump_date, Xowm_dump_type_.Str__categorylinks);}
@Override public String Download_file_ext() {return ".sql.gz2";}
public static final String KEY_category2 = "wiki.category2.download.categorylinks";
}

View File

@@ -18,6 +18,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
package gplx.xowa; import gplx.*;
import gplx.gfui.*;
import gplx.core.threads.*; import gplx.xowa.bldrs.*; import gplx.xowa.bldrs.cmds.utils.*;
import gplx.xowa.wmfs.dumps.*;
class Xoi_cmd_wiki_download extends Gfo_thread_cmd_download implements Gfo_thread_cmd { private Xoi_setup_mgr install_mgr; private String wiki_key, dump_date, dump_type;
public Xoi_cmd_wiki_download Ctor_download_(Xoi_setup_mgr install_mgr, String wiki_key, String dump_date, String dump_type) {
this.install_mgr = install_mgr;
@@ -31,8 +32,8 @@ class Xoi_cmd_wiki_download extends Gfo_thread_cmd_download implements Gfo_threa
@Override public String Async_key() {return Key_wiki_download;} public static final String Key_wiki_download = "wiki.download";
@Override public byte Async_init() {
Xoae_app app = install_mgr.App();
Xob_dump_file dump_file = Xob_dump_file.new_(wiki_key, dump_date, dump_type);
boolean connected = Xob_dump_file_.Connect_first(dump_file, install_mgr.Dump_mgr().Server_urls());
Xowm_dump_file dump_file = new Xowm_dump_file(wiki_key, dump_date, dump_type);
boolean connected = Xowm_dump_file_.Connect_first(dump_file, install_mgr.Dump_mgr().Server_urls());
if (connected)
app.Usr_dlg().Note_many("", "", "url: ~{0}", dump_file.File_url());
else {
@@ -41,7 +42,7 @@ class Xoi_cmd_wiki_download extends Gfo_thread_cmd_download implements Gfo_threa
Dump_servers_offline_msg_shown = true;
}
}
Xowe_wiki wiki = app.Wiki_mgr().Get_by_key_or_make(dump_file.Wiki_type().Domain_bry());
Xowe_wiki wiki = app.Wiki_mgr().Get_by_key_or_make(dump_file.Domain_itm().Domain_bry());
Io_url root_dir = wiki.Fsys_mgr().Root_dir();
Io_url[] trg_fil_ary = Io_mgr.I.QueryDir_args(root_dir).FilPath_("*." + dump_type + Download_file_ext() + "*").ExecAsUrlAry();
Io_url trg = trg_fil_ary.length == 0 ? root_dir.GenSubFil(dump_file.File_name()) : trg_fil_ary[0];

View File

@@ -18,8 +18,9 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
package gplx.xowa; import gplx.*;
import org.junit.*;
import gplx.core.consoles.*;
import gplx.brys.*; import gplx.core.threads.*; import gplx.xowa.wikis.*; import gplx.xowa.setup.maints.*; import gplx.xowa.xtns.wdatas.imports.*;
import gplx.xowa.wmfs.*;
import gplx.brys.*; import gplx.core.threads.*; import gplx.xowa.setup.maints.*; import gplx.xowa.xtns.wdatas.imports.*;
import gplx.xowa.wikis.domains.*;
import gplx.xowa.wmfs.*; import gplx.xowa.wmfs.dumps.*;
public class Xoi_cmd_wiki_tst {
@Test public void Run() { // MAINT
// Bld_import_list(Xow_wmf_api_mgr.Wikis);
@@ -29,17 +30,16 @@ public class Xoi_cmd_wiki_tst {
int ary_len = ary.length;
Bry_bfr bfr = Bry_bfr.reset_(255);
Wmf_latest_parser parser = new Wmf_latest_parser();
Xob_dump_file dump_file = new Xob_dump_file();
Bry_fmtr_arg_time time_fmtr = new Bry_fmtr_arg_time();
for (int i = 0; i < ary_len; i++)
Bld_import_list_itm2(bfr, parser, dump_file, time_fmtr, ary, i);
Bld_import_list_itm2(bfr, parser, time_fmtr, ary, i);
Io_mgr.I.SaveFilStr("C:\\temp.txt", bfr.Xto_str());
}
private void Bld_import_list_itm2(Bry_bfr bfr, Wmf_latest_parser parser, Xob_dump_file dump_file, Bry_fmtr_arg_time time_fmtr, String[] ary, int i) {
private void Bld_import_list_itm2(Bry_bfr bfr, Wmf_latest_parser parser, Bry_fmtr_arg_time time_fmtr, String[] ary, int i) {
String domain_str = ary[i];
byte[] domain_bry = Bry_.new_a7(domain_str);
Xow_domain domain_itm = Xow_domain_.parse(domain_bry);
byte[] wmf_key_bry = Bry_.Replace(Xow_wiki_alias.Build_alias(domain_itm), Byte_ascii.Dash, Byte_ascii.Underline);
Xow_domain_itm domain_itm = Xow_domain_itm_.parse(domain_bry);
byte[] wmf_key_bry = Bry_.Replace(Xow_abrv_wm_.To_abrv(domain_itm), Byte_ascii.Dash, Byte_ascii.Underline);
String wmf_key = String_.new_u8(wmf_key_bry);
String url = "https://dumps.wikimedia.org/" + wmf_key + "/latest";
byte[] latest_html = null;
@@ -51,13 +51,13 @@ public class Xoi_cmd_wiki_tst {
}
Tfds.Write("pass|" + url);
parser.Parse(latest_html);
dump_file.Ctor(domain_str, "latest", Xow_wiki_alias.Key_pages_articles);
dump_file.Server_url_(Xob_dump_file_.Server_wmf_https);
Xowm_dump_file dump_file = new Xowm_dump_file(domain_str, "latest", Xowm_dump_type_.Str__pages_articles);
dump_file.Server_url_(Xowm_dump_file_.Server_wmf_https);
byte[] pages_articles_key = Bry_.new_a7(wmf_key + "-latest-pages-articles.xml.bz2");
Wmf_latest_itm latest_itm = parser.Get_by(pages_articles_key);
bfr.Add(domain_bry).Add_byte_pipe();
bfr.Add_str(dump_file.File_url()).Add_byte_pipe();
bfr.Add(Xow_domain_type_.Get_type_as_bry(domain_itm.Domain_tid())).Add_byte_pipe();
bfr.Add(Xow_domain_type_.Get_type_as_bry(domain_itm.Domain_type_id())).Add_byte_pipe();
long src_size = latest_itm.Size();
bfr.Add_long_variable(src_size).Add_byte_pipe();
bfr.Add_str(gplx.ios.Io_size_.To_str(src_size)).Add_byte_pipe();
@@ -69,12 +69,12 @@ public class Xoi_cmd_wiki_tst {
bfr.Add_byte_nl();
}
/*
private void Bld_import_list_itm(Bry_bfr bfr, Xob_dump_file dump_file, Bry_fmtr_arg_time time_fmtr, String[] ary, int i) {
private void Bld_import_list_itm(Bry_bfr bfr, Xowm_dump_file dump_file, Bry_fmtr_arg_time time_fmtr, String[] ary, int i) {
String itm = ary[i];
dump_file.Ctor(itm, "latest", Xow_wiki_alias.Key_pages_articles);
dump_file.Ctor(itm, "latest", Xowm_dump_type_.Str__pages_articles);
int count = 0;
while (count++ < 1) {
dump_file.Server_url_(Xob_dump_file_.Server_wmf);
dump_file.Server_url_(Xowm_dump_file_.Server_wmf);
if (dump_file.Connect()) break;
Tfds.WriteText(String_.Format("retrying: {0} {1}\n", count, dump_file.File_modified()));
Thread_adp_.Sleep(15000); // wait for connection to reset

View File

@@ -17,8 +17,9 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa; import gplx.*;
import gplx.ios.*;
import gplx.xowa.wmfs.dumps.*;
public class Xoi_dump_mgr implements GfoInvkAble {
public String[] Server_urls() {return server_urls;} private String[] server_urls = String_.Ary(Xob_dump_file_.Server_wmf_https, Xob_dump_file_.Server_your_org, Xob_dump_file_.Server_c3sl, Xob_dump_file_.Server_masaryk);
public String[] Server_urls() {return server_urls;} private String[] server_urls = String_.Ary(Xowm_dump_file_.Server_wmf_https, Xowm_dump_file_.Server_your_org, Xowm_dump_file_.Server_c3sl, Xowm_dump_file_.Server_masaryk);
public String[] Custom_cmds() {return custom_cmds;} private String[] custom_cmds = String_.Ary(Xoi_cmd_wiki_download.Key_wiki_download, Xoi_cmd_wiki_import.KEY);
public byte Data_storage_format() {return data_storage_format;} public Xoi_dump_mgr Data_storage_format_(byte v) {data_storage_format = v; return this;} private byte data_storage_format = gplx.ios.Io_stream_.Tid_gzip;
public long Db_text_max() {return db_text_max;} private long db_text_max = (long)3000 * Io_mgr.Len_mb;