mirror of
https://github.com/gnosygnu/xowa.git
synced 2024-10-27 20:34:16 +00:00
HTML Databases: Fix loading of plain-text pages; Normalize some behavior in page-sync code [#320]
This commit is contained in:
parent
9e5a13f54f
commit
2b4320b302
@ -91,9 +91,7 @@ public class Xosync_read_mgr implements Gfo_invk {
|
||||
|
||||
// auto-sync page
|
||||
Xoa_app app = wiki.App();
|
||||
Xoh_page hpg = new Xoh_page();
|
||||
update_mgr.Init_by_app(app);
|
||||
update_mgr.Init_by_page(wiki, hpg);
|
||||
Xowm_parse_data parse_data = update_mgr.Update(app.Wmf_mgr().Download_wkr(), wiki, page_ttl);
|
||||
if (parse_data == null)
|
||||
return rv;
|
||||
|
@ -35,10 +35,6 @@ public class Xosync_update_mgr {
|
||||
public void Init_by_app(Xoa_app app) {
|
||||
hctx.Init_by_app(app);
|
||||
}
|
||||
public void Init_by_page(Xow_wiki wiki, Xoa_page page) {
|
||||
hctx.Init_by_page(wiki, page);
|
||||
page.Hdump_mgr().Clear();
|
||||
}
|
||||
public Xowm_parse_data Update(Xof_download_wkr download_wkr, Xow_wiki wiki, Xoa_ttl page_ttl) {
|
||||
Xoh_page hpg = (Xoh_page)hctx.Page();
|
||||
|
||||
@ -97,6 +93,7 @@ public class Xosync_update_mgr {
|
||||
|
||||
// init_by_page for bldr, parser, hdoc
|
||||
hctx.Init_by_page(wiki, hpg);
|
||||
hpg.Hdump_mgr().Clear();
|
||||
hdoc_bldr.Init_by_page(bfr, hpg, hctx, src, 0, src_len);
|
||||
hdoc_parser_wkr.Init_by_page(hctx, src, 0, src_len);
|
||||
|
||||
|
@ -53,7 +53,7 @@ public class Xosync_page_loader {
|
||||
byte[] img_src_val = img_src_atr.Val();
|
||||
byte path_tid = Xosync_img_src_parser.Src_xo_trie.Match_byte_or(trv, img_src_val, Xosync_img_src_parser.Path__unknown);
|
||||
switch (path_tid) {
|
||||
case Xosync_img_src_parser.Path__file:
|
||||
case Xosync_img_src_parser.Path__file_wm:
|
||||
Add_img(wiki, hpg, img_tag, img_src_atr, img_src_val, path_tid, Xosync_img_src_parser.Bry__xowa_file, wiki.App().Fsys_mgr().File_dir().To_http_file_bry());
|
||||
break;
|
||||
case Xosync_img_src_parser.Path__math:
|
||||
@ -74,7 +74,7 @@ public class Xosync_page_loader {
|
||||
img_src_val = Bry_.Replace(img_src_val, src_find, src_repl);
|
||||
|
||||
// parse src
|
||||
img_src_parser.Parse(err_wkr, hctx, wiki.Domain_bry(), img_src_atr.Val_bgn(), img_src_atr.Val_end());
|
||||
img_src_parser.Parse(err_wkr, hctx, wiki.Domain_bry(), img_src_atr.Src(), img_src_atr.Val_bgn(), img_src_atr.Val_end());
|
||||
if (img_src_parser.File_ttl_bry() == null) return null; // skip images that don't follow format of "commons.wikimedia.org/thumb/7/70/A.png"; for example, enlarge buttons
|
||||
|
||||
// create img
|
||||
|
@ -17,7 +17,9 @@ package gplx.xowa.addons.wikis.pages.syncs.core.parsers; import gplx.*; import g
|
||||
import org.junit.*;
|
||||
import gplx.langs.htmls.*;
|
||||
public class Xosync_hdoc_parser__err__tst {
|
||||
@Before public void init() {fxt.Clear();} private final Xosync_hdoc_parser__fxt fxt = new Xosync_hdoc_parser__fxt();
|
||||
private final Xosync_hdoc_parser__fxt fxt = new Xosync_hdoc_parser__fxt();
|
||||
@Before public void init() {fxt.Init(false);}
|
||||
@After public void term() {fxt.Term();}
|
||||
@Test public void Url_does_not_start_with_upload_wikimedia_org() {
|
||||
fxt.Exec__parse(Gfh_utl.Replace_apos("<img src='//fail/wikipedia/commons/thumb/7/70/A.png/220px-A.png'>"))
|
||||
.Test__html(Gfh_utl.Replace_apos("<!--wm.parse:img src does not start with known sequence--><img src='//fail/wikipedia/commons/thumb/7/70/A.png/220px-A.png'>"));
|
||||
|
@ -17,7 +17,9 @@ package gplx.xowa.addons.wikis.pages.syncs.core.parsers; import gplx.*; import g
|
||||
import org.junit.*;
|
||||
import gplx.langs.htmls.*;
|
||||
public class Xosync_hdoc_parser__file__tst {
|
||||
@Before public void init() {fxt.Clear();} private final Xosync_hdoc_parser__fxt fxt = new Xosync_hdoc_parser__fxt();
|
||||
private final Xosync_hdoc_parser__fxt fxt = new Xosync_hdoc_parser__fxt();
|
||||
@Before public void init() {fxt.Init(true);}
|
||||
@After public void term() {fxt.Term();}
|
||||
@Test public void Commons__thumb() {
|
||||
fxt.Exec__parse(Gfh_utl.Replace_apos("<img src='//upload.wikimedia.org/wikipedia/commons/thumb/7/70/A.png/320px-A.png'>"))
|
||||
.Test__html(Gfh_utl.Replace_apos("<img src='xowa:/file/commons.wikimedia.org/thumb/7/0/1/c/A.png/320px.png'>"))
|
||||
@ -53,4 +55,9 @@ public class Xosync_hdoc_parser__file__tst {
|
||||
.Test__html(Gfh_utl.Replace_apos("<img src='xowa:/file/commons.wikimedia.org/thumb/7/6/9/a/A.djvu/320px-1.jpg'>"))
|
||||
.Test__fsdb(fxt.Make__fsdb(Bool_.Y, Bool_.N, "A.djvu", 320, -1, 1));
|
||||
}
|
||||
@Test public void Xo() {
|
||||
fxt.Exec__parse(Gfh_utl.Replace_apos("<img src='file:///mem/xowa/file/commons.wikimedia.org/thumb/7/0/1/c/A.png/320px.png'>"))
|
||||
.Test__html(Gfh_utl.Replace_apos("<img src='xowa:/file/commons.wikimedia.org/thumb/7/0/1/c/A.png/320px.png'>"))
|
||||
.Test__fsdb(fxt.Make__fsdb(Bool_.Y, Bool_.N, "A.png", 320, -1, -1));
|
||||
}
|
||||
}
|
||||
|
@ -22,12 +22,16 @@ public class Xosync_hdoc_parser__fxt {
|
||||
private final Bry_bfr tmp_bfr = Bry_bfr_.New();
|
||||
private final Xoh_page hpg = new Xoh_page();
|
||||
private Xowe_wiki wiki;
|
||||
public void Clear() {
|
||||
public void Init(boolean print_errors) {
|
||||
if (print_errors)
|
||||
Gfo_usr_dlg_.Instance = Gfo_usr_dlg_.Test_console();
|
||||
Xoae_app app = Xoa_app_fxt.Make__app__edit();
|
||||
this.wiki = Xoa_app_fxt.Make__wiki__edit(app);
|
||||
Xoa_app_fxt.repo2_(app, wiki);
|
||||
mgr.Init_by_app(app);
|
||||
mgr.Init_by_page(wiki, hpg);
|
||||
}
|
||||
public void Term() {
|
||||
Gfo_usr_dlg_.Instance = Gfo_usr_dlg_.Noop;
|
||||
}
|
||||
public Xosync_hdoc_parser__fxt Exec__parse(String raw) {
|
||||
mgr.Parse(hpg, wiki, Bry_.Empty, Bry_.new_u8(raw));
|
||||
|
@ -17,7 +17,9 @@ package gplx.xowa.addons.wikis.pages.syncs.core.parsers; import gplx.*; import g
|
||||
import org.junit.*;
|
||||
import gplx.langs.htmls.*;
|
||||
public class Xosync_hdoc_parser__misc__tst {
|
||||
@Before public void init() {fxt.Clear();} private final Xosync_hdoc_parser__fxt fxt = new Xosync_hdoc_parser__fxt();
|
||||
private final Xosync_hdoc_parser__fxt fxt = new Xosync_hdoc_parser__fxt();
|
||||
@Before public void init() {fxt.Init(true);}
|
||||
@After public void term() {fxt.Term();}
|
||||
@Test public void Math() {
|
||||
fxt.Exec__parse(Gfh_utl.Replace_apos("<img src='https://wikimedia.org/api/rest_v1/media/math/render/svg/596f8baf206a81478afd4194b44138715dc1a05c' class='mwe-math-fallback-image-inline' aria-hidden='true' style='vertical-align: -2.005ex; width:16.822ex; height:6.176ex;' alt='R_{H}=a\\left({\\frac {m}{3M}}\\right)^{\\frac {1}{3}}'>"))
|
||||
.Test__html(Gfh_utl.Replace_apos("<img src='xowa:/math/596f8baf206a81478afd4194b44138715dc1a05c.svg' class='mwe-math-fallback-image-inline' aria-hidden='true' style='vertical-align: -2.005ex; width:16.822ex; height:6.176ex;' alt='R_{H}=a\\left({\\frac {m}{3M}}\\right)^{\\frac {1}{3}}'>"))
|
||||
|
@ -17,7 +17,9 @@ package gplx.xowa.addons.wikis.pages.syncs.core.parsers; import gplx.*; import g
|
||||
import org.junit.*;
|
||||
import gplx.langs.htmls.*;
|
||||
public class Xosync_hdoc_parser__tst {
|
||||
@Before public void init() {fxt.Clear();} private final Xosync_hdoc_parser__fxt fxt = new Xosync_hdoc_parser__fxt();
|
||||
private final Xosync_hdoc_parser__fxt fxt = new Xosync_hdoc_parser__fxt();
|
||||
@Before public void init() {fxt.Init(true);}
|
||||
@After public void term() {fxt.Term();}
|
||||
@Test public void Remove_edit() {
|
||||
fxt.Exec__parse(Gfh_utl.Replace_apos_concat_lines
|
||||
( "<h2><span class='mw-headline' id='Section_1'>Section_1</span>"
|
||||
|
@ -16,19 +16,21 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
package gplx.xowa.addons.wikis.pages.syncs.core.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.pages.*; import gplx.xowa.addons.wikis.pages.syncs.*; import gplx.xowa.addons.wikis.pages.syncs.core.*;
|
||||
import gplx.core.brys.*; import gplx.core.btries.*;
|
||||
import gplx.xowa.files.*; import gplx.xowa.files.repos.*; import gplx.xowa.files.imgs.*;
|
||||
import gplx.langs.htmls.*; import gplx.xowa.htmls.core.wkrs.*;
|
||||
import gplx.langs.htmls.*; import gplx.xowa.htmls.core.wkrs.*; import gplx.xowa.htmls.core.wkrs.imgs.atrs.*;
|
||||
import gplx.xowa.wikis.domains.*;
|
||||
public class Xosync_img_src_parser {
|
||||
private final Bry_rdr rdr = new Bry_rdr().Dflt_dlm_(Byte_ascii.Slash);
|
||||
private final Xof_url_bldr url_bldr = Xof_url_bldr.new_v2();
|
||||
private final Bry_bfr tmp_bfr = Bry_bfr_.New();
|
||||
private final byte[] wiki_abrv_commons;
|
||||
private final Xoh_img_src_data img_src_parser = new Xoh_img_src_data();
|
||||
|
||||
private Xoh_hdoc_ctx hctx;
|
||||
private byte path_tid;
|
||||
private byte[] img_src_bgn_local, img_src_bgn_remote;
|
||||
private byte[] page_url, repo_local;
|
||||
private byte[] raw;
|
||||
private int raw_len;
|
||||
|
||||
public boolean Repo_is_commons() {return repo_is_commons;} private boolean repo_is_commons;
|
||||
public byte[] File_ttl_bry() {return file_ttl_bry;} private byte[] file_ttl_bry;
|
||||
@ -58,33 +60,60 @@ public class Xosync_img_src_parser {
|
||||
|
||||
// set raw, raw_len; exit if empty
|
||||
this.raw = raw;
|
||||
int raw_len = raw.length;
|
||||
this.raw_len = raw.length;
|
||||
if (raw_len == 0) return Fail("empty img_src");
|
||||
rdr.Init_by_src(raw);
|
||||
|
||||
// check "//upload.wikimedia.org/" at bgn
|
||||
// identify type based on beginning; EX: "//upload.wikimedia.org/" -> WM_FILE
|
||||
this.path_tid = rdr.Chk_or(path_trie, Path__unknown);
|
||||
switch (path_tid) {
|
||||
case Path__file: return Parse_file(raw_len);
|
||||
case Path__math: return Parse_math(raw_len);
|
||||
default: return Fail("img src does not start with known sequence");
|
||||
// "//upload.wikimedia.org/"
|
||||
case Path__file_wm:
|
||||
return Parse_file_wm();
|
||||
// "https://wikimedia.org/api/rest_v1/media/math/render/svg/"
|
||||
case Path__math:
|
||||
return Parse_math();
|
||||
// "file:///"
|
||||
case Path__file_xo:
|
||||
return Parse_file_xo();
|
||||
default:
|
||||
return Fail("img src does not start with known sequence");
|
||||
}
|
||||
}
|
||||
private boolean Parse_file(int raw_len) {
|
||||
// get repo: either "wikipedia/commons/" or "wiki_type/wiki_lang/"; EX:"wiktionary/fr"
|
||||
private boolean Parse_file_xo() {
|
||||
img_src_parser.Clear();
|
||||
boolean rv = img_src_parser.Parse(rdr.Err_wkr(), hctx, hctx.Wiki__domain_bry(), raw, 0, raw_len);
|
||||
if (rv) {
|
||||
this.repo_is_commons = img_src_parser.Repo_is_commons();
|
||||
this.file_is_orig = img_src_parser.File_is_orig();
|
||||
this.file_ttl_bry = img_src_parser.File_ttl_bry();
|
||||
this.file_ext = Xof_ext_.new_by_ttl_(file_ttl_bry);
|
||||
if (!file_is_orig) {
|
||||
this.file_w = img_src_parser.File_w();
|
||||
if (img_src_parser.File_page_exists())
|
||||
this.file_page = img_src_parser.File_page();
|
||||
if (img_src_parser.File_time_exists())
|
||||
this.file_time = img_src_parser.File_time();
|
||||
}
|
||||
Add_img(hctx.Wiki__domain_itm().Abrv_xo());
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
private boolean Parse_file_wm() {
|
||||
// set repo: either "wikipedia/commons/" or "wiki_type/wiki_lang/"; EX:"wiktionary/fr"
|
||||
if (rdr.Is(Bry__repo_remote))
|
||||
this.repo_is_commons = true;
|
||||
else {
|
||||
if (!rdr.Is(repo_local)) return Fail("unknown repo");
|
||||
}
|
||||
|
||||
// get file_is_orig; note omitting "else" b/c default is file_is_orig == false
|
||||
// set file_is_orig; note omitting "else" b/c default is file_is_orig == false
|
||||
if (!rdr.Is(Bry__thumb)) file_is_orig = true; // no "/thumb";
|
||||
|
||||
// check md5
|
||||
// check md5 and skip
|
||||
if (!Check_md5()) return Fail("invalid md5");
|
||||
|
||||
// get file_ttl
|
||||
// set file_ttl
|
||||
int file_ttl_bgn = rdr.Pos();
|
||||
int file_ttl_end = rdr.Find_fwd_lr_or(Byte_ascii.Slash, raw_len);
|
||||
file_ttl_bry = Bry_.Mid(raw, file_ttl_bgn, file_ttl_end);
|
||||
@ -92,7 +121,6 @@ public class Xosync_img_src_parser {
|
||||
this.file_ext = Xof_ext_.new_by_ttl_(file_ttl_bry);
|
||||
if (file_ext.Id_is_ogg()) file_ext = Xof_ext_.new_by_id_(Xof_ext_.Id_ogv);
|
||||
|
||||
|
||||
// if thumb, get file_w, file_time, file_page
|
||||
if (!file_is_orig) {
|
||||
// if "page", then file_page exists; EX: // "page1-320px"
|
||||
@ -110,7 +138,7 @@ public class Xosync_img_src_parser {
|
||||
if (file_w == -1) return Fail("invalid file_w");
|
||||
|
||||
// get time via "-seek%3D"; EX: "320px-seek%3D67-"
|
||||
int seek_end = rdr.Find_fwd_rr(Bry__seek);
|
||||
int seek_end = rdr.Find_fwd_rr_or(Bry__seek, Bry_find_.Not_found);
|
||||
if (seek_end != Bry_find_.Not_found) {
|
||||
int file_time_bgn = rdr.Pos();
|
||||
int file_time_end = rdr.Find_fwd_lr(Byte_ascii.Dash);
|
||||
@ -118,11 +146,11 @@ public class Xosync_img_src_parser {
|
||||
}
|
||||
}
|
||||
|
||||
// make image
|
||||
// register image
|
||||
Add_img(hctx.Wiki__domain_itm().Abrv_xo());
|
||||
return true;
|
||||
}
|
||||
private boolean Parse_math(int raw_len) {
|
||||
private boolean Parse_math() {
|
||||
// set file_ttl_bry to rest of src + ".svg"; EX: "https://wikimedia.org/api/rest_v1/media/math/render/svg/596f8baf206a81478afd4194b44138715dc1a05c" -> "596f8baf206a81478afd4194b44138715dc1a05c.svg"
|
||||
this.file_ttl_bry = Bry_.Add(Bry_.Mid(raw, rdr.Pos(), raw_len), Byte_ascii.Dot_bry, Xof_ext_.Bry_svg);
|
||||
this.repo_is_commons = true;
|
||||
@ -139,8 +167,13 @@ public class Xosync_img_src_parser {
|
||||
}
|
||||
public byte[] To_bry() {
|
||||
switch (path_tid) {
|
||||
case Path__file: To_bfr_file(tmp_bfr); break;
|
||||
case Path__math: To_bfr_math(tmp_bfr); break;
|
||||
case Path__file_wm:
|
||||
case Path__file_xo:
|
||||
To_bfr_file(tmp_bfr);
|
||||
break;
|
||||
case Path__math:
|
||||
To_bfr_math(tmp_bfr);
|
||||
break;
|
||||
}
|
||||
return tmp_bfr.To_bry_and_clear();
|
||||
}
|
||||
@ -169,7 +202,7 @@ public class Xosync_img_src_parser {
|
||||
}
|
||||
private boolean Fail(String fmt) {
|
||||
this.err_msg = "wm.parse:" + fmt;
|
||||
String msg = String_.Format("", err_msg + "; page={0} raw={1}", page_url, raw);
|
||||
String msg = String_.Format(err_msg + "; page={0} raw={1}", page_url, raw);
|
||||
Gfo_usr_dlg_.Instance.Warn_many("", "", msg);
|
||||
return false;
|
||||
}
|
||||
@ -183,6 +216,19 @@ public class Xosync_img_src_parser {
|
||||
return true;
|
||||
}
|
||||
|
||||
public static final byte Path__unknown = 0, Path__file_wm = 1, Path__math = 2, Path__file_xo = 3;
|
||||
private final Btrie_slim_mgr path_trie = Btrie_slim_mgr.cs()
|
||||
.Add_str_byte("//upload.wikimedia.org/", Path__file_wm)
|
||||
.Add_str_byte("https://wikimedia.org/api/rest_v1/media/math/render/svg/", Path__math)
|
||||
.Add_str_byte("file:///", Path__file_xo)
|
||||
;
|
||||
|
||||
public static final byte[] Bry__xowa_file = Bry_.new_a7("xowa:/file/"), Bry__xowa_math = Bry_.new_a7("xowa:/math/");
|
||||
public static Btrie_slim_mgr Src_xo_trie = Btrie_slim_mgr.cs()
|
||||
.Add_bry_byte(Bry__xowa_file, Path__file_wm)
|
||||
.Add_bry_byte(Bry__xowa_math, Path__math)
|
||||
;
|
||||
|
||||
private static final byte[]
|
||||
Bry__repo_remote = Bry_.new_a7("wikipedia/commons/")
|
||||
, Bry__thumb = Bry_.new_a7("thumb/")
|
||||
@ -190,17 +236,6 @@ public class Xosync_img_src_parser {
|
||||
, Bry__seek = Bry_.new_a7("-seek%3D")
|
||||
, Bry__page = Bry_.new_a7("page")
|
||||
;
|
||||
public static final byte Path__unknown = 0, Path__file = 1, Path__math = 2;
|
||||
private final Btrie_slim_mgr path_trie = Btrie_slim_mgr.cs()
|
||||
.Add_str_byte("//upload.wikimedia.org/", Path__file)
|
||||
.Add_str_byte("https://wikimedia.org/api/rest_v1/media/math/render/svg/", Path__math)
|
||||
;
|
||||
|
||||
public static final byte[] Bry__xowa_file = Bry_.new_a7("xowa:/file/"), Bry__xowa_math = Bry_.new_a7("xowa:/math/");
|
||||
public static Btrie_slim_mgr Src_xo_trie = Btrie_slim_mgr.cs()
|
||||
.Add_bry_byte(Bry__xowa_file, Path__file)
|
||||
.Add_bry_byte(Bry__xowa_math, Path__math)
|
||||
;
|
||||
|
||||
private static byte[] To_wmf_repo_or_null(Bry_bfr bfr, Xow_domain_itm domain_itm) {
|
||||
// add type; EX: "fr.wiktionary.org" -> "wiktionary/"
|
||||
|
@ -30,8 +30,6 @@ public class Sync_html_special implements Xow_special_page {
|
||||
// update
|
||||
Xosync_update_mgr updater = new Xosync_update_mgr();
|
||||
updater.Init_by_app(wiki.App());
|
||||
Xoh_page hpg = new Xoh_page();
|
||||
updater.Init_by_page(wiki, hpg);
|
||||
updater.Update(wiki.App().Wmf_mgr().Download_wkr(), wiki, redirect_ttl);
|
||||
((Xowe_wiki)wiki).Data_mgr().Redirect((Xoae_page)page, redirect_bry); // HACK: should call page.Redirect_trail() below, but need to handle Display_ttl
|
||||
// page.Redirect_trail().Itms__add__article(redirect_url, redirect_ttl, null);
|
||||
|
@ -27,6 +27,9 @@ public class Xow_hdump_mgr {
|
||||
public Xow_hdump_mgr__save Save_mgr() {return save_mgr;} private Xow_hdump_mgr__save save_mgr;
|
||||
public Xow_hdump_mgr__load Load_mgr() {return load_mgr;} private Xow_hdump_mgr__load load_mgr;
|
||||
public Xoh_hzip_mgr Hzip_mgr() {return hzip_mgr;} private final Xoh_hzip_mgr hzip_mgr = new Xoh_hzip_mgr();
|
||||
public void Init_by_app(Xoae_app app) {
|
||||
save_mgr.Init_by_app(app);
|
||||
}
|
||||
public void Init_by_db(Xow_wiki wiki) {
|
||||
byte dflt_zip_tid = gplx.core.ios.streams.Io_stream_tid_.Tid__raw;
|
||||
boolean dflt_hzip_enable = false;
|
||||
@ -41,7 +44,7 @@ public class Xow_hdump_mgr {
|
||||
load_mgr.Init_by_wiki(wiki);
|
||||
}
|
||||
public void Init_by_db(byte dflt_zip_tid, boolean dflt_hzip_enable, boolean mode_is_b256) {
|
||||
int dflt_hzip_tid = dflt_hzip_enable ? Xoh_hzip_dict_.Hzip__v1 : Xoh_hzip_dict_.Hzip__none;
|
||||
int dflt_hzip_tid = dflt_hzip_enable ? Xoh_hzip_dict_.Hzip__v1 : Xoh_hzip_dict_.Hzip__plain;
|
||||
save_mgr.Init_by_db(dflt_zip_tid, dflt_hzip_tid, Bool_.N);
|
||||
}
|
||||
}
|
||||
|
@ -16,13 +16,18 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
package gplx.xowa.htmls.core; import gplx.*; import gplx.xowa.*; import gplx.xowa.htmls.*;
|
||||
import gplx.xowa.htmls.core.htmls.*; import gplx.xowa.htmls.core.wkrs.*; import gplx.xowa.htmls.core.hzips.*; import gplx.xowa.htmls.heads.*; import gplx.xowa.htmls.core.dbs.*;
|
||||
import gplx.core.ios.*; import gplx.core.primitives.*; import gplx.xowa.wikis.data.*; import gplx.xowa.wikis.pages.*;
|
||||
import gplx.xowa.addons.wikis.pages.syncs.core.*;
|
||||
public class Xow_hdump_mgr__save {
|
||||
private final Xow_wiki wiki; private final Xoh_hzip_mgr hzip_mgr; private final Io_stream_zip_mgr zip_mgr;
|
||||
private final Xosync_update_mgr update_mgr = new Xosync_update_mgr();
|
||||
private final Xoh_page tmp_hpg; private final Xoh_hzip_bfr tmp_bfr = Xoh_hzip_bfr.New_txt(32); private Bool_obj_ref html_db_is_new = Bool_obj_ref.n_();
|
||||
private int dflt_zip_tid, dflt_hzip_tid;
|
||||
public Xow_hdump_mgr__save(Xow_wiki wiki, Xoh_hzip_mgr hzip_mgr, Io_stream_zip_mgr zip_mgr, Xoh_page tmp_hpg) {
|
||||
this.wiki = wiki; this.hzip_mgr = hzip_mgr; this.zip_mgr = zip_mgr; this.tmp_hpg = tmp_hpg;
|
||||
}
|
||||
public void Init_by_app(Xoae_app app) {
|
||||
update_mgr.Init_by_app(app);
|
||||
}
|
||||
public void Init_by_db(int dflt_zip_tid, int dflt_hzip_tid, boolean mode_is_b256) {
|
||||
this.dflt_zip_tid = dflt_zip_tid; this.dflt_hzip_tid = dflt_hzip_tid; tmp_bfr.Mode_is_b256_(mode_is_b256);
|
||||
}
|
||||
@ -32,12 +37,12 @@ public class Xow_hdump_mgr__save {
|
||||
Bld_hdump(page);
|
||||
tmp_hpg.Ctor_by_hdiff(tmp_bfr, page, page.Wikie().Msg_mgr().Val_by_id(gplx.xowa.langs.msgs.Xol_msg_itm_.Id_toc));
|
||||
Xow_db_file html_db = Get_html_db(wiki, page, html_db_is_new.Val_n_());
|
||||
return Save(tmp_hpg, html_db.Tbl__html(), html_db_is_new.Val(), true);
|
||||
return Save(page, tmp_hpg, html_db.Tbl__html(), html_db_is_new.Val(), true);
|
||||
}
|
||||
}
|
||||
public int Save(Xoh_page hpg, Xowd_html_tbl html_tbl, boolean insert, boolean use_hzip_dflt) {
|
||||
public int Save(Xoae_page page, Xoh_page hpg, Xowd_html_tbl html_tbl, boolean insert, boolean use_hzip_dflt) {
|
||||
int hzip_tid = use_hzip_dflt ? dflt_hzip_tid : Xoh_hzip_dict_.Hzip__none;
|
||||
byte[] db_body = Write(tmp_bfr, wiki, hpg, hzip_mgr, zip_mgr, dflt_zip_tid, hzip_tid, hpg.Db().Html().Html_bry());
|
||||
byte[] db_body = Write(tmp_bfr, wiki, page, hpg, hzip_mgr, zip_mgr, dflt_zip_tid, hzip_tid, hpg.Db().Html().Html_bry());
|
||||
if (insert) html_tbl.Insert(hpg, dflt_zip_tid, dflt_hzip_tid, db_body);
|
||||
else html_tbl.Update(hpg, dflt_zip_tid, dflt_hzip_tid, db_body);
|
||||
return db_body.length;
|
||||
@ -47,8 +52,16 @@ public class Xow_hdump_mgr__save {
|
||||
wiki.Html__wtr_mgr().Wkr(Xopg_page_.Tid_read).Write_body(tmp_bfr, page.Wikie().Parser_mgr().Ctx(), Xoh_wtr_ctx.Hdump, page); // save as hdump_fmt
|
||||
page.Db().Html().Html_bry_(tmp_bfr.To_bry_and_clear());
|
||||
}
|
||||
private byte[] Write(Xoh_hzip_bfr bfr, Xow_wiki wiki, Xoh_page hpg, Xoh_hzip_mgr hzip_mgr, Io_stream_zip_mgr zip_mgr, int zip_tid, int hzip_tid, byte[] src) {
|
||||
if (hzip_tid != Xoh_hzip_dict_.Hzip__none) src = hzip_mgr.Encode_as_bry((Xoh_hzip_bfr)bfr.Clear(), wiki, hpg, src);
|
||||
private byte[] Write(Xoh_hzip_bfr bfr, Xow_wiki wiki, Xoae_page page, Xoh_page hpg, Xoh_hzip_mgr hzip_mgr, Io_stream_zip_mgr zip_mgr, int zip_tid, int hzip_tid, byte[] src) {
|
||||
switch (hzip_tid) {
|
||||
case Xoh_hzip_dict_.Hzip__none:
|
||||
update_mgr.Parse(hpg, wiki, page.Url_bry_safe(), src);
|
||||
src = hpg.Db().Html().Html_bry();
|
||||
break;
|
||||
case Xoh_hzip_dict_.Hzip__v1:
|
||||
src = hzip_mgr.Encode_as_bry((Xoh_hzip_bfr)bfr.Clear(), wiki, hpg, src);
|
||||
break;
|
||||
}
|
||||
src_as_hzip = src;
|
||||
if (zip_tid > gplx.core.ios.streams.Io_stream_tid_.Tid__raw)
|
||||
src = zip_mgr.Zip((byte)zip_tid, src);
|
||||
|
@ -40,6 +40,7 @@ public class Xob_hdump_bldr implements Gfo_invk {
|
||||
this.toc_label = wiki.Msg_mgr().Val_by_id(gplx.xowa.langs.msgs.Xol_msg_itm_.Id_toc);
|
||||
|
||||
if (zip_tid == Byte_.Max_value_127) zip_tid = Xobldr_cfg.Zip_mode__html(wiki.App());
|
||||
hdump_mgr.Init_by_app(wiki.Appe());
|
||||
hdump_mgr.Init_by_db(zip_tid, hzip_enabled, hzip_b256);
|
||||
return true;
|
||||
}
|
||||
@ -63,7 +64,7 @@ public class Xob_hdump_bldr implements Gfo_invk {
|
||||
|
||||
// save to db
|
||||
Xowd_html_tbl html_tbl = html_tbl_retriever.Get_html_tbl(wpg.Ttl().Ns(), prv_row_len); // get html_tbl
|
||||
this.prv_row_len = hdump_mgr.Save_mgr().Save(tmp_hpg.Ctor_by_hdiff(tmp_bfr, wpg, toc_label), html_tbl, true, is_wikitext); // save to db
|
||||
this.prv_row_len = hdump_mgr.Save_mgr().Save(wpg, tmp_hpg.Ctor_by_hdiff(tmp_bfr, wpg, toc_label), html_tbl, true, is_wikitext); // save to db
|
||||
stat_tbl.Insert(tmp_hpg, stat_itm, wpg.Root().Root_src().length, tmp_hpg.Db().Html().Html_bry().length, prv_row_len); // save stats
|
||||
|
||||
// run hzip diff if enabled
|
||||
|
@ -51,9 +51,9 @@ public class Xoh_hzip_dict_ {
|
||||
, Key__pgbnr = "pgbnr"
|
||||
, Key__media = "media"
|
||||
;
|
||||
public static final int // SERIALIZED
|
||||
Hzip__none = 0
|
||||
, Hzip__v1 = 1
|
||||
, Hzip__plain = 2 // used for page_sync
|
||||
;
|
||||
public static final int // SERIALIZED:html_db.html.body_flag
|
||||
Hzip__none = 0
|
||||
, Hzip__v1 = 1
|
||||
, Hzip__plain = 2 // used for page_sync
|
||||
;
|
||||
}
|
||||
|
@ -20,8 +20,8 @@ import gplx.xowa.htmls.core.hzips.*;
|
||||
import gplx.xowa.htmls.core.wkrs.hdrs.*; import gplx.xowa.htmls.core.wkrs.imgs.*;
|
||||
public class Xoh_hdoc_wkr__make implements Xoh_hdoc_wkr {
|
||||
private Xoh_hzip_bfr bfr; private Xoh_page hpg; private Xoh_hdoc_ctx hctx; private byte[] src;
|
||||
private final Xoh_hdr_wtr wkr__hdr = new Xoh_hdr_wtr();
|
||||
private final Xoh_img_wtr wkr__img = new Xoh_img_wtr();
|
||||
private final Xoh_hdr_wtr wkr__hdr = new Xoh_hdr_wtr();
|
||||
private final Xoh_img_wtr wkr__img = new Xoh_img_wtr();
|
||||
public void On_new_page(Xoh_hzip_bfr bfr, Xoh_page hpg, Xoh_hdoc_ctx hctx, byte[] src, int src_bgn, int src_end) {
|
||||
this.bfr = bfr; this.hpg = hpg; this.hctx = hctx; this.src = src;
|
||||
}
|
||||
@ -29,16 +29,24 @@ public class Xoh_hdoc_wkr__make implements Xoh_hdoc_wkr {
|
||||
public void On_escape (gplx.xowa.htmls.core.wkrs.escapes.Xoh_escape_data data) {bfr.Add(data.Hook());}
|
||||
public void On_xnde (gplx.xowa.htmls.core.wkrs.xndes.Xoh_xnde_parser data) {bfr.Add_mid(src, data.Src_bgn(), data.Src_end());}
|
||||
public void On_lnki (gplx.xowa.htmls.core.wkrs.lnkis.Xoh_lnki_data data) {bfr.Add_mid(src, data.Src_bgn(), data.Src_end());}
|
||||
public void On_thm (gplx.xowa.htmls.core.wkrs.thms.Xoh_thm_data data) {bfr.Add_mid(src, data.Src_bgn(), data.Src_end());}
|
||||
public void On_gly (gplx.xowa.htmls.core.wkrs.glys.Xoh_gly_grp_data data) {}
|
||||
public void On_thm (gplx.xowa.htmls.core.wkrs.thms.Xoh_thm_data data) {
|
||||
Xoh_img_data img_data = (gplx.xowa.htmls.core.wkrs.imgs.Xoh_img_data)data.Img_data();
|
||||
bfr.Add_mid(src, data.Src_bgn(), img_data.Src_bgn());
|
||||
wkr__img.Init_by_parse(bfr, hpg, hctx, src, img_data);
|
||||
bfr.Add_mid(src, img_data.Src_end(), data.Src_end());
|
||||
}
|
||||
public void On_gly (gplx.xowa.htmls.core.wkrs.glys.Xoh_gly_grp_data data) {
|
||||
bfr.Add_mid(src, data.Src_bgn(), data.Src_end());
|
||||
}
|
||||
public boolean Process_parse(Xoh_data_itm data) {
|
||||
switch (data.Tid()) {
|
||||
case Xoh_hzip_dict_.Tid__img: wkr__img.Init_by_parse(bfr, hpg, hctx, src, (gplx.xowa.htmls.core.wkrs.imgs.Xoh_img_data)data); return true;
|
||||
case Xoh_hzip_dict_.Tid__hdr: wkr__hdr.Init_by_parse(bfr, hpg, hctx, src, (gplx.xowa.htmls.core.wkrs.hdrs.Xoh_hdr_data)data); return true;
|
||||
case Xoh_hzip_dict_.Tid__lnke: break;
|
||||
default: throw Err_.new_unhandled(data.Tid());
|
||||
case Xoh_hzip_dict_.Tid__img: return wkr__img.Init_by_parse(bfr, hpg, hctx, src, (gplx.xowa.htmls.core.wkrs.imgs.Xoh_img_data)data);
|
||||
case Xoh_hzip_dict_.Tid__hdr: return wkr__hdr.Init_by_parse(bfr, hpg, hctx, src, (gplx.xowa.htmls.core.wkrs.hdrs.Xoh_hdr_data)data);
|
||||
case Xoh_hzip_dict_.Tid__lnke:
|
||||
default:
|
||||
bfr.Add_mid(src, data.Src_bgn(), data.Src_end());
|
||||
break;
|
||||
}
|
||||
bfr.Add_mid(src, data.Src_bgn(), data.Src_end());
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
@ -41,7 +41,7 @@ public abstract class Xoh_itm_parser_fxt {
|
||||
Xoh_page hpg = new Xoh_page(); // NOTE: no need to pass url and ttl now
|
||||
hctx.Init_by_page(wiki, hpg);
|
||||
err_wkr.Init_by_page(Xoa_page_.Main_page_str, src);
|
||||
Exec_parse_hook(err_wkr, hctx, 0, src_len);
|
||||
Exec_parse_hook(err_wkr, hctx, src, 0, src_len);
|
||||
}
|
||||
public abstract void Exec_parse_hook(Bry_err_wkr err_wkr, Xoh_hdoc_ctx hctx, int src_bgn, int src_end);
|
||||
public abstract void Exec_parse_hook(Bry_err_wkr err_wkr, Xoh_hdoc_ctx hctx, byte[] src, int src_bgn, int src_end);
|
||||
}
|
||||
|
@ -20,9 +20,10 @@ import gplx.xowa.htmls.sections.*; import gplx.xowa.htmls.core.hzips.*;
|
||||
public class Xoh_hdr_wtr implements gplx.core.brys.Bfr_arg, Xoh_wtr_itm {
|
||||
private int hdr_num; private byte[] hdr_id, hdr_content, hdr_capt_rhs;
|
||||
private Xoh_page hpg;
|
||||
public void Init_by_parse(Bry_bfr bfr, Xoh_page hpg, Xoh_hdoc_ctx hctx, byte[] src, Xoh_hdr_data data) {
|
||||
Init_by_decode(hpg, hctx, src, data);
|
||||
public boolean Init_by_parse(Bry_bfr bfr, Xoh_page hpg, Xoh_hdoc_ctx hctx, byte[] src, Xoh_hdr_data data) {
|
||||
if (!Init_by_decode(hpg, hctx, src, data)) return false;
|
||||
this.Bfr_arg__add(bfr);
|
||||
return true;
|
||||
}
|
||||
public boolean Init_by_decode(Xoh_page hpg, Xoh_hdoc_ctx hctx, byte[] src, Xoh_data_itm data_itm) {
|
||||
this.hpg = hpg;
|
||||
|
@ -56,9 +56,10 @@ public class Xoh_img_wtr implements Bfr_arg, Xoh_wtr_itm {
|
||||
}
|
||||
public Xoh_img_wtr Anch_cls_(byte[] v) {anch_cls.Set_by_bry(v); return this;}
|
||||
public Xoh_img_wtr Img_id_(int uid) {img_id.Set(uid); return this;}
|
||||
public void Init_by_parse(Bry_bfr bfr, Xoh_page hpg, Xoh_hdoc_ctx hctx, byte[] src, Xoh_img_data data) {
|
||||
Init_by_decode(hpg, hctx, src, data);
|
||||
public boolean Init_by_parse(Bry_bfr bfr, Xoh_page hpg, Xoh_hdoc_ctx hctx, byte[] src, Xoh_img_data data) {
|
||||
if (!Init_by_decode(hpg, hctx, src, data)) return false;
|
||||
this.Bfr_arg__add(bfr);
|
||||
return true;
|
||||
}
|
||||
public void Init_html(int html_w, int html_h, byte[] src_bry) {
|
||||
img_w.Set_by_int(html_w);
|
||||
@ -98,12 +99,13 @@ public class Xoh_img_wtr implements Bfr_arg, Xoh_wtr_itm {
|
||||
else if (data.Img_w() != -1) { // orig exists or some hard-coded image (hiero)
|
||||
Xoh_img_src_data img_src_data = data.Img_src();
|
||||
|
||||
this.Init_html(data.Img_w(), data.Img_h(), img_src_data.Src_bry());
|
||||
byte[] img_src_bry = img_src_data.Src_mid();
|
||||
this.Init_html(data.Img_w(), data.Img_h(), img_src_bry);
|
||||
int file_w = data.Img_src().File_w();
|
||||
// NOTE: init lnki with "64|file_w|-1|-1|-1|-1"; DATE:2016-08-10
|
||||
fsdb_itm.Init_at_lnki(Xof_exec_tid.Tid_wiki_page, hctx.Cache__wiki_abrv(img_src_data.Repo_is_commons()), lnki_ttl, Xop_lnki_type.Tid_orig_known, Xop_lnki_tkn.Upright_null, file_w, Xof_img_size.Null, data.Img_src().File_time(), data.Img_src().File_page(), Xof_patch_upright_tid_.Tid_all);
|
||||
fsdb_itm.Init_at_gallery_bgn(data.Img_w(), data.Img_h(), file_w);
|
||||
fsdb_itm.Html_view_url_(Io_url_.New__http_or_fail(img_src_data.Src_bry()));
|
||||
fsdb_itm.Html_view_url_(Io_url_.New__http_or_fail(img_src_bry));
|
||||
fsdb_itm.File_is_orig_(data.Img_src().File_is_orig());
|
||||
|
||||
// ASSUME: if file_w != img_w, then page has packed gallery; PAGE:en.w:Mexico; DATE:2016-08-14
|
||||
|
@ -45,6 +45,7 @@ public class Xoh_img_cls_data implements Bfr_arg_clearable {
|
||||
if (pos < src_end && src[pos] == Byte_ascii.Space)
|
||||
++pos;
|
||||
if (cls_tid == Xoh_img_cls_.Tid__manual || pos < src_end) {
|
||||
this.src = atr.Src(); // set src, else NPE when trying to write to bfr; DATE:2019-01-04
|
||||
this.other_bgn = pos;
|
||||
this.other_end = src_end;
|
||||
}
|
||||
|
@ -21,9 +21,10 @@ import gplx.xowa.wikis.domains.*;
|
||||
public class Xoh_img_src_data implements Bfr_arg_clearable, Xoh_itm_parser {
|
||||
private final Bry_rdr rdr = new Bry_rdr().Dflt_dlm_(Byte_ascii.Slash);
|
||||
private boolean fail_throws_err;
|
||||
public byte[] Src_bry() {return src_bry;} private byte[] src_bry;
|
||||
private byte[] src_bry;
|
||||
public int Src_bgn() {return src_bgn;} private int src_bgn;
|
||||
public int Src_end() {return src_end;} private int src_end;
|
||||
public byte[] Src_mid() {return src_mid;} private byte[] src_mid;
|
||||
public int Repo_bgn() {return repo_bgn;} private int repo_bgn;
|
||||
public int Repo_end() {return repo_end;} private int repo_end;
|
||||
public boolean Repo_is_commons() {return repo_is_commons;} private boolean repo_is_commons;
|
||||
@ -49,13 +50,14 @@ public class Xoh_img_src_data implements Bfr_arg_clearable, Xoh_itm_parser {
|
||||
public boolean Parse(Bry_err_wkr err_wkr, Xoh_hdoc_ctx hctx, byte[] domain_bry, Gfh_tag tag) {
|
||||
this.Clear();
|
||||
Gfh_atr atr = tag.Atrs__get_by_or_empty(Gfh_atr_.Bry__src);
|
||||
return Parse(err_wkr, hctx, domain_bry, atr.Val_bgn(), atr.Val_end());
|
||||
return Parse(err_wkr, hctx, domain_bry, atr.Src(), atr.Val_bgn(), atr.Val_end());
|
||||
}
|
||||
public boolean Parse(Bry_err_wkr err_wkr, Xoh_hdoc_ctx hctx, byte[] domain_bry, int src_bgn, int src_end) { // EX: src="file:///C:/xowa/file/commons.wikimedia.org/thumb/7/0/1/2/A.png/220px.png"
|
||||
public boolean Parse(Bry_err_wkr err_wkr, Xoh_hdoc_ctx hctx, byte[] domain_bry, byte[] src_bry, int src_bgn, int src_end) { // EX: src="file:///C:/xowa/file/commons.wikimedia.org/thumb/7/0/1/2/A.png/220px.png"
|
||||
this.Clear();
|
||||
this.src_bry = err_wkr.Src();
|
||||
this.src_bry = src_bry;
|
||||
this.src_bgn = src_bgn; this.src_end = src_end;
|
||||
if (src_end == src_bgn) return true; // empty src; just return true;
|
||||
this.src_mid = Bry_.Mid(src_bry, src_bgn, src_end);
|
||||
|
||||
// get repo_bgn; note that some <img> may be hiero / enlarge / magnify and should exit
|
||||
rdr.Init_by_wkr(err_wkr, "img.src.xowa", src_bgn, src_end).Fail_throws_err_(fail_throws_err);
|
||||
|
@ -59,8 +59,8 @@ class Xoh_img_src_data_fxt extends Xoh_itm_parser_fxt { private final Xoh_im
|
||||
Tfds.Eq_double(expd_time, parser.File_time());
|
||||
Tfds.Eq_int(expd_page, parser.File_page());
|
||||
}
|
||||
@Override public void Exec_parse_hook(Bry_err_wkr err_wkr, Xoh_hdoc_ctx hctx, int src_bgn, int src_end) {
|
||||
@Override public void Exec_parse_hook(Bry_err_wkr err_wkr, Xoh_hdoc_ctx hctx, byte[] src, int src_bgn, int src_end) {
|
||||
parser.Fail_throws_err_(true);
|
||||
parser.Parse(err_wkr, new Xoh_hdoc_ctx(), Xow_domain_itm_.Bry__enwiki, src_bgn, src_end);
|
||||
parser.Parse(err_wkr, new Xoh_hdoc_ctx(), Xow_domain_itm_.Bry__enwiki, src, src_bgn, src_end);
|
||||
}
|
||||
}
|
||||
|
@ -54,7 +54,7 @@ class Xoh_anch_href_data_fxt extends Xoh_itm_parser_fxt { private final Xoh_
|
||||
Tfds.Eq_str(expd_site, parser.Site_bgn() == -1 ? "" : String_.new_u8(src, parser.Site_bgn(), parser.Site_end()));
|
||||
Tfds.Eq_str(expd_page, String_.new_u8(src, parser.Ttl_bgn(), parser.Ttl_end()));
|
||||
}
|
||||
@Override public void Exec_parse_hook(Bry_err_wkr err_wkr, Xoh_hdoc_ctx hctx, int src_bgn, int src_end) {
|
||||
parser.Parse(err_wkr, hctx, err_wkr.Src(), src_bgn, src_end);
|
||||
@Override public void Exec_parse_hook(Bry_err_wkr err_wkr, Xoh_hdoc_ctx hctx, byte[] src, int src_bgn, int src_end) {
|
||||
parser.Parse(err_wkr, hctx, src, src_bgn, src_end);
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user