1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2026-03-02 03:49:30 +00:00
This commit is contained in:
gnosygnu
2015-09-20 23:43:51 -04:00
parent 5fe27b5b3b
commit fa70c05354
1056 changed files with 8375 additions and 7095 deletions

View File

@@ -17,6 +17,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.bldrs.css; import gplx.*; import gplx.xowa.*; import gplx.xowa.bldrs.*;
import gplx.ios.*; import gplx.xowa.html.*;
import gplx.langs.htmls.encoders.*;
import gplx.xowa.nss.*;
import gplx.xowa.wikis.*; import gplx.xowa.wikis.domains.*; import gplx.xowa.wikis.data.*;
import gplx.xowa.files.downloads.*;
import gplx.core.net.*;
@@ -171,9 +173,9 @@ public class Xoa_css_extractor {
private boolean Logo_copy_from_css(Io_url trg_fil) {
Io_url commons_file = wiki_html_dir.GenSubFil(Css_common_name);
byte[] commons_src = Io_mgr.I.LoadFilBry(commons_file);
int bgn_pos = Bry_finder.Find_fwd(commons_src, Bry_mw_wiki_logo); if (bgn_pos == Bry_finder.Not_found) return false;
int bgn_pos = Bry_find_.Find_fwd(commons_src, Bry_mw_wiki_logo); if (bgn_pos == Bry_find_.Not_found) return false;
bgn_pos += Bry_mw_wiki_logo.length;
int end_pos = Bry_finder.Find_fwd(commons_src, Byte_ascii.Quote, bgn_pos + 1); if (end_pos == Bry_finder.Not_found) return false;
int end_pos = Bry_find_.Find_fwd(commons_src, Byte_ascii.Quote, bgn_pos + 1); if (end_pos == Bry_find_.Not_found) return false;
byte[] src_bry = Bry_.Mid(commons_src, bgn_pos, end_pos);
src_bry = Xob_url_fixer.Fix(wiki_domain, src_bry, src_bry.length);
if (wiki_html_dir.Info().DirSpr_byte() == Byte_ascii.Backslash)
@@ -185,11 +187,11 @@ public class Xoa_css_extractor {
private String Logo_find_src() {
if (mainpage_html == null) return null;
int main_page_html_len = mainpage_html.length;
int logo_bgn = Bry_finder.Find_fwd(mainpage_html, Logo_find_bgn, 0); if (logo_bgn == Bry_.NotFound) return null;
int logo_bgn = Bry_find_.Find_fwd(mainpage_html, Logo_find_bgn, 0); if (logo_bgn == Bry_.NotFound) return null;
logo_bgn += Logo_find_bgn.length;
logo_bgn = Bry_finder.Find_fwd(mainpage_html, Logo_find_end, logo_bgn); if (logo_bgn == Bry_.NotFound) return null;
logo_bgn = Bry_find_.Find_fwd(mainpage_html, Logo_find_end, logo_bgn); if (logo_bgn == Bry_.NotFound) return null;
logo_bgn += Logo_find_end.length;
int logo_end = Bry_finder.Find_fwd(mainpage_html, Byte_ascii.Paren_end, logo_bgn, main_page_html_len); if (logo_bgn == Bry_.NotFound) return null;
int logo_end = Bry_find_.Find_fwd(mainpage_html, Byte_ascii.Paren_end, logo_bgn, main_page_html_len); if (logo_bgn == Bry_.NotFound) return null;
byte[] logo_bry = Bry_.Mid(mainpage_html, logo_bgn, logo_end);
return protocol_prefix + String_.new_u8(logo_bry);
}
@@ -238,12 +240,13 @@ public class Xoa_css_extractor {
byte[] protocol_prefix_bry = Bry_.new_u8(protocol_prefix);
Gfo_url gfo_url = new Gfo_url();
while (true) {
int url_bgn = Bry_finder.Find_fwd(raw, Css_find_bgn, prv_pos); if (url_bgn == Bry_.NotFound) break; // nothing left; stop
int url_bgn = Bry_find_.Find_fwd(raw, Css_find_bgn, prv_pos); if (url_bgn == Bry_.NotFound) break; // nothing left; stop
url_bgn += css_find_bgn_len;
int url_end = Bry_finder.Find_fwd(raw, Byte_ascii.Quote, url_bgn, raw_len); if (url_end == Bry_.NotFound) {usr_dlg.Warn_many("", "main_page.css_parse", "could not find css; pos='~{0}' text='~{1}'", url_bgn, String_.new_u8_by_len(raw, url_bgn, url_bgn + 32)); break;}
int url_end = Bry_find_.Find_fwd(raw, Byte_ascii.Quote, url_bgn, raw_len); if (url_end == Bry_.NotFound) {usr_dlg.Warn_many("", "main_page.css_parse", "could not find css; pos='~{0}' text='~{1}'", url_bgn, String_.new_u8__by_len(raw, url_bgn, url_bgn + 32)); break;}
byte[] css_url_bry = Bry_.Mid(raw, url_bgn, url_end);
css_url_bry = Bry_.Replace(css_url_bry, Css_amp_find, Css_amp_repl); // &amp; -> &
css_url_bry = url_encoder.Decode(css_url_bry); // %2C -> %7C -> |
css_url_bry = Xoa_css_extractor.Url_root_fix(wiki_domain, css_url_bry);
url_parser.Parse(gfo_url, css_url_bry, 0, css_url_bry.length);
if ( gfo_url.Protocol_tid() == Gfo_protocol_itm.Tid_relative_1 // if rel url, add protocol_prefix DATE:2015-08-01
|| (Env_.Mode_testing() && gfo_url.Protocol_tid() == Gfo_protocol_itm.Tid_unknown)) // TEST:
@@ -266,6 +269,15 @@ public class Xoa_css_extractor {
}
return tmp_bfr.Xto_bry_and_clear();
}
private static byte[] Url_root_fix(byte[] domain, byte[] url) {// DATE:2015-09-20
if (url.length < 3) return url; // need at least 2 chars
if ( url[0] == Byte_ascii.Slash // starts with "/" EX: "/w/api.php"
&& url[1] != Byte_ascii.Slash // but not "//"; EX: "//en.wikipedia.org"
)
return Bry_.Add(gplx.xowa.html.hrefs.Xoh_href_.Bry__https, domain, url);
else
return url;
}
public static final String Css_common_name = "xowa_common.css", Css_wiki_name = "xowa_wiki.css"
, Css_common_name_ltr = "xowa_common_ltr.css", Css_common_name_rtl = "xowa_common_rtl.css";
}

View File

@@ -16,7 +16,7 @@ You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.bldrs.css; import gplx.*; import gplx.xowa.*; import gplx.xowa.bldrs.*;
import org.junit.*; import gplx.ios.*; import gplx.xowa.wikis.data.*; import gplx.xowa.files.downloads.*;
import org.junit.*; import gplx.ios.*; import gplx.langs.htmls.encoders.*; import gplx.xowa.wikis.data.*; import gplx.xowa.files.downloads.*;
public class Xoa_css_extractor_basic_tst {
@Before public void init() {fxt.Clear();} private Xoa_css_extractor_fxt fxt = new Xoa_css_extractor_fxt();
@Test public void Logo_download() {

View File

@@ -16,7 +16,7 @@ You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.bldrs.css; import gplx.*; import gplx.xowa.*; import gplx.xowa.bldrs.*;
import org.junit.*; import gplx.ios.*;
import org.junit.*; import gplx.ios.*; import gplx.xowa.nss.*;
public class Xoa_css_extractor_wiki_tst {
@Before public void init() {fxt.Clear();} private Xoa_css_extractor_fxt fxt = new Xoa_css_extractor_fxt();
@Test public void Css_wiki_generate() {

View File

@@ -41,7 +41,7 @@ public class Xoa_css_img_downloader {
Bry_bfr bfr = Bry_bfr.new_(src_len);
Hash_adp img_hash = Hash_adp_bry.cs();
while (true) {
int url_pos = Bry_finder.Find_fwd(src, Bry_url, prv_pos);
int url_pos = Bry_find_.Find_fwd(src, Bry_url, prv_pos);
if (url_pos == Bry_.NotFound) {bfr.Add_mid(src, prv_pos, src_len); break;} // no more "url("; exit;
int bgn_pos = url_pos + Bry_url_len; // set bgn_pos after "url("
byte bgn_byte = src[bgn_pos];
@@ -57,14 +57,14 @@ public class Xoa_css_img_downloader {
quoted = false;
break;
}
int end_pos = Bry_finder.Find_fwd(src, end_byte, bgn_pos, src_len);
int end_pos = Bry_find_.Find_fwd(src, end_byte, bgn_pos, src_len);
if (end_pos == Bry_.NotFound) { // unclosed "url("; exit since nothing else will be found
usr_dlg.Warn_many(GRP_KEY, "parse.invalid_url.end_missing", "could not find end_sequence for 'url(': bgn='~{0}' end='~{1}'", prv_pos, String_.new_u8_by_len(src, prv_pos, prv_pos + 25));
usr_dlg.Warn_many(GRP_KEY, "parse.invalid_url.end_missing", "could not find end_sequence for 'url(': bgn='~{0}' end='~{1}'", prv_pos, String_.new_u8__by_len(src, prv_pos, prv_pos + 25));
bfr.Add_mid(src, prv_pos, src_len);
break;
}
if (end_pos - bgn_pos == 0) { // empty; "url()"; ignore
usr_dlg.Warn_many(GRP_KEY, "parse.invalid_url.empty", "'url(' is empty: bgn='~{0}' end='~{1}'", prv_pos, String_.new_u8_by_len(src, prv_pos, prv_pos + 25));
usr_dlg.Warn_many(GRP_KEY, "parse.invalid_url.empty", "'url(' is empty: bgn='~{0}' end='~{1}'", prv_pos, String_.new_u8__by_len(src, prv_pos, prv_pos + 25));
bfr.Add_mid(src, prv_pos, bgn_pos);
prv_pos = bgn_pos;
continue;
@@ -128,10 +128,10 @@ public class Xoa_css_img_downloader {
}
bfr.Add_mid(src, old_pos, find_bgn - Bry_import_len).Add_byte_nl();
bfr.Add(Bry_comment_bgn).Add(css_url).Add(Bry_comment_end).Add_byte_nl();
if (Bry_finder.Find_fwd(css_url, Wikisource_dynimg_ttl) != -1) css_trg_bry = Bry_.Replace(css_trg_bry, Wikisource_dynimg_find, Wikisource_dynimg_repl); // FreedImg hack; PAGE:en.s:Page:Notes_on_Osteology_of_Baptanodon._With_a_Description_of_a_New_Species.pdf/3 DATE:2014-09-06
if (Bry_find_.Find_fwd(css_url, Wikisource_dynimg_ttl) != -1) css_trg_bry = Bry_.Replace(css_trg_bry, Wikisource_dynimg_find, Wikisource_dynimg_repl); // FreedImg hack; PAGE:en.s:Page:Notes_on_Osteology_of_Baptanodon._With_a_Description_of_a_New_Species.pdf/3 DATE:2014-09-06
bfr.Add(css_trg_bry).Add_byte_nl();
bfr.Add_byte_nl();
int semic_pos = Bry_finder.Find_fwd(src, Byte_ascii.Semic, find_bgn + url_raw.length, src_len);
int semic_pos = Bry_find_.Find_fwd(src, Byte_ascii.Semic, find_bgn + url_raw.length, src_len);
return semic_pos + Int_.Const_dlm_len;
}
private static final byte[]
@@ -143,11 +143,11 @@ public class Xoa_css_img_downloader {
int pos_bgn = 0;
if (Bry_.Has_at_bgn(raw, Bry_fwd_slashes, 0, raw_len)) pos_bgn = Bry_fwd_slashes.length;
if (Bry_.Has_at_bgn(raw, Bry_http, 0, raw_len)) pos_bgn = Bry_http.length;
int pos_slash = Bry_finder.Find_fwd(raw, Byte_ascii.Slash, pos_bgn, raw_len);
int pos_slash = Bry_find_.Find_fwd(raw, Byte_ascii.Slash, pos_bgn, raw_len);
if (pos_slash == Bry_.NotFound) return null; // first segment is site_name; at least one slash must be present for image name; EX: site.org/img_name.jpg
if (pos_slash == raw_len - 1) return null; // "site.org/" is invalid
int pos_end = raw_len;
int pos_question = Bry_finder.Find_bwd(raw, Byte_ascii.Question);
int pos_question = Bry_find_.Find_bwd(raw, Byte_ascii.Question);
if (pos_question != Bry_.NotFound)
pos_end = pos_question; // remove query params; EX: img_name?key=val
return Bry_.Mid(raw, pos_bgn, pos_end);

View File

@@ -22,7 +22,7 @@ class Xob_css_parser__import {
private final Xob_css_parser__url url_parser;
public Xob_css_parser__import(Xob_css_parser__url url_parser) {this.url_parser = url_parser;}
public Xob_css_tkn__base Parse(byte[] src, int src_len, int tkn_bgn, int tkn_end) { // " @import"
int bgn_pos = Bry_finder.Find_fwd_while_ws(src, tkn_end, src_len); // skip any ws after " @import"
int bgn_pos = Bry_find_.Find_fwd_while_ws(src, tkn_end, src_len); // skip any ws after " @import"
if (bgn_pos == src_len) return Xob_css_tkn__warn.new_(tkn_bgn, tkn_end, "mirror.parser.import:EOS after import; bgn=~{0}", tkn_bgn);
if (!Bry_.Has_at_bgn(src, Tkn_url_bry, bgn_pos, src_len)) return Xob_css_tkn__warn.new_(tkn_bgn, tkn_end, "mirror.parser.import:url missing; bgn=~{0}", tkn_bgn);
tkn_end = bgn_pos + Tkn_url_bry.length;
@@ -31,7 +31,7 @@ class Xob_css_parser__import {
Xob_css_tkn__url url_frag = (Xob_css_tkn__url)frag;
byte[] src_url = url_frag.Src_url();
src_url = Bry_.Replace(src_url, Byte_ascii.Space, Byte_ascii.Underline); // NOTE: must replace spaces with underlines else download will fail; EX:https://it.wikivoyage.org/w/index.php?title=MediaWiki:Container e Infobox.css&action=raw&ctype=text/css; DATE:2015-03-08
int semic_pos = Bry_finder.Find_fwd(src, Byte_ascii.Semic, frag.Pos_end(), src_len);
int semic_pos = Bry_find_.Find_fwd(src, Byte_ascii.Semic, frag.Pos_end(), src_len);
return Xob_css_tkn__import.new_(tkn_bgn, semic_pos + 1, src_url, url_frag.Trg_url(), url_frag.Quote_byte());
}
private static final byte[] Tkn_url_bry = Bry_.new_a7("url(");

View File

@@ -20,7 +20,7 @@ class Xob_css_parser__url {
private final byte[] site;
public Xob_css_parser__url(byte[] site) {this.site = site;}
public Xob_css_tkn__base Parse(byte[] src, int src_len, int tkn_bgn, int tkn_end) { // " url"
int bgn_pos = Bry_finder.Find_fwd_while_ws(src, tkn_end, src_len); // skip any ws after " url("
int bgn_pos = Bry_find_.Find_fwd_while_ws(src, tkn_end, src_len); // skip any ws after " url("
if (bgn_pos == src_len) return Xob_css_tkn__warn.new_(tkn_bgn, tkn_end, "mirror.parser.url:EOS; bgn=~{0}", tkn_bgn);
byte end_byte = src[bgn_pos]; // note that first non-ws byte should determine end_byte
byte quote_byte = end_byte;
@@ -33,11 +33,11 @@ class Xob_css_parser__url {
quote_byte = Byte_ascii.Null;
break;
}
int end_pos = Bry_finder.Find_fwd(src, end_byte, bgn_pos, src_len);
int end_pos = Bry_find_.Find_fwd(src, end_byte, bgn_pos, src_len);
if (end_pos == Bry_.NotFound) // unclosed "url("; exit since nothing else will be found
return Xob_css_tkn__warn.new_(tkn_bgn, tkn_end, "mirror.parser.url:dangling; bgn=~{0} excerpt=~{1}", bgn_pos, String_.new_u8_by_len(src, tkn_bgn, tkn_bgn + 128));
return Xob_css_tkn__warn.new_(tkn_bgn, tkn_end, "mirror.parser.url:dangling; bgn=~{0} excerpt=~{1}", bgn_pos, String_.new_u8__by_len(src, tkn_bgn, tkn_bgn + 128));
if (end_pos - bgn_pos == 0) // empty; "url()"; ignore
return Xob_css_tkn__warn.new_(tkn_bgn, tkn_end, "mirror.parser.url:empty; bgn=~{0} excerpt=~{1}", bgn_pos, String_.new_u8_by_len(src, tkn_bgn, tkn_bgn + 128));
return Xob_css_tkn__warn.new_(tkn_bgn, tkn_end, "mirror.parser.url:empty; bgn=~{0} excerpt=~{1}", bgn_pos, String_.new_u8__by_len(src, tkn_bgn, tkn_bgn + 128));
byte[] url_orig = Bry_.Mid(src, bgn_pos, end_pos); int url_orig_len = url_orig.length;
++end_pos; // increment end_pos so rv will be after it;
if ( end_byte != Byte_ascii.Paren_end) { // end_byte is apos / quote