Mw_parse: Add clean_url and associated functions to sanitizer

pull/620/head
gnosygnu 8 years ago
parent c77e8a4374
commit 9a5c70b506

@ -40,7 +40,7 @@ public class Byte_ascii {
, Ltr_n = 110, Ltr_o = 111, Ltr_p = 112, Ltr_q = 113, Ltr_r = 114
, Ltr_s = 115, Ltr_t = 116, Ltr_u = 117, Ltr_v = 118, Ltr_w = 119
, Ltr_x = 120, Ltr_y = 121, Ltr_z = 122, Curly_bgn = 123, Pipe = 124
, Curly_end = 125, Tilde = 126
, Curly_end = 125, Tilde = 126, Delete = 127
;
public static final byte
Angle_bgn = Lt, Angle_end = Gt

@ -117,6 +117,14 @@ public class Btrie_slim_mgr implements Btrie_mgr {
}
return this;
}
/**
 * Registers every byte array in {@code ary} with this trie; each array is
 * passed to {@code Add_obj} as both the key and the stored value.
 *
 * @param ary keys to add
 * @return this manager, to allow call chaining
 */
public Btrie_slim_mgr Add_many_bry(byte[]... ary) {
	for (byte[] key : ary) {
		Add_obj(key, key);	// the key doubles as its own value
	}
	return this;
}
// String overload: converts the strings with Bry_.Ary and delegates to the byte[] overload below.
public Btrie_slim_mgr Add_many_int(int val, String... ary) {return Add_many_int(val, Bry_.Ary(ary));}
public Btrie_slim_mgr Add_many_int(int val, byte[]... ary) {
int len = ary.length;

@ -0,0 +1,40 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.core.brys; import gplx.*; import gplx.core.*;
/**
 * Mutable holder for a slice of a byte array: the bytes {@code src[src_bgn .. src_end)}.
 * The {@code dirty} flag is cleared by {@link #Init} and raised by {@link #Set_by_bfr},
 * i.e. it records whether the slice has been replaced by buffer output since the last Init.
 */
public class Bry_tmp {
	public byte[] src;
	public int src_bgn;
	public int src_end;
	public boolean dirty;

	/** Points this holder at {@code src[src_bgn .. src_end)} and clears {@code dirty}; returns this for chaining. */
	public Bry_tmp Init(byte[] src, int src_bgn, int src_end) {
		this.dirty   = false;
		this.src_end = src_end;
		this.src_bgn = src_bgn;
		this.src     = src;
		return this;
	}

	/** Replaces the slice with the entire content of {@code bfr} (which is cleared) and marks the holder dirty. */
	public void Set_by_bfr(Bry_bfr bfr) {
		this.dirty = true;
		byte[] flushed = bfr.To_bry_and_clear();
		this.src     = flushed;
		this.src_bgn = 0;
		this.src_end = flushed.length;
	}

	/** Appends the current slice to {@code bfr}. */
	public void Add_to_bfr(Bry_bfr bfr) {
		bfr.Add_mid(this.src, this.src_bgn, this.src_end);
	}
}

@ -62,6 +62,12 @@ public class Gfo_url_encoder_ {
return new Gfo_url_encoder_mkr().Init(Byte_ascii.Percent).Init_common(Bool_.N)
.Init__diff__one(Byte_ascii.Space, Byte_ascii.Plus);
}
/**
 * Builds an encoder maker intended to mirror PHP's urlencode
 * (http://php.net/manual/en/function.urlencode.php):
 * "Returns a String in which all non-alphanumeric characters except -_. have been replaced
 * with a percent (%) sign followed by two hex digits and spaces encoded as plus (+) signs"
 */
public static Gfo_url_encoder_mkr New__php_urlencode() {
	// percent-escaping with the common settings switched on
	Gfo_url_encoder_mkr mkr = new Gfo_url_encoder_mkr().Init(Byte_ascii.Percent).Init_common(Bool_.Y);
	// space is the one byte that is written differently: as "+"
	return mkr.Init__diff__one(Byte_ascii.Space, Byte_ascii.Plus);
}
// Encoder maker for the title part of an http url: percent-escaping with the common settings switched on.
private static Gfo_url_encoder_mkr New__http_url_ttl() {
	Gfo_url_encoder_mkr mkr = new Gfo_url_encoder_mkr().Init(Byte_ascii.Percent).Init_common(Bool_.Y);
	return mkr;
}
@ -103,5 +109,6 @@ public class Gfo_url_encoder_ {
, Http_url = Gfo_url_encoder_.New__http_url().Make()
, Http_url_ttl = Gfo_url_encoder_.New__http_url_ttl().Make()
, Mw_ttl = Gfo_url_encoder_.New__mw_ttl().Make()
, Php_urlencode = Gfo_url_encoder_.New__php_urlencode().Make()
;
}

@ -16,7 +16,7 @@ You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.langs.phps.utls; import gplx.*; import gplx.langs.*; import gplx.langs.phps.*;
import gplx.core.btries.*;
import gplx.core.btries.*; import gplx.core.brys.*;
import gplx.core.primitives.*;
public class Php_preg_ {
public static byte[][] Split(Int_list list, byte[] src, int src_bgn, int src_end, byte[] dlm, boolean extend) {
@ -72,4 +72,41 @@ public class Php_preg_ {
}
return null;
}
/**
 * Replaces every occurrence of any key in {@code find_trie} inside the slice held by
 * {@code bry} with {@code repl_bry}.
 *
 * If nothing matches, neither {@code bry} nor {@code tmp} is touched. Otherwise the rewritten
 * bytes are accumulated in {@code tmp} and then moved into {@code bry} via
 * {@code Set_by_bfr} (which also clears {@code tmp}).
 *
 * @param bry       slice to scan; updated in place when at least one match is found
 * @param tmp       scratch buffer used to build the result
 * @param find_trie trie holding the byte sequences to search for
 * @param trv       trie result holder; its {@code Pos()} is read as the end of a match
 * @param repl_bry  bytes written in place of each match
 */
public static void Replace(Bry_tmp bry, Bry_bfr tmp, Btrie_slim_mgr find_trie, Btrie_rv trv, byte[] repl_bry) {
	byte[] src = bry.src;
	int src_end = bry.src_end;
	int cur = bry.src_bgn;
	int prv = cur;	// start of the not-yet-copied run of unmatched bytes
	boolean dirty = false;
	// Loop on "<" rather than "!=": the no-match branch advances by the character length
	// taken from the lead byte, so a lead byte near the end of the slice could otherwise
	// step past src_end and the scan would keep reading beyond the slice.
	while (cur < src_end) {
		byte b = src[cur];
		Object o = find_trie.Match_at_w_b0(trv, b, src, cur, src_end);
		if (o == null) {
			// no match: skip the whole character
			cur += gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(b);
		}
		else {
			// match: flush pending unmatched bytes, write the replacement, resume after the match
			dirty = true;
			tmp.Add_mid(src, prv, cur);
			tmp.Add(repl_bry);
			cur = trv.Pos();
			prv = cur;
		}
	}
	if (dirty) {
		tmp.Add_mid(src, prv, src_end);	// trailing unmatched bytes
		bry.Set_by_bfr(tmp);
	}
}
}

@ -44,7 +44,7 @@ public class Php_str_ {
if (max == -1) max = src_len;
int rv = 0;
for (int i = bgn; i < src_len; i++) {
if (find[src[i]] && rv < max)
if (find[src[i] & 0xFF] && rv < max) // PATCH.JAVA:need to convert to unsigned byte
rv++;
else
break;
@ -94,7 +94,7 @@ public class Php_str_ {
if (max == -1) max = Int_.Max_value;
int rv = 0;
for (int i = bgn - 1; i > -1; i--) {
if (find[src[i]] && rv < max)
if (find[src[i] & 0xFF] && rv < max) // PATCH.JAVA:need to convert to unsigned byte; mask the byte value, not the index
rv++;
else
break;

@ -16,65 +16,123 @@ You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws; import gplx.*; import gplx.xowa.*;
import gplx.core.encoders.*; import gplx.core.primitives.*; import gplx.langs.htmls.entitys.*;
import gplx.core.brys.*; import gplx.core.btries.*; import gplx.core.encoders.*; import gplx.core.primitives.*; import gplx.langs.htmls.entitys.*;
import gplx.xowa.parsers.htmls.*;
import gplx.xowa.mws.parsers.*;
import gplx.xowa.mws.parsers.*; import gplx.langs.phps.utls.*;
public class Xomw_sanitizer {
private final Mwh_doc_wkr__atr_bldr atr_bldr = new Mwh_doc_wkr__atr_bldr();
private final Mwh_atr_parser atr_parser = new Mwh_atr_parser();
// static function cleanUrl($url) {
// // Normalize any HTML entities in input. They will be
// // re-escaped by makeExternalLink().
// $url = Sanitizer::decodeCharReferences($url);
//
// // Escape any control characters introduced by the above step
// $url = preg_replace_callback('/[\][<>"\\x00-\\x20\\x7F\|]/',
// [ __CLASS__, 'cleanUrlCallback' ], $url);
//
// // Validate hostname portion
// $matches = [];
// if (preg_match('!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches)) {
// list(/* $whole */, $protocol, $host, $rest) = $matches;
//
// // Characters that will be ignored in IDNs.
// // https://tools.ietf.org/html/rfc3454#section-3.1
// // Strip them before further processing so blacklists and such work.
// $strip = "/
// \\s| // general whitespace
// \xc2\xad| // 00ad SOFT HYPHEN
// \xe1\xa0\x86| // 1806 MONGOLIAN TODO SOFT HYPHEN
// \xe2\x80\x8b| // 200b ZERO WIDTH SPACE
// \xe2\x81\xa0| // 2060 WORD JOINER
// \xef\xbb\xbf| // feff ZERO WIDTH NO-BREAK SPACE
// \xcd\x8f| // 034f COMBINING GRAPHEME JOINER
// \xe1\xa0\x8b| // 180b MONGOLIAN FREE VARIATION SELECTOR ONE
// \xe1\xa0\x8c| // 180c MONGOLIAN FREE VARIATION SELECTOR TWO
// \xe1\xa0\x8d| // 180d MONGOLIAN FREE VARIATION SELECTOR THREE
// \xe2\x80\x8c| // 200c ZERO WIDTH NON-JOINER
// \xe2\x80\x8d| // 200d ZERO WIDTH JOINER
// [\xef\xb8\x80-\xef\xb8\x8f] // fe00-fe0f VARIATION SELECTOR-1-16
// /xuD";
//
// $host = preg_replace($strip, '', $host);
//
// // IPv6 host names are bracketed with []. Url-decode these.
// if (substr_compare("//%5B", $host, 0, 5) === 0 &&
// preg_match('!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $matches)
// ) {
// $host = '//[' . $matches[1] . ']' . $matches[2];
// }
//
// // @todo FIXME: Validate hostnames here
//
// return $protocol . $host . $rest;
// } else {
// return $url;
// }
// }
//
// static function cleanUrlCallback($matches) {
// return urlencode($matches[0]);
// }
private final Xomw_regex_escape_invalid regex_clean_url = new Xomw_regex_escape_invalid();
private final Xomw_regex_find_domain regex_find_domain = new Xomw_regex_find_domain();
private final Xomw_regex_ipv6_brack regex_ipv6_brack = new Xomw_regex_ipv6_brack();
private final Bry_tmp tmp_host = new Bry_tmp();
private final Bry_bfr tmp_bfr = Bry_bfr_.New();
private final Btrie_rv trv = new Btrie_rv();
private final Xomw_regex_url_char_cbk__normalize normalize_cbk;
private final Xomw_regex_url_char_cbk__decode decode_cbk;
private static Xomw_regex_url_char regex_url_char;
private static Btrie_slim_mgr invalid_idn_trie;
/**
 * Creates a sanitizer and, on first use, builds the static lookup structures shared by all
 * instances: the IDN strip trie, the html entity table and the char-reference scanner.
 */
public Xomw_sanitizer() {
	this.normalize_cbk = new Xomw_regex_url_char_cbk__normalize(this);
	this.decode_cbk = new Xomw_regex_url_char_cbk__decode(this);
	// regex_url_char doubles as the "statics are initialized" sentinel
	if (regex_url_char == null) {
		synchronized (Type_adp_.ClassOf_obj(this)) {
			// re-check under the lock so that a second constructor that was waiting does not rebuild everything
			if (regex_url_char == null) {
				// Characters that will be ignored in IDNs.
				// https://tools.ietf.org/html/rfc3454#section-3.1
				// $strip = "/
				//	 \\s|          // general whitespace
				//	 \xc2\xad|     // 00ad SOFT HYPHEN
				//	 \xe1\xa0\x86| // 1806 MONGOLIAN TODO SOFT HYPHEN
				//	 \xe2\x80\x8b| // 200b ZERO WIDTH SPACE
				//	 \xe2\x81\xa0| // 2060 WORD JOINER
				//	 \xef\xbb\xbf| // feff ZERO WIDTH NO-BREAK SPACE
				//	 \xcd\x8f|     // 034f COMBINING GRAPHEME JOINER
				//	 \xe1\xa0\x8b| // 180b MONGOLIAN FREE VARIATION SELECTOR ONE
				//	 \xe1\xa0\x8c| // 180c MONGOLIAN FREE VARIATION SELECTOR TWO
				//	 \xe1\xa0\x8d| // 180d MONGOLIAN FREE VARIATION SELECTOR THREE
				//	 \xe2\x80\x8c| // 200c ZERO WIDTH NON-JOINER
				//	 \xe2\x80\x8d| // 200d ZERO WIDTH JOINER
				//	 [\xef\xb8\x80-\xef\xb8\x8f] // fe00-fe0f VARIATION SELECTOR-1-16
				//	 /xuD";
				// XO.MW.REGEX:http://php.net/manual/en/reference.pcre.pattern.modifiers.php
				//   /x : ignore embedded ws
				//   /u : enabled pcre utf8
				//   /D : $ matches EOS, not NL
				invalid_idn_trie = Btrie_slim_mgr.cs()
					.Add_many_bry(new Xomw_regex_parser().Add_ary
					( "\\s"
					, "\\xc2\\xad" // 00ad SOFT HYPHEN
					, "\\xe1\\xa0\\x86" // 1806 MONGOLIAN TODO SOFT HYPHEN
					, "\\xe2\\x80\\x8b" // 200b ZERO WIDTH SPACE
					, "\\xe2\\x81\\xa0" // 2060 WORD JOINER
					, "\\xef\\xbb\\xbf" // feff ZERO WIDTH NO-BREAK SPACE
					, "\\xcd\\x8f" // 034f COMBINING GRAPHEME JOINER
					, "\\xe1\\xa0\\x8b" // 180b MONGOLIAN FREE VARIATION SELECTOR ONE
					, "\\xe1\\xa0\\x8c" // 180c MONGOLIAN FREE VARIATION SELECTOR TWO
					, "\\xe1\\xa0\\x8d" // 180d MONGOLIAN FREE VARIATION SELECTOR THREE
					, "\\xe2\\x80\\x8c" // 200c ZERO WIDTH NON-JOINER
					, "\\xe2\\x80\\x8d" // 200d ZERO WIDTH JOINER
					)
					.Add_rng
					( "\\xef\\xb8\\x80", "\\xef\\xb8\\x8f" // fe00-fe0f VARIATION SELECTOR-1-16
					)
					.Rslt());
				// assert static structs; already inside the lock, so a plain null check is enough
				if (html_entities == null) {
					html_entities = Html_entities_new();
				}
				// Publish the sentinel LAST: a constructor that sees it non-null skips this block
				// entirely, so everything else must already be assigned by then.
				// NOTE(review): the statics are not volatile, so this narrows the race but is not a
				// strict Java-memory-model guarantee; a static initializer would be the complete fix.
				regex_url_char = new Xomw_regex_url_char();
			}
		}
	}
}
/**
 * Port of MediaWiki's Sanitizer::cleanUrl: decodes character references, percent-escapes
 * unsafe bytes and, when the url has a "protocol://host" shape, cleans up the host part.
 *
 * @param url raw url bytes
 * @return the cleaned url; when no host part is recognized this is the decoded and escaped url
 */
public byte[] Clean_url(byte[] url) {
	// Normalize any HTML entities in input. They will be
	// re-escaped by makeExternalLink().
	url = Decode_char_references(null, Bool_.Y, url, 0, url.length);

	// Escape any control characters introduced by the above step
	// XO.MW.REGEX: $url = preg_replace_callback('/[\][<>"\\x00-\\x20\\x7F\|]/', [ __CLASS__, 'cleanUrlCallback' ], $url);
	//   '[]<>"' | '00 -> 32' | 127
	boolean escaped = regex_clean_url.Escape(tmp_bfr, url, 0, url.length);
	if (escaped) {
		url = tmp_bfr.To_bry_and_clear();
	}

	// XO.MW.REGEX: if (preg_match('!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches))
	boolean has_domain = regex_find_domain.Match(url, 0, url.length);
	if (!has_domain) {
		return url;	// nothing host-like to clean
	}

	// Characters that will be ignored in IDNs.
	// https://tools.ietf.org/html/rfc3454#section-3.1
	// Strip them before further processing so blacklists and such work.
	tmp_host.Init(url, regex_find_domain.host_bgn, regex_find_domain.host_end);
	Php_preg_.Replace(tmp_host, tmp_bfr, invalid_idn_trie, trv, Bry_.Empty);

	// IPv6 host names are bracketed with []. Url-decode these.
	// if (substr_compare("//%5B", $host, 0, 5) === 0 &&
	//	 preg_match('!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $matches)
	// XO.MW.REGEX:
	//   !^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!
	//   "//%5B" + ("hex-dec" | [:.]) + "%5D" + numbers
	//   EX: [ABCD]:80:12
	boolean is_ipv6 = regex_ipv6_brack.Match(tmp_host.src, tmp_host.src_bgn, tmp_host.src_end);
	if (is_ipv6) {
		// rebuild the host as "//[" + address + "]" + port segments
		tmp_bfr.Add_str_a7("//[").Add_mid(tmp_host.src, regex_ipv6_brack.host_bgn, regex_ipv6_brack.host_end)
			.Add_byte(Byte_ascii.Brack_end).Add_mid(tmp_host.src, regex_ipv6_brack.segs_bgn, regex_ipv6_brack.segs_end);
		tmp_host.Set_by_bfr(tmp_bfr);
	}

	// @todo FIXME: Validate hostnames here

	// reassemble: protocol + (cleaned) host + rest
	tmp_bfr.Add_mid(url, regex_find_domain.prot_bgn, regex_find_domain.prot_end);
	tmp_host.Add_to_bfr(tmp_bfr);
	tmp_bfr.Add_mid(url, regex_find_domain.rest_bgn, regex_find_domain.rest_end);
	return tmp_bfr.To_bry_and_clear();
}
public void Fix_tag_attributes(Bry_bfr bfr, byte[] tag_name, byte[] atrs) {
atr_bldr.Atrs__clear();
atr_parser.Parse(atr_bldr, -1, -1, atrs, 0, atrs.length);
@ -105,163 +163,13 @@ public class Xomw_sanitizer {
Normalize_char_references(bfr, Bool_.N, src, src_bgn, src_end);
}
public byte[] Normalize_char_references(Bry_bfr bfr, boolean lone_bfr, byte[] src, int src_bgn, int src_end) {
// assert static structs
if (Normalize__dec == null) {
synchronized (Xomw_sanitizer.class) {
html_entities = Html_entities_new();
Normalize__dec = Bool_ary_bldr.New_u8().Set_rng(Byte_ascii.Num_0, Byte_ascii.Num_9).To_ary();
Normalize__hex = Bool_ary_bldr.New_u8()
.Set_rng(Byte_ascii.Num_0, Byte_ascii.Num_9)
.Set_rng(Byte_ascii.Ltr_A, Byte_ascii.Ltr_Z)
.Set_rng(Byte_ascii.Ltr_a, Byte_ascii.Ltr_z)
.To_ary();
Normalize__ent = Bool_ary_bldr.New_u8()
.Set_rng(Byte_ascii.Num_0, Byte_ascii.Num_9)
.Set_rng(Byte_ascii.Ltr_A, Byte_ascii.Ltr_Z)
.Set_rng(Byte_ascii.Ltr_a, Byte_ascii.Ltr_z)
.Set_rng(128, 255)
.To_ary();
}
}
// XO.BRY_BFR
boolean dirty = false;
int cur = src_bgn;
boolean called_by_bry = bfr == null;
while (true) {
// search for "&"
int find_bgn = Bry_find_.Find_fwd(src, Byte_ascii.Amp, cur);
if (find_bgn == Bry_find_.Not_found) { // "&" not found; exit
if (dirty)
bfr.Add_mid(src, cur, src_end);
break;
}
int ent_bgn = find_bgn + 1; // +1 to skip &
// get regex; (a) dec (&#09;); (b) hex (&#xFF;); (c) entity (&alpha;);
boolean[] regex = null;
// check for #;
if (ent_bgn < src_end && src[ent_bgn] == Byte_ascii.Hash) {
ent_bgn++;
if (ent_bgn < src_end) {
byte nxt = src[ent_bgn];
// check for x
if (nxt == Byte_ascii.Ltr_X || nxt == Byte_ascii.Ltr_x) {
ent_bgn++;
regex = Normalize__hex;
}
}
if (regex == null)
regex = Normalize__dec;
}
else {
regex = Normalize__ent;
}
// keep looping until invalid regex
int ent_end = ent_bgn;
byte b = Byte_ascii.Null;
for (int i = ent_bgn; i < src_end; i++) {
b = src[i];
if (regex[b])
ent_end++;
else
break;
}
// mark dirty; can optimize later by checking if "&lt;" already exists
dirty = true;
if (bfr == null) bfr = Bry_bfr_.New();
bfr.Add_mid(src, cur, find_bgn); // add everything before &
// invalid <- regex ended, but not at semic
if (b != Byte_ascii.Semic) {
bfr.Add(Gfh_entity_.Amp_bry); // transform "&" to "&amp;"
cur = find_bgn + 1; // position after "&"
continue;
}
// do normalization
byte[] name = Bry_.Mid(src, ent_bgn, ent_end);
boolean ret = false;
if (regex == Normalize__ent) {
Normalize_entity(bfr, name);
ret = true;
}
else if (regex == Normalize__dec) {
ret = Dec_char_reference(bfr, name);
}
else if (regex == Normalize__hex) {
ret = Hex_char_reference(bfr, name);
}
if (!ret) {
bfr.Add(Gfh_entity_.Amp_bry); // transform "&" to "&amp;"
bfr.Add_bry_escape_html(src, find_bgn + 1, ent_end + 1); // "find_bgn + 1" to start after "&"; "ent_end + 1" to include ";"
}
cur = ent_end + 1; // +1 to position after ";"
}
// XO.BRY_BFR
if (dirty) {
if (called_by_bry)
return bfr.To_bry_and_clear();
else
return Bry_.Empty;
}
else {
if (called_by_bry) {
if (src_bgn == 0 && src_end == src.length)
return src;
else
return Bry_.Mid(src, src_bgn, src_end);
}
else {
if (lone_bfr)
bfr.Add_mid(src, src_bgn, src_end);
return null;
}
}
return regex_url_char.Replace_by_cbk(bfr, lone_bfr, src, src_bgn, src_end, normalize_cbk);
}
// If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
// return the equivalent numeric entity reference (except for the core &lt;
// &gt; &amp; &quot;). If the entity is a MediaWiki-specific alias, returns
// the HTML equivalent. Otherwise, returns HTML-escaped text of
// pseudo-entity source (eg &amp;foo;)
private void Normalize_entity(Bry_bfr bfr, byte[] name) {
Object o = html_entities.Get_by_bry(name);
if (o == null) {
bfr.Add_str_a7("&amp;").Add(name).Add_byte_semic();
}
else {
Xomw_html_ent entity = (Xomw_html_ent)o;
bfr.Add(entity.html);
}
public byte[] Decode_char_references(Bry_bfr bfr, boolean lone_bfr, byte[] src, int src_bgn, int src_end) {
return regex_url_char.Replace_by_cbk(bfr, lone_bfr, src, src_bgn, src_end, decode_cbk);
}
private boolean Dec_char_reference(Bry_bfr bfr, byte[] codepoint) {
int point = Bry_.To_int_or(codepoint, -1);
if (Validate_codepoint(point)) {
bfr.Add_str_a7("&#").Add_int_variable(point).Add_byte_semic();
return true;
}
return false;
}
private boolean Hex_char_reference(Bry_bfr bfr, byte[] codepoint) {
int point = Hex_utl_.Parse_or(codepoint, -1);
if (Validate_codepoint(point)) {
bfr.Add_str_a7("&#x");
Hex_utl_.Write_bfr(bfr, Bool_.Y, point); // sprintf('&#x%x;', $point)
bfr.Add_byte_semic();
return true;
}
return false;
}
private boolean Validate_codepoint(int codepoint) {
public boolean Validate_codepoint(int codepoint) {
// U+000C is valid in HTML5 but not allowed in XML.
// U+000D is valid in XML but not allowed in HTML5.
// U+007F - U+009F are disallowed in HTML5 (control characters).
@ -273,14 +181,13 @@ public class Xomw_sanitizer {
|| (codepoint >= 0x10000 && codepoint <= 0x10ffff);
}
private static boolean[] Normalize__dec, Normalize__hex, Normalize__ent;
private static Hash_adp_bry html_entities;
public static Hash_adp_bry html_entities;
private static Hash_adp_bry Html_entities_new() {
Bry_bfr tmp = Bry_bfr_.New();
Hash_adp_bry rv = Hash_adp_bry.cs();
Html_entities_set(rv, Xomw_html_ent.Type__alias, -1, "רלמ", "&rlm;");
Html_entities_set(rv, Xomw_html_ent.Type__alias, -1, "رلم", "&rlm;");
Html_entities_set(rv, Xomw_html_ent.Type__alias, 8207, "רלמ", "&rlm;");
Html_entities_set(rv, Xomw_html_ent.Type__alias, 8207, "رلم", "&rlm;");
Html_entities_set(rv, Xomw_html_ent.Type__char, 60, "lt", "&lt;");
Html_entities_set(rv, Xomw_html_ent.Type__char, 62, "gt", "&gt;");
@ -568,3 +475,395 @@ class Xomw_html_ent {
public final byte[] html;
public static final byte Type__null = 0, Type__alias = 1, Type__char = 2, Type__entity = 3;
}
/**
 * Hand-written matcher for the url pattern used by MediaWiki's Sanitizer::cleanUrl.
 *
 * XO.MW.REGEX: if (preg_match('!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches)) {
 *   ([^:]+:)(//[^/]+)?(.*)
 *   "protocol" + "host" + "rest"
 *   "protocol" -> ([^:]+:)   EX: "https:"  anything not-colon up to colon
 *   "host"     -> (//[^/]+)? EX: "//abc"   "//" then anything not-slash up to the next slash
 *   "rest"     -> (.*)       EX: "/rest"
 *   /i : case-insensitive
 *   /D : $ matches EOS, not NL
 *
 * Unlike the PHP pattern, the host is NOT optional here: Match returns false when no
 * non-empty "//host" follows the protocol. After a successful Match the public fields give
 * the [bgn, end) offsets of the three parts; after a failed Match they are partially updated
 * and must not be used.
 */
class Xomw_regex_find_domain {
	public int prot_bgn;
	public int prot_end;
	public int host_bgn;
	public int host_end;
	public int rest_bgn;
	public int rest_end;
	/**
	 * @param src     url bytes
	 * @param src_bgn start of the url in src (inclusive)
	 * @param src_end end of the url in src (exclusive)
	 * @return true if src has the shape "protocol:" + "//host" + rest
	 */
	public boolean Match(byte[] src, int src_bgn, int src_end) {
		// find prot; EX: "https:"; prot_end is positioned after the colon
		prot_bgn = src_bgn;
		prot_end = Bry_find_.Move_fwd(src, Byte_ascii.Colon, prot_bgn, src_end);
		// exit if colon not found
		if (prot_end == Bry_find_.Not_found) return false;
		// exit if nothing precedes the colon; "[^:]+" requires at least one byte; EX: "://a.org"
		if (prot_end - prot_bgn < 2) return false;

		// find host: EX: "//a.org"
		host_bgn = prot_end;
		int double_slash_end = host_bgn + 2;
		// exit if eos; there must be at least one byte after "//"
		if (double_slash_end >= src_end) return false;
		// exit if not "//"
		if (   src[host_bgn    ] != Byte_ascii.Slash
			|| src[host_bgn + 1] != Byte_ascii.Slash
			) return false;

		// host runs up to the next slash, or to the end of the url if there is none; EX: "https://a.org"
		host_end = Bry_find_.Find_fwd(src, Byte_ascii.Slash, double_slash_end, src_end);
		if (host_end == Bry_find_.Not_found) {
			host_end = src_end;
		}
		// exit if only "//"; EX: "https:///a.org/b"
		if (host_end - host_bgn == 2) return false;

		// rest is whatever follows the host (possibly empty)
		rest_bgn = host_end;
		rest_end = src_end;
		return true;
	}
}
/**
 * Hand-written equivalent of the character class used by MediaWiki's cleanUrl:
 *   [\][<>"\\x00-\\x20\\x7F\|]
 * Every byte in that class is replaced by its Php_urlencode form.
 */
class Xomw_regex_escape_invalid {
	/**
	 * Scans {@code src[src_bgn .. src_end)} and writes an escaped copy to {@code bfr},
	 * but only once the first offending byte is met.
	 *
	 * @return true if something was escaped (bfr then holds the full result);
	 *         false if the input was clean (bfr was not written to)
	 */
	public boolean Escape(Bry_bfr bfr, byte[] src, int src_bgn, int src_end) {
		boolean dirty = false;
		int pending_bgn = src_bgn;	// start of the clean run not yet copied to bfr
		for (int pos = src_bgn; pos != src_end; pos++) {
			byte b = src[pos];
			boolean invalid
				=  (b >= 0 && b <= 32)			// control chars and space
				|| b == Byte_ascii.Brack_bgn
				|| b == Byte_ascii.Brack_end
				|| b == Byte_ascii.Angle_bgn
				|| b == Byte_ascii.Angle_end
				|| b == Byte_ascii.Quote
				|| b == Byte_ascii.Pipe
				|| b == Byte_ascii.Delete;
			if (!invalid) continue;

			// copy the clean run, then the escaped form of the offending byte
			bfr.Add_mid(src, pending_bgn, pos);
			gplx.langs.htmls.encoders.Gfo_url_encoder_.Php_urlencode.Encode(bfr, src, pos, pos + 1);
			dirty = true;
			pending_bgn = pos + 1;
		}
		// the tail is only needed when bfr already holds a partial copy
		if (dirty) {
			bfr.Add_mid(src, pending_bgn, src_end);
		}
		return dirty;
	}
}
// Hand-written matcher for a url-encoded, bracketed IPv6 host; EX: "//%5B0a.1b:12%5D:123"
// After a successful Match:
//   host_bgn / host_end : the address between "%5B" and "%5D"
//   segs_bgn / segs_end : the ":digits" segment(s) after "%5D" (empty range if none)
// The fields are written while scanning, so they hold partial values after a failed Match.
class Xomw_regex_ipv6_brack {
public int host_bgn;
public int host_end;
public int segs_bgn;
public int segs_end;
// url-encoded "[" and "]" delimiters
private final byte[]
Bry__host_bgn = Bry_.new_a7("//%5B")
, Bry__host_end = Bry_.new_a7("%5D")
;
// Returns true if src[src_bgn .. src_end) is "//%5B" + address + "%5D" + zero or more ":digits"
public boolean Match(byte[] src, int src_bgn, int src_end) {
// preg_match('!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $matches)
// XO.MW.REGEX:
// !^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!
// "//%5B" + ("hex-dec" | [:.]) + "%5D" + numbers
// EX: [ABCD]:80:12
// NOTE(review): the PHP pattern allows at most ONE ":digits" group, but the loop at the end of
// this method accepts any number of them (EX: ":1:2:3") -- confirm this is intended
host_bgn = src_bgn + Bry__host_bgn.length;
// exit if no match for "//%5B"
if (!Bry_.Match(src, src_bgn, host_bgn, Bry__host_bgn)) return false;
// skip all [0-9A-Fa-f:.]
host_end = host_bgn;
while (true) {
// exit if eos
if (host_end == src_end) return false;
boolean done = false;
byte b = src[host_end];
switch (b) {
case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4:
case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9:
case Byte_ascii.Ltr_A: case Byte_ascii.Ltr_B: case Byte_ascii.Ltr_C: case Byte_ascii.Ltr_D: case Byte_ascii.Ltr_E: case Byte_ascii.Ltr_F:
case Byte_ascii.Ltr_a: case Byte_ascii.Ltr_b: case Byte_ascii.Ltr_c: case Byte_ascii.Ltr_d: case Byte_ascii.Ltr_e: case Byte_ascii.Ltr_f:
case Byte_ascii.Colon:
case Byte_ascii.Dot:
host_end++;
break;
case Byte_ascii.Percent:
// matches "%5D"
// segs_bgn is set before the check, so it is also overwritten when the check fails
segs_bgn = host_end + Bry__host_end.length;
if ( Bry_.Match(src, host_end, segs_bgn, Bry__host_end)
&& host_end - host_bgn > 0) // host can't be 0-len; EX: "//%5B%5D"
done = true;
// exit if no match
else {
return false;
}
break;
// exit if no match
default: {
return false;
}
}
if (done) break;
}
// skip all (:\d+)
segs_end = segs_bgn;
while (true) {
// stop if eos
if (segs_end == src_end) return true;
// check if ":"
if (src[segs_end] == Byte_ascii.Colon) {
int num_bgn = segs_end + 1;
int num_end = Bry_find_.Find_fwd_while_num(src, num_bgn, src_end);
// exit if no nums found; EX:"[ABC]:80:"
if (num_end == num_bgn) {
return false;
}
segs_end = num_end;
}
// exit if seg doesn't start with ":"
else {
return false;
}
}
}
}
// Callbacks invoked by Xomw_regex_url_char.Replace_by_cbk for each "&" found in the source.
// Each callback writes its output for the reference to bfr.
interface Xomw_regex_url_char_cbk {
// named reference; EX: "&alpha;" -> name = "alpha"; Replace_by_cbk ignores the return value
boolean When_ent(Bry_bfr bfr, byte[] name);
// decimal reference; EX: "&#33;" -> name = "33"; returning false makes Replace_by_cbk treat the "&" as a lone ampersand
boolean When_dec(Bry_bfr bfr, byte[] name);
// hex reference; EX: "&#x23;" -> name = "23"; returning false makes Replace_by_cbk treat the "&" as a lone ampersand
boolean When_hex(Bry_bfr bfr, byte[] name);
// lone "&" that is not part of a reference; Replace_by_cbk ignores the return value
boolean When_amp(Bry_bfr bfr);
}
/**
 * Callbacks for Sanitizer::normalizeCharReferences: valid references are rewritten in a
 * canonical form and anything else has its "&" escaped as "&amp;".
 */
class Xomw_regex_url_char_cbk__normalize implements Xomw_regex_url_char_cbk {
	private final Xomw_sanitizer sanitizer;
	public Xomw_regex_url_char_cbk__normalize(Xomw_sanitizer sanitizer) {
		this.sanitizer = sanitizer;
	}
	/**
	 * XO.MW:normalizeEntity
	 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
	 * return the equivalent numeric entity reference (except for the core &lt;
	 * &gt; &amp; &quot;). If the entity is a MediaWiki-specific alias, returns
	 * the HTML equivalent. Otherwise, returns HTML-escaped text of
	 * pseudo-entity source (eg &amp;foo;)
	 */
	public boolean When_ent(Bry_bfr bfr, byte[] name) {
		Object found = Xomw_sanitizer.html_entities.Get_by_bry(name);
		if (found != null) {
			bfr.Add(((Xomw_html_ent)found).html);
			return true;
		}
		// unknown name: emit the escaped pseudo-entity
		bfr.Add_str_a7("&amp;").Add(name).Add_byte_semic();
		return false;
	}
	/** XO.MW:decCharReference; writes "&#NNN;" for a valid codepoint, else writes nothing and returns false. */
	public boolean When_dec(Bry_bfr bfr, byte[] name) {
		int codepoint = Bry_.To_int_or(name, -1);
		if (!sanitizer.Validate_codepoint(codepoint)) {
			return false;
		}
		bfr.Add_str_a7("&#").Add_int_variable(codepoint).Add_byte_semic();
		return true;
	}
	/** XO.MW:hexCharReference; writes "&#xHHH;" for a valid codepoint, else writes nothing and returns false. */
	public boolean When_hex(Bry_bfr bfr, byte[] name) {
		int codepoint = Hex_utl_.Parse_or(name, -1);
		if (!sanitizer.Validate_codepoint(codepoint)) {
			return false;
		}
		bfr.Add_str_a7("&#x");
		Hex_utl_.Write_bfr(bfr, Bool_.Y, codepoint); // sprintf('&#x%x;', $point)
		bfr.Add_byte_semic();
		return true;
	}
	/** Lone ampersand: transform "&" to "&amp;" */
	public boolean When_amp(Bry_bfr bfr) {
		bfr.Add(Gfh_entity_.Amp_bry);
		return true;
	}
}
/**
 * Callbacks for Sanitizer::decodeCharReferences: references are replaced by the UTF-8 bytes
 * of the character they denote; a lone "&" is copied through unchanged.
 */
class Xomw_regex_url_char_cbk__decode implements Xomw_regex_url_char_cbk {
	private final Xomw_sanitizer sanitizer;
	public Xomw_regex_url_char_cbk__decode(Xomw_sanitizer sanitizer) {
		this.sanitizer = sanitizer;
	}
	/**
	 * XO.MW:decodeEntity
	 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
	 * return the UTF-8 encoding of that character. Otherwise, returns
	 * pseudo-entity source (eg "&foo;")
	 */
	public boolean When_ent(Bry_bfr bfr, byte[] name) {
		Object o = Xomw_sanitizer.html_entities.Get_by_bry(name);
		if (o == null) {
			bfr.Add_byte(Byte_ascii.Amp).Add(name).Add_byte_semic();
		}
		else {
			Xomw_html_ent entity = (Xomw_html_ent)o;
			bfr.Add(gplx.core.intls.Utf16_.Encode_int_to_bry(entity.code));
		}
		return true;
	}
	/** Decimal reference. An unparsable number becomes -1, the same "invalid" sentinel the hex path uses. */
	public boolean When_dec(Bry_bfr bfr, byte[] name) {
		return Decode_char(bfr, Bry_.To_int_or(name, -1));
	}
	/** Hex reference. An unparsable number becomes -1. */
	public boolean When_hex(Bry_bfr bfr, byte[] name) {
		return Decode_char(bfr, gplx.core.encoders.Hex_utl_.Parse_or(name, 0, name.length, -1));
	}
	/** Lone ampersand: copied as-is. */
	public boolean When_amp(Bry_bfr bfr) {
		bfr.Add_byte(Byte_ascii.Amp);
		return true;
	}
	/**
	 * XO.MW:decodeChar
	 * Return UTF-8 String for a codepoint if that is a valid
	 * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
	 */
	private boolean Decode_char(Bry_bfr bfr, int point) {
		if (sanitizer.Validate_codepoint(point)) {
			bfr.Add(gplx.core.intls.Utf16_.Encode_int_to_bry(point));
		}
		else {
			bfr.Add(Utf8_replacement_char);
		}
		return true;
	}
	// U+FFFD encoded as UTF-8 is the three bytes EF BF BD (PHP: "\xef\xbf\xbd");
	// the two raw bytes FF FD are not valid UTF-8
	private static final byte[] Utf8_replacement_char = Bry_.New_by_ints(239, 191, 189);
}
/**
 * Hand-written scanner for the character-reference pattern shared by
 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences:
 *
 *   static final CHAR_REFS_REGEX =
 *     '/&([A-Za-z0-9\x80-\xff]+);
 *      |&\#([0-9]+);
 *      |&\#[xX]([0-9A-Fa-f]+);
 *      |(&)/x';
 *
 * What is written for each match is decided by a Xomw_regex_url_char_cbk.
 */
class Xomw_regex_url_char {
	// Per-byte membership tables (indexed by unsigned byte value) for the three name classes.
	// Built once at class load, so no locking is needed.
	private static final boolean[] Normalize__dec = Bool_ary_bldr.New_u8()
		.Set_rng(Byte_ascii.Num_0, Byte_ascii.Num_9)
		.To_ary();
	// hex digits only: [0-9A-Fa-f]
	private static final boolean[] Normalize__hex = Bool_ary_bldr.New_u8()
		.Set_rng(Byte_ascii.Num_0, Byte_ascii.Num_9)
		.Set_rng(Byte_ascii.Ltr_A, Byte_ascii.Ltr_F)
		.Set_rng(Byte_ascii.Ltr_a, Byte_ascii.Ltr_f)
		.To_ary();
	private static final boolean[] Normalize__ent = Bool_ary_bldr.New_u8()
		.Set_rng(Byte_ascii.Num_0, Byte_ascii.Num_9)
		.Set_rng(Byte_ascii.Ltr_A, Byte_ascii.Ltr_Z)
		.Set_rng(Byte_ascii.Ltr_a, Byte_ascii.Ltr_z)
		.Set_rng(128, 255)
		.To_ary();
	public Xomw_regex_url_char() {
	}
	/**
	 * Scans {@code src[src_bgn .. src_end)} for "&" and lets {@code cbk} write the replacement
	 * for every reference or lone ampersand.
	 *
	 * @param bfr      output buffer, or null to have the result returned as a new byte array
	 * @param lone_bfr when bfr is given and the input holds no "&": true copies the input to bfr, false writes nothing
	 * @param src      source bytes
	 * @param src_bgn  start of the range to scan (inclusive)
	 * @param src_end  end of the range to scan (exclusive)
	 * @param cbk      callbacks that produce the output for each match
	 * @return with bfr == null: the resulting bytes (src itself when unchanged and the range covers all of src);
	 *         with bfr != null: Bry_.Empty when something was written for a match, null otherwise
	 */
	public byte[] Replace_by_cbk(Bry_bfr bfr, boolean lone_bfr, byte[] src, int src_bgn, int src_end, Xomw_regex_url_char_cbk cbk) {
		// XO.BRY_BFR
		boolean dirty = false;
		int cur = src_bgn;
		boolean called_by_bry = bfr == null;
		while (true) {
			// search for "&"; bounded by src_end so that bytes outside the requested range are never examined
			int find_bgn = Bry_find_.Find_fwd(src, Byte_ascii.Amp, cur, src_end);
			if (find_bgn == Bry_find_.Not_found) {	// "&" not found; exit
				if (dirty)
					bfr.Add_mid(src, cur, src_end);
				break;
			}
			int ent_bgn = find_bgn + 1;	// +1 to skip &

			// get regex; (a) dec (&#09;); (b) hex (&#xFF;); (c) entity (&alpha;);
			boolean[] regex = null;
			// check for #;
			if (ent_bgn < src_end && src[ent_bgn] == Byte_ascii.Hash) {
				ent_bgn++;
				if (ent_bgn < src_end) {
					byte nxt = src[ent_bgn];
					// check for x
					if (nxt == Byte_ascii.Ltr_X || nxt == Byte_ascii.Ltr_x) {
						ent_bgn++;
						regex = Normalize__hex;
					}
				}
				if (regex == null)
					regex = Normalize__dec;
			}
			else {
				regex = Normalize__ent;
			}

			// keep looping until invalid regex; b ends up as the first byte that is not part of the name
			int ent_end = ent_bgn;
			int b = Byte_ascii.Null;
			for (int i = ent_bgn; i < src_end; i++) {
				b = src[i] & 0xFF;	// PATCH.JAVA:need to convert to unsigned byte
				if (regex[b])
					ent_end++;
				else
					break;
			}

			// mark dirty; can optimize later by checking if "&lt;" already exists
			dirty = true;
			if (bfr == null) bfr = Bry_bfr_.New();
			bfr.Add_mid(src, cur, find_bgn);	// add everything before &

			// invalid <- regex ended, but not at semic;
			// also invalid if the name is empty, since every alternative of the pattern requires "+"; EX: "&;" "&#;" "&#x;"
			if (b != Byte_ascii.Semic || ent_end == ent_bgn) {
				cbk.When_amp(bfr);
				cur = find_bgn + 1;	// position after "&"
				continue;
			}

			// do normalization
			byte[] name = Bry_.Mid(src, ent_bgn, ent_end);
			boolean ret = false;
			if (regex == Normalize__ent) {
				cbk.When_ent(bfr, name);	// named refs always count as handled
				ret = true;
			}
			else if (regex == Normalize__dec) {
				ret = cbk.When_dec(bfr, name);
			}
			else if (regex == Normalize__hex) {
				ret = cbk.When_hex(bfr, name);
			}
			if (!ret) {
				// callback rejected the reference: treat "&" as a lone ampersand and rescan what follows
				cbk.When_amp(bfr);
				cur = find_bgn + 1;	// position after "&"
				continue;
			}

			cur = ent_end + 1;	// +1 to position after ";"
		}

		// XO.BRY_BFR
		if (dirty) {
			if (called_by_bry)
				return bfr.To_bry_and_clear();
			else
				return Bry_.Empty;
		}
		else {
			if (called_by_bry) {
				if (src_bgn == 0 && src_end == src.length)
					return src;
				else
					return Bry_.Mid(src, src_bgn, src_end);
			}
			else {
				if (lone_bfr)
					bfr.Add_mid(src, src_bgn, src_end);
				return null;
			}
		}
	}
}

@ -19,19 +19,94 @@ package gplx.xowa.mws; import gplx.*; import gplx.xowa.*;
import org.junit.*; import gplx.core.tests.*; import gplx.core.btries.*;
public class Xomw_sanitizer__tst {
private final Xomw_sanitizer__fxt fxt = new Xomw_sanitizer__fxt();
@Test public void Text() {fxt.Test__normalize_char_references("abc" , "abc");}
@Test public void Dec() {fxt.Test__normalize_char_references("&#08;" , "&amp;#08;");}
@Test public void Dec__invalid() {fxt.Test__normalize_char_references("&#09;" , "&#9;");}
@Test public void Hex() {fxt.Test__normalize_char_references("&#xFF;" , "&#xff;");}
@Test public void Entity() {fxt.Test__normalize_char_references("&alpha;" , "&#945;");}
@Test public void Entity__lt() {fxt.Test__normalize_char_references("&lt;" , "&lt;");}
@Test public void Invalid() {fxt.Test__normalize_char_references("&(invalid);" , "&amp;(invalid);");}
@Test public void Many() {
@Test public void Normalize__text() {fxt.Test__normalize_char_references("abc" , "abc");}
@Test public void Normalize__dec() {fxt.Test__normalize_char_references("&#08;" , "&amp;#08;");}
@Test public void Normalize__dec__invalid() {fxt.Test__normalize_char_references("&#09;" , "&#9;");}
@Test public void Normalize__hex() {fxt.Test__normalize_char_references("&#xFF;" , "&#xff;");}
@Test public void Normalize__entity() {fxt.Test__normalize_char_references("&alpha;" , "&#945;");}
@Test public void Normalize__entity__lt() {fxt.Test__normalize_char_references("&lt;" , "&lt;");}
@Test public void Normalize__entity__alias() {fxt.Test__normalize_char_references("&רלמ;" , "&rlm;");}
@Test public void Normalize__amp() {fxt.Test__normalize_char_references("a&b" , "a&amp;b");}
@Test public void Normalize__invalid() {fxt.Test__normalize_char_references("&(invalid);" , "&amp;(invalid);");}
@Test public void Normalize__many() {
fxt.Test__normalize_char_references
( "a &#09; b &alpha; c &#xFF; d &(invalid); e"
, "a &#9; b &#945; c &#xff; d &amp;(invalid); e"
);
}
// Checks Xomw_regex_find_domain: the "_y" cases must match and split into protocol / host / rest;
// the "_n" cases must not match.
@Test public void Regex__domain() {
Xomw_regex_find_domain regex_domain = new Xomw_regex_find_domain();
// normal
fxt.Test__regex_domain_y(regex_domain, "https://a.org/bcd", "https:", "//a.org", "/bcd");
// trailing slash
fxt.Test__regex_domain_y(regex_domain, "https://a.org/", "https:", "//a.org", "/");
// domain only
fxt.Test__regex_domain_y(regex_domain, "https://a.org", "https:", "//a.org", "");
// colon not found
fxt.Test__regex_domain_n(regex_domain, "https//a.org/bcd");
// nothing after the protocol
fxt.Test__regex_domain_n(regex_domain, "https:");
// protocol not followed by "//"
fxt.Test__regex_domain_n(regex_domain, "https:a//");
// "///": empty host
fxt.Test__regex_domain_n(regex_domain, "https:///a.org/b");
}
@Test public void Regex__clean_url() {
Xomw_regex_escape_invalid regex = new Xomw_regex_escape_invalid();
// noop
fxt.Test__regex_escape_invalid(regex, "https://a.org/bcd", Bool_.N, "");
// symbols
fxt.Test__regex_escape_invalid(regex, "[]<>\"|", Bool_.Y, "%5B%5D%3C%3E%22%7C%7F");
// range: 00 - 32
fxt.Test__regex_escape_invalid(regex, "\t\n ", Bool_.Y, "%09%0A+");
}
// Checks Xomw_regex_ipv6_brack against url-encoded bracketed hosts; Bool_.Y = must match, Bool_.N = must not match.
@Test public void Regex__ipv6_brack() {
Xomw_regex_ipv6_brack regex = new Xomw_regex_ipv6_brack();
// basic
fxt.Test__regex_ipv6_brack(regex, Bool_.Y, "//%5B0a.1b:12%5D:123");
// port: none
fxt.Test__regex_ipv6_brack(regex, Bool_.Y, "//%5Ba%5D");
// port: multiple
// NOTE(review): the PHP pattern quoted in Xomw_regex_ipv6_brack allows only one ":digits" group -- confirm that accepting several is intended
fxt.Test__regex_ipv6_brack(regex, Bool_.Y, "//%5Ba%5D:1:2:3");
// "//%5B" missing
fxt.Test__regex_ipv6_brack(regex, Bool_.N, "abc");
// ipv6: invalid
fxt.Test__regex_ipv6_brack(regex, Bool_.N, "//%5Ba!%5D:1");
// ipv6: 0-len
fxt.Test__regex_ipv6_brack(regex, Bool_.N, "//%5B%5D:1");
// port: invalid
fxt.Test__regex_ipv6_brack(regex, Bool_.N, "//%5Ba%5D:a");
// port: 0-len
fxt.Test__regex_ipv6_brack(regex, Bool_.N, "//%5Ba%5D:");
}
// Exercises Decode_char_references: valid references become the actual char; everything else is left alone.
@Test public void Decode() {
	// dec
	fxt.Test__decode_char_references("&#33;"                , "!");
	// hex
	fxt.Test__decode_char_references("&#x23;"               , "#");
	// entity
	fxt.Test__decode_char_references("&alpha;"              , "α");
	// entity:lt
	fxt.Test__decode_char_references("&lt;"                 , "<");
	// entity:rlm; alias decodes to RIGHT-TO-LEFT MARK (code point 200F), which is invisible; written as an explicit escape so the expectation cannot silently degrade to an empty String
	fxt.Test__decode_char_references("&רלמ;"                , "\u200F");
	// entity:invalid
	fxt.Test__decode_char_references("&invalid;"            , "&invalid;");
	// amp
	fxt.Test__decode_char_references("a&b"                  , "a&b");
}
// Exercises Clean_url end-to-end: entity decoding, escaping of invalid chars, host clean-up and bracketed ipv6 hosts.
@Test public void Clean_url() {
	// entity
	fxt.Test__clean_url("http://a.org/b&amp;c"              , "http://a.org/b&c");
	// entity: escape
	fxt.Test__clean_url("http://a.org/b&quot;c"             , "http://a.org/b%22c");
	// domain=n; make sure &quot; is changed, but not soft-hyphen; soft-hyphen (code point 00AD) is invisible, so it is written as an explicit escape
	fxt.Test__clean_url("a&quot;\u00ADz"                    , "a%22\u00ADz");
	// host: invalid idn; char is removed from the host, but kept in the path
	fxt.Test__clean_url("http://a᠆b.org/c᠆d"                , "http://ab.org/c᠆d");
	// ipv6_brack; brackets are expected to survive unescaped
	fxt.Test__clean_url("http://[0a.1b:12]:123/cd"          , "http://[0a.1b:12]:123/cd");
}
}
class Xomw_sanitizer__fxt {
private final Xomw_sanitizer sanitizer = new Xomw_sanitizer();
@ -41,4 +116,33 @@ class Xomw_sanitizer__fxt {
sanitizer.Normalize_char_references(tmp, Bool_.Y, src_bry, 0, src_bry.length);
Gftest.Eq__str(expd, tmp.To_str_and_clear());
}
// Asserts that regex_domain matches src_str, and that the prot / host / rest spans it reports equal the expected Strings.
public void Test__regex_domain_y(Xomw_regex_find_domain regex_domain, String src_str, String expd_prot, String expd_host, String expd_rest) {
	byte[] src = Bry_.new_u8(src_str);
	int src_len = src.length;

	// must match before any of the spans are inspected
	boolean matched = regex_domain.Match(src, 0, src_len);
	Gftest.Eq__bool(true, matched, src_str);

	// each span is cut out of the original bytes with the bgn / end positions set by Match
	Gftest.Eq__str(expd_prot, Bry_.Mid(src, regex_domain.prot_bgn, regex_domain.prot_end));
	Gftest.Eq__str(expd_host, Bry_.Mid(src, regex_domain.host_bgn, regex_domain.host_end));
	Gftest.Eq__str(expd_rest, Bry_.Mid(src, regex_domain.rest_bgn, regex_domain.rest_end));
}
// Asserts that regex_domain does NOT match src_str; src_str is passed along as the failure message.
public void Test__regex_domain_n(Xomw_regex_find_domain regex_domain, String src_str) {
	byte[] src = Bry_.new_u8(src_str);
	boolean matched = regex_domain.Match(src, 0, src.length);
	Gftest.Eq__bool(false, matched, src_str);
}
// Runs regex.Escape over src_str into the shared tmp bfr; checks both the returned flag and the bytes written.
public void Test__regex_escape_invalid(Xomw_regex_escape_invalid regex, String src_str, boolean expd_rslt, String expd_str) {
	byte[] src = Bry_.new_u8(src_str);
	boolean actl_rslt = regex.Escape(tmp, src, 0, src.length);
	Gftest.Eq__bool(expd_rslt, actl_rslt);
	// tmp is cleared here so that the next call starts with an empty bfr
	Gftest.Eq__str(expd_str, tmp.To_bry_and_clear());
}
// Asserts that regex.Match over all of src_str returns expd_rslt.
public void Test__regex_ipv6_brack(Xomw_regex_ipv6_brack regex, boolean expd_rslt, String src_str) {
	byte[] src = Bry_.new_u8(src_str);
	boolean actl_rslt = regex.Match(src, 0, src.length);
	Gftest.Eq__bool(expd_rslt, actl_rslt);
}
// Decodes all of src_str into the shared tmp bfr and compares the resulting String with expd.
public void Test__decode_char_references(String src_str, String expd) {
	byte[] src = Bry_.new_u8(src_str);
	int src_len = src.length;
	sanitizer.Decode_char_references(tmp, Bool_.Y, src, 0, src_len);
	// tmp is cleared here so that the next call starts with an empty bfr
	Gftest.Eq__str(expd, tmp.To_str_and_clear());
}
// Passes src_str (as utf8 bytes) through sanitizer.Clean_url and compares the result with expd.
public void Test__clean_url(String src_str, String expd) {
	byte[] src = Bry_.new_u8(src_str);
	Gftest.Eq__str(expd, sanitizer.Clean_url(src));
}
}

@ -29,7 +29,7 @@ public class Xomw_parser {
private final Xomw_nbsp_wkr nbsp_wkr = new Xomw_nbsp_wkr();
private final Xomw_block_level_pass block_wkr = new Xomw_block_level_pass();
private final Xomw_heading_wkr heading_wkr = new Xomw_heading_wkr();
private final Xomw_magiclinks_wkr magiclinks_wkr = new Xomw_magiclinks_wkr();
private final Xomw_magiclinks_wkr magiclinks_wkr;
private final Xomw_doubleunder_wkr doubleunder_wkr = new Xomw_doubleunder_wkr();
private final Xomw_link_renderer link_renderer = new Xomw_link_renderer();
private final Xomw_link_holders holders;
@ -50,13 +50,6 @@ public class Xomw_parser {
public Xomw_lnki_wkr Lnki_wkr() {return lnki_wkr;} private final Xomw_lnki_wkr lnki_wkr;
public boolean Output_type__wiki() {return output_type__wiki;} private final boolean output_type__wiki = false;
public Xomw_parser() {
this.protocols_trie = Xomw_parser.Protocols__dflt();
this.holders = new Xomw_link_holders(link_renderer, tmp);
this.table_wkr = new Xomw_table_wkr(this);
this.quote_wkr = new Xomw_quote_wkr(this);
this.lnke_wkr = new Xomw_lnke_wkr(this);
this.lnki_wkr = new Xomw_lnki_wkr(this, holders, link_renderer, protocols_trie);
this.heading_wkr_cbk = new Xomw_heading_cbk__html();
if (regex_space == null) {
synchronized (Type_adp_.ClassOf_obj(this)) {
regex_space = new Xomw_regex_space();
@ -64,13 +57,22 @@ public class Xomw_parser {
regex_url = new Xomw_regex_url(regex_space);
}
}
this.protocols_trie = Xomw_parser.Protocols__dflt();
this.holders = new Xomw_link_holders(link_renderer, tmp);
this.table_wkr = new Xomw_table_wkr(this);
this.quote_wkr = new Xomw_quote_wkr(this);
this.lnke_wkr = new Xomw_lnke_wkr(this);
this.lnki_wkr = new Xomw_lnki_wkr(this, holders, link_renderer, protocols_trie);
this.heading_wkr_cbk = new Xomw_heading_cbk__html();
this.magiclinks_wkr = new Xomw_magiclinks_wkr(sanitizer, linker, regex_boundary, regex_url);
}
public void Init_by_wiki(Xowe_wiki wiki) {
linker.Init_by_wiki(wiki.Lang().Lnki_trail_mgr().Trie());
lnke_wkr.Init_by_wiki(protocols_trie, regex_url, regex_space);
lnki_wkr.Init_by_wiki(wiki);
magiclinks_wkr.Init_by_wiki(linker, regex_boundary, regex_url);
doubleunder_wkr.Init_by_wiki(doubleunder_data, wiki.Lang());
magiclinks_wkr.Init_by_wiki();
}
public void Init_by_page(Xoa_ttl ttl) {
pctx.Init_by_page(ttl);
@ -115,7 +117,7 @@ public class Xomw_parser {
table_wkr.Do_table_stuff(pctx, pbfr);
hr_wkr.Replace_hrs(pctx, pbfr);
doubleunder_wkr.Do_double_underscore(pctx, pbfr);
doubleunder_wkr.Do_double_underscore(pctx, pbfr); // DONE: DATE:2017-01-27
heading_wkr.Do_headings(pctx, pbfr, heading_wkr_cbk);
lnki_wkr.Replace_internal_links(pctx, pbfr);

@ -0,0 +1,101 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
/**
 * Collects byte sequences described in a small, regex-like notation.
 *
 * Each item is a String of ASCII chars and / or escapes:
 *   "\s"   : one space
 *   "\xHH" : one byte given as two hex digits; several in a row form a utf8 sequence; EX: "\xc2\xad" -> new byte[] {(byte)194, (byte)173}
 * Items are added with Add_ary (one entry per item) or Add_rng (one entry per value between bgn and end, inclusive).
 * The accumulated entries are available through Rslt.
 */
public class Xomw_regex_parser {
	private Bry_bfr tmp;	// scratch bfr for compiled items; created on first use
	/** Returns everything added so far, in order of addition; null if nothing has been added yet. */
	public byte[][] Rslt() {return rslt;} private byte[][] rslt;
	/** Compiles each item of ary and appends the results; returns this for chaining. */
	public Xomw_regex_parser Add_ary(String... ary) {return Set_or_add(Parse_ary(ary));}
	private byte[][] Parse_ary(String... ary) {
		Bry_bfr bfr = Tmp();
		int ary_len = ary.length;
		byte[][] rv = new byte[ary_len][];
		for (int i = 0; i < ary_len; i++) {
			rv[i] = Compile_itm(bfr, Bry_.new_u8(ary[i]));
		}
		return rv;
	}
	/**
	 * Compiles bgn and end, decodes each to an int, and appends one encoded entry for every value from bgn to end (inclusive).
	 * Fails if either item is empty or if end decodes to a smaller value than bgn.
	 */
	public Xomw_regex_parser Add_rng(String bgn, String end) {return Set_or_add(Parse_rng(bgn, end));}
	private byte[][] Parse_rng(String bgn, String end) {
		Bry_bfr bfr = Tmp();
		int bgn_val = Decode_itm(bfr, bgn);
		int end_val = Decode_itm(bfr, end);
		// reversed range would otherwise surface as a NegativeArraySizeException (or as a silently empty range)
		if (end_val < bgn_val) throw Err_.new_wo_type("regex range failed: end is less than bgn", "bgn", bgn, "end", end);
		int rv_len = end_val - bgn_val + 1;
		byte[][] rv = new byte[rv_len][];
		for (int i = 0; i < rv_len; i++) {
			rv[i] = gplx.core.intls.Utf16_.Encode_int_to_bry(i + bgn_val);
		}
		return rv;
	}
	// compiles one range endpoint and decodes its first char to an int (presumably the code point; see Utf16_.Decode_to_int)
	private static int Decode_itm(Bry_bfr bfr, String itm) {
		byte[] bry = Compile_itm(bfr, Bry_.new_u8(itm));
		if (bry.length == 0) throw Err_.new_wo_type("regex range failed: item is empty", "itm", itm);
		return gplx.core.intls.Utf16_.Decode_to_int(bry, 0);
	}
	private Bry_bfr Tmp() {
		if (tmp == null) tmp = Bry_bfr_.New();
		return tmp;
	}
	private Xomw_regex_parser Set_or_add(byte[][] val) {
		rslt = rslt == null ? val : Bry_.Ary_add(rslt, val);
		return this;
	}
	/**
	 * Converts one item to its bytes.
	 * Returns src itself when it has no escapes; else, returns a new array with each escape replaced by its byte.
	 * Fails on a trailing backslash, an unknown escape, an incomplete or invalid hex pair, and on any non-ascii byte.
	 */
	private static byte[] Compile_itm(Bry_bfr tmp, byte[] src) {
		int src_end = src.length;
		int cur = 0;
		int prv = 0;			// start of the literal bytes that have not been copied to tmp yet
		boolean dirty = false;	// true once at least one escape has been replaced
		while (cur < src_end) {
			byte b = src[cur];
			if (b != Byte_ascii.Backslash) {
				// handles ascii only; PATCH.JAVA: byte is signed, so it must be masked before comparing to 127 ("b > 127" is never true)
				if ((b & 0xFF) > 127)
					throw Err_.new_wo_type("regex compiled failed: unknown char", "src", src, "pos", cur);
				cur++;
				continue;
			}

			// escape: work out the replacement byte and where the escape ends
			int nxt = cur + 1;
			if (nxt >= src_end) throw Err_.new_wo_type("regex escape failed: no more chars left", "src", src, "pos", nxt);
			byte esc_val;
			int esc_end;
			switch (src[nxt]) {
				case Byte_ascii.Ltr_s:	// \s -> " "
					esc_val = Byte_ascii.Space;
					esc_end = nxt + 1;
					break;
				case Byte_ascii.Ltr_x:	// \xHH -> one byte of a utf8 sequence; EX: "\xc2\xad" -> new byte[] {(byte)194, (byte)173}
					int hex_bgn = nxt + 1;
					int hex_end = hex_bgn + 2;
					if (hex_end > src_end) throw Err_.new_wo_type("utf8 escape failed: no more chars left", "src", src, "pos", hex_bgn);
					int hex_val = gplx.core.encoders.Hex_utl_.Parse_or(src, hex_bgn, hex_end, -1);
					// -1 is the "or" value passed above; w/o this check it would be cast to 0xFF and added silently
					if (hex_val == -1) throw Err_.new_wo_type("utf8 escape failed: invalid hex", "src", src, "pos", hex_bgn);
					esc_val = (byte)hex_val;
					esc_end = hex_end;
					break;
				default:
					throw Err_.new_wo_type("regex escape failed: unknown char", "src", src, "pos", nxt);
			}

			// copy any literal bytes between the previous escape and this one; otherwise they would be dropped
			if (cur > prv) tmp.Add_mid(src, prv, cur);
			tmp.Add_byte(esc_val);
			dirty = true;
			cur = esc_end;
			prv = cur;
		}

		// no escapes: src is already the compiled form
		if (!dirty) return src;

		// copy literal bytes after the last escape
		if (src_end > prv) tmp.Add_mid(src, prv, src_end);
		return tmp.To_bry_and_clear();
	}
}

@ -0,0 +1,42 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
import org.junit.*; import gplx.core.tests.*;
// Tests for Xomw_regex_parser: item compilation (Add_ary) and range expansion (Add_rng).
public class Xomw_regex_parser__tst {
	private final Xomw_regex_parser__fxt fxt = new Xomw_regex_parser__fxt();
	// "\s" compiles to one space
	@Test public void Ary__space() {
		fxt.Test__parse_ary(String_.Ary("\\s"), String_.Ary(" "));
	}
	// "\xHH" sequences compile to the utf8 bytes of one char; c2 a7 is "§" and e0 b9 90 is "๐" (code point 0E50)
	@Test public void Ary__utf8() {
		fxt.Test__parse_ary(String_.Ary("\\xc2\\xa7", "\\xe0\\xb9\\x90"), String_.Ary("§", "๐"));
	}
	// a range yields one entry per value between bgn and end, inclusive
	@Test public void Rng__ascii() {
		fxt.Test__parse_rng("a", "c", String_.Ary("a", "b", "c"));
	}
}
// Fixture for Xomw_regex_parser__tst; note that the parser accumulates, so Rslt holds everything added through this fixture instance.
class Xomw_regex_parser__fxt {
	private final Xomw_regex_parser parser = new Xomw_regex_parser();
	// Adds ary as individual items and compares the accumulated result (as Strings) with expd.
	public void Test__parse_ary(String[] ary, String[] expd) {
		parser.Add_ary(ary);
		Gftest.Eq__ary(expd, String_.Ary(parser.Rslt()));
	}
	// Adds the range bgn - end and compares the accumulated result (as Strings) with expd.
	public void Test__parse_rng(String bgn, String end, String[] expd) {
		// use the arguments; they were previously ignored in favor of hard-coded "a", "c"
		parser.Add_rng(bgn, end);
		Gftest.Eq__ary(expd, String_.Ary(parser.Rslt()));
	}
}

@ -20,6 +20,7 @@ import gplx.core.btries.*;
public class Xomw_regex_url {
private final Btrie_slim_mgr trie;
public Xomw_regex_url(Xomw_regex_space regex_space) {
// [^][<>"\\x00-\\x20\\x7F\|]
// REGEX:[^][<>"\\x00-\\x20\\x7F\p{Zs}]; NOTE: val is just a marker
this.trie = Btrie_slim_mgr.cs();
trie.Add_str_byte__many(Byte_.Zero, "[", "]", "<", ">", "\"");

@ -19,28 +19,40 @@ package gplx.xowa.mws.parsers.magiclinks; import gplx.*; import gplx.xowa.*; imp
import gplx.core.primitives.*; import gplx.core.btries.*; import gplx.core.net.*;
import gplx.langs.phps.utls.*; import gplx.xowa.mws.htmls.*;
import gplx.langs.regxs.*;
// TODO.XO: getExternalLinkAttribs($url)
// TODO.XO: this->getConverterLanguage()->markNoConversion($url, true),
public class Xomw_magiclinks_wkr {
private final Btrie_slim_mgr regex_trie = Btrie_slim_mgr.ci_a7(); // NOTE: must be ci to handle protocols; EX: "https:" and "HTTPS:"
private final Btrie_rv trv = new Btrie_rv();
private static byte[] Tag__anch__rhs;
private boolean[] url_separators;
private static Xomw_regex_link_interrupt regex_link_interrupt;
private Xomw_regex_boundary regex_boundary;
private Xomw_regex_url regex_url;
private Xomw_linker linker;
private final Xomw_regex_boundary regex_boundary;
private final Xomw_regex_url regex_url;
private final Xomw_sanitizer sanitizer;
private final Xomw_linker linker;
private byte[] page_title;
private static final byte Regex__anch = 1, Regex__elem = 2, Regex__free = 3;
public Xomw_magiclinks_wkr() {
public Xomw_magiclinks_wkr(Xomw_sanitizer sanitizer, Xomw_linker linker, Xomw_regex_boundary regex_boundary, Xomw_regex_url regex_url) {
this.sanitizer = sanitizer;
this.linker = linker;
this.regex_boundary = regex_boundary;
this.regex_url = regex_url;
// ',;\.:!?'
url_separators = Bool_ary_bldr.New_u8()
.Set_many(Byte_ascii.Comma,Byte_ascii.Semic, Byte_ascii.Dot, Byte_ascii.Colon, Byte_ascii.Bang, Byte_ascii.Question)
.To_ary();
if (Tag__anch__rhs == null) {
synchronized (Type_adp_.ClassOf_obj(this)) {
Tag__anch__rhs = Bry_.new_a7("</a>");
regex_link_interrupt = new Xomw_regex_link_interrupt();
}
}
}
public void Init_by_wiki(Xomw_linker linker, Xomw_regex_boundary regex_boundary, Xomw_regex_url regex_url) {
this.linker = linker;
this.regex_boundary = regex_boundary;
this.regex_url = regex_url;
public void Init_by_wiki() {
regex_trie.Add_str_byte("<a", Regex__anch);
regex_trie.Add_str_byte("<" , Regex__elem);
@ -50,13 +62,6 @@ public class Xomw_magiclinks_wkr {
Gfo_protocol_itm itm = protocol_ary[i];
regex_trie.Add_bry_byte(itm.Text_bry(), Regex__free);
}
if (Tag__anch__rhs == null) {
synchronized (Type_adp_.ClassOf_obj(this)) {
Tag__anch__rhs = Bry_.new_a7("</a>");
regex_link_interrupt = new Xomw_regex_link_interrupt();
}
}
}
// Replace special strings like "ISBN xxx" and "RFC xxx" with
@ -247,7 +252,7 @@ public class Xomw_magiclinks_wkr {
return;
}
// $url = Sanitizer::cleanUrl($url);
url = sanitizer.Clean_url(url);
// XO.MW.UNSUPPORTED.NON-WMF: not supporting images from freefrom url; (EX: "http://a.org/image.png" -> "<img>"); haven't seen this used on WMF wikis
// Is this an external image?

@ -34,10 +34,8 @@ public class Xomw_magiclinks_wkr__tst {
fxt.Test__parse("a https://&lt; z" , "a https://&lt; z");
}
@Test public void Interrupt__hex_dec() {// implementation specific test for mixed hex / dec
// hex-dec
fxt.Test__parse("a https://b.org&#x60;z" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;#x60;z'>https://b.org&amp;#x60;z</a>");
// dec-hex
fxt.Test__parse("a https://b.org&#3c;z" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;#3c;z'>https://b.org&amp;#3c;z</a>");
fxt.Test__parse("a https://b.org&#3c;z" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;#3c;z'>https://b.org&amp;#3c;z</a>");
}
@Test public void Separator() {
// basic; ,;.:!?
@ -50,10 +48,10 @@ public class Xomw_magiclinks_wkr__tst {
fxt.Test__parse("a https://b.org;.:!? z" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>;.:!? z");
// ";" included b/c of ent
fxt.Test__parse("a https://b.org&abc;.:!? z" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;abc;'>https://b.org&amp;abc;</a>.:!? z");
// ";" included b/c of hex
fxt.Test__parse("a https://b.org&#xB1;.:!? z", "a <a class='external free' rel='nofollow' href='https://b.org&amp;#xB1;'>https://b.org&amp;#xB1;</a>.:!? z");
// ";" included b/c of dec
fxt.Test__parse("a https://b.org&#123;.:!? z", "a <a class='external free' rel='nofollow' href='https://b.org&amp;#123;'>https://b.org&amp;#123;</a>.:!? z");
// ";" included b/c of hex; note that Clean_url changes "&#xB1;" to "±"
fxt.Test__parse("a https://b.org&#xB1;.:!? z", "a <a class='external free' rel='nofollow' href='https://b.org±'>https://b.org±</a>.:!? z");
// ";" included b/c of dec; note that Clean_url changes "&#123;" to "{"
fxt.Test__parse("a https://b.org&#123;.:!? z", "a <a class='external free' rel='nofollow' href='https://b.org{'>https://b.org{</a>.:!? z");
// ";" excluded b/c of invalid.ent
fxt.Test__parse("a https://b.org&a1b;.:!? z" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;a1b'>https://b.org&amp;a1b</a>;.:!? z");
// ";" excluded b/c of invalid.hex
@ -63,9 +61,13 @@ public class Xomw_magiclinks_wkr__tst {
// num_post_proto rule
fxt.Test__parse("a https://.:!? z" , "a https://.:!? z");
}
@Test public void Clean_url() {
// basic
fxt.Test__parse("http://a᠆b.org/c᠆d" , "<a class='external free' rel='nofollow' href='http://ab.org/c᠆d'>http://ab.org/c᠆d</a>");
}
}
class Xomw_magiclinks_wkr__fxt {
private final Xomw_magiclinks_wkr wkr = new Xomw_magiclinks_wkr();
private final Xomw_magiclinks_wkr wkr;
private final Xomw_parser_ctx pctx = new Xomw_parser_ctx();
private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr();
public Xomw_magiclinks_wkr__fxt() {
@ -74,7 +76,8 @@ class Xomw_magiclinks_wkr__fxt {
Xomw_regex_space regex_space = new Xomw_regex_space();
pctx.Init_by_page(wiki.Ttl_parse(Bry_.new_a7("Page_1")));
wkr.Init_by_wiki(new Xomw_linker(), new Xomw_regex_boundary(regex_space), new Xomw_regex_url(regex_space));
this.wkr = new Xomw_magiclinks_wkr(new Xomw_sanitizer(), new Xomw_linker(), new Xomw_regex_boundary(regex_space), new Xomw_regex_url(regex_space));
wkr.Init_by_wiki();
}
public void Test__parse(String src_str, String expd) {Test__parse(Bool_.Y, src_str, expd);}
public void Test__parse(boolean apos, String src_str, String expd) {

@ -60,13 +60,13 @@ public class Xomw_ttl_utl {
if (cur == src_end) break;
byte b = src[cur];
int b_len = gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(b);
if (b_len == 1) { // ASCII
if (valid[b]) // valid; EX: "a0A B&$"
if (b_len == 1) { // ASCII
if (valid[b & 0xFF]) // valid; EX: "a0A B&$"; PATCH.JAVA:need to convert to unsigned byte
cur++;
else // invalid; EX: "<title>"
else // invalid; EX: "<title>"
break;
}
else { // Multi-byte UTF8; NOTE: all sequences are valid
else { // Multi-byte UTF8; NOTE: all sequences are valid
cur += b_len;
}
}

Loading…
Cancel
Save