1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2024-10-27 20:34:16 +00:00

Mw_parse: Support num_post_proto rule; clarify variable names

This commit is contained in:
gnosygnu 2017-01-28 07:15:35 -05:00
parent 7e27b5415d
commit c77e8a4374
3 changed files with 125 additions and 51 deletions

View File

@ -22,6 +22,59 @@ import gplx.xowa.mws.parsers.*;
public class Xomw_sanitizer {
private final Mwh_doc_wkr__atr_bldr atr_bldr = new Mwh_doc_wkr__atr_bldr();
private final Mwh_atr_parser atr_parser = new Mwh_atr_parser();
// static function cleanUrl($url) {
// // Normalize any HTML entities in input. They will be
// // re-escaped by makeExternalLink().
// $url = Sanitizer::decodeCharReferences($url);
//
// // Escape any control characters introduced by the above step
// $url = preg_replace_callback('/[\][<>"\\x00-\\x20\\x7F\|]/',
// [ __CLASS__, 'cleanUrlCallback' ], $url);
//
// // Validate hostname portion
// $matches = [];
// if (preg_match('!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches)) {
// list(/* $whole */, $protocol, $host, $rest) = $matches;
//
// // Characters that will be ignored in IDNs.
// // https://tools.ietf.org/html/rfc3454#section-3.1
// // Strip them before further processing so blacklists and such work.
// $strip = "/
// \\s| // general whitespace
// \xc2\xad| // 00ad SOFT HYPHEN
// \xe1\xa0\x86| // 1806 MONGOLIAN TODO SOFT HYPHEN
// \xe2\x80\x8b| // 200b ZERO WIDTH SPACE
// \xe2\x81\xa0| // 2060 WORD JOINER
// \xef\xbb\xbf| // feff ZERO WIDTH NO-BREAK SPACE
// \xcd\x8f| // 034f COMBINING GRAPHEME JOINER
// \xe1\xa0\x8b| // 180b MONGOLIAN FREE VARIATION SELECTOR ONE
// \xe1\xa0\x8c| // 180c MONGOLIAN FREE VARIATION SELECTOR TWO
// \xe1\xa0\x8d| // 180d MONGOLIAN FREE VARIATION SELECTOR THREE
// \xe2\x80\x8c| // 200c ZERO WIDTH NON-JOINER
// \xe2\x80\x8d| // 200d ZERO WIDTH JOINER
// [\xef\xb8\x80-\xef\xb8\x8f] // fe00-fe0f VARIATION SELECTOR-1-16
// /xuD";
//
// $host = preg_replace($strip, '', $host);
//
// // IPv6 host names are bracketed with []. Url-decode these.
// if (substr_compare("//%5B", $host, 0, 5) === 0 &&
// preg_match('!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $matches)
// ) {
// $host = '//[' . $matches[1] . ']' . $matches[2];
// }
//
// // @todo FIXME: Validate hostnames here
//
// return $protocol . $host . $rest;
// } else {
// return $url;
// }
// }
//
// static function cleanUrlCallback($matches) {
// return urlencode($matches[0]);
// }
public void Fix_tag_attributes(Bry_bfr bfr, byte[] tag_name, byte[] atrs) {
atr_bldr.Atrs__clear();
atr_parser.Parse(atr_bldr, -1, -1, atrs, 0, atrs.length);
@ -29,11 +82,11 @@ public class Xomw_sanitizer {
// PORTED: Sanitizer.php|safeEncodeTagAttributes
for (int i = 0; i < len; i++) {
// $encAttribute = htmlspecialchars( $attribute );
// $encValue = Sanitizer::safeEncodeAttribute( $value );
// $encAttribute = htmlspecialchars($attribute);
// $encValue = Sanitizer::safeEncodeAttribute($value);
// $attribs[] = "$encAttribute=\"$encValue\"";
Mwh_atr_itm itm = atr_bldr.Atrs__get_at(i);
bfr.Add_byte_space(); // "return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';"
bfr.Add_byte_space(); // "return count($attribs) ? ' ' . implode(' ', $attribs) : '';"
bfr.Add_bry_escape_html(itm.Key_bry(), itm.Key_bgn(), itm.Key_end());
bfr.Add_byte_eq().Add_byte_quote();
bfr.Add(itm.Val_as_bry()); // TODO.XO:Sanitizer::encode
@ -201,7 +254,7 @@ public class Xomw_sanitizer {
int point = Hex_utl_.Parse_or(codepoint, -1);
if (Validate_codepoint(point)) {
bfr.Add_str_a7("&#x");
Hex_utl_.Write_bfr(bfr, Bool_.Y, point); // sprintf( '&#x%x;', $point )
Hex_utl_.Write_bfr(bfr, Bool_.Y, point); // sprintf('&#x%x;', $point)
bfr.Add_byte_semic();
return true;
}

View File

@ -104,32 +104,36 @@ public class Xomw_magiclinks_wkr {
// looks like magiclink; do additional processing
byte regex_tid = ((Byte_obj_val)o).Val();
int old_pos = cur;
int trv_pos = trv.Pos();
int nxt_pos = trv_pos;
int hook_bgn = cur;
int hook_end = trv.Pos();
int tmp_pos = hook_end;
boolean regex_valid = true;
switch (regex_tid) {
case Regex__anch: // (<a[ \t\r\n>].*?</a>) | // m[1]: Skip link text
if (trv_pos < src_end) {
// find ws in "[ \t\r\n>]"
byte ws_byte = src[cur];
if (tmp_pos < src_end) {
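// check that the char after "<a" is one of "[ \t\r\n>]"; i.e., don't match "<ab", "<ac", etc.
// NOTE(review): the switch below only accepts space, tab, CR and NL; ">" from the regex class falls to the default (invalid) branch -- confirm whether "<a>" should be treated as valid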
byte ws_byte = src[tmp_pos];
switch (ws_byte) {
// next char after "<a" is ws -> valid
case Byte_ascii.Space:
case Byte_ascii.Tab:
case Byte_ascii.Cr:
case Byte_ascii.Nl:
break;
// next char after "<a" is not ws -> invalid
default:
regex_valid = false;
break;
}
if (regex_valid) {
// find </a>
nxt_pos++;
int anch_end = Bry_find_.Find_fwd(src, Tag__anch__rhs, nxt_pos, src_end);
tmp_pos++;
int anch_end = Bry_find_.Find_fwd(src, Tag__anch__rhs, tmp_pos, src_end);
// </a> not found -> invalid
if (anch_end == Bry_find_.Not_found) {
regex_valid = false;
}
// </a> found -> valid; set cur to after "</a>"
else {
cur = anch_end + Tag__anch__rhs.length;
}
@ -141,36 +145,51 @@ public class Xomw_magiclinks_wkr {
break;
case Regex__elem: // (<.*?>) | // m[2]: Skip stuff inside
// just find ">"
int elem_end = Bry_find_.Find_fwd(src, Byte_ascii.Angle_end, nxt_pos, src_end);
if (elem_end == Bry_find_.Not_found)
tmp_pos = Bry_find_.Find_fwd(src, Byte_ascii.Angle_end, tmp_pos, src_end);
// > not found -> invalid
if (tmp_pos == Bry_find_.Not_found) {
regex_valid = false;
else
cur = elem_end + 1;
}
// > found -> valid; set cur to after ">"
else {
cur = tmp_pos + 1;
}
break;
case Regex__free:
if (regex_boundary.Is_boundary_prv(src, cur)) {
int url_end = regex_url.Find_fwd_while(trv, src, nxt_pos, src_end);
if (url_end == nxt_pos) {
// make sure that the protocol starts at a word boundary; EX: "ahttp://a.org" should be invalid
if (regex_boundary.Is_boundary_prv(src, hook_bgn)) {
// skip forward until invalid url char
tmp_pos = regex_url.Find_fwd_while(trv, src, tmp_pos, src_end);
// no url chars found -> invalid
if (tmp_pos == hook_end) {
regex_valid = false;
}
else
cur = url_end;
// url chars found -> valid; set cur to the position of the 1st non-url char
else {
cur = tmp_pos;
}
}
else
regex_valid = false;
break;
}
// regex is invalid; advance by 1 and continue;
if (!regex_valid) {
cur++;
}
// regex is valid
else {
// handle free
if (regex_tid == Regex__free) {
this.page_title = pctx.Page_title().Full_db();
dirty = true;
bfr.Add_mid(src, prv, old_pos);
this.Make_free_external_link(bfr, Bry_.Mid(src, old_pos, cur), 0);
bfr.Add_mid(src, prv, hook_bgn);
byte[] url = Bry_.Mid(src, hook_bgn, cur);
int num_post_proto = cur - hook_end; // get length of url without proto; EX: "http://a.org" should be 5 ("a.org")
this.Make_free_external_link(bfr, url, num_post_proto);
prv = cur;
}
// "<a " and "<" just need to be ignored; note that they already update cur so noop
else {
}
}
@ -223,9 +242,10 @@ public class Xomw_magiclinks_wkr {
// Verify that we still have a real URL after trail removal, and
// not just lone protocol
// if (strlen($trail) >= $numPostProto) {
// return $url . $trail;
// }
if (trail.length >= num_post_proto) {
bfr.Add_bry_many(url, trail);
return;
}
// $url = Sanitizer::cleanUrl($url);

View File

@ -19,55 +19,55 @@ package gplx.xowa.mws.parsers.magiclinks; import gplx.*; import gplx.xowa.*; imp
import org.junit.*;
public class Xomw_magiclinks_wkr__tst {
private final Xomw_magiclinks_wkr__fxt fxt = new Xomw_magiclinks_wkr__fxt();
@Test public void Basic() {fxt.Test__parse("a https://b.org c", "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a> c");}
@Test public void Basic() {fxt.Test__parse("a https://b.org z", "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a> z");}
@Test public void Invalid() {fxt.Test__parse("a _https://b.org z", "a _https://b.org z");}
@Test public void Tag__anch() {fxt.Test__parse("a <a title=\"https://b.org\">b</a> z", "a <a title=\"https://b.org\">b</a> z");}
@Test public void Tag__misc() {fxt.Test__parse("a <div title=\"https://b.org\">b</div> z", "a <div title=\"https://b.org\">b</div> z");}
@Test public void Interrupt() {
// ent
fxt.Test__parse("a https://b.org&lt;c" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>&lt;c");
fxt.Test__parse("a https://b.org&lt;z" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>&lt;z");
// hex
fxt.Test__parse("a https://b.org&#x3c;c" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>&#x3c;c");
fxt.Test__parse("a https://b.org&#x3c;z" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>&#x3c;z");
// dec
fxt.Test__parse("a https://b.org&#60;c" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>&#60;c");
fxt.Test__parse("a https://b.org&#60;z" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>&#60;z");
// num_post_proto rule
fxt.Test__parse("a https://&lt; z" , "a https://&lt; z");
}
@Test public void Interrupt__hex_dec() {// implementation specific test for mixed hex / dec
// hex-dec
fxt.Test__parse("a https://b.org&#x60;c" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;#x60;c'>https://b.org&amp;#x60;c</a>");
fxt.Test__parse("a https://b.org&#x60;z" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;#x60;z'>https://b.org&amp;#x60;z</a>");
// dec-hex
fxt.Test__parse("a https://b.org&#3c;c" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;#3c;c'>https://b.org&amp;#3c;c</a>");
fxt.Test__parse("a https://b.org&#3c;z" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;#3c;z'>https://b.org&amp;#3c;z</a>");
}
@Test public void Separator() {
// basic
fxt.Test__parse("a https://b.org.:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>.:!? c");
// basic; ,;.:!?
fxt.Test__parse("a https://b.org,;.:!? z" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>,;.:!? z");
// ")" excluded
fxt.Test__parse("a https://b.org).:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>).:!? c");
fxt.Test__parse("a https://b.org).:!? z" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>).:!? z");
// ")" included b/c "(" exists
fxt.Test__parse("a https://b.org().:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org()'>https://b.org()</a>.:!? c");
fxt.Test__parse("a https://b.org().:!? z" , "a <a class='external free' rel='nofollow' href='https://b.org()'>https://b.org()</a>.:!? z");
// ";" excluded
fxt.Test__parse("a https://b.org;.:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>;.:!? c");
fxt.Test__parse("a https://b.org;.:!? z" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>;.:!? z");
// ";" included b/c of ent
fxt.Test__parse("a https://b.org&abc;.:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;abc;'>https://b.org&amp;abc;</a>.:!? c");
fxt.Test__parse("a https://b.org&abc;.:!? z" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;abc;'>https://b.org&amp;abc;</a>.:!? z");
// ";" included b/c of hex
fxt.Test__parse("a https://b.org&#xB1;.:!? c", "a <a class='external free' rel='nofollow' href='https://b.org&amp;#xB1;'>https://b.org&amp;#xB1;</a>.:!? c");
fxt.Test__parse("a https://b.org&#xB1;.:!? z", "a <a class='external free' rel='nofollow' href='https://b.org&amp;#xB1;'>https://b.org&amp;#xB1;</a>.:!? z");
// ";" included b/c of dec
fxt.Test__parse("a https://b.org&#123;.:!? c", "a <a class='external free' rel='nofollow' href='https://b.org&amp;#123;'>https://b.org&amp;#123;</a>.:!? c");
fxt.Test__parse("a https://b.org&#123;.:!? z", "a <a class='external free' rel='nofollow' href='https://b.org&amp;#123;'>https://b.org&amp;#123;</a>.:!? z");
// ";" excluded b/c of invalid.ent
fxt.Test__parse("a https://b.org&a1b;.:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;a1b'>https://b.org&amp;a1b</a>;.:!? c");
fxt.Test__parse("a https://b.org&a1b;.:!? z" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;a1b'>https://b.org&amp;a1b</a>;.:!? z");
// ";" excluded b/c of invalid.hex
fxt.Test__parse("a https://b.org&#x;.:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;#x'>https://b.org&amp;#x</a>;.:!? c");
fxt.Test__parse("a https://b.org&#x;.:!? z" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;#x'>https://b.org&amp;#x</a>;.:!? z");
// ";" excluded b/c of invalid.dec
fxt.Test__parse("a https://b.org&#a;.:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;#a'>https://b.org&amp;#a</a>;.:!? c");
fxt.Test__parse("a https://b.org&#a;.:!? z" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;#a'>https://b.org&amp;#a</a>;.:!? z");
// num_post_proto rule
fxt.Test__parse("a https://.:!? z" , "a https://.:!? z");
}
/*
TESTS: regex
"<a https://a.org>"
"<img https://a.org>"
*/
@Test public void Invalid() {fxt.Test__parse("a _https://b.org c", "a _https://b.org c");}
}
class Xomw_magiclinks_wkr__fxt {
private final Xomw_magiclinks_wkr wkr = new Xomw_magiclinks_wkr();
private final Xomw_parser_ctx pctx = new Xomw_parser_ctx();
private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr();
private boolean apos = true;
public Xomw_magiclinks_wkr__fxt() {
Xoae_app app = Xoa_app_fxt.Make__app__edit();
Xowe_wiki wiki = Xoa_app_fxt.Make__wiki__edit(app);
@ -76,7 +76,8 @@ class Xomw_magiclinks_wkr__fxt {
pctx.Init_by_page(wiki.Ttl_parse(Bry_.new_a7("Page_1")));
wkr.Init_by_wiki(new Xomw_linker(), new Xomw_regex_boundary(regex_space), new Xomw_regex_url(regex_space));
}
public void Test__parse(String src_str, String expd) {
public void Test__parse(String src_str, String expd) {Test__parse(Bool_.Y, src_str, expd);}
public void Test__parse(boolean apos, String src_str, String expd) {
byte[] src_bry = Bry_.new_u8(src_str);
pbfr.Init(src_bry);
wkr.Do_magic_links(pctx, pbfr);