mirror of
https://github.com/gnosygnu/xowa.git
synced 2024-10-27 20:34:16 +00:00
Mw_parse: Support num_post_proto rule; clarify variable names
This commit is contained in:
parent
7e27b5415d
commit
c77e8a4374
@ -22,6 +22,59 @@ import gplx.xowa.mws.parsers.*;
|
||||
public class Xomw_sanitizer {
|
||||
private final Mwh_doc_wkr__atr_bldr atr_bldr = new Mwh_doc_wkr__atr_bldr();
|
||||
private final Mwh_atr_parser atr_parser = new Mwh_atr_parser();
|
||||
// static function cleanUrl($url) {
|
||||
// // Normalize any HTML entities in input. They will be
|
||||
// // re-escaped by makeExternalLink().
|
||||
// $url = Sanitizer::decodeCharReferences($url);
|
||||
//
|
||||
// // Escape any control characters introduced by the above step
|
||||
// $url = preg_replace_callback('/[\][<>"\\x00-\\x20\\x7F\|]/',
|
||||
// [ __CLASS__, 'cleanUrlCallback' ], $url);
|
||||
//
|
||||
// // Validate hostname portion
|
||||
// $matches = [];
|
||||
// if (preg_match('!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches)) {
|
||||
// list(/* $whole */, $protocol, $host, $rest) = $matches;
|
||||
//
|
||||
// // Characters that will be ignored in IDNs.
|
||||
// // https://tools.ietf.org/html/rfc3454#section-3.1
|
||||
// // Strip them before further processing so blacklists and such work.
|
||||
// $strip = "/
|
||||
// \\s| // general whitespace
|
||||
// \xc2\xad| // 00ad SOFT HYPHEN
|
||||
// \xe1\xa0\x86| // 1806 MONGOLIAN TODO SOFT HYPHEN
|
||||
// \xe2\x80\x8b| // 200b ZERO WIDTH SPACE
|
||||
// \xe2\x81\xa0| // 2060 WORD JOINER
|
||||
// \xef\xbb\xbf| // feff ZERO WIDTH NO-BREAK SPACE
|
||||
// \xcd\x8f| // 034f COMBINING GRAPHEME JOINER
|
||||
// \xe1\xa0\x8b| // 180b MONGOLIAN FREE VARIATION SELECTOR ONE
|
||||
// \xe1\xa0\x8c| // 180c MONGOLIAN FREE VARIATION SELECTOR TWO
|
||||
// \xe1\xa0\x8d| // 180d MONGOLIAN FREE VARIATION SELECTOR THREE
|
||||
// \xe2\x80\x8c| // 200c ZERO WIDTH NON-JOINER
|
||||
// \xe2\x80\x8d| // 200d ZERO WIDTH JOINER
|
||||
// [\xef\xb8\x80-\xef\xb8\x8f] // fe00-fe0f VARIATION SELECTOR-1-16
|
||||
// /xuD";
|
||||
//
|
||||
// $host = preg_replace($strip, '', $host);
|
||||
//
|
||||
// // IPv6 host names are bracketed with []. Url-decode these.
|
||||
// if (substr_compare("//%5B", $host, 0, 5) === 0 &&
|
||||
// preg_match('!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $matches)
|
||||
// ) {
|
||||
// $host = '//[' . $matches[1] . ']' . $matches[2];
|
||||
// }
|
||||
//
|
||||
// // @todo FIXME: Validate hostnames here
|
||||
//
|
||||
// return $protocol . $host . $rest;
|
||||
// } else {
|
||||
// return $url;
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// static function cleanUrlCallback($matches) {
|
||||
// return urlencode($matches[0]);
|
||||
// }
|
||||
public void Fix_tag_attributes(Bry_bfr bfr, byte[] tag_name, byte[] atrs) {
|
||||
atr_bldr.Atrs__clear();
|
||||
atr_parser.Parse(atr_bldr, -1, -1, atrs, 0, atrs.length);
|
||||
@ -29,11 +82,11 @@ public class Xomw_sanitizer {
|
||||
|
||||
// PORTED: Sanitizer.php|safeEncodeTagAttributes
|
||||
for (int i = 0; i < len; i++) {
|
||||
// $encAttribute = htmlspecialchars( $attribute );
|
||||
// $encValue = Sanitizer::safeEncodeAttribute( $value );
|
||||
// $encAttribute = htmlspecialchars($attribute);
|
||||
// $encValue = Sanitizer::safeEncodeAttribute($value);
|
||||
// $attribs[] = "$encAttribute=\"$encValue\"";
|
||||
Mwh_atr_itm itm = atr_bldr.Atrs__get_at(i);
|
||||
bfr.Add_byte_space(); // "return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';"
|
||||
bfr.Add_byte_space(); // "return count($attribs) ? ' ' . implode(' ', $attribs) : '';"
|
||||
bfr.Add_bry_escape_html(itm.Key_bry(), itm.Key_bgn(), itm.Key_end());
|
||||
bfr.Add_byte_eq().Add_byte_quote();
|
||||
bfr.Add(itm.Val_as_bry()); // TODO.XO:Sanitizer::encode
|
||||
@ -201,7 +254,7 @@ public class Xomw_sanitizer {
|
||||
int point = Hex_utl_.Parse_or(codepoint, -1);
|
||||
if (Validate_codepoint(point)) {
|
||||
bfr.Add_str_a7("&#x");
|
||||
Hex_utl_.Write_bfr(bfr, Bool_.Y, point); // sprintf( '&#x%x;', $point )
|
||||
Hex_utl_.Write_bfr(bfr, Bool_.Y, point); // sprintf('&#x%x;', $point)
|
||||
bfr.Add_byte_semic();
|
||||
return true;
|
||||
}
|
||||
|
@ -104,32 +104,36 @@ public class Xomw_magiclinks_wkr {
|
||||
|
||||
// looks like magiclink; do additional processing
|
||||
byte regex_tid = ((Byte_obj_val)o).Val();
|
||||
int old_pos = cur;
|
||||
int trv_pos = trv.Pos();
|
||||
int nxt_pos = trv_pos;
|
||||
int hook_bgn = cur;
|
||||
int hook_end = trv.Pos();
|
||||
int tmp_pos = hook_end;
|
||||
boolean regex_valid = true;
|
||||
switch (regex_tid) {
|
||||
case Regex__anch: // (<a[ \t\r\n>].*?</a>) | // m[1]: Skip link text
|
||||
if (trv_pos < src_end) {
|
||||
// find ws in "[ \t\r\n>]"
|
||||
byte ws_byte = src[cur];
|
||||
if (tmp_pos < src_end) {
|
||||
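// regex expects one of "[ \t\r\n>]" right after "<a"; i.e. don't match "<ab" or "<ac", etc. NOTE(review): the switch below accepts only Space, Tab, Cr and Nl, so "<a>" falls through to default and is treated as invalid -- confirm whether ">" should also be accepted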
|
||||
byte ws_byte = src[tmp_pos];
|
||||
switch (ws_byte) {
|
||||
// next char after "<a" is ws -> valid
|
||||
case Byte_ascii.Space:
|
||||
case Byte_ascii.Tab:
|
||||
case Byte_ascii.Cr:
|
||||
case Byte_ascii.Nl:
|
||||
break;
|
||||
// next char after "<a" is not ws -> invalid
|
||||
default:
|
||||
regex_valid = false;
|
||||
break;
|
||||
}
|
||||
if (regex_valid) {
|
||||
// find </a>
|
||||
nxt_pos++;
|
||||
int anch_end = Bry_find_.Find_fwd(src, Tag__anch__rhs, nxt_pos, src_end);
|
||||
tmp_pos++;
|
||||
int anch_end = Bry_find_.Find_fwd(src, Tag__anch__rhs, tmp_pos, src_end);
|
||||
// </a> not found -> invalid
|
||||
if (anch_end == Bry_find_.Not_found) {
|
||||
regex_valid = false;
|
||||
}
|
||||
// </a> found -> valid; set cur to after "</a>"
|
||||
else {
|
||||
cur = anch_end + Tag__anch__rhs.length;
|
||||
}
|
||||
@ -141,36 +145,51 @@ public class Xomw_magiclinks_wkr {
|
||||
break;
|
||||
case Regex__elem: // (<.*?>) | // m[2]: Skip stuff inside
|
||||
// just find ">"
|
||||
int elem_end = Bry_find_.Find_fwd(src, Byte_ascii.Angle_end, nxt_pos, src_end);
|
||||
if (elem_end == Bry_find_.Not_found)
|
||||
tmp_pos = Bry_find_.Find_fwd(src, Byte_ascii.Angle_end, tmp_pos, src_end);
|
||||
// > not found -> invalid
|
||||
if (tmp_pos == Bry_find_.Not_found) {
|
||||
regex_valid = false;
|
||||
else
|
||||
cur = elem_end + 1;
|
||||
}
|
||||
// > found -> valid; set cur to after ">"
|
||||
else {
|
||||
cur = tmp_pos + 1;
|
||||
}
|
||||
break;
|
||||
case Regex__free:
|
||||
if (regex_boundary.Is_boundary_prv(src, cur)) {
|
||||
int url_end = regex_url.Find_fwd_while(trv, src, nxt_pos, src_end);
|
||||
if (url_end == nxt_pos) {
|
||||
// make sure that protocol starts at word bound; EX: "ahttp://a.org" should be invalid
|
||||
if (regex_boundary.Is_boundary_prv(src, hook_bgn)) {
|
||||
// skip forward until invalid url char
|
||||
tmp_pos = regex_url.Find_fwd_while(trv, src, tmp_pos, src_end);
|
||||
// no url chars found -> invalid
|
||||
if (tmp_pos == hook_end) {
|
||||
regex_valid = false;
|
||||
}
|
||||
else
|
||||
cur = url_end;
|
||||
// url chars found -> valid; set cur to 1st invalid url-char;
|
||||
else {
|
||||
cur = tmp_pos;
|
||||
}
|
||||
}
|
||||
else
|
||||
regex_valid = false;
|
||||
break;
|
||||
}
|
||||
// regex is invalid; advance by 1 and continue;
|
||||
if (!regex_valid) {
|
||||
cur++;
|
||||
}
|
||||
// regex is valid
|
||||
else {
|
||||
// handle free
|
||||
if (regex_tid == Regex__free) {
|
||||
this.page_title = pctx.Page_title().Full_db();
|
||||
dirty = true;
|
||||
bfr.Add_mid(src, prv, old_pos);
|
||||
this.Make_free_external_link(bfr, Bry_.Mid(src, old_pos, cur), 0);
|
||||
bfr.Add_mid(src, prv, hook_bgn);
|
||||
byte[] url = Bry_.Mid(src, hook_bgn, cur);
|
||||
int num_post_proto = cur - hook_end; // get length of url without proto; EX: "http://a.org" should be 5 ("a.org")
|
||||
this.Make_free_external_link(bfr, url, num_post_proto);
|
||||
prv = cur;
|
||||
}
|
||||
// anchor ("<a ...</a>") and element ("<...>") matches only need to be skipped; the switch above already moved cur past them, so this branch is intentionally a no-op
|
||||
else {
|
||||
}
|
||||
}
|
||||
@ -223,9 +242,10 @@ public class Xomw_magiclinks_wkr {
|
||||
|
||||
// Verify that we still have a real URL after trail removal, and
|
||||
// not just lone protocol
|
||||
// if (strlen($trail) >= $numPostProto) {
|
||||
// return $url . $trail;
|
||||
// }
|
||||
if (trail.length >= num_post_proto) {
|
||||
bfr.Add_bry_many(url, trail);
|
||||
return;
|
||||
}
|
||||
|
||||
// $url = Sanitizer::cleanUrl($url);
|
||||
|
||||
|
@ -19,55 +19,55 @@ package gplx.xowa.mws.parsers.magiclinks; import gplx.*; import gplx.xowa.*; imp
|
||||
import org.junit.*;
|
||||
public class Xomw_magiclinks_wkr__tst {
|
||||
private final Xomw_magiclinks_wkr__fxt fxt = new Xomw_magiclinks_wkr__fxt();
|
||||
@Test public void Basic() {fxt.Test__parse("a https://b.org c", "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a> c");}
|
||||
@Test public void Basic() {fxt.Test__parse("a https://b.org z", "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a> z");}
|
||||
@Test public void Invalid() {fxt.Test__parse("a _https://b.org z", "a _https://b.org z");}
|
||||
@Test public void Tag__anch() {fxt.Test__parse("a <a title=\"https://b.org\">b</a> z", "a <a title=\"https://b.org\">b</a> z");}
|
||||
@Test public void Tag__misc() {fxt.Test__parse("a <div title=\"https://b.org\">b</div> z", "a <div title=\"https://b.org\">b</div> z");}
|
||||
@Test public void Interrupt() {
|
||||
// ent
|
||||
fxt.Test__parse("a https://b.org<c" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a><c");
|
||||
fxt.Test__parse("a https://b.org<z" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a><z");
|
||||
// hex
|
||||
fxt.Test__parse("a https://b.org<c" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a><c");
|
||||
fxt.Test__parse("a https://b.org<z" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a><z");
|
||||
// dec
|
||||
fxt.Test__parse("a https://b.org<c" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a><c");
|
||||
fxt.Test__parse("a https://b.org<z" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a><z");
|
||||
// num_post_proto rule
|
||||
fxt.Test__parse("a https://< z" , "a https://< z");
|
||||
}
|
||||
@Test public void Interrupt__hex_dec() {// implementation specific test for mixed hex / dec
|
||||
// hex-dec
|
||||
fxt.Test__parse("a https://b.org`c" , "a <a class='external free' rel='nofollow' href='https://b.org&#x60;c'>https://b.org&#x60;c</a>");
|
||||
fxt.Test__parse("a https://b.org`z" , "a <a class='external free' rel='nofollow' href='https://b.org&#x60;z'>https://b.org&#x60;z</a>");
|
||||
// dec-hex
|
||||
fxt.Test__parse("a https://b.orgc;c" , "a <a class='external free' rel='nofollow' href='https://b.org&#3c;c'>https://b.org&#3c;c</a>");
|
||||
fxt.Test__parse("a https://b.orgc;z" , "a <a class='external free' rel='nofollow' href='https://b.org&#3c;z'>https://b.org&#3c;z</a>");
|
||||
}
|
||||
@Test public void Separator() {
|
||||
// basic
|
||||
fxt.Test__parse("a https://b.org.:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>.:!? c");
|
||||
// basic; ,;.:!?
|
||||
fxt.Test__parse("a https://b.org,;.:!? z" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>,;.:!? z");
|
||||
// ")" excluded
|
||||
fxt.Test__parse("a https://b.org).:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>).:!? c");
|
||||
fxt.Test__parse("a https://b.org).:!? z" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>).:!? z");
|
||||
// ")" included b/c "(" exists
|
||||
fxt.Test__parse("a https://b.org().:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org()'>https://b.org()</a>.:!? c");
|
||||
fxt.Test__parse("a https://b.org().:!? z" , "a <a class='external free' rel='nofollow' href='https://b.org()'>https://b.org()</a>.:!? z");
|
||||
// ";" excluded
|
||||
fxt.Test__parse("a https://b.org;.:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>;.:!? c");
|
||||
fxt.Test__parse("a https://b.org;.:!? z" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>;.:!? z");
|
||||
// ";" included b/c of ent
|
||||
fxt.Test__parse("a https://b.org&abc;.:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org&abc;'>https://b.org&abc;</a>.:!? c");
|
||||
fxt.Test__parse("a https://b.org&abc;.:!? z" , "a <a class='external free' rel='nofollow' href='https://b.org&abc;'>https://b.org&abc;</a>.:!? z");
|
||||
// ";" included b/c of hex
|
||||
fxt.Test__parse("a https://b.org±.:!? c", "a <a class='external free' rel='nofollow' href='https://b.org&#xB1;'>https://b.org&#xB1;</a>.:!? c");
|
||||
fxt.Test__parse("a https://b.org±.:!? z", "a <a class='external free' rel='nofollow' href='https://b.org&#xB1;'>https://b.org&#xB1;</a>.:!? z");
|
||||
// ";" included b/c of dec
|
||||
fxt.Test__parse("a https://b.org{.:!? c", "a <a class='external free' rel='nofollow' href='https://b.org&#123;'>https://b.org&#123;</a>.:!? c");
|
||||
fxt.Test__parse("a https://b.org{.:!? z", "a <a class='external free' rel='nofollow' href='https://b.org&#123;'>https://b.org&#123;</a>.:!? z");
|
||||
// ";" excluded b/c of invalid.ent
|
||||
fxt.Test__parse("a https://b.org&a1b;.:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org&a1b'>https://b.org&a1b</a>;.:!? c");
|
||||
fxt.Test__parse("a https://b.org&a1b;.:!? z" , "a <a class='external free' rel='nofollow' href='https://b.org&a1b'>https://b.org&a1b</a>;.:!? z");
|
||||
// ";" excluded b/c of invalid.hex
|
||||
fxt.Test__parse("a https://b.org&#x;.:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org&#x'>https://b.org&#x</a>;.:!? c");
|
||||
fxt.Test__parse("a https://b.org&#x;.:!? z" , "a <a class='external free' rel='nofollow' href='https://b.org&#x'>https://b.org&#x</a>;.:!? z");
|
||||
// ";" excluded b/c of invalid.dec
|
||||
fxt.Test__parse("a https://b.org&#a;.:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org&#a'>https://b.org&#a</a>;.:!? c");
|
||||
fxt.Test__parse("a https://b.org&#a;.:!? z" , "a <a class='external free' rel='nofollow' href='https://b.org&#a'>https://b.org&#a</a>;.:!? z");
|
||||
// num_post_proto rule
|
||||
fxt.Test__parse("a https://.:!? z" , "a https://.:!? z");
|
||||
}
|
||||
/*
|
||||
TESTS: regex
|
||||
"<a https://a.org>"
|
||||
"<img https://a.org>"
|
||||
*/
|
||||
@Test public void Invalid() {fxt.Test__parse("a _https://b.org c", "a _https://b.org c");}
|
||||
}
|
||||
class Xomw_magiclinks_wkr__fxt {
|
||||
private final Xomw_magiclinks_wkr wkr = new Xomw_magiclinks_wkr();
|
||||
private final Xomw_parser_ctx pctx = new Xomw_parser_ctx();
|
||||
private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr();
|
||||
private boolean apos = true;
|
||||
public Xomw_magiclinks_wkr__fxt() {
|
||||
Xoae_app app = Xoa_app_fxt.Make__app__edit();
|
||||
Xowe_wiki wiki = Xoa_app_fxt.Make__wiki__edit(app);
|
||||
@ -76,7 +76,8 @@ class Xomw_magiclinks_wkr__fxt {
|
||||
pctx.Init_by_page(wiki.Ttl_parse(Bry_.new_a7("Page_1")));
|
||||
wkr.Init_by_wiki(new Xomw_linker(), new Xomw_regex_boundary(regex_space), new Xomw_regex_url(regex_space));
|
||||
}
|
||||
public void Test__parse(String src_str, String expd) {
|
||||
public void Test__parse(String src_str, String expd) {Test__parse(Bool_.Y, src_str, expd);}
|
||||
public void Test__parse(boolean apos, String src_str, String expd) {
|
||||
byte[] src_bry = Bry_.new_u8(src_str);
|
||||
pbfr.Init(src_bry);
|
||||
wkr.Do_magic_links(pctx, pbfr);
|
||||
|
Loading…
Reference in New Issue
Block a user