From c77e8a4374457331d85259d4441f2923aeb54c23 Mon Sep 17 00:00:00 2001 From: gnosygnu Date: Sat, 28 Jan 2017 07:15:35 -0500 Subject: [PATCH] Mw_parse: Support num_post_proto rule; clarify variable names --- .../src/gplx/xowa/mws/Xomw_sanitizer.java | 61 ++++++++++++++++-- .../magiclinks/Xomw_magiclinks_wkr.java | 64 ++++++++++++------- .../magiclinks/Xomw_magiclinks_wkr__tst.java | 51 +++++++-------- 3 files changed, 125 insertions(+), 51 deletions(-) diff --git a/400_xowa/src/gplx/xowa/mws/Xomw_sanitizer.java b/400_xowa/src/gplx/xowa/mws/Xomw_sanitizer.java index 470cd0ec6..fc111d4b1 100644 --- a/400_xowa/src/gplx/xowa/mws/Xomw_sanitizer.java +++ b/400_xowa/src/gplx/xowa/mws/Xomw_sanitizer.java @@ -22,6 +22,59 @@ import gplx.xowa.mws.parsers.*; public class Xomw_sanitizer { private final Mwh_doc_wkr__atr_bldr atr_bldr = new Mwh_doc_wkr__atr_bldr(); private final Mwh_atr_parser atr_parser = new Mwh_atr_parser(); +// static function cleanUrl($url) { +// // Normalize any HTML entities in input. They will be +// // re-escaped by makeExternalLink(). +// $url = Sanitizer::decodeCharReferences($url); +// +// // Escape any control characters introduced by the above step +// $url = preg_replace_callback('/[\][<>"\\x00-\\x20\\x7F\|]/', +// [ __CLASS__, 'cleanUrlCallback' ], $url); +// +// // Validate hostname portion +// $matches = []; +// if (preg_match('!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches)) { +// list(/* $whole */, $protocol, $host, $rest) = $matches; +// +// // Characters that will be ignored in IDNs. +// // https://tools.ietf.org/html/rfc3454#section-3.1 +// // Strip them before further processing so blacklists and such work. +// $strip = "/ +// \\s| // general whitespace +// \xc2\xad| // 00ad SOFT HYPHEN +// \xe1\xa0\x86| // 1806 MONGOLIAN TODO SOFT HYPHEN +// \xe2\x80\x8b| // 200b ZERO WIDTH SPACE +// \xe2\x81\xa0| // 2060 WORD JOINER +// \xef\xbb\xbf| // feff ZERO WIDTH NO-BREAK SPACE +// \xcd\x8f| // 034f COMBINING GRAPHEME JOINER +// \xe1\xa0\x8b| // 180b MONGOLIAN FREE VARIATION SELECTOR ONE +// \xe1\xa0\x8c| // 180c MONGOLIAN FREE VARIATION SELECTOR TWO +// \xe1\xa0\x8d| // 180d MONGOLIAN FREE VARIATION SELECTOR THREE +// \xe2\x80\x8c| // 200c ZERO WIDTH NON-JOINER +// \xe2\x80\x8d| // 200d ZERO WIDTH JOINER +// [\xef\xb8\x80-\xef\xb8\x8f] // fe00-fe0f VARIATION SELECTOR-1-16 +// /xuD"; +// +// $host = preg_replace($strip, '', $host); +// +// // IPv6 host names are bracketed with []. Url-decode these. +// if (substr_compare("//%5B", $host, 0, 5) === 0 && +// preg_match('!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $matches) +// ) { +// $host = '//[' . $matches[1] . ']' . $matches[2]; +// } +// +// // @todo FIXME: Validate hostnames here +// +// return $protocol . $host . $rest; +// } else { +// return $url; +// } +// } +// +// static function cleanUrlCallback($matches) { +// return urlencode($matches[0]); +// } public void Fix_tag_attributes(Bry_bfr bfr, byte[] tag_name, byte[] atrs) { atr_bldr.Atrs__clear(); atr_parser.Parse(atr_bldr, -1, -1, atrs, 0, atrs.length); @@ -29,11 +82,11 @@ public class Xomw_sanitizer { // PORTED: Sanitizer.php|safeEncodeTagAttributes for (int i = 0; i < len; i++) { - // $encAttribute = htmlspecialchars( $attribute ); - // $encValue = Sanitizer::safeEncodeAttribute( $value ); + // $encAttribute = htmlspecialchars($attribute); + // $encValue = Sanitizer::safeEncodeAttribute($value); // $attribs[] = "$encAttribute=\"$encValue\""; Mwh_atr_itm itm = atr_bldr.Atrs__get_at(i); - bfr.Add_byte_space(); // "return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';" + bfr.Add_byte_space(); // "return count($attribs) ? ' ' . implode(' ', $attribs) : '';" bfr.Add_bry_escape_html(itm.Key_bry(), itm.Key_bgn(), itm.Key_end()); bfr.Add_byte_eq().Add_byte_quote(); bfr.Add(itm.Val_as_bry()); // TODO.XO:Sanitizer::encode @@ -201,7 +254,7 @@ public class Xomw_sanitizer { int point = Hex_utl_.Parse_or(codepoint, -1); if (Validate_codepoint(point)) { bfr.Add_str_a7("&#x"); - Hex_utl_.Write_bfr(bfr, Bool_.Y, point); // sprintf( '&#x%x;', $point ) + Hex_utl_.Write_bfr(bfr, Bool_.Y, point); // sprintf('&#x%x;', $point) bfr.Add_byte_semic(); return true; } diff --git a/400_xowa/src/gplx/xowa/mws/parsers/magiclinks/Xomw_magiclinks_wkr.java b/400_xowa/src/gplx/xowa/mws/parsers/magiclinks/Xomw_magiclinks_wkr.java index 63191243e..1b81fc0c1 100644 --- a/400_xowa/src/gplx/xowa/mws/parsers/magiclinks/Xomw_magiclinks_wkr.java +++ b/400_xowa/src/gplx/xowa/mws/parsers/magiclinks/Xomw_magiclinks_wkr.java @@ -104,32 +104,36 @@ public class Xomw_magiclinks_wkr { // looks like magiclink; do additional processing byte regex_tid = ((Byte_obj_val)o).Val(); - int old_pos = cur; - int trv_pos = trv.Pos(); - int nxt_pos = trv_pos; + int hook_bgn = cur; + int hook_end = trv.Pos(); + int tmp_pos = hook_end; boolean regex_valid = true; switch (regex_tid) { case Regex__anch: // (].*?) | // m[1]: Skip link text - if (trv_pos < src_end) { - // find ws in "[ \t\r\n>]" - byte ws_byte = src[cur]; + if (tmp_pos < src_end) { + // find "[ \t\r\n>]" after " valid case Byte_ascii.Space: case Byte_ascii.Tab: case Byte_ascii.Cr: case Byte_ascii.Nl: break; + // next char after " invalid default: regex_valid = false; break; } if (regex_valid) { // find - nxt_pos++; - int anch_end = Bry_find_.Find_fwd(src, Tag__anch__rhs, nxt_pos, src_end); + tmp_pos++; + int anch_end = Bry_find_.Find_fwd(src, Tag__anch__rhs, tmp_pos, src_end); + // not found -> invalid if (anch_end == Bry_find_.Not_found) { regex_valid = false; } + // found -> valid; set cur to after "" else { cur = anch_end + Tag__anch__rhs.length; } @@ -141,36 +145,51 @@ public class Xomw_magiclinks_wkr { break; case Regex__elem: // (<.*?>) | // m[2]: Skip stuff inside // just find ">" - int elem_end = Bry_find_.Find_fwd(src, Byte_ascii.Angle_end, nxt_pos, src_end); - if (elem_end == Bry_find_.Not_found) + tmp_pos = Bry_find_.Find_fwd(src, Byte_ascii.Angle_end, tmp_pos, src_end); + // > not found -> invalid + if (tmp_pos == Bry_find_.Not_found) { regex_valid = false; - else - cur = elem_end + 1; + } + // > found -> valid; set cur to after ">" + else { + cur = tmp_pos + 1; + } break; case Regex__free: - if (regex_boundary.Is_boundary_prv(src, cur)) { - int url_end = regex_url.Find_fwd_while(trv, src, nxt_pos, src_end); - if (url_end == nxt_pos) { + // make sure that protocol starts at word bound; EX: "ahttp://a.org" should be invalid + if (regex_boundary.Is_boundary_prv(src, hook_bgn)) { + // skip forward until invalid url char + tmp_pos = regex_url.Find_fwd_while(trv, src, tmp_pos, src_end); + // no url chars found -> invalid + if (tmp_pos == hook_end) { regex_valid = false; } - else - cur = url_end; + // url chars found -> valid; set cur to 1st invalid url-char; + else { + cur = tmp_pos; + } } else regex_valid = false; break; } + // regex is invalid; advance by 1 and continue; if (!regex_valid) { cur++; } + // regex is valid else { + // handle free if (regex_tid == Regex__free) { this.page_title = pctx.Page_title().Full_db(); dirty = true; - bfr.Add_mid(src, prv, old_pos); - this.Make_free_external_link(bfr, Bry_.Mid(src, old_pos, cur), 0); + bfr.Add_mid(src, prv, hook_bgn); + byte[] url = Bry_.Mid(src, hook_bgn, cur); + int num_post_proto = cur - hook_end; // get length of url without proto; EX: "http://a.org" should be 5 ("a.org") + this.Make_free_external_link(bfr, url, num_post_proto); prv = cur; } + // "= $numPostProto) { -// return $url . $trail; -// } + if (trail.length >= num_post_proto) { + bfr.Add_bry_many(url, trail); + return; + } // $url = Sanitizer::cleanUrl($url); diff --git a/400_xowa/src/gplx/xowa/mws/parsers/magiclinks/Xomw_magiclinks_wkr__tst.java b/400_xowa/src/gplx/xowa/mws/parsers/magiclinks/Xomw_magiclinks_wkr__tst.java index f12ac9974..722628a55 100644 --- a/400_xowa/src/gplx/xowa/mws/parsers/magiclinks/Xomw_magiclinks_wkr__tst.java +++ b/400_xowa/src/gplx/xowa/mws/parsers/magiclinks/Xomw_magiclinks_wkr__tst.java @@ -19,55 +19,55 @@ package gplx.xowa.mws.parsers.magiclinks; import gplx.*; import gplx.xowa.*; imp import org.junit.*; public class Xomw_magiclinks_wkr__tst { private final Xomw_magiclinks_wkr__fxt fxt = new Xomw_magiclinks_wkr__fxt(); - @Test public void Basic() {fxt.Test__parse("a https://b.org c", "a https://b.org c");} + @Test public void Basic() {fxt.Test__parse("a https://b.org z", "a https://b.org z");} + @Test public void Invalid() {fxt.Test__parse("a _https://b.org z", "a _https://b.org z");} + @Test public void Tag__anch() {fxt.Test__parse("a b z", "a b z");} + @Test public void Tag__misc() {fxt.Test__parse("a
b
z", "a
b
z");} @Test public void Interrupt() { // ent - fxt.Test__parse("a https://b.org<c" , "a https://b.org<c"); + fxt.Test__parse("a https://b.org<z" , "a https://b.org<z"); // hex - fxt.Test__parse("a https://b.org<c" , "a https://b.org<c"); + fxt.Test__parse("a https://b.org<z" , "a https://b.org<z"); // dec - fxt.Test__parse("a https://b.org<c" , "a https://b.org<c"); + fxt.Test__parse("a https://b.org<z" , "a https://b.org<z"); + // num_post_proto rule + fxt.Test__parse("a https://< z" , "a https://< z"); } @Test public void Interrupt__hex_dec() {// implementation specific test for mixed hex / dec // hex-dec - fxt.Test__parse("a https://b.org`c" , "a https://b.org&#x60;c"); + fxt.Test__parse("a https://b.org`z" , "a https://b.org&#x60;z"); // dec-hex - fxt.Test__parse("a https://b.orgc;c" , "a https://b.org&#3c;c"); + fxt.Test__parse("a https://b.orgc;z" , "a https://b.org&#3c;z"); } @Test public void Separator() { - // basic - fxt.Test__parse("a https://b.org.:!? c" , "a https://b.org.:!? c"); + // basic; ,;.:!? + fxt.Test__parse("a https://b.org,;.:!? z" , "a https://b.org,;.:!? z"); // ")" excluded - fxt.Test__parse("a https://b.org).:!? c" , "a https://b.org).:!? c"); + fxt.Test__parse("a https://b.org).:!? z" , "a https://b.org).:!? z"); // ")" included b/c "(" exists - fxt.Test__parse("a https://b.org().:!? c" , "a https://b.org().:!? c"); + fxt.Test__parse("a https://b.org().:!? z" , "a https://b.org().:!? z"); // ";" excluded - fxt.Test__parse("a https://b.org;.:!? c" , "a https://b.org;.:!? c"); + fxt.Test__parse("a https://b.org;.:!? z" , "a https://b.org;.:!? z"); // ";" included b/c of ent - fxt.Test__parse("a https://b.org&abc;.:!? c" , "a https://b.org&abc;.:!? c"); + fxt.Test__parse("a https://b.org&abc;.:!? z" , "a https://b.org&abc;.:!? z"); // ";" included b/c of hex - fxt.Test__parse("a https://b.org±.:!? c", "a https://b.org&#xB1;.:!? c"); + fxt.Test__parse("a https://b.org±.:!? z", "a https://b.org&#xB1;.:!? z"); // ";" included b/c of dec - fxt.Test__parse("a https://b.org{.:!? c", "a https://b.org&#123;.:!? c"); + fxt.Test__parse("a https://b.org{.:!? z", "a https://b.org&#123;.:!? z"); // ";" excluded b/c of invalid.ent - fxt.Test__parse("a https://b.org&a1b;.:!? c" , "a https://b.org&a1b;.:!? c"); + fxt.Test__parse("a https://b.org&a1b;.:!? z" , "a https://b.org&a1b;.:!? z"); // ";" excluded b/c of invalid.hex - fxt.Test__parse("a https://b.org&#x;.:!? c" , "a https://b.org&#x;.:!? c"); + fxt.Test__parse("a https://b.org&#x;.:!? z" , "a https://b.org&#x;.:!? z"); // ";" excluded b/c of invalid.dec - fxt.Test__parse("a https://b.org&#a;.:!? c" , "a https://b.org&#a;.:!? c"); + fxt.Test__parse("a https://b.org&#a;.:!? z" , "a https://b.org&#a;.:!? z"); + // num_post_proto rule + fxt.Test__parse("a https://.:!? z" , "a https://.:!? z"); } -/* -TESTS: regex -"" -"" -*/ - @Test public void Invalid() {fxt.Test__parse("a _https://b.org c", "a _https://b.org c");} } class Xomw_magiclinks_wkr__fxt { private final Xomw_magiclinks_wkr wkr = new Xomw_magiclinks_wkr(); private final Xomw_parser_ctx pctx = new Xomw_parser_ctx(); private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr(); - private boolean apos = true; public Xomw_magiclinks_wkr__fxt() { Xoae_app app = Xoa_app_fxt.Make__app__edit(); Xowe_wiki wiki = Xoa_app_fxt.Make__wiki__edit(app); @@ -76,7 +76,8 @@ class Xomw_magiclinks_wkr__fxt { pctx.Init_by_page(wiki.Ttl_parse(Bry_.new_a7("Page_1"))); wkr.Init_by_wiki(new Xomw_linker(), new Xomw_regex_boundary(regex_space), new Xomw_regex_url(regex_space)); } - public void Test__parse(String src_str, String expd) { + public void Test__parse(String src_str, String expd) {Test__parse(Bool_.Y, src_str, expd);} + public void Test__parse(boolean apos, String src_str, String expd) { byte[] src_bry = Bry_.new_u8(src_str); pbfr.Init(src_bry); wkr.Do_magic_links(pctx, pbfr);