Mw_parse: Support num_post_proto rule; clarify variable names

2025-06-13 12:54:14 +00:00 · 2017-01-28 07:15:35 -05:00 · 2017-01-28 07:15:35 -05:00 · c77e8a4374
commit c77e8a4374
parent 7e27b5415d
3 changed files with 125 additions and 51 deletions
--- a/400_xowa/src/gplx/xowa/mws/Xomw_sanitizer.java
+++ b/400_xowa/src/gplx/xowa/mws/Xomw_sanitizer.java
@ -22,6 +22,59 @@ import gplx.xowa.mws.parsers.*;
 public class Xomw_sanitizer {
 	private final    Mwh_doc_wkr__atr_bldr atr_bldr = new Mwh_doc_wkr__atr_bldr();
 	private final    Mwh_atr_parser atr_parser = new Mwh_atr_parser();
+//		static function cleanUrl($url) {
+//			// Normalize any HTML entities in input. They will be
+//			// re-escaped by makeExternalLink().
+//			$url = Sanitizer::decodeCharReferences($url);
+//
+//			// Escape any control characters introduced by the above step
+//			$url = preg_replace_callback('/[\][<>"\\x00-\\x20\\x7F\|]/',
+//				[ __CLASS__, 'cleanUrlCallback' ], $url);
+//
+//			// Validate hostname portion
+//			$matches = [];
+//			if (preg_match('!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches)) {
+//				list(/* $whole */, $protocol, $host, $rest) = $matches;
+//
+//				// Characters that will be ignored in IDNs.
+//				// https://tools.ietf.org/html/rfc3454#section-3.1
+//				// Strip them before further processing so blacklists and such work.
+//				$strip = "/
+//					\\s|          // general whitespace
+//					\xc2\xad|     // 00ad SOFT HYPHEN
+//					\xe1\xa0\x86| // 1806 MONGOLIAN TODO SOFT HYPHEN
+//					\xe2\x80\x8b| // 200b ZERO WIDTH SPACE
+//					\xe2\x81\xa0| // 2060 WORD JOINER
+//					\xef\xbb\xbf| // feff ZERO WIDTH NO-BREAK SPACE
+//					\xcd\x8f|     // 034f COMBINING GRAPHEME JOINER
+//					\xe1\xa0\x8b| // 180b MONGOLIAN FREE VARIATION SELECTOR ONE
+//					\xe1\xa0\x8c| // 180c MONGOLIAN FREE VARIATION SELECTOR TWO
+//					\xe1\xa0\x8d| // 180d MONGOLIAN FREE VARIATION SELECTOR THREE
+//					\xe2\x80\x8c| // 200c ZERO WIDTH NON-JOINER
+//					\xe2\x80\x8d| // 200d ZERO WIDTH JOINER
+//					[\xef\xb8\x80-\xef\xb8\x8f] // fe00-fe0f VARIATION SELECTOR-1-16
+//					/xuD";
+//
+//				$host = preg_replace($strip, '', $host);
+//
+//				// IPv6 host names are bracketed with [].  Url-decode these.
+//				if (substr_compare("//%5B", $host, 0, 5) === 0 &&
+//					preg_match('!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $matches)
+//				) {
+//					$host = '//[' . $matches[1] . ']' . $matches[2];
+//				}
+//
+//				// @todo FIXME: Validate hostnames here
+//
+//				return $protocol . $host . $rest;
+//			} else {
+//				return $url;
+//			}
+//		}
+//
+//		static function cleanUrlCallback($matches) {
+//			return urlencode($matches[0]);
+//		}
 	public void Fix_tag_attributes(Bry_bfr bfr, byte[] tag_name, byte[] atrs) {
 		atr_bldr.Atrs__clear();
 		atr_parser.Parse(atr_bldr, -1, -1, atrs, 0, atrs.length);
@ -29,11 +82,11 @@ public class Xomw_sanitizer {

 		// PORTED: Sanitizer.php|safeEncodeTagAttributes
 		for (int i = 0; i < len; i++) {
-			// $encAttribute = htmlspecialchars( $attribute );
-			// $encValue = Sanitizer::safeEncodeAttribute( $value );
+			// $encAttribute = htmlspecialchars($attribute);
+			// $encValue = Sanitizer::safeEncodeAttribute($value);
 			// $attribs[] = "$encAttribute=\"$encValue\"";
 			Mwh_atr_itm itm = atr_bldr.Atrs__get_at(i);
-			bfr.Add_byte_space();	// "return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';"
+			bfr.Add_byte_space();	// "return count($attribs) ? ' ' . implode(' ', $attribs) : '';"
 			bfr.Add_bry_escape_html(itm.Key_bry(), itm.Key_bgn(), itm.Key_end());
 			bfr.Add_byte_eq().Add_byte_quote();
 			bfr.Add(itm.Val_as_bry());	// TODO.XO:Sanitizer::encode
@ -201,7 +254,7 @@ public class Xomw_sanitizer {
 		int point = Hex_utl_.Parse_or(codepoint, -1);
 		if (Validate_codepoint(point)) {
 			bfr.Add_str_a7("&#x");
-			Hex_utl_.Write_bfr(bfr, Bool_.Y, point);	// sprintf( '&#x%x;', $point )
+			Hex_utl_.Write_bfr(bfr, Bool_.Y, point);	// sprintf('&#x%x;', $point)
 			bfr.Add_byte_semic();
 			return true;
 		}
--- a/400_xowa/src/gplx/xowa/mws/parsers/magiclinks/Xomw_magiclinks_wkr.java
+++ b/400_xowa/src/gplx/xowa/mws/parsers/magiclinks/Xomw_magiclinks_wkr.java
@ -104,32 +104,36 @@ public class Xomw_magiclinks_wkr {

 			// looks like magiclink; do additional processing
 			byte regex_tid = ((Byte_obj_val)o).Val();
-			int old_pos = cur;
-			int trv_pos = trv.Pos();
-			int nxt_pos = trv_pos;
+			int hook_bgn = cur;
+			int hook_end = trv.Pos();
+			int tmp_pos = hook_end;
 			boolean regex_valid = true;
 			switch (regex_tid) {
 				case Regex__anch:	// (<a[ \t\r\n>].*?</a>) |      // m[1]: Skip link text
-					if (trv_pos < src_end) {
-						// find ws in "[ \t\r\n>]"
-						byte ws_byte = src[cur];
+					if (tmp_pos < src_end) {
+						// find "[ \t\r\n>]" after "<a"; i.e., don't match "<ab" or "<ac", etc.
+						byte ws_byte = src[tmp_pos];
 						switch (ws_byte) {
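+							// next char after "<a" is ws -> valid; NOTE: the ">" in the regex's char class has no case in this switch, so "<a>" takes the default (invalid) branch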
 							case Byte_ascii.Space:
 							case Byte_ascii.Tab:
 							case Byte_ascii.Cr:
 							case Byte_ascii.Nl:
 								break;
+							// next char after "<a" is not ws -> invalid
 							default:
 								regex_valid = false;
 								break;
 						}
 						if (regex_valid) {
 							// find </a>
-							nxt_pos++;
-							int anch_end = Bry_find_.Find_fwd(src, Tag__anch__rhs, nxt_pos, src_end);
+							tmp_pos++;
+							int anch_end = Bry_find_.Find_fwd(src, Tag__anch__rhs, tmp_pos, src_end);
+							// </a> not found -> invalid
 							if (anch_end == Bry_find_.Not_found) {
 								regex_valid = false;
 							}
+							// </a> found -> valid; set cur to after "</a>"
 							else {
 								cur = anch_end + Tag__anch__rhs.length;
 							}
@ -141,36 +145,51 @@ public class Xomw_magiclinks_wkr {
 					break;
 				case Regex__elem: // (<.*?>) |                    // m[2]: Skip stuff inside
 					// just find ">"
-					int elem_end = Bry_find_.Find_fwd(src, Byte_ascii.Angle_end, nxt_pos, src_end);
-					if (elem_end == Bry_find_.Not_found)
+					tmp_pos = Bry_find_.Find_fwd(src, Byte_ascii.Angle_end, tmp_pos, src_end);
+					// > not found -> invalid
+					if (tmp_pos == Bry_find_.Not_found) {
 						regex_valid = false;
-					else
-						cur = elem_end + 1;
+					}
+					// > found -> valid; set cur to after ">"
+					else {
+						cur = tmp_pos + 1;
+					}
 					break;
 				case Regex__free:
-					if (regex_boundary.Is_boundary_prv(src, cur)) {
-						int url_end = regex_url.Find_fwd_while(trv, src, nxt_pos, src_end);
-						if (url_end == nxt_pos) {
+					// make sure that protocol starts at word bound; EX: "ahttp://a.org" should be invalid
+					if (regex_boundary.Is_boundary_prv(src, hook_bgn)) {
+						// skip forward until invalid url char
+						tmp_pos = regex_url.Find_fwd_while(trv, src, tmp_pos, src_end);
+						// no url chars found -> invalid
+						if (tmp_pos == hook_end) {
 							regex_valid = false;
 						}
-						else
-							cur = url_end;
+						// url chars found -> valid; set cur to 1st invalid url-char;
+						else {
+							cur = tmp_pos;
+						}
 					}
 					else
 						regex_valid = false;
 					break;
 			}
+			// regex is invalid; advance by 1 and continue;
 			if (!regex_valid) {
 				cur++;
 			}
+			// regex is valid
 			else {
+				// handle free
 				if (regex_tid == Regex__free) {
 					this.page_title = pctx.Page_title().Full_db();
                        dirty = true;
-					bfr.Add_mid(src, prv, old_pos);
-                        this.Make_free_external_link(bfr, Bry_.Mid(src, old_pos, cur), 0);
+					bfr.Add_mid(src, prv, hook_bgn);
+					byte[] url = Bry_.Mid(src, hook_bgn, cur);
+					int num_post_proto = cur - hook_end; // get length of url without proto; EX: "http://a.org" should be 5 ("a.org")
+                        this.Make_free_external_link(bfr, url, num_post_proto);
 					prv = cur;
 				}
+				// "<a ...</a>" and "<...>" matches are skipped rather than linked; their case branches above already advanced cur, so this is a no-op
 				else {
 				}
 			}
@ -223,9 +242,10 @@ public class Xomw_magiclinks_wkr {

 		// Verify that we still have a real URL after trail removal, and
 		// not just lone protocol
-//			if (strlen($trail) >= $numPostProto) {
-//				return $url . $trail;
-//			}
+		if (trail.length >= num_post_proto) {
+			bfr.Add_bry_many(url, trail);
+			return;
+		}

 //			$url = Sanitizer::cleanUrl($url);

--- a/400_xowa/src/gplx/xowa/mws/parsers/magiclinks/Xomw_magiclinks_wkr__tst.java
+++ b/400_xowa/src/gplx/xowa/mws/parsers/magiclinks/Xomw_magiclinks_wkr__tst.java
@ -19,55 +19,55 @@ package gplx.xowa.mws.parsers.magiclinks; import gplx.*; import gplx.xowa.*; imp
 import org.junit.*;
 public class Xomw_magiclinks_wkr__tst {
 	private final    Xomw_magiclinks_wkr__fxt fxt = new Xomw_magiclinks_wkr__fxt();
-	@Test   public void Basic() {fxt.Test__parse("a https://b.org c", "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a> c");}
+	@Test   public void Basic() {fxt.Test__parse("a https://b.org z", "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a> z");}
+	@Test   public void Invalid() {fxt.Test__parse("a _https://b.org z", "a _https://b.org z");}
+	@Test   public void Tag__anch() {fxt.Test__parse("a <a title=\"https://b.org\">b</a> z", "a <a title=\"https://b.org\">b</a> z");}
+	@Test   public void Tag__misc() {fxt.Test__parse("a <div title=\"https://b.org\">b</div> z", "a <div title=\"https://b.org\">b</div> z");}
 	@Test   public void Interrupt() {
 		// ent
-		fxt.Test__parse("a https://b.org&lt;c"   , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>&lt;c");
+		fxt.Test__parse("a https://b.org&lt;z"   , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>&lt;z");
 		// hex
-		fxt.Test__parse("a https://b.org&#x3c;c" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>&#x3c;c");
+		fxt.Test__parse("a https://b.org&#x3c;z" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>&#x3c;z");
 		// dec
-		fxt.Test__parse("a https://b.org&#60;c"  , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>&#60;c");
+		fxt.Test__parse("a https://b.org&#60;z"  , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>&#60;z");
+		// num_post_proto rule
+		fxt.Test__parse("a https://&lt; z"       , "a https://&lt; z");
 	}
 	@Test   public void Interrupt__hex_dec() {// implementation specific test for mixed hex / dec
 		// hex-dec
-		fxt.Test__parse("a https://b.org&#x60;c" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;#x60;c'>https://b.org&amp;#x60;c</a>");
+		fxt.Test__parse("a https://b.org&#x60;z" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;#x60;z'>https://b.org&amp;#x60;z</a>");
 		// dec-hex
-		fxt.Test__parse("a https://b.org&#3c;c"  , "a <a class='external free' rel='nofollow' href='https://b.org&amp;#3c;c'>https://b.org&amp;#3c;c</a>");
+		fxt.Test__parse("a https://b.org&#3c;z"  , "a <a class='external free' rel='nofollow' href='https://b.org&amp;#3c;z'>https://b.org&amp;#3c;z</a>");
 	}
 	@Test   public void Separator() {
-		// basic
-		fxt.Test__parse("a https://b.org.:!? c"      , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>.:!? c");
+		// basic; ,;.:!?
+		fxt.Test__parse("a https://b.org,;.:!? z"    , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>,;.:!? z");
 		// ")" excluded
-		fxt.Test__parse("a https://b.org).:!? c"     , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>).:!? c");
+		fxt.Test__parse("a https://b.org).:!? z"     , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>).:!? z");
 		// ")" included b/c "(" exists
-		fxt.Test__parse("a https://b.org().:!? c"    , "a <a class='external free' rel='nofollow' href='https://b.org()'>https://b.org()</a>.:!? c");
+		fxt.Test__parse("a https://b.org().:!? z"    , "a <a class='external free' rel='nofollow' href='https://b.org()'>https://b.org()</a>.:!? z");
 		// ";" excluded
-		fxt.Test__parse("a https://b.org;.:!? c"     , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>;.:!? c");
+		fxt.Test__parse("a https://b.org;.:!? z"     , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>;.:!? z");
 		// ";" included b/c of ent
-		fxt.Test__parse("a https://b.org&abc;.:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;abc;'>https://b.org&amp;abc;</a>.:!? c");
+		fxt.Test__parse("a https://b.org&abc;.:!? z" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;abc;'>https://b.org&amp;abc;</a>.:!? z");
 		// ";" included b/c of hex
-		fxt.Test__parse("a https://b.org&#xB1;.:!? c", "a <a class='external free' rel='nofollow' href='https://b.org&amp;#xB1;'>https://b.org&amp;#xB1;</a>.:!? c");
+		fxt.Test__parse("a https://b.org&#xB1;.:!? z", "a <a class='external free' rel='nofollow' href='https://b.org&amp;#xB1;'>https://b.org&amp;#xB1;</a>.:!? z");
 		// ";" included b/c of dec
-		fxt.Test__parse("a https://b.org&#123;.:!? c", "a <a class='external free' rel='nofollow' href='https://b.org&amp;#123;'>https://b.org&amp;#123;</a>.:!? c");
+		fxt.Test__parse("a https://b.org&#123;.:!? z", "a <a class='external free' rel='nofollow' href='https://b.org&amp;#123;'>https://b.org&amp;#123;</a>.:!? z");
 		// ";" excluded b/c of invalid.ent
-		fxt.Test__parse("a https://b.org&a1b;.:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;a1b'>https://b.org&amp;a1b</a>;.:!? c");
+		fxt.Test__parse("a https://b.org&a1b;.:!? z" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;a1b'>https://b.org&amp;a1b</a>;.:!? z");
 		// ";" excluded b/c of invalid.hex
-		fxt.Test__parse("a https://b.org&#x;.:!? c"  , "a <a class='external free' rel='nofollow' href='https://b.org&amp;#x'>https://b.org&amp;#x</a>;.:!? c");
+		fxt.Test__parse("a https://b.org&#x;.:!? z"  , "a <a class='external free' rel='nofollow' href='https://b.org&amp;#x'>https://b.org&amp;#x</a>;.:!? z");
 		// ";" excluded b/c of invalid.dec
-		fxt.Test__parse("a https://b.org&#a;.:!? c"  , "a <a class='external free' rel='nofollow' href='https://b.org&amp;#a'>https://b.org&amp;#a</a>;.:!? c");
+		fxt.Test__parse("a https://b.org&#a;.:!? z"  , "a <a class='external free' rel='nofollow' href='https://b.org&amp;#a'>https://b.org&amp;#a</a>;.:!? z");
+		// num_post_proto rule
+		fxt.Test__parse("a https://.:!? z"           , "a https://.:!? z");
 	}
-/*
-TESTS: regex
-"<a https://a.org>"
-"<img https://a.org>"
-*/
-	@Test   public void Invalid() {fxt.Test__parse("a _https://b.org c", "a _https://b.org c");}
 }
 class Xomw_magiclinks_wkr__fxt {
 	private final    Xomw_magiclinks_wkr wkr = new Xomw_magiclinks_wkr();
 	private final    Xomw_parser_ctx pctx = new Xomw_parser_ctx();
 	private final    Xomw_parser_bfr pbfr = new Xomw_parser_bfr();
-	private boolean apos = true;
 	public Xomw_magiclinks_wkr__fxt() {
 		Xoae_app app = Xoa_app_fxt.Make__app__edit();
 		Xowe_wiki wiki = Xoa_app_fxt.Make__wiki__edit(app);
@ -76,7 +76,8 @@ class Xomw_magiclinks_wkr__fxt {
 		pctx.Init_by_page(wiki.Ttl_parse(Bry_.new_a7("Page_1")));
 		wkr.Init_by_wiki(new Xomw_linker(), new Xomw_regex_boundary(regex_space), new Xomw_regex_url(regex_space));
 	}
-	public void Test__parse(String src_str, String expd) {
+	public void Test__parse(String src_str, String expd) {Test__parse(Bool_.Y, src_str, expd);}
+	public void Test__parse(boolean apos, String src_str, String expd) {
 		byte[] src_bry = Bry_.new_u8(src_str);
 		pbfr.Init(src_bry);
 		wkr.Do_magic_links(pctx, pbfr);