Mw_parse: Add clean_url and associated functions to sanitizer

2025-06-13 12:54:14 +00:00 · 2017-01-30 09:51:17 -05:00 · 2017-01-30 09:51:17 -05:00 · 9a5c70b506
commit 9a5c70b506
parent c77e8a4374
15 changed files with 914 additions and 265 deletions
--- a/100_core/src/gplx/Byte_ascii.java
+++ b/100_core/src/gplx/Byte_ascii.java
@ -40,7 +40,7 @@ public class Byte_ascii {
 		, Ltr_n			= 110, Ltr_o			= 111, Ltr_p			= 112, Ltr_q			= 113, Ltr_r			= 114
 		, Ltr_s			= 115, Ltr_t			= 116, Ltr_u			= 117, Ltr_v			= 118, Ltr_w			= 119
 		, Ltr_x			= 120, Ltr_y			= 121, Ltr_z			= 122, Curly_bgn		= 123, Pipe				= 124
-		, Curly_end		= 125, Tilde			= 126
+		, Curly_end		= 125, Tilde			= 126, Delete           = 127
 		;
 	public static final byte
 	  Angle_bgn = Lt, Angle_end = Gt
--- a/100_core/src/gplx/core/btries/Btrie_slim_mgr.java
+++ b/100_core/src/gplx/core/btries/Btrie_slim_mgr.java
@ -117,6 +117,14 @@ public class Btrie_slim_mgr implements Btrie_mgr {
 		}
 		return this;
 	}
+	/** Adds every byte array in {@code ary} through Add_obj, passing the same array for both arguments; returns this for chaining. */
+	public Btrie_slim_mgr Add_many_bry(byte[]... ary) {
+		for (byte[] key : ary) {
+			Add_obj(key, key);	// the array is handed to Add_obj twice, exactly as given
+		}
+		return this;
+	}
 	public Btrie_slim_mgr Add_many_int(int val, String... ary) {return Add_many_int(val, Bry_.Ary(ary));}
 	public Btrie_slim_mgr Add_many_int(int val, byte[]... ary) {
 		int len = ary.length;
--- a/400_xowa/src/gplx/core/brys/Bry_tmp.java
+++ b/400_xowa/src/gplx/core/brys/Bry_tmp.java
@ -0,0 +1,40 @@
+/*
+XOWA: the XOWA Offline Wiki Application
+Copyright (C) 2012 gnosygnu@gmail.com
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+package gplx.core.brys; import gplx.*; import gplx.core.*;
+/** Mutable holder for a byte-array slice (src, src_bgn, src_end) plus a flag recording whether the slice was replaced from a buffer. */
+public class Bry_tmp {
+	public byte[] src;
+	public int src_bgn;
+	public int src_end;
+	public boolean dirty;
+	/** Points this holder at the given slice and clears the dirty flag; returns this so the call can be used inline. */
+	public Bry_tmp Init(byte[] src, int src_bgn, int src_end) {
+		this.dirty = false;
+		this.src_end = src_end;
+		this.src_bgn = src_bgn;
+		this.src = src;
+		return this;
+	}
+	/** Marks the holder dirty and replaces the slice with everything drained from bfr (bfr is cleared by To_bry_and_clear). */
+	public void Set_by_bfr(Bry_bfr bfr) {
+		this.dirty = true;
+		byte[] drained = bfr.To_bry_and_clear();
+		this.src = drained;
+		this.src_bgn = 0;
+		this.src_end = drained.length;
+	}
+	/** Appends the current slice to bfr. */
+	public void Add_to_bfr(Bry_bfr bfr) {
+		bfr.Add_mid(this.src, this.src_bgn, this.src_end);
+	}
+}
--- a/400_xowa/src/gplx/langs/htmls/encoders/Gfo_url_encoder_.java
+++ b/400_xowa/src/gplx/langs/htmls/encoders/Gfo_url_encoder_.java
@ -62,6 +62,12 @@ public class Gfo_url_encoder_ {
 		return new Gfo_url_encoder_mkr().Init(Byte_ascii.Percent).Init_common(Bool_.N)
 			.Init__diff__one(Byte_ascii.Space, Byte_ascii.Plus);
 	}
+	/** Returns an encoder maker configured with "%" as the escape marker, the common settings enabled, and space mapped to "+". */
+	public static Gfo_url_encoder_mkr New__php_urlencode() {
+		// intended to be equivalent to php's urlencode; http://php.net/manual/en/function.urlencode.php;
+		// "Returns a String in which all non-alphanumeric characters except -_. have been replaced with a percent (%) sign followed by two hex digits and spaces encoded as plus (+) signs"
+		// NOTE(review): the exact character set kept literal depends on Init_common(Bool_.Y), which is not visible here -- confirm it matches "-_." only
+		return new Gfo_url_encoder_mkr().Init(Byte_ascii.Percent).Init_common(Bool_.Y)
+			.Init__diff__one(Byte_ascii.Space, Byte_ascii.Plus);
+	}
 	private static Gfo_url_encoder_mkr New__http_url_ttl() {
 		return new Gfo_url_encoder_mkr().Init(Byte_ascii.Percent).Init_common(Bool_.Y);
 	}
@ -103,5 +109,6 @@ public class Gfo_url_encoder_ {
 	, Http_url			= Gfo_url_encoder_.New__http_url().Make()
 	, Http_url_ttl		= Gfo_url_encoder_.New__http_url_ttl().Make()
 	, Mw_ttl			= Gfo_url_encoder_.New__mw_ttl().Make()
+	, Php_urlencode		= Gfo_url_encoder_.New__php_urlencode().Make()
 	;
 }
--- a/400_xowa/src/gplx/langs/phps/utls/Php_preg_.java
+++ b/400_xowa/src/gplx/langs/phps/utls/Php_preg_.java
@ -16,7 +16,7 @@ You should have received a copy of the GNU Affero General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 package gplx.langs.phps.utls; import gplx.*; import gplx.langs.*; import gplx.langs.phps.*;
-import gplx.core.btries.*;
+import gplx.core.btries.*; import gplx.core.brys.*;
 import gplx.core.primitives.*;
 public class Php_preg_ {
 	public static byte[][] Split(Int_list list, byte[] src, int src_bgn, int src_end, byte[] dlm, boolean extend) {
@ -72,4 +72,41 @@ public class Php_preg_ {
 		}
 		return null;
 	}
+	
+	/**
+	 * Replaces every find_trie match inside the slice held by bry with repl_bry.
+	 * If nothing matches, bry is left untouched; otherwise the rewritten bytes are built in tmp
+	 * and handed to bry via Set_by_bfr (which drains tmp and marks bry dirty).
+	 *
+	 * @param bry       slice to scan (src, src_bgn, src_end); updated in place when a match is found
+	 * @param tmp       scratch buffer; NOTE(review): presumably expected to be empty on entry -- confirm with callers
+	 * @param find_trie trie of byte sequences to replace
+	 * @param trv       trie result holder; its Pos() is read as the position after a match
+	 * @param repl_bry  bytes written in place of each match
+	 */
+	public static void Replace(Bry_tmp bry, Bry_bfr tmp, Btrie_slim_mgr find_trie, Btrie_rv trv, byte[] repl_bry) {
+		byte[] src = bry.src;
+		int src_bgn = bry.src_bgn;
+		int src_end = bry.src_end;
+ 
+		int cur = src_bgn;
+		int prv = cur;
+		boolean dirty = false;
+
+		// loop on "cur < src_end" rather than stopping only at "cur == src_end":
+		// the per-char advance below can step past src_end when the slice ends in a truncated multi-byte sequence,
+		// and the scan must not continue into bytes that lie beyond the slice
+		while (cur < src_end) {
+			byte b = src[cur];
+			Object o = find_trie.Match_at_w_b0(trv, b, src, cur, src_end);
+			if (o == null) {
+				// no match: skip one whole char, sized by its first byte
+				cur += gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(b);
+			}
+			else {
+				// match: flush the unmatched run, write the replacement, resume after the match
+				dirty = true;
+				tmp.Add_mid(src, prv, cur);
+				tmp.Add(repl_bry);
+				cur = trv.Pos();
+				prv = cur;
+			}
+		}
+
+		if (dirty) {
+			// flush whatever follows the last match, then publish the rewritten bytes
+			tmp.Add_mid(src, prv, src_end);
+			bry.Set_by_bfr(tmp);
+		}
+	}
 }
--- a/400_xowa/src/gplx/langs/phps/utls/Php_str_.java
+++ b/400_xowa/src/gplx/langs/phps/utls/Php_str_.java
@ -44,7 +44,7 @@ public class Php_str_ {
 		if (max == -1) max = src_len;
 		int rv = 0;
 		for (int i = bgn; i < src_len; i++) {
-			if (find[src[i]] && rv < max) 
+			if (find[src[i] & 0xFF] && rv < max) // PATCH.JAVA:need to convert to unsigned byte
 				rv++;
 			else
 				break;
@ -94,7 +94,7 @@ public class Php_str_ {
 		if (max == -1) max = Int_.Max_value;
 		int rv = 0;
 		for (int i = bgn - 1; i > -1; i--) {
-			if (find[src[i]] && rv < max) 
+			if (find[src[i] & 0xFF] && rv < max)  // PATCH.JAVA:need to convert to unsigned byte; mask the byte value, not the index
 				rv++;
 			else
 				break;
--- a/400_xowa/src/gplx/xowa/mws/Xomw_sanitizer.java
+++ b/400_xowa/src/gplx/xowa/mws/Xomw_sanitizer.java
@ -16,65 +16,123 @@ You should have received a copy of the GNU Affero General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 package gplx.xowa.mws; import gplx.*; import gplx.xowa.*;
-import gplx.core.encoders.*; import gplx.core.primitives.*; import gplx.langs.htmls.entitys.*;
+import gplx.core.brys.*; import gplx.core.btries.*; import gplx.core.encoders.*; import gplx.core.primitives.*; import gplx.langs.htmls.entitys.*;
 import gplx.xowa.parsers.htmls.*;
-import gplx.xowa.mws.parsers.*;
+import gplx.xowa.mws.parsers.*; import gplx.langs.phps.utls.*;
 public class Xomw_sanitizer {
 	private final    Mwh_doc_wkr__atr_bldr atr_bldr = new Mwh_doc_wkr__atr_bldr();
 	private final    Mwh_atr_parser atr_parser = new Mwh_atr_parser();
-//		static function cleanUrl($url) {
-//			// Normalize any HTML entities in input. They will be
-//			// re-escaped by makeExternalLink().
-//			$url = Sanitizer::decodeCharReferences($url);
-//
-//			// Escape any control characters introduced by the above step
-//			$url = preg_replace_callback('/[\][<>"\\x00-\\x20\\x7F\|]/',
-//				[ __CLASS__, 'cleanUrlCallback' ], $url);
-//
-//			// Validate hostname portion
-//			$matches = [];
-//			if (preg_match('!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches)) {
-//				list(/* $whole */, $protocol, $host, $rest) = $matches;
-//
-//				// Characters that will be ignored in IDNs.
-//				// https://tools.ietf.org/html/rfc3454#section-3.1
-//				// Strip them before further processing so blacklists and such work.
-//				$strip = "/
-//					\\s|          // general whitespace
-//					\xc2\xad|     // 00ad SOFT HYPHEN
-//					\xe1\xa0\x86| // 1806 MONGOLIAN TODO SOFT HYPHEN
-//					\xe2\x80\x8b| // 200b ZERO WIDTH SPACE
-//					\xe2\x81\xa0| // 2060 WORD JOINER
-//					\xef\xbb\xbf| // feff ZERO WIDTH NO-BREAK SPACE
-//					\xcd\x8f|     // 034f COMBINING GRAPHEME JOINER
-//					\xe1\xa0\x8b| // 180b MONGOLIAN FREE VARIATION SELECTOR ONE
-//					\xe1\xa0\x8c| // 180c MONGOLIAN FREE VARIATION SELECTOR TWO
-//					\xe1\xa0\x8d| // 180d MONGOLIAN FREE VARIATION SELECTOR THREE
-//					\xe2\x80\x8c| // 200c ZERO WIDTH NON-JOINER
-//					\xe2\x80\x8d| // 200d ZERO WIDTH JOINER
-//					[\xef\xb8\x80-\xef\xb8\x8f] // fe00-fe0f VARIATION SELECTOR-1-16
-//					/xuD";
-//
-//				$host = preg_replace($strip, '', $host);
-//
-//				// IPv6 host names are bracketed with [].  Url-decode these.
-//				if (substr_compare("//%5B", $host, 0, 5) === 0 &&
-//					preg_match('!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $matches)
-//				) {
-//					$host = '//[' . $matches[1] . ']' . $matches[2];
-//				}
-//
-//				// @todo FIXME: Validate hostnames here
-//
-//				return $protocol . $host . $rest;
-//			} else {
-//				return $url;
-//			}
-//		}
-//
-//		static function cleanUrlCallback($matches) {
-//			return urlencode($matches[0]);
-//		}
+	private final    Xomw_regex_escape_invalid regex_clean_url = new Xomw_regex_escape_invalid();
+	private final    Xomw_regex_find_domain regex_find_domain = new Xomw_regex_find_domain();
+	private final    Xomw_regex_ipv6_brack regex_ipv6_brack = new Xomw_regex_ipv6_brack();
+	private final    Bry_tmp tmp_host = new Bry_tmp();
+	private final    Bry_bfr tmp_bfr = Bry_bfr_.New();
+	private final    Btrie_rv trv = new Btrie_rv();
+	private final    Xomw_regex_url_char_cbk__normalize normalize_cbk;
+	private final    Xomw_regex_url_char_cbk__decode decode_cbk;
+
+	// volatile and assigned LAST in the guarded init below: a thread that reads non-null here also sees invalid_idn_trie and html_entities populated
+	private static volatile Xomw_regex_url_char regex_url_char;
+	private static Btrie_slim_mgr invalid_idn_trie;
+	/** Creates the per-instance callbacks and, on first construction only, builds the static lookup structures shared by all instances. */
+	public Xomw_sanitizer() {
+		this.normalize_cbk = new Xomw_regex_url_char_cbk__normalize(this);
+		this.decode_cbk = new Xomw_regex_url_char_cbk__decode(this);
+		if (regex_url_char == null) {
+			synchronized (Xomw_sanitizer.class) {
+				// re-check under the lock so that only the first thread builds the tables
+				if (regex_url_char == null) {
+					// Characters that will be ignored in IDNs.
+					// https://tools.ietf.org/html/rfc3454#section-3.1
+					// $strip = "/
+					//	 \\s|          // general whitespace
+					//	 \xc2\xad|     // 00ad SOFT HYPHEN
+					//	 \xe1\xa0\x86| // 1806 MONGOLIAN TODO SOFT HYPHEN
+					//	 \xe2\x80\x8b| // 200b ZERO WIDTH SPACE
+					//	 \xe2\x81\xa0| // 2060 WORD JOINER
+					//	 \xef\xbb\xbf| // feff ZERO WIDTH NO-BREAK SPACE
+					//	 \xcd\x8f|     // 034f COMBINING GRAPHEME JOINER
+					//	 \xe1\xa0\x8b| // 180b MONGOLIAN FREE VARIATION SELECTOR ONE
+					//	 \xe1\xa0\x8c| // 180c MONGOLIAN FREE VARIATION SELECTOR TWO
+					//	 \xe1\xa0\x8d| // 180d MONGOLIAN FREE VARIATION SELECTOR THREE
+					//	 \xe2\x80\x8c| // 200c ZERO WIDTH NON-JOINER
+					//	 \xe2\x80\x8d| // 200d ZERO WIDTH JOINER
+					//	 [\xef\xb8\x80-\xef\xb8\x8f] // fe00-fe0f VARIATION SELECTOR-1-16
+					//	 /xuD";
+					// XO.MW.REGEX:http://php.net/manual/en/reference.pcre.pattern.modifiers.php
+					//   /x : ignore embedded ws
+					//   /u : enabled pcre utf8
+					//   /D : $ matches EOS, not NL
+					invalid_idn_trie = Btrie_slim_mgr.cs()
+					.Add_many_bry(new Xomw_regex_parser().Add_ary
+					( "\\s"
+					, "\\xc2\\xad"      // 00ad SOFT HYPHEN
+					, "\\xe1\\xa0\\x86" // 1806 MONGOLIAN TODO SOFT HYPHEN
+					, "\\xe2\\x80\\x8b" // 200b ZERO WIDTH SPACE
+					, "\\xe2\\x81\\xa0" // 2060 WORD JOINER
+					, "\\xef\\xbb\\xbf" // feff ZERO WIDTH NO-BREAK SPACE
+					, "\\xcd\\x8f"      // 034f COMBINING GRAPHEME JOINER
+					, "\\xe1\\xa0\\x8b" // 180b MONGOLIAN FREE VARIATION SELECTOR ONE
+					, "\\xe1\\xa0\\x8c" // 180c MONGOLIAN FREE VARIATION SELECTOR TWO
+					, "\\xe1\\xa0\\x8d" // 180d MONGOLIAN FREE VARIATION SELECTOR THREE
+					, "\\xe2\\x80\\x8c" // 200c ZERO WIDTH NON-JOINER
+					, "\\xe2\\x80\\x8d" // 200d ZERO WIDTH JOINER
+					)
+					.Add_rng
+					( "\\xef\\xb8\\x80", "\\xef\\xb8\\x8f" // fe00-fe0f VARIATION SELECTOR-1-16
+					)
+					.Rslt());
+
+					// assert static structs; already inside the class lock, so no second synchronized is needed
+					if (html_entities == null) {
+						html_entities = Html_entities_new();
+					}
+
+					// publish the guard field only after everything above is in place
+					regex_url_char = new Xomw_regex_url_char();
+				}
+			}
+		}
+	}
+
+	/**
+	 * XO.MW:Sanitizer::cleanUrl. Decodes char references, percent-escapes the characters matched by
+	 * regex_clean_url, and, when the url has a "//host" part, strips the ignorable IDN characters from the host
+	 * and un-escapes a bracketed IPv6 host. Uses the instance's scratch buffers (tmp_bfr, tmp_host, trv).
+	 */
+	public byte[] Clean_url(byte[] url) {
+		// Normalize any HTML entities in input. They will be
+		// re-escaped by makeExternalLink().			
+		url = Decode_char_references(null, Bool_.Y, url, 0, url.length);
+
+		// Escape any control characters introduced by the above step
+		// XO.MW.REGEX: $url = preg_replace_callback('/[\][<>"\\x00-\\x20\\x7F\|]/', [ __CLASS__, 'cleanUrlCallback' ], $url);
+		//   '[]<>"' | '00 -> 32' | 127
+		boolean escaped = regex_clean_url.Escape(tmp_bfr, url, 0, url.length);
+		if (escaped) {
+			url = tmp_bfr.To_bry_and_clear();
+		}
+
+		// XO.MW.REGEX: if (preg_match('!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches))
+		// no "protocol://host" shape: hand back the (possibly escaped) url as is
+		if (!regex_find_domain.Match(url, 0, url.length)) {
+			return url;
+		}
+
+		// Characters that will be ignored in IDNs.
+		// https://tools.ietf.org/html/rfc3454#section-3.1
+		// Strip them before further processing so blacklists and such work.
+		Bry_tmp host = tmp_host.Init(url, regex_find_domain.host_bgn, regex_find_domain.host_end);	// Init returns tmp_host itself
+		Php_preg_.Replace(host, tmp_bfr, invalid_idn_trie, trv, Bry_.Empty);
+
+		// IPv6 host names are bracketed with [].  Url-decode these.
+		// if (substr_compare("//%5B", $host, 0, 5) === 0 &&
+		//	preg_match('!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $matches)
+		//  XO.MW.REGEX:
+		//    !^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!
+		//    "//%5B" + ("hex-dec" | [:.]) + "%5D" + numbers
+		//    EX: [ABCD]:80:12
+		boolean is_ipv6 = regex_ipv6_brack.Match(host.src, host.src_bgn, host.src_end);
+		if (is_ipv6) {
+			tmp_bfr.Add_str_a7("//[");
+			tmp_bfr.Add_mid(host.src, regex_ipv6_brack.host_bgn, regex_ipv6_brack.host_end);
+			tmp_bfr.Add_byte(Byte_ascii.Brack_end);
+			tmp_bfr.Add_mid(host.src, regex_ipv6_brack.segs_bgn, regex_ipv6_brack.segs_end);
+			host.Set_by_bfr(tmp_bfr);
+		}
+
+		// @todo FIXME: Validate hostnames here
+
+		// reassemble: protocol + cleaned host + rest
+		tmp_bfr.Add_mid(url, regex_find_domain.prot_bgn, regex_find_domain.prot_end);
+		host.Add_to_bfr(tmp_bfr);
+		tmp_bfr.Add_mid(url, regex_find_domain.rest_bgn, regex_find_domain.rest_end);
+		return tmp_bfr.To_bry_and_clear();
+	}
 	public void Fix_tag_attributes(Bry_bfr bfr, byte[] tag_name, byte[] atrs) {
 		atr_bldr.Atrs__clear();
 		atr_parser.Parse(atr_bldr, -1, -1, atrs, 0, atrs.length);
@ -105,163 +163,13 @@ public class Xomw_sanitizer {
 		Normalize_char_references(bfr, Bool_.N, src, src_bgn, src_end);
 	}
 	public byte[] Normalize_char_references(Bry_bfr bfr, boolean lone_bfr, byte[] src, int src_bgn, int src_end) {
-		// assert static structs
-		if (Normalize__dec == null) {
-			synchronized (Xomw_sanitizer.class) {
-				html_entities = Html_entities_new();
-				Normalize__dec = Bool_ary_bldr.New_u8().Set_rng(Byte_ascii.Num_0, Byte_ascii.Num_9).To_ary();
-				Normalize__hex = Bool_ary_bldr.New_u8()
-					.Set_rng(Byte_ascii.Num_0, Byte_ascii.Num_9)
-					.Set_rng(Byte_ascii.Ltr_A, Byte_ascii.Ltr_Z)
-					.Set_rng(Byte_ascii.Ltr_a, Byte_ascii.Ltr_z)
-					.To_ary();
-				Normalize__ent = Bool_ary_bldr.New_u8()
-					.Set_rng(Byte_ascii.Num_0, Byte_ascii.Num_9)
-					.Set_rng(Byte_ascii.Ltr_A, Byte_ascii.Ltr_Z)
-					.Set_rng(Byte_ascii.Ltr_a, Byte_ascii.Ltr_z)
-					.Set_rng(128, 255)
-					.To_ary();
-			}
-		}
-
-		// XO.BRY_BFR
-		boolean dirty = false;
-		int cur = src_bgn;
-		boolean called_by_bry = bfr == null;
-
-		while (true) {
-			// search for "&"
-			int find_bgn = Bry_find_.Find_fwd(src, Byte_ascii.Amp, cur);
-			if (find_bgn == Bry_find_.Not_found) {	// "&" not found; exit
-				if (dirty)
-					bfr.Add_mid(src, cur, src_end);
-				break;
-			}
-			int ent_bgn = find_bgn + 1;	// +1 to skip &
-
-			// get regex; (a) dec (&#09;); (b) hex (&#xFF;); (c) entity (&alpha;);
-			boolean[] regex = null;
-			// check for #;
-			if (ent_bgn < src_end && src[ent_bgn] == Byte_ascii.Hash) {
-				ent_bgn++;
-				if (ent_bgn < src_end) {
-					byte nxt = src[ent_bgn];
-					// check for x
-					if (nxt == Byte_ascii.Ltr_X || nxt == Byte_ascii.Ltr_x) {
-						ent_bgn++;
-						regex = Normalize__hex;
-					}
-				}
-				if (regex == null)
-					regex = Normalize__dec;
-			}
-			else {
-				regex = Normalize__ent;
-			}
-
-			// keep looping until invalid regex
-			int ent_end = ent_bgn;
-			byte b = Byte_ascii.Null;
-			for (int i = ent_bgn; i < src_end; i++) {
-				b = src[i];
-				if (regex[b])
-					ent_end++;
-				else
-					break;
-			}
-
-			// mark dirty; can optimize later by checking if "&lt;" already exists
-			dirty = true;
-			if (bfr == null) bfr = Bry_bfr_.New();
-			bfr.Add_mid(src, cur, find_bgn); // add everything before &
-
-			// invalid <- regex ended, but not at semic
-			if (b != Byte_ascii.Semic) {
-				bfr.Add(Gfh_entity_.Amp_bry);       // transform "&" to "&amp;"
-				cur = find_bgn + 1;                 // position after "&"
-				continue;
-			}
-
-			// do normalization
-			byte[] name = Bry_.Mid(src, ent_bgn, ent_end);
-			boolean ret = false;
-			if      (regex == Normalize__ent) {
-				Normalize_entity(bfr, name);
-				ret = true;
-			}
-			else if (regex == Normalize__dec) {
-				ret = Dec_char_reference(bfr, name);
-			}
-			else if (regex == Normalize__hex) {
-				ret = Hex_char_reference(bfr, name);
-			}
-			if (!ret) {
-				bfr.Add(Gfh_entity_.Amp_bry);       // transform "&" to "&amp;"
-				bfr.Add_bry_escape_html(src, find_bgn + 1, ent_end + 1); // "find_bgn + 1" to start after "&"; "ent_end + 1" to include ";"
-			}
-
-			cur = ent_end + 1;	// +1 to position after ";"
-		}
-
-		// XO.BRY_BFR
-		if (dirty) {
-			if (called_by_bry)
-				return bfr.To_bry_and_clear();
-			else
-				return Bry_.Empty;
-		}
-		else {
-			if (called_by_bry) {
-				if (src_bgn == 0 && src_end == src.length)
-					return src;
-				else
-					return Bry_.Mid(src, src_bgn, src_end);
-			}
-			else {
-				if (lone_bfr)
-					bfr.Add_mid(src, src_bgn, src_end);
-				return null;
-			}
-		}
+		return regex_url_char.Replace_by_cbk(bfr, lone_bfr, src, src_bgn, src_end, normalize_cbk);
+	}
+	/** Runs the shared char-reference scan over src with the decode callback; all arguments are forwarded unchanged to Replace_by_cbk. */
+	public byte[] Decode_char_references(Bry_bfr bfr, boolean lone_bfr, byte[] src, int src_bgn, int src_end) {
+		byte[] decoded = regex_url_char.Replace_by_cbk(bfr, lone_bfr, src, src_bgn, src_end, decode_cbk);
+		return decoded;
 	}

-	// If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
-	// return the equivalent numeric entity reference (except for the core &lt;
-	// &gt; &amp; &quot;). If the entity is a MediaWiki-specific alias, returns
-	// the HTML equivalent. Otherwise, returns HTML-escaped text of
-	// pseudo-entity source (eg &amp;foo;)
-	private void Normalize_entity(Bry_bfr bfr, byte[] name) {
-		Object o = html_entities.Get_by_bry(name);
-		if (o == null) {
-			bfr.Add_str_a7("&amp;").Add(name).Add_byte_semic();
-		}
-		else {
-			Xomw_html_ent entity = (Xomw_html_ent)o;
-			bfr.Add(entity.html);
-		}
-	}
-
-	private boolean Dec_char_reference(Bry_bfr bfr, byte[] codepoint) {
-		int point = Bry_.To_int_or(codepoint, -1);
-		if (Validate_codepoint(point)) {
-			bfr.Add_str_a7("&#").Add_int_variable(point).Add_byte_semic();
-			return true;
-		}
-		return false;
-	}
-
-	private boolean Hex_char_reference(Bry_bfr bfr, byte[] codepoint) {
-		int point = Hex_utl_.Parse_or(codepoint, -1);
-		if (Validate_codepoint(point)) {
-			bfr.Add_str_a7("&#x");
-			Hex_utl_.Write_bfr(bfr, Bool_.Y, point);	// sprintf('&#x%x;', $point)
-			bfr.Add_byte_semic();
-			return true;
-		}
-		return false;
-	}
-
-	private boolean Validate_codepoint(int codepoint) {
+	public boolean Validate_codepoint(int codepoint) {
 		// U+000C is valid in HTML5 but not allowed in XML.
 		// U+000D is valid in XML but not allowed in HTML5.
 		// U+007F - U+009F are disallowed in HTML5 (control characters).
@ -273,14 +181,13 @@ public class Xomw_sanitizer {
 			|| (codepoint >= 0x10000 && codepoint <= 0x10ffff);
 	}

-	private static boolean[] Normalize__dec, Normalize__hex, Normalize__ent; 
-	private static Hash_adp_bry html_entities;
+	public static Hash_adp_bry html_entities;
 	private static Hash_adp_bry Html_entities_new() {
 		Bry_bfr tmp = Bry_bfr_.New();
 		Hash_adp_bry rv = Hash_adp_bry.cs();

-		Html_entities_set(rv, Xomw_html_ent.Type__alias, -1, "רלמ", "&rlm;");
-		Html_entities_set(rv, Xomw_html_ent.Type__alias, -1, "رلم", "&rlm;");
+		Html_entities_set(rv, Xomw_html_ent.Type__alias, 8207, "רלמ", "&rlm;");
+		Html_entities_set(rv, Xomw_html_ent.Type__alias, 8207, "رلم", "&rlm;");

 		Html_entities_set(rv, Xomw_html_ent.Type__char, 60, "lt", "&lt;");
 		Html_entities_set(rv, Xomw_html_ent.Type__char, 62, "gt", "&gt;");
@ -568,3 +475,395 @@ class Xomw_html_ent {
 	public final    byte[] html;
 	public static final byte Type__null = 0, Type__alias = 1, Type__char = 2, Type__entity = 3;
 }
+/**
+ * Hand-written matcher for the MediaWiki cleanUrl pattern '!^([^:]+:)(//[^/]+)?(.*)$!iD'.
+ * After Match returns true, the public fields hold the offsets of the protocol, host and rest parts.
+ * Unlike the regex (where the host group is optional), Match returns false when no "//host" part is present.
+ */
+class Xomw_regex_find_domain {
+	public int prot_bgn;
+	public int prot_end;
+	public int host_bgn;
+	public int host_end;
+	public int rest_bgn;
+	public int rest_end;
+	/**
+	 * Splits src between src_bgn and src_end into protocol / host / rest.
+	 * @return true if a non-empty protocol followed by a non-empty "//host" was found; false otherwise (fields may be partially set)
+	 */
+	public boolean Match(byte[] src, int src_bgn, int src_end) {
+		// Validate hostname portion
+		// XO.MW.REGEX: if (preg_match('!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches)) {
+		//   ([^:]+:)(//[^/]+)?(.*) 
+		//   "protocol" + "host" + "rest"
+		//   "protocol" -> ([^:]+:)     EX: "https:"    anything not-colon up to colon
+		//   "host"     -> (//[^/]+)?   EX: "//abc/"    anything not-slash up to slash
+		//   "rest"     -> (.*)         EX: rest"
+	    //   /i : case-insensitive
+	    //   /D : $ matches EOS, not NL
+
+		// find prot; EX: "https:"
+		prot_bgn = src_bgn;
+		prot_end = Bry_find_.Move_fwd(src, Byte_ascii.Colon, prot_bgn, src_end);
+		// exit if not found
+		if (prot_end == Bry_find_.Not_found) return false;
+		// exit if nothing precedes the colon; "[^:]+" needs at least one char (prot_end is the position after ":")
+		if (prot_end - prot_bgn < 2) return false;
+
+		// find host: EX: "//a.org"
+		host_bgn = prot_end;
+		int double_slash_end = host_bgn + 2;
+		// exit if eos
+		if (double_slash_end >= src_end) return false;
+		// exit if not "//"
+		if (   src[host_bgn    ] != Byte_ascii.Slash
+			|| src[host_bgn + 1] != Byte_ascii.Slash
+			) return false;
+		host_end = Bry_find_.Find_fwd(src, Byte_ascii.Slash, double_slash_end, src_end);
+		// no further "/": host runs to the end and rest is empty
+		if (host_end == Bry_find_.Not_found) {
+			host_end = src_end;
+		}
+		// exit if only "//"
+		if (host_end - host_bgn == 2) return false;
+
+		// set rest
+		rest_bgn = host_end;
+		rest_end = src_end;
+		return true;
+	}
+}
+/** Hand-written equivalent of preg_replace_callback('/[\][<>"\\x00-\\x20\\x7F\|]/', urlencode): percent-escapes each matching byte. */
+class Xomw_regex_escape_invalid {
+	// [\][<>"\\x00-\\x20\\x7F\|]
+	/**
+	 * Scans src from src_bgn to src_end; if at least one byte needs escaping, writes the escaped text to bfr.
+	 * @return true if bfr was written to; false if nothing matched (bfr untouched)
+	 */
+	public boolean Escape(Bry_bfr bfr, byte[] src, int src_bgn, int src_end) {
+		boolean dirty = false;
+		int pending_bgn = src_bgn;	// start of the run of bytes not yet copied to bfr
+		for (int pos = src_bgn; pos != src_end; pos++) {
+			byte b = src[pos];
+			boolean invalid
+				=  (b >= 0 && b <= 32)
+				|| b == Byte_ascii.Brack_bgn
+				|| b == Byte_ascii.Brack_end
+				|| b == Byte_ascii.Angle_bgn
+				|| b == Byte_ascii.Angle_end
+				|| b == Byte_ascii.Quote
+				|| b == Byte_ascii.Pipe
+				|| b == Byte_ascii.Delete
+				;
+			if (!invalid) continue;
+
+			// copy the clean run, then the escaped form of this one byte
+			bfr.Add_mid(src, pending_bgn, pos);
+			gplx.langs.htmls.encoders.Gfo_url_encoder_.Php_urlencode.Encode(bfr, src, pos, pos + 1);
+			dirty = true;
+			pending_bgn = pos + 1;
+		}
+		// only flush the tail if something was escaped; otherwise bfr stays empty
+		if (dirty) {
+			bfr.Add_mid(src, pending_bgn, src_end);
+		}
+		return dirty;
+	}
+}
+/**
+ * Hand-written matcher for '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!' (a percent-escaped, bracketed IPv6 host with an optional port).
+ * After Match returns true, host_bgn/host_end delimit the address and segs_bgn/segs_end delimit the optional ":port".
+ */
+class Xomw_regex_ipv6_brack {
+	public int host_bgn;
+	public int host_end;
+	public int segs_bgn;
+	public int segs_end;
+	private final    byte[] 
+	  Bry__host_bgn = Bry_.new_a7("//%5B")
+	, Bry__host_end = Bry_.new_a7("%5D")
+	;
+	/**
+	 * Matches the whole slice between src_bgn and src_end against the pattern.
+	 * @return true only if the slice is "//%5B" + one or more of [0-9A-Fa-f:.] + "%5D" + at most one ":digits" and nothing else
+	 */
+	public boolean Match(byte[] src, int src_bgn, int src_end) {
+		//	preg_match('!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $matches)
+		//  XO.MW.REGEX:
+		//    !^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!
+		//    "//%5B" + ("hex-dec" | [:.]) + "%5D" + optional ":" + numbers
+		//    EX: //%5BABCD%5D:80
+		host_bgn = src_bgn + Bry__host_bgn.length;
+		// exit if slice is too short to hold "//%5B"
+		if (host_bgn > src_end) return false;
+		// exit if no match for "//%5B"
+		if (!Bry_.Match(src, src_bgn, host_bgn, Bry__host_bgn)) return false;
+
+		// skip all [0-9A-Fa-f:.]
+		host_end = host_bgn;
+		while (true) {
+			// exit if eos
+			if (host_end == src_end) return false;
+			boolean done = false;
+			byte b = src[host_end];
+			switch (b) {
+				case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4:
+				case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9:
+				case Byte_ascii.Ltr_A: case Byte_ascii.Ltr_B: case Byte_ascii.Ltr_C: case Byte_ascii.Ltr_D: case Byte_ascii.Ltr_E: case Byte_ascii.Ltr_F:
+				case Byte_ascii.Ltr_a: case Byte_ascii.Ltr_b: case Byte_ascii.Ltr_c: case Byte_ascii.Ltr_d: case Byte_ascii.Ltr_e: case Byte_ascii.Ltr_f:
+				case Byte_ascii.Colon:
+				case Byte_ascii.Dot:
+					host_end++;
+					break;
+				case Byte_ascii.Percent:
+					// matches "%5D"
+					segs_bgn = host_end + Bry__host_end.length;
+					if (   segs_bgn <= src_end                    // "%5D" must fit inside the slice
+						&& Bry_.Match(src, host_end, segs_bgn, Bry__host_end)
+						&& host_end - host_bgn > 0) // host can't be 0-len; EX: "//%5B%5D"
+						done = true;
+					// exit if no match
+					else {
+						return false;
+					}
+					break;
+				// exit if no match
+				default: {
+					return false;
+				}
+			}
+			if (done) break;
+		}
+
+		// optional port: "((:\d+)?)$" allows at most ONE ":digits" and then requires end of slice
+		segs_end = segs_bgn;
+		// no port at all
+		if (segs_end == src_end) return true;
+		// exit if what follows "%5D" doesn't start with ":"
+		if (src[segs_end] != Byte_ascii.Colon) return false;
+		int num_bgn = segs_end + 1;
+		int num_end = Bry_find_.Find_fwd_while_num(src, num_bgn, src_end);
+		// exit if no nums found; EX:"//%5BABC%5D:"
+		if (num_end == num_bgn) return false;
+		segs_end = num_end;
+		// exit if anything trails the port; EX:"//%5BABC%5D:80:12"
+		return segs_end == src_end;
+	}
+}
+// Callbacks invoked by Xomw_regex_url_char.Replace_by_cbk for each "&" it finds; each one writes its output to bfr.
+interface Xomw_regex_url_char_cbk {
+	// named entity; name is the text between "&" and ";". Replace_by_cbk does not use the return value.
+	boolean When_ent(Bry_bfr bfr, byte[] name);
+	// decimal reference; name is the text between "&#" and ";". Returning false makes Replace_by_cbk call When_amp and resume right after the "&".
+	boolean When_dec(Bry_bfr bfr, byte[] name);
+	// hex reference; name is the text between "&#x" / "&#X" and ";". Returning false is handled the same way as for When_dec.
+	boolean When_hex(Bry_bfr bfr, byte[] name);
+	// "&" that does not start a complete reference. Replace_by_cbk does not use the return value.
+	boolean When_amp(Bry_bfr bfr);
+}
+/** Callback set used by Normalize_char_references: rewrites references into their normalized entity form. */
+class Xomw_regex_url_char_cbk__normalize implements Xomw_regex_url_char_cbk {
+	private final    Xomw_sanitizer sanitizer;
+	public Xomw_regex_url_char_cbk__normalize(Xomw_sanitizer sanitizer) {
+		this.sanitizer = sanitizer;
+	}
+	public boolean When_ent(Bry_bfr bfr, byte[] name) {  // XO.MW:normalizeEntity
+		// If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
+		// return the equivalent numeric entity reference (except for the core &lt;
+		// &gt; &amp; &quot;). If the entity is a MediaWiki-specific alias, returns
+		// the HTML equivalent. Otherwise, returns HTML-escaped text of
+		// pseudo-entity source (eg &amp;foo;)
+		Xomw_html_ent entity = (Xomw_html_ent)Xomw_sanitizer.html_entities.Get_by_bry(name);
+		if (entity != null) {
+			// known entity: emit its registered html form
+			bfr.Add(entity.html);
+			return true;
+		}
+		// unknown entity: escape the "&" and keep the rest
+		bfr.Add_str_a7("&amp;").Add(name).Add_byte_semic();
+		return false;
+	}
+	public boolean When_dec(Bry_bfr bfr, byte[] name) {  // XO.MW:decCharReference
+		int point = Bry_.To_int_or(name, -1);
+		if (!sanitizer.Validate_codepoint(point)) return false;
+		bfr.Add_str_a7("&#").Add_int_variable(point).Add_byte_semic();
+		return true;
+	}
+	public boolean When_hex(Bry_bfr bfr, byte[] name) {  // XO.MW:hexCharReference
+		int point = Hex_utl_.Parse_or(name, -1);
+		if (!sanitizer.Validate_codepoint(point)) return false;
+		bfr.Add_str_a7("&#x");
+		Hex_utl_.Write_bfr(bfr, Bool_.Y, point);	// sprintf('&#x%x;', $point)
+		bfr.Add_byte_semic();
+		return true;
+	}
+	public boolean When_amp(Bry_bfr bfr) {
+		// transform "&" to "&amp;"
+		bfr.Add(Gfh_entity_.Amp_bry);
+		return true;
+	}
+}
+/** Callback set used by Decode_char_references: rewrites references into the raw bytes of the character they name. */
+class Xomw_regex_url_char_cbk__decode implements Xomw_regex_url_char_cbk {
+	private final    Xomw_sanitizer sanitizer;
+	public Xomw_regex_url_char_cbk__decode(Xomw_sanitizer sanitizer) {
+		this.sanitizer = sanitizer;
+	}
+	public boolean When_ent(Bry_bfr bfr, byte[] name) {// XO.MW:decodeEntity
+		// If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
+		// return the UTF-8 encoding of that character. Otherwise, returns
+		// pseudo-entity source (eg "&foo;")
+		Object o = Xomw_sanitizer.html_entities.Get_by_bry(name);
+		if (o == null) {
+			bfr.Add_byte(Byte_ascii.Amp).Add(name).Add_byte_semic();
+		}
+		else {
+			Xomw_html_ent entity = (Xomw_html_ent)o;
+			bfr.Add(gplx.core.intls.Utf16_.Encode_int_to_bry(entity.code));
+		}
+		return true;
+	}
+	public boolean When_dec(Bry_bfr bfr, byte[] name) {
+		// "&#;" has no digits, so it is not a reference ("[0-9]+" needs one); returning false lets the caller emit the bare "&"
+		if (name.length == 0) return false;
+		// To_int_or (as in the normalize callback) instead of To_int: an unparsable value becomes -1, which Decode_char turns into U+FFFD
+		return Decode_char(bfr, Bry_.To_int_or(name, -1));
+	}
+	public boolean When_hex(Bry_bfr bfr, byte[] name) {
+		// "&#x;" has no digits, so it is not a reference; returning false lets the caller emit the bare "&"
+		if (name.length == 0) return false;
+		return Decode_char(bfr, gplx.core.encoders.Hex_utl_.Parse_or(name, 0, name.length, -1));
+	}
+	public boolean When_amp(Bry_bfr bfr) {
+		bfr.Add_byte(Byte_ascii.Amp);
+		return true;
+	}
+	private boolean Decode_char(Bry_bfr bfr, int point) {// XO.MW:decodeChar
+		// Return UTF-8 String for a codepoint if that is a valid
+		// character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
+		if (sanitizer.Validate_codepoint(point)) {
+			bfr.Add(gplx.core.intls.Utf16_.Encode_int_to_bry(point));
+		}
+		else {
+			bfr.Add(Utf8_replacement_char);
+		}
+		return true;
+	}
+	// U+FFFD encoded as UTF-8 is the three bytes EF BF BD; (255, 253) is the raw code-unit split and is not valid UTF-8
+	private static final    byte[] Utf8_replacement_char = Bry_.New_by_ints(239, 191, 189); // 0xfffd 
+}
+/** Hand-written scanner for MediaWiki's CHAR_REFS_REGEX; finds each "&" reference and delegates the rewrite to a callback. */
+class Xomw_regex_url_char {
+	// Regular expression to match various types of character references in
+	// Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
+	// static final CHAR_REFS_REGEX =
+	//	'/&([A-Za-z0-9\x80-\xff]+);
+	//	|&\#([0-9]+);
+	//	|&\#[xX]([0-9A-Fa-f]+);
+	//	|(&)/x';
+
+	// Lookup tables indexed by unsigned byte value: true if the byte may appear inside that kind of reference.
+	// Built in static initializers so that class initialization (which the JVM serializes) publishes all three together.
+	private static final boolean[] Normalize__dec = Bool_ary_bldr.New_u8()
+		.Set_rng(Byte_ascii.Num_0, Byte_ascii.Num_9)
+		.To_ary();
+	// hex digits only: [0-9A-Fa-f]; letters past F are not part of a hex reference
+	private static final boolean[] Normalize__hex = Bool_ary_bldr.New_u8()
+		.Set_rng(Byte_ascii.Num_0, Byte_ascii.Num_9)
+		.Set_rng(Byte_ascii.Ltr_A, Byte_ascii.Ltr_F)
+		.Set_rng(Byte_ascii.Ltr_a, Byte_ascii.Ltr_f)
+		.To_ary();
+	private static final boolean[] Normalize__ent = Bool_ary_bldr.New_u8()
+		.Set_rng(Byte_ascii.Num_0, Byte_ascii.Num_9)
+		.Set_rng(Byte_ascii.Ltr_A, Byte_ascii.Ltr_Z)
+		.Set_rng(Byte_ascii.Ltr_a, Byte_ascii.Ltr_z)
+		.Set_rng(128, 255)
+		.To_ary();
+
+	public Xomw_regex_url_char() {
+		// static structs are built at class-init; nothing to do per instance
+	}
+	/**
+	 * Scans src between src_bgn and src_end for "&" and lets cbk rewrite each reference.
+	 *
+	 * @param bfr      output buffer, or null to have the result returned as a byte[]
+	 * @param lone_bfr when bfr is non-null and nothing was rewritten: if true, the unchanged slice is still copied to bfr
+	 * @param cbk      receives When_ent / When_dec / When_hex for complete references and When_amp for a stray "&"
+	 * @return with bfr == null: the rewritten bytes, or src itself / a copy of the slice if no "&" was found;
+	 *         with bfr != null: Bry_.Empty if something was rewritten into bfr, else null
+	 */
+	public byte[] Replace_by_cbk(Bry_bfr bfr, boolean lone_bfr, byte[] src, int src_bgn, int src_end, Xomw_regex_url_char_cbk cbk) {
+		// XO.BRY_BFR
+		boolean dirty = false;
+		int cur = src_bgn;
+		boolean called_by_bry = bfr == null;
+
+		while (true) {
+			// search for "&"; bounded by src_end so a "&" beyond the slice is never picked up
+			int find_bgn = Bry_find_.Find_fwd(src, Byte_ascii.Amp, cur, src_end);
+			if (find_bgn == Bry_find_.Not_found) {	// "&" not found; exit
+				if (dirty)
+					bfr.Add_mid(src, cur, src_end);
+				break;
+			}
+			int ent_bgn = find_bgn + 1;	// +1 to skip &
+
+			// get regex; (a) dec (&#09;); (b) hex (&#xFF;); (c) entity (&alpha;);
+			boolean[] regex = null;
+			// check for #;
+			if (ent_bgn < src_end && src[ent_bgn] == Byte_ascii.Hash) {
+				ent_bgn++;
+				if (ent_bgn < src_end) {
+					byte nxt = src[ent_bgn];
+					// check for x
+					if (nxt == Byte_ascii.Ltr_X || nxt == Byte_ascii.Ltr_x) {
+						ent_bgn++;
+						regex = Normalize__hex;
+					}
+				}
+				if (regex == null)
+					regex = Normalize__dec;
+			}
+			else {
+				regex = Normalize__ent;
+			}
+
+			// keep looping until invalid regex
+			int ent_end = ent_bgn;
+			int b = Byte_ascii.Null;
+			for (int i = ent_bgn; i < src_end; i++) {
+				b = src[i] & 0xFF; // PATCH.JAVA:need to convert to unsigned byte
+				if (regex[b])
+					ent_end++;
+				else
+					break;
+			}
+
+			// mark dirty; can optimize later by checking if "&lt;" already exists
+			dirty = true;
+			if (bfr == null) bfr = Bry_bfr_.New();
+			bfr.Add_mid(src, cur, find_bgn); // add everything before &
+
+			// invalid <- regex ended, but not at semic
+			if (b != Byte_ascii.Semic) {
+				cbk.When_amp(bfr);
+				cur = find_bgn + 1;                 // position after "&"
+				continue;
+			}
+
+			// do normalization
+			byte[] name = Bry_.Mid(src, ent_bgn, ent_end);
+			boolean ret = false;
+			if      (regex == Normalize__ent) {
+				cbk.When_ent(bfr, name);
+				ret = true;
+			}
+			else if (regex == Normalize__dec) {
+				ret = cbk.When_dec(bfr, name);
+			}
+			else if (regex == Normalize__hex) {
+				ret = cbk.When_hex(bfr, name);
+			}
+			if (!ret) {
+				// callback rejected the reference: treat the "&" as a stray one and rescan what follows it
+				cbk.When_amp(bfr);
+				cur = find_bgn + 1;                 // position after "&"
+				continue;
+			}
+
+			cur = ent_end + 1;	// +1 to position after ";"
+		}
+
+		// XO.BRY_BFR
+		if (dirty) {
+			if (called_by_bry)
+				return bfr.To_bry_and_clear();
+			else
+				return Bry_.Empty;
+		}
+		else {
+			if (called_by_bry) {
+				if (src_bgn == 0 && src_end == src.length)
+					return src;
+				else
+					return Bry_.Mid(src, src_bgn, src_end);
+			}
+			else {
+				if (lone_bfr)
+					bfr.Add_mid(src, src_bgn, src_end);
+				return null;
+			}
+		}
+	}
+}
--- a/400_xowa/src/gplx/xowa/mws/Xomw_sanitizer__tst.java
+++ b/400_xowa/src/gplx/xowa/mws/Xomw_sanitizer__tst.java
@ -19,19 +19,94 @@ package gplx.xowa.mws; import gplx.*; import gplx.xowa.*;
 import org.junit.*; import gplx.core.tests.*; import gplx.core.btries.*;
 public class Xomw_sanitizer__tst {
 	private final    Xomw_sanitizer__fxt fxt = new Xomw_sanitizer__fxt();
-	@Test   public void Text()                  {fxt.Test__normalize_char_references("abc"                      , "abc");}
-	@Test   public void Dec()                   {fxt.Test__normalize_char_references("&#08;"                    , "&amp;#08;");}
-	@Test   public void Dec__invalid()          {fxt.Test__normalize_char_references("&#09;"                    , "&#9;");}
-	@Test   public void Hex()                   {fxt.Test__normalize_char_references("&#xFF;"                   , "&#xff;");}
-	@Test   public void Entity()                {fxt.Test__normalize_char_references("&alpha;"                  , "&#945;");}
-	@Test   public void Entity__lt()            {fxt.Test__normalize_char_references("&lt;"                     , "&lt;");}
-	@Test   public void Invalid()               {fxt.Test__normalize_char_references("&(invalid);"              , "&amp;(invalid);");}
-	@Test   public void Many() {
+	@Test   public void Normalize__text()                  {fxt.Test__normalize_char_references("abc"                      , "abc");}
+	@Test   public void Normalize__dec()                   {fxt.Test__normalize_char_references("&#08;"                    , "&amp;#08;");}
+	@Test   public void Normalize__dec__invalid()          {fxt.Test__normalize_char_references("&#09;"                    , "&#9;");}
+	@Test   public void Normalize__hex()                   {fxt.Test__normalize_char_references("&#xFF;"                   , "&#xff;");}
+	@Test   public void Normalize__entity()                {fxt.Test__normalize_char_references("&alpha;"                  , "&#945;");}
+	@Test   public void Normalize__entity__lt()            {fxt.Test__normalize_char_references("&lt;"                     , "&lt;");}
+	@Test   public void Normalize__entity__alias()         {fxt.Test__normalize_char_references("&רלמ;"                    , "&rlm;");}
+	@Test   public void Normalize__amp()                   {fxt.Test__normalize_char_references("a&b"                      , "a&amp;b");}
+	@Test   public void Normalize__invalid()               {fxt.Test__normalize_char_references("&(invalid);"              , "&amp;(invalid);");}
+	@Test   public void Normalize__many() {
 		fxt.Test__normalize_char_references
 		( "a &#09; b &alpha; c &#xFF; d &(invalid); e"
 		, "a &#9; b &#945; c &#xff; d &amp;(invalid); e"
 		);
 	}
+	@Test   public void Regex__domain() { // checks that Xomw_regex_find_domain.Match reports the protocol / host / rest spans of a url
+		Xomw_regex_find_domain regex_domain = new Xomw_regex_find_domain();
+		// normal: protocol, host and path are all present
+		fxt.Test__regex_domain_y(regex_domain, "https://a.org/bcd", "https:", "//a.org", "/bcd");
+		// trailing slash: rest is just "/"
+		fxt.Test__regex_domain_y(regex_domain, "https://a.org/", "https:", "//a.org", "/");
+		// domain only: rest is empty
+		fxt.Test__regex_domain_y(regex_domain, "https://a.org", "https:", "//a.org", "");
+		// colon not found
+		fxt.Test__regex_domain_n(regex_domain, "https//a.org/bcd");
+		// host_bgn.eos: src ends right after the colon
+		fxt.Test__regex_domain_n(regex_domain, "https:");
+		// host_bgn.//: "//" does not directly follow the colon
+		fxt.Test__regex_domain_n(regex_domain, "https:a//");
+		// host_bgn.///: three slashes follow the colon
+		fxt.Test__regex_domain_n(regex_domain, "https:///a.org/b");
+	}
+	@Test   public void Regex__clean_url() { // checks Xomw_regex_escape_invalid.Escape: returned flag plus the escaped bytes written to the buffer
+		Xomw_regex_escape_invalid regex = new Xomw_regex_escape_invalid();
+		// noop: nothing to escape, so the flag is N and nothing is written
+		fxt.Test__regex_escape_invalid(regex, "https://a.org/bcd", Bool_.N, "");
+		// symbols; the last input char is DEL (0x7F), spelled as an escape so the invisible byte cannot be lost by editors / viewers
+		fxt.Test__regex_escape_invalid(regex, "[]<>\"|\u007F", Bool_.Y, "%5B%5D%3C%3E%22%7C%7F");
+		// range: 00 - 32
+		fxt.Test__regex_escape_invalid(regex, "\t\n ", Bool_.Y, "%09%0A+");
+	}
+	@Test   public void Regex__ipv6_brack() { // table-driven: same cases and same order as a flat list of calls
+		Xomw_regex_ipv6_brack ipv6_regex = new Xomw_regex_ipv6_brack();
+
+		// inputs that must match
+		String[] matching = new String[]
+		{ "//%5B0a.1b:12%5D:123"   // basic
+		, "//%5Ba%5D"              // port: none
+		, "//%5Ba%5D:1:2:3"        // port: multiple
+		};
+		for (int idx = 0; idx < matching.length; idx++)
+			fxt.Test__regex_ipv6_brack(ipv6_regex, Bool_.Y, matching[idx]);
+
+		// inputs that must not match
+		String[] failing = new String[]
+		{ "abc"                    // "//%5B" missing
+		, "//%5Ba!%5D:1"           // ipv6: invalid
+		, "//%5B%5D:1"             // ipv6: 0-len
+		, "//%5Ba%5D:a"            // port: invalid
+		, "//%5Ba%5D:"             // port: 0-len
+		};
+		for (int idx = 0; idx < failing.length; idx++)
+			fxt.Test__regex_ipv6_brack(ipv6_regex, Bool_.N, failing[idx]);
+	}
+	@Test   public void Decode() {
+		// dec
+		fxt.Test__decode_char_references("&#33;"           , "!");
+		// hex
+		fxt.Test__decode_char_references("&#x23;"          , "#");
+		// entity
+		fxt.Test__decode_char_references("&alpha;"         , "α");
+		// entity:lt
+		fxt.Test__decode_char_references("&lt;"            , "<");
+		// entity:rlm
+		fxt.Test__decode_char_references("&רלמ;"           , "‏");
+		// entity:invalid
+		fxt.Test__decode_char_references("&invalid;"       , "&invalid;");
+		// amp
+		fxt.Test__decode_char_references("a&b"             , "a&b");
+	}
+	@Test   public void Clean_url() {
+		// entity
+		fxt.Test__clean_url("http://a.org/b&amp;c"           , "http://a.org/b&c");
+		// entity: escape
+		fxt.Test__clean_url("http://a.org/b&quot;c"          , "http://a.org/b%22c");
+		// domain=n; make sure &quot; is changed, but not soft-hyphen
+		fxt.Test__clean_url("a&quot;z"                      , "a%22z");
+		// host: invalid idn
+		fxt.Test__clean_url("http://a᠆b.org/c᠆d"              , "http://ab.org/c᠆d");
+		// ipv6_brack
+		fxt.Test__clean_url("http://[0a.1b:12]:123/cd"       , "http://[0a.1b:12]:123/cd");
+	}
 }
 class Xomw_sanitizer__fxt {
 	private final    Xomw_sanitizer sanitizer = new Xomw_sanitizer();
@ -41,4 +116,33 @@ class Xomw_sanitizer__fxt {
 		sanitizer.Normalize_char_references(tmp, Bool_.Y, src_bry, 0, src_bry.length);
 		Gftest.Eq__str(expd, tmp.To_str_and_clear());
 	}
+	public void Test__regex_domain_y(Xomw_regex_find_domain regex_domain, String src_str, String expd_prot, String expd_host, String expd_rest) {
+		// the match itself must succeed; src_str is handed to Eq__bool as its third arg
+		byte[] url = Bry_.new_u8(src_str);
+		boolean matched = regex_domain.Match(url, 0, url.length);
+		Gftest.Eq__bool(true, matched, src_str);
+
+		// slice the url with the positions left on the regex and compare each part
+		byte[] actl_prot = Bry_.Mid(url, regex_domain.prot_bgn, regex_domain.prot_end);
+		Gftest.Eq__str(expd_prot, actl_prot);
+		byte[] actl_host = Bry_.Mid(url, regex_domain.host_bgn, regex_domain.host_end);
+		Gftest.Eq__str(expd_host, actl_host);
+		byte[] actl_rest = Bry_.Mid(url, regex_domain.rest_bgn, regex_domain.rest_end);
+		Gftest.Eq__str(expd_rest, actl_rest);
+	}
+	public void Test__regex_domain_n(Xomw_regex_find_domain regex_domain, String src_str) {
+		// the match must fail over the whole of src_str
+		byte[] url = Bry_.new_u8(src_str);
+		boolean matched = regex_domain.Match(url, 0, url.length);
+		Gftest.Eq__bool(false, matched, src_str);
+	}
+	public void Test__regex_escape_invalid(Xomw_regex_escape_invalid regex, String src_str, boolean expd_rslt, String expd_str) {
+		// run the escape into the shared tmp buffer and check the returned flag first
+		byte[] raw = Bry_.new_u8(src_str);
+		boolean actl_rslt = regex.Escape(tmp, raw, 0, raw.length);
+		Gftest.Eq__bool(expd_rslt, actl_rslt);
+
+		// then drain the buffer and check what was written
+		byte[] actl_bry = tmp.To_bry_and_clear();
+		Gftest.Eq__str(expd_str, actl_bry);
+	}
+	public void Test__regex_ipv6_brack(Xomw_regex_ipv6_brack regex, boolean expd_rslt, String src_str) {
+		// match over the whole of src_str and compare against the expected outcome
+		byte[] raw = Bry_.new_u8(src_str);
+		boolean actl_rslt = regex.Match(raw, 0, raw.length);
+		Gftest.Eq__bool(expd_rslt, actl_rslt);
+	}
+	public void Test__decode_char_references(String src_str, String expd) {
+		// decode into the shared tmp buffer, then drain it for the comparison
+		byte[] raw = Bry_.new_u8(src_str);
+		int raw_len = raw.length;
+		sanitizer.Decode_char_references(tmp, Bool_.Y, raw, 0, raw_len);
+		Gftest.Eq__str(expd, tmp.To_str_and_clear());
+	}
+	public void Test__clean_url(String src_str, String expd) {
+		// Clean_url takes the url as utf8 bytes; its result is compared directly against expd
+		byte[] raw_url = Bry_.new_u8(src_str);
+		Gftest.Eq__str(expd, sanitizer.Clean_url(raw_url));
+	}
 }
--- a/400_xowa/src/gplx/xowa/mws/parsers/Xomw_parser.java
+++ b/400_xowa/src/gplx/xowa/mws/parsers/Xomw_parser.java
@ -29,7 +29,7 @@ public class Xomw_parser {
 	private final    Xomw_nbsp_wkr nbsp_wkr = new Xomw_nbsp_wkr();
 	private final    Xomw_block_level_pass block_wkr = new Xomw_block_level_pass();
 	private final    Xomw_heading_wkr heading_wkr = new Xomw_heading_wkr();
-	private final    Xomw_magiclinks_wkr magiclinks_wkr = new Xomw_magiclinks_wkr();
+	private final    Xomw_magiclinks_wkr magiclinks_wkr;
 	private final    Xomw_doubleunder_wkr doubleunder_wkr = new Xomw_doubleunder_wkr();
 	private final    Xomw_link_renderer link_renderer = new Xomw_link_renderer();
 	private final    Xomw_link_holders holders;
@ -50,13 +50,6 @@ public class Xomw_parser {
 	public Xomw_lnki_wkr           Lnki_wkr()        {return lnki_wkr;}       private final    Xomw_lnki_wkr lnki_wkr;
 	public boolean                 Output_type__wiki() {return output_type__wiki;} private final    boolean output_type__wiki = false;
 	public Xomw_parser() {
-		this.protocols_trie = Xomw_parser.Protocols__dflt();
-		this.holders = new Xomw_link_holders(link_renderer, tmp);
-		this.table_wkr = new Xomw_table_wkr(this);
-		this.quote_wkr = new Xomw_quote_wkr(this);
-		this.lnke_wkr = new Xomw_lnke_wkr(this);
-		this.lnki_wkr = new Xomw_lnki_wkr(this, holders, link_renderer, protocols_trie);
-		this.heading_wkr_cbk = new Xomw_heading_cbk__html();
 		if (regex_space == null) {
 			synchronized (Type_adp_.ClassOf_obj(this)) {
 				regex_space = new Xomw_regex_space();
@ -64,13 +57,22 @@ public class Xomw_parser {
 				regex_url = new Xomw_regex_url(regex_space);
 			}
 		}
+
+		this.protocols_trie = Xomw_parser.Protocols__dflt();
+		this.holders = new Xomw_link_holders(link_renderer, tmp);
+		this.table_wkr = new Xomw_table_wkr(this);
+		this.quote_wkr = new Xomw_quote_wkr(this);
+		this.lnke_wkr = new Xomw_lnke_wkr(this);
+		this.lnki_wkr = new Xomw_lnki_wkr(this, holders, link_renderer, protocols_trie);
+		this.heading_wkr_cbk = new Xomw_heading_cbk__html();
+		this.magiclinks_wkr = new Xomw_magiclinks_wkr(sanitizer, linker, regex_boundary, regex_url);
 	}
 	public void Init_by_wiki(Xowe_wiki wiki) {
 		linker.Init_by_wiki(wiki.Lang().Lnki_trail_mgr().Trie());
 		lnke_wkr.Init_by_wiki(protocols_trie, regex_url, regex_space);
 		lnki_wkr.Init_by_wiki(wiki);
-		magiclinks_wkr.Init_by_wiki(linker, regex_boundary, regex_url);
 		doubleunder_wkr.Init_by_wiki(doubleunder_data, wiki.Lang());
+		magiclinks_wkr.Init_by_wiki(); // no args: linker / regex_boundary / regex_url are now passed to the wkr's constructor
 	}
 	public void Init_by_page(Xoa_ttl ttl) {
 		pctx.Init_by_page(ttl);
@ -115,7 +117,7 @@ public class Xomw_parser {
 		table_wkr.Do_table_stuff(pctx, pbfr);
 		hr_wkr.Replace_hrs(pctx, pbfr);

-		doubleunder_wkr.Do_double_underscore(pctx, pbfr);
+		doubleunder_wkr.Do_double_underscore(pctx, pbfr);   // DONE: DATE:2017-01-27

 		heading_wkr.Do_headings(pctx, pbfr, heading_wkr_cbk);
 		lnki_wkr.Replace_internal_links(pctx, pbfr);
--- a/400_xowa/src/gplx/xowa/mws/parsers/Xomw_regex_parser.java
+++ b/400_xowa/src/gplx/xowa/mws/parsers/Xomw_regex_parser.java
@ -0,0 +1,101 @@
+/*
+XOWA: the XOWA Offline Wiki Application
+Copyright (C) 2012 gnosygnu@gmail.com
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
+// Builds a list of byte sequences from regex-like String items.
+// Items are ascii text plus two escapes: "\s" (a space) and "\xHH" (one raw byte given as two hex digits).
+// Results accumulate across Add_ary / Add_rng calls and are read back through Rslt.
+public class Xomw_regex_parser {
+	// scratch buffer; created lazily on the first parse and reused by Compile_itm
+	private Bry_bfr tmp;
+	// everything added so far; null until the first Add_ary / Add_rng call
+	public byte[][] Rslt() {return rslt;} private byte[][] rslt;
+	// compiles each item of ary and adds the compiled items to Rslt; returns this for chaining
+	public Xomw_regex_parser Add_ary(String... ary) {return Set_or_add(Parse_ary(ary));}
+	private byte[][] Parse_ary(String... ary) {
+		if (tmp == null) tmp = Bry_bfr_.New();
+		int ary_len = ary.length;
+		byte[][] rv = new byte[ary_len][];
+		for (int i = 0; i < ary_len; i++) {
+			rv[i] = Compile_itm(tmp, Bry_.new_u8(ary[i]));
+		}
+		return rv;
+	}
+	// adds one item per code point from bgn to end (both inclusive); bgn and end may use the same escapes as Add_ary
+	public Xomw_regex_parser Add_rng(String bgn, String end) {return Set_or_add(Parse_rng(bgn, end));}
+	private byte[][] Parse_rng(String bgn, String end) {
+		if (tmp == null) tmp = Bry_bfr_.New();
+		byte[] bgn_bry = Compile_itm(tmp, Bry_.new_u8(bgn));
+		int bgn_val = gplx.core.intls.Utf16_.Decode_to_int(bgn_bry, 0);
+		byte[] end_bry = Compile_itm(tmp, Bry_.new_u8(end));
+		int end_val = gplx.core.intls.Utf16_.Decode_to_int(end_bry, 0);
+
+		// fail with a descriptive error instead of a NegativeArraySizeException when the range is reversed
+		int rv_len = end_val - bgn_val + 1;
+		if (rv_len < 1) throw Err_.new_wo_type("regex range failed: end is before bgn", "bgn", bgn, "end", end);
+
+		byte[][] rv = new byte[rv_len][];
+		for (int i = 0; i < rv_len; i++) {
+			rv[i] = gplx.core.intls.Utf16_.Encode_int_to_bry(i + bgn_val);
+		}
+		return rv;
+	}
+	// first call stores val as-is; later calls combine it with the existing result via Bry_.Ary_add
+	private Xomw_regex_parser Set_or_add(byte[][] val) {
+		rslt = rslt == null ? val : Bry_.Ary_add(rslt, val);
+		return this;
+	}
+	// Compiles one item to its byte sequence.
+	// Returns src itself when it holds no escapes; else a new array with every escape replaced.
+	// Throws Err for a dangling backslash, an unknown escape, a truncated / non-hex "\x" escape, or a non-ascii byte.
+	private static byte[] Compile_itm(Bry_bfr tmp, byte[] src) {
+		int src_end = src.length;
+		int cur = 0;
+		int prv = cur;          // start of the literal bytes not yet copied to tmp
+		boolean dirty = false;  // true once an escape forced output into tmp
+		while (true) {
+			// eos: flush trailing literal bytes, but only if tmp is being used at all
+			if (cur == src_end) {
+				if (dirty)
+					tmp.Add_mid(src, prv, src_end);
+				break;
+			}
+
+			// look at byte
+			byte b = src[cur];
+			switch (b) {	// escape
+				case Byte_ascii.Backslash:
+					int nxt = cur + 1;
+					if (nxt >= src_end) throw Err_.new_wo_type("regex escape failed: no more chars left", "src", src, "pos", nxt);
+					byte nxt_byte = src[nxt];
+					switch (nxt_byte) {
+						case Byte_ascii.Ltr_s: // \s -> " "
+							tmp.Add_mid(src, prv, cur);	// keep literal bytes that precede the escape
+							tmp.Add_byte(Byte_ascii.Space);
+							dirty = true;
+							cur = nxt + 1;
+							prv = cur;
+							break;
+						case Byte_ascii.Ltr_x: // \xHH -> one raw byte given as two hex digits; EX: "\xc2\xad" -> new byte[] {194, 173}
+							// read next two bytes; validate before writing anything to tmp
+							nxt++;
+							if (nxt + 2 > src_end) throw Err_.new_wo_type("utf8 escape failed: no more chars left", "src", src, "pos", nxt);
+							int hex_val = gplx.core.encoders.Hex_utl_.Parse_or(src, nxt, nxt + 2, -1);
+							if (hex_val == -1) throw Err_.new_wo_type("utf8 escape failed: invalid hex", "src", src, "pos", nxt);
+							tmp.Add_mid(src, prv, cur);	// keep literal bytes that precede the escape
+							tmp.Add_byte((byte)hex_val);
+							dirty = true;
+							cur = nxt + 2;
+							prv = cur;
+							break;
+						default:
+							throw Err_.new_wo_type("regex escape failed: unknown char", "src", src, "pos", nxt);
+					}
+					break;
+				default: // handles ascii only
+					// PATCH.JAVA: bytes are signed, so non-ascii bytes (0x80 - 0xFF) are negative; "b > 127" can never be true
+					if (b < 0)
+						throw Err_.new_wo_type("regex compiled failed: unknown char", "src", src, "pos", cur);
+					cur++;
+					break;
+			}
+		}
+
+		// set item
+		return dirty ? tmp.To_bry_and_clear() : src;
+	}
+}
--- a/400_xowa/src/gplx/xowa/mws/parsers/Xomw_regex_parser__tst.java
+++ b/400_xowa/src/gplx/xowa/mws/parsers/Xomw_regex_parser__tst.java
@ -0,0 +1,42 @@
+/*
+XOWA: the XOWA Offline Wiki Application
+Copyright (C) 2012 gnosygnu@gmail.com
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
+import org.junit.*; import gplx.core.tests.*;
+// Tests for Xomw_regex_parser: escape handling in Add_ary and range expansion in Add_rng.
+public class Xomw_regex_parser__tst {
+	private final    Xomw_regex_parser__fxt fxt = new Xomw_regex_parser__fxt();
+	// "\s" compiles to a single space
+	@Test   public void Ary__space() {
+		String[] raw = String_.Ary("\\s");
+		String[] expd = String_.Ary(" ");
+		fxt.Test__parse_ary(raw, expd);
+	}
+	// runs of "\xHH" escapes compile to the utf8 bytes of one char each
+	@Test   public void Ary__utf8() {
+		String[] raw = String_.Ary("\\xc2\\xa7", "\\xe0\\xb9\\x90");
+		String[] expd = String_.Ary("§", "๐");
+		fxt.Test__parse_ary(raw, expd);
+	}
+	// an ascii range expands to every char from bgn to end, both inclusive
+	@Test   public void Rng__ascii() {
+		String[] expd = String_.Ary("a", "b", "c");
+		fxt.Test__parse_rng("a", "c", expd);
+	}
+}
+// Test fixture: feeds items to one Xomw_regex_parser and compares its accumulated Rslt against expd.
+// NOTE: the parser accumulates results, so each fixture instance should make one Test__ call.
+class Xomw_regex_parser__fxt {
+	private final    Xomw_regex_parser parser = new Xomw_regex_parser();
+	// compiles ary through Add_ary and checks the result
+	public void Test__parse_ary(String[] ary, String[] expd) {
+		parser.Add_ary(ary);
+		Gftest.Eq__ary(expd, String_.Ary(parser.Rslt()));
+	}
+	// expands the range bgn..end through Add_rng and checks the result
+	public void Test__parse_rng(String bgn, String end, String[] expd) {
+		parser.Add_rng(bgn, end);	// use the args; previously hard-coded to "a", "c"
+		Gftest.Eq__ary(expd, String_.Ary(parser.Rslt()));
+	}
+}
--- a/400_xowa/src/gplx/xowa/mws/parsers/Xomw_regex_url.java
+++ b/400_xowa/src/gplx/xowa/mws/parsers/Xomw_regex_url.java
@ -20,6 +20,7 @@ import gplx.core.btries.*;
 public class Xomw_regex_url {
 	private final    Btrie_slim_mgr trie;
 	public Xomw_regex_url(Xomw_regex_space regex_space) {
+		//       [^][<>"\\x00-\\x20\\x7F\|]
 		// REGEX:[^][<>"\\x00-\\x20\\x7F\p{Zs}]; NOTE: val is just a marker
 		this.trie = Btrie_slim_mgr.cs();
 		trie.Add_str_byte__many(Byte_.Zero, "[", "]", "<", ">", "\"");
--- a/400_xowa/src/gplx/xowa/mws/parsers/magiclinks/Xomw_magiclinks_wkr.java
+++ b/400_xowa/src/gplx/xowa/mws/parsers/magiclinks/Xomw_magiclinks_wkr.java
@ -19,28 +19,40 @@ package gplx.xowa.mws.parsers.magiclinks; import gplx.*; import gplx.xowa.*; imp
 import gplx.core.primitives.*; import gplx.core.btries.*; import gplx.core.net.*;
 import gplx.langs.phps.utls.*; import gplx.xowa.mws.htmls.*;
 import gplx.langs.regxs.*;
+// TODO.XO: getExternalLinkAttribs($url)
+// TODO.XO: this->getConverterLanguage()->markNoConversion($url, true),
 public class Xomw_magiclinks_wkr {
 	private final    Btrie_slim_mgr regex_trie = Btrie_slim_mgr.ci_a7(); // NOTE: must be ci to handle protocols; EX: "https:" and "HTTPS:"
 	private final    Btrie_rv trv = new Btrie_rv();
 	private static byte[] Tag__anch__rhs;
 	private boolean[] url_separators;
 	private static Xomw_regex_link_interrupt regex_link_interrupt;
-	private Xomw_regex_boundary regex_boundary;
-	private Xomw_regex_url regex_url;
-	private Xomw_linker linker;
+	private final    Xomw_regex_boundary regex_boundary;
+	private final    Xomw_regex_url regex_url;
+	private final    Xomw_sanitizer sanitizer;
+	private final    Xomw_linker linker;
 	private byte[] page_title;

 	private static final byte Regex__anch = 1, Regex__elem = 2, Regex__free = 3;
-	public Xomw_magiclinks_wkr() {
+	// Stores the injected collaborators, builds the url-separator lookup, and lazily creates the statics shared by all instances.
+	public Xomw_magiclinks_wkr(Xomw_sanitizer sanitizer, Xomw_linker linker, Xomw_regex_boundary regex_boundary, Xomw_regex_url regex_url) {
+		this.sanitizer = sanitizer;
+		this.linker = linker;
+		this.regex_boundary = regex_boundary;
+		this.regex_url = regex_url;
+
 		// ',;\.:!?'
 		url_separators = Bool_ary_bldr.New_u8()
 			.Set_many(Byte_ascii.Comma,Byte_ascii.Semic, Byte_ascii.Dot, Byte_ascii.Colon, Byte_ascii.Bang, Byte_ascii.Question)
 			.To_ary();
+
+		// Tag__anch__rhs doubles as the "statics are ready" guard
+		if (Tag__anch__rhs == null) {
+			synchronized (Type_adp_.ClassOf_obj(this)) {
+				// re-check under the lock so a thread that waited does not rebuild the statics
+				if (Tag__anch__rhs == null) {
+					// assign the guard last; otherwise another thread can see it set while regex_link_interrupt is still null
+					// NOTE(review): the fields are not volatile, so this narrows the race rather than fully closing it
+					regex_link_interrupt = new Xomw_regex_link_interrupt();
+					Tag__anch__rhs = Bry_.new_a7("</a>");
+				}
+			}
+		}
 	}
-	public void Init_by_wiki(Xomw_linker linker, Xomw_regex_boundary regex_boundary, Xomw_regex_url regex_url) {
-		this.linker = linker;
-		this.regex_boundary = regex_boundary;
-		this.regex_url = regex_url;
+	public void Init_by_wiki() {
 		regex_trie.Add_str_byte("<a", Regex__anch);
 		regex_trie.Add_str_byte("<" , Regex__elem);
 		
@ -50,13 +62,6 @@ public class Xomw_magiclinks_wkr {
 			Gfo_protocol_itm itm = protocol_ary[i];
 			regex_trie.Add_bry_byte(itm.Text_bry(), Regex__free);
 		}
-
-		if (Tag__anch__rhs == null) {
-			synchronized (Type_adp_.ClassOf_obj(this)) {
-				Tag__anch__rhs = Bry_.new_a7("</a>");
-				regex_link_interrupt = new Xomw_regex_link_interrupt();
-			}
-		}
 	}

 	// Replace special strings like "ISBN xxx" and "RFC xxx" with
@ -247,7 +252,7 @@ public class Xomw_magiclinks_wkr {
 			return;
 		}

-//			$url = Sanitizer::cleanUrl($url);
+		url = sanitizer.Clean_url(url);

 		// XO.MW.UNSUPPORTED.NON-WMF: not supporting images from freefrom url; (EX: "http://a.org/image.png" -> "<img>"); haven't seen this used on WMF wikis
 		// Is this an external image?			
--- a/400_xowa/src/gplx/xowa/mws/parsers/magiclinks/Xomw_magiclinks_wkr__tst.java
+++ b/400_xowa/src/gplx/xowa/mws/parsers/magiclinks/Xomw_magiclinks_wkr__tst.java
@ -34,10 +34,8 @@ public class Xomw_magiclinks_wkr__tst {
 		fxt.Test__parse("a https://&lt; z"       , "a https://&lt; z");
 	}
 	@Test   public void Interrupt__hex_dec() {// implementation specific test for mixed hex / dec
-		// hex-dec
-		fxt.Test__parse("a https://b.org&#x60;z" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;#x60;z'>https://b.org&amp;#x60;z</a>");
 		// dec-hex
-		fxt.Test__parse("a https://b.org&#3c;z"  , "a <a class='external free' rel='nofollow' href='https://b.org&amp;#3c;z'>https://b.org&amp;#3c;z</a>");
+		fxt.Test__parse("a https://b.org&#3c;z"      , "a <a class='external free' rel='nofollow' href='https://b.org&amp;#3c;z'>https://b.org&amp;#3c;z</a>"); // "&#3c;" stays in the url with "&" escaped to "&amp;"
 	}
 	@Test   public void Separator() {
 		// basic; ,;.:!?
@ -50,10 +48,10 @@ public class Xomw_magiclinks_wkr__tst {
 		fxt.Test__parse("a https://b.org;.:!? z"     , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>;.:!? z");
 		// ";" included b/c of ent
 		fxt.Test__parse("a https://b.org&abc;.:!? z" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;abc;'>https://b.org&amp;abc;</a>.:!? z");
-		// ";" included b/c of hex
-		fxt.Test__parse("a https://b.org&#xB1;.:!? z", "a <a class='external free' rel='nofollow' href='https://b.org&amp;#xB1;'>https://b.org&amp;#xB1;</a>.:!? z");
-		// ";" included b/c of dec
-		fxt.Test__parse("a https://b.org&#123;.:!? z", "a <a class='external free' rel='nofollow' href='https://b.org&amp;#123;'>https://b.org&amp;#123;</a>.:!? z");
+		// ";" included b/c of hex; note that Clean_url changes "&#xB1;" to "±"
+		fxt.Test__parse("a https://b.org&#xB1;.:!? z", "a <a class='external free' rel='nofollow' href='https://b.org±'>https://b.org±</a>.:!? z");
+		// ";" included b/c of dec; note that Clean_url changes "&#123;" to "{"
+		fxt.Test__parse("a https://b.org&#123;.:!? z", "a <a class='external free' rel='nofollow' href='https://b.org{'>https://b.org{</a>.:!? z");
 		// ";" excluded b/c of invalid.ent
 		fxt.Test__parse("a https://b.org&a1b;.:!? z" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;a1b'>https://b.org&amp;a1b</a>;.:!? z");
 		// ";" excluded b/c of invalid.hex
@ -63,9 +61,13 @@ public class Xomw_magiclinks_wkr__tst {
 		// num_post_proto rule
 		fxt.Test__parse("a https://.:!? z"           , "a https://.:!? z");
 	}
+	@Test   public void Clean_url() {
+		// basic
+		fxt.Test__parse("http://a᠆b.org/c᠆d"          , "<a class='external free' rel='nofollow' href='http://ab.org/c᠆d'>http://ab.org/c᠆d</a>");
+	}
 }
 class Xomw_magiclinks_wkr__fxt {
-	private final    Xomw_magiclinks_wkr wkr = new Xomw_magiclinks_wkr();
+	private final    Xomw_magiclinks_wkr wkr;
 	private final    Xomw_parser_ctx pctx = new Xomw_parser_ctx();
 	private final    Xomw_parser_bfr pbfr = new Xomw_parser_bfr();
 	public Xomw_magiclinks_wkr__fxt() {
@ -74,7 +76,8 @@ class Xomw_magiclinks_wkr__fxt {

 		Xomw_regex_space regex_space = new Xomw_regex_space();
 		pctx.Init_by_page(wiki.Ttl_parse(Bry_.new_a7("Page_1")));
-		wkr.Init_by_wiki(new Xomw_linker(), new Xomw_regex_boundary(regex_space), new Xomw_regex_url(regex_space));
+		this.wkr = new Xomw_magiclinks_wkr(new Xomw_sanitizer(), new Xomw_linker(), new Xomw_regex_boundary(regex_space), new Xomw_regex_url(regex_space));
+		wkr.Init_by_wiki();
 	}
 	public void Test__parse(String src_str, String expd) {Test__parse(Bool_.Y, src_str, expd);}
 	public void Test__parse(boolean apos, String src_str, String expd) {
--- a/400_xowa/src/gplx/xowa/mws/utls/Xomw_ttl_utl.java
+++ b/400_xowa/src/gplx/xowa/mws/utls/Xomw_ttl_utl.java
@ -60,13 +60,13 @@ public class Xomw_ttl_utl {
 			if (cur == src_end) break;
 			byte b = src[cur];
 			int b_len = gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(b);
-			if (b_len == 1) {   // ASCII
-				if (valid[b])	// valid; EX: "a0A B&$"
+			if (b_len == 1) {         // ASCII
+				if (valid[b & 0xFF])  // valid; EX: "a0A B&$"; PATCH.JAVA:need to convert to unsigned byte
 					cur++;
-				else            // invalid; EX: "<title>"
+				else                  // invalid; EX: "<title>"
 					break;
 			}
-			else {              // Multi-byte UTF8; NOTE: all sequences are valid
+			else {                    // Multi-byte UTF8; NOTE: all sequences are valid
 				cur += b_len;
 			}
 		}