Xomw: Convert Sanitizer, StringUtils; also, support stripAllTags

2025-06-13 12:54:14 +00:00 · 2017-02-23 09:08:03 -05:00 · 2017-02-23 09:08:03 -05:00 · 31fcfaf1bd
commit 31fcfaf1bd
parent 09dbfc894e
13 changed files with 2082 additions and 238 deletions
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/XomwSanitizer.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/XomwSanitizer.java
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/XomwSanitizerTest.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/XomwSanitizerTest.java
@ -121,13 +121,20 @@ public class XomwSanitizerTest {
 		// cls: ws
 		fxt.Test__merge_attributes(src_atrs.Clear().Add_many(cls, "  v1   v2  "), trg_atrs.Clear().Add_many(cls, "  v3   v4   "), expd_atrs.Clear().Add_many(cls, "v1 v2 v3 v4"));
 	}
+
+	/** CRLF, CR, LF and tab between "a" and "b" are each expected to come back as "a b". */
+	@Test   public void normalizeWhitespace() {
+		String[] srcs = {"a\r\nb", "a\rb", "a\nb", "a\tb"};
+		for (String src : srcs) {
+			fxt.Test_normalizeWhitespace(src, "a b");
+		}
+	}
 }
 class XomwSanitizerFxt {
 	private final    XomwSanitizer sanitizer = new XomwSanitizer();
 	private final    Bry_bfr tmp = Bry_bfr_.New();
 	public void Test__normalize_char_references(String src_str, String expd) {
 		byte[] src_bry = Bry_.new_u8(src_str);
-		sanitizer.Normalize_char_references(tmp, Bool_.Y, src_bry, 0, src_bry.length);
+		sanitizer.normalizeCharReferences(tmp, Bool_.Y, src_bry, 0, src_bry.length);
 		Gftest.Eq__str(expd, tmp.To_str_and_clear());
 	}
 	public void Test__regex_domain_y(Xomw_regex_find_domain regex_domain, String src_str, String expd_prot, String expd_host, String expd_rest) {
@ -152,15 +159,18 @@ class XomwSanitizerFxt {
 	}
 	public void Test__decode_char_references(String src_str, String expd) {
 		byte[] src_bry = Bry_.new_u8(src_str);
-		sanitizer.Decode_char_references(tmp, Bool_.Y, src_bry, 0, src_bry.length);
+		sanitizer.decodeCharReferences(tmp, Bool_.Y, src_bry, 0, src_bry.length);
 		Gftest.Eq__str(expd, tmp.To_str_and_clear());
 	}
 	public void Test__clean_url(String src_str, String expd) {
 		byte[] src_bry = Bry_.new_u8(src_str);
-		Gftest.Eq__str(expd, sanitizer.Clean_url(src_bry));
+		Gftest.Eq__str(expd, sanitizer.cleanUrl(src_bry));
 	}
 	public void Test__merge_attributes(Xomw_atr_mgr src, Xomw_atr_mgr trg, Xomw_atr_mgr expd) {
-		sanitizer.Merge_attributes(src, trg);
+		sanitizer.mergeAttributes(src, trg);
 		Gftest.Eq__ary__lines(expd.To_str(tmp), src.To_str(tmp), "merge_atrs");
 	}
+	/**
+	* Runs sanitizer.normalizeWhitespace on the UTF-8 bytes of src_str and compares the result to expd.
+	* The last argument is the label passed to Gftest.Eq__str; it names this check (it previously read
+	* "merge_atrs", copied from Test__merge_attributes).
+	*/
+	public void Test_normalizeWhitespace(String src_str, String expd) {
+		Gftest.Eq__str(expd, sanitizer.normalizeWhitespace(Bry_.new_u8(src_str)), "normalizeWhitespace");
+	}
 }
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/XomwXml.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/XomwXml.java
@ -53,7 +53,7 @@ public class XomwXml {
 			bfr.Add_byte_space();
 			bfr.Add((byte[])attribs.Get_at(i));
 			bfr.Add_byte_eq().Add_byte_quote();
-			XomwSanitizer.Encode_attribute(bfr, (byte[])attribs.Get_at(i + 1));
+			XomwSanitizer.encodeAttribute(bfr, (byte[])attribs.Get_at(i + 1));
 			bfr.Add_byte_quote();
 		}
 	}
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/XomwStringUtils.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/XomwStringUtils.java
@ -0,0 +1,373 @@
+/*
+XOWA: the XOWA Offline Wiki Application
+Copyright (C) 2012-2017 gnosygnu@gmail.com
+
+XOWA is licensed under the terms of the General Public License (GPL) Version 3,
+or alternatively under the terms of the Apache License Version 2.0.
+
+You may use XOWA according to either of these licenses as is most appropriate
+for your project on a case-by-case basis.
+
+The terms of each license can be found in the source code repository:
+
+GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
+Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
+*/
+package gplx.xowa.mediawiki.includes.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
+import gplx.core.btries.*;
+/**
+* A collection of static methods to play with strings.
+*/
+public class XomwStringUtils {
+//		/**
+//		* Test whether a String is valid UTF-8.
+//		*
+//		* The function check for invalid byte sequences, overlong encoding but
+//		* not for different normalisations.
+//		*
+//		* @note In MediaWiki 1.21, this function did not provide proper UTF-8 validation.
+//		* In particular, the pure PHP code path did not in fact check for overlong forms.
+//		* Beware of this when backporting code to that version of MediaWiki.
+//		*
+//		* @since 1.21
+//		* @param String $value String to check
+//		* @return boolean Whether the given $value is a valid UTF-8 encoded String
+//		*/
+//		static function isUtf8($value) {
+//			$value = (String)$value;
+//
+//			// HHVM 3.4 and older come with an outdated version of libmbfl that
+//			// incorrectly allows values above U+10FFFF, so we have to check
+//			// for them separately. (This issue also exists in PHP 5.3 and
+//			// older, which are no longer supported.)
+//			static $newPHP;
+//			if ($newPHP === null) {
+//				$newPHP = !mb_check_encoding("\xf4\x90\x80\x80", 'UTF-8');
+//			}
+//
+//			return mb_check_encoding($value, 'UTF-8') &&
+//				($newPHP || preg_match("/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value) === 0);
+//		}
+
+	// Token ids registered in delimiter_explode_trie and switched on by delimiterExplode:
+	//   SEP = "|"  (separator), BGN = "-{" (opening delimiter), END = "}-" (closing delimiter)
+	private static final byte DELIMITER_EXPLODE__SEP = 0, DELIMITER_EXPLODE__BGN = 1, DELIMITER_EXPLODE__END = 2;
+	// Maps each literal token to its id above.
+	private static final    Btrie_slim_mgr delimiter_explode_trie = Btrie_slim_mgr.cs()
+		.Add_str_byte("|" , DELIMITER_EXPLODE__SEP)
+		.Add_str_byte("-{", DELIMITER_EXPLODE__BGN)
+		.Add_str_byte("}-", DELIMITER_EXPLODE__END)
+		;
+	/**
+	* Split src on "|", but ignore any "|" that sits between the "-{" and "}-"
+	* delimiters, which may nest. All three tokens are literal strings, not
+	* regular expressions.
+	*
+	* XO.MW: NOTE: the PHP function takes $startDelim, $endDelim, $separator, $subject and $nested;
+	* it is only used in two places, so this port hard-codes (a) nested=true; (b) bgn="-{" end="}-" sep="|"
+	*
+	* @param tmp scratch list that collects the pieces; the result is taken from it with To_ary_and_clear
+	*  (presumably expected to be empty on entry; verify against callers)
+	* @param trv trie result holder; its Pos() is read after every token match to continue the scan
+	* @param src bytes to split
+	* @return the pieces of src in order; a final piece (from the last split point to the end of src) is always added
+	*/
+	public static byte[][] delimiterExplode(List_adp tmp, Btrie_rv trv, byte[] src) {
+		// XO.MW.PORTED:entire proc rewritten; see PHP for source
+		int len = src.length;
+		int nesting = 0;	// number of "-{" seen minus number of "}-" seen
+		int piece_bgn = 0;	// start of the piece currently being collected
+		int pos = 0;
+
+		while (pos != len) {
+			Object found = delimiter_explode_trie.Match_at(trv, src, pos, len);
+
+			// no token starts here; step over one ordinary byte
+			if (found == null) {
+				pos++;
+				continue;
+			}
+
+			byte tid = ((gplx.core.primitives.Byte_obj_val)found).Val();
+			if (tid == DELIMITER_EXPLODE__SEP) {
+				// only a separator outside of all "-{ }-" pairs ends a piece
+				if (nesting == 0) {
+					tmp.Add(Bry_.Mid(src, piece_bgn, pos));
+					piece_bgn = pos + 1;	// separator is the single byte "|"
+				}
+			}
+			else if (tid == DELIMITER_EXPLODE__BGN) {
+				nesting++;
+			}
+			else if (tid == DELIMITER_EXPLODE__END) {
+				// not clamped at zero: an unmatched "}-" leaves nesting negative, which also suppresses splitting
+				nesting--;
+			}
+
+			// resume at the position reported by the trie for this match
+			pos = trv.Pos();
+		}
+
+		// end of src: whatever follows the last split point is the final piece
+		tmp.Add(Bry_.Mid(src, piece_bgn, len));
+		return (byte[][])tmp.To_ary_and_clear(byte[].class);
+	}
+
+//		/**
+//		* Perform an operation equivalent to `preg_replace()`
+//		*
+//		* Matches this code:
+//		*
+//		*     preg_replace("!$startDelim(.*?)$endDelim!", $replace, $subject);
+//		*
+//		* ..except that it's worst-case O(N) instead of O(N^2). Compared to delimiterReplace(), this
+//		* implementation is fast but memory-hungry and inflexible. The memory requirements are such
+//		* that I don't recommend using it on anything but guaranteed small chunks of text.
+//		*
+//		* @param String $startDelim
+//		* @param String $endDelim
+//		* @param String $replace
+//		* @param String $subject
+//		* @return String
+//		*/
+//		static function hungryDelimiterReplace($startDelim, $endDelim, $replace, $subject) {
+//			$segments = explode($startDelim, $subject);
+//			$output = array_shift($segments);
+//			foreach ($segments as $s) {
+//				$endDelimPos = strpos($s, $endDelim);
+//				if ($endDelimPos === false) {
+//					$output .= $startDelim . $s;
+//				} else {
+//					$output .= $replace . substr($s, $endDelimPos + strlen($endDelim));
+//				}
+//			}
+//
+//			return $output;
+//		}
+//
+//		/**
+//		* Perform an operation equivalent to `preg_replace_callback()`
+//		*
+//		* Matches this code:
+//		*
+//		*     preg_replace_callback("!$startDelim(.*)$endDelim!s$flags", $callback, $subject);
+//		*
+//		* If the start delimiter ends with an initial substring of the end delimiter,
+//		* e.g. in the case of C-style comments, the behavior differs from the model
+//		* regex. In this implementation, the end must share no characters with the
+//		* start, so e.g. `/*\/` is not considered to be both the start and end of a
+//		* comment. `/*\/xy/*\/` is considered to be a single comment with contents `/xy/`.
+//		*
+//		* The implementation of delimiterReplaceCallback() is slower than hungryDelimiterReplace()
+//		* but uses far less memory. The delimiters are literal strings, not regular expressions.
+//		*
+//		* @param String $startDelim Start delimiter
+//		* @param String $endDelim End delimiter
+//		* @param callable $callback Function to call on each match
+//		* @param String $subject
+//		* @param String $flags Regular expression flags
+//		* @throws InvalidArgumentException
+//		* @return String
+//		*/
+//		static function delimiterReplaceCallback($startDelim, $endDelim, $callback,
+//			$subject, $flags = ''
+//		) {
+//			$inputPos = 0;
+//			$outputPos = 0;
+//			$contentPos = 0;
+//			$output = '';
+//			$foundStart = false;
+//			$encStart = preg_quote($startDelim, '!');
+//			$encEnd = preg_quote($endDelim, '!');
+//			$strcmp = strpos($flags, 'i') === false ? 'strcmp' : 'strcasecmp';
+//			$endLength = strlen($endDelim);
+//			$m = [];
+//
+//			while ($inputPos < strlen($subject) &&
+//				preg_match("!($encStart)|($encEnd)!S$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos)
+//			) {
+//				$tokenOffset = $m[0][1];
+//				if ($m[1][0] != '') {
+//					if ($foundStart &&
+//						$strcmp($endDelim, substr($subject, $tokenOffset, $endLength)) == 0
+//					) {
+//						# An end match is present at the same location
+//						$tokenType = 'end';
+//						$tokenLength = $endLength;
+//					} else {
+//						$tokenType = 'start';
+//						$tokenLength = strlen($m[0][0]);
+//					}
+//				} elseif ($m[2][0] != '') {
+//					$tokenType = 'end';
+//					$tokenLength = strlen($m[0][0]);
+//				} else {
+//					throw new InvalidArgumentException('Invalid delimiter given to ' . __METHOD__);
+//				}
+//
+//				if ($tokenType == 'start') {
+//					# Only move the start position if we haven't already found a start
+//					# This means that START START END matches outer pair
+//					if (!$foundStart) {
+//						# Found start
+//						$inputPos = $tokenOffset + $tokenLength;
+//						# Write out the non-matching section
+//						$output .= substr($subject, $outputPos, $tokenOffset - $outputPos);
+//						$outputPos = $tokenOffset;
+//						$contentPos = $inputPos;
+//						$foundStart = true;
+//					} else {
+//						# Move the input position past the *first character* of START,
+//						# to protect against missing END when it overlaps with START
+//						$inputPos = $tokenOffset + 1;
+//					}
+//				} elseif ($tokenType == 'end') {
+//					if ($foundStart) {
+//						# Found match
+//						$output .= call_user_func($callback, [
+//							substr($subject, $outputPos, $tokenOffset + $tokenLength - $outputPos),
+//							substr($subject, $contentPos, $tokenOffset - $contentPos)
+//						]);
+//						$foundStart = false;
+//					} else {
+//						# Non-matching end, write it out
+//						$output .= substr($subject, $inputPos, $tokenOffset + $tokenLength - $outputPos);
+//					}
+//					$inputPos = $outputPos = $tokenOffset + $tokenLength;
+//				} else {
+//					throw new InvalidArgumentException('Invalid delimiter given to ' . __METHOD__);
+//				}
+//			}
+//			if ($outputPos < strlen($subject)) {
+//				$output .= substr($subject, $outputPos);
+//			}
+//
+//			return $output;
+//		}
+//
+//		/**
+//		* Perform an operation equivalent to `preg_replace()` with flags.
+//		*
+//		* Matches this code:
+//		*
+//		*     preg_replace("!$startDelim(.*)$endDelim!$flags", $replace, $subject);
+//		*
+//		* @param String $startDelim Start delimiter regular expression
+//		* @param String $endDelim End delimiter regular expression
+//		* @param String $replace Replacement String. May contain $1, which will be
+//		*  replaced by the text between the delimiters
+//		* @param String $subject String to search
+//		* @param String $flags Regular expression flags
+//		* @return String The String with the matches replaced
+//		*/
+//		static function delimiterReplace($startDelim, $endDelim, $replace, $subject, $flags = '') {
+//			$replacer = new RegexlikeReplacer($replace);
+//
+//			return self::delimiterReplaceCallback($startDelim, $endDelim,
+//				$replacer->cb(), $subject, $flags);
+//		}
+//
+//		/**
+//		* More or less "markup-safe" explode()
+//		* Ignores any instances of the separator inside `<...>`
+//		* @param String $separator
+//		* @param String $text
+//		* @return array
+//		*/
+//		static function explodeMarkup($separator, $text) {
+//			$placeholder = "\x00";
+//
+//			// Remove placeholder instances
+//			$text = str_replace($placeholder, '', $text);
+//
+//			// Replace instances of the separator inside HTML-like tags with the placeholder
+//			$replacer = new DoubleReplacer($separator, $placeholder);
+//			$cleaned = StringUtils::delimiterReplaceCallback('<', '>', $replacer->cb(), $text);
+//
+//			// Explode, then put the replaced separators back in
+//			$items = explode($separator, $cleaned);
+//			foreach ($items as $i => $str) {
+//				$items[$i] = str_replace($placeholder, $separator, $str);
+//			}
+//
+//			return $items;
+//		}
+
+	/**
+	* More or less "markup-safe" str_replace()
+	* Ignores any instances of find inside `<...>`
+	*
+	* The replacement is written in place into src, so find and repl must have the
+	* same length; an error is thrown otherwise. An empty find is a no-op. Only
+	* matches that lie entirely inside [src_bgn, src_end) are replaced.
+	*
+	* @param src bytes to scan; modified in place
+	* @param src_bgn index of the first byte to scan
+	* @param src_end index after the last byte to scan
+	* @param find bytes to search for ($search in MW)
+	* @param repl bytes written over each match ($replace in MW); same length as find
+	*/
+	public static void replaceMarkup(byte[] src, int src_bgn, int src_end, byte[] find, byte[] repl) {	// REF:/includes/libs/StringUtils.php|replaceMarkup
+		// XO.MW.PORTED: avoiding multiple regex calls / String creations
+		// $placeholder = "\x00";
+		//
+		// Remove placeholder instances
+		// $text = str_replace($placeholder, '', $text);
+		//
+		// Replace instances of the separator inside HTML-like tags with the placeholder
+		// $replacer = new DoubleReplacer($search, $placeholder);
+		// $cleaned = StringUtils::delimiterReplaceCallback('<', '>', $replacer->cb(), $text);
+		//
+		// Explode, then put the replaced separators back in
+		// $cleaned = str_replace($search, $replace, $cleaned);
+		// $text = str_replace($placeholder, $search, $cleaned);
+
+		// if same length find / repl, do in-place replacement; EX: "!!"  -> "||"
+		int find_len = find.length;
+		int repl_len = repl.length;
+		if (find_len != repl_len) throw Err_.new_wo_type("find and repl should be same length");
+
+		// nothing to search for; also keeps find[0] below from going out of bounds
+		if (find_len == 0) return;
+
+		byte find_0 = find[0];
+		byte dlm_bgn = Byte_ascii.Angle_bgn;
+		byte dlm_end = Byte_ascii.Angle_end;
+		boolean repl_active = true;	// false from a "<" until the next ">"
+
+		// loop every char in array
+		for (int i = src_bgn; i < src_end; i++) {
+			byte b = src[i];
+			if (  b == find_0
+				&& i + find_len <= src_end	// match must not run past src_end, else bytes outside the requested range get overwritten
+				&& Bry_.Match(src, i + 1, i + find_len, find, 1, find_len)
+				&& repl_active
+				) {
+				Bry_.Set(src, i, i + find_len, repl);
+			}
+			else if (b == dlm_bgn) {
+				repl_active = false;
+			}
+			else if (b == dlm_end) {
+				repl_active = true;
+			}
+		}
+	}
+
+//		/**
+//		* Escape a String to make it suitable for inclusion in a preg_replace()
+//		* replacement parameter.
+//		*
+//		* @param String $String
+//		* @return String
+//		*/
+//		static function escapeRegexReplacement($String) {
+//			$String = str_replace('\\', '\\\\', $String);
+//			$String = str_replace('$', '\\$', $String);
+//			return $String;
+//		}
+//
+//		/**
+//		* Workalike for explode() with limited memory usage.
+//		*
+//		* @param String $separator
+//		* @param String $subject
+//		* @return ArrayIterator|ExplodeIterator
+//		*/
+//		static function explode($separator, $subject) {
+//			if (substr_count($subject, $separator) > 1000) {
+//				return new ExplodeIterator($separator, $subject);
+//			} else {
+//				return new ArrayIterator(explode($separator, $subject));
+//			}
+//		}
+}
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/Xomw_string_utils__tst.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/Xomw_string_utils__tst.java
@ -15,8 +15,8 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
 */
 package gplx.xowa.mediawiki.includes.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
 import org.junit.*; import gplx.core.tests.*;
-public class Xomw_string_utils__tst {
-	private final    Xomw_string_utils__fxt fxt = new Xomw_string_utils__fxt();
+public class XomwStringUtilsTest {
+	private final    XomwStringUtilsFxt fxt = new XomwStringUtilsFxt();
 	@Test  public void Delimiter_explode() {
 		// basic
 		fxt.Test__delimiter_explode("a|b|c"                             , "a", "b", "c");
@ -42,17 +42,17 @@ public class Xomw_string_utils__tst {
 		fxt.Test__replace_markup("a!!b<!!>!!>!!c"   , "!!", "||", "a||b<!!>||>||c");	// NOTE: should probably be "!!>!!>", but unmatched ">" are escaped to "&gt;"
 	}
 }
-class Xomw_string_utils__fxt {
+class XomwStringUtilsFxt {
 	public void Test__delimiter_explode(String src_str, String... expd) {
 		List_adp tmp = List_adp_.New();
 		gplx.core.btries.Btrie_rv trv = new gplx.core.btries.Btrie_rv();

-		byte[][] actl = Xomw_string_utils.Delimiter_explode(tmp, trv, Bry_.new_u8(src_str));
+		byte[][] actl = XomwStringUtils.delimiterExplode(tmp, trv, Bry_.new_u8(src_str));
 		Gftest.Eq__ary(expd, actl, "src=~{0}", src_str);
 	}
 	public void Test__replace_markup(String src_str, String find, String repl, String expd) {
 		byte[] src_bry = Bry_.new_u8(src_str);
-		Xomw_string_utils.Replace_markup(src_bry, 0, src_bry.length, Bry_.new_a7(find), Bry_.new_a7(repl));
+		XomwStringUtils.replaceMarkup(src_bry, 0, src_bry.length, Bry_.new_a7(find), Bry_.new_a7(repl));
 		Gftest.Eq__str(expd, src_bry);
 	}
 }
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/Xomw_string_utils.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/Xomw_string_utils.java
@ -1,123 +0,0 @@
-/*
-XOWA: the XOWA Offline Wiki Application
-Copyright (C) 2012-2017 gnosygnu@gmail.com
-
-XOWA is licensed under the terms of the General Public License (GPL) Version 3,
-or alternatively under the terms of the Apache License Version 2.0.
-
-You may use XOWA according to either of these licenses as is most appropriate
-for your project on a case-by-case basis.
-
-The terms of each license can be found in the source code repository:
-
-GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
-Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
-*/
-package gplx.xowa.mediawiki.includes.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
-import gplx.core.btries.*;
-public class Xomw_string_utils {
-	// Explode a String, but ignore any instances of the separator inside
-	// the given start and end delimiters, which may optionally nest.
-	// The delimiters are literal strings, not regular expressions.
-	// @param String bgn_delim Start delimiter
-	// @param String end_delim End delimiter
-	// @param String separator Separator String for the explode.
-	// @param String subject Subject String to explode.
-	// @param boolean nested True iff the delimiters are allowed to nest.
-	// @return ArrayIterator
-	// XO.MW: hard-coding (a) nested=true; (b) bgn="-{" end="}-" sep="|"
-	// XO.MW:SYNC:1.29; DATE:2017-02-03
-	private static final byte Delimiter_explode__sep = 0, Delimiter_explode__bgn = 1, Delimiter_explode__end = 2;
-	private static final    Btrie_slim_mgr delimiter_explode_trie = Btrie_slim_mgr.cs()
-		.Add_str_byte("|" , Delimiter_explode__sep)
-		.Add_str_byte("-{", Delimiter_explode__bgn)
-		.Add_str_byte("}-", Delimiter_explode__end)
-		;
-	public static byte[][] Delimiter_explode(List_adp tmp, Btrie_rv trv, byte[] src) {
-		int src_bgn = 0;
-		int src_end = src.length;
-
-		int depth = 0;
-		int cur = src_bgn;
-		int prv = cur;
-		while (true) {
-			// eos
-			if (cur == src_end) {
-				// add rest
-				tmp.Add(Bry_.Mid(src, prv, src_end));
-				break;
-			}
-
-			Object o = delimiter_explode_trie.Match_at(trv, src, cur, src_end);
-
-			// regular char; continue;
-			if (o == null) {
-				cur++;
-				continue;
-			}
-
-			// handle sep, bgn, end
-			byte tid = ((gplx.core.primitives.Byte_obj_val)o).Val();
-			switch (tid) {
-				case Delimiter_explode__sep:
-					if (depth == 0) {
-						tmp.Add(Bry_.Mid(src, prv, cur));
-						prv = cur + 1;
-					}
-					break;
-				case Delimiter_explode__bgn:
-					depth++;
-					break;
-				case Delimiter_explode__end:
-					depth--;
-					break;
-			}
-			cur = trv.Pos();
-		}
-		return (byte[][])tmp.To_ary_and_clear(byte[].class);
-	}
-	// More or less "markup-safe" str_replace()
-	// Ignores any instances of the separator inside `<...>`
-	public static void Replace_markup(byte[] src, int src_bgn, int src_end, byte[] find, byte[] repl) {	// REF:/includes/libs/StringUtils.php|replaceMarkup
-		// PORTED: avoiding multiple regex calls / String creations
-		// $placeholder = "\x00";
-
-		// Remove placeholder instances
-		// $text = str_replace( $placeholder, '', $text );
-
-		// Replace instances of the separator inside HTML-like tags with the placeholder
-		// $replacer = new DoubleReplacer( $search, $placeholder );
-		// $cleaned = StringUtils::delimiterReplaceCallback( '<', '>', $replacer->cb(), $text );
-
-		// Explode, then put the replaced separators back in
-		// $cleaned = str_replace( $search, $replace, $cleaned );
-		// $text = str_replace( $placeholder, $search, $cleaned );
-
-		// if same length find / repl, do in-place replacement; EX: "!!"  -> "||"
-		int find_len = find.length;
-		int repl_len = repl.length;
-		if (find_len != repl_len) throw Err_.new_wo_type("find and repl should be same length");
-
-		byte find_0 = find[0];
-		byte dlm_bgn = Byte_ascii.Angle_bgn;
-		byte dlm_end = Byte_ascii.Angle_end;
-		boolean repl_active = true;
-
-		// loop every char in array
-		for (int i = src_bgn; i < src_end; i++) {
-			byte b = src[i];
-			if (   b == find_0
-				&& Bry_.Match(src, i + 1, i + find_len, find, 1, find_len)
-				&& repl_active
-				) {
-				Bry_.Set(src, i, i + find_len, repl);
-			}
-			else if (b == dlm_bgn) {
-				repl_active = false;
-			}
-			else if (b == dlm_end) {
-				repl_active = true;
-			}
-		}
-	}
-}
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/linkers/Xomw_link_renderer.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/linkers/Xomw_link_renderer.java
@ -164,7 +164,7 @@ public class Xomw_link_renderer {

 		// Merge the custom attribs with the default ones, and iterate
 		// over that, deleting all "false" attributes.
-		sanitizer.Merge_attributes(src, trg);
+		sanitizer.mergeAttributes(src, trg);

 		// XO.MW:MW removes "false" values; XO removes "null" values
 		boolean deleted = false;
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/XomwParser.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/XomwParser.java
@ -1244,7 +1244,7 @@ public class XomwParser implements XomwParserIface {
 //			$text = $this->replaceTransparentTags( $text );
 		mStripState.unstripGeneral(pbfr);

-		sanitizer.Normalize_char_references(pbfr);
+		sanitizer.normalizeCharReferences(pbfr);

 //			if ( MWTidy::isEnabled() ) {
 //				if ( $this->mOptions->getTidy() ) {
@ -4605,20 +4605,11 @@ public class XomwParser implements XomwParserIface {
 		// that are later expanded to html- so expand them now and
 		// remove the tags
 		tooltip = this.mStripState.unstripBoth(tooltip);
-//			tooltip = Sanitizer::stripAllTags( tooltip );
+		tooltip = sanitizer.stripAllTags(tooltip);

 		return tooltip;
 	}
-//		protected function stripAltText($caption, $holders) {
-//			# make sure there are no placeholders in thumbnail attributes
-//			# that are later expanded to html- so expand them now and
-//			# remove the tags
-//			$tooltip = this.mStripState->unstripBoth($tooltip);
-//			$tooltip = Sanitizer::stripAllTags($tooltip);
-//
-//			return $tooltip;
-//		}
-//
+
 //		/**
 //		* Set a flag in the output Object indicating that the content is dynamic and
 //		* shouldn't be cached.
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/Xomw_parser.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/Xomw_parser.java
@ -200,7 +200,7 @@ public class Xomw_parser implements XomwParserIface {
 //			$text = $this->replaceTransparentTags( $text );
 		strip_state.unstripGeneral(pbfr);

-		sanitizer.Normalize_char_references(pbfr);
+		sanitizer.normalizeCharReferences(pbfr);

 //			if ( MWTidy::isEnabled() ) {
 //				if ( $this->mOptions->getTidy() ) {
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/lnkes/Xomw_lnke_wkr.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/lnkes/Xomw_lnke_wkr.java
@ -203,7 +203,7 @@ public class Xomw_lnke_wkr {// THREAD.UNSAFE: caching for repeated calls
 			// $text = $this->getConverterLanguage()->markNoConversion( $text );

 			byte[] url = Bry_.Mid(src, url_bgn, url_end);
-			url = sanitizer.Clean_url(url);
+			url = sanitizer.cleanUrl(url);

 			bfr.Add_mid(src, prv, lnke_bgn);
 			prv = cur;
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/lnkis/Xomw_lnki_wkr.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/lnkis/Xomw_lnki_wkr.java
@ -472,7 +472,7 @@ public class Xomw_lnki_wkr {// THREAD.UNSAFE: caching for repeated calls
 		//  * text-bottom

 		// Protect LanguageConverter markup when splitting into parts
-		byte[][] parts = Xomw_string_utils.Delimiter_explode(tmp_list, trv, options_at_link);
+		byte[][] parts = XomwStringUtils.delimiterExplode(tmp_list, trv, options_at_link);

 		// Give extensions a chance to select the file revision for us
 //			$options = [];
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/magiclinks/Xomw_magiclinks_wkr.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/magiclinks/Xomw_magiclinks_wkr.java
@ -252,7 +252,7 @@ public class Xomw_magiclinks_wkr {
 			return;
 		}

-		url = sanitizer.Clean_url(url);
+		url = sanitizer.cleanUrl(url);

 		// XO.MW.UNSUPPORTED.NON-WMF: not supporting images from freefrom url; (EX: "http://a.org/image.png" -> "<img>"); haven't seen this used on WMF wikis
 		// Is this an external image?			
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/tables/Xomw_table_wkr.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/tables/Xomw_table_wkr.java
@ -107,7 +107,7 @@ public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U
 			for (int j = 0; j < indent_level; j++)
 				tmp.Add(Html__dl__bgn);
 			tmp.Add_str_a7("<table");
-			sanitizer.Fix_tag_attributes(tmp, Name__table, tblw_atrs);
+			sanitizer.fixTagAttributes(tmp, Name__table, tblw_atrs);
 			tmp.Add_byte(Byte_ascii.Angle_end);
 			out_line = tmp.To_bry_and_clear();
 			td_history.Add(false);
@ -150,7 +150,7 @@ public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U

 			// Whats after the tag is now only attributes
 			byte[] atrs = strip_state.unstripBoth(line);
-			sanitizer.Fix_tag_attributes(tmp, Name__tr, atrs);
+			sanitizer.fixTagAttributes(tmp, Name__tr, atrs);
 			atrs = tmp.To_bry_and_clear();

 			Php_ary_.Pop_bry_or_null(tr_attributes);
@ -188,7 +188,7 @@ public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U

 			// Implies both are valid for table headings.
 			if (first_char == Byte_ascii.Bang) {
-				Xomw_string_utils.Replace_markup(line, 0, line.length, Wtxt__th2, Wtxt__td2); // $line = StringUtils::replaceMarkup('!!', '||', $line);
+				XomwStringUtils.replaceMarkup(line, 0, line.length, Wtxt__th2, Wtxt__td2); // $line = StringUtils::replaceMarkup('!!', '||', $line);
 			}

 			// Split up multiple cells on the same line.
@ -253,7 +253,7 @@ public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U
 				else {
 					byte[] atrs = strip_state.unstripBoth(cell_data_0);
 					tmp.Add(previous).Add_byte(Byte_ascii.Angle_bgn).Add(last_tag);
-					sanitizer.Fix_tag_attributes(tmp, last_tag, atrs);
+					sanitizer.fixTagAttributes(tmp, last_tag, atrs);
 					tmp.Add_byte(Byte_ascii.Angle_end).Add(cell_data_1);
 					cell = tmp.To_bry_and_clear();
 				}