Xomw: Convert Sanitizer, StringUtils; also, support stripAllTags

2026-03-02 03:49:30 +00:00 · 2017-02-23 09:08:03 -05:00
parent 09dbfc894e
commit 31fcfaf1bd
13 changed files with 2082 additions and 238 deletions
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/XomwSanitizer.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/XomwSanitizer.java
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/XomwSanitizerTest.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/XomwSanitizerTest.java
@@ -121,13 +121,20 @@ public class XomwSanitizerTest {
 		// cls: ws
 		fxt.Test__merge_attributes(src_atrs.Clear().Add_many(cls, "  v1   v2  "), trg_atrs.Clear().Add_many(cls, "  v3   v4   "), expd_atrs.Clear().Add_many(cls, "v1 v2 v3 v4"));
 	}
 	@Test   public void normalizeWhitespace() {
 		// every whitespace variant between "a" and "b" is expected to come back as "a b"
 		String[] src_ary = new String[] {"a\r\nb", "a\rb", "a\nb", "a\tb"};
 		for (String src : src_ary) {
 			fxt.Test_normalizeWhitespace(src, "a b");
 		}
 	}
 }
 class XomwSanitizerFxt {
 	private final    XomwSanitizer sanitizer = new XomwSanitizer();
 	private final    Bry_bfr tmp = Bry_bfr_.New();
 	public void Test__normalize_char_references(String src_str, String expd) {
 		byte[] src_bry = Bry_.new_u8(src_str);
-		sanitizer.Normalize_char_references(tmp, Bool_.Y, src_bry, 0, src_bry.length);
+		sanitizer.normalizeCharReferences(tmp, Bool_.Y, src_bry, 0, src_bry.length);
 		Gftest.Eq__str(expd, tmp.To_str_and_clear());
 	}
 	public void Test__regex_domain_y(Xomw_regex_find_domain regex_domain, String src_str, String expd_prot, String expd_host, String expd_rest) {
@@ -152,15 +159,18 @@ class XomwSanitizerFxt {
 	}
 	public void Test__decode_char_references(String src_str, String expd) {
 		byte[] src_bry = Bry_.new_u8(src_str);
-		sanitizer.Decode_char_references(tmp, Bool_.Y, src_bry, 0, src_bry.length);
+		sanitizer.decodeCharReferences(tmp, Bool_.Y, src_bry, 0, src_bry.length);
 		Gftest.Eq__str(expd, tmp.To_str_and_clear());
 	}
 	public void Test__clean_url(String src_str, String expd) {
 		byte[] src_bry = Bry_.new_u8(src_str);
-		Gftest.Eq__str(expd, sanitizer.Clean_url(src_bry));
+		Gftest.Eq__str(expd, sanitizer.cleanUrl(src_bry));
 	}
 	public void Test__merge_attributes(Xomw_atr_mgr src, Xomw_atr_mgr trg, Xomw_atr_mgr expd) {
-		sanitizer.Merge_attributes(src, trg);
+		sanitizer.mergeAttributes(src, trg);
 		Gftest.Eq__ary__lines(expd.To_str(tmp), src.To_str(tmp), "merge_atrs");
 	}
 	/**
 	* Runs sanitizer.normalizeWhitespace on src_str and compares the result to expd.
 	* @param src_str raw input; converted to bytes with Bry_.new_u8 before the call
 	* @param expd expected text after normalization
 	*/
 	public void Test_normalizeWhitespace(String src_str, String expd) {
 		byte[] src_bry = Bry_.new_u8(src_str);
 		// label the comparison with this test's own name (was "merge_atrs", copied from Test__merge_attributes)
 		Gftest.Eq__str(expd, sanitizer.normalizeWhitespace(src_bry), "normalizeWhitespace");
 	}
 }
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/XomwXml.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/XomwXml.java
@@ -53,7 +53,7 @@ public class XomwXml {
 			bfr.Add_byte_space();
 			bfr.Add((byte[])attribs.Get_at(i));
 			bfr.Add_byte_eq().Add_byte_quote();
-			XomwSanitizer.Encode_attribute(bfr, (byte[])attribs.Get_at(i + 1));
+			XomwSanitizer.encodeAttribute(bfr, (byte[])attribs.Get_at(i + 1));
 			bfr.Add_byte_quote();
 		}
 	}
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/XomwStringUtils.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/XomwStringUtils.java
@@ -0,0 +1,373 @@
 /*
 XOWA: the XOWA Offline Wiki Application
 Copyright (C) 2012-2017 gnosygnu@gmail.com
 XOWA is licensed under the terms of the General Public License (GPL) Version 3,
 or alternatively under the terms of the Apache License Version 2.0.
 You may use XOWA according to either of these licenses as is most appropriate
 for your project on a case-by-case basis.
 The terms of each license can be found in the source code repository:
 GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
 Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
 */
 package gplx.xowa.mediawiki.includes.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
 import gplx.core.btries.*;
 /**
 * A collection of static methods to play with strings.
 */
 public class XomwStringUtils {
 //		/**
 //		* Test whether a String is valid UTF-8.
 //		*
 //		* The function check for invalid byte sequences, overlong encoding but
 //		* not for different normalisations.
 //		*
 //		* @note In MediaWiki 1.21, this function did not provide proper UTF-8 validation.
 //		* In particular, the pure PHP code path did not in fact check for overlong forms.
 //		* Beware of this when backporting code to that version of MediaWiki.
 //		*
 //		* @since 1.21
 //		* @param String $value String to check
 //		* @return boolean Whether the given $value is a valid UTF-8 encoded String
 //		*/
 //		static function isUtf8($value) {
 //			$value = (String)$value;
 //
 //			// HHVM 3.4 and older come with an outdated version of libmbfl that
 //			// incorrectly allows values above U+10FFFF, so we have to check
 //			// for them separately. (This issue also exists in PHP 5.3 and
 //			// older, which are no longer supported.)
 //			static $newPHP;
 //			if ($newPHP === null) {
 //				$newPHP = !mb_check_encoding("\xf4\x90\x80\x80", 'UTF-8');
 //			}
 //
 //			return mb_check_encoding($value, 'UTF-8') &&
 //				($newPHP || preg_match("/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value) === 0);
 //		}
 	private static final byte DELIMITER_EXPLODE__SEP = 0, DELIMITER_EXPLODE__BGN = 1, DELIMITER_EXPLODE__END = 2;
 	private static final    Btrie_slim_mgr delimiter_explode_trie = Btrie_slim_mgr.cs()
 		.Add_str_byte("|" , DELIMITER_EXPLODE__SEP)
 		.Add_str_byte("-{", DELIMITER_EXPLODE__BGN)
 		.Add_str_byte("}-", DELIMITER_EXPLODE__END)
 		;
 	/**
 	* Explode src on "|", but ignore any "|" that sits inside "-{" ... "}-" delimiters,
 	* which may nest. The delimiters are literal strings, not regular expressions.
 	* XO.MW: NOTE: the MW function takes $startDelim, $endDelim, $separator, $subject, $nested;
 	* it is only used in two places, so this port hard-codes (a) nested=true; (b) bgn="-{" end="}-" sep="|"
 	* @param tmp list that collects the pieces; emptied by To_ary_and_clear when the result is built
 	* @param trv trie result holder; read for the position just after each matched token
 	* @param src bytes to explode
 	* @return the pieces in order; the text after the last split is always added, so there is at least one item
 	*/
 	public static byte[][] delimiterExplode(List_adp tmp, Btrie_rv trv, byte[] src) {
 		// XO.MW.PORTED:entire proc rewritten; see PHP for source
 		int len = src.length;
 		int nesting = 0;	// number of "-{" seen minus number of "}-" seen
 		int pos = 0;
 		int item_bgn = 0;	// start of the piece currently being collected
 		// walk until eos
 		while (pos != len) {
 			Object match = delimiter_explode_trie.Match_at(trv, src, pos, len);
 			// regular char; move on
 			if (match == null) {
 				pos++;
 				continue;
 			}
 			// handle sep, bgn, end
 			byte tid = ((gplx.core.primitives.Byte_obj_val)match).Val();
 			if (tid == DELIMITER_EXPLODE__SEP) {
 				// only a separator outside all delimiters ends the current piece
 				if (nesting == 0) {
 					tmp.Add(Bry_.Mid(src, item_bgn, pos));
 					item_bgn = pos + 1;
 				}
 			}
 			else if (tid == DELIMITER_EXPLODE__BGN) {
 				nesting++;
 			}
 			else if (tid == DELIMITER_EXPLODE__END) {
 				nesting--;
 			}
 			pos = trv.Pos();
 		}
 		// add rest
 		tmp.Add(Bry_.Mid(src, item_bgn, len));
 		return (byte[][])tmp.To_ary_and_clear(byte[].class);
 	}
 //		/**
 //		* Perform an operation equivalent to `preg_replace()`
 //		*
 //		* Matches this code:
 //		*
 //		*     preg_replace("!$startDelim(.*?)$endDelim!", $replace, $subject);
 //		*
 //		* ..except that it's worst-case O(N) instead of O(N^2). Compared to delimiterReplace(), this
 //		* implementation is fast but memory-hungry and inflexible. The memory requirements are such
 //		* that I don't recommend using it on anything but guaranteed small chunks of text.
 //		*
 //		* @param String $startDelim
 //		* @param String $endDelim
 //		* @param String $replace
 //		* @param String $subject
 //		* @return String
 //		*/
 //		static function hungryDelimiterReplace($startDelim, $endDelim, $replace, $subject) {
 //			$segments = explode($startDelim, $subject);
 //			$output = array_shift($segments);
 //			foreach ($segments as $s) {
 //				$endDelimPos = strpos($s, $endDelim);
 //				if ($endDelimPos === false) {
 //					$output .= $startDelim . $s;
 //				} else {
 //					$output .= $replace . substr($s, $endDelimPos + strlen($endDelim));
 //				}
 //			}
 //
 //			return $output;
 //		}
 //
 //		/**
 //		* Perform an operation equivalent to `preg_replace_callback()`
 //		*
 //		* Matches this code:
 //		*
 //		*     preg_replace_callback("!$startDelim(.*)$endDelim!s$flags", $callback, $subject);
 //		*
 //		* If the start delimiter ends with an initial substring of the end delimiter,
 //		* e.g. in the case of C-style comments, the behavior differs from the model
 //		* regex. In this implementation, the end must share no characters with the
 //		* start, so e.g. `/*\/` is not considered to be both the start and end of a
 //		* comment. `/*\/xy/*\/` is considered to be a single comment with contents `/xy/`.
 //		*
 //		* The implementation of delimiterReplaceCallback() is slower than hungryDelimiterReplace()
 //		* but uses far less memory. The delimiters are literal strings, not regular expressions.
 //		*
 //		* @param String $startDelim Start delimiter
 //		* @param String $endDelim End delimiter
 //		* @param callable $callback Function to call on each match
 //		* @param String $subject
 //		* @param String $flags Regular expression flags
 //		* @throws InvalidArgumentException
 //		* @return String
 //		*/
 //		static function delimiterReplaceCallback($startDelim, $endDelim, $callback,
 //			$subject, $flags = ''
 //		) {
 //			$inputPos = 0;
 //			$outputPos = 0;
 //			$contentPos = 0;
 //			$output = '';
 //			$foundStart = false;
 //			$encStart = preg_quote($startDelim, '!');
 //			$encEnd = preg_quote($endDelim, '!');
 //			$strcmp = strpos($flags, 'i') === false ? 'strcmp' : 'strcasecmp';
 //			$endLength = strlen($endDelim);
 //			$m = [];
 //
 //			while ($inputPos < strlen($subject) &&
 //				preg_match("!($encStart)|($encEnd)!S$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos)
 //			) {
 //				$tokenOffset = $m[0][1];
 //				if ($m[1][0] != '') {
 //					if ($foundStart &&
 //						$strcmp($endDelim, substr($subject, $tokenOffset, $endLength)) == 0
 //					) {
 //						# An end match is present at the same location
 //						$tokenType = 'end';
 //						$tokenLength = $endLength;
 //					} else {
 //						$tokenType = 'start';
 //						$tokenLength = strlen($m[0][0]);
 //					}
 //				} elseif ($m[2][0] != '') {
 //					$tokenType = 'end';
 //					$tokenLength = strlen($m[0][0]);
 //				} else {
 //					throw new InvalidArgumentException('Invalid delimiter given to ' . __METHOD__);
 //				}
 //
 //				if ($tokenType == 'start') {
 //					# Only move the start position if we haven't already found a start
 //					# This means that START START END matches outer pair
 //					if (!$foundStart) {
 //						# Found start
 //						$inputPos = $tokenOffset + $tokenLength;
 //						# Write out the non-matching section
 //						$output .= substr($subject, $outputPos, $tokenOffset - $outputPos);
 //						$outputPos = $tokenOffset;
 //						$contentPos = $inputPos;
 //						$foundStart = true;
 //					} else {
 //						# Move the input position past the *first character* of START,
 //						# to protect against missing END when it overlaps with START
 //						$inputPos = $tokenOffset + 1;
 //					}
 //				} elseif ($tokenType == 'end') {
 //					if ($foundStart) {
 //						# Found match
 //						$output .= call_user_func($callback, [
 //							substr($subject, $outputPos, $tokenOffset + $tokenLength - $outputPos),
 //							substr($subject, $contentPos, $tokenOffset - $contentPos)
 //						]);
 //						$foundStart = false;
 //					} else {
 //						# Non-matching end, write it out
 //						$output .= substr($subject, $inputPos, $tokenOffset + $tokenLength - $outputPos);
 //					}
 //					$inputPos = $outputPos = $tokenOffset + $tokenLength;
 //				} else {
 //					throw new InvalidArgumentException('Invalid delimiter given to ' . __METHOD__);
 //				}
 //			}
 //			if ($outputPos < strlen($subject)) {
 //				$output .= substr($subject, $outputPos);
 //			}
 //
 //			return $output;
 //		}
 //
 //		/**
 //		* Perform an operation equivalent to `preg_replace()` with flags.
 //		*
 //		* Matches this code:
 //		*
 //		*     preg_replace("!$startDelim(.*)$endDelim!$flags", $replace, $subject);
 //		*
 //		* @param String $startDelim Start delimiter regular expression
 //		* @param String $endDelim End delimiter regular expression
 //		* @param String $replace Replacement String. May contain $1, which will be
 //		*  replaced by the text between the delimiters
 //		* @param String $subject String to search
 //		* @param String $flags Regular expression flags
 //		* @return String The String with the matches replaced
 //		*/
 //		static function delimiterReplace($startDelim, $endDelim, $replace, $subject, $flags = '') {
 //			$replacer = new RegexlikeReplacer($replace);
 //
 //			return self::delimiterReplaceCallback($startDelim, $endDelim,
 //				$replacer->cb(), $subject, $flags);
 //		}
 //
 //		/**
 //		* More or less "markup-safe" explode()
 //		* Ignores any instances of the separator inside `<...>`
 //		* @param String $separator
 //		* @param String $text
 //		* @return array
 //		*/
 //		static function explodeMarkup($separator, $text) {
 //			$placeholder = "\x00";
 //
 //			// Remove placeholder instances
 //			$text = str_replace($placeholder, '', $text);
 //
 //			// Replace instances of the separator inside HTML-like tags with the placeholder
 //			$replacer = new DoubleReplacer($separator, $placeholder);
 //			$cleaned = StringUtils::delimiterReplaceCallback('<', '>', $replacer->cb(), $text);
 //
 //			// Explode, then put the replaced separators back in
 //			$items = explode($separator, $cleaned);
 //			foreach ($items as $i => $str) {
 //				$items[$i] = str_replace($placeholder, $separator, $str);
 //			}
 //
 //			return $items;
 //		}
 	/**
 	* More or less "markup-safe" str_replace()
 	* Replaces each occurrence of find with repl directly inside src, ignoring any
 	* occurrence found between a '<' and the next '>'.
 	* NOTE: after a '<' replacement stays off until a '>' is seen, even if no '>' follows.
 	* @param src bytes to scan; modified in place
 	* @param src_bgn first index to scan (inclusive)
 	* @param src_end last index to scan (exclusive); a match must fit entirely before it
 	* @param find bytes to search for; an empty array leaves src untouched
 	* @param repl replacement bytes; must be the same length as find
 	* @throws Err if find and repl differ in length
 	*/
 	public static void replaceMarkup(byte[] src, int src_bgn, int src_end, byte[] find, byte[] repl) {	// REF:/includes/libs/StringUtils.php|replaceMarkup
 		// XO.MW.PORTED: avoiding multiple regex calls / String creations
 		// $placeholder = "\x00";
 		//
 		// Remove placeholder instances
 		// $text = str_replace($placeholder, '', $text);
 		//
 		// Replace instances of the separator inside HTML-like tags with the placeholder
 		// $replacer = new DoubleReplacer($search, $placeholder);
 		// $cleaned = StringUtils::delimiterReplaceCallback('<', '>', $replacer->cb(), $text);
 		//
 		// Explode, then put the replaced separators back in
 		// $cleaned = str_replace($search, $replace, $cleaned);
 		// $text = str_replace($placeholder, $search, $cleaned);
 		// if same length find / repl, do in-place replacement; EX: "!!"  -> "||"
 		int find_len = find.length;
 		int repl_len = repl.length;
 		if (find_len != repl_len) throw Err_.new_wo_type("find and repl should be same length");
 		// nothing to search for; also avoids reading find[0] from an empty array
 		if (find_len == 0) return;
 		byte find_0 = find[0];
 		byte dlm_bgn = Byte_ascii.Angle_bgn;
 		byte dlm_end = Byte_ascii.Angle_end;
 		boolean repl_active = true;
 		// loop every char in array
 		for (int i = src_bgn; i < src_end; i++) {
 			byte b = src[i];
 			if (   repl_active						// cheapest check first: inside "<...>" nothing is replaced
 				&& b == find_0
 				&& i + find_len <= src_end			// never match or write past the caller's range
 				&& Bry_.Match(src, i + 1, i + find_len, find, 1, find_len)
 				) {
 				Bry_.Set(src, i, i + find_len, repl);
 				// resume after the replaced bytes so repl itself is never rescanned (same as str_replace)
 				i += find_len - 1;
 			}
 			else if (b == dlm_bgn) {
 				repl_active = false;
 			}
 			else if (b == dlm_end) {
 				repl_active = true;
 			}
 		}
 	}
 //		/**
 //		* Escape a String to make it suitable for inclusion in a preg_replace()
 //		* replacement parameter.
 //		*
 //		* @param String $String
 //		* @return String
 //		*/
 //		static function escapeRegexReplacement($String) {
 //			$String = str_replace('\\', '\\\\', $String);
 //			$String = str_replace('$', '\\$', $String);
 //			return $String;
 //		}
 //
 //		/**
 //		* Workalike for explode() with limited memory usage.
 //		*
 //		* @param String $separator
 //		* @param String $subject
 //		* @return ArrayIterator|ExplodeIterator
 //		*/
 //		static function explode($separator, $subject) {
 //			if (substr_count($subject, $separator) > 1000) {
 //				return new ExplodeIterator($separator, $subject);
 //			} else {
 //				return new ArrayIterator(explode($separator, $subject));
 //			}
 //		}
 }
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/Xomw_string_utils__tst.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/Xomw_string_utils__tst.java
@@ -15,8 +15,8 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
 */
 package gplx.xowa.mediawiki.includes.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
 import org.junit.*; import gplx.core.tests.*;
-public class Xomw_string_utils__tst {
+public class XomwStringUtilsTest {
-	private final    Xomw_string_utils__fxt fxt = new Xomw_string_utils__fxt();
+	private final    XomwStringUtilsFxt fxt = new XomwStringUtilsFxt();
 	@Test  public void Delimiter_explode() {
 		// basic
 		fxt.Test__delimiter_explode("a|b|c"                             , "a", "b", "c");
@@ -42,17 +42,17 @@ public class Xomw_string_utils__tst {
 		fxt.Test__replace_markup("a!!b<!!>!!>!!c"   , "!!", "||", "a||b<!!>||>||c");	// NOTE: should probably be "!!>!!>", but unmatched ">" are escaped to "&gt;"
 	}
 }
-class Xomw_string_utils__fxt {
+class XomwStringUtilsFxt {
 	public void Test__delimiter_explode(String src_str, String... expd) {
 		List_adp tmp = List_adp_.New();
 		gplx.core.btries.Btrie_rv trv = new gplx.core.btries.Btrie_rv();
-		byte[][] actl = Xomw_string_utils.Delimiter_explode(tmp, trv, Bry_.new_u8(src_str));
+		byte[][] actl = XomwStringUtils.delimiterExplode(tmp, trv, Bry_.new_u8(src_str));
 		Gftest.Eq__ary(expd, actl, "src=~{0}", src_str);
 	}
 	public void Test__replace_markup(String src_str, String find, String repl, String expd) {
 		byte[] src_bry = Bry_.new_u8(src_str);
-		Xomw_string_utils.Replace_markup(src_bry, 0, src_bry.length, Bry_.new_a7(find), Bry_.new_a7(repl));
+		XomwStringUtils.replaceMarkup(src_bry, 0, src_bry.length, Bry_.new_a7(find), Bry_.new_a7(repl));
 		Gftest.Eq__str(expd, src_bry);
 	}
 }
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/Xomw_string_utils.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/Xomw_string_utils.java
@@ -1,123 +0,0 @@
 /*
 XOWA: the XOWA Offline Wiki Application
 Copyright (C) 2012-2017 gnosygnu@gmail.com
 XOWA is licensed under the terms of the General Public License (GPL) Version 3,
 or alternatively under the terms of the Apache License Version 2.0.
 You may use XOWA according to either of these licenses as is most appropriate
 for your project on a case-by-case basis.
 The terms of each license can be found in the source code repository:
 GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
 Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
 */
 package gplx.xowa.mediawiki.includes.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
 import gplx.core.btries.*;
 public class Xomw_string_utils {
 	// Explode a String, but ignore any instances of the separator inside
 	// the given start and end delimiters, which may optionally nest.
 	// The delimiters are literal strings, not regular expressions.
 	// @param String bgn_delim Start delimiter
 	// @param String end_delim End delimiter
 	// @param String separator Separator String for the explode.
 	// @param String subject Subject String to explode.
 	// @param boolean nested True iff the delimiters are allowed to nest.
 	// @return ArrayIterator
 	// XO.MW: hard-coding (a) nested=true; (b) bgn="-{" end="}-" sep="|"
 	// XO.MW:SYNC:1.29; DATE:2017-02-03
 	private static final byte Delimiter_explode__sep = 0, Delimiter_explode__bgn = 1, Delimiter_explode__end = 2;
 	private static final    Btrie_slim_mgr delimiter_explode_trie = Btrie_slim_mgr.cs()
 		.Add_str_byte("|" , Delimiter_explode__sep)
 		.Add_str_byte("-{", Delimiter_explode__bgn)
 		.Add_str_byte("}-", Delimiter_explode__end)
 		;
 	// Splits src on "|", skipping any "|" found while inside "-{" ... "}-" (which may nest).
 	// tmp collects the pieces and is emptied by To_ary_and_clear; trv is read for the position after each matched token.
 	public static byte[][] Delimiter_explode(List_adp tmp, Btrie_rv trv, byte[] src) {
 		int len = src.length;
 		int nesting = 0;	// number of "-{" seen minus number of "}-" seen
 		int pos = 0;
 		int item_bgn = 0;	// start of the piece currently being collected
 		// walk until eos
 		while (pos != len) {
 			Object match = delimiter_explode_trie.Match_at(trv, src, pos, len);
 			// regular char; move on
 			if (match == null) {
 				pos++;
 				continue;
 			}
 			// handle sep, bgn, end
 			byte tid = ((gplx.core.primitives.Byte_obj_val)match).Val();
 			if (tid == Delimiter_explode__sep) {
 				// only a separator outside all delimiters ends the current piece
 				if (nesting == 0) {
 					tmp.Add(Bry_.Mid(src, item_bgn, pos));
 					item_bgn = pos + 1;
 				}
 			}
 			else if (tid == Delimiter_explode__bgn) {
 				nesting++;
 			}
 			else if (tid == Delimiter_explode__end) {
 				nesting--;
 			}
 			pos = trv.Pos();
 		}
 		// add rest
 		tmp.Add(Bry_.Mid(src, item_bgn, len));
 		return (byte[][])tmp.To_ary_and_clear(byte[].class);
 	}
 	// More or less "markup-safe" str_replace()
 	// Replaces each occurrence of find with repl directly inside src (between src_bgn and src_end),
 	// ignoring any occurrence found between a '<' and the next '>'.
 	// find and repl must be the same length, else an Err is thrown; an empty find leaves src untouched.
 	// NOTE: after a '<' replacement stays off until a '>' is seen, even if no '>' follows.
 	public static void Replace_markup(byte[] src, int src_bgn, int src_end, byte[] find, byte[] repl) {	// REF:/includes/libs/StringUtils.php|replaceMarkup
 		// PORTED: avoiding multiple regex calls / String creations
 		// $placeholder = "\x00";
 		// Remove placeholder instances
 		// $text = str_replace( $placeholder, '', $text );
 		// Replace instances of the separator inside HTML-like tags with the placeholder
 		// $replacer = new DoubleReplacer( $search, $placeholder );
 		// $cleaned = StringUtils::delimiterReplaceCallback( '<', '>', $replacer->cb(), $text );
 		// Explode, then put the replaced separators back in
 		// $cleaned = str_replace( $search, $replace, $cleaned );
 		// $text = str_replace( $placeholder, $search, $cleaned );
 		// if same length find / repl, do in-place replacement; EX: "!!"  -> "||"
 		int find_len = find.length;
 		int repl_len = repl.length;
 		if (find_len != repl_len) throw Err_.new_wo_type("find and repl should be same length");
 		// nothing to search for; also avoids reading find[0] from an empty array
 		if (find_len == 0) return;
 		byte find_0 = find[0];
 		byte dlm_bgn = Byte_ascii.Angle_bgn;
 		byte dlm_end = Byte_ascii.Angle_end;
 		boolean repl_active = true;
 		// loop every char in array
 		for (int i = src_bgn; i < src_end; i++) {
 			byte b = src[i];
 			if (   repl_active						// cheapest check first: inside "<...>" nothing is replaced
 				&& b == find_0
 				&& i + find_len <= src_end			// never match or write past the caller's range
 				&& Bry_.Match(src, i + 1, i + find_len, find, 1, find_len)
 				) {
 				Bry_.Set(src, i, i + find_len, repl);
 				// resume after the replaced bytes so repl itself is never rescanned (same as str_replace)
 				i += find_len - 1;
 			}
 			else if (b == dlm_bgn) {
 				repl_active = false;
 			}
 			else if (b == dlm_end) {
 				repl_active = true;
 			}
 		}
 	}
 }
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/linkers/Xomw_link_renderer.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/linkers/Xomw_link_renderer.java
@@ -164,7 +164,7 @@ public class Xomw_link_renderer {
 		// Merge the custom attribs with the default ones, and iterate
 		// over that, deleting all "false" attributes.
-		sanitizer.Merge_attributes(src, trg);
+		sanitizer.mergeAttributes(src, trg);
 		// XO.MW:MW removes "false" values; XO removes "null" values
 		boolean deleted = false;
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/XomwParser.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/XomwParser.java
@@ -1244,7 +1244,7 @@ public class XomwParser implements XomwParserIface {
 //			$text = $this->replaceTransparentTags( $text );
 		mStripState.unstripGeneral(pbfr);
-		sanitizer.Normalize_char_references(pbfr);
+		sanitizer.normalizeCharReferences(pbfr);
 //			if ( MWTidy::isEnabled() ) {
 //				if ( $this->mOptions->getTidy() ) {
@@ -4605,20 +4605,11 @@ public class XomwParser implements XomwParserIface {
 		// that are later expanded to html- so expand them now and
 		// remove the tags
 		tooltip = this.mStripState.unstripBoth(tooltip);
-//			tooltip = Sanitizer::stripAllTags( tooltip );
+		tooltip = sanitizer.stripAllTags(tooltip);
 		return tooltip;
 	}
-//		protected function stripAltText($caption, $holders) {
+
 //			# make sure there are no placeholders in thumbnail attributes
 //			# that are later expanded to html- so expand them now and
 //			# remove the tags
 //			$tooltip = this.mStripState->unstripBoth($tooltip);
 //			$tooltip = Sanitizer::stripAllTags($tooltip);
 //
 //			return $tooltip;
 //		}
 //
 //		/**
 //		* Set a flag in the output Object indicating that the content is dynamic and
 //		* shouldn't be cached.
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/Xomw_parser.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/Xomw_parser.java
@@ -200,7 +200,7 @@ public class Xomw_parser implements XomwParserIface {
 //			$text = $this->replaceTransparentTags( $text );
 		strip_state.unstripGeneral(pbfr);
-		sanitizer.Normalize_char_references(pbfr);
+		sanitizer.normalizeCharReferences(pbfr);
 //			if ( MWTidy::isEnabled() ) {
 //				if ( $this->mOptions->getTidy() ) {
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/lnkes/Xomw_lnke_wkr.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/lnkes/Xomw_lnke_wkr.java
@@ -203,7 +203,7 @@ public class Xomw_lnke_wkr {// THREAD.UNSAFE: caching for repeated calls
 			// $text = $this->getConverterLanguage()->markNoConversion( $text );
 			byte[] url = Bry_.Mid(src, url_bgn, url_end);
-			url = sanitizer.Clean_url(url);
+			url = sanitizer.cleanUrl(url);
 			bfr.Add_mid(src, prv, lnke_bgn);
 			prv = cur;
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/lnkis/Xomw_lnki_wkr.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/lnkis/Xomw_lnki_wkr.java
@@ -472,7 +472,7 @@ public class Xomw_lnki_wkr {// THREAD.UNSAFE: caching for repeated calls
 		//  * text-bottom
 		// Protect LanguageConverter markup when splitting into parts
-		byte[][] parts = Xomw_string_utils.Delimiter_explode(tmp_list, trv, options_at_link);
+		byte[][] parts = XomwStringUtils.delimiterExplode(tmp_list, trv, options_at_link);
 		// Give extensions a chance to select the file revision for us
 //			$options = [];
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/magiclinks/Xomw_magiclinks_wkr.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/magiclinks/Xomw_magiclinks_wkr.java
@@ -252,7 +252,7 @@ public class Xomw_magiclinks_wkr {
 			return;
 		}
-		url = sanitizer.Clean_url(url);
+		url = sanitizer.cleanUrl(url);
 		// XO.MW.UNSUPPORTED.NON-WMF: not supporting images from freefrom url; (EX: "http://a.org/image.png" -> "<img>"); haven't seen this used on WMF wikis
 		// Is this an external image?			
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/tables/Xomw_table_wkr.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/tables/Xomw_table_wkr.java
@@ -107,7 +107,7 @@ public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U
 			for (int j = 0; j < indent_level; j++)
 				tmp.Add(Html__dl__bgn);
 			tmp.Add_str_a7("<table");
-			sanitizer.Fix_tag_attributes(tmp, Name__table, tblw_atrs);
+			sanitizer.fixTagAttributes(tmp, Name__table, tblw_atrs);
 			tmp.Add_byte(Byte_ascii.Angle_end);
 			out_line = tmp.To_bry_and_clear();
 			td_history.Add(false);
@@ -150,7 +150,7 @@ public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U
 			// Whats after the tag is now only attributes
 			byte[] atrs = strip_state.unstripBoth(line);
-			sanitizer.Fix_tag_attributes(tmp, Name__tr, atrs);
+			sanitizer.fixTagAttributes(tmp, Name__tr, atrs);
 			atrs = tmp.To_bry_and_clear();
 			Php_ary_.Pop_bry_or_null(tr_attributes);
@@ -188,7 +188,7 @@ public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U
 			// Implies both are valid for table headings.
 			if (first_char == Byte_ascii.Bang) {
-				Xomw_string_utils.Replace_markup(line, 0, line.length, Wtxt__th2, Wtxt__td2); // $line = StringUtils::replaceMarkup('!!', '||', $line);
+				XomwStringUtils.replaceMarkup(line, 0, line.length, Wtxt__th2, Wtxt__td2); // $line = StringUtils::replaceMarkup('!!', '||', $line);
 			}
 			// Split up multiple cells on the same line.
@@ -253,7 +253,7 @@ public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U
 				else {
 					byte[] atrs = strip_state.unstripBoth(cell_data_0);
 					tmp.Add(previous).Add_byte(Byte_ascii.Angle_bgn).Add(last_tag);
-					sanitizer.Fix_tag_attributes(tmp, last_tag, atrs);
+					sanitizer.fixTagAttributes(tmp, last_tag, atrs);
 					tmp.Add_byte(Byte_ascii.Angle_end).Add(cell_data_1);
 					cell = tmp.To_bry_and_clear();
 				}