Xomw: Add XomwStringUtils.delimiterReplace

2026-03-02 03:49:30 +00:00 · 2017-02-23 12:01:49 -05:00
parent 31fcfaf1bd
commit a8c7f27ff5
5 changed files with 198 additions and 136 deletions
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/XomwSanitizer.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/XomwSanitizer.java
@@ -17,6 +17,7 @@ package gplx.xowa.mediawiki.includes; import gplx.*; import gplx.xowa.*; import
 import gplx.core.brys.*; import gplx.core.btries.*; import gplx.core.encoders.*; import gplx.core.primitives.*; import gplx.langs.htmls.entitys.*;
 import gplx.xowa.parsers.htmls.*;
 import gplx.langs.htmls.*; import gplx.xowa.mediawiki.includes.htmls.*; import gplx.xowa.mediawiki.includes.parsers.*; import gplx.xowa.mediawiki.includes.utls.*;
+import gplx.xowa.mediawiki.includes.libs.*;
 public class XomwSanitizer {
 	private final    Mwh_doc_wkr__atr_bldr atr_bldr = new Mwh_doc_wkr__atr_bldr();
 	private final    Mwh_atr_parser atr_parser = new Mwh_atr_parser();
@@ -1671,7 +1672,8 @@ public class XomwSanitizer {
 	*/
 	public byte[] stripAllTags(byte[] text) {
 		// Actual <tags>
-//			$text = StringUtils::delimiterReplace('<', '>', '', $text);
+		XomwStringUtils.delimiterReplace(tmp_bfr, Byte_ascii.Angle_bgn_bry, Byte_ascii.Angle_end_bry, Bry_.Empty, text);
+		text = tmp_bfr.To_bry_and_clear();

 		// Normalize &entities and whitespace
 		text = decodeCharReferences(null, false, text, 0, text.length);
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/XomwStringUtils.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/XomwStringUtils.java
@@ -15,6 +15,8 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
 */
 package gplx.xowa.mediawiki.includes.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
 import gplx.core.btries.*;
+import gplx.xowa.mediawiki.includes.utls.*;
+import gplx.xowa.mediawiki.includes.libs.replacers.*;
 /**
 * A collection of static methods to play with strings.
 */
@@ -143,129 +145,123 @@ public class XomwStringUtils {
 //
 //			return $output;
 //		}
-//
-//		/**
-//		* Perform an operation equivalent to `preg_replace_callback()`
-//		*
-//		* Matches this code:
-//		*
-//		*     preg_replace_callback("!$startDelim(.*)$endDelim!s$flags", $callback, $subject);
-//		*
-//		* If the start delimiter ends with an initial substring of the end delimiter,
-//		* e.g. in the case of C-style comments, the behavior differs from the model
-//		* regex. In this implementation, the end must share no characters with the
-//		* start, so e.g. `/*\/` is not considered to be both the start and end of a
-//		* comment. `/*\/xy/*\/` is considered to be a single comment with contents `/xy/`.
-//		*
-//		* The implementation of delimiterReplaceCallback() is slower than hungryDelimiterReplace()
-//		* but uses far less memory. The delimiters are literal strings, not regular expressions.
-//		*
-//		* @param String $startDelim Start delimiter
-//		* @param String $endDelim End delimiter
-//		* @param callable $callback Function to call on each match
-//		* @param String $subject
-//		* @param String $flags Regular expression flags
-//		* @throws InvalidArgumentException
-//		* @return String
-//		*/
-//		static function delimiterReplaceCallback($startDelim, $endDelim, $callback,
-//			$subject, $flags = ''
-//		) {
-//			$inputPos = 0;
-//			$outputPos = 0;
-//			$contentPos = 0;
-//			$output = '';
-//			$foundStart = false;
-//			$encStart = preg_quote($startDelim, '!');
-//			$encEnd = preg_quote($endDelim, '!');
-//			$strcmp = strpos($flags, 'i') === false ? 'strcmp' : 'strcasecmp';
-//			$endLength = strlen($endDelim);
-//			$m = [];
-//
-//			while ($inputPos < strlen($subject) &&
-//				preg_match("!($encStart)|($encEnd)!S$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos)
-//			) {
-//				$tokenOffset = $m[0][1];
-//				if ($m[1][0] != '') {
-//					if ($foundStart &&
-//						$strcmp($endDelim, substr($subject, $tokenOffset, $endLength)) == 0
-//					) {
-//						# An end match is present at the same location
-//						$tokenType = 'end';
-//						$tokenLength = $endLength;
-//					} else {
-//						$tokenType = 'start';
-//						$tokenLength = strlen($m[0][0]);
-//					}
-//				} elseif ($m[2][0] != '') {
-//					$tokenType = 'end';
-//					$tokenLength = strlen($m[0][0]);
-//				} else {
-//					throw new InvalidArgumentException('Invalid delimiter given to ' . __METHOD__);
-//				}
-//
-//				if ($tokenType == 'start') {
-//					# Only move the start position if we haven't already found a start
-//					# This means that START START END matches outer pair
-//					if (!$foundStart) {
-//						# Found start
-//						$inputPos = $tokenOffset + $tokenLength;
-//						# Write out the non-matching section
-//						$output .= substr($subject, $outputPos, $tokenOffset - $outputPos);
-//						$outputPos = $tokenOffset;
-//						$contentPos = $inputPos;
-//						$foundStart = true;
-//					} else {
-//						# Move the input position past the *first character* of START,
-//						# to protect against missing END when it overlaps with START
-//						$inputPos = $tokenOffset + 1;
-//					}
-//				} elseif ($tokenType == 'end') {
-//					if ($foundStart) {
-//						# Found match
-//						$output .= call_user_func($callback, [
-//							substr($subject, $outputPos, $tokenOffset + $tokenLength - $outputPos),
-//							substr($subject, $contentPos, $tokenOffset - $contentPos)
-//						]);
-//						$foundStart = false;
-//					} else {
-//						# Non-matching end, write it out
-//						$output .= substr($subject, $inputPos, $tokenOffset + $tokenLength - $outputPos);
-//					}
-//					$inputPos = $outputPos = $tokenOffset + $tokenLength;
-//				} else {
-//					throw new InvalidArgumentException('Invalid delimiter given to ' . __METHOD__);
-//				}
-//			}
-//			if ($outputPos < strlen($subject)) {
-//				$output .= substr($subject, $outputPos);
-//			}
-//
-//			return $output;
-//		}
-//
-//		/**
-//		* Perform an operation equivalent to `preg_replace()` with flags.
-//		*
-//		* Matches this code:
-//		*
-//		*     preg_replace("!$startDelim(.*)$endDelim!$flags", $replace, $subject);
-//		*
-//		* @param String $startDelim Start delimiter regular expression
-//		* @param String $endDelim End delimiter regular expression
-//		* @param String $replace Replacement String. May contain $1, which will be
-//		*  replaced by the text between the delimiters
-//		* @param String $subject String to search
-//		* @param String $flags Regular expression flags
-//		* @return String The String with the matches replaced
-//		*/
-//		static function delimiterReplace($startDelim, $endDelim, $replace, $subject, $flags = '') {
-//			$replacer = new RegexlikeReplacer($replace);
-//
-//			return self::delimiterReplaceCallback($startDelim, $endDelim,
-//				$replacer->cb(), $subject, $flags);
-//		}
-//
+
+	/**
+	* Perform an operation equivalent to `preg_replace_callback()`
+	*
+	* Matches this code:
+	*
+	*     preg_replace_callback("!$startDelim(.*)$endDelim!s$flags", $callback, $subject);
+	*
+	* If the start delimiter ends with an initial substring of the end delimiter,
+	* e.g. in the case of C-style comments, the behavior differs from the model
+	* regex. In this implementation, the end must share no characters with the
+	* start, so e.g. `/*\/` is not considered to be both the start and end of a
+	* comment. `/*\/xy/*\/` is considered to be a single comment with contents `/xy/`.
+	*
+	* The implementation of delimiterReplaceCallback() is slower than hungryDelimiterReplace()
+	* but uses far less memory. The delimiters are literal strings, not regular expressions.
+	*
+	* @param String $startDelim Start delimiter
+	* @param String $endDelim End delimiter
+	* @param callable $callback Function to call on each match
+	* @param String $subject
+	* @param String $flags Regular expression flags
+	* @throws InvalidArgumentException
+	* @return String
+	*/
+	// XO.MW:flags not supported; MW passes them directly to the regex (EX: "i" does case-insensitive); XO compares delimiters with Bry_.Eq
+	// XO.MW:output is appended to bfr instead of being returned as a new String
+	// XO.MW:throws IllegalArgumentException if bgn or end is empty (MW: InvalidArgumentException)
+	//   bfr      : receives the output; is not cleared by this method
+	//   bgn      : start delimiter (literal bytes); must not be empty
+	//   end      : end delimiter (literal bytes); must not be empty
+	//   callback : called once per matched bgn / end pair; is passed bfr and the range of src between the two delimiters
+	//   src      : bytes to search
+	public static void delimiterReplaceCallback(Bry_bfr bfr, byte[] bgn, byte[] end, XomwReplacer callback,
+		byte[] src
+	) {
+		/* XO.MW.PORTED:
+			MW does following logic
+			* Run start/end regex on subject till no matches
+			* If start/end found, evaluate possible match (handling nesting)
+			* If match found, then pass find-replace pair to callback;
+				find=substr(subject, outputPos, tokenOffset + tokenLength - outputPos)
+				replace=substr(subject, contentPos, tokenOffset - contentPos)
+			  XO passes only the content range (the text between the delimiters) to the callback
+			* If start found but never closed, the start delimiter and everything after it is written out as-is
+				(MW: outputPos stays on the start token; XO: bgnPos)
+			* Also, unnecessary "overlapping" logic: bgn=ab;end=abc
+				$strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) == 0
+		*/
+		int srcLen = src.length;
+		int bgnLen = bgn.length;
+		int endLen = end.length;
+
+		// MW rejects delimiters which cannot produce a token; this also guards the loop below,
+		// which relies on "pos += endLen" to move forward after an end token
+		if (bgnLen == 0 || endLen == 0) {
+			throw new IllegalArgumentException("Invalid delimiter given to XomwStringUtils.delimiterReplaceCallback");
+		}
+
+		int pos = 0;                 // current scan position in src
+		int prv = 0;                 // if !foundStart: start of text not yet written to bfr; if foundStart: start of content (just after bgn)
+		int bgnPos = 0;              // position of the currently unclosed bgn; only meaningful if foundStart
+		boolean foundStart = false;  // true while a bgn is waiting for its end
+
+		while (true) {
+			if (pos >= srcLen) {
+				// EOS: write out whatever is left
+				// if a bgn was never closed, it is not a match, so it must be written out too
+				// EX: "a(b" -> "a(b"; not "ab"
+				bfr.Add_mid(src, foundStart ? bgnPos : prv, srcLen);
+				break;
+			}
+
+			// classify token at pos; bgn is checked before end
+			boolean tokenTypeIsStart;
+			if      (Bry_.Eq(src, pos, pos + bgnLen, bgn)) {
+				tokenTypeIsStart = true;
+			}
+			else if (Bry_.Eq(src, pos, pos + endLen, end)) {
+				tokenTypeIsStart = false;
+			}
+			else {
+				// neither delimiter; keep scanning
+				pos++;
+				continue;
+			}
+
+			if (tokenTypeIsStart) {
+				// Only move the start position if we haven't already found a start
+				// This means that START START END matches outer pair
+				// EX: "(a(b)" has match of "a(b"
+				if (!foundStart) {
+					// Found start
+					// Write out the non-matching section
+					bfr.Add_mid(src, prv, pos);
+					bgnPos = pos;
+					pos += bgnLen;
+					prv = pos;
+					foundStart = true;
+				} else {
+					// Move the input position past the *first character* of START,
+					// to protect against missing END when it overlaps with START
+					pos++;
+				}
+			} else { // elseif (tokenType == 'end')
+				if (foundStart) {
+					// Found match; prv is start of content and pos is start of end delimiter
+					callback.cb(bfr, src, prv, pos);
+					foundStart = false;
+				} else {
+					// Non-matching end, write it out
+					// EX: "a)b" -> "a)"
+					bfr.Add_mid(src, prv, pos + endLen);
+				}
+				pos += endLen;
+				prv = pos;
+			}
+		}
+	}
+
+	/**
+	* Perform an operation equivalent to `preg_replace()` with flags.
+	*
+	* Matches this code:
+	*
+	*     preg_replace("!$startDelim(.*)$endDelim!$flags", $replace, $subject);
+	*
+	* @param String $startDelim Start delimiter regular expression
+	* @param String $endDelim End delimiter regular expression
+	* @param String $replace Replacement String. May contain $1, which will be
+	*  replaced by the text between the delimiters
+	* @param String $subject String to search
+	* @param String $flags Regular expression flags
+	* @return String The String with the matches replaced
+	*/
+	// XO.MW:removed flags=''
+	// XO.MW:result is appended to bfr; nothing is returned
+	// XO.MW:replace is wrapped in an XomwRegexlikeReplacer, which appends it as-is for every match; "$1" is not expanded
+	public static void delimiterReplace(Bry_bfr bfr, byte[] startDelim, byte[] endDelim, byte[] replace, byte[] subject) {
+		delimiterReplaceCallback(bfr, startDelim, endDelim, new XomwRegexlikeReplacer(replace), subject);
+	}
+
 //		/**
 //		* More or less "markup-safe" explode()
 //		* Ignores any instances of the separator inside `<...>`
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/XomwStringUtilsTest.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/XomwStringUtilsTest.java
@@ -19,40 +19,56 @@ public class XomwStringUtilsTest {
 	private final    XomwStringUtilsFxt fxt = new XomwStringUtilsFxt();
 	@Test  public void Delimiter_explode() {
 		// basic
-		fxt.Test__delimiter_explode("a|b|c"                             , "a", "b", "c");
+		fxt.Test_delimiter_explode("a|b|c"                             , "a", "b", "c");
 		// empty
-		fxt.Test__delimiter_explode("|a||c|"                            , "", "a", "", "c", "");
+		fxt.Test_delimiter_explode("|a||c|"                            , "", "a", "", "c", "");
 		// nest_1
-		fxt.Test__delimiter_explode("a|-{b|c}-|d"                       , "a", "-{b|c}-", "d");
+		fxt.Test_delimiter_explode("a|-{b|c}-|d"                       , "a", "-{b|c}-", "d");
 		// nest_many
-		fxt.Test__delimiter_explode("a|-{b-{c|d}-e}-|f"                 , "a", "-{b-{c|d}-e}-", "f");
+		fxt.Test_delimiter_explode("a|-{b-{c|d}-e}-|f"                 , "a", "-{b-{c|d}-e}-", "f");
 	}
 	@Test  public void Replace_markup() {
 		// basic
-		fxt.Test__replace_markup("a!!b"             , "!!", "||", "a||b");
+		fxt.Test_replace_markup("a!!b"             , "!!", "||", "a||b");
 		// missing
-		fxt.Test__replace_markup("abcd"             , "!!", "||", "abcd");
+		fxt.Test_replace_markup("abcd"             , "!!", "||", "abcd");
 		// eos
-		fxt.Test__replace_markup("a!!"              , "!!", "||", "a||");
+		fxt.Test_replace_markup("a!!"              , "!!", "||", "a||");
 		// ignore
-		fxt.Test__replace_markup("a!!b<!!>!!c"      , "!!", "||", "a||b<!!>||c");
+		fxt.Test_replace_markup("a!!b<!!>!!c"      , "!!", "||", "a||b<!!>||c");
 		// ignore asym_lhs
-		fxt.Test__replace_markup("a!!b<!!<!!>!!c"   , "!!", "||", "a||b<!!<!!>||c");
+		fxt.Test_replace_markup("a!!b<!!<!!>!!c"   , "!!", "||", "a||b<!!<!!>||c");
 		// ignore asym_lhs
-		fxt.Test__replace_markup("a!!b<!!>!!>!!c"   , "!!", "||", "a||b<!!>||>||c");	// NOTE: should probably be "!!>!!>", but unmatched ">" are escaped to "&gt;"
+		fxt.Test_replace_markup("a!!b<!!>!!>!!c"   , "!!", "||", "a||b<!!>||>||c");	// NOTE: should probably be "!!>!!>", but unmatched ">" are escaped to "&gt;"
+	}
+	@Test   public void delimiterReplace() {
+		// every case uses C-style comment delimiters and replaces each match with "9"
+		String bgn = "/*", end = "*/", repl = "9";
+
+		// basic; one bgn / end pair
+		fxt.Test_delimiterReplace(bgn, end, "a/*0*/c"   , repl, "a9c");
+		// overlapping; "/*/" is not both bgn and end, so content is "/0/"
+		fxt.Test_delimiterReplace(bgn, end, "a/*/0/*/c" , repl, "a9c");
+		// dangling bgn; "/* /*"; outer bgn is matched
+		fxt.Test_delimiterReplace(bgn, end, "a/*0/*1*/c", repl, "a9c"); // fails if "a/*9c"
+		// dangling end; "*/ */"; unmatched end is written out
+		fxt.Test_delimiterReplace(bgn, end, "a/*0*/1*/c", repl, "a91*/c");
 	}
 }
 class XomwStringUtilsFxt {
-	public void Test__delimiter_explode(String src_str, String... expd) {
+	public void Test_delimiter_explode(String src_str, String... expd) {
 		List_adp tmp = List_adp_.New();
 		gplx.core.btries.Btrie_rv trv = new gplx.core.btries.Btrie_rv();

 		byte[][] actl = XomwStringUtils.delimiterExplode(tmp, trv, Bry_.new_u8(src_str));
 		Gftest.Eq__ary(expd, actl, "src=~{0}", src_str);
 	}
-	public void Test__replace_markup(String src_str, String find, String repl, String expd) {
+	public void Test_replace_markup(String src_str, String find, String repl, String expd) {
 		byte[] src_bry = Bry_.new_u8(src_str);
 		XomwStringUtils.replaceMarkup(src_bry, 0, src_bry.length, Bry_.new_a7(find), Bry_.new_a7(repl));
 		Gftest.Eq__str(expd, src_bry);
 	}
+	// Runs XomwStringUtils.delimiterReplace on src and compares the buffer's contents to expd
+	// NOTE: fixture order is (bgn, end, src, repl) but delimiterReplace takes (startDelim, endDelim, replace, subject)
+	public void Test_delimiterReplace(String bgn, String end, String src, String repl, String expd) {
+		byte[] bgn_bry = Bry_.new_u8(bgn);
+		byte[] end_bry = Bry_.new_u8(end);
+		byte[] repl_bry = Bry_.new_u8(repl);
+		byte[] src_bry = Bry_.new_u8(src);
+
+		Bry_bfr actl_bfr = Bry_bfr_.New();
+		XomwStringUtils.delimiterReplace(actl_bfr, bgn_bry, end_bry, repl_bry, src_bry);
+		Gftest.Eq__str(expd, actl_bfr.To_str_and_clear());
+	}
 }
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/replacers/XomwRegexlikeReplacer.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/replacers/XomwRegexlikeReplacer.java
@@ -0,0 +1,25 @@
+/*
+XOWA: the XOWA Offline Wiki Application
+Copyright (C) 2012-2017 gnosygnu@gmail.com
+
+XOWA is licensed under the terms of the General Public License (GPL) Version 3,
+or alternatively under the terms of the Apache License Version 2.0.
+
+You may use XOWA according to either of these licenses as is most appropriate
+for your project on a case-by-case basis.
+
+The terms of each license can be found in the source code repository:
+
+GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
+Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
+*/
+package gplx.xowa.mediawiki.includes.libs.replacers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.libs.*;
+/**
+* Replacer which appends the same replacement bytes for every match.
+*
+* XO.MW: the replacement is appended as-is; the matched range is ignored, so "$1" is not expanded.
+*/
+public class XomwRegexlikeReplacer implements XomwReplacer {
+	// replacement bytes; set once in ctor and never reassigned
+	private final    byte[] replace;
+	/**
+	* @param replace bytes to append for each match; the array is stored by reference, not copied
+	*/
+	public XomwRegexlikeReplacer(byte[] replace) {
+		this.replace = replace;
+	}
+	/**
+	* Appends the replacement to bfr.
+	*
+	* @param bfr buffer which receives the replacement
+	* @param src source bytes; not used
+	* @param find_bgn start of the matched range in src; not used
+	* @param find_end end of the matched range in src; not used
+	*/
+	public void cb(Bry_bfr bfr, byte[] src, int find_bgn, int find_end) {
+		bfr.Add(replace);
+	}
+}
--- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/replacers/XomwReplacer.java
+++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/replacers/XomwReplacer.java
@@ -0,0 +1,23 @@
+/*
+XOWA: the XOWA Offline Wiki Application
+Copyright (C) 2012-2017 gnosygnu@gmail.com
+
+XOWA is licensed under the terms of the General Public License (GPL) Version 3,
+or alternatively under the terms of the Apache License Version 2.0.
+
+You may use XOWA according to either of these licenses as is most appropriate
+for your project on a case-by-case basis.
+
+The terms of each license can be found in the source code repository:
+
+GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
+Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
+*/
+package gplx.xowa.mediawiki.includes.libs.replacers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.libs.*;
+/**
+* Base class for "replacers", objects used in preg_replace_callback() and
+* StringUtils::delimiterReplaceCallback()
+*
+* XO.MW: declared as an interface; the replacement is written to a buffer instead of being returned
+*/
+public interface XomwReplacer {
+	/**
+	* Called for each match; implementations write the replacement to bfr.
+	*
+	* When called by XomwStringUtils.delimiterReplaceCallback, find_bgn is the position just after
+	* the start delimiter and find_end is the position of the end delimiter, so the range covers
+	* only the text between the two delimiters.
+	*
+	* @param bfr buffer which receives the output
+	* @param src source bytes being searched
+	* @param find_bgn start position of the matched range in src
+	* @param find_end end position of the matched range in src
+	*/
+	void cb(Bry_bfr bfr, byte[] src, int find_bgn, int find_end);
+}