From a8c7f27ff5266f15066eacddc3b5466a38864583 Mon Sep 17 00:00:00 2001 From: gnosygnu Date: Thu, 23 Feb 2017 12:01:49 -0500 Subject: [PATCH] Xomw: Add XomwStringUtils.delimiterReplace --- .../mediawiki/includes/XomwSanitizer.java | 4 +- .../includes/libs/XomwStringUtils.java | 242 +++++++++--------- .../includes/libs/XomwStringUtilsTest.java | 40 ++- .../libs/replacers/XomwRegexlikeReplacer.java | 25 ++ .../includes/libs/replacers/XomwReplacer.java | 23 ++ 5 files changed, 198 insertions(+), 136 deletions(-) create mode 100644 gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/replacers/XomwRegexlikeReplacer.java create mode 100644 gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/replacers/XomwReplacer.java diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/XomwSanitizer.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/XomwSanitizer.java index 4624adcc0..385cf872a 100644 --- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/XomwSanitizer.java +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/XomwSanitizer.java @@ -17,6 +17,7 @@ package gplx.xowa.mediawiki.includes; import gplx.*; import gplx.xowa.*; import import gplx.core.brys.*; import gplx.core.btries.*; import gplx.core.encoders.*; import gplx.core.primitives.*; import gplx.langs.htmls.entitys.*; import gplx.xowa.parsers.htmls.*; import gplx.langs.htmls.*; import gplx.xowa.mediawiki.includes.htmls.*; import gplx.xowa.mediawiki.includes.parsers.*; import gplx.xowa.mediawiki.includes.utls.*; +import gplx.xowa.mediawiki.includes.libs.*; public class XomwSanitizer { private final Mwh_doc_wkr__atr_bldr atr_bldr = new Mwh_doc_wkr__atr_bldr(); private final Mwh_atr_parser atr_parser = new Mwh_atr_parser(); @@ -1671,7 +1672,8 @@ public class XomwSanitizer { */ public byte[] stripAllTags(byte[] text) { // Actual -// $text = StringUtils::delimiterReplace('<', '>', '', $text); + XomwStringUtils.delimiterReplace(tmp_bfr, Byte_ascii.Angle_bgn_bry, Byte_ascii.Angle_end_bry, Bry_.Empty, text); + text = tmp_bfr.To_bry_and_clear(); // Normalize &entities and whitespace text = decodeCharReferences(null, false, text, 0, text.length); diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/XomwStringUtils.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/XomwStringUtils.java index bdd7dee35..0dee0a10f 100644 --- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/XomwStringUtils.java +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/XomwStringUtils.java @@ -15,6 +15,8 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt */ package gplx.xowa.mediawiki.includes.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.core.btries.*; +import gplx.xowa.mediawiki.includes.utls.*; +import gplx.xowa.mediawiki.includes.libs.replacers.*; /** * A collection of static methods to play with strings. */ @@ -143,129 +145,123 @@ public class XomwStringUtils { // // return $output; // } -// -// /** -// * Perform an operation equivalent to `preg_replace_callback()` -// * -// * Matches this code: -// * -// * preg_replace_callback("!$startDelim(.*)$endDelim!s$flags", $callback, $subject); -// * -// * If the start delimiter ends with an initial substring of the end delimiter, -// * e.g. in the case of C-style comments, the behavior differs from the model -// * regex. In this implementation, the end must share no characters with the -// * start, so e.g. `/*\/` is not considered to be both the start and end of a -// * comment. `/*\/xy/*\/` is considered to be a single comment with contents `/xy/`. -// * -// * The implementation of delimiterReplaceCallback() is slower than hungryDelimiterReplace() -// * but uses far less memory. The delimiters are literal strings, not regular expressions. -// * -// * @param String $startDelim Start delimiter -// * @param String $endDelim End delimiter -// * @param callable $callback Function to call on each match -// * @param String $subject -// * @param String $flags Regular expression flags -// * @throws InvalidArgumentException -// * @return String -// */ -// static function delimiterReplaceCallback($startDelim, $endDelim, $callback, -// $subject, $flags = '' -// ) { -// $inputPos = 0; -// $outputPos = 0; -// $contentPos = 0; -// $output = ''; -// $foundStart = false; -// $encStart = preg_quote($startDelim, '!'); -// $encEnd = preg_quote($endDelim, '!'); -// $strcmp = strpos($flags, 'i') === false ? 'strcmp' : 'strcasecmp'; -// $endLength = strlen($endDelim); -// $m = []; -// -// while ($inputPos < strlen($subject) && -// preg_match("!($encStart)|($encEnd)!S$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos) -// ) { -// $tokenOffset = $m[0][1]; -// if ($m[1][0] != '') { -// if ($foundStart && -// $strcmp($endDelim, substr($subject, $tokenOffset, $endLength)) == 0 -// ) { -// # An end match is present at the same location -// $tokenType = 'end'; -// $tokenLength = $endLength; -// } else { -// $tokenType = 'start'; -// $tokenLength = strlen($m[0][0]); -// } -// } elseif ($m[2][0] != '') { -// $tokenType = 'end'; -// $tokenLength = strlen($m[0][0]); -// } else { -// throw new InvalidArgumentException('Invalid delimiter given to ' . __METHOD__); -// } -// -// if ($tokenType == 'start') { -// # Only move the start position if we haven't already found a start -// # This means that START START END matches outer pair -// if (!$foundStart) { -// # Found start -// $inputPos = $tokenOffset + $tokenLength; -// # Write out the non-matching section -// $output .= substr($subject, $outputPos, $tokenOffset - $outputPos); -// $outputPos = $tokenOffset; -// $contentPos = $inputPos; -// $foundStart = true; -// } else { -// # Move the input position past the *first character* of START, -// # to protect against missing END when it overlaps with START -// $inputPos = $tokenOffset + 1; -// } -// } elseif ($tokenType == 'end') { -// if ($foundStart) { -// # Found match -// $output .= call_user_func($callback, [ -// substr($subject, $outputPos, $tokenOffset + $tokenLength - $outputPos), -// substr($subject, $contentPos, $tokenOffset - $contentPos) -// ]); -// $foundStart = false; -// } else { -// # Non-matching end, write it out -// $output .= substr($subject, $inputPos, $tokenOffset + $tokenLength - $outputPos); -// } -// $inputPos = $outputPos = $tokenOffset + $tokenLength; -// } else { -// throw new InvalidArgumentException('Invalid delimiter given to ' . __METHOD__); -// } -// } -// if ($outputPos < strlen($subject)) { -// $output .= substr($subject, $outputPos); -// } -// -// return $output; -// } -// -// /** -// * Perform an operation equivalent to `preg_replace()` with flags. -// * -// * Matches this code: -// * -// * preg_replace("!$startDelim(.*)$endDelim!$flags", $replace, $subject); -// * -// * @param String $startDelim Start delimiter regular expression -// * @param String $endDelim End delimiter regular expression -// * @param String $replace Replacement String. May contain $1, which will be -// * replaced by the text between the delimiters -// * @param String $subject String to search -// * @param String $flags Regular expression flags -// * @return String The String with the matches replaced -// */ -// static function delimiterReplace($startDelim, $endDelim, $replace, $subject, $flags = '') { -// $replacer = new RegexlikeReplacer($replace); -// -// return self::delimiterReplaceCallback($startDelim, $endDelim, -// $replacer->cb(), $subject, $flags); -// } -// + + /** + * Perform an operation equivalent to `preg_replace_callback()` + * + * Matches this code: + * + * preg_replace_callback("!$startDelim(.*)$endDelim!s$flags", $callback, $subject); + * + * If the start delimiter ends with an initial substring of the end delimiter, + * e.g. in the case of C-style comments, the behavior differs from the model + * regex. In this implementation, the end must share no characters with the + * start, so e.g. `/*\/` is not considered to be both the start and end of a + * comment. `/*\/xy/*\/` is considered to be a single comment with contents `/xy/`. + * + * The implementation of delimiterReplaceCallback() is slower than hungryDelimiterReplace() + * but uses far less memory. The delimiters are literal strings, not regular expressions. + * + * @param String $startDelim Start delimiter + * @param String $endDelim End delimiter + * @param callable $callback Function to call on each match + * @param String $subject + * @param String $flags Regular expression flags + * @throws InvalidArgumentException + * @return String + */ + // XO.MW:flags not supported; goes directly to regex; also, flags of "i" will do case-insensitive + public static void delimiterReplaceCallback(Bry_bfr bfr, byte[] bgn, byte[] end, XomwReplacer callback, + byte[] src + ) { + /* XO.MW.PORTED: + MW does following logic + * Run start/end regex on subject till no matches + * If start/end found, evaluate possible match (handling nesting) + * If match found, then pass find-replace pair to callback; + find=substr(subject, outputPos, tokenOffset + tokenLength - outputPos) + replace=substr(subject, contentPos, tokenOffset - contentPos) + * Also, unnecessary "overlapping" logic: bgn=ab;end=abc + $strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) == 0 + */ + int pos = 0; + int prv = 0; + int srcLen = src.length; + int bgnLen = bgn.length; + int endLen = end.length; + boolean foundStart = false; + boolean tokenTypeIsStart = false; + + while (true) { + if (pos >= srcLen) { + bfr.Add_mid(src, prv, srcLen); + break; + } + if (Bry_.Eq(src, pos, pos + bgnLen, bgn)) { + tokenTypeIsStart = true; + } + else if (Bry_.Eq(src, pos, pos + endLen, end)) { + tokenTypeIsStart = false; + } + else { + pos++; + continue; + } + + if (tokenTypeIsStart) { + // Only move the start position if we haven't already found a start + // This means that START START END matches outer pair + // EX: "(a(b)" has match of "a(b" + if (!foundStart) { + // Found start + // Write out the non-matching section + bfr.Add_mid(src, prv, pos); + pos += bgnLen; + prv = pos; + foundStart = true; + } else { + // Move the input position past the *first character* of START, + // to protect against missing END when it overlaps with START + pos++; + } + } else { // elseif (tokenType == 'end') + if (foundStart) { + // Found match + callback.cb(bfr, src, prv, pos); + foundStart = false; + } else { + // Non-matching end, write it out + // EX: "a)b" -> "a)" + bfr.Add_mid(src, prv, pos + endLen); + } + pos += endLen; + prv = pos; + } + } + } + + /** + * Perform an operation equivalent to `preg_replace()` with flags. + * + * Matches this code: + * + * preg_replace("!$startDelim(.*)$endDelim!$flags", $replace, $subject); + * + * @param String $startDelim Start delimiter regular expression + * @param String $endDelim End delimiter regular expression + * @param String $replace Replacement String. May contain $1, which will be + * replaced by the text between the delimiters + * @param String $subject String to search + * @param String $flags Regular expression flags + * @return String The String with the matches replaced + */ + // XO.MW:removed flags='' + public static void delimiterReplace(Bry_bfr bfr, byte[] startDelim, byte[] endDelim, byte[] replace, byte[] subject) { + XomwRegexlikeReplacer replacer = new XomwRegexlikeReplacer(replace); + + delimiterReplaceCallback(bfr, startDelim, endDelim, replacer, subject); + } + // /** // * More or less "markup-safe" explode() // * Ignores any instances of the separator inside `<...>` diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/XomwStringUtilsTest.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/XomwStringUtilsTest.java index cbbb4fbf7..e281f0644 100644 --- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/XomwStringUtilsTest.java +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/XomwStringUtilsTest.java @@ -19,40 +19,56 @@ public class XomwStringUtilsTest { private final XomwStringUtilsFxt fxt = new XomwStringUtilsFxt(); @Test public void Delimiter_explode() { // basic - fxt.Test__delimiter_explode("a|b|c" , "a", "b", "c"); + fxt.Test_delimiter_explode("a|b|c" , "a", "b", "c"); // empty - fxt.Test__delimiter_explode("|a||c|" , "", "a", "", "c", ""); + fxt.Test_delimiter_explode("|a||c|" , "", "a", "", "c", ""); // nest_1 - fxt.Test__delimiter_explode("a|-{b|c}-|d" , "a", "-{b|c}-", "d"); + fxt.Test_delimiter_explode("a|-{b|c}-|d" , "a", "-{b|c}-", "d"); // nest_many - fxt.Test__delimiter_explode("a|-{b-{c|d}-e}-|f" , "a", "-{b-{c|d}-e}-", "f"); + fxt.Test_delimiter_explode("a|-{b-{c|d}-e}-|f" , "a", "-{b-{c|d}-e}-", "f"); } @Test public void Replace_markup() { // basic - fxt.Test__replace_markup("a!!b" , "!!", "||", "a||b"); + fxt.Test_replace_markup("a!!b" , "!!", "||", "a||b"); // missing - fxt.Test__replace_markup("abcd" , "!!", "||", "abcd"); + fxt.Test_replace_markup("abcd" , "!!", "||", "abcd"); // eos - fxt.Test__replace_markup("a!!" , "!!", "||", "a||"); + fxt.Test_replace_markup("a!!" , "!!", "||", "a||"); // ignore - fxt.Test__replace_markup("a!!b!!c" , "!!", "||", "a||b||c"); + fxt.Test_replace_markup("a!!b!!c" , "!!", "||", "a||b||c"); // ignore asym_lhs - fxt.Test__replace_markup("a!!b!!c" , "!!", "||", "a||b||c"); + fxt.Test_replace_markup("a!!b!!c" , "!!", "||", "a||b||c"); // ignore asym_lhs - fxt.Test__replace_markup("a!!b!!>!!c" , "!!", "||", "a||b||>||c"); // NOTE: should probably be "!!>!!>", but unmatched ">" are escaped to ">" + fxt.Test_replace_markup("a!!b!!>!!c" , "!!", "||", "a||b||>||c"); // NOTE: should probably be "!!>!!>", but unmatched ">" are escaped to ">" + } + @Test public void delimiterReplace() { + // basic + fxt.Test_delimiterReplace("/*", "*/", "a/*0*/c" , "9", "a9c"); + // overlapping; "/*/" + fxt.Test_delimiterReplace("/*", "*/", "a/*/0/*/c" , "9", "a9c"); + // dangling bgn; "/* /*" + fxt.Test_delimiterReplace("/*", "*/", "a/*0/*1*/c" , "9", "a9c"); // fails if "a/*9c" + // dangling end; "*/ */" + fxt.Test_delimiterReplace("/*", "*/", "a/*0*/1*/c" , "9", "a91*/c"); } } class XomwStringUtilsFxt { - public void Test__delimiter_explode(String src_str, String... expd) { + public void Test_delimiter_explode(String src_str, String... expd) { List_adp tmp = List_adp_.New(); gplx.core.btries.Btrie_rv trv = new gplx.core.btries.Btrie_rv(); byte[][] actl = XomwStringUtils.delimiterExplode(tmp, trv, Bry_.new_u8(src_str)); Gftest.Eq__ary(expd, actl, "src=~{0}", src_str); } - public void Test__replace_markup(String src_str, String find, String repl, String expd) { + public void Test_replace_markup(String src_str, String find, String repl, String expd) { byte[] src_bry = Bry_.new_u8(src_str); XomwStringUtils.replaceMarkup(src_bry, 0, src_bry.length, Bry_.new_a7(find), Bry_.new_a7(repl)); Gftest.Eq__str(expd, src_bry); } + // byte[] startDelim, byte[] endDelim, byte[] replace, byte[] subject + public void Test_delimiterReplace(String bgn, String end, String src, String repl, String expd) { + Bry_bfr bfr = Bry_bfr_.New(); + XomwStringUtils.delimiterReplace(bfr, Bry_.new_u8(bgn), Bry_.new_u8(end), Bry_.new_u8(repl), Bry_.new_u8(src)); + Gftest.Eq__str(expd, bfr.To_str_and_clear()); + } } diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/replacers/XomwRegexlikeReplacer.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/replacers/XomwRegexlikeReplacer.java new file mode 100644 index 000000000..d0e59c9f6 --- /dev/null +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/replacers/XomwRegexlikeReplacer.java @@ -0,0 +1,25 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012-2017 gnosygnu@gmail.com + +XOWA is licensed under the terms of the General Public License (GPL) Version 3, +or alternatively under the terms of the Apache License Version 2.0. + +You may use XOWA according to either of these licenses as is most appropriate +for your project on a case-by-case basis. + +The terms of each license can be found in the source code repository: + +GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt +Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt +*/ +package gplx.xowa.mediawiki.includes.libs.replacers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.libs.*; +public class XomwRegexlikeReplacer implements XomwReplacer { + private byte[] replace; + public XomwRegexlikeReplacer(byte[] replace) { + this.replace = replace; + } + public void cb(Bry_bfr bfr, byte[] src, int find_bgn, int find_end) { + bfr.Add(replace); + } +} diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/replacers/XomwReplacer.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/replacers/XomwReplacer.java new file mode 100644 index 000000000..0a82a8bb4 --- /dev/null +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/libs/replacers/XomwReplacer.java @@ -0,0 +1,23 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012-2017 gnosygnu@gmail.com + +XOWA is licensed under the terms of the General Public License (GPL) Version 3, +or alternatively under the terms of the Apache License Version 2.0. + +You may use XOWA according to either of these licenses as is most appropriate +for your project on a case-by-case basis. + +The terms of each license can be found in the source code repository: + +GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt +Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt +*/ +package gplx.xowa.mediawiki.includes.libs.replacers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.libs.*; +/** +* Base class for "replacers", objects used in preg_replace_callback() and +* StringUtils::delimiterReplaceCallback() +*/ +public interface XomwReplacer { + void cb(Bry_bfr bfr, byte[] src, int find_bgn, int find_end); +}