1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2024-10-27 20:34:16 +00:00

Xomw: Add XomwStringUtils.delimiterReplace

This commit is contained in:
gnosygnu 2017-02-23 12:01:49 -05:00
parent 31fcfaf1bd
commit a8c7f27ff5
5 changed files with 198 additions and 136 deletions

View File

@ -17,6 +17,7 @@ package gplx.xowa.mediawiki.includes; import gplx.*; import gplx.xowa.*; import
import gplx.core.brys.*; import gplx.core.btries.*; import gplx.core.encoders.*; import gplx.core.primitives.*; import gplx.langs.htmls.entitys.*;
import gplx.xowa.parsers.htmls.*;
import gplx.langs.htmls.*; import gplx.xowa.mediawiki.includes.htmls.*; import gplx.xowa.mediawiki.includes.parsers.*; import gplx.xowa.mediawiki.includes.utls.*;
import gplx.xowa.mediawiki.includes.libs.*;
public class XomwSanitizer {
private final Mwh_doc_wkr__atr_bldr atr_bldr = new Mwh_doc_wkr__atr_bldr();
private final Mwh_atr_parser atr_parser = new Mwh_atr_parser();
@ -1671,7 +1672,8 @@ public class XomwSanitizer {
*/
public byte[] stripAllTags(byte[] text) {
// Actual <tags>
// $text = StringUtils::delimiterReplace('<', '>', '', $text);
XomwStringUtils.delimiterReplace(tmp_bfr, Byte_ascii.Angle_bgn_bry, Byte_ascii.Angle_end_bry, Bry_.Empty, text);
text = tmp_bfr.To_bry_and_clear();
// Normalize &entities and whitespace
text = decodeCharReferences(null, false, text, 0, text.length);

View File

@ -15,6 +15,8 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.mediawiki.includes.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import gplx.core.btries.*;
import gplx.xowa.mediawiki.includes.utls.*;
import gplx.xowa.mediawiki.includes.libs.replacers.*;
/**
* A collection of static methods to play with strings.
*/
@ -143,129 +145,123 @@ public class XomwStringUtils {
//
// return $output;
// }
//
// /**
// * Perform an operation equivalent to `preg_replace_callback()`
// *
// * Matches this code:
// *
// * preg_replace_callback("!$startDelim(.*)$endDelim!s$flags", $callback, $subject);
// *
// * If the start delimiter ends with an initial substring of the end delimiter,
// * e.g. in the case of C-style comments, the behavior differs from the model
// * regex. In this implementation, the end must share no characters with the
// * start, so e.g. `/*\/` is not considered to be both the start and end of a
// * comment. `/*\/xy/*\/` is considered to be a single comment with contents `/xy/`.
// *
// * The implementation of delimiterReplaceCallback() is slower than hungryDelimiterReplace()
// * but uses far less memory. The delimiters are literal strings, not regular expressions.
// *
// * @param String $startDelim Start delimiter
// * @param String $endDelim End delimiter
// * @param callable $callback Function to call on each match
// * @param String $subject
// * @param String $flags Regular expression flags
// * @throws InvalidArgumentException
// * @return String
// */
// static function delimiterReplaceCallback($startDelim, $endDelim, $callback,
// $subject, $flags = ''
// ) {
// $inputPos = 0;
// $outputPos = 0;
// $contentPos = 0;
// $output = '';
// $foundStart = false;
// $encStart = preg_quote($startDelim, '!');
// $encEnd = preg_quote($endDelim, '!');
// $strcmp = strpos($flags, 'i') === false ? 'strcmp' : 'strcasecmp';
// $endLength = strlen($endDelim);
// $m = [];
//
// while ($inputPos < strlen($subject) &&
// preg_match("!($encStart)|($encEnd)!S$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos)
// ) {
// $tokenOffset = $m[0][1];
// if ($m[1][0] != '') {
// if ($foundStart &&
// $strcmp($endDelim, substr($subject, $tokenOffset, $endLength)) == 0
// ) {
// # An end match is present at the same location
// $tokenType = 'end';
// $tokenLength = $endLength;
// } else {
// $tokenType = 'start';
// $tokenLength = strlen($m[0][0]);
// }
// } elseif ($m[2][0] != '') {
// $tokenType = 'end';
// $tokenLength = strlen($m[0][0]);
// } else {
// throw new InvalidArgumentException('Invalid delimiter given to ' . __METHOD__);
// }
//
// if ($tokenType == 'start') {
// # Only move the start position if we haven't already found a start
// # This means that START START END matches outer pair
// if (!$foundStart) {
// # Found start
// $inputPos = $tokenOffset + $tokenLength;
// # Write out the non-matching section
// $output .= substr($subject, $outputPos, $tokenOffset - $outputPos);
// $outputPos = $tokenOffset;
// $contentPos = $inputPos;
// $foundStart = true;
// } else {
// # Move the input position past the *first character* of START,
// # to protect against missing END when it overlaps with START
// $inputPos = $tokenOffset + 1;
// }
// } elseif ($tokenType == 'end') {
// if ($foundStart) {
// # Found match
// $output .= call_user_func($callback, [
// substr($subject, $outputPos, $tokenOffset + $tokenLength - $outputPos),
// substr($subject, $contentPos, $tokenOffset - $contentPos)
// ]);
// $foundStart = false;
// } else {
// # Non-matching end, write it out
// $output .= substr($subject, $inputPos, $tokenOffset + $tokenLength - $outputPos);
// }
// $inputPos = $outputPos = $tokenOffset + $tokenLength;
// } else {
// throw new InvalidArgumentException('Invalid delimiter given to ' . __METHOD__);
// }
// }
// if ($outputPos < strlen($subject)) {
// $output .= substr($subject, $outputPos);
// }
//
// return $output;
// }
//
// /**
// * Perform an operation equivalent to `preg_replace()` with flags.
// *
// * Matches this code:
// *
// * preg_replace("!$startDelim(.*)$endDelim!$flags", $replace, $subject);
// *
// * @param String $startDelim Start delimiter regular expression
// * @param String $endDelim End delimiter regular expression
// * @param String $replace Replacement String. May contain $1, which will be
// * replaced by the text between the delimiters
// * @param String $subject String to search
// * @param String $flags Regular expression flags
// * @return String The String with the matches replaced
// */
// static function delimiterReplace($startDelim, $endDelim, $replace, $subject, $flags = '') {
// $replacer = new RegexlikeReplacer($replace);
//
// return self::delimiterReplaceCallback($startDelim, $endDelim,
// $replacer->cb(), $subject, $flags);
// }
//
/**
* Perform an operation equivalent to `preg_replace_callback()`
*
* Matches this code:
*
* preg_replace_callback("!$startDelim(.*)$endDelim!s$flags", $callback, $subject);
*
* If the start delimiter ends with an initial substring of the end delimiter,
* e.g. in the case of C-style comments, the behavior differs from the model
* regex. In this implementation, the end must share no characters with the
* start, so e.g. `/*\/` is not considered to be both the start and end of a
* comment. `/*\/xy/*\/` is considered to be a single comment with contents `/xy/`.
*
* The implementation of delimiterReplaceCallback() is slower than hungryDelimiterReplace()
* but uses far less memory. The delimiters are literal strings, not regular expressions.
*
* @param String $startDelim Start delimiter
* @param String $endDelim End delimiter
* @param callable $callback Function to call on each match
* @param String $subject
* @param String $flags Regular expression flags
* @throws InvalidArgumentException
* @return String
*/
// XO.MW:flags not supported; goes directly to regex; also, flags of "i" will do case-insensitive
// XO.MW:scans src for literal bgn / end delimiters and appends the result to bfr
//   bfr      : receives the output; text outside any bgn..end pair is copied through unchanged
//   bgn, end : literal (non-regex) delimiters; must both be non-empty
//   callback : invoked once per matched pair with the range of src between the two delimiters;
//              it is responsible for appending the replacement to bfr
//   src      : text to scan
// throws IllegalArgumentException if either delimiter is empty (MW: InvalidArgumentException)
public static void delimiterReplaceCallback(Bry_bfr bfr, byte[] bgn, byte[] end, XomwReplacer callback,
	byte[] src
) {
	/* XO.MW.PORTED:
	MW does following logic
	* Run start/end regex on subject till no matches
	* If start/end found, evaluate possible match (handling nesting)
	* If match found, then pass find-replace pair to callback;
	  find=substr(subject, outputPos, tokenOffset + tokenLength - outputPos)
	  replace=substr(subject, contentPos, tokenOffset - contentPos)
	* Also, unnecessary "overlapping" logic: bgn=ab;end=abc
	  $strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) == 0
	*/
	int bgnLen = bgn.length;
	int endLen = end.length;
	// An empty delimiter can never form a usable token. An empty end would also "match" at every
	// position without advancing pos, so the loop below would never terminate.
	if (bgnLen == 0 || endLen == 0) {
		throw new IllegalArgumentException("Invalid delimiter given to delimiterReplaceCallback");
	}
	int srcLen = src.length;
	int pos = 0;        // current scan position; MW: $inputPos
	int outputPos = 0;  // start of the part of src not yet written to bfr; MW: $outputPos
	int contentPos = 0; // start of the text between the delimiters; MW: $contentPos
	boolean foundStart = false;
	while (pos < srcLen) {
		boolean tokenTypeIsStart;
		if (Bry_.Eq(src, pos, pos + bgnLen, bgn)) {
			tokenTypeIsStart = true;
		}
		else if (Bry_.Eq(src, pos, pos + endLen, end)) {
			tokenTypeIsStart = false;
		}
		else {
			// neither delimiter at this position; keep scanning
			pos++;
			continue;
		}
		if (tokenTypeIsStart) {
			// Only move the start position if we haven't already found a start
			// This means that START START END matches outer pair
			// EX: "(a(b)" has match of "a(b"
			if (!foundStart) {
				// Found start
				// Write out the non-matching section
				bfr.Add_mid(src, outputPos, pos);
				// Keep outputPos AT the delimiter (not after it): if no END ever follows,
				// the final flush must write the dangling START back out unchanged.
				// EX: "a(b" -> "a(b", not "ab"
				outputPos = pos;
				pos += bgnLen;
				contentPos = pos;
				foundStart = true;
			} else {
				// Move the input position past the *first character* of START,
				// to protect against missing END when it overlaps with START
				pos++;
			}
		} else { // elseif (tokenType == 'end')
			if (foundStart) {
				// Found match; callback gets the text between the delimiters
				callback.cb(bfr, src, contentPos, pos);
				foundStart = false;
			} else {
				// Non-matching end, write it out
				// EX: "a)b" -> "a)"
				bfr.Add_mid(src, outputPos, pos + endLen);
			}
			pos += endLen;
			outputPos = pos;
		}
	}
	// Flush whatever was not consumed by a completed match (includes any unterminated START)
	bfr.Add_mid(src, outputPos, srcLen);
}
/**
* Perform an operation equivalent to `preg_replace()` with flags.
*
* Matches this code:
*
* preg_replace("!$startDelim(.*)$endDelim!$flags", $replace, $subject);
*
* @param String $startDelim Start delimiter regular expression
* @param String $endDelim End delimiter regular expression
* @param String $replace Replacement String. May contain $1, which will be
* replaced by the text between the delimiters
* @param String $subject String to search
* @param String $flags Regular expression flags
* @return String The String with the matches replaced
*/
// XO.MW:removed flags=''
// XO.MW:appends to bfr a copy of subject in which every startDelim..endDelim span has been handed
// to an XomwRegexlikeReplacer built from replace; nothing is returned, the result is left in bfr
public static void delimiterReplace(Bry_bfr bfr, byte[] startDelim, byte[] endDelim, byte[] replace, byte[] subject) {
	// the replacer is only needed for this one call, so build it in place
	delimiterReplaceCallback(bfr, startDelim, endDelim, new XomwRegexlikeReplacer(replace), subject);
}
// /**
// * More or less "markup-safe" explode()
// * Ignores any instances of the separator inside `<...>`

View File

@ -19,40 +19,56 @@ public class XomwStringUtilsTest {
private final XomwStringUtilsFxt fxt = new XomwStringUtilsFxt();
// Exercises XomwStringUtils.delimiterExplode through the fixture: the input is split on "|",
// while "|" characters inside a "-{ ... }-" span (including nested spans) stay part of one piece.
// NOTE(review): every case below is listed twice, once as Test__delimiter_explode and once as
// Test_delimiter_explode; this looks like the before/after lines of a rename shown together --
// confirm which spelling the fixture actually declares.
@Test public void Delimiter_explode() {
// basic
fxt.Test__delimiter_explode("a|b|c" , "a", "b", "c");
fxt.Test_delimiter_explode("a|b|c" , "a", "b", "c");
// empty: leading, trailing and doubled separators each produce an empty piece
fxt.Test__delimiter_explode("|a||c|" , "", "a", "", "c", "");
fxt.Test_delimiter_explode("|a||c|" , "", "a", "", "c", "");
// nest_1: separator inside one "-{ }-" span is not split on
fxt.Test__delimiter_explode("a|-{b|c}-|d" , "a", "-{b|c}-", "d");
fxt.Test_delimiter_explode("a|-{b|c}-|d" , "a", "-{b|c}-", "d");
// nest_many: separator inside a nested "-{ }-" span is not split on
fxt.Test__delimiter_explode("a|-{b-{c|d}-e}-|f" , "a", "-{b-{c|d}-e}-", "f");
fxt.Test_delimiter_explode("a|-{b-{c|d}-e}-|f" , "a", "-{b-{c|d}-e}-", "f");
}
// Exercises XomwStringUtils.replaceMarkup through the fixture: "!!" is replaced by "||",
// except where the expected values below show it being left alone inside "<...>".
// NOTE(review): every case below is listed twice, once as Test__replace_markup and once as
// Test_replace_markup; this looks like the before/after lines of a rename shown together --
// confirm which spelling the fixture actually declares.
@Test public void Replace_markup() {
// basic
fxt.Test__replace_markup("a!!b" , "!!", "||", "a||b");
fxt.Test_replace_markup("a!!b" , "!!", "||", "a||b");
// missing: no occurrence of the search text; input is unchanged
fxt.Test__replace_markup("abcd" , "!!", "||", "abcd");
fxt.Test_replace_markup("abcd" , "!!", "||", "abcd");
// eos: search text is the last thing in the input
fxt.Test__replace_markup("a!!" , "!!", "||", "a||");
fxt.Test_replace_markup("a!!" , "!!", "||", "a||");
// ignore: search text inside "<...>" is left alone
fxt.Test__replace_markup("a!!b<!!>!!c" , "!!", "||", "a||b<!!>||c");
fxt.Test_replace_markup("a!!b<!!>!!c" , "!!", "||", "a||b<!!>||c");
// ignore asym_lhs: extra unmatched "<" before the closing ">"
fxt.Test__replace_markup("a!!b<!!<!!>!!c" , "!!", "||", "a||b<!!<!!>||c");
fxt.Test_replace_markup("a!!b<!!<!!>!!c" , "!!", "||", "a||b<!!<!!>||c");
// ignore asym_rhs: extra unmatched ">" after the closing ">"
fxt.Test__replace_markup("a!!b<!!>!!>!!c" , "!!", "||", "a||b<!!>||>||c"); // NOTE: should probably be "!!>!!>", but unmatched ">" are escaped to "&gt;"
fxt.Test_replace_markup("a!!b<!!>!!>!!c" , "!!", "||", "a||b<!!>||>||c"); // NOTE: should probably be "!!>!!>", but unmatched ">" are escaped to "&gt;"
}
// Exercises XomwStringUtils.delimiterReplace with C-style comment delimiters;
// every matched span is expected to be replaced by "9".
@Test public void delimiterReplace() {
	String bgn = "/*";
	String end = "*/";
	String repl = "9";
	// basic
	fxt.Test_delimiterReplace(bgn, end, "a/*0*/c", repl, "a9c");
	// overlapping; "/*/" is not treated as both a start and an end
	fxt.Test_delimiterReplace(bgn, end, "a/*/0/*/c", repl, "a9c");
	// dangling bgn; "/* /*" -> the outer start pairs with the end
	fxt.Test_delimiterReplace(bgn, end, "a/*0/*1*/c", repl, "a9c"); // fails if "a/*9c"
	// dangling end; "*/ */" -> the second, unmatched end is written out as-is
	fxt.Test_delimiterReplace(bgn, end, "a/*0*/1*/c", repl, "a91*/c");
}
}
// Test fixture for XomwStringUtilsTest: each method converts its String arguments to byte arrays,
// calls one XomwStringUtils method, and compares the outcome with the expected value via Gftest.
// NOTE(review): two method headers below appear twice in a row (Test__x immediately followed by
// Test_x); this looks like the before/after lines of a rename shown together -- confirm which
// single header belongs in the file before compiling.
class XomwStringUtilsFxt {
public void Test__delimiter_explode(String src_str, String... expd) {
public void Test_delimiter_explode(String src_str, String... expd) {
// scratch objects passed to delimiterExplode
List_adp tmp = List_adp_.New();
gplx.core.btries.Btrie_rv trv = new gplx.core.btries.Btrie_rv();
byte[][] actl = XomwStringUtils.delimiterExplode(tmp, trv, Bry_.new_u8(src_str));
Gftest.Eq__ary(expd, actl, "src=~{0}", src_str);
}
public void Test__replace_markup(String src_str, String find, String repl, String expd) {
public void Test_replace_markup(String src_str, String find, String repl, String expd) {
byte[] src_bry = Bry_.new_u8(src_str);
XomwStringUtils.replaceMarkup(src_bry, 0, src_bry.length, Bry_.new_a7(find), Bry_.new_a7(repl));
// the source array itself is compared, so replaceMarkup is expected to have modified it in place
Gftest.Eq__str(expd, src_bry);
}
// Argument order of XomwStringUtils.delimiterReplace after the buffer:
// byte[] startDelim, byte[] endDelim, byte[] replace, byte[] subject
public void Test_delimiterReplace(String bgn, String end, String src, String repl, String expd) {
Bry_bfr bfr = Bry_bfr_.New();
XomwStringUtils.delimiterReplace(bfr, Bry_.new_u8(bgn), Bry_.new_u8(end), Bry_.new_u8(repl), Bry_.new_u8(src));
// delimiterReplace returns nothing; the result is read back out of the buffer
Gftest.Eq__str(expd, bfr.To_str_and_clear());
}
}

View File

@ -0,0 +1,25 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.mediawiki.includes.libs.replacers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.libs.*;
/**
* Replacer that writes a fixed replacement for every match.
*
* As in preg_replace(), the replacement may contain "$1", which is replaced by the matched
* content, i.e. the bytes of src between find_bgn and find_end passed to cb().
* XO.MW: only "$1" is expanded; "$0" (the whole match including delimiters) is written
* literally because cb() is not given the delimiter positions.
*/
public class XomwRegexlikeReplacer implements XomwReplacer {
	private static final byte BYTE_DOLLAR = (byte)'$';
	private static final byte BYTE_ONE = (byte)'1';
	private final byte[] replace;
	/**
	* @param replace replacement bytes written for each match; may contain "$1"
	*/
	public XomwRegexlikeReplacer(byte[] replace) {
		this.replace = replace;
	}
	/**
	* Appends the replacement for one match to bfr.
	*
	* @param bfr buffer receiving the output
	* @param src text being scanned
	* @param find_bgn start (inclusive) of the matched content in src
	* @param find_end end (exclusive) of the matched content in src
	*/
	public void cb(Bry_bfr bfr, byte[] src, int find_bgn, int find_end) {
		int ref = indexOfContentRef(replace, 0);
		if (ref == -1) {
			// common case: no "$1" in the replacement; write it as is
			bfr.Add(replace);
			return;
		}
		int prv = 0;
		while (ref != -1) {
			bfr.Add_mid(replace, prv, ref);       // literal part before "$1"
			bfr.Add_mid(src, find_bgn, find_end); // "$1" -> matched content
			prv = ref + 2;                        // skip the two bytes of "$1"
			ref = indexOfContentRef(replace, prv);
		}
		bfr.Add_mid(replace, prv, replace.length); // literal part after the last "$1"
	}
	/** Returns the index of the next "$1" in bry at or after bgn, or -1 if there is none (or bry is null). */
	private static int indexOfContentRef(byte[] bry, int bgn) {
		if (bry == null) return -1;
		int last = bry.length - 1; // a "$" in the final position cannot start "$1"
		for (int i = bgn; i < last; i++) {
			if (bry[i] == BYTE_DOLLAR && bry[i + 1] == BYTE_ONE) return i;
		}
		return -1;
	}
}

View File

@ -0,0 +1,23 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.mediawiki.includes.libs.replacers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.libs.*;
/**
* Base class for "replacers", objects used in preg_replace_callback() and
* StringUtils::delimiterReplaceCallback()
*/
public interface XomwReplacer {
/**
* Called once per match; the implementation appends the replacement for that match to bfr.
*
* @param bfr buffer that receives the replacement
* @param src text being scanned
* @param find_bgn start of the match range in src; XomwStringUtils.delimiterReplaceCallback passes
*   the position just after the start delimiter
* @param find_end end of the match range in src; XomwStringUtils.delimiterReplaceCallback passes
*   the position of the end delimiter
*/
void cb(Bry_bfr bfr, byte[] src, int find_bgn, int find_end);
}