mirror of
https://github.com/gnosygnu/xowa.git
synced 2024-10-27 20:34:16 +00:00
Xomw: Add XomwStringUtils.delimiterReplace
This commit is contained in:
parent
31fcfaf1bd
commit
a8c7f27ff5
@ -17,6 +17,7 @@ package gplx.xowa.mediawiki.includes; import gplx.*; import gplx.xowa.*; import
|
|||||||
import gplx.core.brys.*; import gplx.core.btries.*; import gplx.core.encoders.*; import gplx.core.primitives.*; import gplx.langs.htmls.entitys.*;
|
import gplx.core.brys.*; import gplx.core.btries.*; import gplx.core.encoders.*; import gplx.core.primitives.*; import gplx.langs.htmls.entitys.*;
|
||||||
import gplx.xowa.parsers.htmls.*;
|
import gplx.xowa.parsers.htmls.*;
|
||||||
import gplx.langs.htmls.*; import gplx.xowa.mediawiki.includes.htmls.*; import gplx.xowa.mediawiki.includes.parsers.*; import gplx.xowa.mediawiki.includes.utls.*;
|
import gplx.langs.htmls.*; import gplx.xowa.mediawiki.includes.htmls.*; import gplx.xowa.mediawiki.includes.parsers.*; import gplx.xowa.mediawiki.includes.utls.*;
|
||||||
|
import gplx.xowa.mediawiki.includes.libs.*;
|
||||||
public class XomwSanitizer {
|
public class XomwSanitizer {
|
||||||
private final Mwh_doc_wkr__atr_bldr atr_bldr = new Mwh_doc_wkr__atr_bldr();
|
private final Mwh_doc_wkr__atr_bldr atr_bldr = new Mwh_doc_wkr__atr_bldr();
|
||||||
private final Mwh_atr_parser atr_parser = new Mwh_atr_parser();
|
private final Mwh_atr_parser atr_parser = new Mwh_atr_parser();
|
||||||
@ -1671,7 +1672,8 @@ public class XomwSanitizer {
|
|||||||
*/
|
*/
|
||||||
public byte[] stripAllTags(byte[] text) {
|
public byte[] stripAllTags(byte[] text) {
|
||||||
// Actual <tags>
|
// Actual <tags>
|
||||||
// $text = StringUtils::delimiterReplace('<', '>', '', $text);
|
XomwStringUtils.delimiterReplace(tmp_bfr, Byte_ascii.Angle_bgn_bry, Byte_ascii.Angle_end_bry, Bry_.Empty, text);
|
||||||
|
text = tmp_bfr.To_bry_and_clear();
|
||||||
|
|
||||||
// Normalize &entities and whitespace
|
// Normalize &entities and whitespace
|
||||||
text = decodeCharReferences(null, false, text, 0, text.length);
|
text = decodeCharReferences(null, false, text, 0, text.length);
|
||||||
|
@ -15,6 +15,8 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
|||||||
*/
|
*/
|
||||||
package gplx.xowa.mediawiki.includes.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
|
package gplx.xowa.mediawiki.includes.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
|
||||||
import gplx.core.btries.*;
|
import gplx.core.btries.*;
|
||||||
|
import gplx.xowa.mediawiki.includes.utls.*;
|
||||||
|
import gplx.xowa.mediawiki.includes.libs.replacers.*;
|
||||||
/**
|
/**
|
||||||
* A collection of static methods to play with strings.
|
* A collection of static methods to play with strings.
|
||||||
*/
|
*/
|
||||||
@ -143,129 +145,123 @@ public class XomwStringUtils {
|
|||||||
//
|
//
|
||||||
// return $output;
|
// return $output;
|
||||||
// }
|
// }
|
||||||
//
|
|
||||||
// /**
|
/**
|
||||||
// * Perform an operation equivalent to `preg_replace_callback()`
|
* Perform an operation equivalent to `preg_replace_callback()`
|
||||||
// *
|
*
|
||||||
// * Matches this code:
|
* Matches this code:
|
||||||
// *
|
*
|
||||||
// * preg_replace_callback("!$startDelim(.*)$endDelim!s$flags", $callback, $subject);
|
* preg_replace_callback("!$startDelim(.*)$endDelim!s$flags", $callback, $subject);
|
||||||
// *
|
*
|
||||||
// * If the start delimiter ends with an initial substring of the end delimiter,
|
* If the start delimiter ends with an initial substring of the end delimiter,
|
||||||
// * e.g. in the case of C-style comments, the behavior differs from the model
|
* e.g. in the case of C-style comments, the behavior differs from the model
|
||||||
// * regex. In this implementation, the end must share no characters with the
|
* regex. In this implementation, the end must share no characters with the
|
||||||
// * start, so e.g. `/*\/` is not considered to be both the start and end of a
|
* start, so e.g. `/*\/` is not considered to be both the start and end of a
|
||||||
// * comment. `/*\/xy/*\/` is considered to be a single comment with contents `/xy/`.
|
* comment. `/*\/xy/*\/` is considered to be a single comment with contents `/xy/`.
|
||||||
// *
|
*
|
||||||
// * The implementation of delimiterReplaceCallback() is slower than hungryDelimiterReplace()
|
* The implementation of delimiterReplaceCallback() is slower than hungryDelimiterReplace()
|
||||||
// * but uses far less memory. The delimiters are literal strings, not regular expressions.
|
* but uses far less memory. The delimiters are literal strings, not regular expressions.
|
||||||
// *
|
*
|
||||||
// * @param String $startDelim Start delimiter
|
* @param String $startDelim Start delimiter
|
||||||
// * @param String $endDelim End delimiter
|
* @param String $endDelim End delimiter
|
||||||
// * @param callable $callback Function to call on each match
|
* @param callable $callback Function to call on each match
|
||||||
// * @param String $subject
|
* @param String $subject
|
||||||
// * @param String $flags Regular expression flags
|
* @param String $flags Regular expression flags
|
||||||
// * @throws InvalidArgumentException
|
* @throws InvalidArgumentException
|
||||||
// * @return String
|
* @return String
|
||||||
// */
|
*/
|
||||||
// static function delimiterReplaceCallback($startDelim, $endDelim, $callback,
|
// XO.MW:flags not supported; goes directly to regex; also, flags of "i" will do case-insensitive
|
||||||
// $subject, $flags = ''
|
public static void delimiterReplaceCallback(Bry_bfr bfr, byte[] bgn, byte[] end, XomwReplacer callback,
|
||||||
// ) {
|
byte[] src
|
||||||
// $inputPos = 0;
|
) {
|
||||||
// $outputPos = 0;
|
/* XO.MW.PORTED:
|
||||||
// $contentPos = 0;
|
MW does following logic
|
||||||
// $output = '';
|
* Run start/end regex on subject till no matches
|
||||||
// $foundStart = false;
|
* If start/end found, evaluate possible match (handling nesting)
|
||||||
// $encStart = preg_quote($startDelim, '!');
|
* If match found, then pass find-replace pair to callback;
|
||||||
// $encEnd = preg_quote($endDelim, '!');
|
find=substr(subject, outputPos, tokenOffset + tokenLength - outputPos)
|
||||||
// $strcmp = strpos($flags, 'i') === false ? 'strcmp' : 'strcasecmp';
|
replace=substr(subject, contentPos, tokenOffset - contentPos)
|
||||||
// $endLength = strlen($endDelim);
|
* Also, unnecessary "overlapping" logic: bgn=ab;end=abc
|
||||||
// $m = [];
|
$strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) == 0
|
||||||
//
|
*/
|
||||||
// while ($inputPos < strlen($subject) &&
|
int pos = 0;
|
||||||
// preg_match("!($encStart)|($encEnd)!S$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos)
|
int prv = 0;
|
||||||
// ) {
|
int srcLen = src.length;
|
||||||
// $tokenOffset = $m[0][1];
|
int bgnLen = bgn.length;
|
||||||
// if ($m[1][0] != '') {
|
int endLen = end.length;
|
||||||
// if ($foundStart &&
|
boolean foundStart = false;
|
||||||
// $strcmp($endDelim, substr($subject, $tokenOffset, $endLength)) == 0
|
boolean tokenTypeIsStart = false;
|
||||||
// ) {
|
|
||||||
// # An end match is present at the same location
|
while (true) {
|
||||||
// $tokenType = 'end';
|
if (pos >= srcLen) {
|
||||||
// $tokenLength = $endLength;
|
bfr.Add_mid(src, prv, srcLen);
|
||||||
// } else {
|
break;
|
||||||
// $tokenType = 'start';
|
}
|
||||||
// $tokenLength = strlen($m[0][0]);
|
if (Bry_.Eq(src, pos, pos + bgnLen, bgn)) {
|
||||||
// }
|
tokenTypeIsStart = true;
|
||||||
// } elseif ($m[2][0] != '') {
|
}
|
||||||
// $tokenType = 'end';
|
else if (Bry_.Eq(src, pos, pos + endLen, end)) {
|
||||||
// $tokenLength = strlen($m[0][0]);
|
tokenTypeIsStart = false;
|
||||||
// } else {
|
}
|
||||||
// throw new InvalidArgumentException('Invalid delimiter given to ' . __METHOD__);
|
else {
|
||||||
// }
|
pos++;
|
||||||
//
|
continue;
|
||||||
// if ($tokenType == 'start') {
|
}
|
||||||
// # Only move the start position if we haven't already found a start
|
|
||||||
// # This means that START START END matches outer pair
|
if (tokenTypeIsStart) {
|
||||||
// if (!$foundStart) {
|
// Only move the start position if we haven't already found a start
|
||||||
// # Found start
|
// This means that START START END matches outer pair
|
||||||
// $inputPos = $tokenOffset + $tokenLength;
|
// EX: "(a(b)" has match of "a(b"
|
||||||
// # Write out the non-matching section
|
if (!foundStart) {
|
||||||
// $output .= substr($subject, $outputPos, $tokenOffset - $outputPos);
|
// Found start
|
||||||
// $outputPos = $tokenOffset;
|
// Write out the non-matching section
|
||||||
// $contentPos = $inputPos;
|
bfr.Add_mid(src, prv, pos);
|
||||||
// $foundStart = true;
|
pos += bgnLen;
|
||||||
// } else {
|
prv = pos;
|
||||||
// # Move the input position past the *first character* of START,
|
foundStart = true;
|
||||||
// # to protect against missing END when it overlaps with START
|
} else {
|
||||||
// $inputPos = $tokenOffset + 1;
|
// Move the input position past the *first character* of START,
|
||||||
// }
|
// to protect against missing END when it overlaps with START
|
||||||
// } elseif ($tokenType == 'end') {
|
pos++;
|
||||||
// if ($foundStart) {
|
}
|
||||||
// # Found match
|
} else { // elseif (tokenType == 'end')
|
||||||
// $output .= call_user_func($callback, [
|
if (foundStart) {
|
||||||
// substr($subject, $outputPos, $tokenOffset + $tokenLength - $outputPos),
|
// Found match
|
||||||
// substr($subject, $contentPos, $tokenOffset - $contentPos)
|
callback.cb(bfr, src, prv, pos);
|
||||||
// ]);
|
foundStart = false;
|
||||||
// $foundStart = false;
|
} else {
|
||||||
// } else {
|
// Non-matching end, write it out
|
||||||
// # Non-matching end, write it out
|
// EX: "a)b" -> "a)"
|
||||||
// $output .= substr($subject, $inputPos, $tokenOffset + $tokenLength - $outputPos);
|
bfr.Add_mid(src, prv, pos + endLen);
|
||||||
// }
|
}
|
||||||
// $inputPos = $outputPos = $tokenOffset + $tokenLength;
|
pos += endLen;
|
||||||
// } else {
|
prv = pos;
|
||||||
// throw new InvalidArgumentException('Invalid delimiter given to ' . __METHOD__);
|
}
|
||||||
// }
|
}
|
||||||
// }
|
}
|
||||||
// if ($outputPos < strlen($subject)) {
|
|
||||||
// $output .= substr($subject, $outputPos);
|
/**
|
||||||
// }
|
* Perform an operation equivalent to `preg_replace()` with flags.
|
||||||
//
|
*
|
||||||
// return $output;
|
* Matches this code:
|
||||||
// }
|
*
|
||||||
//
|
* preg_replace("!$startDelim(.*)$endDelim!$flags", $replace, $subject);
|
||||||
// /**
|
*
|
||||||
// * Perform an operation equivalent to `preg_replace()` with flags.
|
* @param String $startDelim Start delimiter regular expression
|
||||||
// *
|
* @param String $endDelim End delimiter regular expression
|
||||||
// * Matches this code:
|
* @param String $replace Replacement String. May contain $1, which will be
|
||||||
// *
|
* replaced by the text between the delimiters
|
||||||
// * preg_replace("!$startDelim(.*)$endDelim!$flags", $replace, $subject);
|
* @param String $subject String to search
|
||||||
// *
|
* @param String $flags Regular expression flags
|
||||||
// * @param String $startDelim Start delimiter regular expression
|
* @return String The String with the matches replaced
|
||||||
// * @param String $endDelim End delimiter regular expression
|
*/
|
||||||
// * @param String $replace Replacement String. May contain $1, which will be
|
// XO.MW:removed flags=''
|
||||||
// * replaced by the text between the delimiters
|
public static void delimiterReplace(Bry_bfr bfr, byte[] startDelim, byte[] endDelim, byte[] replace, byte[] subject) {
|
||||||
// * @param String $subject String to search
|
XomwRegexlikeReplacer replacer = new XomwRegexlikeReplacer(replace);
|
||||||
// * @param String $flags Regular expression flags
|
|
||||||
// * @return String The String with the matches replaced
|
delimiterReplaceCallback(bfr, startDelim, endDelim, replacer, subject);
|
||||||
// */
|
}
|
||||||
// static function delimiterReplace($startDelim, $endDelim, $replace, $subject, $flags = '') {
|
|
||||||
// $replacer = new RegexlikeReplacer($replace);
|
|
||||||
//
|
|
||||||
// return self::delimiterReplaceCallback($startDelim, $endDelim,
|
|
||||||
// $replacer->cb(), $subject, $flags);
|
|
||||||
// }
|
|
||||||
//
|
|
||||||
// /**
|
// /**
|
||||||
// * More or less "markup-safe" explode()
|
// * More or less "markup-safe" explode()
|
||||||
// * Ignores any instances of the separator inside `<...>`
|
// * Ignores any instances of the separator inside `<...>`
|
||||||
|
@ -19,40 +19,56 @@ public class XomwStringUtilsTest {
|
|||||||
private final XomwStringUtilsFxt fxt = new XomwStringUtilsFxt();
|
private final XomwStringUtilsFxt fxt = new XomwStringUtilsFxt();
|
||||||
@Test public void Delimiter_explode() {
|
@Test public void Delimiter_explode() {
|
||||||
// basic
|
// basic
|
||||||
fxt.Test__delimiter_explode("a|b|c" , "a", "b", "c");
|
fxt.Test_delimiter_explode("a|b|c" , "a", "b", "c");
|
||||||
// empty
|
// empty
|
||||||
fxt.Test__delimiter_explode("|a||c|" , "", "a", "", "c", "");
|
fxt.Test_delimiter_explode("|a||c|" , "", "a", "", "c", "");
|
||||||
// nest_1
|
// nest_1
|
||||||
fxt.Test__delimiter_explode("a|-{b|c}-|d" , "a", "-{b|c}-", "d");
|
fxt.Test_delimiter_explode("a|-{b|c}-|d" , "a", "-{b|c}-", "d");
|
||||||
// nest_many
|
// nest_many
|
||||||
fxt.Test__delimiter_explode("a|-{b-{c|d}-e}-|f" , "a", "-{b-{c|d}-e}-", "f");
|
fxt.Test_delimiter_explode("a|-{b-{c|d}-e}-|f" , "a", "-{b-{c|d}-e}-", "f");
|
||||||
}
|
}
|
||||||
@Test public void Replace_markup() {
|
@Test public void Replace_markup() {
|
||||||
// basic
|
// basic
|
||||||
fxt.Test__replace_markup("a!!b" , "!!", "||", "a||b");
|
fxt.Test_replace_markup("a!!b" , "!!", "||", "a||b");
|
||||||
// missing
|
// missing
|
||||||
fxt.Test__replace_markup("abcd" , "!!", "||", "abcd");
|
fxt.Test_replace_markup("abcd" , "!!", "||", "abcd");
|
||||||
// eos
|
// eos
|
||||||
fxt.Test__replace_markup("a!!" , "!!", "||", "a||");
|
fxt.Test_replace_markup("a!!" , "!!", "||", "a||");
|
||||||
// ignore
|
// ignore
|
||||||
fxt.Test__replace_markup("a!!b<!!>!!c" , "!!", "||", "a||b<!!>||c");
|
fxt.Test_replace_markup("a!!b<!!>!!c" , "!!", "||", "a||b<!!>||c");
|
||||||
// ignore asym_lhs
|
// ignore asym_lhs
|
||||||
fxt.Test__replace_markup("a!!b<!!<!!>!!c" , "!!", "||", "a||b<!!<!!>||c");
|
fxt.Test_replace_markup("a!!b<!!<!!>!!c" , "!!", "||", "a||b<!!<!!>||c");
|
||||||
// ignore asym_lhs
|
// ignore asym_lhs
|
||||||
fxt.Test__replace_markup("a!!b<!!>!!>!!c" , "!!", "||", "a||b<!!>||>||c"); // NOTE: should probably be "!!>!!>", but unmatched ">" are escaped to ">"
|
fxt.Test_replace_markup("a!!b<!!>!!>!!c" , "!!", "||", "a||b<!!>||>||c"); // NOTE: should probably be "!!>!!>", but unmatched ">" are escaped to ">"
|
||||||
|
}
|
||||||
|
@Test public void delimiterReplace() {
|
||||||
|
// basic
|
||||||
|
fxt.Test_delimiterReplace("/*", "*/", "a/*0*/c" , "9", "a9c");
|
||||||
|
// overlapping; "/*/"
|
||||||
|
fxt.Test_delimiterReplace("/*", "*/", "a/*/0/*/c" , "9", "a9c");
|
||||||
|
// dangling bgn; "/* /*"
|
||||||
|
fxt.Test_delimiterReplace("/*", "*/", "a/*0/*1*/c" , "9", "a9c"); // fails if "a/*9c"
|
||||||
|
// dangling end; "*/ */"
|
||||||
|
fxt.Test_delimiterReplace("/*", "*/", "a/*0*/1*/c" , "9", "a91*/c");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
class XomwStringUtilsFxt {
|
class XomwStringUtilsFxt {
|
||||||
public void Test__delimiter_explode(String src_str, String... expd) {
|
public void Test_delimiter_explode(String src_str, String... expd) {
|
||||||
List_adp tmp = List_adp_.New();
|
List_adp tmp = List_adp_.New();
|
||||||
gplx.core.btries.Btrie_rv trv = new gplx.core.btries.Btrie_rv();
|
gplx.core.btries.Btrie_rv trv = new gplx.core.btries.Btrie_rv();
|
||||||
|
|
||||||
byte[][] actl = XomwStringUtils.delimiterExplode(tmp, trv, Bry_.new_u8(src_str));
|
byte[][] actl = XomwStringUtils.delimiterExplode(tmp, trv, Bry_.new_u8(src_str));
|
||||||
Gftest.Eq__ary(expd, actl, "src=~{0}", src_str);
|
Gftest.Eq__ary(expd, actl, "src=~{0}", src_str);
|
||||||
}
|
}
|
||||||
public void Test__replace_markup(String src_str, String find, String repl, String expd) {
|
public void Test_replace_markup(String src_str, String find, String repl, String expd) {
|
||||||
byte[] src_bry = Bry_.new_u8(src_str);
|
byte[] src_bry = Bry_.new_u8(src_str);
|
||||||
XomwStringUtils.replaceMarkup(src_bry, 0, src_bry.length, Bry_.new_a7(find), Bry_.new_a7(repl));
|
XomwStringUtils.replaceMarkup(src_bry, 0, src_bry.length, Bry_.new_a7(find), Bry_.new_a7(repl));
|
||||||
Gftest.Eq__str(expd, src_bry);
|
Gftest.Eq__str(expd, src_bry);
|
||||||
}
|
}
|
||||||
|
// byte[] startDelim, byte[] endDelim, byte[] replace, byte[] subject
|
||||||
|
public void Test_delimiterReplace(String bgn, String end, String src, String repl, String expd) {
|
||||||
|
Bry_bfr bfr = Bry_bfr_.New();
|
||||||
|
XomwStringUtils.delimiterReplace(bfr, Bry_.new_u8(bgn), Bry_.new_u8(end), Bry_.new_u8(repl), Bry_.new_u8(src));
|
||||||
|
Gftest.Eq__str(expd, bfr.To_str_and_clear());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,25 @@
|
|||||||
|
/*
|
||||||
|
XOWA: the XOWA Offline Wiki Application
|
||||||
|
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||||
|
|
||||||
|
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||||
|
or alternatively under the terms of the Apache License Version 2.0.
|
||||||
|
|
||||||
|
You may use XOWA according to either of these licenses as is most appropriate
|
||||||
|
for your project on a case-by-case basis.
|
||||||
|
|
||||||
|
The terms of each license can be found in the source code repository:
|
||||||
|
|
||||||
|
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||||
|
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||||
|
*/
|
||||||
|
package gplx.xowa.mediawiki.includes.libs.replacers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.libs.*;
|
||||||
|
public class XomwRegexlikeReplacer implements XomwReplacer {
|
||||||
|
private byte[] replace;
|
||||||
|
public XomwRegexlikeReplacer(byte[] replace) {
|
||||||
|
this.replace = replace;
|
||||||
|
}
|
||||||
|
public void cb(Bry_bfr bfr, byte[] src, int find_bgn, int find_end) {
|
||||||
|
bfr.Add(replace);
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,23 @@
|
|||||||
|
/*
|
||||||
|
XOWA: the XOWA Offline Wiki Application
|
||||||
|
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||||
|
|
||||||
|
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||||
|
or alternatively under the terms of the Apache License Version 2.0.
|
||||||
|
|
||||||
|
You may use XOWA according to either of these licenses as is most appropriate
|
||||||
|
for your project on a case-by-case basis.
|
||||||
|
|
||||||
|
The terms of each license can be found in the source code repository:
|
||||||
|
|
||||||
|
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||||
|
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||||
|
*/
|
||||||
|
package gplx.xowa.mediawiki.includes.libs.replacers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.libs.*;
|
||||||
|
/**
|
||||||
|
* Base class for "replacers", objects used in preg_replace_callback() and
|
||||||
|
* StringUtils::delimiterReplaceCallback()
|
||||||
|
*/
|
||||||
|
public interface XomwReplacer {
|
||||||
|
/**
 * Callback for one match; implementations write the replacement to bfr.
 * XomwStringUtils.delimiterReplaceCallback passes the range between the delimiters:
 * find_bgn is the position after the start delimiter and find_end is the position of the end delimiter.
 */
void cb(Bry_bfr bfr, byte[] src, int find_bgn, int find_end);
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user