1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2026-03-02 03:49:30 +00:00

Xomw: Add XomwStringUtils.delimiterReplace

This commit is contained in:
gnosygnu
2017-02-23 12:01:49 -05:00
parent 31fcfaf1bd
commit a8c7f27ff5
5 changed files with 198 additions and 136 deletions

View File

@@ -15,6 +15,8 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.mediawiki.includes.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import gplx.core.btries.*;
import gplx.xowa.mediawiki.includes.utls.*;
import gplx.xowa.mediawiki.includes.libs.replacers.*;
/**
* A collection of static methods to play with strings.
*/
@@ -143,129 +145,123 @@ public class XomwStringUtils {
//
// return $output;
// }
//
// /**
// * Perform an operation equivalent to `preg_replace_callback()`
// *
// * Matches this code:
// *
// * preg_replace_callback("!$startDelim(.*)$endDelim!s$flags", $callback, $subject);
// *
// * If the start delimiter ends with an initial substring of the end delimiter,
// * e.g. in the case of C-style comments, the behavior differs from the model
// * regex. In this implementation, the end must share no characters with the
// * start, so e.g. `/*\/` is not considered to be both the start and end of a
// * comment. `/*\/xy/*\/` is considered to be a single comment with contents `/xy/`.
// *
// * The implementation of delimiterReplaceCallback() is slower than hungryDelimiterReplace()
// * but uses far less memory. The delimiters are literal strings, not regular expressions.
// *
// * @param String $startDelim Start delimiter
// * @param String $endDelim End delimiter
// * @param callable $callback Function to call on each match
// * @param String $subject
// * @param String $flags Regular expression flags
// * @throws InvalidArgumentException
// * @return String
// */
// static function delimiterReplaceCallback($startDelim, $endDelim, $callback,
// $subject, $flags = ''
// ) {
// $inputPos = 0;
// $outputPos = 0;
// $contentPos = 0;
// $output = '';
// $foundStart = false;
// $encStart = preg_quote($startDelim, '!');
// $encEnd = preg_quote($endDelim, '!');
// $strcmp = strpos($flags, 'i') === false ? 'strcmp' : 'strcasecmp';
// $endLength = strlen($endDelim);
// $m = [];
//
// while ($inputPos < strlen($subject) &&
// preg_match("!($encStart)|($encEnd)!S$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos)
// ) {
// $tokenOffset = $m[0][1];
// if ($m[1][0] != '') {
// if ($foundStart &&
// $strcmp($endDelim, substr($subject, $tokenOffset, $endLength)) == 0
// ) {
// # An end match is present at the same location
// $tokenType = 'end';
// $tokenLength = $endLength;
// } else {
// $tokenType = 'start';
// $tokenLength = strlen($m[0][0]);
// }
// } elseif ($m[2][0] != '') {
// $tokenType = 'end';
// $tokenLength = strlen($m[0][0]);
// } else {
// throw new InvalidArgumentException('Invalid delimiter given to ' . __METHOD__);
// }
//
// if ($tokenType == 'start') {
// # Only move the start position if we haven't already found a start
// # This means that START START END matches outer pair
// if (!$foundStart) {
// # Found start
// $inputPos = $tokenOffset + $tokenLength;
// # Write out the non-matching section
// $output .= substr($subject, $outputPos, $tokenOffset - $outputPos);
// $outputPos = $tokenOffset;
// $contentPos = $inputPos;
// $foundStart = true;
// } else {
// # Move the input position past the *first character* of START,
// # to protect against missing END when it overlaps with START
// $inputPos = $tokenOffset + 1;
// }
// } elseif ($tokenType == 'end') {
// if ($foundStart) {
// # Found match
// $output .= call_user_func($callback, [
// substr($subject, $outputPos, $tokenOffset + $tokenLength - $outputPos),
// substr($subject, $contentPos, $tokenOffset - $contentPos)
// ]);
// $foundStart = false;
// } else {
// # Non-matching end, write it out
// $output .= substr($subject, $inputPos, $tokenOffset + $tokenLength - $outputPos);
// }
// $inputPos = $outputPos = $tokenOffset + $tokenLength;
// } else {
// throw new InvalidArgumentException('Invalid delimiter given to ' . __METHOD__);
// }
// }
// if ($outputPos < strlen($subject)) {
// $output .= substr($subject, $outputPos);
// }
//
// return $output;
// }
//
// /**
// * Perform an operation equivalent to `preg_replace()` with flags.
// *
// * Matches this code:
// *
// * preg_replace("!$startDelim(.*)$endDelim!$flags", $replace, $subject);
// *
// * @param String $startDelim Start delimiter regular expression
// * @param String $endDelim End delimiter regular expression
// * @param String $replace Replacement String. May contain $1, which will be
// * replaced by the text between the delimiters
// * @param String $subject String to search
// * @param String $flags Regular expression flags
// * @return String The String with the matches replaced
// */
// static function delimiterReplace($startDelim, $endDelim, $replace, $subject, $flags = '') {
// $replacer = new RegexlikeReplacer($replace);
//
// return self::delimiterReplaceCallback($startDelim, $endDelim,
// $replacer->cb(), $subject, $flags);
// }
//
/**
* Perform an operation equivalent to `preg_replace_callback()`
*
* Matches this code:
*
* preg_replace_callback("!$startDelim(.*)$endDelim!s$flags", $callback, $subject);
*
* If the start delimiter ends with an initial substring of the end delimiter,
* e.g. in the case of C-style comments, the behavior differs from the model
* regex. In this implementation, the end must share no characters with the
* start, so e.g. `/*\/` is not considered to be both the start and end of a
* comment. `/*\/xy/*\/` is considered to be a single comment with contents `/xy/`.
*
* The implementation of delimiterReplaceCallback() is slower than hungryDelimiterReplace()
* but uses far less memory. The delimiters are literal strings, not regular expressions.
*
* @param String $startDelim Start delimiter
* @param String $endDelim End delimiter
* @param callable $callback Function to call on each match
* @param String $subject
* @param String $flags Regular expression flags
* @throws InvalidArgumentException
* @return String
*/
// XO.MW:$flags not supported; MW appends them directly to the regex (EX: "i" makes the match case-insensitive)
// XO.MW:result is appended to bfr instead of being returned
// XO.MW:callback.cb is called with the range of the content between the delimiters (MW's $contentPos .. $tokenOffset)
// XO.MW:throws IllegalArgumentException if bgn or end is empty (MW: InvalidArgumentException 'Invalid delimiter given to')
public static void delimiterReplaceCallback(Bry_bfr bfr, byte[] bgn, byte[] end, XomwReplacer callback,
	byte[] src
) {
	/* XO.MW.PORTED:
	MW does following logic
	* Run start/end regex on subject till no matches
	* If start/end found, evaluate possible match (handling nesting)
	* If match found, then pass find-replace pair to callback;
	  find=substr(subject, outputPos, tokenOffset + tokenLength - outputPos)
	  replace=substr(subject, contentPos, tokenOffset - contentPos)
	* "overlapping" logic: if a start is pending and END also matches at the token offset, the token is END
	  $strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) == 0
	  EX: bgn=ab;end=abc; also needed when bgn and end are the same delimiter (EX: bgn=";end=")
	XO scans byte-by-byte with Bry_.Eq instead of running a regex
	*/
	int bgnLen = bgn.length;
	int endLen = end.length;
	// An empty delimiter matches everywhere; an empty end would never advance pos (infinite loop)
	if (bgnLen == 0 || endLen == 0) {
		throw new IllegalArgumentException("Invalid delimiter given to XomwStringUtils.delimiterReplaceCallback");
	}
	int srcLen = src.length;
	int pos = 0;        // MW: $inputPos; current scan position
	int outputPos = 0;  // MW: $outputPos; start of src not yet written to bfr
	int contentPos = 0; // MW: $contentPos; first byte after the pending start delimiter
	boolean foundStart = false;
	while (pos < srcLen) {
		boolean tokenTypeIsStart;
		if (Bry_.Eq(src, pos, pos + bgnLen, bgn)) {
			// START matches; it is still an END if a start is pending and END matches at the same location
			tokenTypeIsStart = !(foundStart && Bry_.Eq(src, pos, pos + endLen, end));
		}
		else if (Bry_.Eq(src, pos, pos + endLen, end)) {
			tokenTypeIsStart = false;
		}
		else {
			pos++;
			continue;
		}
		if (tokenTypeIsStart) {
			// Only move the start position if we haven't already found a start
			// This means that START START END matches outer pair
			// EX: "(a(b)" has match of "a(b"
			if (!foundStart) {
				// Found start
				// Write out the non-matching section
				bfr.Add_mid(src, outputPos, pos);
				// Keep outputPos on the delimiter itself: if no END follows, the final flush
				// must write the unmatched START back out; EX: "a(b" -> "a(b", not "ab"
				outputPos = pos;
				pos += bgnLen;
				contentPos = pos;
				foundStart = true;
			} else {
				// Move the input position past the *first character* of START,
				// to protect against missing END when it overlaps with START
				pos++;
			}
		} else { // elseif (tokenType == 'end')
			if (foundStart) {
				// Found match
				callback.cb(bfr, src, contentPos, pos);
				foundStart = false;
			} else {
				// Non-matching end, write it out
				// EX: "a)b" -> "a)"
				bfr.Add_mid(src, outputPos, pos + endLen);
			}
			pos += endLen;
			outputPos = pos;
		}
	}
	// Write out whatever remains, including an unmatched START and the text after it
	bfr.Add_mid(src, outputPos, srcLen);
}
/**
* Perform an operation equivalent to `preg_replace()` with flags.
*
* Matches this code:
*
* preg_replace("!$startDelim(.*)$endDelim!$flags", $replace, $subject);
*
* @param String $startDelim Start delimiter regular expression
* @param String $endDelim End delimiter regular expression
* @param String $replace Replacement String. May contain $1, which will be
* replaced by the text between the delimiters
* @param String $subject String to search
* @param String $flags Regular expression flags
* @return String The String with the matches replaced
*/
// XO.MW:removed flags=''
public static void delimiterReplace(Bry_bfr bfr, byte[] startDelim, byte[] endDelim, byte[] replace, byte[] subject) {
	// Wrap the replacement bytes in a regex-like replacer and hand everything to the
	// callback-based variant, which appends its result to bfr
	delimiterReplaceCallback(bfr, startDelim, endDelim, new XomwRegexlikeReplacer(replace), subject);
}
// /**
// * More or less "markup-safe" explode()
// * Ignores any instances of the separator inside `<...>`