mirror of
https://github.com/gnosygnu/xowa.git
synced 2024-10-27 20:34:16 +00:00
Xomw: Convert Sanitizer, StringUtils; also, support stripAllTags
This commit is contained in:
parent
09dbfc894e
commit
31fcfaf1bd
File diff suppressed because it is too large
Load Diff
@ -121,13 +121,20 @@ public class XomwSanitizerTest {
|
|||||||
// cls: ws
|
// cls: ws
|
||||||
fxt.Test__merge_attributes(src_atrs.Clear().Add_many(cls, " v1 v2 "), trg_atrs.Clear().Add_many(cls, " v3 v4 "), expd_atrs.Clear().Add_many(cls, "v1 v2 v3 v4"));
|
fxt.Test__merge_attributes(src_atrs.Clear().Add_many(cls, " v1 v2 "), trg_atrs.Clear().Add_many(cls, " v3 v4 "), expd_atrs.Clear().Add_many(cls, "v1 v2 v3 v4"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test public void normalizeWhitespace() {
|
||||||
|
fxt.Test_normalizeWhitespace("a\r\nb", "a b");
|
||||||
|
fxt.Test_normalizeWhitespace("a\rb", "a b");
|
||||||
|
fxt.Test_normalizeWhitespace("a\nb", "a b");
|
||||||
|
fxt.Test_normalizeWhitespace("a\tb", "a b");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
class XomwSanitizerFxt {
|
class XomwSanitizerFxt {
|
||||||
private final XomwSanitizer sanitizer = new XomwSanitizer();
|
private final XomwSanitizer sanitizer = new XomwSanitizer();
|
||||||
private final Bry_bfr tmp = Bry_bfr_.New();
|
private final Bry_bfr tmp = Bry_bfr_.New();
|
||||||
public void Test__normalize_char_references(String src_str, String expd) {
|
public void Test__normalize_char_references(String src_str, String expd) {
|
||||||
byte[] src_bry = Bry_.new_u8(src_str);
|
byte[] src_bry = Bry_.new_u8(src_str);
|
||||||
sanitizer.Normalize_char_references(tmp, Bool_.Y, src_bry, 0, src_bry.length);
|
sanitizer.normalizeCharReferences(tmp, Bool_.Y, src_bry, 0, src_bry.length);
|
||||||
Gftest.Eq__str(expd, tmp.To_str_and_clear());
|
Gftest.Eq__str(expd, tmp.To_str_and_clear());
|
||||||
}
|
}
|
||||||
public void Test__regex_domain_y(Xomw_regex_find_domain regex_domain, String src_str, String expd_prot, String expd_host, String expd_rest) {
|
public void Test__regex_domain_y(Xomw_regex_find_domain regex_domain, String src_str, String expd_prot, String expd_host, String expd_rest) {
|
||||||
@ -152,15 +159,18 @@ class XomwSanitizerFxt {
|
|||||||
}
|
}
|
||||||
public void Test__decode_char_references(String src_str, String expd) {
|
public void Test__decode_char_references(String src_str, String expd) {
|
||||||
byte[] src_bry = Bry_.new_u8(src_str);
|
byte[] src_bry = Bry_.new_u8(src_str);
|
||||||
sanitizer.Decode_char_references(tmp, Bool_.Y, src_bry, 0, src_bry.length);
|
sanitizer.decodeCharReferences(tmp, Bool_.Y, src_bry, 0, src_bry.length);
|
||||||
Gftest.Eq__str(expd, tmp.To_str_and_clear());
|
Gftest.Eq__str(expd, tmp.To_str_and_clear());
|
||||||
}
|
}
|
||||||
public void Test__clean_url(String src_str, String expd) {
|
public void Test__clean_url(String src_str, String expd) {
|
||||||
byte[] src_bry = Bry_.new_u8(src_str);
|
byte[] src_bry = Bry_.new_u8(src_str);
|
||||||
Gftest.Eq__str(expd, sanitizer.Clean_url(src_bry));
|
Gftest.Eq__str(expd, sanitizer.cleanUrl(src_bry));
|
||||||
}
|
}
|
||||||
public void Test__merge_attributes(Xomw_atr_mgr src, Xomw_atr_mgr trg, Xomw_atr_mgr expd) {
|
public void Test__merge_attributes(Xomw_atr_mgr src, Xomw_atr_mgr trg, Xomw_atr_mgr expd) {
|
||||||
sanitizer.Merge_attributes(src, trg);
|
sanitizer.mergeAttributes(src, trg);
|
||||||
Gftest.Eq__ary__lines(expd.To_str(tmp), src.To_str(tmp), "merge_atrs");
|
Gftest.Eq__ary__lines(expd.To_str(tmp), src.To_str(tmp), "merge_atrs");
|
||||||
}
|
}
|
||||||
|
public void Test_normalizeWhitespace(String src_str, String expd) {
|
||||||
|
Gftest.Eq__str(expd, sanitizer.normalizeWhitespace(Bry_.new_u8(src_str)), "merge_atrs");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -53,7 +53,7 @@ public class XomwXml {
|
|||||||
bfr.Add_byte_space();
|
bfr.Add_byte_space();
|
||||||
bfr.Add((byte[])attribs.Get_at(i));
|
bfr.Add((byte[])attribs.Get_at(i));
|
||||||
bfr.Add_byte_eq().Add_byte_quote();
|
bfr.Add_byte_eq().Add_byte_quote();
|
||||||
XomwSanitizer.Encode_attribute(bfr, (byte[])attribs.Get_at(i + 1));
|
XomwSanitizer.encodeAttribute(bfr, (byte[])attribs.Get_at(i + 1));
|
||||||
bfr.Add_byte_quote();
|
bfr.Add_byte_quote();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,373 @@
|
|||||||
|
/*
|
||||||
|
XOWA: the XOWA Offline Wiki Application
|
||||||
|
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||||
|
|
||||||
|
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||||
|
or alternatively under the terms of the Apache License Version 2.0.
|
||||||
|
|
||||||
|
You may use XOWA according to either of these licenses as is most appropriate
|
||||||
|
for your project on a case-by-case basis.
|
||||||
|
|
||||||
|
The terms of each license can be found in the source code repository:
|
||||||
|
|
||||||
|
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||||
|
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||||
|
*/
|
||||||
|
package gplx.xowa.mediawiki.includes.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
|
||||||
|
import gplx.core.btries.*;
|
||||||
|
/**
|
||||||
|
* A collection of static methods to play with strings.
|
||||||
|
*/
|
||||||
|
public class XomwStringUtils {
|
||||||
|
// /**
|
||||||
|
// * Test whether a String is valid UTF-8.
|
||||||
|
// *
|
||||||
|
// * The function check for invalid byte sequences, overlong encoding but
|
||||||
|
// * not for different normalisations.
|
||||||
|
// *
|
||||||
|
// * @note In MediaWiki 1.21, this function did not provide proper UTF-8 validation.
|
||||||
|
// * In particular, the pure PHP code path did not in fact check for overlong forms.
|
||||||
|
// * Beware of this when backporting code to that version of MediaWiki.
|
||||||
|
// *
|
||||||
|
// * @since 1.21
|
||||||
|
// * @param String $value String to check
|
||||||
|
// * @return boolean Whether the given $value is a valid UTF-8 encoded String
|
||||||
|
// */
|
||||||
|
// static function isUtf8($value) {
|
||||||
|
// $value = (String)$value;
|
||||||
|
//
|
||||||
|
// // HHVM 3.4 and older come with an outdated version of libmbfl that
|
||||||
|
// // incorrectly allows values above U+10FFFF, so we have to check
|
||||||
|
// // for them separately. (This issue also exists in PHP 5.3 and
|
||||||
|
// // older, which are no longer supported.)
|
||||||
|
// static $newPHP;
|
||||||
|
// if ($newPHP === null) {
|
||||||
|
// $newPHP = !mb_check_encoding("\xf4\x90\x80\x80", 'UTF-8');
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// return mb_check_encoding($value, 'UTF-8') &&
|
||||||
|
// ($newPHP || preg_match("/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value) === 0);
|
||||||
|
// }
|
||||||
|
|
||||||
|
private static final byte DELIMITER_EXPLODE__SEP = 0, DELIMITER_EXPLODE__BGN = 1, DELIMITER_EXPLODE__END = 2;
|
||||||
|
private static final Btrie_slim_mgr delimiter_explode_trie = Btrie_slim_mgr.cs()
|
||||||
|
.Add_str_byte("|" , DELIMITER_EXPLODE__SEP)
|
||||||
|
.Add_str_byte("-{", DELIMITER_EXPLODE__BGN)
|
||||||
|
.Add_str_byte("}-", DELIMITER_EXPLODE__END)
|
||||||
|
;
|
||||||
|
/**
|
||||||
|
* Explode a String, but ignore any instances of the separator inside
|
||||||
|
* the given start and end delimiters, which may optionally nest.
|
||||||
|
* The delimiters are literal strings, not regular expressions.
|
||||||
|
* @param String $startDelim Start delimiter
|
||||||
|
* @param String $endDelim End delimiter
|
||||||
|
* @param String $separator Separator String for the explode.
|
||||||
|
* @param String $subject Subject String to explode.
|
||||||
|
* @param boolean $nested True iff the delimiters are allowed to nest.
|
||||||
|
* @return ArrayIterator
|
||||||
|
*/
|
||||||
|
// XO.MW: NOTE: function only used in two places; hard-coding (a) nested=true; (b) bgn="-{" end="}-" sep="|"
|
||||||
|
public static byte[][] delimiterExplode(List_adp tmp, Btrie_rv trv, byte[] src) {
|
||||||
|
// XO.MW.PORTED:entire proc rewritten; see PHP for source
|
||||||
|
int src_bgn = 0;
|
||||||
|
int src_end = src.length;
|
||||||
|
|
||||||
|
int depth = 0;
|
||||||
|
int cur = src_bgn;
|
||||||
|
int prv = cur;
|
||||||
|
while (true) {
|
||||||
|
// eos
|
||||||
|
if (cur == src_end) {
|
||||||
|
// add rest
|
||||||
|
tmp.Add(Bry_.Mid(src, prv, src_end));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
Object o = delimiter_explode_trie.Match_at(trv, src, cur, src_end);
|
||||||
|
|
||||||
|
// regular char; continue;
|
||||||
|
if (o == null) {
|
||||||
|
cur++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// handle sep, bgn, end
|
||||||
|
byte tid = ((gplx.core.primitives.Byte_obj_val)o).Val();
|
||||||
|
switch (tid) {
|
||||||
|
case DELIMITER_EXPLODE__SEP:
|
||||||
|
if (depth == 0) {
|
||||||
|
tmp.Add(Bry_.Mid(src, prv, cur));
|
||||||
|
prv = cur + 1;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case DELIMITER_EXPLODE__BGN:
|
||||||
|
depth++;
|
||||||
|
break;
|
||||||
|
case DELIMITER_EXPLODE__END:
|
||||||
|
depth--;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
cur = trv.Pos();
|
||||||
|
}
|
||||||
|
return (byte[][])tmp.To_ary_and_clear(byte[].class);
|
||||||
|
}
|
||||||
|
|
||||||
|
// /**
|
||||||
|
// * Perform an operation equivalent to `preg_replace()`
|
||||||
|
// *
|
||||||
|
// * Matches this code:
|
||||||
|
// *
|
||||||
|
// * preg_replace("!$startDelim(.*?)$endDelim!", $replace, $subject);
|
||||||
|
// *
|
||||||
|
// * ..except that it's worst-case O(N) instead of O(N^2). Compared to delimiterReplace(), this
|
||||||
|
// * implementation is fast but memory-hungry and inflexible. The memory requirements are such
|
||||||
|
// * that I don't recommend using it on anything but guaranteed small chunks of text.
|
||||||
|
// *
|
||||||
|
// * @param String $startDelim
|
||||||
|
// * @param String $endDelim
|
||||||
|
// * @param String $replace
|
||||||
|
// * @param String $subject
|
||||||
|
// * @return String
|
||||||
|
// */
|
||||||
|
// static function hungryDelimiterReplace($startDelim, $endDelim, $replace, $subject) {
|
||||||
|
// $segments = explode($startDelim, $subject);
|
||||||
|
// $output = array_shift($segments);
|
||||||
|
// foreach ($segments as $s) {
|
||||||
|
// $endDelimPos = strpos($s, $endDelim);
|
||||||
|
// if ($endDelimPos === false) {
|
||||||
|
// $output .= $startDelim . $s;
|
||||||
|
// } else {
|
||||||
|
// $output .= $replace . substr($s, $endDelimPos + strlen($endDelim));
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// return $output;
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// /**
|
||||||
|
// * Perform an operation equivalent to `preg_replace_callback()`
|
||||||
|
// *
|
||||||
|
// * Matches this code:
|
||||||
|
// *
|
||||||
|
// * preg_replace_callback("!$startDelim(.*)$endDelim!s$flags", $callback, $subject);
|
||||||
|
// *
|
||||||
|
// * If the start delimiter ends with an initial substring of the end delimiter,
|
||||||
|
// * e.g. in the case of C-style comments, the behavior differs from the model
|
||||||
|
// * regex. In this implementation, the end must share no characters with the
|
||||||
|
// * start, so e.g. `/*\/` is not considered to be both the start and end of a
|
||||||
|
// * comment. `/*\/xy/*\/` is considered to be a single comment with contents `/xy/`.
|
||||||
|
// *
|
||||||
|
// * The implementation of delimiterReplaceCallback() is slower than hungryDelimiterReplace()
|
||||||
|
// * but uses far less memory. The delimiters are literal strings, not regular expressions.
|
||||||
|
// *
|
||||||
|
// * @param String $startDelim Start delimiter
|
||||||
|
// * @param String $endDelim End delimiter
|
||||||
|
// * @param callable $callback Function to call on each match
|
||||||
|
// * @param String $subject
|
||||||
|
// * @param String $flags Regular expression flags
|
||||||
|
// * @throws InvalidArgumentException
|
||||||
|
// * @return String
|
||||||
|
// */
|
||||||
|
// static function delimiterReplaceCallback($startDelim, $endDelim, $callback,
|
||||||
|
// $subject, $flags = ''
|
||||||
|
// ) {
|
||||||
|
// $inputPos = 0;
|
||||||
|
// $outputPos = 0;
|
||||||
|
// $contentPos = 0;
|
||||||
|
// $output = '';
|
||||||
|
// $foundStart = false;
|
||||||
|
// $encStart = preg_quote($startDelim, '!');
|
||||||
|
// $encEnd = preg_quote($endDelim, '!');
|
||||||
|
// $strcmp = strpos($flags, 'i') === false ? 'strcmp' : 'strcasecmp';
|
||||||
|
// $endLength = strlen($endDelim);
|
||||||
|
// $m = [];
|
||||||
|
//
|
||||||
|
// while ($inputPos < strlen($subject) &&
|
||||||
|
// preg_match("!($encStart)|($encEnd)!S$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos)
|
||||||
|
// ) {
|
||||||
|
// $tokenOffset = $m[0][1];
|
||||||
|
// if ($m[1][0] != '') {
|
||||||
|
// if ($foundStart &&
|
||||||
|
// $strcmp($endDelim, substr($subject, $tokenOffset, $endLength)) == 0
|
||||||
|
// ) {
|
||||||
|
// # An end match is present at the same location
|
||||||
|
// $tokenType = 'end';
|
||||||
|
// $tokenLength = $endLength;
|
||||||
|
// } else {
|
||||||
|
// $tokenType = 'start';
|
||||||
|
// $tokenLength = strlen($m[0][0]);
|
||||||
|
// }
|
||||||
|
// } elseif ($m[2][0] != '') {
|
||||||
|
// $tokenType = 'end';
|
||||||
|
// $tokenLength = strlen($m[0][0]);
|
||||||
|
// } else {
|
||||||
|
// throw new InvalidArgumentException('Invalid delimiter given to ' . __METHOD__);
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// if ($tokenType == 'start') {
|
||||||
|
// # Only move the start position if we haven't already found a start
|
||||||
|
// # This means that START START END matches outer pair
|
||||||
|
// if (!$foundStart) {
|
||||||
|
// # Found start
|
||||||
|
// $inputPos = $tokenOffset + $tokenLength;
|
||||||
|
// # Write out the non-matching section
|
||||||
|
// $output .= substr($subject, $outputPos, $tokenOffset - $outputPos);
|
||||||
|
// $outputPos = $tokenOffset;
|
||||||
|
// $contentPos = $inputPos;
|
||||||
|
// $foundStart = true;
|
||||||
|
// } else {
|
||||||
|
// # Move the input position past the *first character* of START,
|
||||||
|
// # to protect against missing END when it overlaps with START
|
||||||
|
// $inputPos = $tokenOffset + 1;
|
||||||
|
// }
|
||||||
|
// } elseif ($tokenType == 'end') {
|
||||||
|
// if ($foundStart) {
|
||||||
|
// # Found match
|
||||||
|
// $output .= call_user_func($callback, [
|
||||||
|
// substr($subject, $outputPos, $tokenOffset + $tokenLength - $outputPos),
|
||||||
|
// substr($subject, $contentPos, $tokenOffset - $contentPos)
|
||||||
|
// ]);
|
||||||
|
// $foundStart = false;
|
||||||
|
// } else {
|
||||||
|
// # Non-matching end, write it out
|
||||||
|
// $output .= substr($subject, $inputPos, $tokenOffset + $tokenLength - $outputPos);
|
||||||
|
// }
|
||||||
|
// $inputPos = $outputPos = $tokenOffset + $tokenLength;
|
||||||
|
// } else {
|
||||||
|
// throw new InvalidArgumentException('Invalid delimiter given to ' . __METHOD__);
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// if ($outputPos < strlen($subject)) {
|
||||||
|
// $output .= substr($subject, $outputPos);
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// return $output;
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// /**
|
||||||
|
// * Perform an operation equivalent to `preg_replace()` with flags.
|
||||||
|
// *
|
||||||
|
// * Matches this code:
|
||||||
|
// *
|
||||||
|
// * preg_replace("!$startDelim(.*)$endDelim!$flags", $replace, $subject);
|
||||||
|
// *
|
||||||
|
// * @param String $startDelim Start delimiter regular expression
|
||||||
|
// * @param String $endDelim End delimiter regular expression
|
||||||
|
// * @param String $replace Replacement String. May contain $1, which will be
|
||||||
|
// * replaced by the text between the delimiters
|
||||||
|
// * @param String $subject String to search
|
||||||
|
// * @param String $flags Regular expression flags
|
||||||
|
// * @return String The String with the matches replaced
|
||||||
|
// */
|
||||||
|
// static function delimiterReplace($startDelim, $endDelim, $replace, $subject, $flags = '') {
|
||||||
|
// $replacer = new RegexlikeReplacer($replace);
|
||||||
|
//
|
||||||
|
// return self::delimiterReplaceCallback($startDelim, $endDelim,
|
||||||
|
// $replacer->cb(), $subject, $flags);
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// /**
|
||||||
|
// * More or less "markup-safe" explode()
|
||||||
|
// * Ignores any instances of the separator inside `<...>`
|
||||||
|
// * @param String $separator
|
||||||
|
// * @param String $text
|
||||||
|
// * @return array
|
||||||
|
// */
|
||||||
|
// static function explodeMarkup($separator, $text) {
|
||||||
|
// $placeholder = "\x00";
|
||||||
|
//
|
||||||
|
// // Remove placeholder instances
|
||||||
|
// $text = str_replace($placeholder, '', $text);
|
||||||
|
//
|
||||||
|
// // Replace instances of the separator inside HTML-like tags with the placeholder
|
||||||
|
// $replacer = new DoubleReplacer($separator, $placeholder);
|
||||||
|
// $cleaned = StringUtils::delimiterReplaceCallback('<', '>', $replacer->cb(), $text);
|
||||||
|
//
|
||||||
|
// // Explode, then put the replaced separators back in
|
||||||
|
// $items = explode($separator, $cleaned);
|
||||||
|
// foreach ($items as $i => $str) {
|
||||||
|
// $items[$i] = str_replace($placeholder, $separator, $str);
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// return $items;
|
||||||
|
// }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* More or less "markup-safe" str_replace()
|
||||||
|
* Ignores any instances of the separator inside `<...>`
|
||||||
|
* @param String $search
|
||||||
|
* @param String $replace
|
||||||
|
* @param String $text
|
||||||
|
* @return String
|
||||||
|
*/
|
||||||
|
public static void replaceMarkup(byte[] src, int src_bgn, int src_end, byte[] find, byte[] repl) { // REF:/includes/libs/StringUtils.php|replaceMarkup
|
||||||
|
// XO.MW.PORTED: avoiding multiple regex calls / String creations
|
||||||
|
// $placeholder = "\x00";
|
||||||
|
//
|
||||||
|
// Remove placeholder instances
|
||||||
|
// $text = str_replace($placeholder, '', $text);
|
||||||
|
//
|
||||||
|
// Replace instances of the separator inside HTML-like tags with the placeholder
|
||||||
|
// $replacer = new DoubleReplacer($search, $placeholder);
|
||||||
|
// $cleaned = StringUtils::delimiterReplaceCallback('<', '>', $replacer->cb(), $text);
|
||||||
|
//
|
||||||
|
// Explode, then put the replaced separators back in
|
||||||
|
// $cleaned = str_replace($search, $replace, $cleaned);
|
||||||
|
// $text = str_replace($placeholder, $search, $cleaned);
|
||||||
|
|
||||||
|
// if same length find / repl, do in-place replacement; EX: "!!" -> "||"
|
||||||
|
int find_len = find.length;
|
||||||
|
int repl_len = repl.length;
|
||||||
|
if (find_len != repl_len) throw Err_.new_wo_type("find and repl should be same length");
|
||||||
|
|
||||||
|
byte find_0 = find[0];
|
||||||
|
byte dlm_bgn = Byte_ascii.Angle_bgn;
|
||||||
|
byte dlm_end = Byte_ascii.Angle_end;
|
||||||
|
boolean repl_active = true;
|
||||||
|
|
||||||
|
// loop every char in array
|
||||||
|
for (int i = src_bgn; i < src_end; i++) {
|
||||||
|
byte b = src[i];
|
||||||
|
if ( b == find_0
|
||||||
|
&& Bry_.Match(src, i + 1, i + find_len, find, 1, find_len)
|
||||||
|
&& repl_active
|
||||||
|
) {
|
||||||
|
Bry_.Set(src, i, i + find_len, repl);
|
||||||
|
}
|
||||||
|
else if (b == dlm_bgn) {
|
||||||
|
repl_active = false;
|
||||||
|
}
|
||||||
|
else if (b == dlm_end) {
|
||||||
|
repl_active = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// /**
|
||||||
|
// * Escape a String to make it suitable for inclusion in a preg_replace()
|
||||||
|
// * replacement parameter.
|
||||||
|
// *
|
||||||
|
// * @param String $String
|
||||||
|
// * @return String
|
||||||
|
// */
|
||||||
|
// static function escapeRegexReplacement($String) {
|
||||||
|
// $String = str_replace('\\', '\\\\', $String);
|
||||||
|
// $String = str_replace('$', '\\$', $String);
|
||||||
|
// return $String;
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// /**
|
||||||
|
// * Workalike for explode() with limited memory usage.
|
||||||
|
// *
|
||||||
|
// * @param String $separator
|
||||||
|
// * @param String $subject
|
||||||
|
// * @return ArrayIterator|ExplodeIterator
|
||||||
|
// */
|
||||||
|
// static function explode($separator, $subject) {
|
||||||
|
// if (substr_count($subject, $separator) > 1000) {
|
||||||
|
// return new ExplodeIterator($separator, $subject);
|
||||||
|
// } else {
|
||||||
|
// return new ArrayIterator(explode($separator, $subject));
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
}
|
@ -15,8 +15,8 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
|||||||
*/
|
*/
|
||||||
package gplx.xowa.mediawiki.includes.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
|
package gplx.xowa.mediawiki.includes.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
|
||||||
import org.junit.*; import gplx.core.tests.*;
|
import org.junit.*; import gplx.core.tests.*;
|
||||||
public class Xomw_string_utils__tst {
|
public class XomwStringUtilsTest {
|
||||||
private final Xomw_string_utils__fxt fxt = new Xomw_string_utils__fxt();
|
private final XomwStringUtilsFxt fxt = new XomwStringUtilsFxt();
|
||||||
@Test public void Delimiter_explode() {
|
@Test public void Delimiter_explode() {
|
||||||
// basic
|
// basic
|
||||||
fxt.Test__delimiter_explode("a|b|c" , "a", "b", "c");
|
fxt.Test__delimiter_explode("a|b|c" , "a", "b", "c");
|
||||||
@ -42,17 +42,17 @@ public class Xomw_string_utils__tst {
|
|||||||
fxt.Test__replace_markup("a!!b<!!>!!>!!c" , "!!", "||", "a||b<!!>||>||c"); // NOTE: should probably be "!!>!!>", but unmatched ">" are escaped to ">"
|
fxt.Test__replace_markup("a!!b<!!>!!>!!c" , "!!", "||", "a||b<!!>||>||c"); // NOTE: should probably be "!!>!!>", but unmatched ">" are escaped to ">"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
class Xomw_string_utils__fxt {
|
class XomwStringUtilsFxt {
|
||||||
public void Test__delimiter_explode(String src_str, String... expd) {
|
public void Test__delimiter_explode(String src_str, String... expd) {
|
||||||
List_adp tmp = List_adp_.New();
|
List_adp tmp = List_adp_.New();
|
||||||
gplx.core.btries.Btrie_rv trv = new gplx.core.btries.Btrie_rv();
|
gplx.core.btries.Btrie_rv trv = new gplx.core.btries.Btrie_rv();
|
||||||
|
|
||||||
byte[][] actl = Xomw_string_utils.Delimiter_explode(tmp, trv, Bry_.new_u8(src_str));
|
byte[][] actl = XomwStringUtils.delimiterExplode(tmp, trv, Bry_.new_u8(src_str));
|
||||||
Gftest.Eq__ary(expd, actl, "src=~{0}", src_str);
|
Gftest.Eq__ary(expd, actl, "src=~{0}", src_str);
|
||||||
}
|
}
|
||||||
public void Test__replace_markup(String src_str, String find, String repl, String expd) {
|
public void Test__replace_markup(String src_str, String find, String repl, String expd) {
|
||||||
byte[] src_bry = Bry_.new_u8(src_str);
|
byte[] src_bry = Bry_.new_u8(src_str);
|
||||||
Xomw_string_utils.Replace_markup(src_bry, 0, src_bry.length, Bry_.new_a7(find), Bry_.new_a7(repl));
|
XomwStringUtils.replaceMarkup(src_bry, 0, src_bry.length, Bry_.new_a7(find), Bry_.new_a7(repl));
|
||||||
Gftest.Eq__str(expd, src_bry);
|
Gftest.Eq__str(expd, src_bry);
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -1,123 +0,0 @@
|
|||||||
/*
|
|
||||||
XOWA: the XOWA Offline Wiki Application
|
|
||||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
|
||||||
|
|
||||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
|
||||||
or alternatively under the terms of the Apache License Version 2.0.
|
|
||||||
|
|
||||||
You may use XOWA according to either of these licenses as is most appropriate
|
|
||||||
for your project on a case-by-case basis.
|
|
||||||
|
|
||||||
The terms of each license can be found in the source code repository:
|
|
||||||
|
|
||||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
|
||||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
|
||||||
*/
|
|
||||||
package gplx.xowa.mediawiki.includes.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
|
|
||||||
import gplx.core.btries.*;
|
|
||||||
public class Xomw_string_utils {
|
|
||||||
// Explode a String, but ignore any instances of the separator inside
|
|
||||||
// the given start and end delimiters, which may optionally nest.
|
|
||||||
// The delimiters are literal strings, not regular expressions.
|
|
||||||
// @param String bgn_delim Start delimiter
|
|
||||||
// @param String end_delim End delimiter
|
|
||||||
// @param String separator Separator String for the explode.
|
|
||||||
// @param String subject Subject String to explode.
|
|
||||||
// @param boolean nested True iff the delimiters are allowed to nest.
|
|
||||||
// @return ArrayIterator
|
|
||||||
// XO.MW: hard-coding (a) nested=true; (b) bgn="-{" end="}-" sep="|"
|
|
||||||
// XO.MW:SYNC:1.29; DATE:2017-02-03
|
|
||||||
private static final byte Delimiter_explode__sep = 0, Delimiter_explode__bgn = 1, Delimiter_explode__end = 2;
|
|
||||||
private static final Btrie_slim_mgr delimiter_explode_trie = Btrie_slim_mgr.cs()
|
|
||||||
.Add_str_byte("|" , Delimiter_explode__sep)
|
|
||||||
.Add_str_byte("-{", Delimiter_explode__bgn)
|
|
||||||
.Add_str_byte("}-", Delimiter_explode__end)
|
|
||||||
;
|
|
||||||
public static byte[][] Delimiter_explode(List_adp tmp, Btrie_rv trv, byte[] src) {
|
|
||||||
int src_bgn = 0;
|
|
||||||
int src_end = src.length;
|
|
||||||
|
|
||||||
int depth = 0;
|
|
||||||
int cur = src_bgn;
|
|
||||||
int prv = cur;
|
|
||||||
while (true) {
|
|
||||||
// eos
|
|
||||||
if (cur == src_end) {
|
|
||||||
// add rest
|
|
||||||
tmp.Add(Bry_.Mid(src, prv, src_end));
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
Object o = delimiter_explode_trie.Match_at(trv, src, cur, src_end);
|
|
||||||
|
|
||||||
// regular char; continue;
|
|
||||||
if (o == null) {
|
|
||||||
cur++;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// handle sep, bgn, end
|
|
||||||
byte tid = ((gplx.core.primitives.Byte_obj_val)o).Val();
|
|
||||||
switch (tid) {
|
|
||||||
case Delimiter_explode__sep:
|
|
||||||
if (depth == 0) {
|
|
||||||
tmp.Add(Bry_.Mid(src, prv, cur));
|
|
||||||
prv = cur + 1;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case Delimiter_explode__bgn:
|
|
||||||
depth++;
|
|
||||||
break;
|
|
||||||
case Delimiter_explode__end:
|
|
||||||
depth--;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
cur = trv.Pos();
|
|
||||||
}
|
|
||||||
return (byte[][])tmp.To_ary_and_clear(byte[].class);
|
|
||||||
}
|
|
||||||
// More or less "markup-safe" str_replace()
|
|
||||||
// Ignores any instances of the separator inside `<...>`
|
|
||||||
public static void Replace_markup(byte[] src, int src_bgn, int src_end, byte[] find, byte[] repl) { // REF:/includes/libs/StringUtils.php|replaceMarkup
|
|
||||||
// PORTED: avoiding multiple regex calls / String creations
|
|
||||||
// $placeholder = "\x00";
|
|
||||||
|
|
||||||
// Remove placeholder instances
|
|
||||||
// $text = str_replace( $placeholder, '', $text );
|
|
||||||
|
|
||||||
// Replace instances of the separator inside HTML-like tags with the placeholder
|
|
||||||
// $replacer = new DoubleReplacer( $search, $placeholder );
|
|
||||||
// $cleaned = StringUtils::delimiterReplaceCallback( '<', '>', $replacer->cb(), $text );
|
|
||||||
|
|
||||||
// Explode, then put the replaced separators back in
|
|
||||||
// $cleaned = str_replace( $search, $replace, $cleaned );
|
|
||||||
// $text = str_replace( $placeholder, $search, $cleaned );
|
|
||||||
|
|
||||||
// if same length find / repl, do in-place replacement; EX: "!!" -> "||"
|
|
||||||
int find_len = find.length;
|
|
||||||
int repl_len = repl.length;
|
|
||||||
if (find_len != repl_len) throw Err_.new_wo_type("find and repl should be same length");
|
|
||||||
|
|
||||||
byte find_0 = find[0];
|
|
||||||
byte dlm_bgn = Byte_ascii.Angle_bgn;
|
|
||||||
byte dlm_end = Byte_ascii.Angle_end;
|
|
||||||
boolean repl_active = true;
|
|
||||||
|
|
||||||
// loop every char in array
|
|
||||||
for (int i = src_bgn; i < src_end; i++) {
|
|
||||||
byte b = src[i];
|
|
||||||
if ( b == find_0
|
|
||||||
&& Bry_.Match(src, i + 1, i + find_len, find, 1, find_len)
|
|
||||||
&& repl_active
|
|
||||||
) {
|
|
||||||
Bry_.Set(src, i, i + find_len, repl);
|
|
||||||
}
|
|
||||||
else if (b == dlm_bgn) {
|
|
||||||
repl_active = false;
|
|
||||||
}
|
|
||||||
else if (b == dlm_end) {
|
|
||||||
repl_active = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
@ -164,7 +164,7 @@ public class Xomw_link_renderer {
|
|||||||
|
|
||||||
// Merge the custom attribs with the default ones, and iterate
|
// Merge the custom attribs with the default ones, and iterate
|
||||||
// over that, deleting all "false" attributes.
|
// over that, deleting all "false" attributes.
|
||||||
sanitizer.Merge_attributes(src, trg);
|
sanitizer.mergeAttributes(src, trg);
|
||||||
|
|
||||||
// XO.MW:MW removes "false" values; XO removes "null" values
|
// XO.MW:MW removes "false" values; XO removes "null" values
|
||||||
boolean deleted = false;
|
boolean deleted = false;
|
||||||
|
@ -1244,7 +1244,7 @@ public class XomwParser implements XomwParserIface {
|
|||||||
// $text = $this->replaceTransparentTags( $text );
|
// $text = $this->replaceTransparentTags( $text );
|
||||||
mStripState.unstripGeneral(pbfr);
|
mStripState.unstripGeneral(pbfr);
|
||||||
|
|
||||||
sanitizer.Normalize_char_references(pbfr);
|
sanitizer.normalizeCharReferences(pbfr);
|
||||||
|
|
||||||
// if ( MWTidy::isEnabled() ) {
|
// if ( MWTidy::isEnabled() ) {
|
||||||
// if ( $this->mOptions->getTidy() ) {
|
// if ( $this->mOptions->getTidy() ) {
|
||||||
@ -4605,20 +4605,11 @@ public class XomwParser implements XomwParserIface {
|
|||||||
// that are later expanded to html- so expand them now and
|
// that are later expanded to html- so expand them now and
|
||||||
// remove the tags
|
// remove the tags
|
||||||
tooltip = this.mStripState.unstripBoth(tooltip);
|
tooltip = this.mStripState.unstripBoth(tooltip);
|
||||||
// tooltip = Sanitizer::stripAllTags( tooltip );
|
tooltip = sanitizer.stripAllTags(tooltip);
|
||||||
|
|
||||||
return tooltip;
|
return tooltip;
|
||||||
}
|
}
|
||||||
// protected function stripAltText($caption, $holders) {
|
|
||||||
// # make sure there are no placeholders in thumbnail attributes
|
|
||||||
// # that are later expanded to html- so expand them now and
|
|
||||||
// # remove the tags
|
|
||||||
// $tooltip = this.mStripState->unstripBoth($tooltip);
|
|
||||||
// $tooltip = Sanitizer::stripAllTags($tooltip);
|
|
||||||
//
|
|
||||||
// return $tooltip;
|
|
||||||
// }
|
|
||||||
//
|
|
||||||
// /**
|
// /**
|
||||||
// * Set a flag in the output Object indicating that the content is dynamic and
|
// * Set a flag in the output Object indicating that the content is dynamic and
|
||||||
// * shouldn't be cached.
|
// * shouldn't be cached.
|
||||||
|
@ -200,7 +200,7 @@ public class Xomw_parser implements XomwParserIface {
|
|||||||
// $text = $this->replaceTransparentTags( $text );
|
// $text = $this->replaceTransparentTags( $text );
|
||||||
strip_state.unstripGeneral(pbfr);
|
strip_state.unstripGeneral(pbfr);
|
||||||
|
|
||||||
sanitizer.Normalize_char_references(pbfr);
|
sanitizer.normalizeCharReferences(pbfr);
|
||||||
|
|
||||||
// if ( MWTidy::isEnabled() ) {
|
// if ( MWTidy::isEnabled() ) {
|
||||||
// if ( $this->mOptions->getTidy() ) {
|
// if ( $this->mOptions->getTidy() ) {
|
||||||
|
@ -203,7 +203,7 @@ public class Xomw_lnke_wkr {// THREAD.UNSAFE: caching for repeated calls
|
|||||||
// $text = $this->getConverterLanguage()->markNoConversion( $text );
|
// $text = $this->getConverterLanguage()->markNoConversion( $text );
|
||||||
|
|
||||||
byte[] url = Bry_.Mid(src, url_bgn, url_end);
|
byte[] url = Bry_.Mid(src, url_bgn, url_end);
|
||||||
url = sanitizer.Clean_url(url);
|
url = sanitizer.cleanUrl(url);
|
||||||
|
|
||||||
bfr.Add_mid(src, prv, lnke_bgn);
|
bfr.Add_mid(src, prv, lnke_bgn);
|
||||||
prv = cur;
|
prv = cur;
|
||||||
|
@ -472,7 +472,7 @@ public class Xomw_lnki_wkr {// THREAD.UNSAFE: caching for repeated calls
|
|||||||
// * text-bottom
|
// * text-bottom
|
||||||
|
|
||||||
// Protect LanguageConverter markup when splitting into parts
|
// Protect LanguageConverter markup when splitting into parts
|
||||||
byte[][] parts = Xomw_string_utils.Delimiter_explode(tmp_list, trv, options_at_link);
|
byte[][] parts = XomwStringUtils.delimiterExplode(tmp_list, trv, options_at_link);
|
||||||
|
|
||||||
// Give extensions a chance to select the file revision for us
|
// Give extensions a chance to select the file revision for us
|
||||||
// $options = [];
|
// $options = [];
|
||||||
|
@ -252,7 +252,7 @@ public class Xomw_magiclinks_wkr {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
url = sanitizer.Clean_url(url);
|
url = sanitizer.cleanUrl(url);
|
||||||
|
|
||||||
// XO.MW.UNSUPPORTED.NON-WMF: not supporting images from freefrom url; (EX: "http://a.org/image.png" -> "<img>"); haven't seen this used on WMF wikis
|
// XO.MW.UNSUPPORTED.NON-WMF: not supporting images from freefrom url; (EX: "http://a.org/image.png" -> "<img>"); haven't seen this used on WMF wikis
|
||||||
// Is this an external image?
|
// Is this an external image?
|
||||||
|
@ -107,7 +107,7 @@ public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U
|
|||||||
for (int j = 0; j < indent_level; j++)
|
for (int j = 0; j < indent_level; j++)
|
||||||
tmp.Add(Html__dl__bgn);
|
tmp.Add(Html__dl__bgn);
|
||||||
tmp.Add_str_a7("<table");
|
tmp.Add_str_a7("<table");
|
||||||
sanitizer.Fix_tag_attributes(tmp, Name__table, tblw_atrs);
|
sanitizer.fixTagAttributes(tmp, Name__table, tblw_atrs);
|
||||||
tmp.Add_byte(Byte_ascii.Angle_end);
|
tmp.Add_byte(Byte_ascii.Angle_end);
|
||||||
out_line = tmp.To_bry_and_clear();
|
out_line = tmp.To_bry_and_clear();
|
||||||
td_history.Add(false);
|
td_history.Add(false);
|
||||||
@ -150,7 +150,7 @@ public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U
|
|||||||
|
|
||||||
// Whats after the tag is now only attributes
|
// Whats after the tag is now only attributes
|
||||||
byte[] atrs = strip_state.unstripBoth(line);
|
byte[] atrs = strip_state.unstripBoth(line);
|
||||||
sanitizer.Fix_tag_attributes(tmp, Name__tr, atrs);
|
sanitizer.fixTagAttributes(tmp, Name__tr, atrs);
|
||||||
atrs = tmp.To_bry_and_clear();
|
atrs = tmp.To_bry_and_clear();
|
||||||
|
|
||||||
Php_ary_.Pop_bry_or_null(tr_attributes);
|
Php_ary_.Pop_bry_or_null(tr_attributes);
|
||||||
@ -188,7 +188,7 @@ public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U
|
|||||||
|
|
||||||
// Implies both are valid for table headings.
|
// Implies both are valid for table headings.
|
||||||
if (first_char == Byte_ascii.Bang) {
|
if (first_char == Byte_ascii.Bang) {
|
||||||
Xomw_string_utils.Replace_markup(line, 0, line.length, Wtxt__th2, Wtxt__td2); // $line = StringUtils::replaceMarkup('!!', '||', $line);
|
XomwStringUtils.replaceMarkup(line, 0, line.length, Wtxt__th2, Wtxt__td2); // $line = StringUtils::replaceMarkup('!!', '||', $line);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Split up multiple cells on the same line.
|
// Split up multiple cells on the same line.
|
||||||
@ -253,7 +253,7 @@ public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U
|
|||||||
else {
|
else {
|
||||||
byte[] atrs = strip_state.unstripBoth(cell_data_0);
|
byte[] atrs = strip_state.unstripBoth(cell_data_0);
|
||||||
tmp.Add(previous).Add_byte(Byte_ascii.Angle_bgn).Add(last_tag);
|
tmp.Add(previous).Add_byte(Byte_ascii.Angle_bgn).Add(last_tag);
|
||||||
sanitizer.Fix_tag_attributes(tmp, last_tag, atrs);
|
sanitizer.fixTagAttributes(tmp, last_tag, atrs);
|
||||||
tmp.Add_byte(Byte_ascii.Angle_end).Add(cell_data_1);
|
tmp.Add_byte(Byte_ascii.Angle_end).Add(cell_data_1);
|
||||||
cell = tmp.To_bry_and_clear();
|
cell = tmp.To_bry_and_clear();
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user