mirror of
https://github.com/gnosygnu/xowa.git
synced 2024-10-27 20:34:16 +00:00
Mw_parse: Add clean_url and associated functions to sanitizer
This commit is contained in:
parent
c77e8a4374
commit
9a5c70b506
@ -40,7 +40,7 @@ public class Byte_ascii {
|
|||||||
, Ltr_n = 110, Ltr_o = 111, Ltr_p = 112, Ltr_q = 113, Ltr_r = 114
|
, Ltr_n = 110, Ltr_o = 111, Ltr_p = 112, Ltr_q = 113, Ltr_r = 114
|
||||||
, Ltr_s = 115, Ltr_t = 116, Ltr_u = 117, Ltr_v = 118, Ltr_w = 119
|
, Ltr_s = 115, Ltr_t = 116, Ltr_u = 117, Ltr_v = 118, Ltr_w = 119
|
||||||
, Ltr_x = 120, Ltr_y = 121, Ltr_z = 122, Curly_bgn = 123, Pipe = 124
|
, Ltr_x = 120, Ltr_y = 121, Ltr_z = 122, Curly_bgn = 123, Pipe = 124
|
||||||
, Curly_end = 125, Tilde = 126
|
, Curly_end = 125, Tilde = 126, Delete = 127
|
||||||
;
|
;
|
||||||
public static final byte
|
public static final byte
|
||||||
Angle_bgn = Lt, Angle_end = Gt
|
Angle_bgn = Lt, Angle_end = Gt
|
||||||
|
@ -117,6 +117,14 @@ public class Btrie_slim_mgr implements Btrie_mgr {
|
|||||||
}
|
}
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
public Btrie_slim_mgr Add_many_bry(byte[]... ary) {
|
||||||
|
int len = ary.length;
|
||||||
|
for (int i = 0; i < len; i++) {
|
||||||
|
byte[] itm = ary[i];
|
||||||
|
Add_obj(itm, itm);
|
||||||
|
}
|
||||||
|
return this;
|
||||||
|
}
|
||||||
public Btrie_slim_mgr Add_many_int(int val, String... ary) {return Add_many_int(val, Bry_.Ary(ary));}
|
public Btrie_slim_mgr Add_many_int(int val, String... ary) {return Add_many_int(val, Bry_.Ary(ary));}
|
||||||
public Btrie_slim_mgr Add_many_int(int val, byte[]... ary) {
|
public Btrie_slim_mgr Add_many_int(int val, byte[]... ary) {
|
||||||
int len = ary.length;
|
int len = ary.length;
|
||||||
|
40
400_xowa/src/gplx/core/brys/Bry_tmp.java
Normal file
40
400_xowa/src/gplx/core/brys/Bry_tmp.java
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
/*
|
||||||
|
XOWA: the XOWA Offline Wiki Application
|
||||||
|
Copyright (C) 2012 gnosygnu@gmail.com
|
||||||
|
|
||||||
|
This program is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU Affero General Public License as
|
||||||
|
published by the Free Software Foundation, either version 3 of the
|
||||||
|
License, or (at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU Affero General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Affero General Public License
|
||||||
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
package gplx.core.brys; import gplx.*; import gplx.core.*;
|
||||||
|
public class Bry_tmp {
|
||||||
|
public byte[] src;
|
||||||
|
public int src_bgn;
|
||||||
|
public int src_end;
|
||||||
|
public boolean dirty;
|
||||||
|
public Bry_tmp Init(byte[] src, int src_bgn, int src_end) {
|
||||||
|
this.dirty = false;
|
||||||
|
this.src = src;
|
||||||
|
this.src_bgn = src_bgn;
|
||||||
|
this.src_end = src_end;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
public void Set_by_bfr(Bry_bfr bfr) {
|
||||||
|
dirty = true;
|
||||||
|
src = bfr.To_bry_and_clear();
|
||||||
|
src_bgn = 0;
|
||||||
|
src_end = src.length;
|
||||||
|
}
|
||||||
|
public void Add_to_bfr(Bry_bfr bfr) {
|
||||||
|
bfr.Add_mid(src, src_bgn, src_end);
|
||||||
|
}
|
||||||
|
}
|
@ -62,6 +62,12 @@ public class Gfo_url_encoder_ {
|
|||||||
return new Gfo_url_encoder_mkr().Init(Byte_ascii.Percent).Init_common(Bool_.N)
|
return new Gfo_url_encoder_mkr().Init(Byte_ascii.Percent).Init_common(Bool_.N)
|
||||||
.Init__diff__one(Byte_ascii.Space, Byte_ascii.Plus);
|
.Init__diff__one(Byte_ascii.Space, Byte_ascii.Plus);
|
||||||
}
|
}
|
||||||
|
public static Gfo_url_encoder_mkr New__php_urlencode() {
|
||||||
|
// equivalent to php's urlencode; http://php.net/manual/en/function.urlencode.php;
|
||||||
|
// "Returns a String in which all non-alphanumeric characters except -_. have been replaced with a percent (%) sign followed by two hex digits and spaces encoded as plus (+) signs"
|
||||||
|
return new Gfo_url_encoder_mkr().Init(Byte_ascii.Percent).Init_common(Bool_.Y)
|
||||||
|
.Init__diff__one(Byte_ascii.Space, Byte_ascii.Plus);
|
||||||
|
}
|
||||||
private static Gfo_url_encoder_mkr New__http_url_ttl() {
|
private static Gfo_url_encoder_mkr New__http_url_ttl() {
|
||||||
return new Gfo_url_encoder_mkr().Init(Byte_ascii.Percent).Init_common(Bool_.Y);
|
return new Gfo_url_encoder_mkr().Init(Byte_ascii.Percent).Init_common(Bool_.Y);
|
||||||
}
|
}
|
||||||
@ -103,5 +109,6 @@ public class Gfo_url_encoder_ {
|
|||||||
, Http_url = Gfo_url_encoder_.New__http_url().Make()
|
, Http_url = Gfo_url_encoder_.New__http_url().Make()
|
||||||
, Http_url_ttl = Gfo_url_encoder_.New__http_url_ttl().Make()
|
, Http_url_ttl = Gfo_url_encoder_.New__http_url_ttl().Make()
|
||||||
, Mw_ttl = Gfo_url_encoder_.New__mw_ttl().Make()
|
, Mw_ttl = Gfo_url_encoder_.New__mw_ttl().Make()
|
||||||
|
, Php_urlencode = Gfo_url_encoder_.New__php_urlencode().Make()
|
||||||
;
|
;
|
||||||
}
|
}
|
||||||
|
@ -16,7 +16,7 @@ You should have received a copy of the GNU Affero General Public License
|
|||||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
*/
|
*/
|
||||||
package gplx.langs.phps.utls; import gplx.*; import gplx.langs.*; import gplx.langs.phps.*;
|
package gplx.langs.phps.utls; import gplx.*; import gplx.langs.*; import gplx.langs.phps.*;
|
||||||
import gplx.core.btries.*;
|
import gplx.core.btries.*; import gplx.core.brys.*;
|
||||||
import gplx.core.primitives.*;
|
import gplx.core.primitives.*;
|
||||||
public class Php_preg_ {
|
public class Php_preg_ {
|
||||||
public static byte[][] Split(Int_list list, byte[] src, int src_bgn, int src_end, byte[] dlm, boolean extend) {
|
public static byte[][] Split(Int_list list, byte[] src, int src_bgn, int src_end, byte[] dlm, boolean extend) {
|
||||||
@ -72,4 +72,41 @@ public class Php_preg_ {
|
|||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static void Replace(Bry_tmp bry, Bry_bfr tmp, Btrie_slim_mgr find_trie, Btrie_rv trv, byte[] repl_bry) {
|
||||||
|
byte[] src = bry.src;
|
||||||
|
int src_bgn = bry.src_bgn;
|
||||||
|
int src_end = bry.src_end;
|
||||||
|
|
||||||
|
int cur = src_bgn;
|
||||||
|
int prv = cur;
|
||||||
|
boolean dirty = false;
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
// eos
|
||||||
|
if (cur == src_end) {
|
||||||
|
if (dirty) {
|
||||||
|
tmp.Add_mid(src, prv, src_end);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
byte b = src[cur];
|
||||||
|
Object o = find_trie.Match_at_w_b0(trv, b, src, cur, src_end);
|
||||||
|
if (o == null) {
|
||||||
|
cur += gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(b);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
dirty = true;
|
||||||
|
tmp.Add_mid(src, prv, cur);
|
||||||
|
tmp.Add(repl_bry);
|
||||||
|
cur = trv.Pos();
|
||||||
|
prv = cur;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (dirty) {
|
||||||
|
bry.Set_by_bfr(tmp);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -44,7 +44,7 @@ public class Php_str_ {
|
|||||||
if (max == -1) max = src_len;
|
if (max == -1) max = src_len;
|
||||||
int rv = 0;
|
int rv = 0;
|
||||||
for (int i = bgn; i < src_len; i++) {
|
for (int i = bgn; i < src_len; i++) {
|
||||||
if (find[src[i]] && rv < max)
|
if (find[src[i] & 0xFF] && rv < max) // PATCH.JAVA:need to convert to unsigned byte
|
||||||
rv++;
|
rv++;
|
||||||
else
|
else
|
||||||
break;
|
break;
|
||||||
@ -94,7 +94,7 @@ public class Php_str_ {
|
|||||||
if (max == -1) max = Int_.Max_value;
|
if (max == -1) max = Int_.Max_value;
|
||||||
int rv = 0;
|
int rv = 0;
|
||||||
for (int i = bgn - 1; i > -1; i--) {
|
for (int i = bgn - 1; i > -1; i--) {
|
||||||
if (find[src[i]] && rv < max)
|
if (find[src[i] & 0xFF] && rv < max)	// PATCH.JAVA:need to convert to unsigned byte; the mask must apply to the byte value src[i], not to the index i
|
||||||
rv++;
|
rv++;
|
||||||
else
|
else
|
||||||
break;
|
break;
|
||||||
|
@ -16,65 +16,123 @@ You should have received a copy of the GNU Affero General Public License
|
|||||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
*/
|
*/
|
||||||
package gplx.xowa.mws; import gplx.*; import gplx.xowa.*;
|
package gplx.xowa.mws; import gplx.*; import gplx.xowa.*;
|
||||||
import gplx.core.encoders.*; import gplx.core.primitives.*; import gplx.langs.htmls.entitys.*;
|
import gplx.core.brys.*; import gplx.core.btries.*; import gplx.core.encoders.*; import gplx.core.primitives.*; import gplx.langs.htmls.entitys.*;
|
||||||
import gplx.xowa.parsers.htmls.*;
|
import gplx.xowa.parsers.htmls.*;
|
||||||
import gplx.xowa.mws.parsers.*;
|
import gplx.xowa.mws.parsers.*; import gplx.langs.phps.utls.*;
|
||||||
public class Xomw_sanitizer {
|
public class Xomw_sanitizer {
|
||||||
private final Mwh_doc_wkr__atr_bldr atr_bldr = new Mwh_doc_wkr__atr_bldr();
|
private final Mwh_doc_wkr__atr_bldr atr_bldr = new Mwh_doc_wkr__atr_bldr();
|
||||||
private final Mwh_atr_parser atr_parser = new Mwh_atr_parser();
|
private final Mwh_atr_parser atr_parser = new Mwh_atr_parser();
|
||||||
// static function cleanUrl($url) {
|
private final Xomw_regex_escape_invalid regex_clean_url = new Xomw_regex_escape_invalid();
|
||||||
// // Normalize any HTML entities in input. They will be
|
private final Xomw_regex_find_domain regex_find_domain = new Xomw_regex_find_domain();
|
||||||
// // re-escaped by makeExternalLink().
|
private final Xomw_regex_ipv6_brack regex_ipv6_brack = new Xomw_regex_ipv6_brack();
|
||||||
// $url = Sanitizer::decodeCharReferences($url);
|
private final Bry_tmp tmp_host = new Bry_tmp();
|
||||||
//
|
private final Bry_bfr tmp_bfr = Bry_bfr_.New();
|
||||||
// // Escape any control characters introduced by the above step
|
private final Btrie_rv trv = new Btrie_rv();
|
||||||
// $url = preg_replace_callback('/[\][<>"\\x00-\\x20\\x7F\|]/',
|
private final Xomw_regex_url_char_cbk__normalize normalize_cbk;
|
||||||
// [ __CLASS__, 'cleanUrlCallback' ], $url);
|
private final Xomw_regex_url_char_cbk__decode decode_cbk;
|
||||||
//
|
|
||||||
// // Validate hostname portion
|
private static Xomw_regex_url_char regex_url_char;
|
||||||
// $matches = [];
|
private static Btrie_slim_mgr invalid_idn_trie;
|
||||||
// if (preg_match('!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches)) {
|
public Xomw_sanitizer() {
|
||||||
// list(/* $whole */, $protocol, $host, $rest) = $matches;
|
this.normalize_cbk = new Xomw_regex_url_char_cbk__normalize(this);
|
||||||
//
|
this.decode_cbk = new Xomw_regex_url_char_cbk__decode(this);
|
||||||
// // Characters that will be ignored in IDNs.
|
if (regex_url_char == null) {
|
||||||
// // https://tools.ietf.org/html/rfc3454#section-3.1
|
synchronized (Type_adp_.ClassOf_obj(this)) {
|
||||||
// // Strip them before further processing so blacklists and such work.
|
regex_url_char = new Xomw_regex_url_char();
|
||||||
// $strip = "/
|
|
||||||
// \\s| // general whitespace
|
// Characters that will be ignored in IDNs.
|
||||||
// \xc2\xad| // 00ad SOFT HYPHEN
|
// https://tools.ietf.org/html/rfc3454#section-3.1
|
||||||
// \xe1\xa0\x86| // 1806 MONGOLIAN TODO SOFT HYPHEN
|
// $strip = "/
|
||||||
// \xe2\x80\x8b| // 200b ZERO WIDTH SPACE
|
// \\s| // general whitespace
|
||||||
// \xe2\x81\xa0| // 2060 WORD JOINER
|
// \xc2\xad| // 00ad SOFT HYPHEN
|
||||||
// \xef\xbb\xbf| // feff ZERO WIDTH NO-BREAK SPACE
|
// \xe1\xa0\x86| // 1806 MONGOLIAN TODO SOFT HYPHEN
|
||||||
// \xcd\x8f| // 034f COMBINING GRAPHEME JOINER
|
// \xe2\x80\x8b| // 200b ZERO WIDTH SPACE
|
||||||
// \xe1\xa0\x8b| // 180b MONGOLIAN FREE VARIATION SELECTOR ONE
|
// \xe2\x81\xa0| // 2060 WORD JOINER
|
||||||
// \xe1\xa0\x8c| // 180c MONGOLIAN FREE VARIATION SELECTOR TWO
|
// \xef\xbb\xbf| // feff ZERO WIDTH NO-BREAK SPACE
|
||||||
// \xe1\xa0\x8d| // 180d MONGOLIAN FREE VARIATION SELECTOR THREE
|
// \xcd\x8f| // 034f COMBINING GRAPHEME JOINER
|
||||||
// \xe2\x80\x8c| // 200c ZERO WIDTH NON-JOINER
|
// \xe1\xa0\x8b| // 180b MONGOLIAN FREE VARIATION SELECTOR ONE
|
||||||
// \xe2\x80\x8d| // 200d ZERO WIDTH JOINER
|
// \xe1\xa0\x8c| // 180c MONGOLIAN FREE VARIATION SELECTOR TWO
|
||||||
// [\xef\xb8\x80-\xef\xb8\x8f] // fe00-fe0f VARIATION SELECTOR-1-16
|
// \xe1\xa0\x8d| // 180d MONGOLIAN FREE VARIATION SELECTOR THREE
|
||||||
// /xuD";
|
// \xe2\x80\x8c| // 200c ZERO WIDTH NON-JOINER
|
||||||
//
|
// \xe2\x80\x8d| // 200d ZERO WIDTH JOINER
|
||||||
// $host = preg_replace($strip, '', $host);
|
// [\xef\xb8\x80-\xef\xb8\x8f] // fe00-fe0f VARIATION SELECTOR-1-16
|
||||||
//
|
// /xuD";
|
||||||
// // IPv6 host names are bracketed with []. Url-decode these.
|
// XO.MW.REGEX:http://php.net/manual/en/reference.pcre.pattern.modifiers.php
|
||||||
// if (substr_compare("//%5B", $host, 0, 5) === 0 &&
|
// /x : ignore embedded ws
|
||||||
// preg_match('!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $matches)
|
// /u : enabled pcre utf8
|
||||||
// ) {
|
// /D : $ matches EOS, not NL
|
||||||
// $host = '//[' . $matches[1] . ']' . $matches[2];
|
invalid_idn_trie = Btrie_slim_mgr.cs()
|
||||||
// }
|
.Add_many_bry(new Xomw_regex_parser().Add_ary
|
||||||
//
|
( "\\s"
|
||||||
// // @todo FIXME: Validate hostnames here
|
, "\\xc2\\xad" // 00ad SOFT HYPHEN
|
||||||
//
|
, "\\xe1\\xa0\\x86" // 1806 MONGOLIAN TODO SOFT HYPHEN
|
||||||
// return $protocol . $host . $rest;
|
, "\\xe2\\x80\\x8b" // 200b ZERO WIDTH SPACE
|
||||||
// } else {
|
, "\\xe2\\x81\\xa0" // 2060 WORD JOINER
|
||||||
// return $url;
|
, "\\xef\\xbb\\xbf" // feff ZERO WIDTH NO-BREAK SPACE
|
||||||
// }
|
, "\\xcd\\x8f" // 034f COMBINING GRAPHEME JOINER
|
||||||
// }
|
, "\\xe1\\xa0\\x8b" // 180b MONGOLIAN FREE VARIATION SELECTOR ONE
|
||||||
//
|
, "\\xe1\\xa0\\x8c" // 180c MONGOLIAN FREE VARIATION SELECTOR TWO
|
||||||
// static function cleanUrlCallback($matches) {
|
, "\\xe1\\xa0\\x8d" // 180d MONGOLIAN FREE VARIATION SELECTOR THREE
|
||||||
// return urlencode($matches[0]);
|
, "\\xe2\\x80\\x8c" // 200c ZERO WIDTH NON-JOINER
|
||||||
// }
|
, "\\xe2\\x80\\x8d" // 200d ZERO WIDTH JOINER
|
||||||
|
)
|
||||||
|
.Add_rng
|
||||||
|
( "\\xef\\xb8\\x80", "\\xef\\xb8\\x8f" // fe00-fe0f VARIATION SELECTOR-1-16
|
||||||
|
)
|
||||||
|
.Rslt());
|
||||||
|
|
||||||
|
// assert static structs
|
||||||
|
if (html_entities == null) {
|
||||||
|
synchronized (Type_adp_.ClassOf_obj(this)) {
|
||||||
|
html_entities = Html_entities_new();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public byte[] Clean_url(byte[] url) {
|
||||||
|
// Normalize any HTML entities in input. They will be
|
||||||
|
// re-escaped by makeExternalLink().
|
||||||
|
url = Decode_char_references(null, Bool_.Y, url, 0, url.length);
|
||||||
|
|
||||||
|
// Escape any control characters introduced by the above step
|
||||||
|
// XO.MW.REGEX: $url = preg_replace_callback('/[\][<>"\\x00-\\x20\\x7F\|]/', [ __CLASS__, 'cleanUrlCallback' ], $url);
|
||||||
|
// '[]<>"' | '00 -> 32' | 127
|
||||||
|
if (regex_clean_url.Escape(tmp_bfr, url, 0, url.length))
|
||||||
|
url = tmp_bfr.To_bry_and_clear();
|
||||||
|
|
||||||
|
// XO.MW.REGEX: if (preg_match('!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches))
|
||||||
|
if (regex_find_domain.Match(url, 0, url.length)) {
|
||||||
|
// Characters that will be ignored in IDNs.
|
||||||
|
// https://tools.ietf.org/html/rfc3454#section-3.1
|
||||||
|
// Strip them before further processing so blacklists and such work.
|
||||||
|
Php_preg_.Replace(tmp_host.Init(url, regex_find_domain.host_bgn, regex_find_domain.host_end), tmp_bfr, invalid_idn_trie, trv, Bry_.Empty);
|
||||||
|
|
||||||
|
// IPv6 host names are bracketed with []. Url-decode these.
|
||||||
|
// if (substr_compare("//%5B", $host, 0, 5) === 0 &&
|
||||||
|
// preg_match('!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $matches)
|
||||||
|
// XO.MW.REGEX:
|
||||||
|
// !^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!
|
||||||
|
// "//%5B" + ("hex-dec" | [:.]) + "%5D" + numbers
|
||||||
|
// EX: [ABCD]:80:12
|
||||||
|
if (regex_ipv6_brack.Match(tmp_host.src, tmp_host.src_bgn, tmp_host.src_end)) {
|
||||||
|
tmp_bfr.Add_str_a7("//[").Add_mid(tmp_host.src, regex_ipv6_brack.host_bgn, regex_ipv6_brack.host_end)
|
||||||
|
.Add_byte(Byte_ascii.Brack_end).Add_mid(tmp_host.src, regex_ipv6_brack.segs_bgn, regex_ipv6_brack.segs_end);
|
||||||
|
tmp_host.Set_by_bfr(tmp_bfr);
|
||||||
|
}
|
||||||
|
|
||||||
|
// @todo FIXME: Validate hostnames here
|
||||||
|
|
||||||
|
tmp_bfr.Add_mid(url, regex_find_domain.prot_bgn, regex_find_domain.prot_end);
|
||||||
|
tmp_host.Add_to_bfr(tmp_bfr);
|
||||||
|
tmp_bfr.Add_mid(url, regex_find_domain.rest_bgn, regex_find_domain.rest_end);
|
||||||
|
return tmp_bfr.To_bry_and_clear();
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
return url;
|
||||||
|
}
|
||||||
|
}
|
||||||
public void Fix_tag_attributes(Bry_bfr bfr, byte[] tag_name, byte[] atrs) {
|
public void Fix_tag_attributes(Bry_bfr bfr, byte[] tag_name, byte[] atrs) {
|
||||||
atr_bldr.Atrs__clear();
|
atr_bldr.Atrs__clear();
|
||||||
atr_parser.Parse(atr_bldr, -1, -1, atrs, 0, atrs.length);
|
atr_parser.Parse(atr_bldr, -1, -1, atrs, 0, atrs.length);
|
||||||
@ -105,163 +163,13 @@ public class Xomw_sanitizer {
|
|||||||
Normalize_char_references(bfr, Bool_.N, src, src_bgn, src_end);
|
Normalize_char_references(bfr, Bool_.N, src, src_bgn, src_end);
|
||||||
}
|
}
|
||||||
public byte[] Normalize_char_references(Bry_bfr bfr, boolean lone_bfr, byte[] src, int src_bgn, int src_end) {
|
public byte[] Normalize_char_references(Bry_bfr bfr, boolean lone_bfr, byte[] src, int src_bgn, int src_end) {
|
||||||
// assert static structs
|
return regex_url_char.Replace_by_cbk(bfr, lone_bfr, src, src_bgn, src_end, normalize_cbk);
|
||||||
if (Normalize__dec == null) {
|
}
|
||||||
synchronized (Xomw_sanitizer.class) {
|
public byte[] Decode_char_references(Bry_bfr bfr, boolean lone_bfr, byte[] src, int src_bgn, int src_end) {
|
||||||
html_entities = Html_entities_new();
|
return regex_url_char.Replace_by_cbk(bfr, lone_bfr, src, src_bgn, src_end, decode_cbk);
|
||||||
Normalize__dec = Bool_ary_bldr.New_u8().Set_rng(Byte_ascii.Num_0, Byte_ascii.Num_9).To_ary();
|
|
||||||
Normalize__hex = Bool_ary_bldr.New_u8()
|
|
||||||
.Set_rng(Byte_ascii.Num_0, Byte_ascii.Num_9)
|
|
||||||
.Set_rng(Byte_ascii.Ltr_A, Byte_ascii.Ltr_Z)
|
|
||||||
.Set_rng(Byte_ascii.Ltr_a, Byte_ascii.Ltr_z)
|
|
||||||
.To_ary();
|
|
||||||
Normalize__ent = Bool_ary_bldr.New_u8()
|
|
||||||
.Set_rng(Byte_ascii.Num_0, Byte_ascii.Num_9)
|
|
||||||
.Set_rng(Byte_ascii.Ltr_A, Byte_ascii.Ltr_Z)
|
|
||||||
.Set_rng(Byte_ascii.Ltr_a, Byte_ascii.Ltr_z)
|
|
||||||
.Set_rng(128, 255)
|
|
||||||
.To_ary();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// XO.BRY_BFR
|
|
||||||
boolean dirty = false;
|
|
||||||
int cur = src_bgn;
|
|
||||||
boolean called_by_bry = bfr == null;
|
|
||||||
|
|
||||||
while (true) {
|
|
||||||
// search for "&"
|
|
||||||
int find_bgn = Bry_find_.Find_fwd(src, Byte_ascii.Amp, cur);
|
|
||||||
if (find_bgn == Bry_find_.Not_found) { // "&" not found; exit
|
|
||||||
if (dirty)
|
|
||||||
bfr.Add_mid(src, cur, src_end);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
int ent_bgn = find_bgn + 1; // +1 to skip &
|
|
||||||
|
|
||||||
// get regex; (a) dec (	); (b) hex (ÿ); (c) entity (α);
|
|
||||||
boolean[] regex = null;
|
|
||||||
// check for #;
|
|
||||||
if (ent_bgn < src_end && src[ent_bgn] == Byte_ascii.Hash) {
|
|
||||||
ent_bgn++;
|
|
||||||
if (ent_bgn < src_end) {
|
|
||||||
byte nxt = src[ent_bgn];
|
|
||||||
// check for x
|
|
||||||
if (nxt == Byte_ascii.Ltr_X || nxt == Byte_ascii.Ltr_x) {
|
|
||||||
ent_bgn++;
|
|
||||||
regex = Normalize__hex;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (regex == null)
|
|
||||||
regex = Normalize__dec;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
regex = Normalize__ent;
|
|
||||||
}
|
|
||||||
|
|
||||||
// keep looping until invalid regex
|
|
||||||
int ent_end = ent_bgn;
|
|
||||||
byte b = Byte_ascii.Null;
|
|
||||||
for (int i = ent_bgn; i < src_end; i++) {
|
|
||||||
b = src[i];
|
|
||||||
if (regex[b])
|
|
||||||
ent_end++;
|
|
||||||
else
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
// mark dirty; can optimize later by checking if "<" already exists
|
|
||||||
dirty = true;
|
|
||||||
if (bfr == null) bfr = Bry_bfr_.New();
|
|
||||||
bfr.Add_mid(src, cur, find_bgn); // add everything before &
|
|
||||||
|
|
||||||
// invalid <- regex ended, but not at semic
|
|
||||||
if (b != Byte_ascii.Semic) {
|
|
||||||
bfr.Add(Gfh_entity_.Amp_bry); // transform "&" to "&"
|
|
||||||
cur = find_bgn + 1; // position after "&"
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// do normalization
|
|
||||||
byte[] name = Bry_.Mid(src, ent_bgn, ent_end);
|
|
||||||
boolean ret = false;
|
|
||||||
if (regex == Normalize__ent) {
|
|
||||||
Normalize_entity(bfr, name);
|
|
||||||
ret = true;
|
|
||||||
}
|
|
||||||
else if (regex == Normalize__dec) {
|
|
||||||
ret = Dec_char_reference(bfr, name);
|
|
||||||
}
|
|
||||||
else if (regex == Normalize__hex) {
|
|
||||||
ret = Hex_char_reference(bfr, name);
|
|
||||||
}
|
|
||||||
if (!ret) {
|
|
||||||
bfr.Add(Gfh_entity_.Amp_bry); // transform "&" to "&"
|
|
||||||
bfr.Add_bry_escape_html(src, find_bgn + 1, ent_end + 1); // "find_bgn + 1" to start after "&"; "ent_end + 1" to include ";"
|
|
||||||
}
|
|
||||||
|
|
||||||
cur = ent_end + 1; // +1 to position after ";"
|
|
||||||
}
|
|
||||||
|
|
||||||
// XO.BRY_BFR
|
|
||||||
if (dirty) {
|
|
||||||
if (called_by_bry)
|
|
||||||
return bfr.To_bry_and_clear();
|
|
||||||
else
|
|
||||||
return Bry_.Empty;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
if (called_by_bry) {
|
|
||||||
if (src_bgn == 0 && src_end == src.length)
|
|
||||||
return src;
|
|
||||||
else
|
|
||||||
return Bry_.Mid(src, src_bgn, src_end);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
if (lone_bfr)
|
|
||||||
bfr.Add_mid(src, src_bgn, src_end);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
|
public boolean Validate_codepoint(int codepoint) {
|
||||||
// return the equivalent numeric entity reference (except for the core <
|
|
||||||
// > & "). If the entity is a MediaWiki-specific alias, returns
|
|
||||||
// the HTML equivalent. Otherwise, returns HTML-escaped text of
|
|
||||||
// pseudo-entity source (eg &foo;)
|
|
||||||
private void Normalize_entity(Bry_bfr bfr, byte[] name) {
|
|
||||||
Object o = html_entities.Get_by_bry(name);
|
|
||||||
if (o == null) {
|
|
||||||
bfr.Add_str_a7("&").Add(name).Add_byte_semic();
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
Xomw_html_ent entity = (Xomw_html_ent)o;
|
|
||||||
bfr.Add(entity.html);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean Dec_char_reference(Bry_bfr bfr, byte[] codepoint) {
|
|
||||||
int point = Bry_.To_int_or(codepoint, -1);
|
|
||||||
if (Validate_codepoint(point)) {
|
|
||||||
bfr.Add_str_a7("&#").Add_int_variable(point).Add_byte_semic();
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean Hex_char_reference(Bry_bfr bfr, byte[] codepoint) {
|
|
||||||
int point = Hex_utl_.Parse_or(codepoint, -1);
|
|
||||||
if (Validate_codepoint(point)) {
|
|
||||||
bfr.Add_str_a7("&#x");
|
|
||||||
Hex_utl_.Write_bfr(bfr, Bool_.Y, point); // sprintf('&#x%x;', $point)
|
|
||||||
bfr.Add_byte_semic();
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean Validate_codepoint(int codepoint) {
|
|
||||||
// U+000C is valid in HTML5 but not allowed in XML.
|
// U+000C is valid in HTML5 but not allowed in XML.
|
||||||
// U+000D is valid in XML but not allowed in HTML5.
|
// U+000D is valid in XML but not allowed in HTML5.
|
||||||
// U+007F - U+009F are disallowed in HTML5 (control characters).
|
// U+007F - U+009F are disallowed in HTML5 (control characters).
|
||||||
@ -273,14 +181,13 @@ public class Xomw_sanitizer {
|
|||||||
|| (codepoint >= 0x10000 && codepoint <= 0x10ffff);
|
|| (codepoint >= 0x10000 && codepoint <= 0x10ffff);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static boolean[] Normalize__dec, Normalize__hex, Normalize__ent;
|
public static Hash_adp_bry html_entities;
|
||||||
private static Hash_adp_bry html_entities;
|
|
||||||
private static Hash_adp_bry Html_entities_new() {
|
private static Hash_adp_bry Html_entities_new() {
|
||||||
Bry_bfr tmp = Bry_bfr_.New();
|
Bry_bfr tmp = Bry_bfr_.New();
|
||||||
Hash_adp_bry rv = Hash_adp_bry.cs();
|
Hash_adp_bry rv = Hash_adp_bry.cs();
|
||||||
|
|
||||||
Html_entities_set(rv, Xomw_html_ent.Type__alias, -1, "רלמ", "‏");
|
Html_entities_set(rv, Xomw_html_ent.Type__alias, 8207, "רלמ", "‏");
|
||||||
Html_entities_set(rv, Xomw_html_ent.Type__alias, -1, "رلم", "‏");
|
Html_entities_set(rv, Xomw_html_ent.Type__alias, 8207, "رلم", "‏");
|
||||||
|
|
||||||
Html_entities_set(rv, Xomw_html_ent.Type__char, 60, "lt", "<");
|
Html_entities_set(rv, Xomw_html_ent.Type__char, 60, "lt", "<");
|
||||||
Html_entities_set(rv, Xomw_html_ent.Type__char, 62, "gt", ">");
|
Html_entities_set(rv, Xomw_html_ent.Type__char, 62, "gt", ">");
|
||||||
@ -568,3 +475,395 @@ class Xomw_html_ent {
|
|||||||
public final byte[] html;
|
public final byte[] html;
|
||||||
public static final byte Type__null = 0, Type__alias = 1, Type__char = 2, Type__entity = 3;
|
public static final byte Type__null = 0, Type__alias = 1, Type__char = 2, Type__entity = 3;
|
||||||
}
|
}
|
||||||
|
class Xomw_regex_find_domain {
|
||||||
|
public int prot_bgn;
|
||||||
|
public int prot_end;
|
||||||
|
public int host_bgn;
|
||||||
|
public int host_end;
|
||||||
|
public int rest_bgn;
|
||||||
|
public int rest_end;
|
||||||
|
public boolean Match(byte[] src, int src_bgn, int src_end) {
|
||||||
|
// Validate hostname portion
|
||||||
|
// XO.MW.REGEX: if (preg_match('!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches)) {
|
||||||
|
// ([^:]+:)(//[^/]+)?(.*)
|
||||||
|
// "protocol" + "host" + "rest"
|
||||||
|
// "protocol" -> ([^:]+:) EX: "https:" anything not-colon up to colon
|
||||||
|
// "host" -> (//[^/]+)? EX: "//abc/" anything not-slash up to slash
|
||||||
|
// "rest" -> (.*) EX: rest"
|
||||||
|
// /i : case-insensitive
|
||||||
|
// /D : $ matches EOS, not NL
|
||||||
|
|
||||||
|
// find prot; EX: "https:"
|
||||||
|
prot_bgn = src_bgn;
|
||||||
|
prot_end = Bry_find_.Move_fwd(src, Byte_ascii.Colon, prot_bgn, src_end);
|
||||||
|
// exit if not found
|
||||||
|
if (prot_end == Bry_find_.Not_found) return false;
|
||||||
|
|
||||||
|
// find host: EX: "//a.org"
|
||||||
|
host_bgn = prot_end;
|
||||||
|
int double_slash_end = host_bgn + 2;
|
||||||
|
// exit if eos
|
||||||
|
if (double_slash_end >= src_end) return false;
|
||||||
|
// exit if not "//"
|
||||||
|
if ( src[host_bgn ] != Byte_ascii.Slash
|
||||||
|
|| src[host_bgn + 1] != Byte_ascii.Slash
|
||||||
|
) return false;
|
||||||
|
host_end = Bry_find_.Find_fwd(src, Byte_ascii.Slash, double_slash_end, src_end);
|
||||||
|
// exit if not found
|
||||||
|
if (host_end == Bry_find_.Not_found) {
|
||||||
|
host_end = src_end;
|
||||||
|
rest_bgn = rest_end = -1;
|
||||||
|
}
|
||||||
|
// exit if only "//"
|
||||||
|
if (host_end - host_bgn == 2) return false;
|
||||||
|
|
||||||
|
// set rest
|
||||||
|
rest_bgn = host_end;
|
||||||
|
rest_end = src_end;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
class Xomw_regex_escape_invalid {
|
||||||
|
// [\][<>"\\x00-\\x20\\x7F\|]
|
||||||
|
public boolean Escape(Bry_bfr bfr, byte[] src, int src_bgn, int src_end) {
|
||||||
|
boolean dirty = false;
|
||||||
|
int cur = src_bgn;
|
||||||
|
int prv = cur;
|
||||||
|
while (true) {
|
||||||
|
// eos
|
||||||
|
if (cur == src_end) {
|
||||||
|
if (dirty) {
|
||||||
|
bfr.Add_mid(src, prv, src_end);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
boolean match = false;
|
||||||
|
byte b = src[cur];
|
||||||
|
switch (b) {
|
||||||
|
case Byte_ascii.Brack_bgn:
|
||||||
|
case Byte_ascii.Brack_end:
|
||||||
|
case Byte_ascii.Angle_bgn:
|
||||||
|
case Byte_ascii.Angle_end:
|
||||||
|
case Byte_ascii.Quote:
|
||||||
|
case Byte_ascii.Pipe:
|
||||||
|
case Byte_ascii.Delete:
|
||||||
|
match = true;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
if (b >= 0 && b <= 32)
|
||||||
|
match = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (match) {
|
||||||
|
bfr.Add_mid(src, prv, cur);
|
||||||
|
gplx.langs.htmls.encoders.Gfo_url_encoder_.Php_urlencode.Encode(bfr, src, cur, cur + 1);
|
||||||
|
dirty = true;
|
||||||
|
cur++;
|
||||||
|
prv = cur;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
cur++;
|
||||||
|
}
|
||||||
|
return dirty;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
class Xomw_regex_ipv6_brack {
|
||||||
|
public int host_bgn;
|
||||||
|
public int host_end;
|
||||||
|
public int segs_bgn;
|
||||||
|
public int segs_end;
|
||||||
|
private final byte[]
|
||||||
|
Bry__host_bgn = Bry_.new_a7("//%5B")
|
||||||
|
, Bry__host_end = Bry_.new_a7("%5D")
|
||||||
|
;
|
||||||
|
public boolean Match(byte[] src, int src_bgn, int src_end) {
|
||||||
|
// preg_match('!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $matches)
|
||||||
|
// XO.MW.REGEX:
|
||||||
|
// !^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!
|
||||||
|
// "//%5B" + ("hex-dec" | [:.]) + "%5D" + numbers
|
||||||
|
// EX: [ABCD]:80:12
|
||||||
|
host_bgn = src_bgn + Bry__host_bgn.length;
|
||||||
|
// exit if no match for "//%5B"
|
||||||
|
if (!Bry_.Match(src, src_bgn, host_bgn, Bry__host_bgn)) return false;
|
||||||
|
|
||||||
|
// skip all [0-9A-Fa-f:.]
|
||||||
|
host_end = host_bgn;
|
||||||
|
while (true) {
|
||||||
|
// exit if eos
|
||||||
|
if (host_end == src_end) return false;
|
||||||
|
boolean done = false;
|
||||||
|
byte b = src[host_end];
|
||||||
|
switch (b) {
|
||||||
|
case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4:
|
||||||
|
case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9:
|
||||||
|
case Byte_ascii.Ltr_A: case Byte_ascii.Ltr_B: case Byte_ascii.Ltr_C: case Byte_ascii.Ltr_D: case Byte_ascii.Ltr_E: case Byte_ascii.Ltr_F:
|
||||||
|
case Byte_ascii.Ltr_a: case Byte_ascii.Ltr_b: case Byte_ascii.Ltr_c: case Byte_ascii.Ltr_d: case Byte_ascii.Ltr_e: case Byte_ascii.Ltr_f:
|
||||||
|
case Byte_ascii.Colon:
|
||||||
|
case Byte_ascii.Dot:
|
||||||
|
host_end++;
|
||||||
|
break;
|
||||||
|
case Byte_ascii.Percent:
|
||||||
|
// matches "%5D"
|
||||||
|
segs_bgn = host_end + Bry__host_end.length;
|
||||||
|
if ( Bry_.Match(src, host_end, segs_bgn, Bry__host_end)
|
||||||
|
&& host_end - host_bgn > 0) // host can't be 0-len; EX: "//%5B%5D"
|
||||||
|
done = true;
|
||||||
|
// exit if no match
|
||||||
|
else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
// exit if no match
|
||||||
|
default: {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (done) break;
|
||||||
|
}
|
||||||
|
// skip all (:\d+)
|
||||||
|
segs_end = segs_bgn;
|
||||||
|
while (true) {
|
||||||
|
// stop if eos
|
||||||
|
if (segs_end == src_end) return true;
|
||||||
|
|
||||||
|
// check if ":"
|
||||||
|
if (src[segs_end] == Byte_ascii.Colon) {
|
||||||
|
int num_bgn = segs_end + 1;
|
||||||
|
int num_end = Bry_find_.Find_fwd_while_num(src, num_bgn, src_end);
|
||||||
|
// exit if no nums found; EX:"[ABC]:80:"
|
||||||
|
if (num_end == num_bgn) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
segs_end = num_end;
|
||||||
|
}
|
||||||
|
// exit if seg doesn't start with ":"
|
||||||
|
else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
interface Xomw_regex_url_char_cbk {
|
||||||
|
boolean When_ent(Bry_bfr bfr, byte[] name);
|
||||||
|
boolean When_dec(Bry_bfr bfr, byte[] name);
|
||||||
|
boolean When_hex(Bry_bfr bfr, byte[] name);
|
||||||
|
boolean When_amp(Bry_bfr bfr);
|
||||||
|
}
|
||||||
|
class Xomw_regex_url_char_cbk__normalize implements Xomw_regex_url_char_cbk {
|
||||||
|
private final Xomw_sanitizer sanitizer;
|
||||||
|
public Xomw_regex_url_char_cbk__normalize(Xomw_sanitizer sanitizer) {
|
||||||
|
this.sanitizer = sanitizer;
|
||||||
|
}
|
||||||
|
public boolean When_ent(Bry_bfr bfr, byte[] name) { // XO.MW:normalizeEntity
|
||||||
|
// If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
|
||||||
|
// return the equivalent numeric entity reference (except for the core <
|
||||||
|
// > & "). If the entity is a MediaWiki-specific alias, returns
|
||||||
|
// the HTML equivalent. Otherwise, returns HTML-escaped text of
|
||||||
|
// pseudo-entity source (eg &foo;)
|
||||||
|
Object o = Xomw_sanitizer.html_entities.Get_by_bry(name);
|
||||||
|
if (o == null) {
|
||||||
|
bfr.Add_str_a7("&").Add(name).Add_byte_semic();
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
Xomw_html_ent entity = (Xomw_html_ent)o;
|
||||||
|
bfr.Add(entity.html);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
public boolean When_dec(Bry_bfr bfr, byte[] name) { // XO.MW:decCharReference
|
||||||
|
int point = Bry_.To_int_or(name, -1);
|
||||||
|
if (sanitizer.Validate_codepoint(point)) {
|
||||||
|
bfr.Add_str_a7("&#").Add_int_variable(point).Add_byte_semic();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
public boolean When_hex(Bry_bfr bfr, byte[] name) { // XO.MW:hexCharReference
|
||||||
|
int point = Hex_utl_.Parse_or(name, -1);
|
||||||
|
if (sanitizer.Validate_codepoint(point)) {
|
||||||
|
bfr.Add_str_a7("&#x");
|
||||||
|
Hex_utl_.Write_bfr(bfr, Bool_.Y, point); // sprintf('&#x%x;', $point)
|
||||||
|
bfr.Add_byte_semic();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
public boolean When_amp(Bry_bfr bfr) {
|
||||||
|
bfr.Add(Gfh_entity_.Amp_bry); // transform "&" to "&"
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
class Xomw_regex_url_char_cbk__decode implements Xomw_regex_url_char_cbk {
|
||||||
|
private final Xomw_sanitizer sanitizer;
|
||||||
|
public Xomw_regex_url_char_cbk__decode(Xomw_sanitizer sanitizer) {
|
||||||
|
this.sanitizer = sanitizer;
|
||||||
|
}
|
||||||
|
public boolean When_ent(Bry_bfr bfr, byte[] name) {// XO.MW:decodeEntity
|
||||||
|
// If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
|
||||||
|
// return the UTF-8 encoding of that character. Otherwise, returns
|
||||||
|
// pseudo-entity source (eg "&foo;")
|
||||||
|
Object o = Xomw_sanitizer.html_entities.Get_by_bry(name);
|
||||||
|
if (o == null) {
|
||||||
|
bfr.Add_byte(Byte_ascii.Amp).Add(name).Add_byte_semic();
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
Xomw_html_ent entity = (Xomw_html_ent)o;
|
||||||
|
bfr.Add(gplx.core.intls.Utf16_.Encode_int_to_bry(entity.code));
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
public boolean When_dec(Bry_bfr bfr, byte[] name) {
|
||||||
|
return Decode_char(bfr, Bry_.To_int(name));
|
||||||
|
}
|
||||||
|
public boolean When_hex(Bry_bfr bfr, byte[] name) {
|
||||||
|
return Decode_char(bfr, gplx.core.encoders.Hex_utl_.Parse_or(name, 0, name.length, -1));
|
||||||
|
}
|
||||||
|
public boolean When_amp(Bry_bfr bfr) {
|
||||||
|
bfr.Add_byte(Byte_ascii.Amp);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
private boolean Decode_char(Bry_bfr bfr, int point) {// XO.MW:decodeChar
|
||||||
|
// Return UTF-8 String for a codepoint if that is a valid
|
||||||
|
// character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
|
||||||
|
if (sanitizer.Validate_codepoint(point)) {
|
||||||
|
bfr.Add(gplx.core.intls.Utf16_.Encode_int_to_bry(point));
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
bfr.Add(Utf8_replacement_char);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
private static final byte[] Utf8_replacement_char = Bry_.New_by_ints(255, 253); // 0xfffd
|
||||||
|
}
|
||||||
|
class Xomw_regex_url_char {
|
||||||
|
// Regular expression to match various types of character references in
|
||||||
|
// Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
|
||||||
|
// static final CHAR_REFS_REGEX =
|
||||||
|
// '/&([A-Za-z0-9\x80-\xff]+);
|
||||||
|
// |&\#([0-9]+);
|
||||||
|
// |&\#[xX]([0-9A-Fa-f]+);
|
||||||
|
// |(&)/x';
|
||||||
|
public Xomw_regex_url_char() {
|
||||||
|
// assert static structs
|
||||||
|
if (Normalize__dec == null) {
|
||||||
|
synchronized (Xomw_sanitizer.class) {
|
||||||
|
Normalize__dec = Bool_ary_bldr.New_u8().Set_rng(Byte_ascii.Num_0, Byte_ascii.Num_9).To_ary();
|
||||||
|
Normalize__hex = Bool_ary_bldr.New_u8()
|
||||||
|
.Set_rng(Byte_ascii.Num_0, Byte_ascii.Num_9)
|
||||||
|
.Set_rng(Byte_ascii.Ltr_A, Byte_ascii.Ltr_Z)
|
||||||
|
.Set_rng(Byte_ascii.Ltr_a, Byte_ascii.Ltr_z)
|
||||||
|
.To_ary();
|
||||||
|
Normalize__ent = Bool_ary_bldr.New_u8()
|
||||||
|
.Set_rng(Byte_ascii.Num_0, Byte_ascii.Num_9)
|
||||||
|
.Set_rng(Byte_ascii.Ltr_A, Byte_ascii.Ltr_Z)
|
||||||
|
.Set_rng(Byte_ascii.Ltr_a, Byte_ascii.Ltr_z)
|
||||||
|
.Set_rng(128, 255)
|
||||||
|
.To_ary();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
public byte[] Replace_by_cbk(Bry_bfr bfr, boolean lone_bfr, byte[] src, int src_bgn, int src_end, Xomw_regex_url_char_cbk cbk) {
|
||||||
|
// XO.BRY_BFR
|
||||||
|
boolean dirty = false;
|
||||||
|
int cur = src_bgn;
|
||||||
|
boolean called_by_bry = bfr == null;
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
// search for "&"
|
||||||
|
int find_bgn = Bry_find_.Find_fwd(src, Byte_ascii.Amp, cur);
|
||||||
|
if (find_bgn == Bry_find_.Not_found) { // "&" not found; exit
|
||||||
|
if (dirty)
|
||||||
|
bfr.Add_mid(src, cur, src_end);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
int ent_bgn = find_bgn + 1; // +1 to skip &
|
||||||
|
|
||||||
|
// get regex; (a) dec (	); (b) hex (ÿ); (c) entity (α);
|
||||||
|
boolean[] regex = null;
|
||||||
|
// check for #;
|
||||||
|
if (ent_bgn < src_end && src[ent_bgn] == Byte_ascii.Hash) {
|
||||||
|
ent_bgn++;
|
||||||
|
if (ent_bgn < src_end) {
|
||||||
|
byte nxt = src[ent_bgn];
|
||||||
|
// check for x
|
||||||
|
if (nxt == Byte_ascii.Ltr_X || nxt == Byte_ascii.Ltr_x) {
|
||||||
|
ent_bgn++;
|
||||||
|
regex = Normalize__hex;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (regex == null)
|
||||||
|
regex = Normalize__dec;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
regex = Normalize__ent;
|
||||||
|
}
|
||||||
|
|
||||||
|
// keep looping until invalid regex
|
||||||
|
int ent_end = ent_bgn;
|
||||||
|
int b = Byte_ascii.Null;
|
||||||
|
for (int i = ent_bgn; i < src_end; i++) {
|
||||||
|
b = src[i] & 0xFF; // PATCH.JAVA:need to convert to unsigned byte
|
||||||
|
if (regex[b])
|
||||||
|
ent_end++;
|
||||||
|
else
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// mark dirty; can optimize later by checking if "<" already exists
|
||||||
|
dirty = true;
|
||||||
|
if (bfr == null) bfr = Bry_bfr_.New();
|
||||||
|
bfr.Add_mid(src, cur, find_bgn); // add everything before &
|
||||||
|
|
||||||
|
// invalid <- regex ended, but not at semic
|
||||||
|
if (b != Byte_ascii.Semic) {
|
||||||
|
cbk.When_amp(bfr);
|
||||||
|
cur = find_bgn + 1; // position after "&"
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// do normalization
|
||||||
|
byte[] name = Bry_.Mid(src, ent_bgn, ent_end);
|
||||||
|
boolean ret = false;
|
||||||
|
if (regex == Normalize__ent) {
|
||||||
|
cbk.When_ent(bfr, name);
|
||||||
|
ret = true;
|
||||||
|
}
|
||||||
|
else if (regex == Normalize__dec) {
|
||||||
|
ret = cbk.When_dec(bfr, name);
|
||||||
|
}
|
||||||
|
else if (regex == Normalize__hex) {
|
||||||
|
ret = cbk.When_hex(bfr, name);
|
||||||
|
}
|
||||||
|
if (!ret) {
|
||||||
|
cbk.When_amp(bfr);
|
||||||
|
cur = find_bgn + 1; // position after "&"
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
cur = ent_end + 1; // +1 to position after ";"
|
||||||
|
}
|
||||||
|
|
||||||
|
// XO.BRY_BFR
|
||||||
|
if (dirty) {
|
||||||
|
if (called_by_bry)
|
||||||
|
return bfr.To_bry_and_clear();
|
||||||
|
else
|
||||||
|
return Bry_.Empty;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
if (called_by_bry) {
|
||||||
|
if (src_bgn == 0 && src_end == src.length)
|
||||||
|
return src;
|
||||||
|
else
|
||||||
|
return Bry_.Mid(src, src_bgn, src_end);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
if (lone_bfr)
|
||||||
|
bfr.Add_mid(src, src_bgn, src_end);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
private static boolean[] Normalize__dec, Normalize__hex, Normalize__ent;
|
||||||
|
}
|
||||||
|
@ -19,19 +19,94 @@ package gplx.xowa.mws; import gplx.*; import gplx.xowa.*;
|
|||||||
import org.junit.*; import gplx.core.tests.*; import gplx.core.btries.*;
|
import org.junit.*; import gplx.core.tests.*; import gplx.core.btries.*;
|
||||||
public class Xomw_sanitizer__tst {
|
public class Xomw_sanitizer__tst {
|
||||||
private final Xomw_sanitizer__fxt fxt = new Xomw_sanitizer__fxt();
|
private final Xomw_sanitizer__fxt fxt = new Xomw_sanitizer__fxt();
|
||||||
@Test public void Text() {fxt.Test__normalize_char_references("abc" , "abc");}
|
@Test public void Normalize__text() {fxt.Test__normalize_char_references("abc" , "abc");}
|
||||||
@Test public void Dec() {fxt.Test__normalize_char_references("" , "&#08;");}
|
@Test public void Normalize__dec() {fxt.Test__normalize_char_references("" , "&#08;");}
|
||||||
@Test public void Dec__invalid() {fxt.Test__normalize_char_references("	" , "	");}
|
@Test public void Normalize__dec__invalid() {fxt.Test__normalize_char_references("	" , "	");}
|
||||||
@Test public void Hex() {fxt.Test__normalize_char_references("ÿ" , "ÿ");}
|
@Test public void Normalize__hex() {fxt.Test__normalize_char_references("ÿ" , "ÿ");}
|
||||||
@Test public void Entity() {fxt.Test__normalize_char_references("α" , "α");}
|
@Test public void Normalize__entity() {fxt.Test__normalize_char_references("α" , "α");}
|
||||||
@Test public void Entity__lt() {fxt.Test__normalize_char_references("<" , "<");}
|
@Test public void Normalize__entity__lt() {fxt.Test__normalize_char_references("<" , "<");}
|
||||||
@Test public void Invalid() {fxt.Test__normalize_char_references("&(invalid);" , "&(invalid);");}
|
@Test public void Normalize__entity__alias() {fxt.Test__normalize_char_references("&רלמ;" , "‏");}
|
||||||
@Test public void Many() {
|
@Test public void Normalize__amp() {fxt.Test__normalize_char_references("a&b" , "a&b");}
|
||||||
|
@Test public void Normalize__invalid() {fxt.Test__normalize_char_references("&(invalid);" , "&(invalid);");}
|
||||||
|
@Test public void Normalize__many() {
|
||||||
fxt.Test__normalize_char_references
|
fxt.Test__normalize_char_references
|
||||||
( "a 	 b α c ÿ d &(invalid); e"
|
( "a 	 b α c ÿ d &(invalid); e"
|
||||||
, "a 	 b α c ÿ d &(invalid); e"
|
, "a 	 b α c ÿ d &(invalid); e"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
@Test public void Regex__domain() {
|
||||||
|
Xomw_regex_find_domain regex_domain = new Xomw_regex_find_domain();
|
||||||
|
// normal
|
||||||
|
fxt.Test__regex_domain_y(regex_domain, "https://a.org/bcd", "https:", "//a.org", "/bcd");
|
||||||
|
// trailing backslash
|
||||||
|
fxt.Test__regex_domain_y(regex_domain, "https://a.org/", "https:", "//a.org", "/");
|
||||||
|
// domain only
|
||||||
|
fxt.Test__regex_domain_y(regex_domain, "https://a.org", "https:", "//a.org", "");
|
||||||
|
// colon not found
|
||||||
|
fxt.Test__regex_domain_n(regex_domain, "https//a.org/bcd");
|
||||||
|
// host_bgn.eos
|
||||||
|
fxt.Test__regex_domain_n(regex_domain, "https:");
|
||||||
|
// host_bgn.//
|
||||||
|
fxt.Test__regex_domain_n(regex_domain, "https:a//");
|
||||||
|
// host_bgn.///
|
||||||
|
fxt.Test__regex_domain_n(regex_domain, "https:///a.org/b");
|
||||||
|
}
|
||||||
|
@Test public void Regex__clean_url() {
|
||||||
|
Xomw_regex_escape_invalid regex = new Xomw_regex_escape_invalid();
|
||||||
|
// noop
|
||||||
|
fxt.Test__regex_escape_invalid(regex, "https://a.org/bcd", Bool_.N, "");
|
||||||
|
// symbols
|
||||||
|
fxt.Test__regex_escape_invalid(regex, "[]<>\"|", Bool_.Y, "%5B%5D%3C%3E%22%7C%7F");
|
||||||
|
// range: 00 - 32
|
||||||
|
fxt.Test__regex_escape_invalid(regex, "\t\n ", Bool_.Y, "%09%0A+");
|
||||||
|
}
|
||||||
|
@Test public void Regex__ipv6_brack() {
|
||||||
|
Xomw_regex_ipv6_brack regex = new Xomw_regex_ipv6_brack();
|
||||||
|
// basic
|
||||||
|
fxt.Test__regex_ipv6_brack(regex, Bool_.Y, "//%5B0a.1b:12%5D:123");
|
||||||
|
// port: none
|
||||||
|
fxt.Test__regex_ipv6_brack(regex, Bool_.Y, "//%5Ba%5D");
|
||||||
|
// port: multiple
|
||||||
|
fxt.Test__regex_ipv6_brack(regex, Bool_.Y, "//%5Ba%5D:1:2:3");
|
||||||
|
// "//%5B" missing
|
||||||
|
fxt.Test__regex_ipv6_brack(regex, Bool_.N, "abc");
|
||||||
|
// ipv6: invalid
|
||||||
|
fxt.Test__regex_ipv6_brack(regex, Bool_.N, "//%5Ba!%5D:1");
|
||||||
|
// ipv6: 0-len
|
||||||
|
fxt.Test__regex_ipv6_brack(regex, Bool_.N, "//%5B%5D:1");
|
||||||
|
// port: invalid
|
||||||
|
fxt.Test__regex_ipv6_brack(regex, Bool_.N, "//%5Ba%5D:a");
|
||||||
|
// port: 0-len
|
||||||
|
fxt.Test__regex_ipv6_brack(regex, Bool_.N, "//%5Ba%5D:");
|
||||||
|
}
|
||||||
|
@Test public void Decode() {
|
||||||
|
// dec
|
||||||
|
fxt.Test__decode_char_references("!" , "!");
|
||||||
|
// hex
|
||||||
|
fxt.Test__decode_char_references("#" , "#");
|
||||||
|
// entity
|
||||||
|
fxt.Test__decode_char_references("α" , "α");
|
||||||
|
// entity:lt
|
||||||
|
fxt.Test__decode_char_references("<" , "<");
|
||||||
|
// entity:rlm
|
||||||
|
fxt.Test__decode_char_references("&רלמ;" , "");
|
||||||
|
// entity:invalid
|
||||||
|
fxt.Test__decode_char_references("&invalid;" , "&invalid;");
|
||||||
|
// amp
|
||||||
|
fxt.Test__decode_char_references("a&b" , "a&b");
|
||||||
|
}
|
||||||
|
@Test public void Clean_url() {
|
||||||
|
// entity
|
||||||
|
fxt.Test__clean_url("http://a.org/b&c" , "http://a.org/b&c");
|
||||||
|
// entity: escape
|
||||||
|
fxt.Test__clean_url("http://a.org/b"c" , "http://a.org/b%22c");
|
||||||
|
// domain=n; make sure " is changed, but not soft-hyphen
|
||||||
|
fxt.Test__clean_url("a"z" , "a%22z");
|
||||||
|
// host: invalid idn
|
||||||
|
fxt.Test__clean_url("http://a᠆b.org/c᠆d" , "http://ab.org/c᠆d");
|
||||||
|
// ipv6_brack
|
||||||
|
fxt.Test__clean_url("http://[0a.1b:12]:123/cd" , "http://[0a.1b:12]:123/cd");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
class Xomw_sanitizer__fxt {
|
class Xomw_sanitizer__fxt {
|
||||||
private final Xomw_sanitizer sanitizer = new Xomw_sanitizer();
|
private final Xomw_sanitizer sanitizer = new Xomw_sanitizer();
|
||||||
@ -41,4 +116,33 @@ class Xomw_sanitizer__fxt {
|
|||||||
sanitizer.Normalize_char_references(tmp, Bool_.Y, src_bry, 0, src_bry.length);
|
sanitizer.Normalize_char_references(tmp, Bool_.Y, src_bry, 0, src_bry.length);
|
||||||
Gftest.Eq__str(expd, tmp.To_str_and_clear());
|
Gftest.Eq__str(expd, tmp.To_str_and_clear());
|
||||||
}
|
}
|
||||||
|
public void Test__regex_domain_y(Xomw_regex_find_domain regex_domain, String src_str, String expd_prot, String expd_host, String expd_rest) {
|
||||||
|
byte[] src_bry = Bry_.new_u8(src_str);
|
||||||
|
Gftest.Eq__bool(true, regex_domain.Match(src_bry, 0, src_bry.length), src_str);
|
||||||
|
Gftest.Eq__str(expd_prot, Bry_.Mid(src_bry, regex_domain.prot_bgn, regex_domain.prot_end));
|
||||||
|
Gftest.Eq__str(expd_host, Bry_.Mid(src_bry, regex_domain.host_bgn, regex_domain.host_end));
|
||||||
|
Gftest.Eq__str(expd_rest, Bry_.Mid(src_bry, regex_domain.rest_bgn, regex_domain.rest_end));
|
||||||
|
}
|
||||||
|
public void Test__regex_domain_n(Xomw_regex_find_domain regex_domain, String src_str) {
|
||||||
|
byte[] src_bry = Bry_.new_u8(src_str);
|
||||||
|
Gftest.Eq__bool(false, regex_domain.Match(src_bry, 0, src_bry.length), src_str);
|
||||||
|
}
|
||||||
|
public void Test__regex_escape_invalid(Xomw_regex_escape_invalid regex, String src_str, boolean expd_rslt, String expd_str) {
|
||||||
|
byte[] src_bry = Bry_.new_u8(src_str);
|
||||||
|
Gftest.Eq__bool(expd_rslt, regex.Escape(tmp, src_bry, 0, src_bry.length));
|
||||||
|
Gftest.Eq__str(expd_str, tmp.To_bry_and_clear());
|
||||||
|
}
|
||||||
|
public void Test__regex_ipv6_brack(Xomw_regex_ipv6_brack regex, boolean expd_rslt, String src_str) {
|
||||||
|
byte[] src_bry = Bry_.new_u8(src_str);
|
||||||
|
Gftest.Eq__bool(expd_rslt, regex.Match(src_bry, 0, src_bry.length));
|
||||||
|
}
|
||||||
|
public void Test__decode_char_references(String src_str, String expd) {
|
||||||
|
byte[] src_bry = Bry_.new_u8(src_str);
|
||||||
|
sanitizer.Decode_char_references(tmp, Bool_.Y, src_bry, 0, src_bry.length);
|
||||||
|
Gftest.Eq__str(expd, tmp.To_str_and_clear());
|
||||||
|
}
|
||||||
|
public void Test__clean_url(String src_str, String expd) {
|
||||||
|
byte[] src_bry = Bry_.new_u8(src_str);
|
||||||
|
Gftest.Eq__str(expd, sanitizer.Clean_url(src_bry));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -29,7 +29,7 @@ public class Xomw_parser {
|
|||||||
private final Xomw_nbsp_wkr nbsp_wkr = new Xomw_nbsp_wkr();
|
private final Xomw_nbsp_wkr nbsp_wkr = new Xomw_nbsp_wkr();
|
||||||
private final Xomw_block_level_pass block_wkr = new Xomw_block_level_pass();
|
private final Xomw_block_level_pass block_wkr = new Xomw_block_level_pass();
|
||||||
private final Xomw_heading_wkr heading_wkr = new Xomw_heading_wkr();
|
private final Xomw_heading_wkr heading_wkr = new Xomw_heading_wkr();
|
||||||
private final Xomw_magiclinks_wkr magiclinks_wkr = new Xomw_magiclinks_wkr();
|
private final Xomw_magiclinks_wkr magiclinks_wkr;
|
||||||
private final Xomw_doubleunder_wkr doubleunder_wkr = new Xomw_doubleunder_wkr();
|
private final Xomw_doubleunder_wkr doubleunder_wkr = new Xomw_doubleunder_wkr();
|
||||||
private final Xomw_link_renderer link_renderer = new Xomw_link_renderer();
|
private final Xomw_link_renderer link_renderer = new Xomw_link_renderer();
|
||||||
private final Xomw_link_holders holders;
|
private final Xomw_link_holders holders;
|
||||||
@ -50,13 +50,6 @@ public class Xomw_parser {
|
|||||||
public Xomw_lnki_wkr Lnki_wkr() {return lnki_wkr;} private final Xomw_lnki_wkr lnki_wkr;
|
public Xomw_lnki_wkr Lnki_wkr() {return lnki_wkr;} private final Xomw_lnki_wkr lnki_wkr;
|
||||||
public boolean Output_type__wiki() {return output_type__wiki;} private final boolean output_type__wiki = false;
|
public boolean Output_type__wiki() {return output_type__wiki;} private final boolean output_type__wiki = false;
|
||||||
public Xomw_parser() {
|
public Xomw_parser() {
|
||||||
this.protocols_trie = Xomw_parser.Protocols__dflt();
|
|
||||||
this.holders = new Xomw_link_holders(link_renderer, tmp);
|
|
||||||
this.table_wkr = new Xomw_table_wkr(this);
|
|
||||||
this.quote_wkr = new Xomw_quote_wkr(this);
|
|
||||||
this.lnke_wkr = new Xomw_lnke_wkr(this);
|
|
||||||
this.lnki_wkr = new Xomw_lnki_wkr(this, holders, link_renderer, protocols_trie);
|
|
||||||
this.heading_wkr_cbk = new Xomw_heading_cbk__html();
|
|
||||||
if (regex_space == null) {
|
if (regex_space == null) {
|
||||||
synchronized (Type_adp_.ClassOf_obj(this)) {
|
synchronized (Type_adp_.ClassOf_obj(this)) {
|
||||||
regex_space = new Xomw_regex_space();
|
regex_space = new Xomw_regex_space();
|
||||||
@ -64,13 +57,22 @@ public class Xomw_parser {
|
|||||||
regex_url = new Xomw_regex_url(regex_space);
|
regex_url = new Xomw_regex_url(regex_space);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
this.protocols_trie = Xomw_parser.Protocols__dflt();
|
||||||
|
this.holders = new Xomw_link_holders(link_renderer, tmp);
|
||||||
|
this.table_wkr = new Xomw_table_wkr(this);
|
||||||
|
this.quote_wkr = new Xomw_quote_wkr(this);
|
||||||
|
this.lnke_wkr = new Xomw_lnke_wkr(this);
|
||||||
|
this.lnki_wkr = new Xomw_lnki_wkr(this, holders, link_renderer, protocols_trie);
|
||||||
|
this.heading_wkr_cbk = new Xomw_heading_cbk__html();
|
||||||
|
this.magiclinks_wkr = new Xomw_magiclinks_wkr(sanitizer, linker, regex_boundary, regex_url);
|
||||||
}
|
}
|
||||||
public void Init_by_wiki(Xowe_wiki wiki) {
|
public void Init_by_wiki(Xowe_wiki wiki) {
|
||||||
linker.Init_by_wiki(wiki.Lang().Lnki_trail_mgr().Trie());
|
linker.Init_by_wiki(wiki.Lang().Lnki_trail_mgr().Trie());
|
||||||
lnke_wkr.Init_by_wiki(protocols_trie, regex_url, regex_space);
|
lnke_wkr.Init_by_wiki(protocols_trie, regex_url, regex_space);
|
||||||
lnki_wkr.Init_by_wiki(wiki);
|
lnki_wkr.Init_by_wiki(wiki);
|
||||||
magiclinks_wkr.Init_by_wiki(linker, regex_boundary, regex_url);
|
|
||||||
doubleunder_wkr.Init_by_wiki(doubleunder_data, wiki.Lang());
|
doubleunder_wkr.Init_by_wiki(doubleunder_data, wiki.Lang());
|
||||||
|
magiclinks_wkr.Init_by_wiki();
|
||||||
}
|
}
|
||||||
public void Init_by_page(Xoa_ttl ttl) {
|
public void Init_by_page(Xoa_ttl ttl) {
|
||||||
pctx.Init_by_page(ttl);
|
pctx.Init_by_page(ttl);
|
||||||
@ -115,7 +117,7 @@ public class Xomw_parser {
|
|||||||
table_wkr.Do_table_stuff(pctx, pbfr);
|
table_wkr.Do_table_stuff(pctx, pbfr);
|
||||||
hr_wkr.Replace_hrs(pctx, pbfr);
|
hr_wkr.Replace_hrs(pctx, pbfr);
|
||||||
|
|
||||||
doubleunder_wkr.Do_double_underscore(pctx, pbfr);
|
doubleunder_wkr.Do_double_underscore(pctx, pbfr); // DONE: DATE:2017-01-27
|
||||||
|
|
||||||
heading_wkr.Do_headings(pctx, pbfr, heading_wkr_cbk);
|
heading_wkr.Do_headings(pctx, pbfr, heading_wkr_cbk);
|
||||||
lnki_wkr.Replace_internal_links(pctx, pbfr);
|
lnki_wkr.Replace_internal_links(pctx, pbfr);
|
||||||
|
101
400_xowa/src/gplx/xowa/mws/parsers/Xomw_regex_parser.java
Normal file
101
400_xowa/src/gplx/xowa/mws/parsers/Xomw_regex_parser.java
Normal file
@ -0,0 +1,101 @@
|
|||||||
|
/*
|
||||||
|
XOWA: the XOWA Offline Wiki Application
|
||||||
|
Copyright (C) 2012 gnosygnu@gmail.com
|
||||||
|
|
||||||
|
This program is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU Affero General Public License as
|
||||||
|
published by the Free Software Foundation, either version 3 of the
|
||||||
|
License, or (at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU Affero General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Affero General Public License
|
||||||
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
|
||||||
|
public class Xomw_regex_parser {
|
||||||
|
private Bry_bfr tmp;
|
||||||
|
public byte[][] Rslt() {return rslt;} private byte[][] rslt;
|
||||||
|
public Xomw_regex_parser Add_ary(String... ary) {return Set_or_add(Parse_ary(ary));}
|
||||||
|
private byte[][] Parse_ary(String... ary) {
|
||||||
|
if (tmp == null) tmp = Bry_bfr_.New();
|
||||||
|
int ary_len = ary.length;
|
||||||
|
byte[][] rv = new byte[ary_len][];
|
||||||
|
for (int i = 0; i < ary_len; i++) {
|
||||||
|
rv[i] = Compile_itm(tmp, Bry_.new_u8(ary[i]));
|
||||||
|
}
|
||||||
|
return rv;
|
||||||
|
}
|
||||||
|
public Xomw_regex_parser Add_rng(String bgn, String end) {return Set_or_add(Parse_rng(bgn, end));}
|
||||||
|
private byte[][] Parse_rng(String bgn, String end) {
|
||||||
|
if (tmp == null) tmp = Bry_bfr_.New();
|
||||||
|
byte[] bgn_bry = Compile_itm(tmp, Bry_.new_u8(bgn));
|
||||||
|
int bgn_val = gplx.core.intls.Utf16_.Decode_to_int(bgn_bry, 0);
|
||||||
|
byte[] end_bry = Compile_itm(tmp, Bry_.new_u8(end));
|
||||||
|
int end_val = gplx.core.intls.Utf16_.Decode_to_int(end_bry, 0);
|
||||||
|
|
||||||
|
int rv_len = end_val - bgn_val + 1;
|
||||||
|
byte[][] rv = new byte[rv_len][];
|
||||||
|
for (int i = 0; i < rv_len; i++) {
|
||||||
|
rv[i] = gplx.core.intls.Utf16_.Encode_int_to_bry(i + bgn_val);
|
||||||
|
}
|
||||||
|
return rv;
|
||||||
|
}
|
||||||
|
private Xomw_regex_parser Set_or_add(byte[][] val) {
|
||||||
|
rslt = rslt == null ? val : Bry_.Ary_add(rslt, val);
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
private static byte[] Compile_itm(Bry_bfr tmp, byte[] src) {
|
||||||
|
// parse each itm
|
||||||
|
int src_end = src.length;
|
||||||
|
int cur = 0;
|
||||||
|
int prv = cur;
|
||||||
|
boolean dirty = false;
|
||||||
|
while (true) {
|
||||||
|
// eos
|
||||||
|
if (cur == src_end) {
|
||||||
|
if (dirty)
|
||||||
|
tmp.Add_mid(src, prv, src_end);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// look at byte
|
||||||
|
byte b = src[cur];
|
||||||
|
switch (b) { // escape
|
||||||
|
case Byte_ascii.Backslash:
|
||||||
|
int nxt = cur + 1;
|
||||||
|
if (nxt >= src_end) throw Err_.new_wo_type("regex escape failed: no more chars left", "src", src, "pos", nxt);
|
||||||
|
byte nxt_byte = src[nxt];
|
||||||
|
switch (nxt_byte) {
|
||||||
|
case Byte_ascii.Ltr_s: // \s -> " "
|
||||||
|
src = Byte_ascii.Space_bry;
|
||||||
|
cur = src_end;
|
||||||
|
break;
|
||||||
|
case Byte_ascii.Ltr_x: // \ u -> utf8 sequence in hex-dec; EX: "\xc2\xad" -> new byte[] {194, 160}
|
||||||
|
// read next two bytes
|
||||||
|
dirty = true;
|
||||||
|
nxt++;
|
||||||
|
if (nxt + 2 > src_end) throw Err_.new_wo_type("utf8 escape failed: no more chars left", "src", src, "pos", nxt);
|
||||||
|
tmp.Add_byte((byte)gplx.core.encoders.Hex_utl_.Parse_or(src, nxt, nxt + 2, -1));
|
||||||
|
cur = nxt + 2;
|
||||||
|
prv = cur;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
throw Err_.new_wo_type("regex escape failed: unknown char", "src", src, "pos", nxt);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
default: // handles ascii only
|
||||||
|
if (b > 127)
|
||||||
|
throw Err_.new_wo_type("regex compiled failed: unknown char", "src", src, "pos", cur);
|
||||||
|
cur++;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// set item
|
||||||
|
return dirty ? tmp.To_bry_and_clear() : src;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,42 @@
|
|||||||
|
/*
|
||||||
|
XOWA: the XOWA Offline Wiki Application
|
||||||
|
Copyright (C) 2012 gnosygnu@gmail.com
|
||||||
|
|
||||||
|
This program is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU Affero General Public License as
|
||||||
|
published by the Free Software Foundation, either version 3 of the
|
||||||
|
License, or (at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU Affero General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Affero General Public License
|
||||||
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
|
||||||
|
import org.junit.*; import gplx.core.tests.*;
|
||||||
|
public class Xomw_regex_parser__tst {
|
||||||
|
private final Xomw_regex_parser__fxt fxt = new Xomw_regex_parser__fxt();
|
||||||
|
@Test public void Ary__space() {
|
||||||
|
fxt.Test__parse_ary(String_.Ary("\\s"), String_.Ary(" "));
|
||||||
|
}
|
||||||
|
@Test public void Ary__utf8() {
|
||||||
|
fxt.Test__parse_ary(String_.Ary("\\xc2\\xa7", "\\xe0\\xb9\\x90"), String_.Ary("§", "๐"));
|
||||||
|
}
|
||||||
|
@Test public void Rng__ascii() {
|
||||||
|
fxt.Test__parse_rng("a", "c", String_.Ary("a", "b", "c"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
class Xomw_regex_parser__fxt {
|
||||||
|
private final Xomw_regex_parser parser = new Xomw_regex_parser();
|
||||||
|
public void Test__parse_ary(String[] ary, String[] expd) {
|
||||||
|
parser.Add_ary(ary);
|
||||||
|
Gftest.Eq__ary(expd, String_.Ary(parser.Rslt()));
|
||||||
|
}
|
||||||
|
public void Test__parse_rng(String bgn, String end, String[] expd) {
|
||||||
|
parser.Add_rng("a", "c");
|
||||||
|
Gftest.Eq__ary(expd, String_.Ary(parser.Rslt()));
|
||||||
|
}
|
||||||
|
}
|
@ -20,6 +20,7 @@ import gplx.core.btries.*;
|
|||||||
public class Xomw_regex_url {
|
public class Xomw_regex_url {
|
||||||
private final Btrie_slim_mgr trie;
|
private final Btrie_slim_mgr trie;
|
||||||
public Xomw_regex_url(Xomw_regex_space regex_space) {
|
public Xomw_regex_url(Xomw_regex_space regex_space) {
|
||||||
|
// [^][<>"\\x00-\\x20\\x7F\|]
|
||||||
// REGEX:[^][<>"\\x00-\\x20\\x7F\p{Zs}]; NOTE: val is just a marker
|
// REGEX:[^][<>"\\x00-\\x20\\x7F\p{Zs}]; NOTE: val is just a marker
|
||||||
this.trie = Btrie_slim_mgr.cs();
|
this.trie = Btrie_slim_mgr.cs();
|
||||||
trie.Add_str_byte__many(Byte_.Zero, "[", "]", "<", ">", "\"");
|
trie.Add_str_byte__many(Byte_.Zero, "[", "]", "<", ">", "\"");
|
||||||
|
@ -19,28 +19,40 @@ package gplx.xowa.mws.parsers.magiclinks; import gplx.*; import gplx.xowa.*; imp
|
|||||||
import gplx.core.primitives.*; import gplx.core.btries.*; import gplx.core.net.*;
|
import gplx.core.primitives.*; import gplx.core.btries.*; import gplx.core.net.*;
|
||||||
import gplx.langs.phps.utls.*; import gplx.xowa.mws.htmls.*;
|
import gplx.langs.phps.utls.*; import gplx.xowa.mws.htmls.*;
|
||||||
import gplx.langs.regxs.*;
|
import gplx.langs.regxs.*;
|
||||||
|
// TODO.XO: getExternalLinkAttribs($url)
|
||||||
|
// TODO.XO: this->getConverterLanguage()->markNoConversion($url, true),
|
||||||
public class Xomw_magiclinks_wkr {
|
public class Xomw_magiclinks_wkr {
|
||||||
private final Btrie_slim_mgr regex_trie = Btrie_slim_mgr.ci_a7(); // NOTE: must be ci to handle protocols; EX: "https:" and "HTTPS:"
|
private final Btrie_slim_mgr regex_trie = Btrie_slim_mgr.ci_a7(); // NOTE: must be ci to handle protocols; EX: "https:" and "HTTPS:"
|
||||||
private final Btrie_rv trv = new Btrie_rv();
|
private final Btrie_rv trv = new Btrie_rv();
|
||||||
private static byte[] Tag__anch__rhs;
|
private static byte[] Tag__anch__rhs;
|
||||||
private boolean[] url_separators;
|
private boolean[] url_separators;
|
||||||
private static Xomw_regex_link_interrupt regex_link_interrupt;
|
private static Xomw_regex_link_interrupt regex_link_interrupt;
|
||||||
private Xomw_regex_boundary regex_boundary;
|
private final Xomw_regex_boundary regex_boundary;
|
||||||
private Xomw_regex_url regex_url;
|
private final Xomw_regex_url regex_url;
|
||||||
private Xomw_linker linker;
|
private final Xomw_sanitizer sanitizer;
|
||||||
|
private final Xomw_linker linker;
|
||||||
private byte[] page_title;
|
private byte[] page_title;
|
||||||
|
|
||||||
private static final byte Regex__anch = 1, Regex__elem = 2, Regex__free = 3;
|
private static final byte Regex__anch = 1, Regex__elem = 2, Regex__free = 3;
|
||||||
public Xomw_magiclinks_wkr() {
|
public Xomw_magiclinks_wkr(Xomw_sanitizer sanitizer, Xomw_linker linker, Xomw_regex_boundary regex_boundary, Xomw_regex_url regex_url) {
|
||||||
|
this.sanitizer = sanitizer;
|
||||||
|
this.linker = linker;
|
||||||
|
this.regex_boundary = regex_boundary;
|
||||||
|
this.regex_url = regex_url;
|
||||||
|
|
||||||
// ',;\.:!?'
|
// ',;\.:!?'
|
||||||
url_separators = Bool_ary_bldr.New_u8()
|
url_separators = Bool_ary_bldr.New_u8()
|
||||||
.Set_many(Byte_ascii.Comma,Byte_ascii.Semic, Byte_ascii.Dot, Byte_ascii.Colon, Byte_ascii.Bang, Byte_ascii.Question)
|
.Set_many(Byte_ascii.Comma,Byte_ascii.Semic, Byte_ascii.Dot, Byte_ascii.Colon, Byte_ascii.Bang, Byte_ascii.Question)
|
||||||
.To_ary();
|
.To_ary();
|
||||||
|
|
||||||
|
if (Tag__anch__rhs == null) {
|
||||||
|
synchronized (Type_adp_.ClassOf_obj(this)) {
|
||||||
|
Tag__anch__rhs = Bry_.new_a7("</a>");
|
||||||
|
regex_link_interrupt = new Xomw_regex_link_interrupt();
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
public void Init_by_wiki(Xomw_linker linker, Xomw_regex_boundary regex_boundary, Xomw_regex_url regex_url) {
|
public void Init_by_wiki() {
|
||||||
this.linker = linker;
|
|
||||||
this.regex_boundary = regex_boundary;
|
|
||||||
this.regex_url = regex_url;
|
|
||||||
regex_trie.Add_str_byte("<a", Regex__anch);
|
regex_trie.Add_str_byte("<a", Regex__anch);
|
||||||
regex_trie.Add_str_byte("<" , Regex__elem);
|
regex_trie.Add_str_byte("<" , Regex__elem);
|
||||||
|
|
||||||
@ -50,13 +62,6 @@ public class Xomw_magiclinks_wkr {
|
|||||||
Gfo_protocol_itm itm = protocol_ary[i];
|
Gfo_protocol_itm itm = protocol_ary[i];
|
||||||
regex_trie.Add_bry_byte(itm.Text_bry(), Regex__free);
|
regex_trie.Add_bry_byte(itm.Text_bry(), Regex__free);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (Tag__anch__rhs == null) {
|
|
||||||
synchronized (Type_adp_.ClassOf_obj(this)) {
|
|
||||||
Tag__anch__rhs = Bry_.new_a7("</a>");
|
|
||||||
regex_link_interrupt = new Xomw_regex_link_interrupt();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Replace special strings like "ISBN xxx" and "RFC xxx" with
|
// Replace special strings like "ISBN xxx" and "RFC xxx" with
|
||||||
@ -247,7 +252,7 @@ public class Xomw_magiclinks_wkr {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// $url = Sanitizer::cleanUrl($url);
|
url = sanitizer.Clean_url(url);
|
||||||
|
|
||||||
// XO.MW.UNSUPPORTED.NON-WMF: not supporting images from freeform url; (EX: "http://a.org/image.png" -> "<img>"); haven't seen this used on WMF wikis
|
// XO.MW.UNSUPPORTED.NON-WMF: not supporting images from freeform url; (EX: "http://a.org/image.png" -> "<img>"); haven't seen this used on WMF wikis
|
||||||
// Is this an external image?
|
// Is this an external image?
|
||||||
|
@ -34,10 +34,8 @@ public class Xomw_magiclinks_wkr__tst {
|
|||||||
fxt.Test__parse("a https://< z" , "a https://< z");
|
fxt.Test__parse("a https://< z" , "a https://< z");
|
||||||
}
|
}
|
||||||
@Test public void Interrupt__hex_dec() {// implementation specific test for mixed hex / dec
|
@Test public void Interrupt__hex_dec() {// implementation specific test for mixed hex / dec
|
||||||
// hex-dec
|
|
||||||
fxt.Test__parse("a https://b.org`z" , "a <a class='external free' rel='nofollow' href='https://b.org&#x60;z'>https://b.org&#x60;z</a>");
|
|
||||||
// dec-hex
|
// dec-hex
|
||||||
fxt.Test__parse("a https://b.orgc;z" , "a <a class='external free' rel='nofollow' href='https://b.org&#3c;z'>https://b.org&#3c;z</a>");
|
fxt.Test__parse("a https://b.orgc;z" , "a <a class='external free' rel='nofollow' href='https://b.org&#3c;z'>https://b.org&#3c;z</a>");
|
||||||
}
|
}
|
||||||
@Test public void Separator() {
|
@Test public void Separator() {
|
||||||
// basic; ,;.:!?
|
// basic; ,;.:!?
|
||||||
@ -50,10 +48,10 @@ public class Xomw_magiclinks_wkr__tst {
|
|||||||
fxt.Test__parse("a https://b.org;.:!? z" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>;.:!? z");
|
fxt.Test__parse("a https://b.org;.:!? z" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>;.:!? z");
|
||||||
// ";" included b/c of ent
|
// ";" included b/c of ent
|
||||||
fxt.Test__parse("a https://b.org&abc;.:!? z" , "a <a class='external free' rel='nofollow' href='https://b.org&abc;'>https://b.org&abc;</a>.:!? z");
|
fxt.Test__parse("a https://b.org&abc;.:!? z" , "a <a class='external free' rel='nofollow' href='https://b.org&abc;'>https://b.org&abc;</a>.:!? z");
|
||||||
// ";" included b/c of hex
|
// ";" included b/c of hex; note that Clean_url changes "&#xB1;" to "±"
|
||||||
fxt.Test__parse("a https://b.org±.:!? z", "a <a class='external free' rel='nofollow' href='https://b.org&#xB1;'>https://b.org&#xB1;</a>.:!? z");
|
fxt.Test__parse("a https://b.org±.:!? z", "a <a class='external free' rel='nofollow' href='https://b.org±'>https://b.org±</a>.:!? z");
|
||||||
// ";" included b/c of dec
|
// ";" included b/c of dec; note that Clean_url changes "&#123;" to "{"
|
||||||
fxt.Test__parse("a https://b.org{.:!? z", "a <a class='external free' rel='nofollow' href='https://b.org&#123;'>https://b.org&#123;</a>.:!? z");
|
fxt.Test__parse("a https://b.org{.:!? z", "a <a class='external free' rel='nofollow' href='https://b.org{'>https://b.org{</a>.:!? z");
|
||||||
// ";" excluded b/c of invalid.ent
|
// ";" excluded b/c of invalid.ent
|
||||||
fxt.Test__parse("a https://b.org&a1b;.:!? z" , "a <a class='external free' rel='nofollow' href='https://b.org&a1b'>https://b.org&a1b</a>;.:!? z");
|
fxt.Test__parse("a https://b.org&a1b;.:!? z" , "a <a class='external free' rel='nofollow' href='https://b.org&a1b'>https://b.org&a1b</a>;.:!? z");
|
||||||
// ";" excluded b/c of invalid.hex
|
// ";" excluded b/c of invalid.hex
|
||||||
@ -63,9 +61,13 @@ public class Xomw_magiclinks_wkr__tst {
|
|||||||
// num_post_proto rule
|
// num_post_proto rule
|
||||||
fxt.Test__parse("a https://.:!? z" , "a https://.:!? z");
|
fxt.Test__parse("a https://.:!? z" , "a https://.:!? z");
|
||||||
}
|
}
|
||||||
|
@Test public void Clean_url() {
|
||||||
|
// basic
|
||||||
|
fxt.Test__parse("http://a᠆b.org/c᠆d" , "<a class='external free' rel='nofollow' href='http://ab.org/c᠆d'>http://ab.org/c᠆d</a>");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
class Xomw_magiclinks_wkr__fxt {
|
class Xomw_magiclinks_wkr__fxt {
|
||||||
private final Xomw_magiclinks_wkr wkr = new Xomw_magiclinks_wkr();
|
private final Xomw_magiclinks_wkr wkr;
|
||||||
private final Xomw_parser_ctx pctx = new Xomw_parser_ctx();
|
private final Xomw_parser_ctx pctx = new Xomw_parser_ctx();
|
||||||
private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr();
|
private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr();
|
||||||
public Xomw_magiclinks_wkr__fxt() {
|
public Xomw_magiclinks_wkr__fxt() {
|
||||||
@ -74,7 +76,8 @@ class Xomw_magiclinks_wkr__fxt {
|
|||||||
|
|
||||||
Xomw_regex_space regex_space = new Xomw_regex_space();
|
Xomw_regex_space regex_space = new Xomw_regex_space();
|
||||||
pctx.Init_by_page(wiki.Ttl_parse(Bry_.new_a7("Page_1")));
|
pctx.Init_by_page(wiki.Ttl_parse(Bry_.new_a7("Page_1")));
|
||||||
wkr.Init_by_wiki(new Xomw_linker(), new Xomw_regex_boundary(regex_space), new Xomw_regex_url(regex_space));
|
this.wkr = new Xomw_magiclinks_wkr(new Xomw_sanitizer(), new Xomw_linker(), new Xomw_regex_boundary(regex_space), new Xomw_regex_url(regex_space));
|
||||||
|
wkr.Init_by_wiki();
|
||||||
}
|
}
|
||||||
public void Test__parse(String src_str, String expd) {Test__parse(Bool_.Y, src_str, expd);}
|
public void Test__parse(String src_str, String expd) {Test__parse(Bool_.Y, src_str, expd);}
|
||||||
public void Test__parse(boolean apos, String src_str, String expd) {
|
public void Test__parse(boolean apos, String src_str, String expd) {
|
||||||
|
@ -60,13 +60,13 @@ public class Xomw_ttl_utl {
|
|||||||
if (cur == src_end) break;
|
if (cur == src_end) break;
|
||||||
byte b = src[cur];
|
byte b = src[cur];
|
||||||
int b_len = gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(b);
|
int b_len = gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(b);
|
||||||
if (b_len == 1) { // ASCII
|
if (b_len == 1) { // ASCII
|
||||||
if (valid[b]) // valid; EX: "a0A B&$"
|
if (valid[b & 0xFF]) // valid; EX: "a0A B&$"; PATCH.JAVA:need to convert to unsigned byte
|
||||||
cur++;
|
cur++;
|
||||||
else // invalid; EX: "<title>"
|
else // invalid; EX: "<title>"
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
else { // Multi-byte UTF8; NOTE: all sequences are valid
|
else { // Multi-byte UTF8; NOTE: all sequences are valid
|
||||||
cur += b_len;
|
cur += b_len;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user