mirror of
https://github.com/gnosygnu/xowa.git
synced 2024-10-27 20:34:16 +00:00
Mw_parse: Handle interrupt and separator logic for magiclinks
This commit is contained in:
parent
e231df0ce1
commit
7e27b5415d
39
400_xowa/src/gplx/core/primitives/Bool_ary_bldr.java
Normal file
39
400_xowa/src/gplx/core/primitives/Bool_ary_bldr.java
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
/*
|
||||||
|
XOWA: the XOWA Offline Wiki Application
|
||||||
|
Copyright (C) 2012 gnosygnu@gmail.com
|
||||||
|
|
||||||
|
This program is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU Affero General Public License as
|
||||||
|
published by the Free Software Foundation, either version 3 of the
|
||||||
|
License, or (at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU Affero General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Affero General Public License
|
||||||
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
package gplx.core.primitives; import gplx.*; import gplx.core.*;
|
||||||
|
/**
 * Fluent builder for a boolean lookup table indexed by int (typically a byte value 0-255).
 * Entries start as false; the Set_* methods flag individual indexes or ranges as true.
 */
public class Bool_ary_bldr {
	private final boolean[] ary;

	/** @param len number of entries in the table; valid indexes are 0 to len - 1 */
	public Bool_ary_bldr(int len) {
		this.ary = new boolean[len];
	}

	/**
	 * Flags each listed index as true.
	 * @param v indexes to flag; each must be within 0 to len - 1, else ArrayIndexOutOfBoundsException
	 * @return this builder, for chaining
	 */
	public Bool_ary_bldr Set_many(int... v) {
		for (int idx : v)
			ary[idx] = true;
		return this;
	}

	/**
	 * Flags every index in a range as true.
	 * @param bgn first index to flag
	 * @param end last index to flag; NOTE: inclusive; if end is less than bgn, nothing is flagged
	 * @return this builder, for chaining
	 */
	public Bool_ary_bldr Set_rng(int bgn, int end) {
		for (int i = bgn; i <= end; i++)
			ary[i] = true;
		return this;
	}

	/**
	 * Returns the table built so far.
	 * NOTE: returns a snapshot, not the internal array; callers may mutate the result (or keep using the builder)
	 * without the two affecting each other
	 */
	public boolean[] To_ary() {
		return ary.clone();
	}

	/** Creates a builder with 256 entries: one per unsigned byte value. */
	public static Bool_ary_bldr New_u8() {return new Bool_ary_bldr(256);}
}
|
@ -39,7 +39,7 @@ public class Php_str_ {
|
|||||||
int end = len < 0 ? src_len + len : bgn + len;
|
int end = len < 0 ? src_len + len : bgn + len;
|
||||||
if (end > src.length) end = src.length;; // handle out of bounds;
|
if (end > src.length) end = src.length;; // handle out of bounds;
|
||||||
return src[bgn];
|
return src[bgn];
|
||||||
}
|
}
|
||||||
public static int Strspn_fwd__ary(byte[] src, boolean[] find, int bgn, int max, int src_len) {
|
public static int Strspn_fwd__ary(byte[] src, boolean[] find, int bgn, int max, int src_len) {
|
||||||
if (max == -1) max = src_len;
|
if (max == -1) max = src_len;
|
||||||
int rv = 0;
|
int rv = 0;
|
||||||
@ -90,6 +90,17 @@ public class Php_str_ {
|
|||||||
}
|
}
|
||||||
return rv;
|
return rv;
|
||||||
}
|
}
|
||||||
|
public static int Strspn_bwd__ary(byte[] src, boolean[] find, int bgn, int max) {
|
||||||
|
if (max == -1) max = Int_.Max_value;
|
||||||
|
int rv = 0;
|
||||||
|
for (int i = bgn - 1; i > -1; i--) {
|
||||||
|
if (find[src[i]] && rv < max)
|
||||||
|
rv++;
|
||||||
|
else
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return rv;
|
||||||
|
}
|
||||||
public static int Strspn_bwd__space_or_tab(byte[] src, int bgn, int max) {
|
public static int Strspn_bwd__space_or_tab(byte[] src, int bgn, int max) {
|
||||||
if (max == -1) max = Int_.Max_value;
|
if (max == -1) max = Int_.Max_value;
|
||||||
int rv = 0;
|
int rv = 0;
|
||||||
|
@ -16,7 +16,7 @@ You should have received a copy of the GNU Affero General Public License
|
|||||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
*/
|
*/
|
||||||
package gplx.xowa.mws; import gplx.*; import gplx.xowa.*;
|
package gplx.xowa.mws; import gplx.*; import gplx.xowa.*;
|
||||||
import gplx.core.encoders.*; import gplx.langs.htmls.entitys.*;
|
import gplx.core.encoders.*; import gplx.core.primitives.*; import gplx.langs.htmls.entitys.*;
|
||||||
import gplx.xowa.parsers.htmls.*;
|
import gplx.xowa.parsers.htmls.*;
|
||||||
import gplx.xowa.mws.parsers.*;
|
import gplx.xowa.mws.parsers.*;
|
||||||
public class Xomw_sanitizer {
|
public class Xomw_sanitizer {
|
||||||
@ -515,24 +515,3 @@ class Xomw_html_ent {
|
|||||||
public final byte[] html;
|
public final byte[] html;
|
||||||
public static final byte Type__null = 0, Type__alias = 1, Type__char = 2, Type__entity = 3;
|
public static final byte Type__null = 0, Type__alias = 1, Type__char = 2, Type__entity = 3;
|
||||||
}
|
}
|
||||||
class Bool_ary_bldr {
|
|
||||||
private final boolean[] ary;
|
|
||||||
public Bool_ary_bldr(int len) {
|
|
||||||
this.ary = new boolean[len];
|
|
||||||
}
|
|
||||||
public Bool_ary_bldr Set_many(int... v) {
|
|
||||||
int len = v.length;
|
|
||||||
for (int i = 0; i < len; i++)
|
|
||||||
ary[v[i]] = true;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
public Bool_ary_bldr Set_rng(int bgn, int end) {
|
|
||||||
for (int i = bgn; i <= end; i++)
|
|
||||||
ary[i] = true;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
public boolean[] To_ary() {
|
|
||||||
return ary;
|
|
||||||
}
|
|
||||||
public static Bool_ary_bldr New_u8() {return new Bool_ary_bldr(256);}
|
|
||||||
}
|
|
||||||
|
@ -20,7 +20,7 @@ import gplx.core.btries.*;
|
|||||||
public class Xomw_regex_ {
|
public class Xomw_regex_ {
|
||||||
public static int Find_fwd_while(Btrie_slim_mgr trie, Btrie_rv trv, byte[] src, int src_bgn, int src_end) {
|
public static int Find_fwd_while(Btrie_slim_mgr trie, Btrie_rv trv, byte[] src, int src_bgn, int src_end) {
|
||||||
int cur = src_bgn;
|
int cur = src_bgn;
|
||||||
while (true) {
|
while (cur < src_end) {
|
||||||
byte b = src[cur];
|
byte b = src[cur];
|
||||||
Object o = trie.Match_at_w_b0(trv, b, src, cur, src_end);
|
Object o = trie.Match_at_w_b0(trv, b, src, cur, src_end);
|
||||||
if (o == null)
|
if (o == null)
|
||||||
@ -32,7 +32,7 @@ public class Xomw_regex_ {
|
|||||||
}
|
}
|
||||||
public static int Find_fwd_until(Btrie_slim_mgr trie, Btrie_rv trv, byte[] src, int src_bgn, int src_end) {
|
public static int Find_fwd_until(Btrie_slim_mgr trie, Btrie_rv trv, byte[] src, int src_bgn, int src_end) {
|
||||||
int cur = src_bgn;
|
int cur = src_bgn;
|
||||||
while (true) {
|
while (cur < src_end) {
|
||||||
byte b = src[cur];
|
byte b = src[cur];
|
||||||
Object o = trie.Match_at_w_b0(trv, b, src, cur, src_end);
|
Object o = trie.Match_at_w_b0(trv, b, src, cur, src_end);
|
||||||
if (o == null)
|
if (o == null)
|
||||||
|
@ -23,12 +23,20 @@ public class Xomw_magiclinks_wkr {
|
|||||||
private final Btrie_slim_mgr regex_trie = Btrie_slim_mgr.ci_a7(); // NOTE: must be ci to handle protocols; EX: "https:" and "HTTPS:"
|
private final Btrie_slim_mgr regex_trie = Btrie_slim_mgr.ci_a7(); // NOTE: must be ci to handle protocols; EX: "https:" and "HTTPS:"
|
||||||
private final Btrie_rv trv = new Btrie_rv();
|
private final Btrie_rv trv = new Btrie_rv();
|
||||||
private static byte[] Tag__anch__rhs;
|
private static byte[] Tag__anch__rhs;
|
||||||
|
private boolean[] url_separators;
|
||||||
|
private static Xomw_regex_link_interrupt regex_link_interrupt;
|
||||||
private Xomw_regex_boundary regex_boundary;
|
private Xomw_regex_boundary regex_boundary;
|
||||||
private Xomw_regex_url regex_url;
|
private Xomw_regex_url regex_url;
|
||||||
private Xomw_linker linker;
|
private Xomw_linker linker;
|
||||||
private byte[] page_title;
|
private byte[] page_title;
|
||||||
|
|
||||||
private static final byte Regex__anch = 1, Regex__elem = 2, Regex__free = 3;
|
private static final byte Regex__anch = 1, Regex__elem = 2, Regex__free = 3;
|
||||||
|
// Builds the table of punctuation bytes that get moved out of a free link's url and into its trail
public Xomw_magiclinks_wkr() {
	// XO.MW: $sep = ',;\.:!?';
	Bool_ary_bldr separator_bldr = Bool_ary_bldr.New_u8();
	separator_bldr.Set_many
	( Byte_ascii.Comma, Byte_ascii.Semic, Byte_ascii.Dot
	, Byte_ascii.Colon, Byte_ascii.Bang, Byte_ascii.Question
	);
	this.url_separators = separator_bldr.To_ary();
}
|
||||||
public void Init_by_wiki(Xomw_linker linker, Xomw_regex_boundary regex_boundary, Xomw_regex_url regex_url) {
|
public void Init_by_wiki(Xomw_linker linker, Xomw_regex_boundary regex_boundary, Xomw_regex_url regex_url) {
|
||||||
this.linker = linker;
|
this.linker = linker;
|
||||||
this.regex_boundary = regex_boundary;
|
this.regex_boundary = regex_boundary;
|
||||||
@ -46,6 +54,7 @@ public class Xomw_magiclinks_wkr {
|
|||||||
if (Tag__anch__rhs == null) {
|
if (Tag__anch__rhs == null) {
|
||||||
synchronized (Type_adp_.ClassOf_obj(this)) {
|
synchronized (Type_adp_.ClassOf_obj(this)) {
|
||||||
Tag__anch__rhs = Bry_.new_a7("</a>");
|
Tag__anch__rhs = Bry_.new_a7("</a>");
|
||||||
|
regex_link_interrupt = new Xomw_regex_link_interrupt();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -64,19 +73,19 @@ public class Xomw_magiclinks_wkr {
|
|||||||
int prv = cur;
|
int prv = cur;
|
||||||
boolean dirty = true;
|
boolean dirty = true;
|
||||||
// PORTED.REGEX: handle below
|
// PORTED.REGEX: handle below
|
||||||
// NOTE: not handling RFC|PMID|ISBN b/c of upcoming obsolescence: https://www.mediawiki.org/wiki/Requests_for_comment/Future_of_magic_links
|
// XO.MW.UNSUPPORTED.OBSOLETE: not handling RFC|PMID|ISBN b/c of upcoming obsolescence: https://www.mediawiki.org/wiki/Requests_for_comment/Future_of_magic_links
|
||||||
//'!(?: // Start cases
|
//'!(?: // Start cases
|
||||||
// (<a[ \t\r\n>].*?</a>) | // m[1]: Skip link text
|
// (<a[ \t\r\n>].*?</a>) | // m[1]: Skip link text
|
||||||
// (<.*?>) | // m[2]: Skip stuff inside
|
// (<.*?>) | // m[2]: Skip stuff inside
|
||||||
// // HTML elements' . "
|
// // HTML elements' . "
|
||||||
// (\b(?i:$prots)($addr$urlChar*)) | // m[3]: Free external links
|
// (\b(?i:$prots)($addr$urlChar*)) | // m[3]: Free external links
|
||||||
// // m[4]: Post-protocol path
|
// // m[4]: Post-protocol path
|
||||||
// \b(?:RFC|PMID) $spaces // m[5]: RFC or PMID, capture number
|
// \b(?:RFC|PMID) $spaces // m[5]: RFC or PMID, capture number
|
||||||
// ([0-9]+)\b |
|
// ([0-9]+)\b |
|
||||||
// \bISBN $spaces ( // m[6]: ISBN, capture number
|
// \bISBN $spaces ( // m[6]: ISBN, capture number
|
||||||
// (?: 97[89] $spdash?)? // optional 13-digit ISBN prefix
|
// (?: 97[89] $spdash?)? // optional 13-digit ISBN prefix
|
||||||
// (?: [0-9] $spdash?){9} // 9 digits with opt. delimiters
|
// (?: [0-9] $spdash?){9} // 9 digits with opt. delimiters
|
||||||
// [0-9Xx] // check digit
|
// [0-9Xx] // check digit
|
||||||
// )\b
|
// )\b
|
||||||
while (true) {
|
while (true) {
|
||||||
if (cur == src_end) {
|
if (cur == src_end) {
|
||||||
@ -173,50 +182,44 @@ public class Xomw_magiclinks_wkr {
|
|||||||
|
|
||||||
// Make a free external link, given a user-supplied URL
|
// Make a free external link, given a user-supplied URL
|
||||||
public void Make_free_external_link(Bry_bfr bfr, byte[] url, int num_post_proto) {
|
public void Make_free_external_link(Bry_bfr bfr, byte[] url, int num_post_proto) {
|
||||||
// byte[] trail = Bry_.Empty;
|
byte[] trail = Bry_.Empty;
|
||||||
|
|
||||||
// The characters '<' and '>' (which were escaped by
|
// The characters '<' and '>' (which were escaped by
|
||||||
// removeHTMLtags()) should not be included in
|
// removeHTMLtags()) should not be included in
|
||||||
// URLs, per RFC 2396.
|
// URLs, per RFC 2396.
|
||||||
// Make &nbsp; terminate a URL as well (bug T84937)
|
// Make &nbsp; terminate a URL as well (bug T84937)
|
||||||
|
int separator_bgn = regex_link_interrupt.Find(trv, url, 0, url.length);
|
||||||
// $m2 = [];
|
if (separator_bgn != Bry_find_.Not_found) {
|
||||||
// if (preg_match(
|
trail = Bry_.Mid(url, separator_bgn);
|
||||||
// '/&(lt|gt|nbsp|#x0*(3[CcEe]|[Aa]0)|#0*(60|62|160));/',
|
url = Bry_.Mid(url, 0, separator_bgn);
|
||||||
// $url,
|
}
|
||||||
// $m2,
|
|
||||||
// PREG_OFFSET_CAPTURE
|
|
||||||
// )) {
|
|
||||||
// trail = substr($url, $m2[0][1]) . $trail;
|
|
||||||
// $url = substr($url, 0, $m2[0][1]);
|
|
||||||
// }
|
|
||||||
|
|
||||||
// Move trailing punctuation to $trail
|
// Move trailing punctuation to $trail
|
||||||
// $sep = ',;\.:!?';
|
int url_len = url.length;
|
||||||
// If there is no left bracket, then consider right brackets fair game too
|
// If there is no left bracket, then consider right brackets fair game too
|
||||||
// if (strpos($url, '(') === false) {
|
// XO.MW: if (strpos($url, '(') === false) {$sep .= ')';}
|
||||||
// $sep .= ')';
|
url_separators[Byte_ascii.Paren_end] = Bry_find_.Find_fwd(url, Byte_ascii.Paren_bgn, 0, url_len) == Bry_find_.Not_found;
|
||||||
// }
|
|
||||||
|
int num_sep_chars = Php_str_.Strspn_bwd__ary(url, url_separators, url_len, -1);
|
||||||
// $urlRev = strrev($url);
|
|
||||||
// $numSepChars = strspn($urlRev, $sep);
|
|
||||||
// Don't break a trailing HTML entity by moving the ; into $trail
|
// Don't break a trailing HTML entity by moving the ; into $trail
|
||||||
// This is in hot code, so use substr_compare to avoid having to
|
// This is in hot code, so use substr_compare to avoid having to
|
||||||
// create a new String Object for the comparison
|
// create a new String Object for the comparison
|
||||||
|
// XO.MW.NOTE: ignore semic if part of entity; EX: "http://a.org&apos;!."
|
||||||
// if ($numSepChars && substr_compare($url, ";", -$numSepChars, 1) === 0) {
|
if (num_sep_chars > 0 && Php_str_.Substr_byte(url, -num_sep_chars) == Byte_ascii.Semic) {
|
||||||
// more optimization: instead of running preg_match with a $
|
// more optimization: instead of running preg_match with a $
|
||||||
// anchor, which can be slow, do the match on the reversed
|
// anchor, which can be slow, do the match on the reversed
|
||||||
// String starting at the desired offset.
|
// String starting at the desired offset.
|
||||||
// un-reversed regexp is: /&([a-z]+|#x[\da-f]+|#\d+)$/i
|
// un-reversed regexp is: /&([a-z]+|#x[\da-f]+|#\d+)$/i
|
||||||
// if (preg_match('/\G([a-z]+|[\da-f]+x#|\d+#)&/i', $urlRev, $m2, 0, $numSepChars)) {
|
// if (preg_match('/\G([a-z]+|[\da-f]+x#|\d+#)&/i', $urlRev, $m2, 0, num_sep_chars)) {
|
||||||
// $numSepChars--;
|
if (Xomw_regex_html_entity.Match_bwd(url, url_len - num_sep_chars, 0)) {
|
||||||
// }
|
num_sep_chars--;
|
||||||
// }
|
}
|
||||||
// if ($numSepChars) {
|
}
|
||||||
// $trail = substr($url, -$numSepChars) . $trail;
|
|
||||||
// $url = substr($url, 0, -$numSepChars);
|
if (num_sep_chars > 0) {
|
||||||
// }
|
trail = Bry_.Add(Php_str_.Substr(url, -num_sep_chars), trail);
|
||||||
|
url = Php_str_.Substr(url, 0, -num_sep_chars);
|
||||||
|
}
|
||||||
|
|
||||||
// Verify that we still have a real URL after trail removal, and
|
// Verify that we still have a real URL after trail removal, and
|
||||||
// not just lone protocol
|
// not just lone protocol
|
||||||
@ -226,7 +229,8 @@ public class Xomw_magiclinks_wkr {
|
|||||||
|
|
||||||
// $url = Sanitizer::cleanUrl($url);
|
// $url = Sanitizer::cleanUrl($url);
|
||||||
|
|
||||||
// Is this an external image?
|
// XO.MW.UNSUPPORTED.NON-WMF: not supporting images from free-form url; (EX: "http://a.org/image.png" -> "<img>"); haven't seen this used on WMF wikis
|
||||||
|
// Is this an external image?
|
||||||
byte[] text = null; // $this->maybeMakeExternalImage($url);
|
byte[] text = null; // $this->maybeMakeExternalImage($url);
|
||||||
if (text == null) {
|
if (text == null) {
|
||||||
// Not an image, make a link
|
// Not an image, make a link
|
||||||
@ -235,11 +239,130 @@ public class Xomw_magiclinks_wkr {
|
|||||||
, true, Bry_.new_a7("free")
|
, true, Bry_.new_a7("free")
|
||||||
, new Xomwh_atr_mgr() // $this->getExternalLinkAttribs($url)
|
, new Xomwh_atr_mgr() // $this->getExternalLinkAttribs($url)
|
||||||
, page_title);
|
, page_title);
|
||||||
|
|
||||||
|
// XO.MW.UNSUPPORTED.HOOK: registers link for processing by other extensions?
|
||||||
// Register it in the output Object...
|
// Register it in the output Object...
|
||||||
// Replace unnecessary URL escape codes with their equivalent characters
|
// Replace unnecessary URL escape codes with their equivalent characters
|
||||||
// $pasteurized = self::normalizeLinkUrl($url);
|
// $pasteurized = self::normalizeLinkUrl($url);
|
||||||
// $this->mOutput->addExternalLink($pasteurized);
|
// $this->mOutput->addExternalLink($pasteurized);
|
||||||
}
|
}
|
||||||
// return $text . $trail;
|
bfr.Add(trail);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
class Xomw_regex_html_entity {
	// MW.REGEX: if (preg_match('/\G([a-z]+|[\da-f]+x#|\d+#)&/i', $urlRev, $m2, 0, num_sep_chars)) {
	// MW runs the regex against the reversed url; un-reversed regexp is: /&([a-z]+|#x[\da-f]+|#\d+)$/i
	// REGEX: "&" + (letters | "#x" + hex | "#" + dec)
	// \G anchors the match at the given offset; so the entity body must end directly before src_bgn with nothing in between
	// http://www.php.net/manual/en/regexp.reference.escape.php
	// http://stackoverflow.com/questions/14897949/what-is-the-use-of-g-anchor-in-regex
	/**
	 * Checks whether the bytes directly before {@code src_bgn} form the body of an HTML entity (everything except the final ";").
	 * @param src     bytes to scan
	 * @param src_bgn exclusive start; scanning begins at src_bgn - 1 and moves backward; EX: pos of the ";"
	 * @param src_end inclusive lower bound; bytes before this pos are never read
	 * @return true for a named ("&abc"), hex ("&#x1F") or decimal ("&#123") entity body; else false
	 */
	public static boolean Match_bwd(byte[] src, int src_bgn, int src_end) {
		int letters = 0;
		int numbers = 0;
		int non_hex_letters = 0;

		// walk backward over the alphanumeric body; any other byte (including non-ASCII bytes, which are negative) ends the body
		int cur = src_bgn - 1;
		for (; cur >= src_end; cur--) {
			byte b = src[cur];
			if (b >= '0' && b <= '9') {
				numbers++;
			}
			else if (Is_ltr(b)) {
				letters++;
				if (!Is_hex_ltr(b)) non_hex_letters++;
			}
			else
				break;
		}
		if (cur < src_end) return false; // reached lower bound without finding "&" or "#"

		byte term = src[cur];
		// named entity: letters only; EX: "&amp"
		if (term == '&') return letters > 0 && numbers == 0;
		if (term != '#') return false;

		// numeric entity: "#" must be directly preceded by "&"; EX: "&#" and "&#x"
		int amp_pos = cur - 1;
		if (amp_pos < src_end || src[amp_pos] != '&') return false;

		int body_len = letters + numbers;
		// NOTE: only read the byte after "#" when it is part of the body; else it may lie past the end of src
		boolean hex = false;
		if (body_len > 0) {
			byte hex_byte = src[cur + 1];
			hex = hex_byte == 'x' || hex_byte == 'X';
		}
		if (hex) {
			// hex: at least one digit after "x", and "x" itself must be the only non-hex letter; EX: "&#xzz" is invalid
			return body_len > 1 && non_hex_letters == 1;
		}
		// dec: digits only
		return numbers > 0 && letters == 0;
	}
	// true if b is an ASCII letter (a-z, A-Z)
	private static boolean Is_ltr(byte b) {
		return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z');
	}
	// true if b is a letter that is a valid hex digit (a-f, A-F)
	private static boolean Is_hex_ltr(byte b) {
		return (b >= 'a' && b <= 'f') || (b >= 'A' && b <= 'F');
	}
}
|
||||||
|
class Xomw_regex_link_interrupt {
|
||||||
|
private static final byte Bgn__ent__lt = 0, Bgn__ent__gt = 1, Bgn__ent__nbsp = 2, Bgn__hex = 3, Bgn__dec = 4;
|
||||||
|
private static final byte End__hex__lt = 0, End__hex__gt = 1, End__hex__nbsp = 2, End__dec__lt = 3, End__dec__gt = 4, End__dec__nbsp = 5;
|
||||||
|
private final Btrie_slim_mgr bgn_trie = Btrie_slim_mgr.cs();
|
||||||
|
private final Btrie_slim_mgr end_trie = Btrie_slim_mgr.ci_a7();
|
||||||
|
public Xomw_regex_link_interrupt() {
|
||||||
|
// MW.REGEX: &(lt|gt|nbsp|#x0*(3[CcEe]|[Aa]0)|#0*(60|62|160));
|
||||||
|
bgn_trie.Add_str_byte("<", Bgn__ent__lt);
|
||||||
|
bgn_trie.Add_str_byte(">", Bgn__ent__gt);
|
||||||
|
bgn_trie.Add_str_byte(" ", Bgn__ent__nbsp);
|
||||||
|
bgn_trie.Add_str_byte("&#x", Bgn__hex); // 3C | 3E | A0
|
||||||
|
bgn_trie.Add_str_byte("&#", Bgn__dec); // 60 | 62 | 160
|
||||||
|
|
||||||
|
end_trie.Add_str_byte("3c;", End__hex__lt);
|
||||||
|
end_trie.Add_str_byte("3e;", End__hex__gt);
|
||||||
|
end_trie.Add_str_byte("a0;", End__hex__nbsp);
|
||||||
|
end_trie.Add_str_byte("60;", End__dec__lt);
|
||||||
|
end_trie.Add_str_byte("62;", End__dec__gt);
|
||||||
|
end_trie.Add_str_byte("160;", End__dec__nbsp);
|
||||||
|
}
|
||||||
|
public int Find(Btrie_rv trv, byte[] src, int src_bgn, int src_end) {
|
||||||
|
int pos = src_bgn;
|
||||||
|
while (true) {
|
||||||
|
if (pos >= src_end) break;
|
||||||
|
byte b = src[pos];
|
||||||
|
Object bgn_obj = bgn_trie.Match_at_w_b0(trv, b, src, pos, src_end);
|
||||||
|
if (bgn_obj == null) {
|
||||||
|
pos += gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(b);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
byte bgn_tid = ((Byte_obj_val)bgn_obj).Val();
|
||||||
|
int end_pos = trv.Pos();
|
||||||
|
boolean valid = false;
|
||||||
|
switch (bgn_tid) {
|
||||||
|
case Bgn__ent__lt:
|
||||||
|
case Bgn__ent__gt:
|
||||||
|
case Bgn__ent__nbsp:
|
||||||
|
return pos;
|
||||||
|
case Bgn__hex:
|
||||||
|
case Bgn__dec:
|
||||||
|
// match rest of sequence from above; EX: "3c;", "60;" etc.
|
||||||
|
end_pos = Bry_find_.Find_fwd_while(src, end_pos, src_end, Byte_ascii.Num_0);
|
||||||
|
Object end_obj = end_trie.Match_at(trv, src, end_pos, src_end);
|
||||||
|
if (end_obj != null) {
|
||||||
|
// make sure that hex-dec matches; EX: "`" and "c;" are invalid
|
||||||
|
byte end_tid = ((Byte_obj_val)end_obj).Val();
|
||||||
|
if ( bgn_tid == Bgn__hex && Int_.Between(end_tid, End__hex__lt, End__hex__nbsp)
|
||||||
|
|| bgn_tid == Bgn__dec && Int_.Between(end_tid, End__dec__lt, End__dec__nbsp)
|
||||||
|
)
|
||||||
|
return pos;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (valid)
|
||||||
|
return pos;
|
||||||
|
else
|
||||||
|
pos += gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(b);
|
||||||
|
}
|
||||||
|
return Bry_find_.Not_found;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -20,6 +20,47 @@ import org.junit.*;
|
|||||||
public class Xomw_magiclinks_wkr__tst {
|
public class Xomw_magiclinks_wkr__tst {
|
||||||
private final Xomw_magiclinks_wkr__fxt fxt = new Xomw_magiclinks_wkr__fxt();
|
private final Xomw_magiclinks_wkr__fxt fxt = new Xomw_magiclinks_wkr__fxt();
|
||||||
@Test public void Basic() {fxt.Test__parse("a https://b.org c", "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a> c");}
|
@Test public void Basic() {fxt.Test__parse("a https://b.org c", "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a> c");}
|
||||||
|
@Test public void Interrupt() {
|
||||||
|
// ent
|
||||||
|
fxt.Test__parse("a https://b.org<c" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a><c");
|
||||||
|
// hex
|
||||||
|
fxt.Test__parse("a https://b.org<c" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a><c");
|
||||||
|
// dec
|
||||||
|
fxt.Test__parse("a https://b.org<c" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a><c");
|
||||||
|
}
|
||||||
|
@Test public void Interrupt__hex_dec() {// implementation specific test for mixed hex / dec
|
||||||
|
// hex-dec
|
||||||
|
fxt.Test__parse("a https://b.org`c" , "a <a class='external free' rel='nofollow' href='https://b.org&#x60;c'>https://b.org&#x60;c</a>");
|
||||||
|
// dec-hex
|
||||||
|
fxt.Test__parse("a https://b.orgc;c" , "a <a class='external free' rel='nofollow' href='https://b.org&#3c;c'>https://b.org&#3c;c</a>");
|
||||||
|
}
|
||||||
|
@Test public void Separator() {
|
||||||
|
// basic
|
||||||
|
fxt.Test__parse("a https://b.org.:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>.:!? c");
|
||||||
|
// ")" excluded
|
||||||
|
fxt.Test__parse("a https://b.org).:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>).:!? c");
|
||||||
|
// ")" included b/c "(" exists
|
||||||
|
fxt.Test__parse("a https://b.org().:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org()'>https://b.org()</a>.:!? c");
|
||||||
|
// ";" excluded
|
||||||
|
fxt.Test__parse("a https://b.org;.:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>;.:!? c");
|
||||||
|
// ";" included b/c of ent
|
||||||
|
fxt.Test__parse("a https://b.org&abc;.:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org&abc;'>https://b.org&abc;</a>.:!? c");
|
||||||
|
// ";" included b/c of hex
|
||||||
|
fxt.Test__parse("a https://b.org±.:!? c", "a <a class='external free' rel='nofollow' href='https://b.org&#xB1;'>https://b.org&#xB1;</a>.:!? c");
|
||||||
|
// ";" included b/c of dec
|
||||||
|
fxt.Test__parse("a https://b.org{.:!? c", "a <a class='external free' rel='nofollow' href='https://b.org&#123;'>https://b.org&#123;</a>.:!? c");
|
||||||
|
// ";" excluded b/c of invalid.ent
|
||||||
|
fxt.Test__parse("a https://b.org&a1b;.:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org&a1b'>https://b.org&a1b</a>;.:!? c");
|
||||||
|
// ";" excluded b/c of invalid.hex
|
||||||
|
fxt.Test__parse("a https://b.org&#x;.:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org&#x'>https://b.org&#x</a>;.:!? c");
|
||||||
|
// ";" excluded b/c of invalid.dec
|
||||||
|
fxt.Test__parse("a https://b.org&#a;.:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org&#a'>https://b.org&#a</a>;.:!? c");
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
TESTS: regex
|
||||||
|
"<a https://a.org>"
|
||||||
|
"<img https://a.org>"
|
||||||
|
*/
|
||||||
@Test public void Invalid() {fxt.Test__parse("a _https://b.org c", "a _https://b.org c");}
|
@Test public void Invalid() {fxt.Test__parse("a _https://b.org c", "a _https://b.org c");}
|
||||||
}
|
}
|
||||||
class Xomw_magiclinks_wkr__fxt {
|
class Xomw_magiclinks_wkr__fxt {
|
||||||
|
Loading…
Reference in New Issue
Block a user