1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2024-10-27 20:34:16 +00:00

Mw_parse: Handle interrupt and separator logic for magiclinks

This commit is contained in:
gnosygnu 2017-01-28 02:47:22 -05:00
parent e231df0ce1
commit 7e27b5415d
6 changed files with 263 additions and 70 deletions

View File

@ -0,0 +1,39 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.core.primitives; import gplx.*; import gplx.core.*;
public class Bool_ary_bldr {
private final boolean[] ary;
public Bool_ary_bldr(int len) {
this.ary = new boolean[len];
}
public Bool_ary_bldr Set_many(int... v) {
int len = v.length;
for (int i = 0; i < len; i++)
ary[v[i]] = true;
return this;
}
public Bool_ary_bldr Set_rng(int bgn, int end) {
for (int i = bgn; i <= end; i++)
ary[i] = true;
return this;
}
public boolean[] To_ary() {
return ary;
}
public static Bool_ary_bldr New_u8() {return new Bool_ary_bldr(256);}
}

View File

@ -39,7 +39,7 @@ public class Php_str_ {
int end = len < 0 ? src_len + len : bgn + len;
if (end > src.length) end = src.length;; // handle out of bounds;
return src[bgn];
}
}
public static int Strspn_fwd__ary(byte[] src, boolean[] find, int bgn, int max, int src_len) {
if (max == -1) max = src_len;
int rv = 0;
@ -90,6 +90,17 @@ public class Php_str_ {
}
return rv;
}
// Backward variant of PHP's strspn: counts the consecutive bytes immediately before "bgn" whose
// entry in "find" is true, walking toward index 0 and stopping at the first unflagged byte.
//   src  : bytes to scan
//   find : lookup table indexed by unsigned byte value
//   bgn  : exclusive start; scanning begins at bgn - 1
//   max  : maximum count to return; -1 means no limit
// returns the number of matching bytes (0 if none)
public static int Strspn_bwd__ary(byte[] src, boolean[] find, int bgn, int max) {
	if (max == -1) max = Int_.Max_value;
	int rv = 0;
	for (int i = bgn - 1; i > -1; i--) {
		// Java bytes are signed: mask to 0-255 so that bytes >= 0x80 (EX: utf8 multi-byte chars)
		// do not become a negative array index
		int idx = src[i] & 0xFF;
		// values beyond the table are treated as "not in set"
		if (rv < max && idx < find.length && find[idx])
			rv++;
		else
			break;
	}
	return rv;
}
public static int Strspn_bwd__space_or_tab(byte[] src, int bgn, int max) {
if (max == -1) max = Int_.Max_value;
int rv = 0;

View File

@ -16,7 +16,7 @@ You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws; import gplx.*; import gplx.xowa.*;
import gplx.core.encoders.*; import gplx.langs.htmls.entitys.*;
import gplx.core.encoders.*; import gplx.core.primitives.*; import gplx.langs.htmls.entitys.*;
import gplx.xowa.parsers.htmls.*;
import gplx.xowa.mws.parsers.*;
public class Xomw_sanitizer {
@ -515,24 +515,3 @@ class Xomw_html_ent {
public final byte[] html;
public static final byte Type__null = 0, Type__alias = 1, Type__char = 2, Type__entity = 3;
}
class Bool_ary_bldr {
private final boolean[] ary;
public Bool_ary_bldr(int len) {
this.ary = new boolean[len];
}
public Bool_ary_bldr Set_many(int... v) {
int len = v.length;
for (int i = 0; i < len; i++)
ary[v[i]] = true;
return this;
}
public Bool_ary_bldr Set_rng(int bgn, int end) {
for (int i = bgn; i <= end; i++)
ary[i] = true;
return this;
}
public boolean[] To_ary() {
return ary;
}
public static Bool_ary_bldr New_u8() {return new Bool_ary_bldr(256);}
}

View File

@ -20,7 +20,7 @@ import gplx.core.btries.*;
public class Xomw_regex_ {
public static int Find_fwd_while(Btrie_slim_mgr trie, Btrie_rv trv, byte[] src, int src_bgn, int src_end) {
int cur = src_bgn;
while (true) {
while (cur < src_end) {
byte b = src[cur];
Object o = trie.Match_at_w_b0(trv, b, src, cur, src_end);
if (o == null)
@ -32,7 +32,7 @@ public class Xomw_regex_ {
}
public static int Find_fwd_until(Btrie_slim_mgr trie, Btrie_rv trv, byte[] src, int src_bgn, int src_end) {
int cur = src_bgn;
while (true) {
while (cur < src_end) {
byte b = src[cur];
Object o = trie.Match_at_w_b0(trv, b, src, cur, src_end);
if (o == null)

View File

@ -23,12 +23,20 @@ public class Xomw_magiclinks_wkr {
private final Btrie_slim_mgr regex_trie = Btrie_slim_mgr.ci_a7(); // NOTE: must be ci to handle protocols; EX: "https:" and "HTTPS:"
private final Btrie_rv trv = new Btrie_rv(); // trie-match result holder; also handed to regex_link_interrupt.Find
private static byte[] Tag__anch__rhs; // "</a>"; assigned lazily inside a synchronized block
// lookup table indexed by byte value: true for chars treated as trailing punctuation (',;.:!?');
// the ")" slot is rewritten for each url depending on whether the url contains "("
private boolean[] url_separators;
private static Xomw_regex_link_interrupt regex_link_interrupt; // matcher for the link-interrupt entities; created in the same lazy-init block as Tag__anch__rhs
private Xomw_regex_boundary regex_boundary; // supplied via Init_by_wiki
private Xomw_regex_url regex_url; // supplied via Init_by_wiki
private Xomw_linker linker; // supplied via Init_by_wiki
private byte[] page_title;
private static final byte Regex__anch = 1, Regex__elem = 2, Regex__free = 3;
public Xomw_magiclinks_wkr() {
	// build the byte lookup table for MW's trailing-separator set: ',;\.:!?'
	Bool_ary_bldr separators_bldr = Bool_ary_bldr.New_u8();
	separators_bldr.Set_many
	( Byte_ascii.Comma
	, Byte_ascii.Semic
	, Byte_ascii.Dot
	, Byte_ascii.Colon
	, Byte_ascii.Bang
	, Byte_ascii.Question
	);
	url_separators = separators_bldr.To_ary();
}
public void Init_by_wiki(Xomw_linker linker, Xomw_regex_boundary regex_boundary, Xomw_regex_url regex_url) {
this.linker = linker;
this.regex_boundary = regex_boundary;
@ -46,6 +54,7 @@ public class Xomw_magiclinks_wkr {
if (Tag__anch__rhs == null) {
synchronized (Type_adp_.ClassOf_obj(this)) {
Tag__anch__rhs = Bry_.new_a7("</a>");
regex_link_interrupt = new Xomw_regex_link_interrupt();
}
}
}
@ -64,19 +73,19 @@ public class Xomw_magiclinks_wkr {
int prv = cur;
boolean dirty = true;
// PORTED.REGEX: handle below
// NOTE: not handling RFC|PMID|ISBN b/c of upcoming obsolescence: https://www.mediawiki.org/wiki/Requests_for_comment/Future_of_magic_links
//'!(?: // Start cases
// (<a[ \t\r\n>].*?</a>) | // m[1]: Skip link text
// (<.*?>) | // m[2]: Skip stuff inside
// // HTML elements' . "
// (\b(?i:$prots)($addr$urlChar*)) | // m[3]: Free external links
// // m[4]: Post-protocol path
// \b(?:RFC|PMID) $spaces // m[5]: RFC or PMID, capture number
// XO.MW.UNSUPPORTED.OBSOLETE: not handling RFC|PMID|ISBN b/c of upcoming obsolescence: https://www.mediawiki.org/wiki/Requests_for_comment/Future_of_magic_links
//'!(?: // Start cases
// (<a[ \t\r\n>].*?</a>) | // m[1]: Skip link text
// (<.*?>) | // m[2]: Skip stuff inside
// // HTML elements' . "
// (\b(?i:$prots)($addr$urlChar*)) | // m[3]: Free external links
// // m[4]: Post-protocol path
// \b(?:RFC|PMID) $spaces // m[5]: RFC or PMID, capture number
// ([0-9]+)\b |
// \bISBN $spaces ( // m[6]: ISBN, capture number
// (?: 97[89] $spdash?)? // optional 13-digit ISBN prefix
// (?: [0-9] $spdash?){9} // 9 digits with opt. delimiters
// [0-9Xx] // check digit
// \bISBN $spaces ( // m[6]: ISBN, capture number
// (?: 97[89] $spdash?)? // optional 13-digit ISBN prefix
// (?: [0-9] $spdash?){9} // 9 digits with opt. delimiters
// [0-9Xx] // check digit
// )\b
while (true) {
if (cur == src_end) {
@ -173,50 +182,44 @@ public class Xomw_magiclinks_wkr {
// Make a free external link, given a user-supplied URL
public void Make_free_external_link(Bry_bfr bfr, byte[] url, int num_post_proto) {
// byte[] trail = Bry_.Empty;
byte[] trail = Bry_.Empty;
// The characters '<' and '>' (which were escaped by
// removeHTMLtags()) should not be included in
// URLs, per RFC 2396.
// Make &nbsp; terminate a URL as well (bug T84937)
// $m2 = [];
// if (preg_match(
// '/&(lt|gt|nbsp|#x0*(3[CcEe]|[Aa]0)|#0*(60|62|160));/',
// $url,
// $m2,
// PREG_OFFSET_CAPTURE
// )) {
// trail = substr($url, $m2[0][1]) . $trail;
// $url = substr($url, 0, $m2[0][1]);
// }
int separator_bgn = regex_link_interrupt.Find(trv, url, 0, url.length);
if (separator_bgn != Bry_find_.Not_found) {
trail = Bry_.Mid(url, separator_bgn);
url = Bry_.Mid(url, 0, separator_bgn);
}
// Move trailing punctuation to $trail
// $sep = ',;\.:!?';
int url_len = url.length;
// If there is no left bracket, then consider right brackets fair game too
// if (strpos($url, '(') === false) {
// $sep .= ')';
// }
// $urlRev = strrev($url);
// $numSepChars = strspn($urlRev, $sep);
// XO.MW: if (strpos($url, '(') === false) {$sep .= ')';}
url_separators[Byte_ascii.Paren_end] = Bry_find_.Find_fwd(url, Byte_ascii.Paren_bgn, 0, url_len) == Bry_find_.Not_found;
int num_sep_chars = Php_str_.Strspn_bwd__ary(url, url_separators, url_len, -1);
// Don't break a trailing HTML entity by moving the ; into $trail
// This is in hot code, so use substr_compare to avoid having to
// create a new String Object for the comparison
// if ($numSepChars && substr_compare($url, ";", -$numSepChars, 1) === 0) {
// XO.MW.NOTE: ignore semic if part of entity; EX: "http://a.org&apos;!."
if (num_sep_chars > 0 && Php_str_.Substr_byte(url, -num_sep_chars) == Byte_ascii.Semic) {
// more optimization: instead of running preg_match with a $
// anchor, which can be slow, do the match on the reversed
// String starting at the desired offset.
// un-reversed regexp is: /&([a-z]+|#x[\da-f]+|#\d+)$/i
// if (preg_match('/\G([a-z]+|[\da-f]+x#|\d+#)&/i', $urlRev, $m2, 0, $numSepChars)) {
// $numSepChars--;
// }
// }
// if ($numSepChars) {
// $trail = substr($url, -$numSepChars) . $trail;
// $url = substr($url, 0, -$numSepChars);
// }
// if (preg_match('/\G([a-z]+|[\da-f]+x#|\d+#)&/i', $urlRev, $m2, 0, num_sep_chars)) {
if (Xomw_regex_html_entity.Match_bwd(url, url_len - num_sep_chars, 0)) {
num_sep_chars--;
}
}
if (num_sep_chars > 0) {
trail = Bry_.Add(Php_str_.Substr(url, -num_sep_chars), trail);
url = Php_str_.Substr(url, 0, -num_sep_chars);
}
// Verify that we still have a real URL after trail removal, and
// not just lone protocol
@ -226,7 +229,8 @@ public class Xomw_magiclinks_wkr {
// $url = Sanitizer::cleanUrl($url);
// Is this an external image?
// XO.MW.UNSUPPORTED.NON-WMF: not supporting images from freeform url; (EX: "http://a.org/image.png" -> "<img>"); haven't seen this used on WMF wikis
// Is this an external image?
byte[] text = null; // $this->maybeMakeExternalImage($url);
if (text == null) {
// Not an image, make a link
@ -235,11 +239,130 @@ public class Xomw_magiclinks_wkr {
, true, Bry_.new_a7("free")
, new Xomwh_atr_mgr() // $this->getExternalLinkAttribs($url)
, page_title);
// XO.MW.UNSUPPORTED.HOOK: registers link for processing by other extensions?
// Register it in the output Object...
// Replace unnecessary URL escape codes with their equivalent characters
// $pasteurized = self::normalizeLinkUrl($url);
// $this->mOutput->addExternalLink($pasteurized);
// $pasteurized = self::normalizeLinkUrl($url);
// $this->mOutput->addExternalLink($pasteurized);
}
// return $text . $trail;
bfr.Add(trail);
}
}
class Xomw_regex_html_entity {
// if (preg_match('/\G([a-z]+|[\da-f]+x#|\d+#)&/i', $urlRev, $m2, 0, num_sep_chars)) {
// REGEX: (letters | hex + "#" | dec + "x#") + "&"
// \G means "stop if matching breaks"; so, using a reversed example, "http://&#amp;&#!lt;" will not match "&#amp;" b/c "&#!lt;" breaks match
// http://www.php.net/manual/en/regexp.reference.escape.php
// http://stackoverflow.com/questions/14897949/what-is-the-use-of-g-anchor-in-regex
public static boolean Match_bwd(byte[] src, int src_bgn, int src_end) {
int cur = src_bgn - 1;
int numbers = 0;
int letters = 0;
while (cur >= src_end) {
int b_bgn = gplx.core.intls.Utf8_.Get_pos0_of_char_bwd(src, cur);
switch (src[b_bgn]) {
case Byte_ascii.Ltr_A: case Byte_ascii.Ltr_B: case Byte_ascii.Ltr_C: case Byte_ascii.Ltr_D: case Byte_ascii.Ltr_E:
case Byte_ascii.Ltr_F: case Byte_ascii.Ltr_G: case Byte_ascii.Ltr_H: case Byte_ascii.Ltr_I: case Byte_ascii.Ltr_J:
case Byte_ascii.Ltr_K: case Byte_ascii.Ltr_L: case Byte_ascii.Ltr_M: case Byte_ascii.Ltr_N: case Byte_ascii.Ltr_O:
case Byte_ascii.Ltr_P: case Byte_ascii.Ltr_Q: case Byte_ascii.Ltr_R: case Byte_ascii.Ltr_S: case Byte_ascii.Ltr_T:
case Byte_ascii.Ltr_U: case Byte_ascii.Ltr_V: case Byte_ascii.Ltr_W: case Byte_ascii.Ltr_X: case Byte_ascii.Ltr_Y: case Byte_ascii.Ltr_Z:
case Byte_ascii.Ltr_a: case Byte_ascii.Ltr_b: case Byte_ascii.Ltr_c: case Byte_ascii.Ltr_d: case Byte_ascii.Ltr_e:
case Byte_ascii.Ltr_f: case Byte_ascii.Ltr_g: case Byte_ascii.Ltr_h: case Byte_ascii.Ltr_i: case Byte_ascii.Ltr_j:
case Byte_ascii.Ltr_k: case Byte_ascii.Ltr_l: case Byte_ascii.Ltr_m: case Byte_ascii.Ltr_n: case Byte_ascii.Ltr_o:
case Byte_ascii.Ltr_p: case Byte_ascii.Ltr_q: case Byte_ascii.Ltr_r: case Byte_ascii.Ltr_s: case Byte_ascii.Ltr_t:
case Byte_ascii.Ltr_u: case Byte_ascii.Ltr_v: case Byte_ascii.Ltr_w: case Byte_ascii.Ltr_x: case Byte_ascii.Ltr_y: case Byte_ascii.Ltr_z:
letters++;
break;
case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4:
case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9:
numbers++;
break;
case Byte_ascii.Hash:
// next must be &; EX: "&#" and "&#x"
int prv = cur - 1;
if (prv >= src_end && src[prv] == Byte_ascii.Amp) {
// if hex, num | ltr is fine
byte hex_byte = src[cur + 1];
if (hex_byte == Byte_ascii.Ltr_X || hex_byte == Byte_ascii.Ltr_x) {
return numbers > 0 || letters > 1; // 1 to ignore "x"
}
// if dec, no letters allowed
else {
return numbers > 0 && letters == 0;
}
}
return false;
case Byte_ascii.Amp:
// if entity, no numbers
return letters > 0 && numbers == 0;
default:
return false;
}
cur--;
}
return false;
}
}
// Finds the first "link interrupt" in a url: an escaped "<", ">" or nbsp, written as a named,
// hex or decimal entity. Hand-rolled replacement for the MW regex noted in the constructor.
class Xomw_regex_link_interrupt {
	private static final byte Bgn__ent__lt = 0, Bgn__ent__gt = 1, Bgn__ent__nbsp = 2, Bgn__hex = 3, Bgn__dec = 4;
	private static final byte End__hex__lt = 0, End__hex__gt = 1, End__hex__nbsp = 2, End__dec__lt = 3, End__dec__gt = 4, End__dec__nbsp = 5;
	private final Btrie_slim_mgr bgn_trie = Btrie_slim_mgr.cs();
	private final Btrie_slim_mgr end_trie = Btrie_slim_mgr.ci_a7();
	public Xomw_regex_link_interrupt() {
		// MW.REGEX: &(lt|gt|nbsp|#x0*(3[CcEe]|[Aa]0)|#0*(60|62|160));
		// prefixes: full named entities, or the lead-in of a numeric entity
		bgn_trie.Add_str_byte("&lt;", Bgn__ent__lt);
		bgn_trie.Add_str_byte("&gt;", Bgn__ent__gt);
		bgn_trie.Add_str_byte("&nbsp;", Bgn__ent__nbsp);
		bgn_trie.Add_str_byte("&#x", Bgn__hex); // 3C | 3E | A0
		bgn_trie.Add_str_byte("&#", Bgn__dec); // 60 | 62 | 160
		// suffixes: numeric value (after any leading zeros) plus the closing ";"
		end_trie.Add_str_byte("3c;", End__hex__lt);
		end_trie.Add_str_byte("3e;", End__hex__gt);
		end_trie.Add_str_byte("a0;", End__hex__nbsp);
		end_trie.Add_str_byte("60;", End__dec__lt);
		end_trie.Add_str_byte("62;", End__dec__gt);
		end_trie.Add_str_byte("160;", End__dec__nbsp);
	}
	// Returns the position of the first interrupt entity in src[src_bgn..src_end), or Bry_find_.Not_found.
	// trv is scratch space for the trie matches.
	public int Find(Btrie_rv trv, byte[] src, int src_bgn, int src_end) {
		for (int pos = src_bgn; pos < src_end; ) {
			byte b = src[pos];
			Object bgn_obj = bgn_trie.Match_at_w_b0(trv, b, src, pos, src_end);
			if (bgn_obj != null) {
				byte bgn_tid = ((Byte_obj_val)bgn_obj).Val();
				int num_bgn = trv.Pos(); // read before trv is reused for the suffix match
				// named entity: prefix is the entire match
				if (bgn_tid == Bgn__ent__lt || bgn_tid == Bgn__ent__gt || bgn_tid == Bgn__ent__nbsp)
					return pos;
				// numeric entity: rest of sequence must match; EX: "3c;", "60;" etc.
				if (bgn_tid == Bgn__hex || bgn_tid == Bgn__dec) {
					if (Match_num_suffix(trv, src, num_bgn, src_end, bgn_tid == Bgn__hex))
						return pos;
				}
			}
			// no interrupt at pos; move to next char
			pos += gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(b);
		}
		return Bry_find_.Not_found;
	}
	// Skips leading "0"s at num_bgn, then checks that a known suffix follows and that its radix
	// agrees with the prefix; EX: "&#x60;" and "&#3c;" are invalid
	private boolean Match_num_suffix(Btrie_rv trv, byte[] src, int num_bgn, int src_end, boolean is_hex) {
		int val_bgn = Bry_find_.Find_fwd_while(src, num_bgn, src_end, Byte_ascii.Num_0);
		Object end_obj = end_trie.Match_at(trv, src, val_bgn, src_end);
		if (end_obj == null) return false;
		byte end_tid = ((Byte_obj_val)end_obj).Val();
		return is_hex
			? Int_.Between(end_tid, End__hex__lt, End__hex__nbsp)
			: Int_.Between(end_tid, End__dec__lt, End__dec__nbsp);
	}
}

View File

@ -20,6 +20,47 @@ import org.junit.*;
// Tests for free-external-link ("magic link") parsing: each case passes raw text to the fixture
// and compares against the expected html, where the url is wrapped in an "external free" anchor.
public class Xomw_magiclinks_wkr__tst {
private final Xomw_magiclinks_wkr__fxt fxt = new Xomw_magiclinks_wkr__fxt();
// plain url surrounded by spaces becomes an anchor
@Test public void Basic() {fxt.Test__parse("a https://b.org c", "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a> c");}
// an escaped "<" ends the url; the entity and everything after it stay outside the anchor
@Test public void Interrupt() {
// ent
fxt.Test__parse("a https://b.org&lt;c" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>&lt;c");
// hex
fxt.Test__parse("a https://b.org&#x3c;c" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>&#x3c;c");
// dec
fxt.Test__parse("a https://b.org&#60;c" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>&#60;c");
}
// a hex prefix with a dec value (or vice versa) is not an interrupt; the text stays inside the url
@Test public void Interrupt__hex_dec() {// implementation specific test for mixed hex / dec
// hex-dec
fxt.Test__parse("a https://b.org&#x60;c" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;#x60;c'>https://b.org&amp;#x60;c</a>");
// dec-hex
fxt.Test__parse("a https://b.org&#3c;c" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;#3c;c'>https://b.org&amp;#3c;c</a>");
}
// trailing punctuation is moved out of the anchor; "excluded" / "included" below refer to whether the char stays in the url
@Test public void Separator() {
// basic
fxt.Test__parse("a https://b.org.:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>.:!? c");
// ")" excluded b/c url has no "("
fxt.Test__parse("a https://b.org).:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>).:!? c");
// ")" included b/c "(" exists
fxt.Test__parse("a https://b.org().:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org()'>https://b.org()</a>.:!? c");
// ";" excluded
fxt.Test__parse("a https://b.org;.:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a>;.:!? c");
// ";" included b/c of ent
fxt.Test__parse("a https://b.org&abc;.:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;abc;'>https://b.org&amp;abc;</a>.:!? c");
// ";" included b/c of hex
fxt.Test__parse("a https://b.org&#xB1;.:!? c", "a <a class='external free' rel='nofollow' href='https://b.org&amp;#xB1;'>https://b.org&amp;#xB1;</a>.:!? c");
// ";" included b/c of dec
fxt.Test__parse("a https://b.org&#123;.:!? c", "a <a class='external free' rel='nofollow' href='https://b.org&amp;#123;'>https://b.org&amp;#123;</a>.:!? c");
// ";" excluded b/c of invalid.ent (digit inside name)
fxt.Test__parse("a https://b.org&a1b;.:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;a1b'>https://b.org&amp;a1b</a>;.:!? c");
// ";" excluded b/c of invalid.hex (no digits after "x")
fxt.Test__parse("a https://b.org&#x;.:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;#x'>https://b.org&amp;#x</a>;.:!? c");
// ";" excluded b/c of invalid.dec (letter instead of digits)
fxt.Test__parse("a https://b.org&#a;.:!? c" , "a <a class='external free' rel='nofollow' href='https://b.org&amp;#a'>https://b.org&amp;#a</a>;.:!? c");
}
/*
TODO: tests still to be written for the regex
"<a https://a.org>"
"<img https://a.org>"
*/
// protocol preceded by "_" is left as plain text
@Test public void Invalid() {fxt.Test__parse("a _https://b.org c", "a _https://b.org c");}
}
class Xomw_magiclinks_wkr__fxt {