1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2024-09-30 07:20:52 +00:00

Mw_parse: Implement most of external_links

This commit is contained in:
gnosygnu 2017-02-02 04:12:33 -05:00
parent bcd4bd46ef
commit 4ade9c71b1
6 changed files with 58 additions and 40 deletions

View File

@ -18,6 +18,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
package gplx.core.btries; import gplx.*; import gplx.core.*;
import gplx.core.threads.poolables.*;
public class Btrie_rv {
public int Match_bgn = -1;
public Object Obj() {return obj;} private Object obj;
public int Pos() {return pos;} private int pos;
public Btrie_rv Init(int pos, Object obj) {

View File

@ -60,6 +60,7 @@ public class Php_preg_ {
return rv;
}
public static Object Match(Btrie_slim_mgr trie, Btrie_rv trv, byte[] src, int src_bgn, int src_end) {
trv.Match_bgn = -1;
int cur = src_bgn;
while (cur < src_end) {
byte b = src[cur];
@ -67,6 +68,7 @@ public class Php_preg_ {
if (o == null)
cur += gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(b);
else {
trv.Match_bgn = cur;
return o;
}
}

View File

@ -20,10 +20,8 @@ import gplx.core.btries.*;
import gplx.xowa.mws.htmls.*;
import gplx.xowa.mws.linkers.*;
/* TODO.XO
* titleFormatter->gePrefixedTex
* $html = HtmlArmor::getHtml($text);
* Get_link_url
* Get_link_classes
* P7: titleFormatter->getPrefixedText
* P7: $html = HtmlArmor::getHtml($text);
*/
public class Xomw_linker {
private final Bry_bfr tmp = Bry_bfr_.New();
@ -107,13 +105,13 @@ public class Xomw_linker {
link_renderer.Make_known_link(bfr, target, text, custom_attribs, query);
}
else if (options.broken) {
// return $linkRenderer->makeBrokenLink($target, $text, $customAttribs, $query);
link_renderer.Make_broken_link(bfr, target, text, custom_attribs, query);
}
else if (options.no_classes) {
link_renderer.Make_preloaded_link(bfr, target, text, Bry_.Empty, custom_attribs, query);
}
else {
// $linkRenderer->makeLink($target, $text, $customAttribs, $query);
link_renderer.Make_link(bfr, target, text, Bry_.Empty, custom_attribs, query);
}
}
public void Make_self_link_obj(Bry_bfr bfr, Xoa_ttl nt, byte[] html, byte[] query, byte[] trail, byte[] prefix) {

View File

@ -290,6 +290,8 @@ public class Xomw_parser {
byte[] key = itm.Text_bry(); // EX: "https://"
rv.Add_obj(key, key);
}
byte[] bry__relative = Bry_.new_a7("//");
rv.Add_obj(bry__relative, bry__relative); // REF.MW: "$this->mUrlProtocols = wfUrlProtocols();"; "wfUrlProtocols( $includeProtocolRelative = true )"
return rv;
}
}

View File

@ -20,10 +20,6 @@ import gplx.core.btries.*; import gplx.core.primitives.*;
import gplx.langs.phps.utls.*;
import gplx.xowa.mws.htmls.*;
/* TODO.XO
* P8: url = sanitizer.Clean_url(url);
* P8: The characters '<' and '>' (which were escaped by
* P7: add proto-rel; EX: [//a.org b]
* P7: list( $dtrail, $trail ) = Linker::splitTrail( $trail );
* P3: $langObj->formatNum( ++$this->mAutonumber );
* P2: $this->getConverterLanguage()->markNoConversion( $text );
*/
@ -33,7 +29,7 @@ public class Xomw_lnke_wkr {// THREAD.UNSAFE: caching for repeated calls
private int autonumber;
private final Xomw_parser parser;
private final Xomw_linker linker;
// private final Xomw_sanitizer sanitizer;
private final Xomw_sanitizer sanitizer;
private final Xomw_atr_mgr attribs = new Xomw_atr_mgr();
private Xomw_regex_url regex_url;
private Xomw_regex_space regex_space;
@ -41,7 +37,27 @@ public class Xomw_lnke_wkr {// THREAD.UNSAFE: caching for repeated calls
this.parser = parser;
this.tmp = parser.Tmp();
this.linker = parser.Linker();
// this.sanitizer = parser.Sanitizer();
this.sanitizer = parser.Sanitizer();
if (angle_entities_trie == null) {
synchronized (Type_adp_.ClassOf_obj(this)) {
Link_type__free = Bry_.new_a7("free");
Link_type__text = Bry_.new_a7("text");
Link_type__autonumber = Bry_.new_a7("autonumber");
angle_entities_trie = Btrie_slim_mgr.cs().Add_many_str("&lt;", "&gt;");
// REGEX:([^\]\\x00-\\x08\\x0a-\\x1F]*?); NOTE: val is key.length
invalid_text_chars_trie = Btrie_slim_mgr.cs();
New__trie_itm__by_len(invalid_text_chars_trie, Byte_ascii.Brack_end);
for (int i = 0; i <= 8; i++) { // x00-x08
New__trie_itm__by_len(invalid_text_chars_trie, i);
}
for (int i = 10; i <= 31; i++) { // x0a-x1F
New__trie_itm__by_len(invalid_text_chars_trie, i);
}
}
}
}
public void Init_by_wiki(Btrie_slim_mgr protocol_trie, Xomw_regex_url regex_url, Xomw_regex_space regex_space) {
this.protocol_trie = protocol_trie;
@ -151,12 +167,11 @@ public class Xomw_lnke_wkr {// THREAD.UNSAFE: caching for repeated calls
// The characters '<' and '>' (which were escaped by
// removeHTMLtags()) should not be included in
// URLs, per RFC 2396.
// TODO.XO:
//$m2 = [];
//if ( preg_match( '/&(lt|gt);/', $url, $m2, PREG_OFFSET_CAPTURE ) ) {
// $text = substr( $url, $m2[0][1] ) . ' ' . $text;
// $url = substr( $url, 0, $m2[0][1] );
//}
if (Php_preg_.Match(angle_entities_trie, trv, src, url_bgn, url_end) != null) {
int angle_bgn = trv.Match_bgn;
text_bgn = angle_bgn;
url_end = angle_bgn;
}
// If the link text is an image URL, replace it with an <img> tag
// This happened by accident in the original parser, but some people used it extensively
@ -164,6 +179,7 @@ public class Xomw_lnke_wkr {// THREAD.UNSAFE: caching for repeated calls
// $img = $this->maybeMakeExternalImage( $text );
// if ($img !== false) $text = $img;
// XO.MW.SKIP: See "Have link text"
//$dtrail = '';
// Set linktype for CSS - if URL==text, link is essentially free
@ -179,17 +195,17 @@ public class Xomw_lnke_wkr {// THREAD.UNSAFE: caching for repeated calls
link_type = Link_type__autonumber;
}
else {
// XO.MW.SKIP: skipped b/c MW splits $trail into $dtrail and $trail but does no extra logic with variables; just concatenates later; "$this->getExternalLinkAttribs( $url ) ) . $dtrail . $trail;"
// Have link text, e.g. [http://domain.tld/some.link text]s
// Check for trail
// TODO.XO:
// list( $dtrail, $trail ) = Linker::splitTrail( $trail );
}
// TODO.XO:
// $text = $this->getConverterLanguage()->markNoConversion( $text );
// TODO.XO:
// url = sanitizer.Clean_url(url);
byte[] url = Bry_.Mid(src, url_bgn, url_end);
url = sanitizer.Clean_url(url);
bfr.Add_mid(src, prv, lnke_bgn);
prv = cur;
@ -197,7 +213,7 @@ public class Xomw_lnke_wkr {// THREAD.UNSAFE: caching for repeated calls
// This means that users can paste URLs directly into the text
// Funny characters like ö aren't valid in URLs anyway
// This was changed in August 2004
linker.Make_external_link(bfr, Bry_.Mid(src, url_bgn, url_end), Bry_.Mid(src, text_bgn, text_end), Bool_.N, link_type, parser.Get_external_link_attribs(attribs), Bry_.Empty);
linker.Make_external_link(bfr, url, Bry_.Mid(src, text_bgn, text_end), Bool_.N, link_type, parser.Get_external_link_attribs(attribs), Bry_.Empty);
// XO.MW.UNSUPPORTED.HOOK: registers link for processing by other extensions?
// Register link in the output Object.
@ -208,24 +224,9 @@ public class Xomw_lnke_wkr {// THREAD.UNSAFE: caching for repeated calls
}
}
private static final byte[]
Link_type__free = Bry_.new_a7("free")
, Link_type__text = Bry_.new_a7("text")
, Link_type__autonumber = Bry_.new_a7("autonumber")
;
private static final Btrie_slim_mgr invalid_text_chars_trie = New__invalid_text_chars_trie();
// Builds the trie of bytes that may not appear in link text.
// REGEX:([^\]\\x00-\\x08\\x0a-\\x1F]*?); NOTE: val is key.length
// Keys are registered in this order: "]", then x00-x08, then x0a-x1F (x09 is deliberately left out).
private static Btrie_slim_mgr New__invalid_text_chars_trie() {
	Btrie_slim_mgr trie = Btrie_slim_mgr.cs();
	New__trie_itm__by_len(trie, Byte_ascii.Brack_end);
	int code = 0;
	while (code <= 31) {
		if (code != 9) {	// x09 sits between the two ranges and is not part of the class
			New__trie_itm__by_len(trie, code);
		}
		code++;
	}
	return trie;
}
// CSS link-type names ("free", "text", "autonumber"); not final because they are assigned inside the lazy-init block that runs while angle_entities_trie is still null.
private static byte[] Link_type__free, Link_type__text, Link_type__autonumber;
// Trie holding "&lt;" and "&gt;"; matched against the url span to cut the url at the first escaped angle bracket.
// Its null-ness also serves as the "statics not yet initialized" flag.
// NOTE(review): the null check happens before the synchronized block, the field is not volatile, and it is assigned before invalid_text_chars_trie is populated -- confirm this is safe if instances are ever constructed on more than one thread.
private static Btrie_slim_mgr angle_entities_trie;
// Trie of bytes excluded from link text: "]" plus x00-x08 and x0a-x1F; each value is the key length.
private static Btrie_slim_mgr invalid_text_chars_trie;
// Registers one trie entry: the key is the byte array built from the given ints,
// and the value is an Int_obj_val holding the number of ints passed (i.e. the key length).
private static void New__trie_itm__by_len(Btrie_slim_mgr mgr, int... ary) {
	byte[] key = Bry_.New_by_ints(ary);
	int key_len = ary.length;
	mgr.Add_obj(key, new Int_obj_val(key_len));
}

View File

@ -39,6 +39,20 @@ public class Xomw_lnke_wkr__tst {
, "g"
));
}
// Protocol-relative link: a url starting with "//" is accepted and emitted unchanged in href.
@Test public void Protocol_rel() {
	String wikitext = "[//a.org b]";
	String expd = "<a rel='nofollow' class='external text' href='//a.org'>b</a>";
	fxt.Test__parse(wikitext, expd);
}
// An escaped angle bracket ("&lt;" or "&gt;") ends the url; the entity and everything after it
// are expected to show up at the start of the link text instead of in href.
@Test public void Url_should_not_has_angle_entities() {
	String wikitext_lt = "[https://a.org/b&lt;c z]";
	String expd_lt = "<a rel='nofollow' class='external text' href='https://a.org/b'>&lt;c z</a>";
	fxt.Test__parse(wikitext_lt, expd_lt);

	String wikitext_gt = "[https://a.org/b&gt;c z]";
	String expd_gt = "<a rel='nofollow' class='external text' href='https://a.org/b'>&gt;c z</a>";
	fxt.Test__parse(wikitext_gt, expd_gt);
}
// checks for noop via "Have link text": characters following "]" are expected to be
// emitted verbatim after the closing </a>, whether or not they are all letters.
@Test public void Link_trail() {
	String expd_anchor = "<a rel='nofollow' class='external text' href='https://a.org'>b</a>";
	String wikitext_letters = "[https://a.org b]xyz";
	fxt.Test__parse(wikitext_letters, expd_anchor + "xyz");

	String wikitext_mixed = "[https://a.org b]x!z";
	fxt.Test__parse(wikitext_mixed, expd_anchor + "x!z");
}
// The url is passed through the sanitizer before rendering: "&quot;" is expected to come out as "%22" in href.
// NOTE(review): the input literal appears to carry an invisible character (possibly a soft hyphen)
// between "&quot;" and "b" that is absent from the expected href -- confirm before editing this literal.
@Test public void Clean_url() {
	String wikitext = "[https://a&quot;­b c]";
	String expd = "<a rel='nofollow' class='external text' href='https://a%22b'>c</a>";
	fxt.Test__parse(wikitext, expd);
}
}
class Xomw_lnke_wkr__fxt {
private final Xomw_lnke_wkr wkr = new Xomw_lnke_wkr(new Xomw_parser());