@ -17,169 +17,54 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
* /
* /
package gplx.xowa.mws.parsers.magiclinks ; import gplx.* ; import gplx.xowa.* ; import gplx.xowa.mws.* ; import gplx.xowa.mws.parsers.* ;
package gplx.xowa.mws.parsers.magiclinks ; import gplx.* ; import gplx.xowa.* ; import gplx.xowa.mws.* ; import gplx.xowa.mws.parsers.* ;
import gplx.core.primitives.* ; import gplx.core.btries.* ; import gplx.core.net.* ;
import gplx.core.primitives.* ; import gplx.core.btries.* ; import gplx.core.net.* ;
import gplx.langs.phps.utls.* ;
import gplx.langs.phps.utls.* ; import gplx.xowa.mws.htmls.* ;
// public class Xomw_magiclinks_wkr {
import gplx.langs.regxs.* ;
// private final Btrie_slim_mgr regex_trie = Btrie_slim_mgr.ci_a7(); // NOTE: must be ci to handle protocols; EX: "https:" and "HTTPS:"
public class Xomw_magiclinks_wkr {
// private final Btrie_rv trv = new Btrie_rv();
private final Btrie_slim_mgr regex_trie = Btrie_slim_mgr . ci_a7 ( ) ; // NOTE: must be ci to handle protocols; EX: "https:" and "HTTPS:"
// public Xomw_magiclinks_wkr() {
private final Btrie_rv trv = new Btrie_rv ( ) ;
// }
private static byte [ ] Tag__anch__rhs ;
// private static byte[] Tag__anch__rhs, Prefix__rfc, Prefix__pmid;
private Xomw_regex_boundary regex_boundary ;
//
private Xomw_regex_url regex_url ;
// private static final byte Space__tab = 1, Space__nbsp_ent = 2, Space__nbsp_dec = 3, Space__nbsp_hex = 4;
private Xomw_linker linker ;
// private static Btrie_slim_mgr space_trie;
private byte [ ] page_title ;
// // static final SPACE_NOT_NL = '(?:\t| |&\#0*160;|&\#[Xx]0*[Aa]0;|\p{Zs})';
//// public void Test() {
//// regex.Add("\t", Space__tab);
//// regex.Add(" ", Space__nbsp__ent);
//// regex.Add(Regex.Make("&#").Star("0").Add("160;"), Space__nbsp__dec);
//// regex.Add(Regex.Make("&#").Brack("X", "x").Star("0").Brack("A", "a").Add("0"), Space__nbsp__hex);
//// }
// public int Find_fwd_space(byte[] src, int cur, int src_end) {
// return -1;
// }
//
// private static final byte Regex__anch = 1, Regex__elem = 2, Regex__free = 3, Regex__rfc = 5, Regex__isbn = 6, Regex__pmid = 7;
// public void Init_by_wiki() {
// regex_trie.Add_str_byte("<a", Regex__anch);
// regex_trie.Add_str_byte("<" , Regex__elem);
//
// Gfo_protocol_itm[] protocol_ary = Gfo_protocol_itm.Ary();
// int protocol_len = protocol_ary.length;
// for (int i = 0; i < protocol_len; i++) {
// Gfo_protocol_itm itm = protocol_ary[i];
// regex_trie.Add_bry_byte(itm.Key_w_colon_bry(), Regex__free);
// }
// regex_trie.Add_str_byte("RFC " , Regex__rfc);
// regex_trie.Add_str_byte("PMID " , Regex__rfc);
// regex_trie.Add_str_byte("ISBN ", Regex__rfc);
//
// if (Tag__anch__rhs == null) {
// synchronized (Type_adp_.ClassOf_obj(this)) {
// Tag__anch__rhs = Bry_.new_a7("</a>");
// Prefix__rfc = Bry_.new_a7("RFC");
// Prefix__pmid = Bry_.new_a7("PMID");
// space_trie = Btrie_slim_mgr.ci_a7()
// .Add_str_byte("\t", Space__tab)
// .Add_str_byte(" ", Space__nbsp_ent)
// .Add_str_byte("&#", Space__nbsp_dec)
// .Add_str_byte("&x", Space__nbsp_hex)
// ;
// }
// }
// }
//
// // Replace special strings like "ISBN xxx" and "RFC xxx" with
// // magic external links.
// public void Do_magic_links(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) {
// // XO.PBFR
// Bry_bfr src_bfr = pbfr.Src();
// byte[] src = src_bfr.Bfr();
// int src_bgn = 0;
// int src_end = src_bfr.Len();
// Bry_bfr bfr = pbfr.Trg();
//
// int cur = src_bgn;
// int prv = cur;
// boolean dirty = true;
// while (true) {
// if (cur == src_end) {
// if (dirty)
// bfr.Add_mid(src, prv, src_end);
// break;
// }
//
// byte b = src[cur];
// Object o = regex_trie.Match_at_w_b0(trv, b, src, cur, src_end);
// // current byte doesn't look like magiclink; continue;
// if (o == null) {
// cur++;
// continue;
// }
// // looks like magiclink; do additional processing
// byte regex_tid = ((Byte_obj_ref)o).Val();
// int trv_pos = trv.Pos();
// int nxt_pos = trv_pos;
// boolean regex_valid = true;
// switch (regex_tid) {
// case Regex__anch: // (<a[ \t\r\n>].*?</a>) | // m[1]: Skip link text
// if (trv_pos < src_end) {
// // find ws in "[ \t\r\n>]"
// byte ws_byte = src[cur];
// switch (ws_byte) {
// case Byte_ascii.Space:
// case Byte_ascii.Tab:
// case Byte_ascii.Cr:
// case Byte_ascii.Nl:
// break;
// default:
// regex_valid = false;
// break;
// }
// if (regex_valid) {
// // find </a>
// nxt_pos++;
// int anch_end = Bry_find_.Find_fwd(src, Tag__anch__rhs, nxt_pos, src_end);
// if (anch_end == Bry_find_.Not_found) {
// regex_valid = false;
// }
// else {
// cur = anch_end + Tag__anch__rhs.length;
// }
// }
// }
// else {
// regex_valid = false;
// }
// break;
// case Regex__elem: // (<.*?>) | // m[2]: Skip stuff inside
// // just find ">"
// int elem_end = Bry_find_.Find_fwd(src, Byte_ascii.Angle_end, nxt_pos, src_end);
// if (elem_end == Bry_find_.Not_found)
// regex_valid = false;
// else
// cur = elem_end + 1;
// break;
// case Regex__free:
// // addr; urlchar
// break;
// case Regex__rfc:
// case Regex__pmid:
// // byte[] prefix = regex == Regex__rfc ? Prefix__rfc : Prefix__pmid;
// // match previous for case sensitivity
//// if (Bry_.Eq(src, trv_pos - prefix.length - 1, trv_pos - 1, prefix)) {
////
//// }
//// else {
//// regex_valid = false;
//// }
// break;
// }
//
//// '!(?: // Start cases
//// (<a[ \t\r\n>].*?</a>) | // m[1]: Skip link text
//// (<.*?>) | // m[2]: Skip stuff inside
//// // HTML elements' . "
//// (\b(?i:$prots)($addr$urlChar*)) | // m[3]: Free external links
//// // m[4]: Post-protocol path
//// \b(?:RFC|PMID) $spaces // m[5]: RFC or PMID, capture number
//// ([0-9]+)\b |
//// \bISBN $spaces ( // m[6]: ISBN, capture number
//// (?: 97[89] $spdash?)? // optional 13-digit ISBN prefix
//// (?: [0-9] $spdash?){9} // 9 digits with opt. delimiters
//// [0-9Xx] // check digit
//// )\b
//
// }
// if (dirty)
// pbfr.Switch();
// $prots = wfUrlProtocolsWithoutProtRel();
private static final byte Regex__anch = 1 , Regex__elem = 2 , Regex__free = 3 ;
// $urlChar = self::EXT_LINK_URL_CLASS;
public void Init_by_wiki ( Xomw_linker linker , Xomw_regex_boundary regex_boundary , Xomw_regex_url regex_url ) {
// $addr = self::EXT_LINK_ADDR;
this . linker = linker ;
// $space = self::SPACE_NOT_NL; // non-newline space
this . regex_boundary = regex_boundary ;
// $spdash = "(?:-|$space)"; // a dash or a non-newline space
this . regex_url = regex_url ;
// $spaces = "$space++"; // possessive match of 1 or more spaces
regex_trie . Add_str_byte ( "<a" , Regex__anch ) ;
// $text = preg_replace_callback(
regex_trie . Add_str_byte ( "<" , Regex__elem ) ;
Gfo_protocol_itm [ ] protocol_ary = Gfo_protocol_itm . Ary ( ) ;
int protocol_len = protocol_ary . length ;
for ( int i = 0 ; i < protocol_len ; i + + ) {
Gfo_protocol_itm itm = protocol_ary [ i ] ;
regex_trie . Add_bry_byte ( itm . Text_bry ( ) , Regex__free ) ;
}
if ( Tag__anch__rhs = = null ) {
synchronized ( Type_adp_ . ClassOf_obj ( this ) ) {
Tag__anch__rhs = Bry_ . new_a7 ( "</a>" ) ;
}
}
}
// Replace special strings like "ISBN xxx" and "RFC xxx" with
// magic external links.
public void Do_magic_links ( Xomw_parser_ctx pctx , Xomw_parser_bfr pbfr ) {
// XO.PBFR
Bry_bfr src_bfr = pbfr . Src ( ) ;
byte [ ] src = src_bfr . Bfr ( ) ;
int src_bgn = 0 ;
int src_end = src_bfr . Len ( ) ;
Bry_bfr bfr = pbfr . Trg ( ) ;
int cur = src_bgn ;
int prv = cur ;
boolean dirty = true ;
// PORTED.REGEX: handle below
// NOTE: not handling RFC|PMID|ISBN b/c of upcoming obsolescence: https://www.mediawiki.org/wiki/Requests_for_comment/Future_of_magic_links
//'!(?: // Start cases
//'!(?: // Start cases
// (<a[ \t\r\n>].*?</a>) | // m[1]: Skip link text
// (<a[ \t\r\n>].*?</a>) | // m[1]: Skip link text
// (<.*?>) | // m[2]: Skip stuff inside
// (<.*?>) | // m[2]: Skip stuff inside
@ -193,80 +78,108 @@ import gplx.langs.phps.utls.*;
// (?: [0-9] $spdash?){9} // 9 digits with opt. delimiters
// (?: [0-9] $spdash?){9} // 9 digits with opt. delimiters
// [0-9Xx] // check digit
// [0-9Xx] // check digit
// )\b
// )\b
// )!xu", [ &$this, 'magicLinkCallback' ], $text);
while ( true ) {
// return $text;
if ( cur = = src_end ) {
// }
if ( dirty )
bfr . Add_mid ( src , prv , src_end ) ;
break ;
}
// public function magicLinkCallback($m) {
byte b = src [ cur ] ;
// if (isset($m[1]) && $m[1] !== '') {
Object o = regex_trie . Match_at_w_b0 ( trv , b , src , cur , src_end ) ;
// // Skip anchor
// current byte doesn't look like magiclink; continue;
// return $m[0];
if ( o = = null ) {
// } else if (isset($m[2]) && $m[2] !== '') {
cur + + ;
// // Skip HTML element
continue ;
// return $m[0];
}
// } else if (isset($m[3]) && $m[3] !== '') {
// // Free external link
// looks like magiclink; do additional processing
// return $this->makeFreeExternalLink($m[0], strlen($m[4]));
byte regex_tid = ( ( Byte_obj_val ) o ) . Val ( ) ;
// } else if (isset($m[5]) && $m[5] !== '') {
int old_pos = cur ;
// // RFC or PMID
int trv_pos = trv . Pos ( ) ;
// if (substr($m[0], 0, 3) === 'RFC') {
int nxt_pos = trv_pos ;
// if (!$this->mOptions->getMagicRFCLinks()) {
boolean regex_valid = true ;
// return $m[0];
switch ( regex_tid ) {
// }
case Regex__anch : // (<a[ \t\r\n>].*?</a>) | // m[1]: Skip link text
// $keyword = 'RFC';
if ( trv_pos < src_end ) {
// $urlmsg = 'rfcurl';
// find ws in "[ \t\r\n>]"
// $cssClass = 'mw-magiclink-rfc';
byte ws_byte = src [ cur ] ;
// $trackingCat = 'magiclink-tracking-rfc';
switch ( ws_byte ) {
// $id = $m[5];
case Byte_ascii . Space :
// } else if (substr($m[0], 0, 4) === 'PMID') {
case Byte_ascii . Tab :
// if (!$this->mOptions->getMagicPMIDLinks()) {
case Byte_ascii . Cr :
// return $m[0];
case Byte_ascii . Nl :
// }
break ;
// $keyword = 'PMID';
default :
// $urlmsg = 'pubmedurl';
regex_valid = false ;
// $cssClass = 'mw-magiclink-pmid';
break ;
// $trackingCat = 'magiclink-tracking-pmid';
}
// $id = $m[5];
if ( regex_valid ) {
// } else {
// find </a>
// throw new MWException(__METHOD__ . ': unrecognised match type "' .
nxt_pos + + ;
// substr($m[0], 0, 20) . '"');
int anch_end = Bry_find_ . Find_fwd ( src , Tag__anch__rhs , nxt_pos , src_end ) ;
// }
if ( anch_end = = Bry_find_ . Not_found ) {
// $url = wfMessage($urlmsg, $id)->inContentLanguage()->text();
regex_valid = false ;
// $this->addTrackingCategory($trackingCat);
}
// return Linker::makeExternalLink($url, "{$keyword} {$id}", true, $cssClass, [], $this->mTitle);
else {
// } else if (isset($m[6]) && $m[6] !== ''
cur = anch_end + Tag__anch__rhs . length ;
// && $this->mOptions->getMagicISBNLinks()
}
// ) {
}
// // ISBN
}
// $isbn = $m[6];
else {
// $space = self::SPACE_NOT_NL; // non-newline space
regex_valid = false ;
// $isbn = preg_replace("/$space/", ' ', $isbn);
}
// $num = strtr($isbn, [
break ;
// '-' => '',
case Regex__elem : // (<.*?>) | // m[2]: Skip stuff inside
// ' ' => '',
// just find ">"
// 'x' => 'X',
int elem_end = Bry_find_ . Find_fwd ( src , Byte_ascii . Angle_end , nxt_pos , src_end ) ;
// ]);
if ( elem_end = = Bry_find_ . Not_found )
// $this->addTrackingCategory('magiclink-tracking-isbn');
regex_valid = false ;
// return $this->getLinkRenderer()->makeKnownLink(
else
// SpecialPage::getTitleFor('Booksources', $num),
cur = elem_end + 1 ;
// "ISBN $isbn",
break ;
// [
case Regex__free :
// 'class' => '@gplx.Internal protected mw-magiclink-isbn',
if ( regex_boundary . Is_boundary_prv ( src , cur ) ) {
// 'title' => false // suppress title attribute
int url_end = regex_url . Find_fwd_while ( trv , src , nxt_pos , src_end ) ;
// ]
if ( url_end = = nxt_pos ) {
// );
regex_valid = false ;
// } else {
}
// return $m[0];
else
// }
cur = url_end ;
}
else
regex_valid = false ;
break ;
}
if ( ! regex_valid ) {
cur + + ;
}
else {
if ( regex_tid = = Regex__free ) {
this . page_title = pctx . Page_title ( ) . Full_db ( ) ;
dirty = true ;
bfr . Add_mid ( src , prv , old_pos ) ;
this . Make_free_external_link ( bfr , Bry_ . Mid ( src , old_pos , cur ) , 0 ) ;
prv = cur ;
}
else {
}
}
}
if ( dirty ) {
pbfr . Switch ( ) ;
}
}
// Make a free external link, given a user-supplied URL
// Make a free external link, given a user-supplied URL
// public void Make_free_external_link(byte[] url, int num_post_proto) {
public void Make_free_external_link (Bry_bfr bfr , byte [ ] url , int num_post_proto ) {
// byte[] trail = Bry_.Empty;
// byte[] trail = Bry_.Empty;
// The characters '<' and '>' (which were escaped by
// The characters '<' and '>' (which were escaped by
// removeHTMLtags()) should not be included in
// removeHTMLtags()) should not be included in
// URLs, per RFC 2396.
// URLs, per RFC 2396.
// Make &nbsp; terminate a URL as well (bug T84937)
// Make &nbsp; terminate a URL as well (bug T84937)
// $m2 = [];
// $m2 = [];
// if (preg_match(
// if (preg_match(
// '/&(lt|gt|nbsp|#x0*(3[CcEe]|[Aa]0)|#0*(60|62|160));/',
// '/&(lt|gt|nbsp|#x0*(3[CcEe]|[Aa]0)|#0*(60|62|160));/',
@ -290,6 +203,7 @@ import gplx.langs.phps.utls.*;
// Don't break a trailing HTML entity by moving the ; into $trail
// Don't break a trailing HTML entity by moving the ; into $trail
// This is in hot code, so use substr_compare to avoid having to
// This is in hot code, so use substr_compare to avoid having to
// create a new String Object for the comparison
// create a new String Object for the comparison
// if ($numSepChars && substr_compare($url, ";", -$numSepChars, 1) === 0) {
// if ($numSepChars && substr_compare($url, ";", -$numSepChars, 1) === 0) {
// more optimization: instead of running preg_match with a $
// more optimization: instead of running preg_match with a $
// anchor, which can be slow, do the match on the reversed
// anchor, which can be slow, do the match on the reversed
@ -313,19 +227,19 @@ import gplx.langs.phps.utls.*;
// $url = Sanitizer::cleanUrl($url);
// $url = Sanitizer::cleanUrl($url);
// Is this an external image?
// Is this an external image?
// $text = $this->maybeMakeExternalImage($url);
byte [ ] text = null ; // $this->maybeMakeExternalImage($url);
// if ($text === false) {
if ( text = = null ) {
// Not an image, make a link
// Not an image, make a link
// $text = Linker::makeExternalLink($url,
linker . Make_external_link ( bfr , url
// $this->getConverterLanguage()->markNoConversion($url, true),
, url // $this->getConverterLanguage()->markNoConversion($url, true),
// true, 'free',
, true , Bry_ . new_a7 ( "free" )
// $this->getExternalLinkAttribs($url), $this->mTitle);
, new Xomwh_atr_mgr ( ) // $this->getExternalLinkAttribs($url)
, page_title ) ;
// Register it in the output Object...
// Register it in the output Object...
// Replace unnecessary URL escape codes with their equivalent characters
// Replace unnecessary URL escape codes with their equivalent characters
// $pasteurized = self::normalizeLinkUrl($url);
// $pasteurized = self::normalizeLinkUrl($url);
// $this->mOutput->addExternalLink($pasteurized);
// $this->mOutput->addExternalLink($pasteurized);
// }
}
// return $text . $trail;
// return $text . $trail;
// }
}
// }
}
// }