Mw_parse: Add basic implementation for magiclinks

pull/620/head
gnosygnu 8 years ago
parent 7bd176f51f
commit aa1f1ec801

@ -40,6 +40,7 @@ public class Regx_adp {
return (Regx_match[])rv.To_ary(Regx_match.class);
}
private Pattern under;
public Pattern Under() {return under;}
void Under_sync() {
try {under = Pattern.compile(pattern, Pattern.DOTALL | Pattern.UNICODE_CHARACTER_CLASS);} // JRE.7:UNICODE_CHARACTER_CLASS; added during %w fix for en.w:A#; DATE:2015-06-10
catch (Exception e) { // NOTE: if invalid, then default to empty pattern (which should return nothing); EX:d:〆る generates [^]; DATE:2013-10-20

@ -17,10 +17,21 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.langs.regxs; import gplx.*; import gplx.langs.*;
/**
 * One capture group of a regex match: a success flag, the bgn / end positions and the matched text.
 * Instances can be re-populated through {@link #Init} so a single object may be reused across matches.
 */
public class Regx_group {
	/** Creates a group with all four values set. */
	public Regx_group(boolean rslt, int bgn, int end, String val) {
		this.rslt = rslt;
		this.bgn = bgn;
		this.end = end;
		this.val = val;
	}
	public boolean Rslt() {return rslt;} private boolean rslt;
	public int Bgn() {return bgn;} private int bgn;
	public int End() {return end;} private int end;
	public String Val() {return val;} private String val;
	/** Overwrites all four values in place; lets callers recycle one instance instead of allocating. */
	public void Init(boolean rslt, int bgn, int end, String val) {
		this.rslt = rslt;
		this.bgn = bgn;
		this.end = end;
		this.val = val;
	}
	/** Shared zero-length array for "no groups". */
	public static final Regx_group[] Ary_empty = new Regx_group[0];
}

@ -24,5 +24,5 @@ public class Regx_match {
public int Find_end() {return find_end;} int find_end;
public int Find_len() {return find_end - find_bgn;}
public Regx_group[] Groups() {return groups;} Regx_group[] groups = Regx_group.Ary_empty;
public static final Regx_match[] Ary_empty = new Regx_match[0];
public static final Regx_match[] Ary_empty = new Regx_match[0];
}

@ -0,0 +1,46 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.langs.regxs; import gplx.*; import gplx.langs.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Cursor over successive matches of a Regx_adp pattern inside one String.
 * Call Init once, then Match_next repeatedly; after each successful call Find_bgn / Find_end
 * and the Groups__ accessors describe the current match.
 */
public class Regx_rslt {// THREAD.UNSAFE
	private int src_pos;	// position the next find starts from
	private final Regx_group tmp_grp = new Regx_group(false, -1, -1, null);
	public Matcher match;
	public int Groups__len() {return match.groupCount() + 1;} // +1 to include group=0 which is entire pattern
	/** NOTE: returns the same reused instance on every call (val is always null); read Bgn / End before calling again. */
	public Regx_group Groups__get_at(int i) {
		tmp_grp.Init(true, match.start(i), match.end(i), null);
		return tmp_grp;
	}
	/** Binds this cursor to src and positions it at src_bgn. */
	public void Init(Regx_adp regex, String src, int src_bgn) {
		match = regex.Under().matcher(src);
		this.src_pos = src_bgn;
	}
	/** Advances to the next match at or after the current position; returns false when there are no more. */
	public boolean Match_next() {
		// Matcher.find(int) throws if the start is past the end of input; that position is reached
		// after an empty match at the very end (see below), so report "no more matches" instead
		if (src_pos > match.regionEnd()) {
			this.found = false;
			return false;
		}
		this.found = match.find(src_pos);
		if (found) {
			this.find_bgn = match.start();
			this.find_end = match.end();
			// a zero-length match must still move the cursor forward, else the same empty match
			// would be returned forever by a "while (Match_next())" loop
			this.src_pos = find_end == find_bgn ? find_end + 1 : find_end;
		}
		return found;
	}
	public boolean Found() {return found;} private boolean found;
	public int Find_bgn() {return find_bgn;} private int find_bgn;
	public int Find_end() {return find_end;} private int find_end;
}

@ -19,7 +19,7 @@ package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xo
import gplx.core.btries.*; import gplx.core.net.*;
import gplx.xowa.mws.parsers.prepros.*; import gplx.xowa.mws.parsers.headings.*;
import gplx.xowa.mws.parsers.quotes.*; import gplx.xowa.mws.parsers.tables.*; import gplx.xowa.mws.parsers.hrs.*; import gplx.xowa.mws.parsers.nbsps.*;
import gplx.xowa.mws.parsers.lnkes.*; import gplx.xowa.mws.parsers.lnkis.*;
import gplx.xowa.mws.parsers.lnkes.*; import gplx.xowa.mws.parsers.lnkis.*; import gplx.xowa.mws.parsers.magiclinks.*;
import gplx.xowa.mws.utls.*; import gplx.xowa.mws.linkers.*;
public class Xomw_parser {
private final Xomw_parser_ctx pctx = new Xomw_parser_ctx();
@ -29,10 +29,13 @@ public class Xomw_parser {
private final Xomw_nbsp_wkr nbsp_wkr = new Xomw_nbsp_wkr();
private final Xomw_block_level_pass block_wkr = new Xomw_block_level_pass();
private final Xomw_heading_wkr heading_wkr = new Xomw_heading_wkr();
private final Xomw_magiclinks_wkr magiclinks_wkr = new Xomw_magiclinks_wkr();
private final Xomw_link_renderer link_renderer = new Xomw_link_renderer();
private final Xomw_link_holders holders;
private final Xomw_heading_cbk__html heading_wkr_cbk;
private final Btrie_slim_mgr protocols_trie;
private static Xomw_regex_space regex_space;
private static Xomw_regex_url regex_url;
private final Btrie_rv trv = new Btrie_rv();
private int marker_index = 0;
// private final Xomw_prepro_wkr prepro_wkr = new Xomw_prepro_wkr();
@ -51,10 +54,16 @@ public class Xomw_parser {
this.lnke_wkr = new Xomw_lnke_wkr(this);
this.lnki_wkr = new Xomw_lnki_wkr(this, holders, link_renderer, protocols_trie);
this.heading_wkr_cbk = new Xomw_heading_cbk__html();
if (regex_space == null) {
synchronized (Type_adp_.ClassOf_obj(this)) {
regex_space = new Xomw_regex_space();
regex_url = new Xomw_regex_url(regex_space);
}
}
}
// Wires the per-wiki data into the workers owned by this parser.
// NOTE(review): magiclinks_wkr is not initialized here; confirm it is initialized elsewhere before Do_magic_links runs
public void Init_by_wiki(Xowe_wiki wiki) {
	linker.Init_by_wiki(wiki.Lang().Lnki_trail_mgr().Trie());
	// lnke_wkr takes the shared url / space matchers; the old 1-arg call no longer matches its signature
	lnke_wkr.Init_by_wiki(protocols_trie, regex_url, regex_space);
	lnki_wkr.Init_by_wiki(wiki);
}
public void Internal_parse(Xomw_parser_bfr pbfr, byte[] text) {
@ -107,8 +116,8 @@ public class Xomw_parser {
// replaceInternalLinks may sometimes leave behind
// absolute URLs, which have to be masked to hide them from replaceExternalLinks
Xomw_parser_bfr_.Replace(pbfr, Bry__marker__noparse, Bry_.Empty);
magiclinks_wkr.Do_magic_links(pctx, pbfr);
// $text = $this->doMagicLinks($text);
// $text = $this->formatHeadings($text, $origText, $isMain);
}

@ -21,37 +21,9 @@ public class Xomw_parser__tst {
private final Xomw_parser__fxt fxt = new Xomw_parser__fxt();
@Test public void Basic() {
fxt.Test__parse(String_.Concat_lines_nl_skip_last
( "== heading_1 =="
, "para_1"
, "== heading_2 =="
, "para_2"
, "-----"
, "{|"
, "|-"
, "|a"
, "|}"
, "''italics''"
, "[https://a.org b]"
, "[[A|abc]]"
, "a »b« &#160;!important c"
("a https://c.org b"
), String_.Concat_lines_nl_skip_last
( "<h2> heading_1 </h2>"
, "<p>para_1"
, "</p>"
, "<h2> heading_2 </h2>"
, "<p>para_2"
, "</p>"
, "<hr />"
, "<table>"
, ""
, "<tr>"
, "<td>a"
, "</td></tr></table>"
, "<p><i>italics</i>"
, "<a class=\"external text\" rel=\"nofollow\" href=\"https://a.org\">b</a>"
, "<a href=\"/wiki/A\" title=\"A\">abc</a>"
, "a&#160;»b«&#160; !important c"
, "</p>"
( ""
));
}
}

@ -0,0 +1,45 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
import gplx.core.btries.*;
/**
 * Trie-based stand-ins for simple regex character-class scans ("[...]*" and "[^...]*") over UTF-8 bytes.
 * The trie holds one entry per character of the class; the stored value is only a marker.
 */
public class Xomw_regex_ {
	/**
	 * Scans forward from src_bgn while the character at the cursor IS in the trie.
	 * @return position of the first non-matching character, or src_end if the run reaches the end
	 */
	public static int Find_fwd_while(Btrie_slim_mgr trie, Btrie_rv trv, byte[] src, int src_bgn, int src_end) {
		return Find_fwd(trie, trv, src, src_bgn, src_end, true);
	}
	/**
	 * Scans forward from src_bgn while the character at the cursor is NOT in the trie.
	 * @return position of the first matching character, or src_end if none is found
	 */
	public static int Find_fwd_until(Btrie_slim_mgr trie, Btrie_rv trv, byte[] src, int src_bgn, int src_end) {
		return Find_fwd(trie, trv, src, src_bgn, src_end, false);
	}
	// shared scan loop; advance_on_match selects "while" (true) vs "until" (false)
	private static int Find_fwd(Btrie_slim_mgr trie, Btrie_rv trv, byte[] src, int src_bgn, int src_end, boolean advance_on_match) {
		int cur = src_bgn;
		// stop at src_end: src may be a buffer's backing array, so bytes at / after src_end are not valid input
		while (cur < src_end) {
			byte b = src[cur];
			boolean matched = trie.Match_at_w_b0(trv, b, src, cur, src_end) != null;
			if (matched != advance_on_match) break;
			// step by whole UTF-8 char; clamp in case a truncated multi-byte char would overshoot src_end
			cur = Math.min(cur + gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(b), src_end);
		}
		return cur;
	}
}

@ -0,0 +1,39 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
import gplx.core.btries.*;
/**
 * Naive replacement for a regex word-boundary check before a position:
 * the previous character counts as a boundary only if it is one of the Ws / Zs characters.
 */
public class Xomw_regex_boundary { // THREAD.SAFE: trv is only for consistent interface
	private final Btrie_slim_mgr trie = Btrie_slim_mgr.cs();
	private final Btrie_rv trv = new Btrie_rv();
	public Xomw_regex_boundary(Xomw_regex_space space) {
		// naive implementation of is_boundary: register every Ws char, then every Zs char; val is just a marker
		// NOTE(review): the original comment also mentioned underscore, but no underscore entry is registered
		Add_all(space.Ws());
		Add_all(space.Zs());
	}
	// registers each byte sequence in ary as a marker entry
	private void Add_all(byte[][] ary) {
		int len = ary.length;
		for (int i = 0; i < len; i++)
			trie.Add_bry_byte(ary[i], Byte_.Zero);
	}
	/** Returns true if pos is at the start of src, or if the character ending at pos is registered in the trie. */
	public boolean Is_boundary_prv(byte[] src, int pos) {
		if (pos == 0) return true; // BOS is true
		// back up to the first byte of the character that ends just before pos
		int prv_bgn = gplx.core.intls.Utf8_.Get_pos0_of_char_bwd(src, pos - 1);
		return trie.Match_at_w_b0(trv, src[prv_bgn], src, prv_bgn, pos) != null;
	}
}

@ -0,0 +1,64 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
import gplx.core.btries.*;
/**
 * Holds the UTF-8 byte sequences for the ASCII whitespace set (Ws) and the Unicode
 * space-separator category (Zs), plus a trie over both for forward scanning.
 */
public class Xomw_regex_space {
	private final Btrie_slim_mgr trie = Btrie_slim_mgr.cs();
	public Xomw_regex_space() {
		byte[] space = Bry_.New_by_ints(32);
		ws = new byte[][]
		{ space                                 // U+0020 space
		, Bry_.New_by_ints(9)                   // \t
		, Bry_.New_by_ints(10)                  // \n
		, Bry_.New_by_ints(13)                  // \r
		};

		// Zs; REF:http://www.fileformat.info/info/unicode/category/Zs/list.htm
		zs = new byte[][]
		{ space                                 // U+0020
		, Bry_.New_by_ints(194, 160)            // U+00A0
		, Bry_.New_by_ints(225, 154, 128)       // U+1680
		, Bry_.New_by_ints(226, 128, 128)       // U+2000; was missing from the table
		, Bry_.New_by_ints(226, 128, 129)       // U+2001
		, Bry_.New_by_ints(226, 128, 130)       // U+2002
		, Bry_.New_by_ints(226, 128, 131)       // U+2003
		, Bry_.New_by_ints(226, 128, 132)       // U+2004
		, Bry_.New_by_ints(226, 128, 133)       // U+2005
		, Bry_.New_by_ints(226, 128, 134)       // U+2006
		, Bry_.New_by_ints(226, 128, 135)       // U+2007
		, Bry_.New_by_ints(226, 128, 136)       // U+2008
		, Bry_.New_by_ints(226, 128, 137)       // U+2009
		, Bry_.New_by_ints(226, 128, 138)       // U+200A
		, Bry_.New_by_ints(226, 128, 175)       // U+202F
		, Bry_.New_by_ints(226, 129, 159)       // U+205F
		, Bry_.New_by_ints(227, 128, 128)       // U+3000
		};

		// the trie matches any Ws or Zs char; val is just a marker
		byte[][] ary = ws;
		for (byte[] bry : ary) {
			trie.Add_bry_byte(bry, Byte_.Zero);
		}
		ary = zs;
		for (byte[] bry : ary) {
			trie.Add_bry_byte(bry, Byte_.Zero);
		}
	}
	/** ASCII whitespace: space, tab, newline, carriage return. */
	public byte[][] Ws() {return ws;} private final byte[][] ws;
	/** Unicode Zs (space separator) characters as UTF-8 byte sequences. */
	public byte[][] Zs() {return zs;} private final byte[][] zs;
	/** Scans forward from src_bgn over Ws / Zs characters; delegates to Xomw_regex_.Find_fwd_while. */
	public int Find_fwd_while(Btrie_rv trv, byte[] src, int src_bgn, int src_end) {
		return Xomw_regex_.Find_fwd_while(trie, trv, src, src_bgn, src_end);
	}
}

@ -0,0 +1,39 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
import gplx.core.btries.*;
/**
 * Trie of the bytes / characters that may NOT appear in a url, used to find where a url run ends.
 */
public class Xomw_regex_url {
	private final Btrie_slim_mgr trie;
	public Xomw_regex_url(Xomw_regex_space regex_space) {
		// REGEX:[^][<>"\\x00-\\x20\\x7F\p{Zs}]; NOTE: val is just a marker
		Btrie_slim_mgr rv = Btrie_slim_mgr.cs();
		rv.Add_str_byte__many(Byte_.Zero, "[", "]", "<", ">", "\"");
		// x00-x20
		for (int code = 0; code <= 32; code++) {
			rv.Add_bry_byte(new byte[] {(byte)code}, Byte_.Zero);
		}
		rv.Add_bry_byte(Bry_.New_by_ints(127), Byte_.Zero); // x7F
		// \p{Zs}
		byte[][] zs_ary = regex_space.Zs();
		int zs_len = zs_ary.length;
		for (int i = 0; i < zs_len; i++) {
			rv.Add_bry_byte(zs_ary[i], Byte_.Zero);
		}
		this.trie = rv;
	}
	/** Scans forward over url chars: advances until a char registered in the trie is hit (delegates to Xomw_regex_.Find_fwd_until). */
	public int Find_fwd_while(Btrie_rv trv, byte[] src, int src_bgn, int src_end) {
		return Xomw_regex_.Find_fwd_until(trie, trv, src, src_bgn, src_end);
	}
}

@ -26,12 +26,16 @@ public class Xomw_lnke_wkr {// THREAD.UNSAFE: caching for repeated calls
private int autonumber;
private final Xomw_linker linker;
private final Xomwh_atr_mgr attribs = new Xomwh_atr_mgr();
private Xomw_regex_url regex_url;
private Xomw_regex_space regex_space;
public Xomw_lnke_wkr(Xomw_parser mgr) {
this.tmp = mgr.Tmp();
this.linker = mgr.Linker();
}
public void Init_by_wiki(Btrie_slim_mgr protocol_trie) {
public void Init_by_wiki(Btrie_slim_mgr protocol_trie, Xomw_regex_url regex_url, Xomw_regex_space regex_space) {
this.protocol_trie = protocol_trie;
this.regex_url = regex_url;
this.regex_space = regex_space;
}
public void Replace_external_links(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) {
// XO.PBFR
@ -101,14 +105,7 @@ public class Xomw_lnke_wkr {// THREAD.UNSAFE: caching for repeated calls
// check for one-or-more url chars; [^][<>"\\x00-\\x20\\x7F\p{Zs}]
int domain_bgn = cur;
while (true) {
byte b = src[cur];
Object url_char_byte = invalid_url_chars_trie.Match_at_w_b0(trv, b, src, cur, src_end);
if (url_char_byte == null)
cur += gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(b);
else
break;
}
cur = regex_url.Find_fwd_while(trv, src, domain_bgn, src_end);
if (cur - domain_bgn == 0) {
bfr.Add_mid(src, prv, cur);
prv = cur;
@ -116,14 +113,8 @@ public class Xomw_lnke_wkr {// THREAD.UNSAFE: caching for repeated calls
}
int url_end = cur;
// get ws (if any)
int ws_bgn = -1;
while (true) {
Object space_byte = space_chars_trie.Match_at(trv, src, cur, src_end);
if (space_byte == null) break;
if (ws_bgn == -1) ws_bgn = cur;
cur += ((Int_obj_val)space_byte).Val();
}
// skip ws
cur = regex_space.Find_fwd_while(trv, src, cur, src_end);
// get text (if any)
int text_bgn = -1, text_end = -1;
@ -244,27 +235,7 @@ public class Xomw_lnke_wkr {// THREAD.UNSAFE: caching for repeated calls
, Link_type__autonumber = Bry_.new_a7("autonumber")
;
private static final Btrie_slim_mgr
invalid_url_chars_trie = New__invalid_url_chars_trie()
, space_chars_trie = New__space_chars_trie()
, invalid_text_chars_trie = New__invalid_text_chars_trie()
;
private static Btrie_slim_mgr New__invalid_url_chars_trie() { // REGEX:[^][<>"\\x00-\\x20\\x7F\p{Zs}]; NOTE: val is just a marker
Btrie_slim_mgr rv = Btrie_slim_mgr.cs();
rv.Add_str_byte__many(Byte_.Zero, "[", "]", "<", ">", "\"");
for (byte i = 0; i < 33; i++) {
rv.Add_bry_byte(new byte[] {i}, Byte_.Zero);
}
rv.Add_bry_byte(Bry_.New_by_ints(127), Byte_.Zero); // x7F
rv.Add_bry_byte(Bry_.New_by_ints(227, 128, 128), Byte_.Zero); // \p{Zs} // e3 80 80; https://phabricator.wikimedia.org/T21052
return rv;
}
private static Btrie_slim_mgr New__space_chars_trie() { // REGEX:\p{Zs}; NOTE: val is key.length
Btrie_slim_mgr rv = Btrie_slim_mgr.cs();
New__trie_itm__by_len(rv, 32);
New__trie_itm__by_len(rv, 227, 128, 128); // \p{Zs} // e3 80 80; https://phabricator.wikimedia.org/T21052
return rv;
}
private static final Btrie_slim_mgr invalid_text_chars_trie = New__invalid_text_chars_trie();
private static Btrie_slim_mgr New__invalid_text_chars_trie() { // REGEX:([^\]\\x00-\\x08\\x0a-\\x1F]*?); NOTE: val is key.length
Btrie_slim_mgr rv = Btrie_slim_mgr.cs();
New__trie_itm__by_len(rv, Byte_ascii.Brack_end);

@ -45,7 +45,8 @@ class Xomw_lnke_wkr__fxt {
private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr();
private boolean apos = true;
// Builds the worker with default protocols and freshly-created url / space matchers.
public Xomw_lnke_wkr__fxt() {
	Xomw_regex_space regex_space = new Xomw_regex_space();
	// the url matcher is built from the same space instance that is passed to the worker
	wkr.Init_by_wiki(Xomw_parser.Protocols__dflt(), new Xomw_regex_url(regex_space), regex_space);
}
public void Test__parse(String src_str, String expd) {
byte[] src_bry = Bry_.new_u8(src_str);

@ -17,315 +17,229 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers.magiclinks; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
import gplx.core.primitives.*; import gplx.core.btries.*; import gplx.core.net.*;
import gplx.langs.phps.utls.*;
// public class Xomw_magiclinks_wkr {
// private final Btrie_slim_mgr regex_trie = Btrie_slim_mgr.ci_a7(); // NOTE: must be ci to handle protocols; EX: "https:" and "HTTPS:"
// private final Btrie_rv trv = new Btrie_rv();
// public Xomw_magiclinks_wkr() {
// }
// private static byte[] Tag__anch__rhs, Prefix__rfc, Prefix__pmid;
//
// private static final byte Space__tab = 1, Space__nbsp_ent = 2, Space__nbsp_dec = 3, Space__nbsp_hex = 4;
// private static Btrie_slim_mgr space_trie;
// // static final SPACE_NOT_NL = '(?:\t|&nbsp;|&\#0*160;|&\#[Xx]0*[Aa]0;|\p{Zs})';
//// public void Test() {
//// regex.Add("\t", Space__tab);
//// regex.Add("&nbsp;", Space__nbsp__ent);
//// regex.Add(Regex.Make("&#").Star("0").Add("160;"), Space__nbsp__dec);
//// regex.Add(Regex.Make("&#").Brack("X", "x").Star("0").Brack("A", "a").Add("0"), Space__nbsp__hex);
//// }
// public int Find_fwd_space(byte[] src, int cur, int src_end) {
// return -1;
// }
//
// private static final byte Regex__anch = 1, Regex__elem = 2, Regex__free = 3, Regex__rfc = 5, Regex__isbn = 6, Regex__pmid = 7;
// public void Init_by_wiki() {
// regex_trie.Add_str_byte("<a", Regex__anch);
// regex_trie.Add_str_byte("<" , Regex__elem);
//
// Gfo_protocol_itm[] protocol_ary = Gfo_protocol_itm.Ary();
// int protocol_len = protocol_ary.length;
// for (int i = 0; i < protocol_len; i++) {
// Gfo_protocol_itm itm = protocol_ary[i];
// regex_trie.Add_bry_byte(itm.Key_w_colon_bry(), Regex__free);
// }
// regex_trie.Add_str_byte("RFC " , Regex__rfc);
// regex_trie.Add_str_byte("PMID " , Regex__rfc);
// regex_trie.Add_str_byte("ISBN ", Regex__rfc);
//
// if (Tag__anch__rhs == null) {
// synchronized (Type_adp_.ClassOf_obj(this)) {
// Tag__anch__rhs = Bry_.new_a7("</a>");
// Prefix__rfc = Bry_.new_a7("RFC");
// Prefix__pmid = Bry_.new_a7("PMID");
// space_trie = Btrie_slim_mgr.ci_a7()
// .Add_str_byte("\t", Space__tab)
// .Add_str_byte("&nbsp;", Space__nbsp_ent)
// .Add_str_byte("&#", Space__nbsp_dec)
// .Add_str_byte("&x", Space__nbsp_hex)
// ;
// }
// }
// }
//
// // Replace special strings like "ISBN xxx" and "RFC xxx" with
// // magic external links.
// public void Do_magic_links(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) {
// // XO.PBFR
// Bry_bfr src_bfr = pbfr.Src();
// byte[] src = src_bfr.Bfr();
// int src_bgn = 0;
// int src_end = src_bfr.Len();
// Bry_bfr bfr = pbfr.Trg();
//
// int cur = src_bgn;
// int prv = cur;
// boolean dirty = true;
// while (true) {
// if (cur == src_end) {
// if (dirty)
// bfr.Add_mid(src, prv, src_end);
// break;
// }
//
// byte b = src[cur];
// Object o = regex_trie.Match_at_w_b0(trv, b, src, cur, src_end);
// // current byte doesn't look like magiclink; continue;
// if (o == null) {
// cur++;
// continue;
// }
// // looks like magiclink; do additional processing
// byte regex_tid = ((Byte_obj_ref)o).Val();
// int trv_pos = trv.Pos();
// int nxt_pos = trv_pos;
// boolean regex_valid = true;
// switch (regex_tid) {
// case Regex__anch: // (<a[ \t\r\n>].*?</a>) | // m[1]: Skip link text
// if (trv_pos < src_end) {
// // find ws in "[ \t\r\n>]"
// byte ws_byte = src[cur];
// switch (ws_byte) {
// case Byte_ascii.Space:
// case Byte_ascii.Tab:
// case Byte_ascii.Cr:
// case Byte_ascii.Nl:
// break;
// default:
// regex_valid = false;
// break;
// }
// if (regex_valid) {
// // find </a>
// nxt_pos++;
// int anch_end = Bry_find_.Find_fwd(src, Tag__anch__rhs, nxt_pos, src_end);
// if (anch_end == Bry_find_.Not_found) {
// regex_valid = false;
// }
// else {
// cur = anch_end + Tag__anch__rhs.length;
// }
// }
// }
// else {
// regex_valid = false;
// }
// break;
// case Regex__elem: // (<.*?>) | // m[2]: Skip stuff inside
// // just find ">"
// int elem_end = Bry_find_.Find_fwd(src, Byte_ascii.Angle_end, nxt_pos, src_end);
// if (elem_end == Bry_find_.Not_found)
// regex_valid = false;
// else
// cur = elem_end + 1;
// break;
// case Regex__free:
// // addr; urlchar
// break;
// case Regex__rfc:
// case Regex__pmid:
// // byte[] prefix = regex == Regex__rfc ? Prefix__rfc : Prefix__pmid;
// // match previous for case sensitivity
//// if (Bry_.Eq(src, trv_pos - prefix.length - 1, trv_pos - 1, prefix)) {
////
//// }
//// else {
//// regex_valid = false;
//// }
// break;
// }
//
//// '!(?: // Start cases
//// (<a[ \t\r\n>].*?</a>) | // m[1]: Skip link text
//// (<.*?>) | // m[2]: Skip stuff inside
//// // HTML elements' . "
//// (\b(?i:$prots)($addr$urlChar*)) | // m[3]: Free external links
//// // m[4]: Post-protocol path
//// \b(?:RFC|PMID) $spaces // m[5]: RFC or PMID, capture number
//// ([0-9]+)\b |
//// \bISBN $spaces ( // m[6]: ISBN, capture number
//// (?: 97[89] $spdash?)? // optional 13-digit ISBN prefix
//// (?: [0-9] $spdash?){9} // 9 digits with opt. delimiters
//// [0-9Xx] // check digit
//// )\b
//
// }
// if (dirty)
// pbfr.Switch();
import gplx.langs.phps.utls.*; import gplx.xowa.mws.htmls.*;
import gplx.langs.regxs.*;
public class Xomw_magiclinks_wkr {
private final Btrie_slim_mgr regex_trie = Btrie_slim_mgr.ci_a7(); // NOTE: must be ci to handle protocols; EX: "https:" and "HTTPS:"
private final Btrie_rv trv = new Btrie_rv();
private static byte[] Tag__anch__rhs;
private Xomw_regex_boundary regex_boundary;
private Xomw_regex_url regex_url;
private Xomw_linker linker;
private byte[] page_title;
// $prots = wfUrlProtocolsWithoutProtRel();
// $urlChar = self::EXT_LINK_URL_CLASS;
// $addr = self::EXT_LINK_ADDR;
// $space = self::SPACE_NOT_NL; // non-newline space
// $spdash = "(?:-|$space)"; // a dash or a non-newline space
// $spaces = "$space++"; // possessive match of 1 or more spaces
// $text = preg_replace_callback(
// '!(?: // Start cases
// (<a[ \t\r\n>].*?</a>) | // m[1]: Skip link text
// (<.*?>) | // m[2]: Skip stuff inside
// // HTML elements' . "
// (\b(?i:$prots)($addr$urlChar*)) | // m[3]: Free external links
// // m[4]: Post-protocol path
// \b(?:RFC|PMID) $spaces // m[5]: RFC or PMID, capture number
// ([0-9]+)\b |
// \bISBN $spaces ( // m[6]: ISBN, capture number
// (?: 97[89] $spdash?)? // optional 13-digit ISBN prefix
// (?: [0-9] $spdash?){9} // 9 digits with opt. delimiters
// [0-9Xx] // check digit
// )\b
// )!xu", [ &$this, 'magicLinkCallback' ], $text);
// return $text;
// }
private static final byte Regex__anch = 1, Regex__elem = 2, Regex__free = 3;
// Stores the collaborators used by Do_magic_links and registers the trigger strings in regex_trie:
// "<a" (anchor), "<" (any other element) and one entry per protocol (free external link).
// NOTE(review): every call adds the same entries to regex_trie again; presumably meant to be called once per instance - confirm
public void Init_by_wiki(Xomw_linker linker, Xomw_regex_boundary regex_boundary, Xomw_regex_url regex_url) {
this.linker = linker;
this.regex_boundary = regex_boundary;
this.regex_url = regex_url;
// "<a" and "<" both start with "<"; Do_magic_links dispatches on the value stored for the matched key
regex_trie.Add_str_byte("<a", Regex__anch);
regex_trie.Add_str_byte("<" , Regex__elem);
// REF.MW: PHP source of magicLinkCallback, kept for reference only; it is not part of this method's logic
// public function magicLinkCallback($m) {
// if (isset($m[1]) && $m[1] !== '') {
// // Skip anchor
// return $m[0];
// } else if (isset($m[2]) && $m[2] !== '') {
// // Skip HTML element
// return $m[0];
// } else if (isset($m[3]) && $m[3] !== '') {
// // Free external link
// return $this->makeFreeExternalLink($m[0], strlen($m[4]));
// } else if (isset($m[5]) && $m[5] !== '') {
// // RFC or PMID
// if (substr($m[0], 0, 3) === 'RFC') {
// if (!$this->mOptions->getMagicRFCLinks()) {
// return $m[0];
// }
// $keyword = 'RFC';
// $urlmsg = 'rfcurl';
// $cssClass = 'mw-magiclink-rfc';
// $trackingCat = 'magiclink-tracking-rfc';
// $id = $m[5];
// } else if (substr($m[0], 0, 4) === 'PMID') {
// if (!$this->mOptions->getMagicPMIDLinks()) {
// return $m[0];
// }
// $keyword = 'PMID';
// $urlmsg = 'pubmedurl';
// $cssClass = 'mw-magiclink-pmid';
// $trackingCat = 'magiclink-tracking-pmid';
// $id = $m[5];
// } else {
// throw new MWException(__METHOD__ . ': unrecognised match type "' .
// substr($m[0], 0, 20) . '"');
// }
// $url = wfMessage($urlmsg, $id)->inContentLanguage()->text();
// $this->addTrackingCategory($trackingCat);
// return Linker::makeExternalLink($url, "{$keyword} {$id}", true, $cssClass, [], $this->mTitle);
// } else if (isset($m[6]) && $m[6] !== ''
// && $this->mOptions->getMagicISBNLinks()
// ) {
// // ISBN
// $isbn = $m[6];
// $space = self::SPACE_NOT_NL; // non-newline space
// $isbn = preg_replace("/$space/", ' ', $isbn);
// $num = strtr($isbn, [
// '-' => '',
// ' ' => '',
// 'x' => 'X',
// ]);
// $this->addTrackingCategory('magiclink-tracking-isbn');
// return $this->getLinkRenderer()->makeKnownLink(
// SpecialPage::getTitleFor('Booksources', $num),
// "ISBN $isbn",
// [
// 'class' => 'internal mw-magiclink-isbn',
// 'title' => false // suppress title attribute
// ]
// );
// } else {
// return $m[0];
// }
// register every known protocol as a free-link trigger
Gfo_protocol_itm[] protocol_ary = Gfo_protocol_itm.Ary();
int protocol_len = protocol_ary.length;
for (int i = 0; i < protocol_len; i++) {
Gfo_protocol_itm itm = protocol_ary[i];
regex_trie.Add_bry_byte(itm.Text_bry(), Regex__free);
}
// REF.MW: start of makeFreeExternalLink (reference only)
// Make a free external link, given a user-supplied URL
// public void Make_free_external_link(byte[] url, int num_post_proto) {
// byte[] trail = Bry_.Empty;
// lazily create the shared "</a>" constant on first use
// NOTE(review): the null check is not repeated inside the synchronized block - confirm whether concurrent first calls matter here
if (Tag__anch__rhs == null) {
synchronized (Type_adp_.ClassOf_obj(this)) {
Tag__anch__rhs = Bry_.new_a7("</a>");
}
}
}
// The characters '<' and '>' (which were escaped by
// removeHTMLtags()) should not be included in
// URLs, per RFC 2396.
// Make &nbsp; terminate a URL as well (bug T84937)
// $m2 = [];
// if (preg_match(
// '/&(lt|gt|nbsp|#x0*(3[CcEe]|[Aa]0)|#0*(60|62|160));/',
// $url,
// $m2,
// PREG_OFFSET_CAPTURE
// )) {
// trail = substr($url, $m2[0][1]) . $trail;
// $url = substr($url, 0, $m2[0][1]);
// }
// Replace special strings like "ISBN xxx" and "RFC xxx" with
// magic external links.
// XO: scans pbfr.Src() byte-by-byte; whenever regex_trie matches at the cursor the candidate is checked:
//   "<a" + [ \t\r\n>] ... "</a>"  : skipped as a unit (existing link text is not re-linked)
//   "<" ... ">"                   : skipped as a unit (stuff inside HTML elements)
//   protocol + 1 or more url chars: rendered through Make_free_external_link
// Everything else is copied to pbfr.Trg() unchanged.
public void Do_magic_links(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) {
	// XO.PBFR
	Bry_bfr src_bfr = pbfr.Src();
	byte[] src = src_bfr.Bfr();
	int src_bgn = 0;
	int src_end = src_bfr.Len();
	Bry_bfr bfr = pbfr.Trg();

	int cur = src_bgn;
	int prv = cur;	// start of the text not yet copied to bfr
	boolean dirty = true; // NOTE(review): starts as true, so src is always copied to trg and switched, even when no link is found

	// PORTED.REGEX: handle below
	// NOTE: not handling RFC|PMID|ISBN b/c of upcoming obsolescence: https://www.mediawiki.org/wiki/Requests_for_comment/Future_of_magic_links
	//'!(?: // Start cases
	// (<a[ \t\r\n>].*?</a>) | // m[1]: Skip link text
	// (<.*?>) | // m[2]: Skip stuff inside
	// // HTML elements' . "
	// (\b(?i:$prots)($addr$urlChar*)) | // m[3]: Free external links
	// // m[4]: Post-protocol path
	// \b(?:RFC|PMID) $spaces // m[5]: RFC or PMID, capture number
	// ([0-9]+)\b |
	// \bISBN $spaces ( // m[6]: ISBN, capture number
	// (?: 97[89] $spdash?)? // optional 13-digit ISBN prefix
	// (?: [0-9] $spdash?){9} // 9 digits with opt. delimiters
	// [0-9Xx] // check digit
	// )\b
	while (true) {
		// ">=" rather than "==": a scan helper must never be able to push the cursor past the end unnoticed
		if (cur >= src_end) {
			if (dirty)
				bfr.Add_mid(src, prv, src_end);
			break;
		}

		byte b = src[cur];
		Object o = regex_trie.Match_at_w_b0(trv, b, src, cur, src_end);
		// current byte doesn't look like magiclink; continue;
		if (o == null) {
			cur++;
			continue;
		}

		// looks like magiclink; do additional processing
		byte regex_tid = ((Byte_obj_val)o).Val();
		int old_pos = cur;			// start of the candidate
		int trv_pos = trv.Pos();	// first byte after the matched trie key
		int new_pos = Bry_find_.Not_found;	// end of the candidate; Not_found means "candidate rejected"
		switch (regex_tid) {
			case Regex__anch: // (<a[ \t\r\n>].*?</a>) | // m[1]: Skip link text
				new_pos = Find_anch_end(src, trv_pos, src_end);
				// regex alternation: when the anchor form fails, "<a..." is still tried as a plain "<.*?>" element
				if (new_pos == Bry_find_.Not_found)
					new_pos = Find_elem_end(src, old_pos + 1, src_end);
				break;
			case Regex__elem: // (<.*?>) | // m[2]: Skip stuff inside
				new_pos = Find_elem_end(src, trv_pos, src_end);
				break;
			case Regex__free: // (\b(?i:$prots)($addr$urlChar*)) | // m[3]: Free external links
				if (regex_boundary.Is_boundary_prv(src, cur)) {
					int url_end = regex_url.Find_fwd_while(trv, src, trv_pos, src_end);
					if (url_end != trv_pos)	// need at least one url char after the protocol
						new_pos = url_end;
				}
				break;
		}

		if (new_pos == Bry_find_.Not_found) {
			// rejected; resume scanning at the next byte
			cur++;
			continue;
		}

		cur = new_pos;
		if (regex_tid == Regex__free) {
			this.page_title = pctx.Page_title().Full_db();
			dirty = true;
			bfr.Add_mid(src, prv, old_pos);	// flush text before the url
			this.Make_free_external_link(bfr, Bry_.Mid(src, old_pos, cur), 0);
			prv = cur;
		}
		// anchors / elements: nothing to emit now; the skipped span stays in [prv, cur) and is flushed with the following text
	}
	if (dirty) {
		pbfr.Switch();
	}
}
// Checks "[ \t\r\n>].*?</a>" starting at pos (the byte after "<a"); returns the position after "</a>" or Bry_find_.Not_found.
private static int Find_anch_end(byte[] src, int pos, int src_end) {
	if (pos >= src_end) return Bry_find_.Not_found;
	// byte after "<a" must be in "[ \t\r\n>]"; else it is some other element such as "<abbr"
	switch (src[pos]) {
		case Byte_ascii.Space:
		case Byte_ascii.Tab:
		case Byte_ascii.Cr:
		case Byte_ascii.Nl:
		case Byte_ascii.Angle_end:
			break;
		default:
			return Bry_find_.Not_found;
	}
	// find </a>
	int anch_end = Bry_find_.Find_fwd(src, Tag__anch__rhs, pos + 1, src_end);
	return anch_end == Bry_find_.Not_found ? Bry_find_.Not_found : anch_end + Tag__anch__rhs.length;
}
// Finds the ">" closing an element whose content starts at pos; returns the position after ">" or Bry_find_.Not_found.
private static int Find_elem_end(byte[] src, int pos, int src_end) {
	int elem_end = Bry_find_.Find_fwd(src, Byte_ascii.Angle_end, pos, src_end);
	return elem_end == Bry_find_.Not_found ? Bry_find_.Not_found : elem_end + 1;
}
// Make a free external link, given a user-supplied URL.
//
// STATUS: partial port. Only the final "not an image, make a link" step is live;
// every other step of the PHP original is kept below as a commented porting reference.
//
// bfr            : buffer handed to linker.Make_external_link
// url            : URL bytes; currently passed as both the 2nd and 3rd argument of Make_external_link
// num_post_proto : not read yet; the PHP reference only uses it in the unported "lone protocol" check
public void Make_free_external_link(Bry_bfr bfr, byte[] url, int num_post_proto) {
  // NOTE(review): the PHP fragments below are regrouped into what looks like their
  // intended order -- confirm against the upstream PHP before porting.
  //
  // ---- UNPORTED (1): split entity-terminated tail off the URL -----------------
  //   $trail = '';
  //   The characters '<' and '>' (which were escaped by removeHTMLtags()) should not
  //   be included in URLs, per RFC 2396.
  //   Make &nbsp; terminate a URL as well (bug T84937)
  //   $m2 = [];
  //   if (preg_match('/&(lt|gt|nbsp|#x0*(3[CcEe]|[Aa]0)|#0*(60|62|160));/', $url, $m2, PREG_OFFSET_CAPTURE)) {
  //     $trail = substr($url, $m2[0][1]) . $trail;
  //     $url = substr($url, 0, $m2[0][1]);
  //   }
  //
  // ---- UNPORTED (2): move trailing punctuation to $trail ----------------------
  //   $sep = ',;\.:!?';
  //   If there is no left bracket, then consider right brackets fair game too
  //   if (strpos($url, '(') === false) {
  //     $sep .= ')';
  //   }
  //   $urlRev = strrev($url);
  //   $numSepChars = strspn($urlRev, $sep);
  //   Don't break a trailing HTML entity by moving the ; into $trail
  //   This is in hot code, so use substr_compare to avoid having to
  //   create a new String Object for the comparison
  //   if ($numSepChars && substr_compare($url, ";", -$numSepChars, 1) === 0) {
  //     more optimization: instead of running preg_match with a $
  //     anchor, which can be slow, do the match on the reversed
  //     String starting at the desired offset.
  //     un-reversed regexp is: /&([a-z]+|#x[\da-f]+|#\d+)$/i
  //     if (preg_match('/\G([a-z]+|[\da-f]+x#|\d+#)&/i', $urlRev, $m2, 0, $numSepChars)) {
  //       $numSepChars--;
  //     }
  //   }
  //   if ($numSepChars) {
  //     $trail = substr($url, -$numSepChars) . $trail;
  //     $url = substr($url, 0, -$numSepChars);
  //   }
  //
  // ---- UNPORTED (3): bail out if only a lone protocol is left -----------------
  //   Verify that we still have a real URL after trail removal, and
  //   not just lone protocol
  //   if (strlen($trail) >= $numPostProto) {
  //     return $url . $trail;
  //   }
  //
  // ---- UNPORTED (4): sanitize ---------------------------------------------------
  //   $url = Sanitizer::cleanUrl($url);

  // ---- (5) Is this an external image? -------------------------------------------
  byte[] image_html = null; // $this->maybeMakeExternalImage($url);
  if (image_html != null) return; // never taken while the image lookup above is stubbed to null

  // ---- (6) Not an image, make a link --------------------------------------------
  //   $text = Linker::makeExternalLink($url,
  //     $this->getConverterLanguage()->markNoConversion($url, true),
  //     true, 'free',
  //     $this->getExternalLinkAttribs($url), $this->mTitle);
  byte[] link_type = Bry_.new_a7("free");
  Xomwh_atr_mgr link_atrs = new Xomwh_atr_mgr(); // $this->getExternalLinkAttribs($url)
  linker.Make_external_link
  ( bfr
  , url
  , url // $this->getConverterLanguage()->markNoConversion($url, true)
  , true
  , link_type
  , link_atrs
  , page_title
  );

  // ---- UNPORTED (7): register the link in the output Object ---------------------
  //   Replace unnecessary URL escape codes with their equivalent characters
  //   $pasteurized = self::normalizeLinkUrl($url);
  //   $this->mOutput->addExternalLink($pasteurized);
  //
  //   return $text . $trail;
}
}

@ -0,0 +1,45 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers.magiclinks; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
import org.junit.*;
// Tests for free (bare-URL) magic links, driven through Xomw_magiclinks_wkr__fxt.
public class Xomw_magiclinks_wkr__tst {
  private final Xomw_magiclinks_wkr__fxt fxt = new Xomw_magiclinks_wkr__fxt();

  // A bare URL preceded by a space is expected to become an <a class='external free'> anchor.
  @Test public void Basic() {
    fxt.Test__parse
    ( "a https://b.org c"
    , "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a> c"
    );
  }

  // A URL preceded by "_" is expected to come back unchanged (output == input).
  @Test public void Invalid() {
    String unchanged = "a _https://b.org c";
    fxt.Test__parse(unchanged, unchanged);
  }
}
// Fixture: builds a Xomw_magiclinks_wkr against a test app / wiki, runs it over a
// source String and compares the buffer's result with the expected String.
class Xomw_magiclinks_wkr__fxt {
  private final Xomw_magiclinks_wkr wkr = new Xomw_magiclinks_wkr();
  private final Xomw_parser_ctx pctx = new Xomw_parser_ctx();
  private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr();
  // when true, the expected String is passed through Gfh_utl.Replace_apos before comparison
  private boolean apos = true;

  public Xomw_magiclinks_wkr__fxt() {
    // page context: a title parsed from "Page_1" on a freshly made test wiki
    Xowe_wiki wiki = Xoa_app_fxt.Make__wiki__edit(Xoa_app_fxt.Make__app__edit());
    Xomw_regex_space space = new Xomw_regex_space();
    pctx.Init_by_page(wiki.Ttl_parse(Bry_.new_a7("Page_1")));

    // worker collaborators; boundary and url regexes share the same space regex
    Xomw_linker linker = new Xomw_linker();
    Xomw_regex_boundary boundary = new Xomw_regex_boundary(space);
    Xomw_regex_url url = new Xomw_regex_url(space);
    wkr.Init_by_wiki(linker, boundary, url);
  }

  // Runs Do_magic_links over src_str and checks the result against expd;
  // src_str is also passed to Tfds.Eq_str_lines as its third argument.
  public void Test__parse(String src_str, String expd) {
    pbfr.Init(Bry_.new_u8(src_str));
    wkr.Do_magic_links(pctx, pbfr);
    String expd_html = apos ? gplx.langs.htmls.Gfh_utl.Replace_apos(expd) : expd;
    Tfds.Eq_str_lines(expd_html, pbfr.Rslt().To_str_and_clear(), src_str);
  }
}
Loading…
Cancel
Save