mirror of
https://github.com/gnosygnu/xowa.git
synced 2024-10-27 20:34:16 +00:00
Mw_parse: Add basic implementation for magiclinks
This commit is contained in:
parent
7bd176f51f
commit
aa1f1ec801
@ -40,6 +40,7 @@ public class Regx_adp {
|
||||
return (Regx_match[])rv.To_ary(Regx_match.class);
|
||||
}
|
||||
private Pattern under;
|
||||
public Pattern Under() {return under;}
|
||||
void Under_sync() {
|
||||
try {under = Pattern.compile(pattern, Pattern.DOTALL | Pattern.UNICODE_CHARACTER_CLASS);} // JRE.7:UNICODE_CHARACTER_CLASS; added during %w fix for en.w:A#; DATE:2015-06-10
|
||||
catch (Exception e) { // NOTE: if invalid, then default to empty pattern (which should return nothing); EX:d:〆る generates [^]; DATE:2013-10-20
|
||||
|
@ -17,10 +17,21 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.langs.regxs; import gplx.*; import gplx.langs.*;
|
||||
/**
 * Holds the outcome of one regex group: whether it matched, its begin / end
 * offsets, and its captured text.
 *
 * Instances are mutable: Init overwrites every field, which lets a caller
 * reuse a single object across many matches instead of allocating a new one.
 */
public class Regx_group {
	/** Creates a group with the given result flag, offsets, and captured value. */
	public Regx_group(boolean rslt, int bgn, int end, String val) {
		this.rslt = rslt;
		this.bgn = bgn;
		this.end = end;
		this.val = val;
	}
	/** Result flag supplied by the constructor or the latest Init call. */
	public boolean Rslt() {return rslt;} private boolean rslt;
	/** Begin offset of the group. */
	public int Bgn() {return bgn;} private int bgn;
	/** End offset of the group. */
	public int End() {return end;} private int end;
	/** Captured text of the group; may be null. */
	public String Val() {return val;} private String val;
	/** Overwrites all four fields so that the same instance can be reused. */
	public void Init(boolean rslt, int bgn, int end, String val) {
		this.rslt = rslt;
		this.bgn = bgn;
		this.end = end;
		this.val = val;
	}
	/** Shared zero-length array for callers that have no groups to return. */
	public static final Regx_group[] Ary_empty = new Regx_group[0];
}
|
||||
|
46
100_core/src/gplx/langs/regxs/Regx_rslt.java
Normal file
46
100_core/src/gplx/langs/regxs/Regx_rslt.java
Normal file
@ -0,0 +1,46 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.langs.regxs; import gplx.*; import gplx.langs.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
public class Regx_rslt {// THREAD.UNSAFE
|
||||
private int src_pos;
|
||||
private Regx_group tmp_grp = new Regx_group(false, -1, -1, null);
|
||||
public Matcher match;
|
||||
public int Groups__len() {return match.groupCount() + 1;} // +1 to include group=0 which is entire pattern
|
||||
public Regx_group Groups__get_at(int i) {
|
||||
tmp_grp.Init(true, match.start(i), match.end(i), null);
|
||||
return tmp_grp;
|
||||
}
|
||||
public void Init(Regx_adp regex, String src, int src_bgn) {
|
||||
match = regex.Under().matcher(src);
|
||||
this.src_pos = src_bgn;
|
||||
}
|
||||
public boolean Match_next() {
|
||||
this.found = match.find(src_pos);
|
||||
if (found) {
|
||||
this.find_bgn = match.start();
|
||||
this.find_end = match.end();
|
||||
this.src_pos = find_end;
|
||||
}
|
||||
return found;
|
||||
}
|
||||
public boolean Found() {return found;} private boolean found;
|
||||
public int Find_bgn() {return find_bgn;} private int find_bgn;
|
||||
public int Find_end() {return find_end;} private int find_end;
|
||||
}
|
@ -19,7 +19,7 @@ package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xo
|
||||
import gplx.core.btries.*; import gplx.core.net.*;
|
||||
import gplx.xowa.mws.parsers.prepros.*; import gplx.xowa.mws.parsers.headings.*;
|
||||
import gplx.xowa.mws.parsers.quotes.*; import gplx.xowa.mws.parsers.tables.*; import gplx.xowa.mws.parsers.hrs.*; import gplx.xowa.mws.parsers.nbsps.*;
|
||||
import gplx.xowa.mws.parsers.lnkes.*; import gplx.xowa.mws.parsers.lnkis.*;
|
||||
import gplx.xowa.mws.parsers.lnkes.*; import gplx.xowa.mws.parsers.lnkis.*; import gplx.xowa.mws.parsers.magiclinks.*;
|
||||
import gplx.xowa.mws.utls.*; import gplx.xowa.mws.linkers.*;
|
||||
public class Xomw_parser {
|
||||
private final Xomw_parser_ctx pctx = new Xomw_parser_ctx();
|
||||
@ -29,10 +29,13 @@ public class Xomw_parser {
|
||||
private final Xomw_nbsp_wkr nbsp_wkr = new Xomw_nbsp_wkr();
|
||||
private final Xomw_block_level_pass block_wkr = new Xomw_block_level_pass();
|
||||
private final Xomw_heading_wkr heading_wkr = new Xomw_heading_wkr();
|
||||
private final Xomw_magiclinks_wkr magiclinks_wkr = new Xomw_magiclinks_wkr();
|
||||
private final Xomw_link_renderer link_renderer = new Xomw_link_renderer();
|
||||
private final Xomw_link_holders holders;
|
||||
private final Xomw_heading_cbk__html heading_wkr_cbk;
|
||||
private final Btrie_slim_mgr protocols_trie;
|
||||
private static Xomw_regex_space regex_space;
|
||||
private static Xomw_regex_url regex_url;
|
||||
private final Btrie_rv trv = new Btrie_rv();
|
||||
private int marker_index = 0;
|
||||
// private final Xomw_prepro_wkr prepro_wkr = new Xomw_prepro_wkr();
|
||||
@ -51,10 +54,16 @@ public class Xomw_parser {
|
||||
this.lnke_wkr = new Xomw_lnke_wkr(this);
|
||||
this.lnki_wkr = new Xomw_lnki_wkr(this, holders, link_renderer, protocols_trie);
|
||||
this.heading_wkr_cbk = new Xomw_heading_cbk__html();
|
||||
if (regex_space == null) {
|
||||
synchronized (Type_adp_.ClassOf_obj(this)) {
|
||||
regex_space = new Xomw_regex_space();
|
||||
regex_url = new Xomw_regex_url(regex_space);
|
||||
}
|
||||
}
|
||||
}
|
||||
public void Init_by_wiki(Xowe_wiki wiki) {
|
||||
linker.Init_by_wiki(wiki.Lang().Lnki_trail_mgr().Trie());
|
||||
lnke_wkr.Init_by_wiki(protocols_trie);
|
||||
lnke_wkr.Init_by_wiki(protocols_trie, regex_url, regex_space);
|
||||
lnki_wkr.Init_by_wiki(wiki);
|
||||
}
|
||||
public void Internal_parse(Xomw_parser_bfr pbfr, byte[] text) {
|
||||
@ -107,8 +116,8 @@ public class Xomw_parser {
|
||||
// replaceInternalLinks may sometimes leave behind
|
||||
// absolute URLs, which have to be masked to hide them from replaceExternalLinks
|
||||
Xomw_parser_bfr_.Replace(pbfr, Bry__marker__noparse, Bry_.Empty);
|
||||
magiclinks_wkr.Do_magic_links(pctx, pbfr);
|
||||
|
||||
// $text = $this->doMagicLinks($text);
|
||||
// $text = $this->formatHeadings($text, $origText, $isMain);
|
||||
}
|
||||
|
||||
|
@ -21,37 +21,9 @@ public class Xomw_parser__tst {
|
||||
private final Xomw_parser__fxt fxt = new Xomw_parser__fxt();
|
||||
@Test public void Basic() {
|
||||
fxt.Test__parse(String_.Concat_lines_nl_skip_last
|
||||
( "== heading_1 =="
|
||||
, "para_1"
|
||||
, "== heading_2 =="
|
||||
, "para_2"
|
||||
, "-----"
|
||||
, "{|"
|
||||
, "|-"
|
||||
, "|a"
|
||||
, "|}"
|
||||
, "''italics''"
|
||||
, "[https://a.org b]"
|
||||
, "[[A|abc]]"
|
||||
, "a »b«  !important c"
|
||||
("a https://c.org b"
|
||||
), String_.Concat_lines_nl_skip_last
|
||||
( "<h2> heading_1 </h2>"
|
||||
, "<p>para_1"
|
||||
, "</p>"
|
||||
, "<h2> heading_2 </h2>"
|
||||
, "<p>para_2"
|
||||
, "</p>"
|
||||
, "<hr />"
|
||||
, "<table>"
|
||||
, ""
|
||||
, "<tr>"
|
||||
, "<td>a"
|
||||
, "</td></tr></table>"
|
||||
, "<p><i>italics</i>"
|
||||
, "<a class=\"external text\" rel=\"nofollow\" href=\"https://a.org\">b</a>"
|
||||
, "<a href=\"/wiki/A\" title=\"A\">abc</a>"
|
||||
, "a »b«  !important c"
|
||||
, "</p>"
|
||||
( ""
|
||||
));
|
||||
}
|
||||
}
|
||||
|
45
400_xowa/src/gplx/xowa/mws/parsers/Xomw_regex_.java
Normal file
45
400_xowa/src/gplx/xowa/mws/parsers/Xomw_regex_.java
Normal file
@ -0,0 +1,45 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
|
||||
import gplx.core.btries.*;
|
||||
public class Xomw_regex_ {
|
||||
public static int Find_fwd_while(Btrie_slim_mgr trie, Btrie_rv trv, byte[] src, int src_bgn, int src_end) {
|
||||
int cur = src_bgn;
|
||||
while (true) {
|
||||
byte b = src[cur];
|
||||
Object o = trie.Match_at_w_b0(trv, b, src, cur, src_end);
|
||||
if (o == null)
|
||||
break;
|
||||
else
|
||||
cur += gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(b);
|
||||
}
|
||||
return cur;
|
||||
}
|
||||
public static int Find_fwd_until(Btrie_slim_mgr trie, Btrie_rv trv, byte[] src, int src_bgn, int src_end) {
|
||||
int cur = src_bgn;
|
||||
while (true) {
|
||||
byte b = src[cur];
|
||||
Object o = trie.Match_at_w_b0(trv, b, src, cur, src_end);
|
||||
if (o == null)
|
||||
cur += gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(b);
|
||||
else
|
||||
break;
|
||||
}
|
||||
return cur;
|
||||
}
|
||||
}
|
39
400_xowa/src/gplx/xowa/mws/parsers/Xomw_regex_boundary.java
Normal file
39
400_xowa/src/gplx/xowa/mws/parsers/Xomw_regex_boundary.java
Normal file
@ -0,0 +1,39 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
|
||||
import gplx.core.btries.*;
|
||||
public class Xomw_regex_boundary { // THREAD.SAFE: trv is only for consistent interface
|
||||
private final Btrie_slim_mgr trie = Btrie_slim_mgr.cs();
|
||||
private final Btrie_rv trv = new Btrie_rv();
|
||||
public Xomw_regex_boundary(Xomw_regex_space space) {
|
||||
// naive implementation of is_boundary; ignore all ws and underscore
|
||||
byte[][] ary = space.Ws();
|
||||
for (byte[] bry : ary)
|
||||
trie.Add_bry_byte(bry, Byte_.Zero);
|
||||
ary = space.Zs();
|
||||
for (byte[] bry : ary)
|
||||
trie.Add_bry_byte(bry, Byte_.Zero);
|
||||
}
|
||||
public boolean Is_boundary_prv(byte[] src, int pos) {
|
||||
if (pos == 0) return true; // BOS is true
|
||||
int bgn = gplx.core.intls.Utf8_.Get_pos0_of_char_bwd(src, pos - 1);
|
||||
byte b = src[bgn];
|
||||
Object o = trie.Match_at_w_b0(trv, b, src, bgn, pos);
|
||||
return o != null;
|
||||
}
|
||||
}
|
64
400_xowa/src/gplx/xowa/mws/parsers/Xomw_regex_space.java
Normal file
64
400_xowa/src/gplx/xowa/mws/parsers/Xomw_regex_space.java
Normal file
@ -0,0 +1,64 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
|
||||
import gplx.core.btries.*;
|
||||
public class Xomw_regex_space {
|
||||
private final Btrie_slim_mgr trie = Btrie_slim_mgr.cs();
|
||||
public Xomw_regex_space() {
|
||||
byte[] space = Bry_.New_by_ints(32);
|
||||
ws = new byte[][]
|
||||
{ space
|
||||
, Bry_.New_by_ints(9)
|
||||
, Bry_.New_by_ints(10)
|
||||
, Bry_.New_by_ints(13)
|
||||
};
|
||||
// Zs; REF:http://www.fileformat.info/info/unicode/category/Zs/list.htm
|
||||
zs = new byte[][]
|
||||
{ space
|
||||
, Bry_.New_by_ints(194, 160)
|
||||
, Bry_.New_by_ints(225, 154, 128)
|
||||
, Bry_.New_by_ints(226, 128, 129)
|
||||
, Bry_.New_by_ints(226, 128, 130)
|
||||
, Bry_.New_by_ints(226, 128, 131)
|
||||
, Bry_.New_by_ints(226, 128, 132)
|
||||
, Bry_.New_by_ints(226, 128, 133)
|
||||
, Bry_.New_by_ints(226, 128, 134)
|
||||
, Bry_.New_by_ints(226, 128, 135)
|
||||
, Bry_.New_by_ints(226, 128, 136)
|
||||
, Bry_.New_by_ints(226, 128, 137)
|
||||
, Bry_.New_by_ints(226, 128, 138)
|
||||
, Bry_.New_by_ints(226, 128, 175)
|
||||
, Bry_.New_by_ints(226, 129, 159)
|
||||
, Bry_.New_by_ints(227, 128, 128)
|
||||
};
|
||||
|
||||
byte[][] ary = ws;
|
||||
for (byte[] bry : ary) {
|
||||
trie.Add_bry_byte(bry, Byte_.Zero);
|
||||
}
|
||||
ary = zs;
|
||||
for (byte[] bry : ary) {
|
||||
trie.Add_bry_byte(bry, Byte_.Zero);
|
||||
}
|
||||
}
|
||||
public byte[][] Ws() {return ws;} private byte[][] ws;
|
||||
public byte[][] Zs() {return zs;} private byte[][] zs;
|
||||
public int Find_fwd_while(Btrie_rv trv, byte[] src, int src_bgn, int src_end) {
|
||||
return Xomw_regex_.Find_fwd_while(trie, trv, src, src_bgn, src_end);
|
||||
}
|
||||
}
|
39
400_xowa/src/gplx/xowa/mws/parsers/Xomw_regex_url.java
Normal file
39
400_xowa/src/gplx/xowa/mws/parsers/Xomw_regex_url.java
Normal file
@ -0,0 +1,39 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
|
||||
import gplx.core.btries.*;
|
||||
public class Xomw_regex_url {
|
||||
private final Btrie_slim_mgr trie;
|
||||
public Xomw_regex_url(Xomw_regex_space regex_space) {
|
||||
// REGEX:[^][<>"\\x00-\\x20\\x7F\p{Zs}]; NOTE: val is just a marker
|
||||
this.trie = Btrie_slim_mgr.cs();
|
||||
trie.Add_str_byte__many(Byte_.Zero, "[", "]", "<", ">", "\"");
|
||||
for (byte i = 0; i < 33; i++) {
|
||||
trie.Add_bry_byte(new byte[] {i}, Byte_.Zero);
|
||||
}
|
||||
trie.Add_bry_byte(Bry_.New_by_ints(127), Byte_.Zero); // x7F
|
||||
|
||||
byte[][] zs_ary = regex_space.Zs();
|
||||
for (byte[] zs : zs_ary) {
|
||||
trie.Add_bry_byte(zs, Byte_.Zero);
|
||||
}
|
||||
}
|
||||
public int Find_fwd_while(Btrie_rv trv, byte[] src, int src_bgn, int src_end) {
|
||||
return Xomw_regex_.Find_fwd_until(trie, trv, src, src_bgn, src_end);
|
||||
}
|
||||
}
|
@ -26,12 +26,16 @@ public class Xomw_lnke_wkr {// THREAD.UNSAFE: caching for repeated calls
|
||||
private int autonumber;
|
||||
private final Xomw_linker linker;
|
||||
private final Xomwh_atr_mgr attribs = new Xomwh_atr_mgr();
|
||||
private Xomw_regex_url regex_url;
|
||||
private Xomw_regex_space regex_space;
|
||||
public Xomw_lnke_wkr(Xomw_parser mgr) {
|
||||
this.tmp = mgr.Tmp();
|
||||
this.linker = mgr.Linker();
|
||||
}
|
||||
public void Init_by_wiki(Btrie_slim_mgr protocol_trie) {
|
||||
public void Init_by_wiki(Btrie_slim_mgr protocol_trie, Xomw_regex_url regex_url, Xomw_regex_space regex_space) {
|
||||
this.protocol_trie = protocol_trie;
|
||||
this.regex_url = regex_url;
|
||||
this.regex_space = regex_space;
|
||||
}
|
||||
public void Replace_external_links(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) {
|
||||
// XO.PBFR
|
||||
@ -101,14 +105,7 @@ public class Xomw_lnke_wkr {// THREAD.UNSAFE: caching for repeated calls
|
||||
|
||||
// check for one-or-more url chars; [^][<>"\\x00-\\x20\\x7F\p{Zs}]
|
||||
int domain_bgn = cur;
|
||||
while (true) {
|
||||
byte b = src[cur];
|
||||
Object url_char_byte = invalid_url_chars_trie.Match_at_w_b0(trv, b, src, cur, src_end);
|
||||
if (url_char_byte == null)
|
||||
cur += gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(b);
|
||||
else
|
||||
break;
|
||||
}
|
||||
cur = regex_url.Find_fwd_while(trv, src, domain_bgn, src_end);
|
||||
if (cur - domain_bgn == 0) {
|
||||
bfr.Add_mid(src, prv, cur);
|
||||
prv = cur;
|
||||
@ -116,14 +113,8 @@ public class Xomw_lnke_wkr {// THREAD.UNSAFE: caching for repeated calls
|
||||
}
|
||||
int url_end = cur;
|
||||
|
||||
// get ws (if any)
|
||||
int ws_bgn = -1;
|
||||
while (true) {
|
||||
Object space_byte = space_chars_trie.Match_at(trv, src, cur, src_end);
|
||||
if (space_byte == null) break;
|
||||
if (ws_bgn == -1) ws_bgn = cur;
|
||||
cur += ((Int_obj_val)space_byte).Val();
|
||||
}
|
||||
// skip ws
|
||||
cur = regex_space.Find_fwd_while(trv, src, cur, src_end);
|
||||
|
||||
// get text (if any)
|
||||
int text_bgn = -1, text_end = -1;
|
||||
@ -244,27 +235,7 @@ public class Xomw_lnke_wkr {// THREAD.UNSAFE: caching for repeated calls
|
||||
, Link_type__autonumber = Bry_.new_a7("autonumber")
|
||||
;
|
||||
|
||||
private static final Btrie_slim_mgr
|
||||
invalid_url_chars_trie = New__invalid_url_chars_trie()
|
||||
, space_chars_trie = New__space_chars_trie()
|
||||
, invalid_text_chars_trie = New__invalid_text_chars_trie()
|
||||
;
|
||||
private static Btrie_slim_mgr New__invalid_url_chars_trie() { // REGEX:[^][<>"\\x00-\\x20\\x7F\p{Zs}]; NOTE: val is just a marker
|
||||
Btrie_slim_mgr rv = Btrie_slim_mgr.cs();
|
||||
rv.Add_str_byte__many(Byte_.Zero, "[", "]", "<", ">", "\"");
|
||||
for (byte i = 0; i < 33; i++) {
|
||||
rv.Add_bry_byte(new byte[] {i}, Byte_.Zero);
|
||||
}
|
||||
rv.Add_bry_byte(Bry_.New_by_ints(127), Byte_.Zero); // x7F
|
||||
rv.Add_bry_byte(Bry_.New_by_ints(227, 128, 128), Byte_.Zero); // \p{Zs} // e3 80 80; https://phabricator.wikimedia.org/T21052
|
||||
return rv;
|
||||
}
|
||||
private static Btrie_slim_mgr New__space_chars_trie() { // REGEX:\p{Zs}; NOTE: val is key.length
|
||||
Btrie_slim_mgr rv = Btrie_slim_mgr.cs();
|
||||
New__trie_itm__by_len(rv, 32);
|
||||
New__trie_itm__by_len(rv, 227, 128, 128); // \p{Zs} // e3 80 80; https://phabricator.wikimedia.org/T21052
|
||||
return rv;
|
||||
}
|
||||
private static final Btrie_slim_mgr invalid_text_chars_trie = New__invalid_text_chars_trie();
|
||||
private static Btrie_slim_mgr New__invalid_text_chars_trie() { // REGEX:([^\]\\x00-\\x08\\x0a-\\x1F]*?); NOTE: val is key.length
|
||||
Btrie_slim_mgr rv = Btrie_slim_mgr.cs();
|
||||
New__trie_itm__by_len(rv, Byte_ascii.Brack_end);
|
||||
|
@ -45,7 +45,8 @@ class Xomw_lnke_wkr__fxt {
|
||||
private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr();
|
||||
private boolean apos = true;
|
||||
// Initializes the worker with the default protocols and freshly built space / url matchers.
public Xomw_lnke_wkr__fxt() {
	Xomw_regex_space regex_space = new Xomw_regex_space();
	// the url matcher is built from the same regex_space instance that is handed to the worker
	wkr.Init_by_wiki(Xomw_parser.Protocols__dflt(), new Xomw_regex_url(regex_space), regex_space);
}
|
||||
public void Test__parse(String src_str, String expd) {
|
||||
byte[] src_bry = Bry_.new_u8(src_str);
|
||||
|
@ -17,256 +17,169 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.mws.parsers.magiclinks; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
|
||||
import gplx.core.primitives.*; import gplx.core.btries.*; import gplx.core.net.*;
|
||||
import gplx.langs.phps.utls.*;
|
||||
// public class Xomw_magiclinks_wkr {
|
||||
// private final Btrie_slim_mgr regex_trie = Btrie_slim_mgr.ci_a7(); // NOTE: must be ci to handle protocols; EX: "https:" and "HTTPS:"
|
||||
// private final Btrie_rv trv = new Btrie_rv();
|
||||
// public Xomw_magiclinks_wkr() {
|
||||
// }
|
||||
// private static byte[] Tag__anch__rhs, Prefix__rfc, Prefix__pmid;
|
||||
//
|
||||
// private static final byte Space__tab = 1, Space__nbsp_ent = 2, Space__nbsp_dec = 3, Space__nbsp_hex = 4;
|
||||
// private static Btrie_slim_mgr space_trie;
|
||||
// // static final SPACE_NOT_NL = '(?:\t| |&\#0*160;|&\#[Xx]0*[Aa]0;|\p{Zs})';
|
||||
//// public void Test() {
|
||||
//// regex.Add("\t", Space__tab);
|
||||
//// regex.Add(" ", Space__nbsp__ent);
|
||||
//// regex.Add(Regex.Make("&#").Star("0").Add("160;"), Space__nbsp__dec);
|
||||
//// regex.Add(Regex.Make("&#").Brack("X", "x").Star("0").Brack("A", "a").Add("0"), Space__nbsp__hex);
|
||||
//// }
|
||||
// public int Find_fwd_space(byte[] src, int cur, int src_end) {
|
||||
// return -1;
|
||||
// }
|
||||
//
|
||||
// private static final byte Regex__anch = 1, Regex__elem = 2, Regex__free = 3, Regex__rfc = 5, Regex__isbn = 6, Regex__pmid = 7;
|
||||
// public void Init_by_wiki() {
|
||||
// regex_trie.Add_str_byte("<a", Regex__anch);
|
||||
// regex_trie.Add_str_byte("<" , Regex__elem);
|
||||
//
|
||||
// Gfo_protocol_itm[] protocol_ary = Gfo_protocol_itm.Ary();
|
||||
// int protocol_len = protocol_ary.length;
|
||||
// for (int i = 0; i < protocol_len; i++) {
|
||||
// Gfo_protocol_itm itm = protocol_ary[i];
|
||||
// regex_trie.Add_bry_byte(itm.Key_w_colon_bry(), Regex__free);
|
||||
// }
|
||||
// regex_trie.Add_str_byte("RFC " , Regex__rfc);
|
||||
// regex_trie.Add_str_byte("PMID " , Regex__rfc);
|
||||
// regex_trie.Add_str_byte("ISBN ", Regex__rfc);
|
||||
//
|
||||
// if (Tag__anch__rhs == null) {
|
||||
// synchronized (Type_adp_.ClassOf_obj(this)) {
|
||||
// Tag__anch__rhs = Bry_.new_a7("</a>");
|
||||
// Prefix__rfc = Bry_.new_a7("RFC");
|
||||
// Prefix__pmid = Bry_.new_a7("PMID");
|
||||
// space_trie = Btrie_slim_mgr.ci_a7()
|
||||
// .Add_str_byte("\t", Space__tab)
|
||||
// .Add_str_byte(" ", Space__nbsp_ent)
|
||||
// .Add_str_byte("&#", Space__nbsp_dec)
|
||||
// .Add_str_byte("&x", Space__nbsp_hex)
|
||||
// ;
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// // Replace special strings like "ISBN xxx" and "RFC xxx" with
|
||||
// // magic external links.
|
||||
// public void Do_magic_links(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) {
|
||||
// // XO.PBFR
|
||||
// Bry_bfr src_bfr = pbfr.Src();
|
||||
// byte[] src = src_bfr.Bfr();
|
||||
// int src_bgn = 0;
|
||||
// int src_end = src_bfr.Len();
|
||||
// Bry_bfr bfr = pbfr.Trg();
|
||||
//
|
||||
// int cur = src_bgn;
|
||||
// int prv = cur;
|
||||
// boolean dirty = true;
|
||||
// while (true) {
|
||||
// if (cur == src_end) {
|
||||
// if (dirty)
|
||||
// bfr.Add_mid(src, prv, src_end);
|
||||
// break;
|
||||
// }
|
||||
//
|
||||
// byte b = src[cur];
|
||||
// Object o = regex_trie.Match_at_w_b0(trv, b, src, cur, src_end);
|
||||
// // current byte doesn't look like magiclink; continue;
|
||||
// if (o == null) {
|
||||
// cur++;
|
||||
// continue;
|
||||
// }
|
||||
// // looks like magiclink; do additional processing
|
||||
// byte regex_tid = ((Byte_obj_ref)o).Val();
|
||||
// int trv_pos = trv.Pos();
|
||||
// int nxt_pos = trv_pos;
|
||||
// boolean regex_valid = true;
|
||||
// switch (regex_tid) {
|
||||
// case Regex__anch: // (<a[ \t\r\n>].*?</a>) | // m[1]: Skip link text
|
||||
// if (trv_pos < src_end) {
|
||||
// // find ws in "[ \t\r\n>]"
|
||||
// byte ws_byte = src[cur];
|
||||
// switch (ws_byte) {
|
||||
// case Byte_ascii.Space:
|
||||
// case Byte_ascii.Tab:
|
||||
// case Byte_ascii.Cr:
|
||||
// case Byte_ascii.Nl:
|
||||
// break;
|
||||
// default:
|
||||
// regex_valid = false;
|
||||
// break;
|
||||
// }
|
||||
// if (regex_valid) {
|
||||
// // find </a>
|
||||
// nxt_pos++;
|
||||
// int anch_end = Bry_find_.Find_fwd(src, Tag__anch__rhs, nxt_pos, src_end);
|
||||
// if (anch_end == Bry_find_.Not_found) {
|
||||
// regex_valid = false;
|
||||
// }
|
||||
// else {
|
||||
// cur = anch_end + Tag__anch__rhs.length;
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// else {
|
||||
// regex_valid = false;
|
||||
// }
|
||||
// break;
|
||||
// case Regex__elem: // (<.*?>) | // m[2]: Skip stuff inside
|
||||
// // just find ">"
|
||||
// int elem_end = Bry_find_.Find_fwd(src, Byte_ascii.Angle_end, nxt_pos, src_end);
|
||||
// if (elem_end == Bry_find_.Not_found)
|
||||
// regex_valid = false;
|
||||
// else
|
||||
// cur = elem_end + 1;
|
||||
// break;
|
||||
// case Regex__free:
|
||||
// // addr; urlchar
|
||||
// break;
|
||||
// case Regex__rfc:
|
||||
// case Regex__pmid:
|
||||
// // byte[] prefix = regex == Regex__rfc ? Prefix__rfc : Prefix__pmid;
|
||||
// // match previous for case sensitivity
|
||||
//// if (Bry_.Eq(src, trv_pos - prefix.length - 1, trv_pos - 1, prefix)) {
|
||||
////
|
||||
//// }
|
||||
//// else {
|
||||
//// regex_valid = false;
|
||||
//// }
|
||||
// break;
|
||||
// }
|
||||
//
|
||||
//// '!(?: // Start cases
|
||||
//// (<a[ \t\r\n>].*?</a>) | // m[1]: Skip link text
|
||||
//// (<.*?>) | // m[2]: Skip stuff inside
|
||||
//// // HTML elements' . "
|
||||
//// (\b(?i:$prots)($addr$urlChar*)) | // m[3]: Free external links
|
||||
//// // m[4]: Post-protocol path
|
||||
//// \b(?:RFC|PMID) $spaces // m[5]: RFC or PMID, capture number
|
||||
//// ([0-9]+)\b |
|
||||
//// \bISBN $spaces ( // m[6]: ISBN, capture number
|
||||
//// (?: 97[89] $spdash?)? // optional 13-digit ISBN prefix
|
||||
//// (?: [0-9] $spdash?){9} // 9 digits with opt. delimiters
|
||||
//// [0-9Xx] // check digit
|
||||
//// )\b
|
||||
//
|
||||
// }
|
||||
// if (dirty)
|
||||
// pbfr.Switch();
|
||||
import gplx.langs.phps.utls.*; import gplx.xowa.mws.htmls.*;
|
||||
import gplx.langs.regxs.*;
|
||||
public class Xomw_magiclinks_wkr {
|
||||
private final Btrie_slim_mgr regex_trie = Btrie_slim_mgr.ci_a7(); // NOTE: must be ci to handle protocols; EX: "https:" and "HTTPS:"
|
||||
private final Btrie_rv trv = new Btrie_rv();
|
||||
private static byte[] Tag__anch__rhs;
|
||||
private Xomw_regex_boundary regex_boundary;
|
||||
private Xomw_regex_url regex_url;
|
||||
private Xomw_linker linker;
|
||||
private byte[] page_title;
|
||||
|
||||
// $prots = wfUrlProtocolsWithoutProtRel();
|
||||
// $urlChar = self::EXT_LINK_URL_CLASS;
|
||||
// $addr = self::EXT_LINK_ADDR;
|
||||
// $space = self::SPACE_NOT_NL; // non-newline space
|
||||
// $spdash = "(?:-|$space)"; // a dash or a non-newline space
|
||||
// $spaces = "$space++"; // possessive match of 1 or more spaces
|
||||
// $text = preg_replace_callback(
|
||||
// '!(?: // Start cases
|
||||
// (<a[ \t\r\n>].*?</a>) | // m[1]: Skip link text
|
||||
// (<.*?>) | // m[2]: Skip stuff inside
|
||||
// // HTML elements' . "
|
||||
// (\b(?i:$prots)($addr$urlChar*)) | // m[3]: Free external links
|
||||
// // m[4]: Post-protocol path
|
||||
// \b(?:RFC|PMID) $spaces // m[5]: RFC or PMID, capture number
|
||||
// ([0-9]+)\b |
|
||||
// \bISBN $spaces ( // m[6]: ISBN, capture number
|
||||
// (?: 97[89] $spdash?)? // optional 13-digit ISBN prefix
|
||||
// (?: [0-9] $spdash?){9} // 9 digits with opt. delimiters
|
||||
// [0-9Xx] // check digit
|
||||
// )\b
|
||||
// )!xu", [ &$this, 'magicLinkCallback' ], $text);
|
||||
// return $text;
|
||||
// }
|
||||
private static final byte Regex__anch = 1, Regex__elem = 2, Regex__free = 3;
|
||||
public void Init_by_wiki(Xomw_linker linker, Xomw_regex_boundary regex_boundary, Xomw_regex_url regex_url) {
|
||||
this.linker = linker;
|
||||
this.regex_boundary = regex_boundary;
|
||||
this.regex_url = regex_url;
|
||||
regex_trie.Add_str_byte("<a", Regex__anch);
|
||||
regex_trie.Add_str_byte("<" , Regex__elem);
|
||||
|
||||
// public function magicLinkCallback($m) {
|
||||
// if (isset($m[1]) && $m[1] !== '') {
|
||||
// // Skip anchor
|
||||
// return $m[0];
|
||||
// } else if (isset($m[2]) && $m[2] !== '') {
|
||||
// // Skip HTML element
|
||||
// return $m[0];
|
||||
// } else if (isset($m[3]) && $m[3] !== '') {
|
||||
// // Free external link
|
||||
// return $this->makeFreeExternalLink($m[0], strlen($m[4]));
|
||||
// } else if (isset($m[5]) && $m[5] !== '') {
|
||||
// // RFC or PMID
|
||||
// if (substr($m[0], 0, 3) === 'RFC') {
|
||||
// if (!$this->mOptions->getMagicRFCLinks()) {
|
||||
// return $m[0];
|
||||
// }
|
||||
// $keyword = 'RFC';
|
||||
// $urlmsg = 'rfcurl';
|
||||
// $cssClass = 'mw-magiclink-rfc';
|
||||
// $trackingCat = 'magiclink-tracking-rfc';
|
||||
// $id = $m[5];
|
||||
// } else if (substr($m[0], 0, 4) === 'PMID') {
|
||||
// if (!$this->mOptions->getMagicPMIDLinks()) {
|
||||
// return $m[0];
|
||||
// }
|
||||
// $keyword = 'PMID';
|
||||
// $urlmsg = 'pubmedurl';
|
||||
// $cssClass = 'mw-magiclink-pmid';
|
||||
// $trackingCat = 'magiclink-tracking-pmid';
|
||||
// $id = $m[5];
|
||||
// } else {
|
||||
// throw new MWException(__METHOD__ . ': unrecognised match type "' .
|
||||
// substr($m[0], 0, 20) . '"');
|
||||
// }
|
||||
// $url = wfMessage($urlmsg, $id)->inContentLanguage()->text();
|
||||
// $this->addTrackingCategory($trackingCat);
|
||||
// return Linker::makeExternalLink($url, "{$keyword} {$id}", true, $cssClass, [], $this->mTitle);
|
||||
// } else if (isset($m[6]) && $m[6] !== ''
|
||||
// && $this->mOptions->getMagicISBNLinks()
|
||||
// ) {
|
||||
// // ISBN
|
||||
// $isbn = $m[6];
|
||||
// $space = self::SPACE_NOT_NL; // non-newline space
|
||||
// $isbn = preg_replace("/$space/", ' ', $isbn);
|
||||
// $num = strtr($isbn, [
|
||||
// '-' => '',
|
||||
// ' ' => '',
|
||||
// 'x' => 'X',
|
||||
// ]);
|
||||
// $this->addTrackingCategory('magiclink-tracking-isbn');
|
||||
// return $this->getLinkRenderer()->makeKnownLink(
|
||||
// SpecialPage::getTitleFor('Booksources', $num),
|
||||
// "ISBN $isbn",
|
||||
// [
|
||||
// 'class' => '@gplx.Internal protected mw-magiclink-isbn',
|
||||
// 'title' => false // suppress title attribute
|
||||
// ]
|
||||
// );
|
||||
// } else {
|
||||
// return $m[0];
|
||||
// }
|
||||
Gfo_protocol_itm[] protocol_ary = Gfo_protocol_itm.Ary();
|
||||
int protocol_len = protocol_ary.length;
|
||||
for (int i = 0; i < protocol_len; i++) {
|
||||
Gfo_protocol_itm itm = protocol_ary[i];
|
||||
regex_trie.Add_bry_byte(itm.Text_bry(), Regex__free);
|
||||
}
|
||||
|
||||
if (Tag__anch__rhs == null) {
|
||||
synchronized (Type_adp_.ClassOf_obj(this)) {
|
||||
Tag__anch__rhs = Bry_.new_a7("</a>");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Replace special strings like "ISBN xxx" and "RFC xxx" with
|
||||
// magic external links.
|
||||
public void Do_magic_links(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) {
|
||||
// XO.PBFR
|
||||
Bry_bfr src_bfr = pbfr.Src();
|
||||
byte[] src = src_bfr.Bfr();
|
||||
int src_bgn = 0;
|
||||
int src_end = src_bfr.Len();
|
||||
Bry_bfr bfr = pbfr.Trg();
|
||||
|
||||
int cur = src_bgn;
|
||||
int prv = cur;
|
||||
boolean dirty = true;
|
||||
// PORTED.REGEX: handle below
|
||||
// NOTE: not handling RFC|PMID|ISBN b/c of upcoming obsolescence: https://www.mediawiki.org/wiki/Requests_for_comment/Future_of_magic_links
|
||||
//'!(?: // Start cases
|
||||
// (<a[ \t\r\n>].*?</a>) | // m[1]: Skip link text
|
||||
// (<.*?>) | // m[2]: Skip stuff inside
|
||||
// // HTML elements' . "
|
||||
// (\b(?i:$prots)($addr$urlChar*)) | // m[3]: Free external links
|
||||
// // m[4]: Post-protocol path
|
||||
// \b(?:RFC|PMID) $spaces // m[5]: RFC or PMID, capture number
|
||||
// ([0-9]+)\b |
|
||||
// \bISBN $spaces ( // m[6]: ISBN, capture number
|
||||
// (?: 97[89] $spdash?)? // optional 13-digit ISBN prefix
|
||||
// (?: [0-9] $spdash?){9} // 9 digits with opt. delimiters
|
||||
// [0-9Xx] // check digit
|
||||
// )\b
|
||||
while (true) {
|
||||
if (cur == src_end) {
|
||||
if (dirty)
|
||||
bfr.Add_mid(src, prv, src_end);
|
||||
break;
|
||||
}
|
||||
|
||||
byte b = src[cur];
|
||||
Object o = regex_trie.Match_at_w_b0(trv, b, src, cur, src_end);
|
||||
// current byte doesn't look like magiclink; continue;
|
||||
if (o == null) {
|
||||
cur++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// looks like magiclink; do additional processing
|
||||
byte regex_tid = ((Byte_obj_val)o).Val();
|
||||
int old_pos = cur;
|
||||
int trv_pos = trv.Pos();
|
||||
int nxt_pos = trv_pos;
|
||||
boolean regex_valid = true;
|
||||
switch (regex_tid) {
|
||||
case Regex__anch: // (<a[ \t\r\n>].*?</a>) | // m[1]: Skip link text
|
||||
if (trv_pos < src_end) {
|
||||
// find ws in "[ \t\r\n>]"
|
||||
byte ws_byte = src[cur];
|
||||
switch (ws_byte) {
|
||||
case Byte_ascii.Space:
|
||||
case Byte_ascii.Tab:
|
||||
case Byte_ascii.Cr:
|
||||
case Byte_ascii.Nl:
|
||||
break;
|
||||
default:
|
||||
regex_valid = false;
|
||||
break;
|
||||
}
|
||||
if (regex_valid) {
|
||||
// find </a>
|
||||
nxt_pos++;
|
||||
int anch_end = Bry_find_.Find_fwd(src, Tag__anch__rhs, nxt_pos, src_end);
|
||||
if (anch_end == Bry_find_.Not_found) {
|
||||
regex_valid = false;
|
||||
}
|
||||
else {
|
||||
cur = anch_end + Tag__anch__rhs.length;
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
regex_valid = false;
|
||||
}
|
||||
break;
|
||||
case Regex__elem: // (<.*?>) | // m[2]: Skip stuff inside
|
||||
// just find ">"
|
||||
int elem_end = Bry_find_.Find_fwd(src, Byte_ascii.Angle_end, nxt_pos, src_end);
|
||||
if (elem_end == Bry_find_.Not_found)
|
||||
regex_valid = false;
|
||||
else
|
||||
cur = elem_end + 1;
|
||||
break;
|
||||
case Regex__free:
|
||||
if (regex_boundary.Is_boundary_prv(src, cur)) {
|
||||
int url_end = regex_url.Find_fwd_while(trv, src, nxt_pos, src_end);
|
||||
if (url_end == nxt_pos) {
|
||||
regex_valid = false;
|
||||
}
|
||||
else
|
||||
cur = url_end;
|
||||
}
|
||||
else
|
||||
regex_valid = false;
|
||||
break;
|
||||
}
|
||||
if (!regex_valid) {
|
||||
cur++;
|
||||
}
|
||||
else {
|
||||
if (regex_tid == Regex__free) {
|
||||
this.page_title = pctx.Page_title().Full_db();
|
||||
dirty = true;
|
||||
bfr.Add_mid(src, prv, old_pos);
|
||||
this.Make_free_external_link(bfr, Bry_.Mid(src, old_pos, cur), 0);
|
||||
prv = cur;
|
||||
}
|
||||
else {
|
||||
}
|
||||
}
|
||||
}
|
||||
if (dirty) {
|
||||
pbfr.Switch();
|
||||
}
|
||||
}
|
||||
|
||||
// Make a free external link, given a user-supplied URL
|
||||
// public void Make_free_external_link(byte[] url, int num_post_proto) {
|
||||
// XO.NOTE: appends a link for url to bfr by delegating to linker.Make_external_link, passing url as both of the first two url arguments and "free" as the link type.
// num_post_proto is not read by any of the code visible here; the MW trail / separator logic below is still commented out.
// NOTE(review): this listing is interrupted by two diff hunk markers, so lines between them are not visible; confirm against the full file.
public void Make_free_external_link(Bry_bfr bfr, byte[] url, int num_post_proto) {
|
||||
// byte[] trail = Bry_.Empty;
|
||||
|
||||
// The characters '<' and '>' (which were escaped by
|
||||
// removeHTMLtags()) should not be included in
|
||||
// URLs, per RFC 2396.
|
||||
// Make &nbsp; terminate a URL as well (bug T84937)
|
||||
|
||||
// $m2 = [];
|
||||
// if (preg_match(
|
||||
// '/&(lt|gt|nbsp|#x0*(3[CcEe]|[Aa]0)|#0*(60|62|160));/',
|
||||
@ -290,6 +203,7 @@ import gplx.langs.phps.utls.*;
|
||||
// Don't break a trailing HTML entity by moving the ; into $trail
|
||||
// This is in hot code, so use substr_compare to avoid having to
|
||||
// create a new String Object for the comparison
|
||||
|
||||
// if ($numSepChars && substr_compare($url, ";", -$numSepChars, 1) === 0) {
|
||||
// more optimization: instead of running preg_match with a $
|
||||
// anchor, which can be slow, do the match on the reversed
|
||||
@ -313,19 +227,19 @@ import gplx.langs.phps.utls.*;
|
||||
// $url = Sanitizer::cleanUrl($url);
|
||||
|
||||
// Is this an external image?
|
||||
// $text = $this->maybeMakeExternalImage($url);
|
||||
// if ($text === false) {
|
||||
// XO.NOTE: maybeMakeExternalImage is not ported; text is always null here, so the link branch below always runs
byte[] text = null; // $this->maybeMakeExternalImage($url);
|
||||
if (text == null) {
|
||||
// Not an image, make a link
|
||||
// $text = Linker::makeExternalLink($url,
|
||||
// $this->getConverterLanguage()->markNoConversion($url, true),
|
||||
// true, 'free',
|
||||
// $this->getExternalLinkAttribs($url), $this->mTitle);
|
||||
// XO.NOTE: arguments mirror the commented MW call above; url stands in for the markNoConversion result,
// a newly constructed Xomwh_atr_mgr stands in for getExternalLinkAttribs, and page_title is the field assigned by Do_magic_links before it calls this method
linker.Make_external_link(bfr, url
|
||||
, url // $this->getConverterLanguage()->markNoConversion($url, true),
|
||||
, true, Bry_.new_a7("free")
|
||||
, new Xomwh_atr_mgr() // $this->getExternalLinkAttribs($url)
|
||||
, page_title);
|
||||
// Register it in the output Object...
|
||||
// Replace unnecessary URL escape codes with their equivalent characters
|
||||
// $pasteurized = self::normalizeLinkUrl($url);
|
||||
// $this->mOutput->addExternalLink($pasteurized);
|
||||
// }
|
||||
}
|
||||
// return $text . $trail;
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,45 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.mws.parsers.magiclinks; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
|
||||
import org.junit.*;
|
||||
public class Xomw_magiclinks_wkr__tst {
|
||||
private final Xomw_magiclinks_wkr__fxt fxt = new Xomw_magiclinks_wkr__fxt();
|
||||
@Test public void Basic() {fxt.Test__parse("a https://b.org c", "a <a class='external free' rel='nofollow' href='https://b.org'>https://b.org</a> c");}
|
||||
@Test public void Invalid() {fxt.Test__parse("a _https://b.org c", "a _https://b.org c");}
|
||||
}
|
||||
class Xomw_magiclinks_wkr__fxt {
|
||||
private final Xomw_magiclinks_wkr wkr = new Xomw_magiclinks_wkr();
|
||||
private final Xomw_parser_ctx pctx = new Xomw_parser_ctx();
|
||||
private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr();
|
||||
private boolean apos = true;
|
||||
public Xomw_magiclinks_wkr__fxt() {
|
||||
Xoae_app app = Xoa_app_fxt.Make__app__edit();
|
||||
Xowe_wiki wiki = Xoa_app_fxt.Make__wiki__edit(app);
|
||||
|
||||
Xomw_regex_space regex_space = new Xomw_regex_space();
|
||||
pctx.Init_by_page(wiki.Ttl_parse(Bry_.new_a7("Page_1")));
|
||||
wkr.Init_by_wiki(new Xomw_linker(), new Xomw_regex_boundary(regex_space), new Xomw_regex_url(regex_space));
|
||||
}
|
||||
public void Test__parse(String src_str, String expd) {
|
||||
byte[] src_bry = Bry_.new_u8(src_str);
|
||||
pbfr.Init(src_bry);
|
||||
wkr.Do_magic_links(pctx, pbfr);
|
||||
if (apos) expd = gplx.langs.htmls.Gfh_utl.Replace_apos(expd);
|
||||
Tfds.Eq_str_lines(expd, pbfr.Rslt().To_str_and_clear(), src_str);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user