diff --git a/100_core/src/gplx/langs/regxs/Regx_adp.java b/100_core/src/gplx/langs/regxs/Regx_adp.java index 5d8bfd335..515fc200e 100644 --- a/100_core/src/gplx/langs/regxs/Regx_adp.java +++ b/100_core/src/gplx/langs/regxs/Regx_adp.java @@ -40,6 +40,7 @@ public class Regx_adp { return (Regx_match[])rv.To_ary(Regx_match.class); } private Pattern under; + public Pattern Under() {return under;} void Under_sync() { try {under = Pattern.compile(pattern, Pattern.DOTALL | Pattern.UNICODE_CHARACTER_CLASS);} // JRE.7:UNICODE_CHARACTER_CLASS; added during %w fix for en.w:A#; DATE:2015-06-10 catch (Exception e) { // NOTE: if invalid, then default to empty pattern (which should return nothing); EX:d:〆る generates [^]; DATE:2013-10-20 diff --git a/100_core/src/gplx/langs/regxs/Regx_group.java b/100_core/src/gplx/langs/regxs/Regx_group.java index eb32176f7..fb46c72a3 100644 --- a/100_core/src/gplx/langs/regxs/Regx_group.java +++ b/100_core/src/gplx/langs/regxs/Regx_group.java @@ -17,10 +17,21 @@ along with this program. If not, see . 
*/ package gplx.langs.regxs; import gplx.*; import gplx.langs.*; public class Regx_group { - public Regx_group(boolean rslt, int bgn, int end, String val) {this.rslt = rslt; this.bgn = bgn; this.end = end; this.val = val;} - public boolean Rslt() {return rslt;} private boolean rslt; - public int Bgn() {return bgn;} int bgn; - public int End() {return end;} int end; - public String Val() {return val;} private String val; - public static final Regx_group[] Ary_empty = new Regx_group[0]; + public Regx_group(boolean rslt, int bgn, int end, String val) { + this.rslt = rslt; + this.bgn = bgn; + this.end = end; + this.val = val; + } + public boolean Rslt() {return rslt;} private boolean rslt; + public int Bgn() {return bgn;} private int bgn; + public int End() {return end;} private int end; + public String Val() {return val;} private String val; + public void Init(boolean rslt, int bgn, int end, String val) { + this.rslt = rslt; + this.bgn = bgn; + this.end = end; + this.val = val; + } + public static final Regx_group[] Ary_empty = new Regx_group[0]; } diff --git a/100_core/src/gplx/langs/regxs/Regx_match.java b/100_core/src/gplx/langs/regxs/Regx_match.java index 34617151f..2a71106e1 100644 --- a/100_core/src/gplx/langs/regxs/Regx_match.java +++ b/100_core/src/gplx/langs/regxs/Regx_match.java @@ -24,5 +24,5 @@ public class Regx_match { public int Find_end() {return find_end;} int find_end; public int Find_len() {return find_end - find_bgn;} public Regx_group[] Groups() {return groups;} Regx_group[] groups = Regx_group.Ary_empty; - public static final Regx_match[] Ary_empty = new Regx_match[0]; + public static final Regx_match[] Ary_empty = new Regx_match[0]; }
diff --git a/100_core/src/gplx/langs/regxs/Regx_rslt.java b/100_core/src/gplx/langs/regxs/Regx_rslt.java
new file mode 100644
index 000000000..5ebc86522
--- /dev/null
+++ b/100_core/src/gplx/langs/regxs/Regx_rslt.java
@@ -0,0 +1,70 @@
+/*
+XOWA: the XOWA Offline Wiki Application
+Copyright (C) 2012 gnosygnu@gmail.com
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+package gplx.langs.regxs; import gplx.*; import gplx.langs.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+/**
+ * Cursor over successive matches of one Regx_adp against one String.
+ * Call Init once, then Match_next repeatedly; after each successful call
+ * Find_bgn / Find_end and the Groups__ accessors describe that match.
+ * Not thread-safe: it holds a mutable Matcher and hands out one reused Regx_group.
+ */
+public class Regx_rslt {// THREAD.UNSAFE
+  private int src_pos; // index at which the next Match_next starts searching
+  private int src_len; // length of the String given to Init; Matcher.find(int) rejects larger start indexes
+  private Regx_group tmp_grp = new Regx_group(false, -1, -1, null); // single instance reused by Groups__get_at
+  public Matcher match;
+  public int Groups__len() {return match.groupCount() + 1;} // +1 to include group=0 which is entire pattern
+  /**
+   * Returns the span of group i of the current match.
+   * The returned object is overwritten by the next call, so read Bgn / End before calling again.
+   * Rslt is always true and Val is always null.
+   */
+  public Regx_group Groups__get_at(int i) {
+    tmp_grp.Init(true, match.start(i), match.end(i), null);
+    return tmp_grp;
+  }
+  /** Binds this cursor to src; the first Match_next searches from src_bgn. */
+  public void Init(Regx_adp regex, String src, int src_bgn) {
+    match = regex.Under().matcher(src);
+    this.src_pos = src_bgn;
+    this.src_len = src.length();
+  }
+  /**
+   * Searches for the next match at or after the current position.
+   * @return true if a match was found; false once the source is exhausted
+   */
+  public boolean Match_next() {
+    // past the end of the source: Matcher.find(int) would throw IndexOutOfBoundsException, so report "no match"
+    if (src_pos > src_len) {
+      this.found = false;
+      return false;
+    }
+    this.found = match.find(src_pos);
+    if (found) {
+      this.find_bgn = match.start();
+      this.find_end = match.end();
+      // an empty match has find_end == find_bgn; resuming there would return it again forever, so step one char ahead
+      this.src_pos = find_end == find_bgn ? find_end + 1 : find_end;
+    }
+    return found;
+  }
+  public boolean Found() {return found;} private boolean found;
+  public int Find_bgn() {return find_bgn;} private int find_bgn;
+  public int Find_end() {return find_end;} private int find_end;
+}
\ No newline at end of file
diff --git a/400_xowa/src/gplx/xowa/mws/parsers/Xomw_parser.java b/400_xowa/src/gplx/xowa/mws/parsers/Xomw_parser.java index 48fd4a3ab..9aa4dd415 100644 --- a/400_xowa/src/gplx/xowa/mws/parsers/Xomw_parser.java +++ b/400_xowa/src/gplx/xowa/mws/parsers/Xomw_parser.java @@ -19,7 +19,7 @@ 
package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xo import gplx.core.btries.*; import gplx.core.net.*; import gplx.xowa.mws.parsers.prepros.*; import gplx.xowa.mws.parsers.headings.*; import gplx.xowa.mws.parsers.quotes.*; import gplx.xowa.mws.parsers.tables.*; import gplx.xowa.mws.parsers.hrs.*; import gplx.xowa.mws.parsers.nbsps.*; -import gplx.xowa.mws.parsers.lnkes.*; import gplx.xowa.mws.parsers.lnkis.*; +import gplx.xowa.mws.parsers.lnkes.*; import gplx.xowa.mws.parsers.lnkis.*; import gplx.xowa.mws.parsers.magiclinks.*; import gplx.xowa.mws.utls.*; import gplx.xowa.mws.linkers.*; public class Xomw_parser { private final Xomw_parser_ctx pctx = new Xomw_parser_ctx(); @@ -29,10 +29,13 @@ public class Xomw_parser { private final Xomw_nbsp_wkr nbsp_wkr = new Xomw_nbsp_wkr(); private final Xomw_block_level_pass block_wkr = new Xomw_block_level_pass(); private final Xomw_heading_wkr heading_wkr = new Xomw_heading_wkr(); + private final Xomw_magiclinks_wkr magiclinks_wkr = new Xomw_magiclinks_wkr(); private final Xomw_link_renderer link_renderer = new Xomw_link_renderer(); private final Xomw_link_holders holders; private final Xomw_heading_cbk__html heading_wkr_cbk; private final Btrie_slim_mgr protocols_trie; + private static volatile Xomw_regex_space regex_space; /* volatile: the ctor reads these without holding the lock */ + private static volatile Xomw_regex_url regex_url; /* assigned last, so non-null implies regex_space is set too */ private final Btrie_rv trv = new Btrie_rv(); private int marker_index = 0; // private final Xomw_prepro_wkr prepro_wkr = new Xomw_prepro_wkr(); @@ -51,10 +54,16 @@ public class Xomw_parser { this.lnke_wkr = new Xomw_lnke_wkr(this); this.lnki_wkr = new Xomw_lnki_wkr(this, holders, link_renderer, protocols_trie); this.heading_wkr_cbk = new Xomw_heading_cbk__html(); + /* build the shared regex helpers once; double-checked so later parsers skip the lock */ + if (regex_url == null) { + synchronized (Type_adp_.ClassOf_obj(this)) { + if (regex_url == null) {regex_space = new Xomw_regex_space(); regex_url = new Xomw_regex_url(regex_space);} /* re-check: another thread may have initialized while this one waited */ + } + } } public void Init_by_wiki(Xowe_wiki wiki) { linker.Init_by_wiki(wiki.Lang().Lnki_trail_mgr().Trie()); - 
lnke_wkr.Init_by_wiki(protocols_trie); + lnke_wkr.Init_by_wiki(protocols_trie, regex_url, regex_space); lnki_wkr.Init_by_wiki(wiki); } public void Internal_parse(Xomw_parser_bfr pbfr, byte[] text) { @@ -107,8 +116,8 @@ public class Xomw_parser { // replaceInternalLinks may sometimes leave behind // absolute URLs, which have to be masked to hide them from replaceExternalLinks Xomw_parser_bfr_.Replace(pbfr, Bry__marker__noparse, Bry_.Empty); + magiclinks_wkr.Do_magic_links(pctx, pbfr); -// $text = $this->doMagicLinks($text); // $text = $this->formatHeadings($text, $origText, $isMain); } diff --git a/400_xowa/src/gplx/xowa/mws/parsers/Xomw_parser__tst.java b/400_xowa/src/gplx/xowa/mws/parsers/Xomw_parser__tst.java index 1fd74535b..b6a66e4cb 100644 --- a/400_xowa/src/gplx/xowa/mws/parsers/Xomw_parser__tst.java +++ b/400_xowa/src/gplx/xowa/mws/parsers/Xomw_parser__tst.java @@ -21,37 +21,9 @@ public class Xomw_parser__tst { private final Xomw_parser__fxt fxt = new Xomw_parser__fxt(); @Test public void Basic() { fxt.Test__parse(String_.Concat_lines_nl_skip_last - ( "== heading_1 ==" - , "para_1" - , "== heading_2 ==" - , "para_2" - , "-----" - , "{|" - , "|-" - , "|a" - , "|}" - , "''italics''" - , "[https://a.org b]" - , "[[A|abc]]" - , "a »b«  !important c" + ("a https://c.org b" ), String_.Concat_lines_nl_skip_last - ( "

heading_1

" - , "

para_1" - , "

" - , "

heading_2

" - , "

para_2" - , "

" - , "
" - , "" - , "" - , "" - , "
a" - , "
" - , "

italics" - , "b" - , "abc" - , "a »b«  !important c" - , "

" + ( "" )); } }
diff --git a/400_xowa/src/gplx/xowa/mws/parsers/Xomw_regex_.java b/400_xowa/src/gplx/xowa/mws/parsers/Xomw_regex_.java
new file mode 100644
index 000000000..e098ff240
--- /dev/null
+++ b/400_xowa/src/gplx/xowa/mws/parsers/Xomw_regex_.java
@@ -0,0 +1,58 @@
+/*
+XOWA: the XOWA Offline Wiki Application
+Copyright (C) 2012 gnosygnu@gmail.com
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
+import gplx.core.btries.*;
+/** Scanning helpers shared by the trie-backed replacements for regex character classes. */
+public class Xomw_regex_ {
+  /**
+   * Skips forward over consecutive chars that are in trie.
+   * @param trie byte sequences to skip over
+   * @param trv result holder handed to the trie
+   * @param src bytes to scan
+   * @param src_bgn position to start scanning at
+   * @param src_end exclusive end of the text; nothing at or after it is read
+   * @return position of the first char not in trie, or src_end if every remaining char matched
+   */
+  public static int Find_fwd_while(Btrie_slim_mgr trie, Btrie_rv trv, byte[] src, int src_bgn, int src_end) {
+    int cur = src_bgn;
+    while (cur < src_end) { // stop at src_end; the unbounded loop indexed past the text when it ended in a matching char
+      byte b = src[cur];
+      if (trie.Match_at_w_b0(trv, b, src, cur, src_end) == null) break;
+      cur = Math.min(cur + gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(b), src_end); // a truncated trailing char must not push cur past src_end
+    }
+    return cur;
+  }
+  /**
+   * Skips forward until a char that is in trie.
+   * @param trie byte sequences that stop the scan
+   * @param trv result holder handed to the trie
+   * @param src bytes to scan
+   * @param src_bgn position to start scanning at
+   * @param src_end exclusive end of the text; nothing at or after it is read
+   * @return position of the first char in trie, or src_end if none was found
+   */
+  public static int Find_fwd_until(Btrie_slim_mgr trie, Btrie_rv trv, byte[] src, int src_bgn, int src_end) {
+    int cur = src_bgn;
+    while (cur < src_end) { // stop at src_end; the unbounded loop indexed past the text when no stop char followed
+      byte b = src[cur];
+      if (trie.Match_at_w_b0(trv, b, src, cur, src_end) != null) break;
+      cur = Math.min(cur + gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(b), src_end); // a truncated trailing char must not push cur past src_end
+    }
+    return cur;
+  }
+}
diff --git a/400_xowa/src/gplx/xowa/mws/parsers/Xomw_regex_boundary.java b/400_xowa/src/gplx/xowa/mws/parsers/Xomw_regex_boundary.java new file mode 100644 index 000000000..0d519f987 --- 
/dev/null
+++ b/400_xowa/src/gplx/xowa/mws/parsers/Xomw_regex_boundary.java
@@ -0,0 +1,48 @@
+/*
+XOWA: the XOWA Offline Wiki Application
+Copyright (C) 2012 gnosygnu@gmail.com
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
+import gplx.core.btries.*;
+/**
+ * Naive stand-in for a regex word-boundary check: a position counts as a boundary when it is
+ * at the start of src or is preceded by one of the chars listed by Xomw_regex_space.
+ */
+public class Xomw_regex_boundary { // THREAD.SAFE: trv is only for consistent interface
+  private final Btrie_slim_mgr trie = Btrie_slim_mgr.cs();
+  private final Btrie_rv trv = new Btrie_rv();
+  public Xomw_regex_boundary(Xomw_regex_space space) {
+    // naive implementation of is_boundary: only the chars in space.Ws() and space.Zs() count as boundary chars
+    Add_all(space.Ws());
+    Add_all(space.Zs());
+  }
+  /** Registers each byte sequence in keys with the trie; the stored value is only a marker. */
+  private void Add_all(byte[][] keys) {
+    for (byte[] key : keys)
+      trie.Add_bry_byte(key, Byte_.Zero);
+  }
+  /**
+   * Reports whether the char immediately before pos is a boundary char.
+   * @param src bytes being scanned
+   * @param pos position of the candidate match; 0 (beginning of src) is always a boundary
+   * @return true if pos is 0 or the preceding char is in the boundary trie
+   */
+  public boolean Is_boundary_prv(byte[] src, int pos) {
+    if (pos == 0) return true; // BOS is true
+    int prv_bgn = gplx.core.intls.Utf8_.Get_pos0_of_char_bwd(src, pos - 1); // presumably the first byte of the char ending just before pos
+    return trie.Match_at_w_b0(trv, src[prv_bgn], src, prv_bgn, pos) != null;
+  }
+}
diff --git a/400_xowa/src/gplx/xowa/mws/parsers/Xomw_regex_space.java b/400_xowa/src/gplx/xowa/mws/parsers/Xomw_regex_space.java
new file mode 100644
index 000000000..21c7eef1c
--- /dev/null
+++ b/400_xowa/src/gplx/xowa/mws/parsers/Xomw_regex_space.java
@@ -0,0 +1,73 @@
+/*
+XOWA: the XOWA Offline Wiki Application
+Copyright (C) 2012 gnosygnu@gmail.com
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
+import gplx.core.btries.*;
+/**
+ * Table of space chars as UTF-8 byte sequences: ASCII whitespace (Ws) and the Unicode Zs category (Zs).
+ * Used in place of regex classes such as \p{Zs} when scanning byte arrays.
+ */
+public class Xomw_regex_space {
+  private final Btrie_slim_mgr trie = Btrie_slim_mgr.cs();
+  public Xomw_regex_space() {
+    byte[] space = Bry_.New_by_ints(32);
+    ws = new byte[][]
+    { space
+    , Bry_.New_by_ints(9)
+    , Bry_.New_by_ints(10)
+    , Bry_.New_by_ints(13)
+    };
+    // Zs; REF:http://www.fileformat.info/info/unicode/category/Zs/list.htm
+    zs = new byte[][]
+    { space
+    , Bry_.New_by_ints(194, 160)      // U+00A0
+    , Bry_.New_by_ints(225, 154, 128) // U+1680
+    , Bry_.New_by_ints(226, 128, 128) // U+2000 EN QUAD; part of Zs but absent from the original table
+    , Bry_.New_by_ints(226, 128, 129) // U+2001
+    , Bry_.New_by_ints(226, 128, 130)
+    , Bry_.New_by_ints(226, 128, 131)
+    , Bry_.New_by_ints(226, 128, 132)
+    , Bry_.New_by_ints(226, 128, 133)
+    , Bry_.New_by_ints(226, 128, 134)
+    , Bry_.New_by_ints(226, 128, 135)
+    , Bry_.New_by_ints(226, 128, 136)
+    , Bry_.New_by_ints(226, 128, 137)
+    , Bry_.New_by_ints(226, 128, 138) // U+200A
+    , Bry_.New_by_ints(226, 128, 175) // U+202F
+    , Bry_.New_by_ints(226, 129, 159) // U+205F
+    , Bry_.New_by_ints(227, 128, 128) // U+3000
+    };
+
+    Add_all(ws);
+    Add_all(zs);
+  }
+  /** Registers each byte sequence in keys with the trie; the stored value is only a marker. */
+  private void Add_all(byte[][] keys) {
+    for (byte[] key : keys) {
+      trie.Add_bry_byte(key, Byte_.Zero);
+    }
+  }
+  public byte[][] Ws() {return ws;} private byte[][] ws;
+  public byte[][] Zs() {return zs;} private byte[][] zs;
+  /**
+   * Skips forward over consecutive Ws / Zs chars.
+   * @return position of the first char that is not a space char
+   */
+  public int Find_fwd_while(Btrie_rv trv, byte[] src, int src_bgn, int src_end) {
+    return Xomw_regex_.Find_fwd_while(trie, trv, src, src_bgn, src_end);
+  }
+}
diff --git a/400_xowa/src/gplx/xowa/mws/parsers/Xomw_regex_url.java b/400_xowa/src/gplx/xowa/mws/parsers/Xomw_regex_url.java
new file mode 100644
index 000000000..fbcba0bbe
--- /dev/null
+++ b/400_xowa/src/gplx/xowa/mws/parsers/Xomw_regex_url.java
@@ -0,0 +1,46 @@
+/*
+XOWA: the XOWA Offline Wiki Application
+Copyright (C) 2012 gnosygnu@gmail.com
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*;
+import gplx.core.btries.*;
+/**
+ * Table of the chars that end a URL, i.e. the chars excluded by the char class quoted in the ctor.
+ */
+public class Xomw_regex_url {
+  private final Btrie_slim_mgr trie;
+  public Xomw_regex_url(Xomw_regex_space regex_space) {
+    // REGEX:[^][<>"\\x00-\\x20\\x7F\p{Zs}]; NOTE: val is just a marker
+    this.trie = Btrie_slim_mgr.cs();
+    trie.Add_str_byte__many(Byte_.Zero, "[", "]", "<", ">", "\"");
+    for (byte i = 0; i < 33; i++) { // x00-x20
+      trie.Add_bry_byte(new byte[] {i}, Byte_.Zero);
+    }
+    trie.Add_bry_byte(Bry_.New_by_ints(127), Byte_.Zero); // x7F
+
+    byte[][] zs_ary = regex_space.Zs();
+    for (byte[] zs : zs_ary) {
+      trie.Add_bry_byte(zs, Byte_.Zero);
+    }
+  }
+  /**
+   * Skips forward while chars are valid URL chars; implemented as "until a char in the invalid-char trie".
+   * @return position of the first char that cannot be part of a URL
+   */
+  public int Find_fwd_while(Btrie_rv trv, byte[] src, int src_bgn, int src_end) {
+    return Xomw_regex_.Find_fwd_until(trie, trv, src, src_bgn, src_end);
+  }
+}
diff --git a/400_xowa/src/gplx/xowa/mws/parsers/lnkes/Xomw_lnke_wkr.java b/400_xowa/src/gplx/xowa/mws/parsers/lnkes/Xomw_lnke_wkr.java index 8c0311721..78e3f911b 100644 --- a/400_xowa/src/gplx/xowa/mws/parsers/lnkes/Xomw_lnke_wkr.java +++ b/400_xowa/src/gplx/xowa/mws/parsers/lnkes/Xomw_lnke_wkr.java @@ -26,12 +26,16 @@ public class Xomw_lnke_wkr {// THREAD.UNSAFE: caching for repeated calls private int autonumber; private final Xomw_linker linker; private final Xomwh_atr_mgr attribs = new Xomwh_atr_mgr(); + private Xomw_regex_url regex_url; + private Xomw_regex_space regex_space; public Xomw_lnke_wkr(Xomw_parser mgr) { this.tmp = mgr.Tmp(); this.linker = mgr.Linker(); } - public void Init_by_wiki(Btrie_slim_mgr protocol_trie) { + public void Init_by_wiki(Btrie_slim_mgr protocol_trie, Xomw_regex_url regex_url, Xomw_regex_space regex_space) { this.protocol_trie = protocol_trie; + this.regex_url = regex_url; + this.regex_space = regex_space; } public void Replace_external_links(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) { // XO.PBFR @@ -101,14 +105,7 @@ public class Xomw_lnke_wkr {// THREAD.UNSAFE: caching for repeated calls // check for 
one-or-more url chars; [^][<>"\\x00-\\x20\\x7F\p{Zs}] int domain_bgn = cur; - while (true) { - byte b = src[cur]; - Object url_char_byte = invalid_url_chars_trie.Match_at_w_b0(trv, b, src, cur, src_end); - if (url_char_byte == null) - cur += gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(b); - else - break; - } + cur = regex_url.Find_fwd_while(trv, src, domain_bgn, src_end); if (cur - domain_bgn == 0) { bfr.Add_mid(src, prv, cur); prv = cur; @@ -116,14 +113,8 @@ public class Xomw_lnke_wkr {// THREAD.UNSAFE: caching for repeated calls } int url_end = cur; - // get ws (if any) - int ws_bgn = -1; - while (true) { - Object space_byte = space_chars_trie.Match_at(trv, src, cur, src_end); - if (space_byte == null) break; - if (ws_bgn == -1) ws_bgn = cur; - cur += ((Int_obj_val)space_byte).Val(); - } + // skip ws + cur = regex_space.Find_fwd_while(trv, src, cur, src_end); // get text (if any) int text_bgn = -1, text_end = -1; @@ -244,27 +235,7 @@ public class Xomw_lnke_wkr {// THREAD.UNSAFE: caching for repeated calls , Link_type__autonumber = Bry_.new_a7("autonumber") ; - private static final Btrie_slim_mgr - invalid_url_chars_trie = New__invalid_url_chars_trie() - , space_chars_trie = New__space_chars_trie() - , invalid_text_chars_trie = New__invalid_text_chars_trie() - ; - private static Btrie_slim_mgr New__invalid_url_chars_trie() { // REGEX:[^][<>"\\x00-\\x20\\x7F\p{Zs}]; NOTE: val is just a marker - Btrie_slim_mgr rv = Btrie_slim_mgr.cs(); - rv.Add_str_byte__many(Byte_.Zero, "[", "]", "<", ">", "\""); - for (byte i = 0; i < 33; i++) { - rv.Add_bry_byte(new byte[] {i}, Byte_.Zero); - } - rv.Add_bry_byte(Bry_.New_by_ints(127), Byte_.Zero); // x7F - rv.Add_bry_byte(Bry_.New_by_ints(227, 128, 128), Byte_.Zero); // \p{Zs} // e3 80 80; https://phabricator.wikimedia.org/T21052 - return rv; - } - private static Btrie_slim_mgr New__space_chars_trie() { // REGEX:\p{Zs}; NOTE: val is key.length - Btrie_slim_mgr rv = Btrie_slim_mgr.cs(); - New__trie_itm__by_len(rv, 32); - 
New__trie_itm__by_len(rv, 227, 128, 128); // \p{Zs} // e3 80 80; https://phabricator.wikimedia.org/T21052 - return rv; - } + private static final Btrie_slim_mgr invalid_text_chars_trie = New__invalid_text_chars_trie(); private static Btrie_slim_mgr New__invalid_text_chars_trie() { // REGEX:([^\]\\x00-\\x08\\x0a-\\x1F]*?); NOTE: val is key.length Btrie_slim_mgr rv = Btrie_slim_mgr.cs(); New__trie_itm__by_len(rv, Byte_ascii.Brack_end); diff --git a/400_xowa/src/gplx/xowa/mws/parsers/lnkes/Xomw_lnke_wkr__tst.java b/400_xowa/src/gplx/xowa/mws/parsers/lnkes/Xomw_lnke_wkr__tst.java index d189771cc..28311b621 100644 --- a/400_xowa/src/gplx/xowa/mws/parsers/lnkes/Xomw_lnke_wkr__tst.java +++ b/400_xowa/src/gplx/xowa/mws/parsers/lnkes/Xomw_lnke_wkr__tst.java @@ -45,7 +45,8 @@ class Xomw_lnke_wkr__fxt { private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr(); private boolean apos = true; public Xomw_lnke_wkr__fxt() { - wkr.Init_by_wiki(Xomw_parser.Protocols__dflt()); + Xomw_regex_space regex_space = new Xomw_regex_space(); + wkr.Init_by_wiki(Xomw_parser.Protocols__dflt(), new Xomw_regex_url(regex_space), regex_space); } public void Test__parse(String src_str, String expd) { byte[] src_bry = Bry_.new_u8(src_str); diff --git a/400_xowa/src/gplx/xowa/mws/parsers/magiclinks/Xomw_magiclinks_wkr.java b/400_xowa/src/gplx/xowa/mws/parsers/magiclinks/Xomw_magiclinks_wkr.java index 54323a173..576d684f1 100644 --- a/400_xowa/src/gplx/xowa/mws/parsers/magiclinks/Xomw_magiclinks_wkr.java +++ b/400_xowa/src/gplx/xowa/mws/parsers/magiclinks/Xomw_magiclinks_wkr.java @@ -17,315 +17,229 @@ along with this program. If not, see . 
*/ package gplx.xowa.mws.parsers.magiclinks; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*; import gplx.core.primitives.*; import gplx.core.btries.*; import gplx.core.net.*; -import gplx.langs.phps.utls.*; -// public class Xomw_magiclinks_wkr { -// private final Btrie_slim_mgr regex_trie = Btrie_slim_mgr.ci_a7(); // NOTE: must be ci to handle protocols; EX: "https:" and "HTTPS:" -// private final Btrie_rv trv = new Btrie_rv(); -// public Xomw_magiclinks_wkr() { -// } -// private static byte[] Tag__anch__rhs, Prefix__rfc, Prefix__pmid; -// -// private static final byte Space__tab = 1, Space__nbsp_ent = 2, Space__nbsp_dec = 3, Space__nbsp_hex = 4; -// private static Btrie_slim_mgr space_trie; -// // static final SPACE_NOT_NL = '(?:\t| |&\#0*160;|&\#[Xx]0*[Aa]0;|\p{Zs})'; -//// public void Test() { -//// regex.Add("\t", Space__tab); -//// regex.Add(" ", Space__nbsp__ent); -//// regex.Add(Regex.Make("&#").Star("0").Add("160;"), Space__nbsp__dec); -//// regex.Add(Regex.Make("&#").Brack("X", "x").Star("0").Brack("A", "a").Add("0"), Space__nbsp__hex); -//// } -// public int Find_fwd_space(byte[] src, int cur, int src_end) { -// return -1; -// } -// -// private static final byte Regex__anch = 1, Regex__elem = 2, Regex__free = 3, Regex__rfc = 5, Regex__isbn = 6, Regex__pmid = 7; -// public void Init_by_wiki() { -// regex_trie.Add_str_byte(""); -// Prefix__rfc = Bry_.new_a7("RFC"); -// Prefix__pmid = Bry_.new_a7("PMID"); -// space_trie = Btrie_slim_mgr.ci_a7() -// .Add_str_byte("\t", Space__tab) -// .Add_str_byte(" ", Space__nbsp_ent) -// .Add_str_byte("&#", Space__nbsp_dec) -// .Add_str_byte("&x", Space__nbsp_hex) -// ; -// } -// } -// } -// -// // Replace special strings like "ISBN xxx" and "RFC xxx" with -// // magic external links. 
-// public void Do_magic_links(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) { -// // XO.PBFR -// Bry_bfr src_bfr = pbfr.Src(); -// byte[] src = src_bfr.Bfr(); -// int src_bgn = 0; -// int src_end = src_bfr.Len(); -// Bry_bfr bfr = pbfr.Trg(); -// -// int cur = src_bgn; -// int prv = cur; -// boolean dirty = true; -// while (true) { -// if (cur == src_end) { -// if (dirty) -// bfr.Add_mid(src, prv, src_end); -// break; -// } -// -// byte b = src[cur]; -// Object o = regex_trie.Match_at_w_b0(trv, b, src, cur, src_end); -// // current byte doesn't look like magiclink; continue; -// if (o == null) { -// cur++; -// continue; -// } -// // looks like magiclink; do additional processing -// byte regex_tid = ((Byte_obj_ref)o).Val(); -// int trv_pos = trv.Pos(); -// int nxt_pos = trv_pos; -// boolean regex_valid = true; -// switch (regex_tid) { -// case Regex__anch: // (].*?) | // m[1]: Skip link text -// if (trv_pos < src_end) { -// // find ws in "[ \t\r\n>]" -// byte ws_byte = src[cur]; -// switch (ws_byte) { -// case Byte_ascii.Space: -// case Byte_ascii.Tab: -// case Byte_ascii.Cr: -// case Byte_ascii.Nl: -// break; -// default: -// regex_valid = false; -// break; -// } -// if (regex_valid) { -// // find -// nxt_pos++; -// int anch_end = Bry_find_.Find_fwd(src, Tag__anch__rhs, nxt_pos, src_end); -// if (anch_end == Bry_find_.Not_found) { -// regex_valid = false; -// } -// else { -// cur = anch_end + Tag__anch__rhs.length; -// } -// } -// } -// else { -// regex_valid = false; -// } -// break; -// case Regex__elem: // (<.*?>) | // m[2]: Skip stuff inside -// // just find ">" -// int elem_end = Bry_find_.Find_fwd(src, Byte_ascii.Angle_end, nxt_pos, src_end); -// if (elem_end == Bry_find_.Not_found) -// regex_valid = false; -// else -// cur = elem_end + 1; -// break; -// case Regex__free: -// // addr; urlchar -// break; -// case Regex__rfc: -// case Regex__pmid: -// // byte[] prefix = regex == Regex__rfc ? 
Prefix__rfc : Prefix__pmid; -// // match previous for case sensitivity -//// if (Bry_.Eq(src, trv_pos - prefix.length - 1, trv_pos - 1, prefix)) { -//// -//// } -//// else { -//// regex_valid = false; -//// } -// break; -// } -// -//// '!(?: // Start cases -//// (].*?) | // m[1]: Skip link text -//// (<.*?>) | // m[2]: Skip stuff inside -//// // HTML elements' . " -//// (\b(?i:$prots)($addr$urlChar*)) | // m[3]: Free external links -//// // m[4]: Post-protocol path -//// \b(?:RFC|PMID) $spaces // m[5]: RFC or PMID, capture number -//// ([0-9]+)\b | -//// \bISBN $spaces ( // m[6]: ISBN, capture number -//// (?: 97[89] $spdash?)? // optional 13-digit ISBN prefix -//// (?: [0-9] $spdash?){9} // 9 digits with opt. delimiters -//// [0-9Xx] // check digit -//// )\b -// -// } -// if (dirty) -// pbfr.Switch(); +import gplx.langs.phps.utls.*; import gplx.xowa.mws.htmls.*; +import gplx.langs.regxs.*; +public class Xomw_magiclinks_wkr { + private final Btrie_slim_mgr regex_trie = Btrie_slim_mgr.ci_a7(); // NOTE: must be ci to handle protocols; EX: "https:" and "HTTPS:" + private final Btrie_rv trv = new Btrie_rv(); + private static byte[] Tag__anch__rhs; + private Xomw_regex_boundary regex_boundary; + private Xomw_regex_url regex_url; + private Xomw_linker linker; + private byte[] page_title; -// $prots = wfUrlProtocolsWithoutProtRel(); -// $urlChar = self::EXT_LINK_URL_CLASS; -// $addr = self::EXT_LINK_ADDR; -// $space = self::SPACE_NOT_NL; // non-newline space -// $spdash = "(?:-|$space)"; // a dash or a non-newline space -// $spaces = "$space++"; // possessive match of 1 or more spaces -// $text = preg_replace_callback( -// '!(?: // Start cases -// (].*?) | // m[1]: Skip link text -// (<.*?>) | // m[2]: Skip stuff inside -// // HTML elements' . 
" -// (\b(?i:$prots)($addr$urlChar*)) | // m[3]: Free external links -// // m[4]: Post-protocol path -// \b(?:RFC|PMID) $spaces // m[5]: RFC or PMID, capture number -// ([0-9]+)\b | -// \bISBN $spaces ( // m[6]: ISBN, capture number -// (?: 97[89] $spdash?)? // optional 13-digit ISBN prefix -// (?: [0-9] $spdash?){9} // 9 digits with opt. delimiters -// [0-9Xx] // check digit -// )\b -// )!xu", [ &$this, 'magicLinkCallback' ], $text); -// return $text; -// } + private static final byte Regex__anch = 1, Regex__elem = 2, Regex__free = 3; + public void Init_by_wiki(Xomw_linker linker, Xomw_regex_boundary regex_boundary, Xomw_regex_url regex_url) { + this.linker = linker; + this.regex_boundary = regex_boundary; + this.regex_url = regex_url; + regex_trie.Add_str_byte("makeFreeExternalLink($m[0], strlen($m[4])); -// } else if (isset($m[5]) && $m[5] !== '') { -// // RFC or PMID -// if (substr($m[0], 0, 3) === 'RFC') { -// if (!$this->mOptions->getMagicRFCLinks()) { -// return $m[0]; -// } -// $keyword = 'RFC'; -// $urlmsg = 'rfcurl'; -// $cssClass = 'mw-magiclink-rfc'; -// $trackingCat = 'magiclink-tracking-rfc'; -// $id = $m[5]; -// } else if (substr($m[0], 0, 4) === 'PMID') { -// if (!$this->mOptions->getMagicPMIDLinks()) { -// return $m[0]; -// } -// $keyword = 'PMID'; -// $urlmsg = 'pubmedurl'; -// $cssClass = 'mw-magiclink-pmid'; -// $trackingCat = 'magiclink-tracking-pmid'; -// $id = $m[5]; -// } else { -// throw new MWException(__METHOD__ . ': unrecognised match type "' . -// substr($m[0], 0, 20) . 
'"'); -// } -// $url = wfMessage($urlmsg, $id)->inContentLanguage()->text(); -// $this->addTrackingCategory($trackingCat); -// return Linker::makeExternalLink($url, "{$keyword} {$id}", true, $cssClass, [], $this->mTitle); -// } else if (isset($m[6]) && $m[6] !== '' -// && $this->mOptions->getMagicISBNLinks() -// ) { -// // ISBN -// $isbn = $m[6]; -// $space = self::SPACE_NOT_NL; // non-newline space -// $isbn = preg_replace("/$space/", ' ', $isbn); -// $num = strtr($isbn, [ -// '-' => '', -// ' ' => '', -// 'x' => 'X', -// ]); -// $this->addTrackingCategory('magiclink-tracking-isbn'); -// return $this->getLinkRenderer()->makeKnownLink( -// SpecialPage::getTitleFor('Booksources', $num), -// "ISBN $isbn", -// [ -// 'class' => '@gplx.Internal protected mw-magiclink-isbn', -// 'title' => false // suppress title attribute -// ] -// ); -// } else { -// return $m[0]; -// } + if (Tag__anch__rhs == null) { + synchronized (Type_adp_.ClassOf_obj(this)) { + Tag__anch__rhs = Bry_.new_a7(""); + } + } + } - // Make a free external link, given a user-supplied URL -// public void Make_free_external_link(byte[] url, int num_post_proto) { -// byte[] trail = Bry_.Empty; + // Replace special strings like "ISBN xxx" and "RFC xxx" with + // magic external links. + public void Do_magic_links(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) { + // XO.PBFR + Bry_bfr src_bfr = pbfr.Src(); + byte[] src = src_bfr.Bfr(); + int src_bgn = 0; + int src_end = src_bfr.Len(); + Bry_bfr bfr = pbfr.Trg(); - // The characters '<' and '>' (which were escaped by - // removeHTMLtags()) should not be included in - // URLs, per RFC 2396. - // Make   terminate a URL as well (bug T84937) -// $m2 = []; -// if (preg_match( -// '/&(lt|gt|nbsp|#x0*(3[CcEe]|[Aa]0)|#0*(60|62|160));/', -// $url, -// $m2, -// PREG_OFFSET_CAPTURE -// )) { -// trail = substr($url, $m2[0][1]) . 
$trail; -// $url = substr($url, 0, $m2[0][1]); -// } + int cur = src_bgn; + int prv = cur; + boolean dirty = true; + // PORTED.REGEX: handle below + // NOTE: not handling RFC|PMID|ISBN b/c of upcoming obsolescence: https://www.mediawiki.org/wiki/Requests_for_comment/Future_of_magic_links + //'!(?: // Start cases + // (].*?) | // m[1]: Skip link text + // (<.*?>) | // m[2]: Skip stuff inside + // // HTML elements' . " + // (\b(?i:$prots)($addr$urlChar*)) | // m[3]: Free external links + // // m[4]: Post-protocol path + // \b(?:RFC|PMID) $spaces // m[5]: RFC or PMID, capture number + // ([0-9]+)\b | + // \bISBN $spaces ( // m[6]: ISBN, capture number + // (?: 97[89] $spdash?)? // optional 13-digit ISBN prefix + // (?: [0-9] $spdash?){9} // 9 digits with opt. delimiters + // [0-9Xx] // check digit + // )\b + while (true) { + if (cur == src_end) { + if (dirty) + bfr.Add_mid(src, prv, src_end); + break; + } - // Move trailing punctuation to $trail -// $sep = ',;\.:!?'; - // If there is no left bracket, then consider right brackets fair game too -// if (strpos($url, '(') === false) { -// $sep .= ')'; -// } + byte b = src[cur]; + Object o = regex_trie.Match_at_w_b0(trv, b, src, cur, src_end); + // current byte doesn't look like magiclink; continue; + if (o == null) { + cur++; + continue; + } -// $urlRev = strrev($url); -// $numSepChars = strspn($urlRev, $sep); - // Don't break a trailing HTML entity by moving the ; into $trail - // This is in hot code, so use substr_compare to avoid having to - // create a new String Object for the comparison -// if ($numSepChars && substr_compare($url, ";", -$numSepChars, 1) === 0) { - // more optimization: instead of running preg_match with a $ - // anchor, which can be slow, do the match on the reversed - // String starting at the desired offset. 
- // un-reversed regexp is: /&([a-z]+|#x[\da-f]+|#\d+)$/i -// if (preg_match('/\G([a-z]+|[\da-f]+x#|\d+#)&/i', $urlRev, $m2, 0, $numSepChars)) { -// $numSepChars--; -// } -// } -// if ($numSepChars) { -// $trail = substr($url, -$numSepChars) . $trail; -// $url = substr($url, 0, -$numSepChars); -// } + // looks like magiclink; do additional processing + byte regex_tid = ((Byte_obj_val)o).Val(); + int old_pos = cur; + int trv_pos = trv.Pos(); + int nxt_pos = trv_pos; + boolean regex_valid = true; + switch (regex_tid) { + case Regex__anch: // (].*?) | // m[1]: Skip link text + if (trv_pos < src_end) { + // find ws in "[ \t\r\n>]" + byte ws_byte = src[trv_pos]; /* byte after the matched prefix; src[cur] is the prefix's own first byte and can never be ws */ + switch (ws_byte) { + case Byte_ascii.Space: + case Byte_ascii.Tab: + case Byte_ascii.Cr: + case Byte_ascii.Nl: case Byte_ascii.Angle_end: /* '>' is part of the char class quoted above */ + break; + default: + regex_valid = false; + break; + } + if (regex_valid) { + // find + nxt_pos++; + int anch_end = Bry_find_.Find_fwd(src, Tag__anch__rhs, nxt_pos, src_end); + if (anch_end == Bry_find_.Not_found) { + regex_valid = false; + } + else { + cur = anch_end + Tag__anch__rhs.length; + } + } + } + else { + regex_valid = false; + } + break; + case Regex__elem: // (<.*?>) | // m[2]: Skip stuff inside + // just find ">" + int elem_end = Bry_find_.Find_fwd(src, Byte_ascii.Angle_end, nxt_pos, src_end); + if (elem_end == Bry_find_.Not_found) + regex_valid = false; + else + cur = elem_end + 1; + break; + case Regex__free: + if (regex_boundary.Is_boundary_prv(src, cur)) { + int url_end = regex_url.Find_fwd_while(trv, src, nxt_pos, src_end); + if (url_end == nxt_pos) { + regex_valid = false; + } + else + cur = url_end; + } + else + regex_valid = false; + break; + } + if (!regex_valid) { + cur++; + } + else { + if (regex_tid == Regex__free) { + this.page_title = pctx.Page_title().Full_db(); + dirty = true; + bfr.Add_mid(src, prv, old_pos); + this.Make_free_external_link(bfr, Bry_.Mid(src, old_pos, cur), 0); + prv = cur; + } + else { + } + } + } + if (dirty) { + pbfr.Switch(); + } + } - // Verify that we 
still have a real URL after trail removal, and - // not just lone protocol -// if (strlen($trail) >= $numPostProto) { -// return $url . $trail; -// } + // Make a free external link, given a user-supplied URL + /** Partial Java port of the PHP kept in the surrounding comments. Hands bfr and url to linker.Make_external_link, using url as both the link target and the caption, together with the link type "free", a newly created Xomwh_atr_mgr and the page_title field. num_post_proto is never read here: the trail / trailing-punctuation handling that would use it exists only as commented-out PHP. Returns nothing. */ public void Make_free_external_link(Bry_bfr bfr, byte[] url, int num_post_proto) { +// byte[] trail = Bry_.Empty; + + // The characters '<' and '>' (which were escaped by + // removeHTMLtags()) should not be included in + // URLs, per RFC 2396. + // Make &nbsp; terminate a URL as well (bug T84937) + +// $m2 = []; +// if (preg_match( +// '/&(lt|gt|nbsp|#x0*(3[CcEe]|[Aa]0)|#0*(60|62|160));/', +// $url, +// $m2, +// PREG_OFFSET_CAPTURE +// )) { +// trail = substr($url, $m2[0][1]) . $trail; +// $url = substr($url, 0, $m2[0][1]); +// } + + // Move trailing punctuation to $trail +// $sep = ',;\.:!?'; + // If there is no left bracket, then consider right brackets fair game too +// if (strpos($url, '(') === false) { +// $sep .= ')'; +// } -// $url = Sanitizer::cleanUrl($url); +// $urlRev = strrev($url); +// $numSepChars = strspn($urlRev, $sep); + // Don't break a trailing HTML entity by moving the ; into $trail + // This is in hot code, so use substr_compare to avoid having to + // create a new String Object for the comparison - // Is this an external image? -// $text = $this->maybeMakeExternalImage($url); -// if ($text === false) { - // Not an image, make a link -// $text = Linker::makeExternalLink($url, -// $this->getConverterLanguage()->markNoConversion($url, true), -// true, 'free', -// $this->getExternalLinkAttribs($url), $this->mTitle); - // Register it in the output Object...
- // Replace unnecessary URL escape codes with their equivalent characters -// $pasteurized = self::normalizeLinkUrl($url); -// $this->mOutput->addExternalLink($pasteurized); +// if ($numSepChars && substr_compare($url, ";", -$numSepChars, 1) === 0) { + // more optimization: instead of running preg_match with a $ + // anchor, which can be slow, do the match on the reversed + // String starting at the desired offset. + // un-reversed regexp is: /&([a-z]+|#x[\da-f]+|#\d+)$/i +// if (preg_match('/\G([a-z]+|[\da-f]+x#|\d+#)&/i', $urlRev, $m2, 0, $numSepChars)) { +// $numSepChars--; // } -// return $text . $trail; // } -// } -// } +// if ($numSepChars) { +// $trail = substr($url, -$numSepChars) . $trail; +// $url = substr($url, 0, -$numSepChars); +// } + + // Verify that we still have a real URL after trail removal, and + // not just lone protocol +// if (strlen($trail) >= $numPostProto) { +// return $url . $trail; +// } + +// $url = Sanitizer::cleanUrl($url); + + // Is this an external image? + /* NOTE: the maybeMakeExternalImage step is not ported, so text is always null at this point and the branch below always runs */ byte[] text = null; // $this->maybeMakeExternalImage($url); + if (text == null) { + // Not an image, make a link + linker.Make_external_link(bfr, url + , url // $this->getConverterLanguage()->markNoConversion($url, true), + , true, Bry_.new_a7("free") + , new Xomwh_atr_mgr() // $this->getExternalLinkAttribs($url) + , page_title); + // Register it in the output Object... + // Replace unnecessary URL escape codes with their equivalent characters +// $pasteurized = self::normalizeLinkUrl($url); +// $this->mOutput->addExternalLink($pasteurized); + } +// return $text .
$trail; + } +} diff --git a/400_xowa/src/gplx/xowa/mws/parsers/magiclinks/Xomw_magiclinks_wkr__tst.java b/400_xowa/src/gplx/xowa/mws/parsers/magiclinks/Xomw_magiclinks_wkr__tst.java new file mode 100644 index 000000000..842a00271 --- /dev/null +++ b/400_xowa/src/gplx/xowa/mws/parsers/magiclinks/Xomw_magiclinks_wkr__tst.java @@ -0,0 +1,45 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012 gnosygnu@gmail.com + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as +published by the Free Software Foundation, either version 3 of the +License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ +package gplx.xowa.mws.parsers.magiclinks; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*; +import org.junit.*; +/** Tests for Xomw_magiclinks_wkr.Do_magic_links, driven through Xomw_magiclinks_wkr__fxt. Basic feeds a bare https URL between two words; Invalid feeds the same URL prefixed with an underscore. NOTE(review): as written here the expected string of each case is identical to its input; confirm against the repository that no expected markup was lost from these literals. */ public class Xomw_magiclinks_wkr__tst { + private final Xomw_magiclinks_wkr__fxt fxt = new Xomw_magiclinks_wkr__fxt(); + @Test public void Basic() {fxt.Test__parse("a https://b.org c", "a https://b.org c");} + @Test public void Invalid() {fxt.Test__parse("a _https://b.org c", "a _https://b.org c");} +} +/** Test fixture for Xomw_magiclinks_wkr. The constructor builds an app and a wiki through Xoa_app_fxt, initializes pctx with the title parsed from "Page_1", and initializes wkr with a new Xomw_linker plus a Xomw_regex_boundary and a Xomw_regex_url that share one Xomw_regex_space. */ class Xomw_magiclinks_wkr__fxt { + private final Xomw_magiclinks_wkr wkr = new Xomw_magiclinks_wkr(); + private final Xomw_parser_ctx pctx = new Xomw_parser_ctx(); + private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr(); + private boolean apos = true; + public Xomw_magiclinks_wkr__fxt() { + Xoae_app app = Xoa_app_fxt.Make__app__edit(); + Xowe_wiki wiki = Xoa_app_fxt.Make__wiki__edit(app); + + Xomw_regex_space regex_space = new Xomw_regex_space(); + pctx.Init_by_page(wiki.Ttl_parse(Bry_.new_a7("Page_1"))); + wkr.Init_by_wiki(new Xomw_linker(), new Xomw_regex_boundary(regex_space), new Xomw_regex_url(regex_space)); + } + /** Loads src_str into pbfr, runs wkr.Do_magic_links(pctx, pbfr) and compares pbfr's result with expd through Tfds.Eq_str_lines. While the apos flag is true (nothing in this class sets it to false) expd is first passed through Gfh_utl.Replace_apos. src_str is also handed to Tfds.Eq_str_lines as its third argument, presumably as the failure label; confirm against Tfds. */ public void Test__parse(String src_str, String expd) { + byte[] src_bry = Bry_.new_u8(src_str); + pbfr.Init(src_bry); + wkr.Do_magic_links(pctx, pbfr); + if (apos) expd = gplx.langs.htmls.Gfh_utl.Replace_apos(expd); + Tfds.Eq_str_lines(expd, pbfr.Rslt().To_str_and_clear(), src_str); + } +}