From d98639c18a302779d4ade2aa14bdc18e55eff38e Mon Sep 17 00:00:00 2001 From: gnosygnu Date: Sun, 8 Jul 2018 07:51:43 -0400 Subject: [PATCH] Scribunto: Handle fake capturing groups from balanced regex --- 100_core/src/gplx/langs/regxs/Regx_match.java | 18 ++- .../scribunto/libs/Scrib_lib_ustring.java | 21 ++- .../libs/Scrib_lib_ustring__match__tst.java | 8 +- .../Scrib_lib_ustring__shell_cmd__tst.java | 6 +- .../scribunto/libs/Scrib_regx_converter.java | 148 ++++++++++-------- .../libs/Scrib_regx_converter_tst.java | 4 +- .../scribunto/libs/Scrib_regx_group_mgr.java | 90 +++++++++++ 7 files changed, 207 insertions(+), 88 deletions(-) create mode 100644 400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_group_mgr.java diff --git a/100_core/src/gplx/langs/regxs/Regx_match.java b/100_core/src/gplx/langs/regxs/Regx_match.java index 72ee22be9..7eed05e5a 100644 --- a/100_core/src/gplx/langs/regxs/Regx_match.java +++ b/100_core/src/gplx/langs/regxs/Regx_match.java @@ -14,13 +14,19 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt */ package gplx.langs.regxs; import gplx.*; import gplx.langs.*; -public class Regx_match { - public Regx_match(boolean rslt, int find_bgn, int find_end, Regx_group[] groups) {this.rslt = rslt; this.find_bgn = find_bgn; this.find_end = find_end; this.groups = groups;} - public boolean Rslt() {return rslt;} private boolean rslt; +public class Regx_match { + public Regx_match(boolean rslt, int find_bgn, int find_end, Regx_group[] groups) { + this.rslt = rslt; + this.find_bgn = find_bgn; + this.find_end = find_end; + this.groups = groups; + } + public boolean Rslt() {return rslt;} private final boolean rslt; public boolean Rslt_none() {return !rslt;} // NOTE: was "|| find_end - find_bgn == 0"; DATE:2013-04-11; DATE:2014-09-02 - public int Find_bgn() {return find_bgn;} int find_bgn; - public int Find_end() {return find_end;} int find_end; 
+ public int Find_bgn() {return find_bgn;} private final int find_bgn; + public int Find_end() {return find_end;} private final int find_end; public int Find_len() {return find_end - find_bgn;} - public Regx_group[] Groups() {return groups;} Regx_group[] groups = Regx_group.Ary_empty; + public Regx_group[] Groups() {return groups;} private final Regx_group[] groups; + public static final Regx_match[] Ary_empty = new Regx_match[0]; } diff --git a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring.java b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring.java index 23b3bf51f..dd7eb67b3 100644 --- a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring.java +++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring.java @@ -23,7 +23,6 @@ public class Scrib_lib_ustring implements Scrib_lib { public Scrib_lua_mod Mod() {return mod;} private Scrib_lua_mod mod; public int String_len_max() {return string_len_max;} public Scrib_lib_ustring String_len_max_(int v) {string_len_max = v; return this;} private int string_len_max = Xoa_page_.Page_len_max; public int Pattern_len_max() {return pattern_len_max;} public Scrib_lib_ustring Pattern_len_max_(int v) {pattern_len_max = v; return this;} private int pattern_len_max = 10000; - private Scrib_regx_converter regx_converter = new Scrib_regx_converter(); public Scrib_lib Init() {procs.Init_by_lib(this, Proc_names); return this;} public Scrib_lib Clone_lib(Scrib_core core) {return new Scrib_lib_ustring(core);} public Scrib_lua_mod Register(Scrib_core core, Io_url script_dir) { @@ -74,7 +73,8 @@ public class Scrib_lib_ustring implements Scrib_lib { : rslt.Init_ary_empty() ; } - regx = regx_converter.Parse(Bry_.new_u8(regx), Scrib_regx_converter.Anchor_G); + Scrib_regx_converter regx_converter = new Scrib_regx_converter(); + regx = regx_converter.patternToRegex(Bry_.new_u8(regx), Scrib_regx_converter.Anchor_G); Regx_adp regx_adp = Scrib_lib_ustring.RegxAdp_new_(core.Ctx(), regx); Regx_match[] 
regx_rslts = regx_adp.Match_all(text_str, bgn_codepoint_idx); // NOTE: MW calculates an offset to handle mb strings. however, java's regex always takes offset in chars (not bytes like PHP preg_match); DATE:2014-03-04 int len = regx_rslts.length; @@ -103,7 +103,8 @@ public class Scrib_lib_ustring implements Scrib_lib { public boolean Match(Scrib_proc_args args, Scrib_proc_rslt rslt) { String text = args.Xstr_str_or_null(0); // Module can pass raw ints; PAGE:en.w:Budget_of_the_European_Union; DATE:2015-01-22 if (text == null) return rslt.Init_many_list(List_adp_.Noop); // if no text is passed, do not fail; return empty; EX:d:changed; DATE:2014-02-06 - String regx = regx_converter.Parse(args.Cast_bry_or_null(1), Scrib_regx_converter.Anchor_G); + Scrib_regx_converter regx_converter = new Scrib_regx_converter(); + String regx = regx_converter.patternToRegex(args.Cast_bry_or_null(1), Scrib_regx_converter.Anchor_G); int bgn = args.Cast_int_or(2, 1); bgn = Bgn_adjust(text, bgn); Regx_adp regx_adp = Scrib_lib_ustring.RegxAdp_new_(core.Ctx(), regx); @@ -112,6 +113,7 @@ public class Scrib_lib_ustring implements Scrib_lib { if (len == 0) return rslt.Init_null(); // return null if no matches found; EX:w:Mount_Gambier_(volcano); DATE:2014-04-02; confirmed with en.d:民; DATE:2015-01-30 // TOMBSTONE: add 1st match only; do not add all; PAGE:en.d:действительное_причастие_настоящего_времени DATE:2017-04-23 + regx_rslts = regx_converter.Adjust_balanced(regx_rslts); List_adp tmp_list = List_adp_.New(); AddCapturesFromMatch(tmp_list, regx_rslts[0], text, regx_converter.Capt_ary(), true); return rslt.Init_many_list(tmp_list); @@ -122,6 +124,7 @@ public class Scrib_lib_ustring implements Scrib_lib { public boolean Gsub(Scrib_proc_args args, Scrib_proc_rslt rslt) { boolean rv = false; synchronized (gsub_mgr_lock) { // handle recursive gsub calls; PAGE:en.d:כלב; DATE:2016-01-22 + Scrib_regx_converter regx_converter = new Scrib_regx_converter(); int new_len = gsub_mgr_len + 1; if (new_len == 
gsub_mgr_max) { this.gsub_mgr_max = new_len == 0 ? 2 : new_len * 2; @@ -143,7 +146,8 @@ public class Scrib_lib_ustring implements Scrib_lib { public boolean Gmatch_init(Scrib_proc_args args, Scrib_proc_rslt rslt) { // String text = Scrib_kv_utl_.Val_to_str(values, 0); byte[] regx = args.Pull_bry(1); - String pcre = regx_converter.Parse(regx, Scrib_regx_converter.Anchor_null); + Scrib_regx_converter regx_converter = new Scrib_regx_converter(); + String pcre = regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_null); return rslt.Init_many_objs(pcre, regx_converter.Capt_ary()); } public boolean Gmatch_callback(Scrib_proc_args args, Scrib_proc_rslt rslt) { @@ -204,7 +208,7 @@ class Scrib_lib_ustring_gsub_mgr { String regx = args.Xstr_str_or_null(1); // NOTE: @pattern sometimes int; PAGE:en.d:λύω; DATE:2014-09-02 if (args.Len() == 2) return rslt.Init_obj(text); // if no replace arg, return self; PAGE:en.d:'orse; DATE:2013-10-13 Object repl_obj = args.Cast_obj_or_null(2); - regx = regx_converter.Parse(Bry_.new_u8(regx), Scrib_regx_converter.Anchor_pow); + regx = regx_converter.patternToRegex(Bry_.new_u8(regx), Scrib_regx_converter.Anchor_pow); int limit = args.Cast_int_or(3, -1); repl_count = 0; Identify_repl(repl_obj); @@ -251,6 +255,7 @@ class Scrib_lib_ustring_gsub_mgr { if ( rslts.length == 0 // PHP: If matches are found, the new subject will be returned, otherwise subject will be returned unchanged.; http://php.net/manual/en/function.preg-replace-callback.php || regx_mgr.Pattern_is_invalid() // NOTE: invalid patterns should return self; EX:[^]; DATE:2014-09-02 ) return text; + rslts = regx_converter.Adjust_balanced(rslts); Bry_bfr tmp_bfr = Bry_bfr_.New(); int len = rslts.length; int pos = 0; @@ -338,10 +343,11 @@ class Scrib_lib_ustring_gsub_mgr { break; } case Repl_tid_luacbk: { + // TOMBSTONE: was causing garbled text on PAGE:en.w:Template:Infobox_kommune DATE:2018-07-02 + /* String find_str = String_.Mid(text, match.Find_bgn(), match.Find_end()); 
Keyval[] luacbk_args = Scrib_kv_utl_.base1_obj_(find_str); - /* - TOMBSTONE: was causing garbled text on PAGE:en.w:Portal:Bahamas DATE:2018-07-02 + */ Keyval[] luacbk_args = null; Regx_group[] grps = match.Groups(); int grps_len = grps.length; @@ -357,6 +363,7 @@ class Scrib_lib_ustring_gsub_mgr { luacbk_args[i] = Keyval_.int_(i + Scrib_core.Base_1, find_str); } } + /* */ Keyval[] rslts = core.Interpreter().CallFunction(repl_func.Id(), luacbk_args); if (rslts.length == 0) // will be 0 when gsub_proc returns nil; PAGE:en.d:tracer; DATE:2017-04-22 diff --git a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__match__tst.java b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__match__tst.java index 31be2ac34..9c14586a8 100644 --- a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__match__tst.java +++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__match__tst.java @@ -38,9 +38,15 @@ public class Scrib_lib_ustring__match__tst { @Test public void Args_out_of_order() { fxt.Test__proc__kvps__empty(lib, Scrib_lib_ustring.Invk_match, Keyval_.Ary(Keyval_.int_(2, "[a]"))); } - @Test public void Include_trailing_whitespace() { // PURPOSE: match trailing whitespace; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02 + @Test public void Balanced__trailing_whitespace() { // PURPOSE: match trailing whitespace; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02 Exec_match("[[a]] b", "%b[]%s*", 1, "[[a]] "); } + @Test public void Balanced__numbered_1() { // PURPOSE: handle mix of balanced and regular capture; PAGE:en.w:Bahamas + Exec_match("[[5]]X99Y", "%b[]X(%d)%1Y", 1, "9"); + } + @Test public void Balanced__numbered_2() { + Exec_match("A88B[[5]]X99Y", "A(%d)%1B%b[]X(%d)%2Y", 1, "8;9"); + } // @Test public void Match_viwiktionary() { // fxt.Init_cbk(Scrib_core.Key_mw_interface, fxt.Core().Lib_ustring(), Scrib_lib_ustring.Invk_match); // Exec_match("tr" , "()(r)", 1, ";"); // should return all matches diff --git 
a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__shell_cmd__tst.java b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__shell_cmd__tst.java index 507c0c0e0..1668959b4 100644 --- a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__shell_cmd__tst.java +++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__shell_cmd__tst.java @@ -28,13 +28,13 @@ public class Scrib_lib_ustring__shell_cmd__tst { @Test public void Gsub_proc_w_grouped() { // PURPOSE: gsub_proc should pass matched String, not entire String; DATE:2013-12-01 fxt.Init_cbk(Scrib_core.Key_mw_interface, fxt.Core().Lib_ustring(), Scrib_lib_ustring.Invk_gsub); Exec_gsub_regx_func_1("[[a]]", "%[%[([^#|%]]-)%]%]" , "A;1"); - fxt.Test_log_rcvd(3, "0000003B00000075{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"[[a]]\"}}"); // should be "[[a]]", not "a"; switched on DATE:2018-07-02 + fxt.Test_log_rcvd(3, "000000370000006D{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"a\"}}"); // should be "a", not "[[a]]"; } @Test public void Gsub_proc_w_grouped_2() {// PURPOSE: gsub_proc failed when passing multiple matches; DATE:2013-12-01 fxt.Init_cbk(Scrib_core.Key_mw_interface, fxt.Core().Lib_ustring(), Scrib_lib_ustring.Invk_gsub); Exec_gsub_regx_func_2("[[a]] [[b]]", "%[%[([^#|%]]-)%]%]" , "A B;2"); - fxt.Test_log_rcvd(3, "0000003B00000075{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"[[a]]\"}}"); // should be "[[a]]", not "a"; switched on DATE:2018-07-02 - fxt.Test_log_rcvd(4, "0000003B00000075{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"[[b]]\"}}"); // should be "[[b]]", not "b"; switched on DATE:2018-07-02 + fxt.Test_log_rcvd(3, "000000370000006D{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"a\"}}"); // should be "a", not "[[a]]"; + fxt.Test_log_rcvd(4, "000000370000006D{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"b\"}}"); // should be "b", not "[[b]]"; } @Test public void 
Gsub_int() { // PURPOSE: gsub with integer arg should not fail; DATE:2013-11-06 fxt.Init_cbk(Scrib_core.Key_mw_interface, fxt.Core().Lib_ustring(), Scrib_lib_ustring.Invk_gsub); diff --git a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_converter.java b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_converter.java index 9b2021154..02d7f2976 100644 --- a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_converter.java +++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_converter.java @@ -17,64 +17,73 @@ package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import import gplx.core.brys.fmtrs.*; import gplx.langs.regxs.*; public class Scrib_regx_converter { - private final List_adp capt_list = List_adp_.New(), grps_parens = List_adp_.New(); private final List_adp grps_open = List_adp_.New(); - private final Bry_bfr tmp_bfr = Bry_bfr_.New(); + private final Hash_adp_bry percent_hash = Hash_adp_bry.cs(), brack_hash = Hash_adp_bry.cs(); + private final Scrib_regx_grp_mgr grp_mgr = new Scrib_regx_grp_mgr(); + private final Bry_bfr bfr = Bry_bfr_.New(); + private Bry_bfr tmp_bfr; + private Bry_fmtr fmtr_balanced; private Bry_bfr bfr_balanced; public Scrib_regx_converter() {Init();} public String Regx() {return regx;} private String regx; - public List_adp Capt_list() {return capt_list;} - public Keyval[] Capt_ary() {return capt_list.Count() == 0 ? 
null : (Keyval[])capt_list.To_ary(Keyval.class);} - private Bry_fmtr fmtr_balanced; private Bry_bfr bfr_balanced; - public String Parse(byte[] src, byte[] anchor) { - int len = src.length; + public Keyval[] Capt_ary() {return grp_mgr.Capt__to_ary();} + public boolean Any_pos() {return any_pos;} private boolean any_pos; + public Regx_match[] Adjust_balanced(Regx_match[] rslts) {return grp_mgr.Adjust_balanced(rslts);} + public String patternToRegex(byte[] pat, byte[] anchor) { + // TODO.CACHE: if (!$this->patternRegexCache->has($cacheKey)) + grp_mgr.Clear(); + any_pos = false; boolean q_flag = false; - capt_list.Clear(); grps_open.Clear(); grps_parens.Clear(); + + // bfr.Add_byte(Byte_ascii.Slash); // TOMBSTONE: do not add PHP "/" at start + int len = pat.length; int grps_len = 0; int bct = 0; - // bfr.Add_byte(Byte_ascii.Slash); // NOTE: do not add PHP "/" at start for (int i = 0; i < len; i++) { + int i_end = i + 1; q_flag = false; // must be reset; REF.MW:UstringLibrary.php|patternToRegex; DATE:2014-02-08 - byte cur = src[i]; + byte cur = pat[i]; switch (cur) { case Byte_ascii.Pow: q_flag = i != 0; - bfr.Add(anchor == Anchor_null || q_flag ? Bry_pow_escaped : anchor); // NOTE: must add anchor \G when using offsets; EX:cs.n:Category:1._zárí_2008; DATE:2014-05-07 + bfr.Add((anchor == Anchor_null || q_flag) ? Bry_pow_escaped : anchor); // NOTE: must add anchor \G when using offsets; EX:cs.n:Category:1._zárí_2008; DATE:2014-05-07 break; case Byte_ascii.Dollar: q_flag = i < len - 1; bfr.Add(q_flag ? 
Bry_dollar_escaped : Bry_dollar_literal); break; case Byte_ascii.Paren_bgn: { - if (i + 1 >= len) throw Err_.new_wo_type("Unmatched open-paren at pattern character " + Int_.To_str(i)); - boolean capt_itm = src[i + 1] == Byte_ascii.Paren_end; // current is "()" - ++grps_len; - capt_list.Add(Keyval_.int_(grps_len, capt_itm)); - bfr.Add_byte(Byte_ascii.Paren_bgn); - grps_open.Add(grps_len); - grps_parens.Add(i + 1); + if (i + 1 >= len) + throw Err_.new_wo_type("Unmatched open-paren at pattern character " + Int_.To_str(i_end)); + int grp_idx = grp_mgr.Capt__len() + 1; + boolean is_empty_capture = pat[i + 1] == Byte_ascii.Paren_end; // current is "()" + if (is_empty_capture) + any_pos = true; + bfr.Add_byte(Byte_ascii.Paren_bgn); // $re .= "(?"; + grp_mgr.Capt__add__real(grp_idx, is_empty_capture); break; } case Byte_ascii.Paren_end: - if (grps_open.Count() == 0) - throw Err_.new_wo_type("Unmatched close-paren at pattern character " + Int_.To_str(i)); - List_adp_.Del_at_last(grps_open); + if (grp_mgr.Open__len() <= 0) + throw Err_.new_wo_type("Unmatched close-paren at pattern character " + Int_.To_str(i_end)); + grp_mgr.Open__pop(); bfr.Add_byte(Byte_ascii.Paren_end); break; case Byte_ascii.Percent: - ++i; - if (i >= len) throw Err_.new_wo_type("malformed pattern (ends with '%')"); - Object percent_obj = percent_hash.Get_by_mid(src, i, i + 1); + i++; + if (i >= len) + throw Err_.new_wo_type("malformed pattern (ends with '%')"); + Object percent_obj = percent_hash.Get_by_mid(pat, i, i + 1); if (percent_obj != null) { bfr.Add((byte[])percent_obj); q_flag = true; } else { - byte nxt = src[i]; + byte nxt = pat[i]; switch (nxt) { case Byte_ascii.Ltr_b: // EX: "%b()" i += 2; if (i >= len) throw Err_.new_wo_type("malformed pattern (missing arguments to '%b')"); - byte char_0 = src[i - 1]; - byte char_1 = src[i]; + byte char_0 = pat[i - 1]; + byte char_1 = pat[i]; if (char_0 == char_1) { // same char: easier regex; REF.MW: $bfr .= "{$d1}[^$d1]*$d1"; bfr.Add(Bry_bf0_seg_0); 
Regx_quote(bfr, char_0); @@ -85,40 +94,46 @@ public class Scrib_regx_converter { } else { // diff char: harder regex; REF.MW: $bfr .= "(?$d1(?:(?>[^$d1$d2]+)|(?P>b$bct))*$d2)"; if (fmtr_balanced == null) { - // JAVA:recursive regex not possible, but workaround is possible PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02 + // JAVA:recursive regex not possible, so need complicated regex // REF: https://stackoverflow.com/questions/47162098/is-it-possible-to-match-nested-brackets-with-regex-without-using-recursion-or-ba/47162099#47162099 - fmtr_balanced = Bry_fmtr.new_("(?=\\~{1})(?:(?=.*?\\~{1}(?!.*?\\1)(.*\\~{2}(?!.*\\2).*))(?=.*?\\~{2}(?!.*?\\2)(.*)).)+?.*?(?=\\1)[^\\~{1}]*(?=\\2$)", "0", "1", "2"); + // PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02 + fmtr_balanced = Bry_fmtr.new_("(?=\\~{1})(?:(?=.*?\\~{1}(?!.*?\\~{3})(.*\\~{2}(?!.*\\~{4}).*))(?=.*?\\~{2}(?!.*?\\~{4})(.*)).)+?.*?(?=\\~{3})[^\\~{1}]*(?=\\~{4}$)", "unused", "1", "2", "3", "4"); bfr_balanced = Bry_bfr_.Reset(255); } synchronized (fmtr_balanced) { ++bct; - fmtr_balanced.Bld_bfr(bfr_balanced, Int_.To_bry(bct), Byte_.Ary(char_0), Byte_.Ary(char_1)); + int balanced_idx = grp_mgr.Full__len(); + fmtr_balanced.Bld_bfr(bfr_balanced, Int_.To_bry(bct), Byte_.Ary(char_0), Byte_.Ary(char_1), Int_.To_bry(balanced_idx + 1), Int_.To_bry(balanced_idx + 2)); + grp_mgr.Capt__add__fake(2); bfr.Add(bfr_balanced.To_bry_and_clear()); } } break; case Byte_ascii.Ltr_f: { // EX: lua frontier pattern; "%f[%a]"; DATE:2015-07-21 - ++i; - if (i + 1 >= len || src[i] != Byte_ascii.Brack_bgn) throw Err_.new_("scribunto", "missing '[' after %f in pattern at pattern character $ii"); + if (i + 1 >= len || pat[++i] != Byte_ascii.Brack_bgn) + throw Err_.new_("scribunto", "missing '[' after %f in pattern at pattern character " + Int_.To_str(i_end)); // %f always followed by bracketed term; convert lua bracketed term to regex - i = bracketedCharSetToRegex(tmp_bfr, src, i, len); - byte[] bracketed_regx = 
tmp_bfr.To_bry_and_clear(); + if (tmp_bfr == null) tmp_bfr = Bry_bfr_.New(); + i = bracketedCharSetToRegex(tmp_bfr, pat, i, len); + byte[] re2 = tmp_bfr.To_bry_and_clear(); // scrib has following comment: 'Because %f considers the beginning and end of the String to be \0, determine if $re2 matches that and take it into account with "^" and "$".' - // if the bracketed_regx is a negative class it will match \0; so, \W means anything not a word char, which will match \0; \w means word char which will not match \0 - if (Regx_adp_.Match("\0", String_.new_u8(bracketed_regx))) - bfr.Add_str_a7("(? capt_list.Count() || grps_open_Has(grps_open, grps_len)) + case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9: { + int grp_idx = nxt - Byte_ascii.Num_0; + if (grp_idx == 0 || grp_idx > grp_mgr.Capt__len() || grp_mgr.Open__has(grp_idx)) throw Err_.new_wo_type("invalid capture index %" + grps_len + " at pattern character " + Int_.To_str(i)); - bfr.Add(Bry_bf2_seg_0).Add_int_variable(grps_len);//.Add(Bry_bf2_seg_1); // $bfr .= "\\g{m$grps_len}"; + bfr.Add(Bry_bf2_seg_0); + grp_mgr.Idx__add(bfr, grp_idx); break; + } default: Regx_quote(bfr, nxt); q_flag = true; @@ -127,21 +142,22 @@ public class Scrib_regx_converter { } break; case Byte_ascii.Brack_bgn: - i = bracketedCharSetToRegex(bfr, src, i, len); + i = bracketedCharSetToRegex(bfr, pat, i, len); q_flag = true; break; - case Byte_ascii.Brack_end: throw Err_.new_wo_type("Unmatched close-bracket at pattern character " + Int_.To_str(i)); + case Byte_ascii.Brack_end: + throw Err_.new_wo_type("Unmatched close-bracket at pattern character " + Int_.To_str(i_end)); case Byte_ascii.Dot: - q_flag = true; bfr.Add_byte(Byte_ascii.Dot); + q_flag = true; break; default: - q_flag = true; Regx_quote(bfr, cur); + q_flag = true; break; } if (q_flag && i + 1 < len) { - byte tmp_b = src[i + 1]; + byte tmp_b = pat[i + 1]; switch (tmp_b) { case Byte_ascii.Star: case Byte_ascii.Plus: @@ 
-150,17 +166,18 @@ public class Scrib_regx_converter { ++i; break; case Byte_ascii.Dash: - bfr.Add(Bry_regx_dash); - ++i; + bfr.Add(Bry_star_question); + i++; break; } } } - if (grps_open.Count() > 0) throw Err_.new_wo_type("Unclosed capture beginning at pattern character " + Int_.Cast(grps_open.Get_at(0))); -// bfr.Add(Bry_regx_end); // NOTE: do not add PHP /us at end; u=PCRE_UTF8 which is not needed for Java; s=PCRE_DOTALL which will be specified elsewhere + if (grp_mgr.Open__len() > 0) + throw Err_.new_wo_type("Unclosed capture beginning at pattern character " + grp_mgr.Open__get_at(0)); + // bfr.Add(Bry_regx_end); // TOMBSTONE: do not add PHP /us at end; u=PCRE_UTF8 which is not needed for Java; s=PCRE_DOTALL which will be specified elsewhere regx = bfr.To_str_and_clear(); return regx; - } private Bry_bfr bfr = Bry_bfr_.New(); + } private int bracketedCharSetToRegex(Bry_bfr bfr, byte[] pat, int i, int len) { bfr.Add_byte(Byte_ascii.Brack_bgn); i++; @@ -210,14 +227,6 @@ public class Scrib_regx_converter { } return i; } - boolean grps_open_Has(List_adp list, int v) { - int len = list.Count(); - for (int i = 0; i < len; i++) { - Object o = list.Get_at(i); - if (Int_.Cast(o) == v) return true; - } - return false; - } private void Regx_quote(Bry_bfr bfr, byte b) { if (Regx_char(b)) bfr.Add_byte(Byte_ascii.Backslash); bfr.Add_byte(b); @@ -239,32 +248,34 @@ public class Scrib_regx_converter { , Bry_dollar_literal = Bry_.new_a7("$"), Bry_dollar_escaped = Bry_.new_a7("\\$") , Bry_bf0_seg_0 = Bry_.new_a7("{"), Bry_bf0_seg_1 = Bry_.new_a7("}[^"), Bry_bf0_seg_2 = Bry_.new_a7("]*") , Bry_bf2_seg_0 = Bry_.new_a7("\\")//, Bry_bf2_seg_1 = Bry_.new_a7("") - , Bry_regx_dash = Bry_.new_a7("*?") // was *? + , Bry_star_question = Bry_.new_a7("*?") // was *? 
; public static final byte[] Anchor_null = null, Anchor_G = Bry_.new_a7("\\G"), Anchor_pow = Bry_.new_a7("^"); private void Init() { String regx_w = "\\w"; // JRE.7: \w not support in JRE.6; PAGE:en.w:A♯_(musical_note) DATE:2015-06-10 String regx_W = "\\W"; // JRE.7: \w not support in JRE.6; PAGE:en.w:A♯_(musical_note) DATE:2015-06-10 - Init_itm(Bool_.Y, "d", "\\p{Nd}"); - Init_itm(Bool_.Y, "l", "\\p{Ll}"); - Init_itm(Bool_.Y, "u", "\\p{Lu}"); Init_itm(Bool_.Y, "a", "\\p{L}"); Init_itm(Bool_.Y, "c", "\\p{Cc}"); + Init_itm(Bool_.Y, "d", "\\p{Nd}"); + Init_itm(Bool_.Y, "l", "\\p{Ll}"); Init_itm(Bool_.Y, "p", "\\p{P}"); - Init_itm(Bool_.Y, "s", "\\s"); + Init_itm(Bool_.Y, "s", "\\s"); // JAVA: \p{Xps} not valid; REF: https://docs.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html + Init_itm(Bool_.Y, "u", "\\p{Lu}"); Init_itm(Bool_.Y, "w", regx_w); Init_itm(Bool_.Y, "x", "[0-9A-Fa-f0-9A-Fa-f]"); Init_itm(Bool_.Y, "z", "\\x00"); - Init_itm(Bool_.Y, "D", "\\P{Nd}"); - Init_itm(Bool_.Y, "L", "\\P{Ll}"); - Init_itm(Bool_.Y, "U", "\\P{Lu}"); + Init_itm(Bool_.Y, "A", "\\P{L}"); Init_itm(Bool_.Y, "C", "\\P{Cc}"); + Init_itm(Bool_.Y, "D", "\\P{Nd}"); + Init_itm(Bool_.Y, "L", "\\P{Ll}"); Init_itm(Bool_.Y, "P", "\\P{P}"); Init_itm(Bool_.Y, "S", "\\S"); // JAVA: \P{Xps} not valid + Init_itm(Bool_.Y, "U", "\\P{Lu}"); Init_itm(Bool_.Y, "W", regx_W); Init_itm(Bool_.Y, "X", "[^0-9A-Fa-f0-9A-Fa-f]"); Init_itm(Bool_.Y, "Z", "[^\\x00]"); + Init_itm(Bool_.N, "w", regx_w); Init_itm(Bool_.N, "x", "0-9A-Fa-f0-9A-Fa-f"); Init_itm(Bool_.N, "W", regx_W); @@ -282,5 +293,4 @@ public class Scrib_regx_converter { brack_hash.Add_if_dupe_use_nth(lua_bry, php_bry); // replace percent_hash definitions } } - private final Hash_adp_bry percent_hash = Hash_adp_bry.cs(), brack_hash = Hash_adp_bry.cs(); } diff --git a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_converter_tst.java b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_converter_tst.java index 8503a9ab2..7c58f6b52 100644 
--- a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_converter_tst.java +++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_converter_tst.java @@ -61,11 +61,11 @@ class Scrib_regx_converter_fxt { } } public void Test_parse(String raw, String expd) { - under.Parse(Bry_.new_u8(raw), Scrib_regx_converter.Anchor_G); + under.patternToRegex(Bry_.new_u8(raw), Scrib_regx_converter.Anchor_G); Tfds.Eq(expd, under.Regx()); } public void Test_replace(String text, String find, String replace, String expd) { - String regex_str = under.Parse(Bry_.new_u8(find), Scrib_regx_converter.Anchor_G); + String regex_str = under.patternToRegex(Bry_.new_u8(find), Scrib_regx_converter.Anchor_G); String actl = Regx_adp_.Replace(text, regex_str, replace); Tfds.Eq(expd, actl); } diff --git a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_group_mgr.java b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_group_mgr.java new file mode 100644 index 000000000..8ed44dab4 --- /dev/null +++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_group_mgr.java @@ -0,0 +1,90 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012-2017 gnosygnu@gmail.com + +XOWA is licensed under the terms of the General Public License (GPL) Version 3, +or alternatively under the terms of the Apache License Version 2.0. + +You may use XOWA according to either of these licenses as is most appropriate +for your project on a case-by-case basis. 
+
+The terms of each license can be found in the source code repository:
+
+GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
+Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
+*/
+package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*;
+import gplx.langs.regxs.*;
+/** Bookkeeping for regex groups: "real" groups are added by Capt__add__real, "fake" ones by Capt__add__fake; Adjust_balanced drops the fake ones from match results. */
+class Scrib_regx_grp_mgr {
+	private final List_adp capt_list = List_adp_.New();	// one Keyval per real capture: key=grp_idx; val=is_empty_capture
+	private final List_adp full_list = List_adp_.New();	// one Scrib_regx_grp_itm per group, real and fake, in the order they were added
+	private final List_adp open_list = List_adp_.New();	// grp_idx values pushed by Capt__add__real and removed by Open__pop
+	private final Hash_adp idx_list = Hash_adp_.New();	// grp_idx -> length of full_list right after that real capture was added
+	private int fake_count;	// sum of all counts passed to Capt__add__fake since the last Clear
+	public void Clear() {	// empty every collection and zero the fake-group counter
+		capt_list.Clear(); full_list.Clear(); open_list.Clear(); idx_list.Clear();
+		fake_count = 0;
+	}
+	public int Full__len() {return full_list.Len();}
+	public int Open__len() {return open_list.Len();}
+	public int Open__get_at(int idx) {return Int_.Cast(open_list.Get_at(idx));}
+	public void Open__pop() {List_adp_.Del_at_last(open_list);}
+	public boolean Open__has(int v) {	// true if v is one of the values currently held in open_list
+		for (int pos = open_list.Count() - 1; pos >= 0; pos--) {
+			if (Int_.Cast(open_list.Get_at(pos)) == v) return true;
+		}
+		return false;
+	}
+	public int Capt__len() {return capt_list.Count();}
+	public Keyval[] Capt__to_ary() {	// null (not an empty array) when no real capture was added
+		if (capt_list.Count() == 0) return null;
+		return (Keyval[])capt_list.To_ary(Keyval.class);
+	}
+	/** Registers a real capture: records it in capt_list, open_list and full_list, then maps grp_idx to the new length of full_list. */
+	public void Capt__add__real(int grp_idx, boolean is_empty_capture) {
+		int full_idx = full_list.Len();	// length of full_list before the new itm is appended
+		capt_list.Add(Keyval_.int_(grp_idx, is_empty_capture));
+		open_list.Add(grp_idx);
+		full_list.Add(new Scrib_regx_grp_itm(Bool_.N, is_empty_capture, full_idx));
+		idx_list.Add(grp_idx, full_list.Len());
+	}
+	public void Capt__add__fake(int count) {	// appends `count` fake itms to full_list only; capt_list / open_list / idx_list are untouched
+		for (int n = count; n > 0; n--)
+			full_list.Add(new Scrib_regx_grp_itm(Bool_.Y, Bool_.N, full_list.Len()));
+		fake_count += count;
+	}
+	/** Writes to bfr the number that idx_list holds for regx_idx (the value stored by Capt__add__real). */
+	public void Idx__add(Bry_bfr bfr, int regx_idx) {
+		bfr.Add_int_variable(Int_.Cast(idx_list.Get_by(regx_idx)));
+	}
+	/** Returns matches as-is when no fake group exists; else returns new Regx_match items whose groups skip every position marked fake in full_list. */
+	public Regx_match[] Adjust_balanced(Regx_match[] matches) {
+		if (fake_count == 0) return matches;
+		int real_len = full_list.Len() - fake_count;	// size of each rebuilt group array
+		Regx_match[] rv = new Regx_match[matches.length];
+		for (int i = 0; i < rv.length; i++) {
+			Regx_match match = matches[i];
+			Regx_group[] old_groups = match.Groups();
+			Regx_group[] new_groups = new Regx_group[real_len];
+			int new_idx = 0;
+			for (int j = 0; j < old_groups.length; j++) {
+				if (!((Scrib_regx_grp_itm)full_list.Get_at(j)).Is_fake())
+					new_groups[new_idx++] = old_groups[j];
+			}
+			rv[i] = new Regx_match(match.Rslt(), match.Find_bgn(), match.Find_end(), new_groups);
+		}
+		return rv;
+	}
+}
+/** Immutable record for one entry of Scrib_regx_grp_mgr's full list: fake flag, empty-capture flag, and the idx passed at construction. */
+class Scrib_regx_grp_itm {
+	private final boolean is_fake, is_empty_capture;
+	private final int idx;
+	public Scrib_regx_grp_itm(boolean is_fake, boolean is_empty_capture, int idx) {
+		this.is_fake = is_fake; this.is_empty_capture = is_empty_capture; this.idx = idx;
+	}
+	public boolean Is_fake() {return is_fake;}
+	public boolean Is_empty_capture() {return is_empty_capture;}
+	public int Idx() {return idx;}
+}