diff --git a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring.java b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring.java index 34559e873..23b3bf51f 100644 --- a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring.java +++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring.java @@ -161,10 +161,10 @@ public class Scrib_lib_ustring implements Scrib_lib { return rslt.Init_many_objs(match.Find_end(), Scrib_kv_utl_.base1_list_(tmp_list)); } private void AddCapturesFromMatch(List_adp tmp_list, Regx_match rslt, String text, Keyval[] capts, boolean op_is_match) {// NOTE: this matches behavior in UstringLibrary.php!addCapturesFromMatch - Regx_group[] grps = rslt.Groups(); - int grps_len = grps.length; int capts_len = capts == null ? 0 : capts.length; - if (grps_len > 0) { + if (capts_len > 0) { // NOTE: changed from "grps_len > 0"; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02 + Regx_group[] grps = rslt.Groups(); + int grps_len = grps.length; for (int j = 0; j < grps_len; j++) { Regx_group grp = grps[j]; if ( j < capts_len // bounds check b/c null can be passed @@ -338,6 +338,10 @@ class Scrib_lib_ustring_gsub_mgr { break; } case Repl_tid_luacbk: { + String find_str = String_.Mid(text, match.Find_bgn(), match.Find_end()); + Keyval[] luacbk_args = Scrib_kv_utl_.base1_obj_(find_str); + /* + TOMBSTONE: was causing garbled text on PAGE:en.w:Portal:Bahamas DATE:2018-07-02 Keyval[] luacbk_args = null; Regx_group[] grps = match.Groups(); int grps_len = grps.length; @@ -353,6 +357,7 @@ class Scrib_lib_ustring_gsub_mgr { luacbk_args[i] = Keyval_.int_(i + Scrib_core.Base_1, find_str); } } + */ Keyval[] rslts = core.Interpreter().CallFunction(repl_func.Id(), luacbk_args); if (rslts.length == 0) // will be 0 when gsub_proc returns nil; PAGE:en.d:tracer; DATE:2017-04-22 return false; diff --git a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__gsub__tst.java 
b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__gsub__tst.java index 265b946ac..f945bbf44 100644 --- a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__gsub__tst.java +++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__gsub__tst.java @@ -27,7 +27,8 @@ public class Scrib_lib_ustring__gsub__tst { Exec_gsub("a" , "(a)" , 1, "%%%1" , "%a;1"); Exec_gsub("à{b}c", "{b}" , 1, "b" , "àbc;1"); // utf8 Exec_gsub("àbc", "^%s*(.-)%s*$" , 1, "%1" , "àbc;1"); // utf8; regx is for trim line - Exec_gsub("a" , "[^]" , 1, "b" , "a;0"); // invalid regx should not fail; should return self; DATE:2013-10-20 + // TOMBSTONE: tested with local MW and {{#invoke:Test|test16|a|[^]|b}} -> Lua error: Missing close-bracket for character set beginning at pattern character 1.; DATE:2018-07-02 + // Exec_gsub("a" , "[^]" , 1, "b" , "a;0"); // invalid regx should not fail; should return self; DATE:2013-10-20 } @Test public void Replace__none() {// PURPOSE: gsub with no replace argument should not fail; EX:d:'orse; DATE:2013-10-14 fxt.Test__proc__objs__flat(lib, Scrib_lib_ustring.Invk_gsub, Object_.Ary("text", "regx") , "text"); // NOTE: repl, limit deliberately omitted diff --git a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__match__tst.java b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__match__tst.java index e008f01c1..31be2ac34 100644 --- a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__match__tst.java +++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__match__tst.java @@ -38,6 +38,9 @@ public class Scrib_lib_ustring__match__tst { @Test public void Args_out_of_order() { fxt.Test__proc__kvps__empty(lib, Scrib_lib_ustring.Invk_match, Keyval_.Ary(Keyval_.int_(2, "[a]"))); } + @Test public void Include_trailing_whitespace() { // PURPOSE: match trailing whitespace; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02 + Exec_match("[[a]] b", "%b[]%s*", 1, "[[a]] "); + } // @Test public 
void Match_viwiktionary() { // fxt.Init_cbk(Scrib_core.Key_mw_interface, fxt.Core().Lib_ustring(), Scrib_lib_ustring.Invk_match); // Exec_match("tr" , "()(r)", 1, ";"); // should return all matches diff --git a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__shell_cmd__tst.java b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__shell_cmd__tst.java index 198ec11f9..507c0c0e0 100644 --- a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__shell_cmd__tst.java +++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__shell_cmd__tst.java @@ -28,13 +28,13 @@ public class Scrib_lib_ustring__shell_cmd__tst { @Test public void Gsub_proc_w_grouped() { // PURPOSE: gsub_proc should pass matched String, not entire String; DATE:2013-12-01 fxt.Init_cbk(Scrib_core.Key_mw_interface, fxt.Core().Lib_ustring(), Scrib_lib_ustring.Invk_gsub); Exec_gsub_regx_func_1("[[a]]", "%[%[([^#|%]]-)%]%]" , "A;1"); - fxt.Test_log_rcvd(3, "000000370000006D{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"a\"}}"); // should be "a", not "[[a]]" + fxt.Test_log_rcvd(3, "0000003B00000075{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"[[a]]\"}}"); // should be "[[a]]", not "a"; switched on DATE:2018-07-02 } @Test public void Gsub_proc_w_grouped_2() {// PURPOSE: gsub_proc failed when passing multiple matches; DATE:2013-12-01 fxt.Init_cbk(Scrib_core.Key_mw_interface, fxt.Core().Lib_ustring(), Scrib_lib_ustring.Invk_gsub); Exec_gsub_regx_func_2("[[a]] [[b]]", "%[%[([^#|%]]-)%]%]" , "A B;2"); - fxt.Test_log_rcvd(3, "000000370000006D{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"a\"}}"); // should be "a", not "[[a]]" - fxt.Test_log_rcvd(4, "000000370000006D{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"b\"}}"); // should be "b", not "[[b]]" + fxt.Test_log_rcvd(3, "0000003B00000075{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"[[a]]\"}}"); // should be "[[a]]", not "a"; switched on 
DATE:2018-07-02 + fxt.Test_log_rcvd(4, "0000003B00000075{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"[[b]]\"}}"); // should be "[[b]]", not "b"; switched on DATE:2018-07-02 } @Test public void Gsub_int() { // PURPOSE: gsub with integer arg should not fail; DATE:2013-11-06 fxt.Init_cbk(Scrib_core.Key_mw_interface, fxt.Core().Lib_ustring(), Scrib_lib_ustring.Invk_gsub); diff --git a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_converter.java b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_converter.java index 31e68ad3e..9b2021154 100644 --- a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_converter.java +++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_converter.java @@ -85,7 +85,9 @@ public class Scrib_regx_converter { } else { // diff char: harder regex; REF.MW: $bfr .= "(?$d1(?:(?>[^$d1$d2]+)|(?P>b$bct))*$d2)"; if (fmtr_balanced == null) { - fmtr_balanced = Bry_fmtr.new_("(?\\~{1}(?:(?>[^\\~{1}\\~{2}]*)|\\~{1}[^\\~{1}\\~{2}]*\\~{2})*\\~{2})", "0", "1", "2"); // NOTE: complicated regex; represents 3 level depth of balanced parens; 4+ won't work; EX:(3(2(1)2)3) PAGE:en.w:Electricity_sector_in_Switzerland DATE:2015-01-23 + // JAVA:recursive regex not possible, but workaround is possible PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02 + // REF: https://stackoverflow.com/questions/47162098/is-it-possible-to-match-nested-brackets-with-regex-without-using-recursion-or-ba/47162099#47162099 + fmtr_balanced = Bry_fmtr.new_("(?=\\~{1})(?:(?=.*?\\~{1}(?!.*?\\1)(.*\\~{2}(?!.*\\2).*))(?=.*?\\~{2}(?!.*?\\2)(.*)).)+?.*?(?=\\1)[^\\~{1}]*(?=\\2$)", "0", "1", "2"); bfr_balanced = Bry_bfr_.Reset(255); } synchronized (fmtr_balanced) { @@ -159,68 +161,53 @@ public class Scrib_regx_converter { regx = bfr.To_str_and_clear(); return regx; } private Bry_bfr bfr = Bry_bfr_.New(); - private int bracketedCharSetToRegex(Bry_bfr bfr, byte[] src, int i, int len) { + private int bracketedCharSetToRegex(Bry_bfr bfr, byte[] pat, int i, 
int len) { bfr.Add_byte(Byte_ascii.Brack_bgn); - ++i; - if (i < len && src[i] == Byte_ascii.Pow) { // ^ + i++; + if (i < len && pat[i] == Byte_ascii.Pow) { // ^ bfr.Add_byte(Byte_ascii.Pow); - ++i; + i++; } - boolean stop = false; - for (; i < len; i++) { - byte tmp_b = src[i]; - switch (tmp_b) { - case Byte_ascii.Brack_end: - stop = true; - break; - case Byte_ascii.Percent: - ++i; - if (i >= len) - stop = true; - else { - Object brack_obj = brack_hash.Get_by_mid(src, i, i + 1); - if (brack_obj != null) - bfr.Add((byte[])brack_obj); - else - Regx_quote(bfr, src[i]); - } - break; - default: - boolean normal = true; - int lhs_pos = i; // NOTE: following block handles MBCS; EX:[𠀀-𯨟] PAGE:en.d:どう DATE:2016-01-22 - int lhs_len = gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(src[lhs_pos]); - int dash_pos = i + lhs_len; - if (dash_pos < len) { - byte dash_char = src[dash_pos]; - if (dash_char == Byte_ascii.Dash) { - int rhs_pos = dash_pos + 1; - if (rhs_pos < len) { - byte rhs_byte = src[rhs_pos]; - if (rhs_byte != Byte_ascii.Brack_end) {// ignore dash if followed by brack_end; EX: [a-]; PAGE:en.d:frei; DATE:2016-01-23 - int rhs_len = gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(rhs_byte); - if (lhs_len == 1) - Regx_quote(bfr, src[i]); - else - bfr.Add_mid(src, i, i + lhs_len); - bfr.Add_byte(Byte_ascii.Dash); - if (rhs_len == 1) - Regx_quote(bfr, src[rhs_pos]); - else - bfr.Add_mid(src, rhs_pos, rhs_pos + rhs_len); - i = rhs_pos + rhs_len - 1; // -1 b/c for() will do ++i - normal = false; - } - } - } - } - if (normal) - Regx_quote(bfr, src[i]); + for (int j = i; i < len && (j == i || pat[i] != Byte_ascii.Brack_end); i++) { + if (pat[i] == Byte_ascii.Percent) { + i++; + if (i >= len) { break; + } + Object brack_obj = brack_hash.Get_by_mid(pat, i, i + 1); + if (brack_obj != null) + bfr.Add((byte[])brack_obj); + else + Regx_quote(bfr, pat[i]); + } + else if (i + 2 < len && pat[i + 1] == Byte_ascii.Dash && pat[i + 2] != Byte_ascii.Brack_end && pat[i + 2] != Byte_ascii.Hash) 
{ + if ((pat[i] & 0xFF) <= (pat[i + 2] & 0xFF)) { // NOTE: compare as unsigned; Java bytes are signed, so a UTF-8 byte >= 0x80 would otherwise sort below ASCII and a valid range such as [a-é] would be dropped + Regx_quote(bfr, pat[i]); + bfr.Add_byte(Byte_ascii.Dash); + Regx_quote(bfr, pat[i + 2]); + } + i += 2; + } + else { + Regx_quote(bfr, pat[i]); } - if (stop) break; } - if (i >= len) throw Err_.new_wo_type("Missing close-bracket for character set beginning at pattern character $nxt_pos"); + if (i >= len) throw Err_.new_wo_type("Missing close-bracket for character set beginning at pattern character $nxt_pos"); // NOTE: must be ">="; the loop above never leaves i > len, so ">" made this check unreachable and silently closed unterminated sets; EX:"[a" bfr.Add_byte(Byte_ascii.Brack_end); + + // NOTE: the literal patterns "[]" and "[^]" never reach this point (the first "]" is taken as a set member), but a set whose ranges were all dropped above still yields an empty set; EX:"[z-a]" -> "[]"; NOTE(review): the checks below inspect the entire bfr, so they only fire when bfr holds nothing but this set -- confirm against the caller + // Lua just ignores invalid ranges, while pcre throws an error. + // We filter them out above, but then we need to special-case empty sets + int bfr_len = bfr.Len(); + byte[] bfr_bry = bfr.Bfr(); + if (bfr_len == 2 && bfr_bry[0] == Byte_ascii.Brack_bgn && bfr_bry[1] == Byte_ascii.Brack_end) { + // JAVA: the PCRE verb "(*FAIL)" used by REF.MW is not valid java.util.regex syntax (the "*" is rejected as a dangling metacharacter); NOTE(review): assumes the regex is compiled by java.util.regex + // use an empty negative lookahead instead, which never matches; wrapped in a non-capturing group so that it can be quantified + bfr.Clear().Add_str_a7("(?:(?!))"); + } + else if (bfr_len == 3 && bfr_bry[0] == Byte_ascii.Brack_bgn && bfr_bry[1] == Byte_ascii.Pow && bfr_bry[2] == Byte_ascii.Brack_end) { + bfr.Clear().Add_str_a7(".");// 's' modifier is always used, so this works + } return i; } boolean grps_open_Has(List_adp list, int v) { diff --git a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_converter_tst.java b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_converter_tst.java index b97df73bb..8503a9ab2 100644 --- a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_converter_tst.java +++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_regx_converter_tst.java @@ -27,7 +27,7 @@ public class Scrib_regx_converter_tst { @Test public void Percent_has() {fxt.Test_parse("%a" , "\\p{L}");} @Test public void Percent_na() {fxt.Test_parse("%y" , "y");} @Test public void Percent_b00() {fxt.Test_parse("%b00" , "{0}[^0]*0");} - @Test public void Percent_b01() 
{fxt.Test_parse("%b01" , "(?\\0(?:(?>[^\\0\\1]*)|\\0[^\\0\\1]*\\1)*\\1)");} + @Test public void Percent_b01() {fxt.Test_parse("%b01" , "(?=\\0)(?:(?=.*?\\0(?!.*?\\1)(.*\\1(?!.*\\2).*))(?=.*?\\1(?!.*?\\2)(.*)).)+?.*?(?=\\1)[^\\0]*(?=\\2$)");} // @Test public void Percent_num() {fxt.Test_parse("()%1" , "(?)\\g{m1}");} @Test public void Percent_text() {fxt.Test_parse("%e" , "e");} @Test public void Brack_pow() {fxt.Test_parse("[^a]" , "[^a]");} @@ -42,11 +42,16 @@ public class Scrib_regx_converter_tst { @Test public void Balanced() { fxt.Test_replace("a(1)c" , "%b()", "b", "abc"); fxt.Test_replace("a(2(1)2)c" , "%b()", "b", "abc"); - fxt.Test_replace("a(3(2(1)2)3)c" , "%b()", "b", "a(3b3)c"); + fxt.Test_replace("a(3(2(1)2)3)c" , "%b()", "b", "abc"); + } + @Test public void Balanced_nested() { // handle nested; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02 + fxt.Test_replace("[[a|b[[c]]d]] p1" , "%b[]", "z", "z p1"); } @Test public void Mbcs() { // PURPOSE: handle regex for multi-byte chars; PAGE:en.d:どう; DATE:2016-01-22; .NET.REGX:fails fxt.Test_replace("𠀀" , "[𠀀-𯨟]" , "a", "a"); } +// @Test public void Brack_empty_all() {fxt.Test_parse("[]" , "(?:(*FAIL))");} +// @Test public void Brack_empty_not() {fxt.Test_parse("[^]" , ".");} } class Scrib_regx_converter_fxt { private Scrib_regx_converter under;