1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2024-10-27 20:34:16 +00:00

Wikibase: Fix regex for balanced pairs

This commit is contained in:
gnosygnu 2018-07-02 22:56:26 -04:00
parent 070eb3c53a
commit eb9cca66ed
6 changed files with 66 additions and 65 deletions

View File

@ -161,10 +161,10 @@ public class Scrib_lib_ustring implements Scrib_lib {
return rslt.Init_many_objs(match.Find_end(), Scrib_kv_utl_.base1_list_(tmp_list));
}
private void AddCapturesFromMatch(List_adp tmp_list, Regx_match rslt, String text, Keyval[] capts, boolean op_is_match) {// NOTE: this matches behavior in UstringLibrary.php!addCapturesFromMatch
Regx_group[] grps = rslt.Groups();
int grps_len = grps.length;
int capts_len = capts == null ? 0 : capts.length;
if (grps_len > 0) {
if (capts_len > 0) { // NOTE: changed from "grps_len > 0"; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02
Regx_group[] grps = rslt.Groups();
int grps_len = grps.length;
for (int j = 0; j < grps_len; j++) {
Regx_group grp = grps[j];
if ( j < capts_len // bounds check b/c null can be passed
@ -338,6 +338,10 @@ class Scrib_lib_ustring_gsub_mgr {
break;
}
case Repl_tid_luacbk: {
String find_str = String_.Mid(text, match.Find_bgn(), match.Find_end());
Keyval[] luacbk_args = Scrib_kv_utl_.base1_obj_(find_str);
/*
TOMBSTONE: was causing garbled text on PAGE:en.w:Portal:Bahamas DATE:2018-07-02
Keyval[] luacbk_args = null;
Regx_group[] grps = match.Groups();
int grps_len = grps.length;
@ -353,6 +357,7 @@ class Scrib_lib_ustring_gsub_mgr {
luacbk_args[i] = Keyval_.int_(i + Scrib_core.Base_1, find_str);
}
}
*/
Keyval[] rslts = core.Interpreter().CallFunction(repl_func.Id(), luacbk_args);
if (rslts.length == 0) // will be 0 when gsub_proc returns nil; PAGE:en.d:tracer; DATE:2017-04-22
return false;

View File

@ -27,7 +27,8 @@ public class Scrib_lib_ustring__gsub__tst {
Exec_gsub("a" , "(a)" , 1, "%%%1" , "%a;1");
Exec_gsub("à{b}c", "{b}" , 1, "b" , "àbc;1"); // utf8
Exec_gsub("àbc", "^%s*(.-)%s*$" , 1, "%1" , "àbc;1"); // utf8; regx is for trim line
Exec_gsub("a" , "[^]" , 1, "b" , "a;0"); // invalid regx should not fail; should return self; DATE:2013-10-20
// TOMBSTONE: tested with local MW and {{#invoke:Test|test16|a|[^]|b}} -> Lua error: Missing close-bracket for character set beginning at pattern character 1.; DATE:2018-07-02
// Exec_gsub("a" , "[^]" , 1, "b" , "a;0"); // invalid regx should not fail; should return self; DATE:2013-10-20
}
@Test public void Replace__none() {// PURPOSE: gsub with no replace argument should not fail; EX:d:'orse; DATE:2013-10-14
fxt.Test__proc__objs__flat(lib, Scrib_lib_ustring.Invk_gsub, Object_.Ary("text", "regx") , "text"); // NOTE: repl, limit deliberately omitted

View File

@ -38,6 +38,9 @@ public class Scrib_lib_ustring__match__tst {
// Invokes Invk_match with a single argument keyed at index 2 (nothing at index 1).
// NOTE(review): Test__proc__kvps__empty is not visible here; by its name it presumably expects an empty result -- confirm against the fixture.
@Test
public void Args_out_of_order() {
	String arg_at_idx_2 = "[a]";
	fxt.Test__proc__kvps__empty(
		lib,
		Scrib_lib_ustring.Invk_match,
		Keyval_.Ary(Keyval_.int_(2, arg_at_idx_2)));
}
// PURPOSE: match trailing whitespace; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02
// NOTE(review): Exec_match is defined outside this view; the last argument is presumably the expected match text -- confirm.
@Test
public void Include_trailing_whitespace() {
	String text = "[[a]] b";
	String pattern = "%b[]%s*"; // balanced "[" "]" pair followed by "%s*"
	String expd = "[[a]] ";     // note the trailing space
	Exec_match(text, pattern, 1, expd);
}
// @Test public void Match_viwiktionary() {
// fxt.Init_cbk(Scrib_core.Key_mw_interface, fxt.Core().Lib_ustring(), Scrib_lib_ustring.Invk_match);
// Exec_match("tr" , "()(r)", 1, ";"); // should return all matches

View File

@ -28,13 +28,13 @@ public class Scrib_lib_ustring__shell_cmd__tst {
@Test public void Gsub_proc_w_grouped() { // PURPOSE: gsub_proc should pass matched String, not entire String; DATE:2013-12-01
fxt.Init_cbk(Scrib_core.Key_mw_interface, fxt.Core().Lib_ustring(), Scrib_lib_ustring.Invk_gsub);
Exec_gsub_regx_func_1("[[a]]", "%[%[([^#|%]]-)%]%]" , "A;1");
fxt.Test_log_rcvd(3, "000000370000006D{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"a\"}}"); // should be "a", not "[[a]]"
fxt.Test_log_rcvd(3, "0000003B00000075{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"[[a]]\"}}"); // should be "[[a]]", not "a"; switched on DATE:2018-07-02
}
@Test public void Gsub_proc_w_grouped_2() {// PURPOSE: gsub_proc failed when passing multiple matches; DATE:2013-12-01
fxt.Init_cbk(Scrib_core.Key_mw_interface, fxt.Core().Lib_ustring(), Scrib_lib_ustring.Invk_gsub);
Exec_gsub_regx_func_2("[[a]] [[b]]", "%[%[([^#|%]]-)%]%]" , "A B;2");
fxt.Test_log_rcvd(3, "000000370000006D{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"a\"}}"); // should be "a", not "[[a]]"
fxt.Test_log_rcvd(4, "000000370000006D{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"b\"}}"); // should be "b", not "[[b]]"
fxt.Test_log_rcvd(3, "0000003B00000075{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"[[a]]\"}}"); // should be "[[a]]", not "a"; switched on DATE:2018-07-02
fxt.Test_log_rcvd(4, "0000003B00000075{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"[[b]]\"}}"); // should be "[[b]]", not "b"; switched on DATE:2018-07-02
}
@Test public void Gsub_int() { // PURPOSE: gsub with integer arg should not fail; DATE:2013-11-06
fxt.Init_cbk(Scrib_core.Key_mw_interface, fxt.Core().Lib_ustring(), Scrib_lib_ustring.Invk_gsub);

View File

@ -85,7 +85,9 @@ public class Scrib_regx_converter {
}
else { // diff char: harder regex; REF.MW: $bfr .= "(?<b$bct>$d1(?:(?>[^$d1$d2]+)|(?P>b$bct))*$d2)";
if (fmtr_balanced == null) {
fmtr_balanced = Bry_fmtr.new_("(?<b~{0}>\\~{1}(?:(?>[^\\~{1}\\~{2}]*)|\\~{1}[^\\~{1}\\~{2}]*\\~{2})*\\~{2})", "0", "1", "2"); // NOTE: complicated regex; represents 3 level depth of balanced parens; 4+ won't work; EX:(3(2(1)2)3) PAGE:en.w:Electricity_sector_in_Switzerland DATE:2015-01-23
// JAVA:recursive regex not possible, but workaround is possible PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02
// REF: https://stackoverflow.com/questions/47162098/is-it-possible-to-match-nested-brackets-with-regex-without-using-recursion-or-ba/47162099#47162099
fmtr_balanced = Bry_fmtr.new_("(?=\\~{1})(?:(?=.*?\\~{1}(?!.*?\\1)(.*\\~{2}(?!.*\\2).*))(?=.*?\\~{2}(?!.*?\\2)(.*)).)+?.*?(?=\\1)[^\\~{1}]*(?=\\2$)", "0", "1", "2");
bfr_balanced = Bry_bfr_.Reset(255);
}
synchronized (fmtr_balanced) {
@ -159,68 +161,53 @@ public class Scrib_regx_converter {
regx = bfr.To_str_and_clear();
return regx;
} private Bry_bfr bfr = Bry_bfr_.New();
private int bracketedCharSetToRegex(Bry_bfr bfr, byte[] src, int i, int len) {
private int bracketedCharSetToRegex(Bry_bfr bfr, byte[] pat, int i, int len) {
bfr.Add_byte(Byte_ascii.Brack_bgn);
++i;
if (i < len && src[i] == Byte_ascii.Pow) { // ^
i++;
if (i < len && pat[i] == Byte_ascii.Pow) { // ^
bfr.Add_byte(Byte_ascii.Pow);
++i;
i++;
}
boolean stop = false;
for (; i < len; i++) {
byte tmp_b = src[i];
switch (tmp_b) {
case Byte_ascii.Brack_end:
stop = true;
break;
case Byte_ascii.Percent:
++i;
if (i >= len)
stop = true;
else {
Object brack_obj = brack_hash.Get_by_mid(src, i, i + 1);
if (brack_obj != null)
bfr.Add((byte[])brack_obj);
else
Regx_quote(bfr, src[i]);
}
break;
default:
boolean normal = true;
int lhs_pos = i; // NOTE: following block handles MBCS; EX:[𠀀-𯨟] PAGE:en.d:どう DATE:2016-01-22
int lhs_len = gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(src[lhs_pos]);
int dash_pos = i + lhs_len;
if (dash_pos < len) {
byte dash_char = src[dash_pos];
if (dash_char == Byte_ascii.Dash) {
int rhs_pos = dash_pos + 1;
if (rhs_pos < len) {
byte rhs_byte = src[rhs_pos];
if (rhs_byte != Byte_ascii.Brack_end) {// ignore dash if followed by brack_end; EX: [a-]; PAGE:en.d:frei; DATE:2016-01-23
int rhs_len = gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(rhs_byte);
if (lhs_len == 1)
Regx_quote(bfr, src[i]);
else
bfr.Add_mid(src, i, i + lhs_len);
bfr.Add_byte(Byte_ascii.Dash);
if (rhs_len == 1)
Regx_quote(bfr, src[rhs_pos]);
else
bfr.Add_mid(src, rhs_pos, rhs_pos + rhs_len);
i = rhs_pos + rhs_len - 1; // -1 b/c for() will do ++i
normal = false;
}
}
}
}
if (normal)
Regx_quote(bfr, src[i]);
for (int j = i; i < len && (j == i || pat[i] != Byte_ascii.Brack_end); i++) {
if (pat[i] == Byte_ascii.Percent) {
i++;
if (i >= len) {
break;
}
Object brack_obj = brack_hash.Get_by_mid(pat, i, i + 1);
if (brack_obj != null)
bfr.Add((byte[])brack_obj);
else
Regx_quote(bfr, pat[i]);
}
else if (i + 2 < len && pat[i + 1] == Byte_ascii.Dash && pat[i + 2] != Byte_ascii.Brack_end && pat[i + 2] != Byte_ascii.Hash) {
if (pat[i] <= pat[i + 2]) {
Regx_quote(bfr, pat[i]);
bfr.Add_byte(Byte_ascii.Dash);
Regx_quote(bfr, pat[i + 2]);
}
i += 2;
}
else {
Regx_quote(bfr, pat[i]);
}
if (stop) break;
}
if (i >= len) throw Err_.new_wo_type("Missing close-bracket for character set beginning at pattern character $nxt_pos");
if (i > len) throw Err_.new_wo_type("Missing close-bracket for character set beginning at pattern character $nxt_pos");
bfr.Add_byte(Byte_ascii.Brack_end);
// TOMBSTONE: below code will never run as it's not possible to generate "[]" or "[^]"; DATE:2018-07-01
// Lua just ignores invalid ranges, while pcre throws an error.
// We filter them out above, but then we need to special-case empty sets
int bfr_len = bfr.Len();
byte[] bfr_bry = bfr.Bfr();
if (bfr_len == 2 && bfr_bry[0] == Byte_ascii.Brack_bgn && bfr_bry[1] == Byte_ascii.Brack_end) {
// Can't directly quantify (*FAIL), so wrap it.
// "(?!)" would be simpler and could be quantified if not for a bug in PCRE 8.13 to 8.33
bfr.Clear().Add_str_a7("(?:(*FAIL))");
}
else if (bfr_len == 3 && bfr_bry[0] == Byte_ascii.Brack_bgn && bfr_bry[1] == Byte_ascii.Pow && bfr_bry[2] == Byte_ascii.Brack_end) {
bfr.Clear().Add_str_a7(".");// 's' modifier is always used, so this works
}
return i;
}
boolean grps_open_Has(List_adp list, int v) {

View File

@ -27,7 +27,7 @@ public class Scrib_regx_converter_tst {
// Passes the Lua class "%a" and the Java regex "\p{L}" to the fixture.
// NOTE(review): Test_parse is not visible here; presumably (lua pattern, expected regex) -- confirm.
@Test
public void Percent_has() {
	String pattern = "%a";
	String expd = "\\p{L}";
	fxt.Test_parse(pattern, expd);
}
// Passes "%y" and the plain character "y" to the fixture.
// NOTE(review): Test_parse is not visible here; presumably (lua pattern, expected regex) -- confirm.
@Test
public void Percent_na() {
	String pattern = "%y";
	String expd = "y";
	fxt.Test_parse(pattern, expd);
}
// "%b" with the same character ("0") as both delimiters.
// NOTE(review): Test_parse is not visible here; presumably (lua pattern, expected regex) -- confirm.
@Test
public void Percent_b00() {
	String pattern = "%b00";
	String expd = "{0}[^0]*0";
	fxt.Test_parse(pattern, expd);
}
@Test public void Percent_b01() {fxt.Test_parse("%b01" , "(?<b1>\\0(?:(?>[^\\0\\1]*)|\\0[^\\0\\1]*\\1)*\\1)");}
@Test public void Percent_b01() {fxt.Test_parse("%b01" , "(?=\\0)(?:(?=.*?\\0(?!.*?\\1)(.*\\1(?!.*\\2).*))(?=.*?\\1(?!.*?\\2)(.*)).)+?.*?(?=\\1)[^\\0]*(?=\\2$)");}
// @Test public void Percent_num() {fxt.Test_parse("()%1" , "(?<m1>)\\g{m1}");}
// Passes "%e" and the plain character "e" to the fixture.
// NOTE(review): Test_parse is not visible here; presumably (lua pattern, expected regex) -- confirm.
@Test
public void Percent_text() {
	String pattern = "%e";
	String expd = "e";
	fxt.Test_parse(pattern, expd);
}
// Passes the negated set "[^a]" as both arguments, i.e. the same text is expected back.
// NOTE(review): Test_parse is not visible here; presumably (lua pattern, expected regex) -- confirm.
@Test
public void Brack_pow() {
	String pattern = "[^a]";
	String expd = "[^a]";
	fxt.Test_parse(pattern, expd);
}
@ -42,11 +42,16 @@ public class Scrib_regx_converter_tst {
@Test public void Balanced() {
fxt.Test_replace("a(1)c" , "%b()", "b", "abc");
fxt.Test_replace("a(2(1)2)c" , "%b()", "b", "abc");
fxt.Test_replace("a(3(2(1)2)3)c" , "%b()", "b", "a(3b3)c");
fxt.Test_replace("a(3(2(1)2)3)c" , "%b()", "b", "abc");
}
// handle nested; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02
// NOTE(review): Test_replace is not visible here; presumably (source, lua pattern, replacement, expected) -- confirm.
@Test
public void Balanced_nested() {
	String src = "[[a|b[[c]]d]] p1"; // "[[...]]" containing an inner "[[c]]"
	String pattern = "%b[]";
	String repl = "z";
	String expd = "z p1";
	fxt.Test_replace(src, pattern, repl, expd);
}
// PURPOSE: handle regex for multi-byte chars; PAGE:en.d:どう; DATE:2016-01-22; .NET.REGX:fails
// NOTE(review): Test_replace is not visible here; presumably (source, lua pattern, replacement, expected) -- confirm.
@Test
public void Mbcs() {
	String src = "𠀀";
	String pattern = "[𠀀-𯨟]"; // bracketed range whose endpoints are multi-byte chars
	String repl = "a";
	String expd = "a";
	fxt.Test_replace(src, pattern, repl, expd);
}
// @Test public void Brack_empty_all() {fxt.Test_parse("[]" , "(?:(*FAIL))");}
// @Test public void Brack_empty_not() {fxt.Test_parse("[^]" , ".");}
}
class Scrib_regx_converter_fxt {
private Scrib_regx_converter under;