Mirror of https://github.com/gnosygnu/xowa.git (synced 2024-10-27 20:34:16 +00:00)
Wikibase: Fix regex for balanced pairs
This commit is contained in:
Parent: 070eb3c53a
Commit: eb9cca66ed
@@ -161,10 +161,10 @@ public class Scrib_lib_ustring implements Scrib_lib {
|
||||
return rslt.Init_many_objs(match.Find_end(), Scrib_kv_utl_.base1_list_(tmp_list));
|
||||
}
|
||||
private void AddCapturesFromMatch(List_adp tmp_list, Regx_match rslt, String text, Keyval[] capts, boolean op_is_match) {// NOTE: this matches behavior in UstringLibrary.php!addCapturesFromMatch
|
||||
int capts_len = capts == null ? 0 : capts.length;
|
||||
if (capts_len > 0) { // NOTE: changed from "grps_len > 0"; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02
|
||||
Regx_group[] grps = rslt.Groups();
|
||||
int grps_len = grps.length;
|
||||
int capts_len = capts == null ? 0 : capts.length;
|
||||
if (grps_len > 0) {
|
||||
for (int j = 0; j < grps_len; j++) {
|
||||
Regx_group grp = grps[j];
|
||||
if ( j < capts_len // bounds check b/c null can be passed
|
||||
@@ -338,6 +338,10 @@ class Scrib_lib_ustring_gsub_mgr {
|
||||
break;
|
||||
}
|
||||
case Repl_tid_luacbk: {
|
||||
String find_str = String_.Mid(text, match.Find_bgn(), match.Find_end());
|
||||
Keyval[] luacbk_args = Scrib_kv_utl_.base1_obj_(find_str);
|
||||
/*
|
||||
TOMBSTONE: was causing garbled text on PAGE:en.w:Portal:Bahamas DATE:2018-07-02
|
||||
Keyval[] luacbk_args = null;
|
||||
Regx_group[] grps = match.Groups();
|
||||
int grps_len = grps.length;
|
||||
@@ -353,6 +357,7 @@ class Scrib_lib_ustring_gsub_mgr {
|
||||
luacbk_args[i] = Keyval_.int_(i + Scrib_core.Base_1, find_str);
|
||||
}
|
||||
}
|
||||
*/
|
||||
Keyval[] rslts = core.Interpreter().CallFunction(repl_func.Id(), luacbk_args);
|
||||
if (rslts.length == 0) // will be 0 when gsub_proc returns nil; PAGE:en.d:tracer; DATE:2017-04-22
|
||||
return false;
|
||||
|
@@ -27,7 +27,8 @@ public class Scrib_lib_ustring__gsub__tst {
|
||||
Exec_gsub("a" , "(a)" , 1, "%%%1" , "%a;1");
|
||||
Exec_gsub("à{b}c", "{b}" , 1, "b" , "àbc;1"); // utf8
|
||||
Exec_gsub("àbc", "^%s*(.-)%s*$" , 1, "%1" , "àbc;1"); // utf8; regx is for trim line
|
||||
Exec_gsub("a" , "[^]" , 1, "b" , "a;0"); // invalid regx should not fail; should return self; DATE:2013-10-20
|
||||
// TOMBSTONE: tested with local MW and {{#invoke:Test|test16|a|[^]|b}} -> Lua error: Missing close-bracket for character set beginning at pattern character 1.; DATE:2018-07-02
|
||||
// Exec_gsub("a" , "[^]" , 1, "b" , "a;0"); // invalid regx should not fail; should return self; DATE:2013-10-20
|
||||
}
|
||||
@Test public void Replace__none() {// PURPOSE: gsub with no replace argument should not fail; EX:d:'orse; DATE:2013-10-14
|
||||
fxt.Test__proc__objs__flat(lib, Scrib_lib_ustring.Invk_gsub, Object_.Ary("text", "regx") , "text"); // NOTE: repl, limit deliberately omitted
|
||||
|
@@ -38,6 +38,9 @@ public class Scrib_lib_ustring__match__tst {
|
||||
@Test public void Args_out_of_order() {
|
||||
fxt.Test__proc__kvps__empty(lib, Scrib_lib_ustring.Invk_match, Keyval_.Ary(Keyval_.int_(2, "[a]")));
|
||||
}
|
||||
@Test public void Include_trailing_whitespace() { // PURPOSE: match trailing whitespace; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02
|
||||
Exec_match("[[a]] b", "%b[]%s*", 1, "[[a]] ");
|
||||
}
|
||||
// @Test public void Match_viwiktionary() {
|
||||
// fxt.Init_cbk(Scrib_core.Key_mw_interface, fxt.Core().Lib_ustring(), Scrib_lib_ustring.Invk_match);
|
||||
// Exec_match("tr" , "()(r)", 1, ";"); // should return all matches
|
||||
|
@@ -28,13 +28,13 @@ public class Scrib_lib_ustring__shell_cmd__tst {
|
||||
@Test public void Gsub_proc_w_grouped() { // PURPOSE: gsub_proc should pass matched String, not entire String; DATE:2013-12-01
|
||||
fxt.Init_cbk(Scrib_core.Key_mw_interface, fxt.Core().Lib_ustring(), Scrib_lib_ustring.Invk_gsub);
|
||||
Exec_gsub_regx_func_1("[[a]]", "%[%[([^#|%]]-)%]%]" , "A;1");
|
||||
fxt.Test_log_rcvd(3, "000000370000006D{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"a\"}}"); // should be "a", not "[[a]]"
|
||||
fxt.Test_log_rcvd(3, "0000003B00000075{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"[[a]]\"}}"); // should be "[[a]]", not "a"; switched on DATE:2018-07-02
|
||||
}
|
||||
@Test public void Gsub_proc_w_grouped_2() {// PURPOSE: gsub_proc failed when passing multiple matches; DATE:2013-12-01
|
||||
fxt.Init_cbk(Scrib_core.Key_mw_interface, fxt.Core().Lib_ustring(), Scrib_lib_ustring.Invk_gsub);
|
||||
Exec_gsub_regx_func_2("[[a]] [[b]]", "%[%[([^#|%]]-)%]%]" , "A B;2");
|
||||
fxt.Test_log_rcvd(3, "000000370000006D{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"a\"}}"); // should be "a", not "[[a]]"
|
||||
fxt.Test_log_rcvd(4, "000000370000006D{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"b\"}}"); // should be "b", not "[[b]]"
|
||||
fxt.Test_log_rcvd(3, "0000003B00000075{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"[[a]]\"}}"); // should be "[[a]]", not "a"; switched on DATE:2018-07-02
|
||||
fxt.Test_log_rcvd(4, "0000003B00000075{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"[[b]]\"}}"); // should be "[[b]]", not "b"; switched on DATE:2018-07-02
|
||||
}
|
||||
@Test public void Gsub_int() { // PURPOSE: gsub with integer arg should not fail; DATE:2013-11-06
|
||||
fxt.Init_cbk(Scrib_core.Key_mw_interface, fxt.Core().Lib_ustring(), Scrib_lib_ustring.Invk_gsub);
|
||||
|
@@ -85,7 +85,9 @@ public class Scrib_regx_converter {
|
||||
}
|
||||
else { // diff char: harder regex; REF.MW: $bfr .= "(?<b$bct>$d1(?:(?>[^$d1$d2]+)|(?P>b$bct))*$d2)";
|
||||
if (fmtr_balanced == null) {
|
||||
fmtr_balanced = Bry_fmtr.new_("(?<b~{0}>\\~{1}(?:(?>[^\\~{1}\\~{2}]*)|\\~{1}[^\\~{1}\\~{2}]*\\~{2})*\\~{2})", "0", "1", "2"); // NOTE: complicated regex; represents 3 level depth of balanced parens; 4+ won't work; EX:(3(2(1)2)3) PAGE:en.w:Electricity_sector_in_Switzerland DATE:2015-01-23
|
||||
// JAVA:recursive regex not possible, but workaround is possible PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02
|
||||
// REF: https://stackoverflow.com/questions/47162098/is-it-possible-to-match-nested-brackets-with-regex-without-using-recursion-or-ba/47162099#47162099
|
||||
fmtr_balanced = Bry_fmtr.new_("(?=\\~{1})(?:(?=.*?\\~{1}(?!.*?\\1)(.*\\~{2}(?!.*\\2).*))(?=.*?\\~{2}(?!.*?\\2)(.*)).)+?.*?(?=\\1)[^\\~{1}]*(?=\\2$)", "0", "1", "2");
|
||||
bfr_balanced = Bry_bfr_.Reset(255);
|
||||
}
|
||||
synchronized (fmtr_balanced) {
|
||||
@@ -159,68 +161,53 @@ public class Scrib_regx_converter {
|
||||
regx = bfr.To_str_and_clear();
|
||||
return regx;
|
||||
} private Bry_bfr bfr = Bry_bfr_.New();
|
||||
private int bracketedCharSetToRegex(Bry_bfr bfr, byte[] src, int i, int len) {
|
||||
private int bracketedCharSetToRegex(Bry_bfr bfr, byte[] pat, int i, int len) {
|
||||
bfr.Add_byte(Byte_ascii.Brack_bgn);
|
||||
++i;
|
||||
if (i < len && src[i] == Byte_ascii.Pow) { // ^
|
||||
i++;
|
||||
if (i < len && pat[i] == Byte_ascii.Pow) { // ^
|
||||
bfr.Add_byte(Byte_ascii.Pow);
|
||||
++i;
|
||||
i++;
|
||||
}
|
||||
boolean stop = false;
|
||||
for (; i < len; i++) {
|
||||
byte tmp_b = src[i];
|
||||
switch (tmp_b) {
|
||||
case Byte_ascii.Brack_end:
|
||||
stop = true;
|
||||
for (int j = i; i < len && (j == i || pat[i] != Byte_ascii.Brack_end); i++) {
|
||||
if (pat[i] == Byte_ascii.Percent) {
|
||||
i++;
|
||||
if (i >= len) {
|
||||
break;
|
||||
case Byte_ascii.Percent:
|
||||
++i;
|
||||
if (i >= len)
|
||||
stop = true;
|
||||
else {
|
||||
Object brack_obj = brack_hash.Get_by_mid(src, i, i + 1);
|
||||
}
|
||||
Object brack_obj = brack_hash.Get_by_mid(pat, i, i + 1);
|
||||
if (brack_obj != null)
|
||||
bfr.Add((byte[])brack_obj);
|
||||
else
|
||||
Regx_quote(bfr, src[i]);
|
||||
Regx_quote(bfr, pat[i]);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
boolean normal = true;
|
||||
int lhs_pos = i; // NOTE: following block handles MBCS; EX:[𠀀-] PAGE:en.d:どう DATE:2016-01-22
|
||||
int lhs_len = gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(src[lhs_pos]);
|
||||
int dash_pos = i + lhs_len;
|
||||
if (dash_pos < len) {
|
||||
byte dash_char = src[dash_pos];
|
||||
if (dash_char == Byte_ascii.Dash) {
|
||||
int rhs_pos = dash_pos + 1;
|
||||
if (rhs_pos < len) {
|
||||
byte rhs_byte = src[rhs_pos];
|
||||
if (rhs_byte != Byte_ascii.Brack_end) {// ignore dash if followed by brack_end; EX: [a-]; PAGE:en.d:frei; DATE:2016-01-23
|
||||
int rhs_len = gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(rhs_byte);
|
||||
if (lhs_len == 1)
|
||||
Regx_quote(bfr, src[i]);
|
||||
else
|
||||
bfr.Add_mid(src, i, i + lhs_len);
|
||||
else if (i + 2 < len && pat[i + 1] == Byte_ascii.Dash && pat[i + 2] != Byte_ascii.Brack_end && pat[i + 2] != Byte_ascii.Hash) {
|
||||
if (pat[i] <= pat[i + 2]) {
|
||||
Regx_quote(bfr, pat[i]);
|
||||
bfr.Add_byte(Byte_ascii.Dash);
|
||||
if (rhs_len == 1)
|
||||
Regx_quote(bfr, src[rhs_pos]);
|
||||
else
|
||||
bfr.Add_mid(src, rhs_pos, rhs_pos + rhs_len);
|
||||
i = rhs_pos + rhs_len - 1; // -1 b/c for() will do ++i
|
||||
normal = false;
|
||||
Regx_quote(bfr, pat[i + 2]);
|
||||
}
|
||||
i += 2;
|
||||
}
|
||||
else {
|
||||
Regx_quote(bfr, pat[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (normal)
|
||||
Regx_quote(bfr, src[i]);
|
||||
break;
|
||||
}
|
||||
if (stop) break;
|
||||
}
|
||||
if (i >= len) throw Err_.new_wo_type("Missing close-bracket for character set beginning at pattern character $nxt_pos");
|
||||
if (i > len) throw Err_.new_wo_type("Missing close-bracket for character set beginning at pattern character $nxt_pos");
|
||||
bfr.Add_byte(Byte_ascii.Brack_end);
|
||||
|
||||
// TOMBSTONE: below code will never run as it's not possible to generate "[]" or "[^]"; DATE:2018-07-01
|
||||
// Lua just ignores invalid ranges, while pcre throws an error.
|
||||
// We filter them out above, but then we need to special-case empty sets
|
||||
int bfr_len = bfr.Len();
|
||||
byte[] bfr_bry = bfr.Bfr();
|
||||
if (bfr_len == 2 && bfr_bry[0] == Byte_ascii.Brack_bgn && bfr_bry[1] == Byte_ascii.Brack_end) {
|
||||
// Can't directly quantify (*FAIL), so wrap it.
|
||||
// "(?!)" would be simpler and could be quantified if not for a bug in PCRE 8.13 to 8.33
|
||||
bfr.Clear().Add_str_a7("(?:(*FAIL))");
|
||||
}
|
||||
else if (bfr_len == 3 && bfr_bry[0] == Byte_ascii.Brack_bgn && bfr_bry[1] == Byte_ascii.Pow && bfr_bry[2] == Byte_ascii.Brack_end) {
|
||||
bfr.Clear().Add_str_a7(".");// 's' modifier is always used, so this works
|
||||
}
|
||||
return i;
|
||||
}
|
||||
boolean grps_open_Has(List_adp list, int v) {
|
||||
|
@@ -27,7 +27,7 @@ public class Scrib_regx_converter_tst {
|
||||
@Test public void Percent_has() {fxt.Test_parse("%a" , "\\p{L}");}
|
||||
@Test public void Percent_na() {fxt.Test_parse("%y" , "y");}
|
||||
@Test public void Percent_b00() {fxt.Test_parse("%b00" , "{0}[^0]*0");}
|
||||
@Test public void Percent_b01() {fxt.Test_parse("%b01" , "(?<b1>\\0(?:(?>[^\\0\\1]*)|\\0[^\\0\\1]*\\1)*\\1)");}
|
||||
@Test public void Percent_b01() {fxt.Test_parse("%b01" , "(?=\\0)(?:(?=.*?\\0(?!.*?\\1)(.*\\1(?!.*\\2).*))(?=.*?\\1(?!.*?\\2)(.*)).)+?.*?(?=\\1)[^\\0]*(?=\\2$)");}
|
||||
// @Test public void Percent_num() {fxt.Test_parse("()%1" , "(?<m1>)\\g{m1}");}
|
||||
@Test public void Percent_text() {fxt.Test_parse("%e" , "e");}
|
||||
@Test public void Brack_pow() {fxt.Test_parse("[^a]" , "[^a]");}
|
||||
@@ -42,11 +42,16 @@ public class Scrib_regx_converter_tst {
|
||||
@Test public void Balanced() {
|
||||
fxt.Test_replace("a(1)c" , "%b()", "b", "abc");
|
||||
fxt.Test_replace("a(2(1)2)c" , "%b()", "b", "abc");
|
||||
fxt.Test_replace("a(3(2(1)2)3)c" , "%b()", "b", "a(3b3)c");
|
||||
fxt.Test_replace("a(3(2(1)2)3)c" , "%b()", "b", "abc");
|
||||
}
|
||||
@Test public void Balanced_nested() { // handle nested; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02
|
||||
fxt.Test_replace("[[a|b[[c]]d]] p1" , "%b[]", "z", "z p1");
|
||||
}
|
||||
@Test public void Mbcs() { // PURPOSE: handle regex for multi-byte chars; PAGE:en.d:どう; DATE:2016-01-22; .NET.REGX:fails
|
||||
fxt.Test_replace("𠀀" , "[𠀀-]" , "a", "a");
|
||||
}
|
||||
// @Test public void Brack_empty_all() {fxt.Test_parse("[]" , "(?:(*FAIL))");}
|
||||
// @Test public void Brack_empty_not() {fxt.Test_parse("[^]" , ".");}
|
||||
}
|
||||
class Scrib_regx_converter_fxt {
|
||||
private Scrib_regx_converter under;
|
||||
|
Loading…
Reference in New Issue
Block a user