1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2024-10-27 20:34:16 +00:00

Scribunto: Handle fake capturing groups from balanced regex

This commit is contained in:
gnosygnu 2018-07-08 07:51:43 -04:00
parent f85cf8ad77
commit d98639c18a
7 changed files with 207 additions and 88 deletions

View File

@ -14,13 +14,19 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.langs.regxs; import gplx.*; import gplx.langs.*;
public class Regx_match {
public Regx_match(boolean rslt, int find_bgn, int find_end, Regx_group[] groups) {this.rslt = rslt; this.find_bgn = find_bgn; this.find_end = find_end; this.groups = groups;}
public boolean Rslt() {return rslt;} private boolean rslt;
public class Regx_match {
public Regx_match(boolean rslt, int find_bgn, int find_end, Regx_group[] groups) {
this.rslt = rslt;
this.find_bgn = find_bgn;
this.find_end = find_end;
this.groups = groups;
}
public boolean Rslt() {return rslt;} private final boolean rslt;
public boolean Rslt_none() {return !rslt;} // NOTE: was "|| find_end - find_bgn == 0"; DATE:2013-04-11; DATE:2014-09-02
public int Find_bgn() {return find_bgn;} int find_bgn;
public int Find_end() {return find_end;} int find_end;
public int Find_bgn() {return find_bgn;} private final int find_bgn;
public int Find_end() {return find_end;} private final int find_end;
public int Find_len() {return find_end - find_bgn;}
public Regx_group[] Groups() {return groups;} Regx_group[] groups = Regx_group.Ary_empty;
public Regx_group[] Groups() {return groups;} private final Regx_group[] groups;
public static final Regx_match[] Ary_empty = new Regx_match[0];
}

View File

@ -23,7 +23,6 @@ public class Scrib_lib_ustring implements Scrib_lib {
public Scrib_lua_mod Mod() {return mod;} private Scrib_lua_mod mod;
public int String_len_max() {return string_len_max;} public Scrib_lib_ustring String_len_max_(int v) {string_len_max = v; return this;} private int string_len_max = Xoa_page_.Page_len_max;
public int Pattern_len_max() {return pattern_len_max;} public Scrib_lib_ustring Pattern_len_max_(int v) {pattern_len_max = v; return this;} private int pattern_len_max = 10000;
private Scrib_regx_converter regx_converter = new Scrib_regx_converter();
public Scrib_lib Init() {procs.Init_by_lib(this, Proc_names); return this;}
public Scrib_lib Clone_lib(Scrib_core core) {return new Scrib_lib_ustring(core);}
public Scrib_lua_mod Register(Scrib_core core, Io_url script_dir) {
@ -74,7 +73,8 @@ public class Scrib_lib_ustring implements Scrib_lib {
: rslt.Init_ary_empty()
;
}
regx = regx_converter.Parse(Bry_.new_u8(regx), Scrib_regx_converter.Anchor_G);
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
regx = regx_converter.patternToRegex(Bry_.new_u8(regx), Scrib_regx_converter.Anchor_G);
Regx_adp regx_adp = Scrib_lib_ustring.RegxAdp_new_(core.Ctx(), regx);
Regx_match[] regx_rslts = regx_adp.Match_all(text_str, bgn_codepoint_idx); // NOTE: MW calculates an offset to handle mb strings. however, java's regex always takes offset in chars (not bytes like PHP preg_match); DATE:2014-03-04
int len = regx_rslts.length;
@ -103,7 +103,8 @@ public class Scrib_lib_ustring implements Scrib_lib {
public boolean Match(Scrib_proc_args args, Scrib_proc_rslt rslt) {
String text = args.Xstr_str_or_null(0); // Module can pass raw ints; PAGE:en.w:Budget_of_the_European_Union; DATE:2015-01-22
if (text == null) return rslt.Init_many_list(List_adp_.Noop); // if no text is passed, do not fail; return empty; EX:d:changed; DATE:2014-02-06
String regx = regx_converter.Parse(args.Cast_bry_or_null(1), Scrib_regx_converter.Anchor_G);
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
String regx = regx_converter.patternToRegex(args.Cast_bry_or_null(1), Scrib_regx_converter.Anchor_G);
int bgn = args.Cast_int_or(2, 1);
bgn = Bgn_adjust(text, bgn);
Regx_adp regx_adp = Scrib_lib_ustring.RegxAdp_new_(core.Ctx(), regx);
@ -112,6 +113,7 @@ public class Scrib_lib_ustring implements Scrib_lib {
if (len == 0) return rslt.Init_null(); // return null if no matches found; EX:w:Mount_Gambier_(volcano); DATE:2014-04-02; confirmed with en.d:; DATE:2015-01-30
// TOMBSTONE: add 1st match only; do not add all; PAGE:en.d:действительное_причастие_настоящего_времени DATE:2017-04-23
regx_rslts = regx_converter.Adjust_balanced(regx_rslts);
List_adp tmp_list = List_adp_.New();
AddCapturesFromMatch(tmp_list, regx_rslts[0], text, regx_converter.Capt_ary(), true);
return rslt.Init_many_list(tmp_list);
@ -122,6 +124,7 @@ public class Scrib_lib_ustring implements Scrib_lib {
public boolean Gsub(Scrib_proc_args args, Scrib_proc_rslt rslt) {
boolean rv = false;
synchronized (gsub_mgr_lock) { // handle recursive gsub calls; PAGE:en.d:כלב; DATE:2016-01-22
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
int new_len = gsub_mgr_len + 1;
if (new_len == gsub_mgr_max) {
this.gsub_mgr_max = new_len == 0 ? 2 : new_len * 2;
@ -143,7 +146,8 @@ public class Scrib_lib_ustring implements Scrib_lib {
public boolean Gmatch_init(Scrib_proc_args args, Scrib_proc_rslt rslt) {
// String text = Scrib_kv_utl_.Val_to_str(values, 0);
byte[] regx = args.Pull_bry(1);
String pcre = regx_converter.Parse(regx, Scrib_regx_converter.Anchor_null);
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
String pcre = regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_null);
return rslt.Init_many_objs(pcre, regx_converter.Capt_ary());
}
public boolean Gmatch_callback(Scrib_proc_args args, Scrib_proc_rslt rslt) {
@ -204,7 +208,7 @@ class Scrib_lib_ustring_gsub_mgr {
String regx = args.Xstr_str_or_null(1); // NOTE: @pattern sometimes int; PAGE:en.d:λύω; DATE:2014-09-02
if (args.Len() == 2) return rslt.Init_obj(text); // if no replace arg, return self; PAGE:en.d:'orse; DATE:2013-10-13
Object repl_obj = args.Cast_obj_or_null(2);
regx = regx_converter.Parse(Bry_.new_u8(regx), Scrib_regx_converter.Anchor_pow);
regx = regx_converter.patternToRegex(Bry_.new_u8(regx), Scrib_regx_converter.Anchor_pow);
int limit = args.Cast_int_or(3, -1);
repl_count = 0;
Identify_repl(repl_obj);
@ -251,6 +255,7 @@ class Scrib_lib_ustring_gsub_mgr {
if ( rslts.length == 0 // PHP: If matches are found, the new subject will be returned, otherwise subject will be returned unchanged.; http://php.net/manual/en/function.preg-replace-callback.php
|| regx_mgr.Pattern_is_invalid() // NOTE: invalid patterns should return self; EX:[^]; DATE:2014-09-02
) return text;
rslts = regx_converter.Adjust_balanced(rslts);
Bry_bfr tmp_bfr = Bry_bfr_.New();
int len = rslts.length;
int pos = 0;
@ -338,10 +343,11 @@ class Scrib_lib_ustring_gsub_mgr {
break;
}
case Repl_tid_luacbk: {
// TOMBSTONE: was causing garbled text on PAGE:en.w:Template:Infobox_kommune DATE:2018-07-02
/*
String find_str = String_.Mid(text, match.Find_bgn(), match.Find_end());
Keyval[] luacbk_args = Scrib_kv_utl_.base1_obj_(find_str);
/*
TOMBSTONE: was causing garbled text on PAGE:en.w:Portal:Bahamas DATE:2018-07-02
*/
Keyval[] luacbk_args = null;
Regx_group[] grps = match.Groups();
int grps_len = grps.length;
@ -357,6 +363,7 @@ class Scrib_lib_ustring_gsub_mgr {
luacbk_args[i] = Keyval_.int_(i + Scrib_core.Base_1, find_str);
}
}
/*
*/
Keyval[] rslts = core.Interpreter().CallFunction(repl_func.Id(), luacbk_args);
if (rslts.length == 0) // will be 0 when gsub_proc returns nil; PAGE:en.d:tracer; DATE:2017-04-22

View File

@ -38,9 +38,15 @@ public class Scrib_lib_ustring__match__tst {
@Test public void Args_out_of_order() {
fxt.Test__proc__kvps__empty(lib, Scrib_lib_ustring.Invk_match, Keyval_.Ary(Keyval_.int_(2, "[a]")));
}
@Test public void Include_trailing_whitespace() { // PURPOSE: match trailing whitespace; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02
@Test public void Balanced__trailing_whitespace() { // PURPOSE: match trailing whitespace; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02
Exec_match("[[a]] b", "%b[]%s*", 1, "[[a]] ");
}
@Test public void Balanced__numbered_1() { // PURPOSE: handle mix of balanced and regular capture; PAGE:en.w:Bahamas
Exec_match("[[5]]X99Y", "%b[]X(%d)%1Y", 1, "9");
}
@Test public void Balanced__numbered_2() {
Exec_match("A88B[[5]]X99Y", "A(%d)%1B%b[]X(%d)%2Y", 1, "8;9");
}
// @Test public void Match_viwiktionary() {
// fxt.Init_cbk(Scrib_core.Key_mw_interface, fxt.Core().Lib_ustring(), Scrib_lib_ustring.Invk_match);
// Exec_match("tr" , "()(r)", 1, ";"); // should return all matches

View File

@ -28,13 +28,13 @@ public class Scrib_lib_ustring__shell_cmd__tst {
@Test public void Gsub_proc_w_grouped() { // PURPOSE: gsub_proc should pass matched String, not entire String; DATE:2013-12-01
fxt.Init_cbk(Scrib_core.Key_mw_interface, fxt.Core().Lib_ustring(), Scrib_lib_ustring.Invk_gsub);
Exec_gsub_regx_func_1("[[a]]", "%[%[([^#|%]]-)%]%]" , "A;1");
fxt.Test_log_rcvd(3, "0000003B00000075{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"[[a]]\"}}"); // should be "[[a]]", not "a"; switched on DATE:2018-07-02
fxt.Test_log_rcvd(3, "000000370000006D{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"a\"}}"); // should be "a", not "[[a]]";
}
@Test public void Gsub_proc_w_grouped_2() {// PURPOSE: gsub_proc failed when passing multiple matches; DATE:2013-12-01
fxt.Init_cbk(Scrib_core.Key_mw_interface, fxt.Core().Lib_ustring(), Scrib_lib_ustring.Invk_gsub);
Exec_gsub_regx_func_2("[[a]] [[b]]", "%[%[([^#|%]]-)%]%]" , "A B;2");
fxt.Test_log_rcvd(3, "0000003B00000075{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"[[a]]\"}}"); // should be "[[a]]", not "a"; switched on DATE:2018-07-02
fxt.Test_log_rcvd(4, "0000003B00000075{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"[[b]]\"}}"); // should be "[[b]]", not "b"; switched on DATE:2018-07-02
fxt.Test_log_rcvd(3, "000000370000006D{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"a\"}}"); // should be "a", not "[[a]]";
fxt.Test_log_rcvd(4, "000000370000006D{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"b\"}}"); // should be "b", not "[[b]]";
}
@Test public void Gsub_int() { // PURPOSE: gsub with integer arg should not fail; DATE:2013-11-06
fxt.Init_cbk(Scrib_core.Key_mw_interface, fxt.Core().Lib_ustring(), Scrib_lib_ustring.Invk_gsub);

View File

@ -17,64 +17,73 @@ package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import
import gplx.core.brys.fmtrs.*;
import gplx.langs.regxs.*;
public class Scrib_regx_converter {
private final List_adp capt_list = List_adp_.New(), grps_parens = List_adp_.New(); private final List_adp grps_open = List_adp_.New();
private final Bry_bfr tmp_bfr = Bry_bfr_.New();
private final Hash_adp_bry percent_hash = Hash_adp_bry.cs(), brack_hash = Hash_adp_bry.cs();
private final Scrib_regx_grp_mgr grp_mgr = new Scrib_regx_grp_mgr();
private final Bry_bfr bfr = Bry_bfr_.New();
private Bry_bfr tmp_bfr;
private Bry_fmtr fmtr_balanced; private Bry_bfr bfr_balanced;
public Scrib_regx_converter() {Init();}
public String Regx() {return regx;} private String regx;
public List_adp Capt_list() {return capt_list;}
public Keyval[] Capt_ary() {return capt_list.Count() == 0 ? null : (Keyval[])capt_list.To_ary(Keyval.class);}
private Bry_fmtr fmtr_balanced; private Bry_bfr bfr_balanced;
public String Parse(byte[] src, byte[] anchor) {
int len = src.length;
public Keyval[] Capt_ary() {return grp_mgr.Capt__to_ary();}
public boolean Any_pos() {return any_pos;} private boolean any_pos;
public Regx_match[] Adjust_balanced(Regx_match[] rslts) {return grp_mgr.Adjust_balanced(rslts);}
public String patternToRegex(byte[] pat, byte[] anchor) {
// TODO.CACHE: if (!$this->patternRegexCache->has($cacheKey))
grp_mgr.Clear();
any_pos = false;
boolean q_flag = false;
capt_list.Clear(); grps_open.Clear(); grps_parens.Clear();
// bfr.Add_byte(Byte_ascii.Slash); // TOMBSTONE: do not add PHP "/" at start
int len = pat.length;
int grps_len = 0;
int bct = 0;
// bfr.Add_byte(Byte_ascii.Slash); // NOTE: do not add PHP "/" at start
for (int i = 0; i < len; i++) {
int i_end = i + 1;
q_flag = false; // must be reset; REF.MW:UstringLibrary.php|patternToRegex; DATE:2014-02-08
byte cur = src[i];
byte cur = pat[i];
switch (cur) {
case Byte_ascii.Pow:
q_flag = i != 0;
bfr.Add(anchor == Anchor_null || q_flag ? Bry_pow_escaped : anchor); // NOTE: must add anchor \G when using offsets; EX:cs.n:Category:1._zárí_2008; DATE:2014-05-07
bfr.Add((anchor == Anchor_null || q_flag) ? Bry_pow_escaped : anchor); // NOTE: must add anchor \G when using offsets; EX:cs.n:Category:1._zárí_2008; DATE:2014-05-07
break;
case Byte_ascii.Dollar:
q_flag = i < len - 1;
bfr.Add(q_flag ? Bry_dollar_escaped : Bry_dollar_literal);
break;
case Byte_ascii.Paren_bgn: {
if (i + 1 >= len) throw Err_.new_wo_type("Unmatched open-paren at pattern character " + Int_.To_str(i));
boolean capt_itm = src[i + 1] == Byte_ascii.Paren_end; // current is "()"
++grps_len;
capt_list.Add(Keyval_.int_(grps_len, capt_itm));
bfr.Add_byte(Byte_ascii.Paren_bgn);
grps_open.Add(grps_len);
grps_parens.Add(i + 1);
if (i + 1 >= len)
throw Err_.new_wo_type("Unmatched open-paren at pattern character " + Int_.To_str(i_end));
int grp_idx = grp_mgr.Capt__len() + 1;
boolean is_empty_capture = pat[i + 1] == Byte_ascii.Paren_end; // current is "()"
if (is_empty_capture)
any_pos = true;
bfr.Add_byte(Byte_ascii.Paren_bgn); // $re .= "(?<m$n>";
grp_mgr.Capt__add__real(grp_idx, is_empty_capture);
break;
}
case Byte_ascii.Paren_end:
if (grps_open.Count() == 0)
throw Err_.new_wo_type("Unmatched close-paren at pattern character " + Int_.To_str(i));
List_adp_.Del_at_last(grps_open);
if (grp_mgr.Open__len() <= 0)
throw Err_.new_wo_type("Unmatched close-paren at pattern character " + Int_.To_str(i_end));
grp_mgr.Open__pop();
bfr.Add_byte(Byte_ascii.Paren_end);
break;
case Byte_ascii.Percent:
++i;
if (i >= len) throw Err_.new_wo_type("malformed pattern (ends with '%')");
Object percent_obj = percent_hash.Get_by_mid(src, i, i + 1);
i++;
if (i >= len)
throw Err_.new_wo_type("malformed pattern (ends with '%')");
Object percent_obj = percent_hash.Get_by_mid(pat, i, i + 1);
if (percent_obj != null) {
bfr.Add((byte[])percent_obj);
q_flag = true;
}
else {
byte nxt = src[i];
byte nxt = pat[i];
switch (nxt) {
case Byte_ascii.Ltr_b: // EX: "%b()"
i += 2;
if (i >= len) throw Err_.new_wo_type("malformed pattern (missing arguments to '%b')");
byte char_0 = src[i - 1];
byte char_1 = src[i];
byte char_0 = pat[i - 1];
byte char_1 = pat[i];
if (char_0 == char_1) { // same char: easier regex; REF.MW: $bfr .= "{$d1}[^$d1]*$d1";
bfr.Add(Bry_bf0_seg_0);
Regx_quote(bfr, char_0);
@ -85,40 +94,46 @@ public class Scrib_regx_converter {
}
else { // diff char: harder regex; REF.MW: $bfr .= "(?<b$bct>$d1(?:(?>[^$d1$d2]+)|(?P>b$bct))*$d2)";
if (fmtr_balanced == null) {
// JAVA:recursive regex not possible, but workaround is possible PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02
// JAVA:recursive regex not possible, so need complicated regex
// REF: https://stackoverflow.com/questions/47162098/is-it-possible-to-match-nested-brackets-with-regex-without-using-recursion-or-ba/47162099#47162099
fmtr_balanced = Bry_fmtr.new_("(?=\\~{1})(?:(?=.*?\\~{1}(?!.*?\\1)(.*\\~{2}(?!.*\\2).*))(?=.*?\\~{2}(?!.*?\\2)(.*)).)+?.*?(?=\\1)[^\\~{1}]*(?=\\2$)", "0", "1", "2");
// PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02
fmtr_balanced = Bry_fmtr.new_("(?=\\~{1})(?:(?=.*?\\~{1}(?!.*?\\~{3})(.*\\~{2}(?!.*\\~{4}).*))(?=.*?\\~{2}(?!.*?\\~{4})(.*)).)+?.*?(?=\\~{3})[^\\~{1}]*(?=\\~{4}$)", "unused", "1", "2", "3", "4");
bfr_balanced = Bry_bfr_.Reset(255);
}
synchronized (fmtr_balanced) {
++bct;
fmtr_balanced.Bld_bfr(bfr_balanced, Int_.To_bry(bct), Byte_.Ary(char_0), Byte_.Ary(char_1));
int balanced_idx = grp_mgr.Full__len();
fmtr_balanced.Bld_bfr(bfr_balanced, Int_.To_bry(bct), Byte_.Ary(char_0), Byte_.Ary(char_1), Int_.To_bry(balanced_idx + 1), Int_.To_bry(balanced_idx + 2));
grp_mgr.Capt__add__fake(2);
bfr.Add(bfr_balanced.To_bry_and_clear());
}
}
break;
case Byte_ascii.Ltr_f: { // EX: lua frontier pattern; "%f[%a]"; DATE:2015-07-21
++i;
if (i + 1 >= len || src[i] != Byte_ascii.Brack_bgn) throw Err_.new_("scribunto", "missing '[' after %f in pattern at pattern character $ii");
if (i + 1 >= len || pat[++i] != Byte_ascii.Brack_bgn)
throw Err_.new_("scribunto", "missing '[' after %f in pattern at pattern character " + Int_.To_str(i_end));
// %f always followed by bracketed term; convert lua bracketed term to regex
i = bracketedCharSetToRegex(tmp_bfr, src, i, len);
byte[] bracketed_regx = tmp_bfr.To_bry_and_clear();
if (tmp_bfr == null) tmp_bfr = Bry_bfr_.New();
i = bracketedCharSetToRegex(tmp_bfr, pat, i, len);
byte[] re2 = tmp_bfr.To_bry_and_clear();
// scrib has following comment: 'Because %f considers the beginning and end of the String to be \0, determine if $re2 matches that and take it into account with "^" and "$".'
// if the bracketed_regx is a negative class it will match \0; so, \W means anything not a word char, which will match \0; \w means word char which will not match \0
if (Regx_adp_.Match("\0", String_.new_u8(bracketed_regx)))
bfr.Add_str_a7("(?<!^)(?<!").Add(bracketed_regx).Add_str_a7(")(?=").Add(bracketed_regx).Add_str_a7("|$)"); // match bgn / end of String
// if the re2 is a negative class it will match \0; so, \W means anything not a word char, which will match \0; \w means word char which will not match \0
if (Regx_adp_.Match("\0", String_.new_u8(re2)))
bfr.Add_str_a7("(?<!^)(?<!").Add(re2).Add_str_a7(")(?=").Add(re2).Add_str_a7("|$)"); // match bgn / end of String
else
bfr .Add_str_a7("(?<!").Add(bracketed_regx).Add_str_a7(")(?=").Add(bracketed_regx).Add_str_a7( ")");
bfr.Add_str_a7("(?<!" ).Add(re2).Add_str_a7(")(?=").Add(re2).Add_str_a7( ")");
break;
}
case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4:
case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9:
grps_len = nxt - Byte_ascii.Num_0;
if (grps_len == 0 || grps_len > capt_list.Count() || grps_open_Has(grps_open, grps_len))
case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9: {
int grp_idx = nxt - Byte_ascii.Num_0;
if (grp_idx == 0 || grp_idx > grp_mgr.Capt__len() || grp_mgr.Open__has(grp_idx))
throw Err_.new_wo_type("invalid capture index %" + grps_len + " at pattern character " + Int_.To_str(i));
bfr.Add(Bry_bf2_seg_0).Add_int_variable(grps_len);//.Add(Bry_bf2_seg_1); // $bfr .= "\\g{m$grps_len}";
bfr.Add(Bry_bf2_seg_0);
grp_mgr.Idx__add(bfr, grp_idx);
break;
}
default:
Regx_quote(bfr, nxt);
q_flag = true;
@ -127,21 +142,22 @@ public class Scrib_regx_converter {
}
break;
case Byte_ascii.Brack_bgn:
i = bracketedCharSetToRegex(bfr, src, i, len);
i = bracketedCharSetToRegex(bfr, pat, i, len);
q_flag = true;
break;
case Byte_ascii.Brack_end: throw Err_.new_wo_type("Unmatched close-bracket at pattern character " + Int_.To_str(i));
case Byte_ascii.Brack_end:
throw Err_.new_wo_type("Unmatched close-bracket at pattern character " + Int_.To_str(i_end));
case Byte_ascii.Dot:
q_flag = true;
bfr.Add_byte(Byte_ascii.Dot);
q_flag = true;
break;
default:
q_flag = true;
Regx_quote(bfr, cur);
q_flag = true;
break;
}
if (q_flag && i + 1 < len) {
byte tmp_b = src[i + 1];
byte tmp_b = pat[i + 1];
switch (tmp_b) {
case Byte_ascii.Star:
case Byte_ascii.Plus:
@ -150,17 +166,18 @@ public class Scrib_regx_converter {
++i;
break;
case Byte_ascii.Dash:
bfr.Add(Bry_regx_dash);
++i;
bfr.Add(Bry_star_question);
i++;
break;
}
}
}
if (grps_open.Count() > 0) throw Err_.new_wo_type("Unclosed capture beginning at pattern character " + Int_.Cast(grps_open.Get_at(0)));
// bfr.Add(Bry_regx_end); // NOTE: do not add PHP /us at end; u=PCRE_UTF8 which is not needed for Java; s=PCRE_DOTALL which will be specified elsewhere
if (grp_mgr.Open__len() > 0)
throw Err_.new_wo_type("Unclosed capture beginning at pattern character " + grp_mgr.Open__get_at(0));
// bfr.Add(Bry_regx_end); // TOMBSTONE: do not add PHP /us at end; u=PCRE_UTF8 which is not needed for Java; s=PCRE_DOTALL which will be specified elsewhere
regx = bfr.To_str_and_clear();
return regx;
} private Bry_bfr bfr = Bry_bfr_.New();
}
private int bracketedCharSetToRegex(Bry_bfr bfr, byte[] pat, int i, int len) {
bfr.Add_byte(Byte_ascii.Brack_bgn);
i++;
@ -210,14 +227,6 @@ public class Scrib_regx_converter {
}
return i;
}
boolean grps_open_Has(List_adp list, int v) {
int len = list.Count();
for (int i = 0; i < len; i++) {
Object o = list.Get_at(i);
if (Int_.Cast(o) == v) return true;
}
return false;
}
private void Regx_quote(Bry_bfr bfr, byte b) {
if (Regx_char(b)) bfr.Add_byte(Byte_ascii.Backslash);
bfr.Add_byte(b);
@ -239,32 +248,34 @@ public class Scrib_regx_converter {
, Bry_dollar_literal = Bry_.new_a7("$"), Bry_dollar_escaped = Bry_.new_a7("\\$")
, Bry_bf0_seg_0 = Bry_.new_a7("{"), Bry_bf0_seg_1 = Bry_.new_a7("}[^"), Bry_bf0_seg_2 = Bry_.new_a7("]*")
, Bry_bf2_seg_0 = Bry_.new_a7("\\")//, Bry_bf2_seg_1 = Bry_.new_a7("")
, Bry_regx_dash = Bry_.new_a7("*?") // was *?
, Bry_star_question = Bry_.new_a7("*?") // was *?
;
public static final byte[] Anchor_null = null, Anchor_G = Bry_.new_a7("\\G"), Anchor_pow = Bry_.new_a7("^");
private void Init() {
String regx_w = "\\w"; // JRE.7: \w not support in JRE.6; PAGE:en.w:A_(musical_note) DATE:2015-06-10
String regx_W = "\\W"; // JRE.7: \w not support in JRE.6; PAGE:en.w:A_(musical_note) DATE:2015-06-10
Init_itm(Bool_.Y, "d", "\\p{Nd}");
Init_itm(Bool_.Y, "l", "\\p{Ll}");
Init_itm(Bool_.Y, "u", "\\p{Lu}");
Init_itm(Bool_.Y, "a", "\\p{L}");
Init_itm(Bool_.Y, "c", "\\p{Cc}");
Init_itm(Bool_.Y, "d", "\\p{Nd}");
Init_itm(Bool_.Y, "l", "\\p{Ll}");
Init_itm(Bool_.Y, "p", "\\p{P}");
Init_itm(Bool_.Y, "s", "\\s");
Init_itm(Bool_.Y, "s", "\\s"); // JAVA: \p{Xps} not valid; REF: https://docs.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html
Init_itm(Bool_.Y, "u", "\\p{Lu}");
Init_itm(Bool_.Y, "w", regx_w);
Init_itm(Bool_.Y, "x", "[0-9A-Fa-f0-9A-Fa-f]");
Init_itm(Bool_.Y, "z", "\\x00");
Init_itm(Bool_.Y, "D", "\\P{Nd}");
Init_itm(Bool_.Y, "L", "\\P{Ll}");
Init_itm(Bool_.Y, "U", "\\P{Lu}");
Init_itm(Bool_.Y, "A", "\\P{L}");
Init_itm(Bool_.Y, "C", "\\P{Cc}");
Init_itm(Bool_.Y, "D", "\\P{Nd}");
Init_itm(Bool_.Y, "L", "\\P{Ll}");
Init_itm(Bool_.Y, "P", "\\P{P}");
Init_itm(Bool_.Y, "S", "\\S"); // JAVA: \P{Xps} not valid
Init_itm(Bool_.Y, "U", "\\P{Lu}");
Init_itm(Bool_.Y, "W", regx_W);
Init_itm(Bool_.Y, "X", "[^0-9A-Fa-f0-9A-Fa-f]");
Init_itm(Bool_.Y, "Z", "[^\\x00]");
Init_itm(Bool_.N, "w", regx_w);
Init_itm(Bool_.N, "x", "0-9A-Fa-f0-9A-Fa-f");
Init_itm(Bool_.N, "W", regx_W);
@ -282,5 +293,4 @@ public class Scrib_regx_converter {
brack_hash.Add_if_dupe_use_nth(lua_bry, php_bry); // replace percent_hash definitions
}
}
private final Hash_adp_bry percent_hash = Hash_adp_bry.cs(), brack_hash = Hash_adp_bry.cs();
}

View File

@ -61,11 +61,11 @@ class Scrib_regx_converter_fxt {
}
}
public void Test_parse(String raw, String expd) {
under.Parse(Bry_.new_u8(raw), Scrib_regx_converter.Anchor_G);
under.patternToRegex(Bry_.new_u8(raw), Scrib_regx_converter.Anchor_G);
Tfds.Eq(expd, under.Regx());
}
public void Test_replace(String text, String find, String replace, String expd) {
String regex_str = under.Parse(Bry_.new_u8(find), Scrib_regx_converter.Anchor_G);
String regex_str = under.patternToRegex(Bry_.new_u8(find), Scrib_regx_converter.Anchor_G);
String actl = Regx_adp_.Replace(text, regex_str, replace);
Tfds.Eq(expd, actl);
}

View File

@ -0,0 +1,90 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*;
import gplx.langs.regxs.*;
// PURPOSE: bookkeeping for the capturing groups registered while a pattern is converted to a regex.
// Two kinds of groups are tracked:
//   "real" groups, added through Capt__add__real
//   "fake" groups, added through Capt__add__fake; these occupy a slot in the generated regex
//   but are dropped again from match results by Adjust_balanced
class Scrib_regx_grp_mgr {
	// real groups only: one Keyval per group; key is grp_idx, val is the is_empty_capture flag
	private final List_adp capt_list = List_adp_.New();
	// real and fake groups together, in registration order; each entry is a Scrib_regx_grp_itm
	private final List_adp full_list = List_adp_.New();
	// grp_idx of every real group that was added but not yet popped
	private final List_adp open_list = List_adp_.New();
	// maps grp_idx of a real group to the length of full_list measured right after the group was appended
	private final Hash_adp idx_list = Hash_adp_.New();
	// running total of groups added through Capt__add__fake
	private int fake_count;

	// resets all lists and counters so the instance can be reused for another pattern
	public void Clear() {
		fake_count = 0;
		idx_list.Clear();
		full_list.Clear();
		capt_list.Clear();
		open_list.Clear();
	}

	// number of registered groups, real and fake
	public int Full__len() {
		return full_list.Len();
	}

	// number of real groups added and not yet popped
	public int Open__len() {
		return open_list.Len();
	}

	// grp_idx stored at position idx of the open list
	public int Open__get_at(int idx) {
		Object grp_idx_obj = open_list.Get_at(idx);
		return Int_.Cast(grp_idx_obj);
	}

	// removes the most recently added entry of the open list
	public void Open__pop() {
		List_adp_.Del_at_last(open_list);
	}

	// true if grp_idx v is still in the open list
	public boolean Open__has(int v) {
		for (int pos = 0, open_len = open_list.Count(); pos < open_len; pos++) {
			if (Int_.Cast(open_list.Get_at(pos)) == v)
				return true;
		}
		return false;
	}

	// number of real groups
	public int Capt__len() {
		return capt_list.Count();
	}

	// real groups as a Keyval array; null (not an empty array) when there are none
	public Keyval[] Capt__to_ary() {
		if (capt_list.Count() == 0)
			return null;
		return (Keyval[])capt_list.To_ary(Keyval.class);
	}

	// registers a real group and marks it as open
	public void Capt__add__real(int grp_idx, boolean is_empty_capture) {
		open_list.Add(grp_idx);
		capt_list.Add(Keyval_.int_(grp_idx, is_empty_capture));

		// NOTE: the itm is given the length of full_list BEFORE it is appended, whereas
		// idx_list is given the length AFTER the append; keep these two statements in this order
		int len_before_add = full_list.Len();
		full_list.Add(new Scrib_regx_grp_itm(Bool_.N, is_empty_capture, len_before_add));
		idx_list.Add(grp_idx, full_list.Len());
	}

	// registers "count" fake groups; they are appended to full_list only (not capt_list / open_list / idx_list)
	public void Capt__add__fake(int count) {
		for (int remaining = count; remaining > 0; remaining--) {
			full_list.Add(new Scrib_regx_grp_itm(Bool_.Y, Bool_.N, full_list.Len()));
		}
		fake_count += count;
	}

	// writes to bfr the value that idx_list holds for real group regx_idx
	// NOTE: regx_idx must have been registered through Capt__add__real; there is no fallback for an unknown key
	public void Idx__add(Bry_bfr bfr, int regx_idx) {
		bfr.Add_int_variable(Int_.Cast(idx_list.Get_by(regx_idx)));
	}

	// returns matches whose group arrays no longer contain the fake groups
	// if no fake group was registered, the incoming array is returned as-is (same instance)
	public Regx_match[] Adjust_balanced(Regx_match[] matches) {
		if (fake_count == 0)
			return matches;

		int total = matches.length;
		Regx_match[] adjusted = new Regx_match[total];
		for (int match_pos = 0; match_pos < total; match_pos++) {
			Regx_match cur = matches[match_pos];
			Regx_group[] real_groups = Groups__without_fake(cur.Groups());
			adjusted[match_pos] = new Regx_match(cur.Rslt(), cur.Find_bgn(), cur.Find_end(), real_groups);
		}
		return adjusted;
	}

	// copies every group whose slot in full_list is not fake; relative order is kept
	// the result is always sized (registered groups - fake groups), regardless of all_groups.length
	private Regx_group[] Groups__without_fake(Regx_group[] all_groups) {
		Regx_group[] real_groups = new Regx_group[full_list.Len() - fake_count];
		int real_pos = 0;
		int all_len = all_groups.length;
		for (int slot = 0; slot < all_len; slot++) {
			Scrib_regx_grp_itm itm = (Scrib_regx_grp_itm)full_list.Get_at(slot);
			if (!itm.Is_fake()) {
				real_groups[real_pos] = all_groups[slot];
				real_pos++;
			}
		}
		return real_groups;
	}
}
// PURPOSE: immutable description of one registered group; all three values are fixed at construction
class Scrib_regx_grp_itm {
	private final boolean is_fake;
	private final boolean is_empty_capture;
	private final int idx;

	public Scrib_regx_grp_itm(boolean is_fake, boolean is_empty_capture, int idx) {
		this.idx = idx;
		this.is_empty_capture = is_empty_capture;
		this.is_fake = is_fake;
	}

	// the is_fake flag passed to the constructor
	public boolean Is_fake() {
		return is_fake;
	}

	// the is_empty_capture flag passed to the constructor
	public boolean Is_empty_capture() {
		return is_empty_capture;
	}

	// the idx value passed to the constructor
	public int Idx() {
		return idx;
	}
}