mirror of
https://github.com/gnosygnu/xowa.git
synced 2024-10-27 20:34:16 +00:00
Scribunto: Handle fake capturing groups from balanced regex
This commit is contained in:
parent
f85cf8ad77
commit
d98639c18a
@ -14,13 +14,19 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.langs.regxs; import gplx.*; import gplx.langs.*;
|
||||
public class Regx_match {
|
||||
public Regx_match(boolean rslt, int find_bgn, int find_end, Regx_group[] groups) {this.rslt = rslt; this.find_bgn = find_bgn; this.find_end = find_end; this.groups = groups;}
|
||||
public boolean Rslt() {return rslt;} private boolean rslt;
|
||||
public class Regx_match {
|
||||
public Regx_match(boolean rslt, int find_bgn, int find_end, Regx_group[] groups) {
|
||||
this.rslt = rslt;
|
||||
this.find_bgn = find_bgn;
|
||||
this.find_end = find_end;
|
||||
this.groups = groups;
|
||||
}
|
||||
public boolean Rslt() {return rslt;} private final boolean rslt;
|
||||
public boolean Rslt_none() {return !rslt;} // NOTE: was "|| find_end - find_bgn == 0"; DATE:2013-04-11; DATE:2014-09-02
|
||||
public int Find_bgn() {return find_bgn;} int find_bgn;
|
||||
public int Find_end() {return find_end;} int find_end;
|
||||
public int Find_bgn() {return find_bgn;} private final int find_bgn;
|
||||
public int Find_end() {return find_end;} private final int find_end;
|
||||
public int Find_len() {return find_end - find_bgn;}
|
||||
public Regx_group[] Groups() {return groups;} Regx_group[] groups = Regx_group.Ary_empty;
|
||||
public Regx_group[] Groups() {return groups;} private final Regx_group[] groups;
|
||||
|
||||
public static final Regx_match[] Ary_empty = new Regx_match[0];
|
||||
}
|
||||
|
@ -23,7 +23,6 @@ public class Scrib_lib_ustring implements Scrib_lib {
|
||||
public Scrib_lua_mod Mod() {return mod;} private Scrib_lua_mod mod;
|
||||
public int String_len_max() {return string_len_max;} public Scrib_lib_ustring String_len_max_(int v) {string_len_max = v; return this;} private int string_len_max = Xoa_page_.Page_len_max;
|
||||
public int Pattern_len_max() {return pattern_len_max;} public Scrib_lib_ustring Pattern_len_max_(int v) {pattern_len_max = v; return this;} private int pattern_len_max = 10000;
|
||||
private Scrib_regx_converter regx_converter = new Scrib_regx_converter();
|
||||
public Scrib_lib Init() {procs.Init_by_lib(this, Proc_names); return this;}
|
||||
public Scrib_lib Clone_lib(Scrib_core core) {return new Scrib_lib_ustring(core);}
|
||||
public Scrib_lua_mod Register(Scrib_core core, Io_url script_dir) {
|
||||
@ -74,7 +73,8 @@ public class Scrib_lib_ustring implements Scrib_lib {
|
||||
: rslt.Init_ary_empty()
|
||||
;
|
||||
}
|
||||
regx = regx_converter.Parse(Bry_.new_u8(regx), Scrib_regx_converter.Anchor_G);
|
||||
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
|
||||
regx = regx_converter.patternToRegex(Bry_.new_u8(regx), Scrib_regx_converter.Anchor_G);
|
||||
Regx_adp regx_adp = Scrib_lib_ustring.RegxAdp_new_(core.Ctx(), regx);
|
||||
Regx_match[] regx_rslts = regx_adp.Match_all(text_str, bgn_codepoint_idx); // NOTE: MW calculates an offset to handle mb strings. however, java's regex always takes offset in chars (not bytes like PHP preg_match); DATE:2014-03-04
|
||||
int len = regx_rslts.length;
|
||||
@ -103,7 +103,8 @@ public class Scrib_lib_ustring implements Scrib_lib {
|
||||
public boolean Match(Scrib_proc_args args, Scrib_proc_rslt rslt) {
|
||||
String text = args.Xstr_str_or_null(0); // Module can pass raw ints; PAGE:en.w:Budget_of_the_European_Union; DATE:2015-01-22
|
||||
if (text == null) return rslt.Init_many_list(List_adp_.Noop); // if no text is passed, do not fail; return empty; EX:d:changed; DATE:2014-02-06
|
||||
String regx = regx_converter.Parse(args.Cast_bry_or_null(1), Scrib_regx_converter.Anchor_G);
|
||||
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
|
||||
String regx = regx_converter.patternToRegex(args.Cast_bry_or_null(1), Scrib_regx_converter.Anchor_G);
|
||||
int bgn = args.Cast_int_or(2, 1);
|
||||
bgn = Bgn_adjust(text, bgn);
|
||||
Regx_adp regx_adp = Scrib_lib_ustring.RegxAdp_new_(core.Ctx(), regx);
|
||||
@ -112,6 +113,7 @@ public class Scrib_lib_ustring implements Scrib_lib {
|
||||
if (len == 0) return rslt.Init_null(); // return null if no matches found; EX:w:Mount_Gambier_(volcano); DATE:2014-04-02; confirmed with en.d:民; DATE:2015-01-30
|
||||
|
||||
// TOMBSTONE: add 1st match only; do not add all; PAGE:en.d:действительное_причастие_настоящего_времени DATE:2017-04-23
|
||||
regx_rslts = regx_converter.Adjust_balanced(regx_rslts);
|
||||
List_adp tmp_list = List_adp_.New();
|
||||
AddCapturesFromMatch(tmp_list, regx_rslts[0], text, regx_converter.Capt_ary(), true);
|
||||
return rslt.Init_many_list(tmp_list);
|
||||
@ -122,6 +124,7 @@ public class Scrib_lib_ustring implements Scrib_lib {
|
||||
public boolean Gsub(Scrib_proc_args args, Scrib_proc_rslt rslt) {
|
||||
boolean rv = false;
|
||||
synchronized (gsub_mgr_lock) { // handle recursive gsub calls; PAGE:en.d:כלב; DATE:2016-01-22
|
||||
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
|
||||
int new_len = gsub_mgr_len + 1;
|
||||
if (new_len == gsub_mgr_max) {
|
||||
this.gsub_mgr_max = new_len == 0 ? 2 : new_len * 2;
|
||||
@ -143,7 +146,8 @@ public class Scrib_lib_ustring implements Scrib_lib {
|
||||
public boolean Gmatch_init(Scrib_proc_args args, Scrib_proc_rslt rslt) {
|
||||
// String text = Scrib_kv_utl_.Val_to_str(values, 0);
|
||||
byte[] regx = args.Pull_bry(1);
|
||||
String pcre = regx_converter.Parse(regx, Scrib_regx_converter.Anchor_null);
|
||||
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
|
||||
String pcre = regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_null);
|
||||
return rslt.Init_many_objs(pcre, regx_converter.Capt_ary());
|
||||
}
|
||||
public boolean Gmatch_callback(Scrib_proc_args args, Scrib_proc_rslt rslt) {
|
||||
@ -204,7 +208,7 @@ class Scrib_lib_ustring_gsub_mgr {
|
||||
String regx = args.Xstr_str_or_null(1); // NOTE: @pattern sometimes int; PAGE:en.d:λύω; DATE:2014-09-02
|
||||
if (args.Len() == 2) return rslt.Init_obj(text); // if no replace arg, return self; PAGE:en.d:'orse; DATE:2013-10-13
|
||||
Object repl_obj = args.Cast_obj_or_null(2);
|
||||
regx = regx_converter.Parse(Bry_.new_u8(regx), Scrib_regx_converter.Anchor_pow);
|
||||
regx = regx_converter.patternToRegex(Bry_.new_u8(regx), Scrib_regx_converter.Anchor_pow);
|
||||
int limit = args.Cast_int_or(3, -1);
|
||||
repl_count = 0;
|
||||
Identify_repl(repl_obj);
|
||||
@ -251,6 +255,7 @@ class Scrib_lib_ustring_gsub_mgr {
|
||||
if ( rslts.length == 0 // PHP: If matches are found, the new subject will be returned, otherwise subject will be returned unchanged.; http://php.net/manual/en/function.preg-replace-callback.php
|
||||
|| regx_mgr.Pattern_is_invalid() // NOTE: invalid patterns should return self; EX:[^]; DATE:2014-09-02
|
||||
) return text;
|
||||
rslts = regx_converter.Adjust_balanced(rslts);
|
||||
Bry_bfr tmp_bfr = Bry_bfr_.New();
|
||||
int len = rslts.length;
|
||||
int pos = 0;
|
||||
@ -338,10 +343,11 @@ class Scrib_lib_ustring_gsub_mgr {
|
||||
break;
|
||||
}
|
||||
case Repl_tid_luacbk: {
|
||||
// TOMBSTONE: was causing garbled text on PAGE:en.w:Template:Infobox_kommune DATE:2018-07-02
|
||||
/*
|
||||
String find_str = String_.Mid(text, match.Find_bgn(), match.Find_end());
|
||||
Keyval[] luacbk_args = Scrib_kv_utl_.base1_obj_(find_str);
|
||||
/*
|
||||
TOMBSTONE: was causing garbled text on PAGE:en.w:Portal:Bahamas DATE:2018-07-02
|
||||
*/
|
||||
Keyval[] luacbk_args = null;
|
||||
Regx_group[] grps = match.Groups();
|
||||
int grps_len = grps.length;
|
||||
@ -357,6 +363,7 @@ class Scrib_lib_ustring_gsub_mgr {
|
||||
luacbk_args[i] = Keyval_.int_(i + Scrib_core.Base_1, find_str);
|
||||
}
|
||||
}
|
||||
/*
|
||||
*/
|
||||
Keyval[] rslts = core.Interpreter().CallFunction(repl_func.Id(), luacbk_args);
|
||||
if (rslts.length == 0) // will be 0 when gsub_proc returns nil; PAGE:en.d:tracer; DATE:2017-04-22
|
||||
|
@ -38,9 +38,15 @@ public class Scrib_lib_ustring__match__tst {
|
||||
@Test public void Args_out_of_order() {
|
||||
fxt.Test__proc__kvps__empty(lib, Scrib_lib_ustring.Invk_match, Keyval_.Ary(Keyval_.int_(2, "[a]")));
|
||||
}
|
||||
@Test public void Include_trailing_whitespace() { // PURPOSE: match trailing whitespace; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02
|
||||
@Test public void Balanced__trailing_whitespace() { // PURPOSE: match trailing whitespace; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02
|
||||
Exec_match("[[a]] b", "%b[]%s*", 1, "[[a]] ");
|
||||
}
|
||||
@Test public void Balanced__numbered_1() { // PURPOSE: handle mix of balanced and regular capture; PAGE:en.w:Bahamas
|
||||
Exec_match("[[5]]X99Y", "%b[]X(%d)%1Y", 1, "9");
|
||||
}
|
||||
@Test public void Balanced__numbered_2() {
|
||||
Exec_match("A88B[[5]]X99Y", "A(%d)%1B%b[]X(%d)%2Y", 1, "8;9");
|
||||
}
|
||||
// @Test public void Match_viwiktionary() {
|
||||
// fxt.Init_cbk(Scrib_core.Key_mw_interface, fxt.Core().Lib_ustring(), Scrib_lib_ustring.Invk_match);
|
||||
// Exec_match("tr" , "()(r)", 1, ";"); // should return all matches
|
||||
|
@ -28,13 +28,13 @@ public class Scrib_lib_ustring__shell_cmd__tst {
|
||||
@Test public void Gsub_proc_w_grouped() { // PURPOSE: gsub_proc should pass matched String, not entire String; DATE:2013-12-01
|
||||
fxt.Init_cbk(Scrib_core.Key_mw_interface, fxt.Core().Lib_ustring(), Scrib_lib_ustring.Invk_gsub);
|
||||
Exec_gsub_regx_func_1("[[a]]", "%[%[([^#|%]]-)%]%]" , "A;1");
|
||||
fxt.Test_log_rcvd(3, "0000003B00000075{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"[[a]]\"}}"); // should be "[[a]]", not "a"; switched on DATE:2018-07-02
|
||||
fxt.Test_log_rcvd(3, "000000370000006D{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"a\"}}"); // should be "a", not "[[a]]";
|
||||
}
|
||||
@Test public void Gsub_proc_w_grouped_2() {// PURPOSE: gsub_proc failed when passing multiple matches; DATE:2013-12-01
|
||||
fxt.Init_cbk(Scrib_core.Key_mw_interface, fxt.Core().Lib_ustring(), Scrib_lib_ustring.Invk_gsub);
|
||||
Exec_gsub_regx_func_2("[[a]] [[b]]", "%[%[([^#|%]]-)%]%]" , "A B;2");
|
||||
fxt.Test_log_rcvd(3, "0000003B00000075{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"[[a]]\"}}"); // should be "[[a]]", not "a"; switched on DATE:2018-07-02
|
||||
fxt.Test_log_rcvd(4, "0000003B00000075{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"[[b]]\"}}"); // should be "[[b]]", not "b"; switched on DATE:2018-07-02
|
||||
fxt.Test_log_rcvd(3, "000000370000006D{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"a\"}}"); // should be "a", not "[[a]]";
|
||||
fxt.Test_log_rcvd(4, "000000370000006D{[\"op\"]=\"call\",[\"id\"]=1,[\"nargs\"]=1,[\"args\"]={[1]=\"b\"}}"); // should be "b", not "[[b]]";
|
||||
}
|
||||
@Test public void Gsub_int() { // PURPOSE: gsub with integer arg should not fail; DATE:2013-11-06
|
||||
fxt.Init_cbk(Scrib_core.Key_mw_interface, fxt.Core().Lib_ustring(), Scrib_lib_ustring.Invk_gsub);
|
||||
|
@ -17,64 +17,73 @@ package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import
|
||||
import gplx.core.brys.fmtrs.*;
|
||||
import gplx.langs.regxs.*;
|
||||
public class Scrib_regx_converter {
|
||||
private final List_adp capt_list = List_adp_.New(), grps_parens = List_adp_.New(); private final List_adp grps_open = List_adp_.New();
|
||||
private final Bry_bfr tmp_bfr = Bry_bfr_.New();
|
||||
private final Hash_adp_bry percent_hash = Hash_adp_bry.cs(), brack_hash = Hash_adp_bry.cs();
|
||||
private final Scrib_regx_grp_mgr grp_mgr = new Scrib_regx_grp_mgr();
|
||||
private final Bry_bfr bfr = Bry_bfr_.New();
|
||||
private Bry_bfr tmp_bfr;
|
||||
private Bry_fmtr fmtr_balanced; private Bry_bfr bfr_balanced;
|
||||
public Scrib_regx_converter() {Init();}
|
||||
public String Regx() {return regx;} private String regx;
|
||||
public List_adp Capt_list() {return capt_list;}
|
||||
public Keyval[] Capt_ary() {return capt_list.Count() == 0 ? null : (Keyval[])capt_list.To_ary(Keyval.class);}
|
||||
private Bry_fmtr fmtr_balanced; private Bry_bfr bfr_balanced;
|
||||
public String Parse(byte[] src, byte[] anchor) {
|
||||
int len = src.length;
|
||||
public Keyval[] Capt_ary() {return grp_mgr.Capt__to_ary();}
|
||||
public boolean Any_pos() {return any_pos;} private boolean any_pos;
|
||||
public Regx_match[] Adjust_balanced(Regx_match[] rslts) {return grp_mgr.Adjust_balanced(rslts);}
|
||||
public String patternToRegex(byte[] pat, byte[] anchor) {
|
||||
// TODO.CACHE: if (!$this->patternRegexCache->has($cacheKey))
|
||||
grp_mgr.Clear();
|
||||
any_pos = false;
|
||||
boolean q_flag = false;
|
||||
capt_list.Clear(); grps_open.Clear(); grps_parens.Clear();
|
||||
|
||||
// bfr.Add_byte(Byte_ascii.Slash); // TOMBSTONE: do not add PHP "/" at start
|
||||
int len = pat.length;
|
||||
int grps_len = 0;
|
||||
int bct = 0;
|
||||
// bfr.Add_byte(Byte_ascii.Slash); // NOTE: do not add PHP "/" at start
|
||||
for (int i = 0; i < len; i++) {
|
||||
int i_end = i + 1;
|
||||
q_flag = false; // must be reset; REF.MW:UstringLibrary.php|patternToRegex; DATE:2014-02-08
|
||||
byte cur = src[i];
|
||||
byte cur = pat[i];
|
||||
switch (cur) {
|
||||
case Byte_ascii.Pow:
|
||||
q_flag = i != 0;
|
||||
bfr.Add(anchor == Anchor_null || q_flag ? Bry_pow_escaped : anchor); // NOTE: must add anchor \G when using offsets; EX:cs.n:Category:1._zárí_2008; DATE:2014-05-07
|
||||
bfr.Add((anchor == Anchor_null || q_flag) ? Bry_pow_escaped : anchor); // NOTE: must add anchor \G when using offsets; EX:cs.n:Category:1._zárí_2008; DATE:2014-05-07
|
||||
break;
|
||||
case Byte_ascii.Dollar:
|
||||
q_flag = i < len - 1;
|
||||
bfr.Add(q_flag ? Bry_dollar_escaped : Bry_dollar_literal);
|
||||
break;
|
||||
case Byte_ascii.Paren_bgn: {
|
||||
if (i + 1 >= len) throw Err_.new_wo_type("Unmatched open-paren at pattern character " + Int_.To_str(i));
|
||||
boolean capt_itm = src[i + 1] == Byte_ascii.Paren_end; // current is "()"
|
||||
++grps_len;
|
||||
capt_list.Add(Keyval_.int_(grps_len, capt_itm));
|
||||
bfr.Add_byte(Byte_ascii.Paren_bgn);
|
||||
grps_open.Add(grps_len);
|
||||
grps_parens.Add(i + 1);
|
||||
if (i + 1 >= len)
|
||||
throw Err_.new_wo_type("Unmatched open-paren at pattern character " + Int_.To_str(i_end));
|
||||
int grp_idx = grp_mgr.Capt__len() + 1;
|
||||
boolean is_empty_capture = pat[i + 1] == Byte_ascii.Paren_end; // current is "()"
|
||||
if (is_empty_capture)
|
||||
any_pos = true;
|
||||
bfr.Add_byte(Byte_ascii.Paren_bgn); // $re .= "(?<m$n>";
|
||||
grp_mgr.Capt__add__real(grp_idx, is_empty_capture);
|
||||
break;
|
||||
}
|
||||
case Byte_ascii.Paren_end:
|
||||
if (grps_open.Count() == 0)
|
||||
throw Err_.new_wo_type("Unmatched close-paren at pattern character " + Int_.To_str(i));
|
||||
List_adp_.Del_at_last(grps_open);
|
||||
if (grp_mgr.Open__len() <= 0)
|
||||
throw Err_.new_wo_type("Unmatched close-paren at pattern character " + Int_.To_str(i_end));
|
||||
grp_mgr.Open__pop();
|
||||
bfr.Add_byte(Byte_ascii.Paren_end);
|
||||
break;
|
||||
case Byte_ascii.Percent:
|
||||
++i;
|
||||
if (i >= len) throw Err_.new_wo_type("malformed pattern (ends with '%')");
|
||||
Object percent_obj = percent_hash.Get_by_mid(src, i, i + 1);
|
||||
i++;
|
||||
if (i >= len)
|
||||
throw Err_.new_wo_type("malformed pattern (ends with '%')");
|
||||
Object percent_obj = percent_hash.Get_by_mid(pat, i, i + 1);
|
||||
if (percent_obj != null) {
|
||||
bfr.Add((byte[])percent_obj);
|
||||
q_flag = true;
|
||||
}
|
||||
else {
|
||||
byte nxt = src[i];
|
||||
byte nxt = pat[i];
|
||||
switch (nxt) {
|
||||
case Byte_ascii.Ltr_b: // EX: "%b()"
|
||||
i += 2;
|
||||
if (i >= len) throw Err_.new_wo_type("malformed pattern (missing arguments to '%b')");
|
||||
byte char_0 = src[i - 1];
|
||||
byte char_1 = src[i];
|
||||
byte char_0 = pat[i - 1];
|
||||
byte char_1 = pat[i];
|
||||
if (char_0 == char_1) { // same char: easier regex; REF.MW: $bfr .= "{$d1}[^$d1]*$d1";
|
||||
bfr.Add(Bry_bf0_seg_0);
|
||||
Regx_quote(bfr, char_0);
|
||||
@ -85,40 +94,46 @@ public class Scrib_regx_converter {
|
||||
}
|
||||
else { // diff char: harder regex; REF.MW: $bfr .= "(?<b$bct>$d1(?:(?>[^$d1$d2]+)|(?P>b$bct))*$d2)";
|
||||
if (fmtr_balanced == null) {
|
||||
// JAVA:recursive regex not possible, but workaround is possible PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02
|
||||
// JAVA:recursive regex not possible, so need complicated regex
|
||||
// REF: https://stackoverflow.com/questions/47162098/is-it-possible-to-match-nested-brackets-with-regex-without-using-recursion-or-ba/47162099#47162099
|
||||
fmtr_balanced = Bry_fmtr.new_("(?=\\~{1})(?:(?=.*?\\~{1}(?!.*?\\1)(.*\\~{2}(?!.*\\2).*))(?=.*?\\~{2}(?!.*?\\2)(.*)).)+?.*?(?=\\1)[^\\~{1}]*(?=\\2$)", "0", "1", "2");
|
||||
// PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02
|
||||
fmtr_balanced = Bry_fmtr.new_("(?=\\~{1})(?:(?=.*?\\~{1}(?!.*?\\~{3})(.*\\~{2}(?!.*\\~{4}).*))(?=.*?\\~{2}(?!.*?\\~{4})(.*)).)+?.*?(?=\\~{3})[^\\~{1}]*(?=\\~{4}$)", "unused", "1", "2", "3", "4");
|
||||
bfr_balanced = Bry_bfr_.Reset(255);
|
||||
}
|
||||
synchronized (fmtr_balanced) {
|
||||
++bct;
|
||||
fmtr_balanced.Bld_bfr(bfr_balanced, Int_.To_bry(bct), Byte_.Ary(char_0), Byte_.Ary(char_1));
|
||||
int balanced_idx = grp_mgr.Full__len();
|
||||
fmtr_balanced.Bld_bfr(bfr_balanced, Int_.To_bry(bct), Byte_.Ary(char_0), Byte_.Ary(char_1), Int_.To_bry(balanced_idx + 1), Int_.To_bry(balanced_idx + 2));
|
||||
grp_mgr.Capt__add__fake(2);
|
||||
bfr.Add(bfr_balanced.To_bry_and_clear());
|
||||
}
|
||||
}
|
||||
break;
|
||||
case Byte_ascii.Ltr_f: { // EX: lua frontier pattern; "%f[%a]"; DATE:2015-07-21
|
||||
++i;
|
||||
if (i + 1 >= len || src[i] != Byte_ascii.Brack_bgn) throw Err_.new_("scribunto", "missing '[' after %f in pattern at pattern character $ii");
|
||||
if (i + 1 >= len || pat[++i] != Byte_ascii.Brack_bgn)
|
||||
throw Err_.new_("scribunto", "missing '[' after %f in pattern at pattern character " + Int_.To_str(i_end));
|
||||
// %f always followed by bracketed term; convert lua bracketed term to regex
|
||||
i = bracketedCharSetToRegex(tmp_bfr, src, i, len);
|
||||
byte[] bracketed_regx = tmp_bfr.To_bry_and_clear();
|
||||
if (tmp_bfr == null) tmp_bfr = Bry_bfr_.New();
|
||||
i = bracketedCharSetToRegex(tmp_bfr, pat, i, len);
|
||||
byte[] re2 = tmp_bfr.To_bry_and_clear();
|
||||
|
||||
// scrib has following comment: 'Because %f considers the beginning and end of the String to be \0, determine if $re2 matches that and take it into account with "^" and "$".'
|
||||
// if the bracketed_regx is a negative class it will match \0; so, \W means anything not a word char, which will match \0; \w means word char which will not match \0
|
||||
if (Regx_adp_.Match("\0", String_.new_u8(bracketed_regx)))
|
||||
bfr.Add_str_a7("(?<!^)(?<!").Add(bracketed_regx).Add_str_a7(")(?=").Add(bracketed_regx).Add_str_a7("|$)"); // match bgn / end of String
|
||||
// if the re2 is a negative class it will match \0; so, \W means anything not a word char, which will match \0; \w means word char which will not match \0
|
||||
if (Regx_adp_.Match("\0", String_.new_u8(re2)))
|
||||
bfr.Add_str_a7("(?<!^)(?<!").Add(re2).Add_str_a7(")(?=").Add(re2).Add_str_a7("|$)"); // match bgn / end of String
|
||||
else
|
||||
bfr .Add_str_a7("(?<!").Add(bracketed_regx).Add_str_a7(")(?=").Add(bracketed_regx).Add_str_a7( ")");
|
||||
bfr.Add_str_a7("(?<!" ).Add(re2).Add_str_a7(")(?=").Add(re2).Add_str_a7( ")");
|
||||
break;
|
||||
}
|
||||
case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4:
|
||||
case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9:
|
||||
grps_len = nxt - Byte_ascii.Num_0;
|
||||
if (grps_len == 0 || grps_len > capt_list.Count() || grps_open_Has(grps_open, grps_len))
|
||||
case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9: {
|
||||
int grp_idx = nxt - Byte_ascii.Num_0;
|
||||
if (grp_idx == 0 || grp_idx > grp_mgr.Capt__len() || grp_mgr.Open__has(grp_idx))
|
||||
throw Err_.new_wo_type("invalid capture index %" + grps_len + " at pattern character " + Int_.To_str(i));
|
||||
bfr.Add(Bry_bf2_seg_0).Add_int_variable(grps_len);//.Add(Bry_bf2_seg_1); // $bfr .= "\\g{m$grps_len}";
|
||||
bfr.Add(Bry_bf2_seg_0);
|
||||
grp_mgr.Idx__add(bfr, grp_idx);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
Regx_quote(bfr, nxt);
|
||||
q_flag = true;
|
||||
@ -127,21 +142,22 @@ public class Scrib_regx_converter {
|
||||
}
|
||||
break;
|
||||
case Byte_ascii.Brack_bgn:
|
||||
i = bracketedCharSetToRegex(bfr, src, i, len);
|
||||
i = bracketedCharSetToRegex(bfr, pat, i, len);
|
||||
q_flag = true;
|
||||
break;
|
||||
case Byte_ascii.Brack_end: throw Err_.new_wo_type("Unmatched close-bracket at pattern character " + Int_.To_str(i));
|
||||
case Byte_ascii.Brack_end:
|
||||
throw Err_.new_wo_type("Unmatched close-bracket at pattern character " + Int_.To_str(i_end));
|
||||
case Byte_ascii.Dot:
|
||||
q_flag = true;
|
||||
bfr.Add_byte(Byte_ascii.Dot);
|
||||
q_flag = true;
|
||||
break;
|
||||
default:
|
||||
q_flag = true;
|
||||
Regx_quote(bfr, cur);
|
||||
q_flag = true;
|
||||
break;
|
||||
}
|
||||
if (q_flag && i + 1 < len) {
|
||||
byte tmp_b = src[i + 1];
|
||||
byte tmp_b = pat[i + 1];
|
||||
switch (tmp_b) {
|
||||
case Byte_ascii.Star:
|
||||
case Byte_ascii.Plus:
|
||||
@ -150,17 +166,18 @@ public class Scrib_regx_converter {
|
||||
++i;
|
||||
break;
|
||||
case Byte_ascii.Dash:
|
||||
bfr.Add(Bry_regx_dash);
|
||||
++i;
|
||||
bfr.Add(Bry_star_question);
|
||||
i++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (grps_open.Count() > 0) throw Err_.new_wo_type("Unclosed capture beginning at pattern character " + Int_.Cast(grps_open.Get_at(0)));
|
||||
// bfr.Add(Bry_regx_end); // NOTE: do not add PHP /us at end; u=PCRE_UTF8 which is not needed for Java; s=PCRE_DOTALL which will be specified elsewhere
|
||||
if (grp_mgr.Open__len() > 0)
|
||||
throw Err_.new_wo_type("Unclosed capture beginning at pattern character " + grp_mgr.Open__get_at(0));
|
||||
// bfr.Add(Bry_regx_end); // TOMBSTONE: do not add PHP /us at end; u=PCRE_UTF8 which is not needed for Java; s=PCRE_DOTALL which will be specified elsewhere
|
||||
regx = bfr.To_str_and_clear();
|
||||
return regx;
|
||||
} private Bry_bfr bfr = Bry_bfr_.New();
|
||||
}
|
||||
private int bracketedCharSetToRegex(Bry_bfr bfr, byte[] pat, int i, int len) {
|
||||
bfr.Add_byte(Byte_ascii.Brack_bgn);
|
||||
i++;
|
||||
@ -210,14 +227,6 @@ public class Scrib_regx_converter {
|
||||
}
|
||||
return i;
|
||||
}
|
||||
boolean grps_open_Has(List_adp list, int v) {
|
||||
int len = list.Count();
|
||||
for (int i = 0; i < len; i++) {
|
||||
Object o = list.Get_at(i);
|
||||
if (Int_.Cast(o) == v) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
private void Regx_quote(Bry_bfr bfr, byte b) {
|
||||
if (Regx_char(b)) bfr.Add_byte(Byte_ascii.Backslash);
|
||||
bfr.Add_byte(b);
|
||||
@ -239,32 +248,34 @@ public class Scrib_regx_converter {
|
||||
, Bry_dollar_literal = Bry_.new_a7("$"), Bry_dollar_escaped = Bry_.new_a7("\\$")
|
||||
, Bry_bf0_seg_0 = Bry_.new_a7("{"), Bry_bf0_seg_1 = Bry_.new_a7("}[^"), Bry_bf0_seg_2 = Bry_.new_a7("]*")
|
||||
, Bry_bf2_seg_0 = Bry_.new_a7("\\")//, Bry_bf2_seg_1 = Bry_.new_a7("")
|
||||
, Bry_regx_dash = Bry_.new_a7("*?") // was *?
|
||||
, Bry_star_question = Bry_.new_a7("*?") // was *?
|
||||
;
|
||||
public static final byte[] Anchor_null = null, Anchor_G = Bry_.new_a7("\\G"), Anchor_pow = Bry_.new_a7("^");
|
||||
private void Init() {
|
||||
String regx_w = "\\w"; // JRE.7: \w not support in JRE.6; PAGE:en.w:A♯_(musical_note) DATE:2015-06-10
|
||||
String regx_W = "\\W"; // JRE.7: \w not support in JRE.6; PAGE:en.w:A♯_(musical_note) DATE:2015-06-10
|
||||
Init_itm(Bool_.Y, "d", "\\p{Nd}");
|
||||
Init_itm(Bool_.Y, "l", "\\p{Ll}");
|
||||
Init_itm(Bool_.Y, "u", "\\p{Lu}");
|
||||
Init_itm(Bool_.Y, "a", "\\p{L}");
|
||||
Init_itm(Bool_.Y, "c", "\\p{Cc}");
|
||||
Init_itm(Bool_.Y, "d", "\\p{Nd}");
|
||||
Init_itm(Bool_.Y, "l", "\\p{Ll}");
|
||||
Init_itm(Bool_.Y, "p", "\\p{P}");
|
||||
Init_itm(Bool_.Y, "s", "\\s");
|
||||
Init_itm(Bool_.Y, "s", "\\s"); // JAVA: \p{Xps} not valid; REF: https://docs.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html
|
||||
Init_itm(Bool_.Y, "u", "\\p{Lu}");
|
||||
Init_itm(Bool_.Y, "w", regx_w);
|
||||
Init_itm(Bool_.Y, "x", "[0-9A-Fa-f0-9A-Fa-f]");
|
||||
Init_itm(Bool_.Y, "z", "\\x00");
|
||||
Init_itm(Bool_.Y, "D", "\\P{Nd}");
|
||||
Init_itm(Bool_.Y, "L", "\\P{Ll}");
|
||||
Init_itm(Bool_.Y, "U", "\\P{Lu}");
|
||||
|
||||
Init_itm(Bool_.Y, "A", "\\P{L}");
|
||||
Init_itm(Bool_.Y, "C", "\\P{Cc}");
|
||||
Init_itm(Bool_.Y, "D", "\\P{Nd}");
|
||||
Init_itm(Bool_.Y, "L", "\\P{Ll}");
|
||||
Init_itm(Bool_.Y, "P", "\\P{P}");
|
||||
Init_itm(Bool_.Y, "S", "\\S"); // JAVA: \P{Xps} not valid
|
||||
Init_itm(Bool_.Y, "U", "\\P{Lu}");
|
||||
Init_itm(Bool_.Y, "W", regx_W);
|
||||
Init_itm(Bool_.Y, "X", "[^0-9A-Fa-f0-9A-Fa-f]");
|
||||
Init_itm(Bool_.Y, "Z", "[^\\x00]");
|
||||
|
||||
Init_itm(Bool_.N, "w", regx_w);
|
||||
Init_itm(Bool_.N, "x", "0-9A-Fa-f0-9A-Fa-f");
|
||||
Init_itm(Bool_.N, "W", regx_W);
|
||||
@ -282,5 +293,4 @@ public class Scrib_regx_converter {
|
||||
brack_hash.Add_if_dupe_use_nth(lua_bry, php_bry); // replace percent_hash definitions
|
||||
}
|
||||
}
|
||||
private final Hash_adp_bry percent_hash = Hash_adp_bry.cs(), brack_hash = Hash_adp_bry.cs();
|
||||
}
|
||||
|
@ -61,11 +61,11 @@ class Scrib_regx_converter_fxt {
|
||||
}
|
||||
}
|
||||
public void Test_parse(String raw, String expd) {
|
||||
under.Parse(Bry_.new_u8(raw), Scrib_regx_converter.Anchor_G);
|
||||
under.patternToRegex(Bry_.new_u8(raw), Scrib_regx_converter.Anchor_G);
|
||||
Tfds.Eq(expd, under.Regx());
|
||||
}
|
||||
public void Test_replace(String text, String find, String replace, String expd) {
|
||||
String regex_str = under.Parse(Bry_.new_u8(find), Scrib_regx_converter.Anchor_G);
|
||||
String regex_str = under.patternToRegex(Bry_.new_u8(find), Scrib_regx_converter.Anchor_G);
|
||||
String actl = Regx_adp_.Replace(text, regex_str, replace);
|
||||
Tfds.Eq(expd, actl);
|
||||
}
|
||||
|
@ -0,0 +1,90 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*;
|
||||
import gplx.langs.regxs.*;
|
||||
class Scrib_regx_grp_mgr {
|
||||
private final List_adp capt_list = List_adp_.New();
|
||||
private final List_adp full_list = List_adp_.New();
|
||||
private final List_adp open_list = List_adp_.New();
|
||||
private final Hash_adp idx_list = Hash_adp_.New();
|
||||
private int fake_count;
|
||||
public void Clear() {
|
||||
open_list.Clear();
|
||||
capt_list.Clear();
|
||||
full_list.Clear();
|
||||
idx_list.Clear();
|
||||
fake_count = 0;
|
||||
}
|
||||
public int Full__len() {return full_list.Len();}
|
||||
public int Open__len() {return open_list.Len();}
|
||||
public int Open__get_at(int idx) {return Int_.Cast(open_list.Get_at(idx));}
|
||||
public void Open__pop() {List_adp_.Del_at_last(open_list);}
|
||||
public boolean Open__has(int v) {
|
||||
int len = open_list.Count();
|
||||
for (int i = 0; i < len; i++) {
|
||||
Object o = open_list.Get_at(i);
|
||||
if (Int_.Cast(o) == v) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public int Capt__len() {return capt_list.Count();}
|
||||
public Keyval[] Capt__to_ary() {return capt_list.Count() == 0 ? null : (Keyval[])capt_list.To_ary(Keyval.class);}
|
||||
public void Capt__add__real(int grp_idx, boolean is_empty_capture) {
|
||||
capt_list.Add(Keyval_.int_(grp_idx, is_empty_capture));
|
||||
open_list.Add(grp_idx);
|
||||
full_list.Add(new Scrib_regx_grp_itm(Bool_.N, is_empty_capture, full_list.Len()));
|
||||
idx_list.Add(grp_idx, full_list.Len());
|
||||
}
|
||||
public void Capt__add__fake(int count) {
|
||||
for (int i = 0; i < count; i++)
|
||||
full_list.Add(new Scrib_regx_grp_itm(Bool_.Y, Bool_.N, full_list.Len()));
|
||||
fake_count += count;
|
||||
}
|
||||
public void Idx__add(Bry_bfr bfr, int regx_idx) {
|
||||
int actl_idx = Int_.Cast(idx_list.Get_by(regx_idx));
|
||||
bfr.Add_int_variable(actl_idx);
|
||||
}
|
||||
public Regx_match[] Adjust_balanced(Regx_match[] matches) {
|
||||
if (fake_count == 0) return matches;
|
||||
|
||||
int matches_len = matches.length;
|
||||
Regx_match[] rv = new Regx_match[matches_len];
|
||||
for (int i = 0; i < matches_len; i++) {
|
||||
Regx_match match = matches[i];
|
||||
Regx_group[] old_groups = match.Groups();
|
||||
Regx_group[] new_groups = new Regx_group[full_list.Len() - fake_count];
|
||||
int group_idx = 0;
|
||||
for (int j = 0; j < old_groups.length; j++) {
|
||||
Scrib_regx_grp_itm itm = (Scrib_regx_grp_itm)full_list.Get_at(j);
|
||||
if (itm.Is_fake()) continue;
|
||||
new_groups[group_idx++] = old_groups[j];
|
||||
}
|
||||
rv[i] = new Regx_match(match.Rslt(), match.Find_bgn(), match.Find_end(), new_groups);
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
}
|
||||
/** Immutable description of one regex group: whether it is fake, whether it is an empty capture, and its index. */
class Scrib_regx_grp_itm {
	private final boolean is_fake;
	private final boolean is_empty_capture;
	private final int idx;

	/** Stores the three values as given; no validation is done. */
	public Scrib_regx_grp_itm(boolean is_fake, boolean is_empty_capture, int idx) {
		this.idx = idx;
		this.is_empty_capture = is_empty_capture;
		this.is_fake = is_fake;
	}

	/** The is_fake flag passed to the constructor. */
	public boolean Is_fake() {
		return is_fake;
	}

	/** The is_empty_capture flag passed to the constructor. */
	public boolean Is_empty_capture() {
		return is_empty_capture;
	}

	/** The idx passed to the constructor. */
	public int Idx() {
		return idx;
	}
}
|
Loading…
Reference in New Issue
Block a user