mirror of
https://github.com/gnosygnu/xowa.git
synced 2024-10-27 20:34:16 +00:00
Scribunto: Add initial support for LuaJ StringLib as replacement for Regex [#413]
This commit is contained in:
parent
2fc03f6211
commit
31c7604f03
@ -24,6 +24,7 @@ public interface Unicode_string {
|
|||||||
int Val_codes(int i);
|
int Val_codes(int i);
|
||||||
int Pos_codes_to_bytes(int i);
|
int Pos_codes_to_bytes(int i);
|
||||||
int Pos_codes_to_chars(int i);
|
int Pos_codes_to_chars(int i);
|
||||||
|
int Pos_bytes_to_chars(int i);
|
||||||
int Pos_bytes_to_codes(int i);
|
int Pos_bytes_to_codes(int i);
|
||||||
int Pos_chars_to_codes(int i);
|
int Pos_chars_to_codes(int i);
|
||||||
}
|
}
|
||||||
@ -43,7 +44,8 @@ class Unicode_string_single implements Unicode_string { // 1 byte == 1 codepoint
|
|||||||
public int Len_bytes() {return codes_len;}
|
public int Len_bytes() {return codes_len;}
|
||||||
public int Val_codes(int i) {return codes[i];}
|
public int Val_codes(int i) {return codes[i];}
|
||||||
public int Pos_codes_to_bytes(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i); return i;}
|
public int Pos_codes_to_bytes(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i); return i;}
|
||||||
public int Pos_codes_to_chars(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i);return i;}
|
public int Pos_codes_to_chars(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i); return i;}
|
||||||
public int Pos_bytes_to_codes(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i);return i;}
|
public int Pos_bytes_to_chars(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i); return i;}
|
||||||
public int Pos_chars_to_codes(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i);return i;}
|
public int Pos_bytes_to_codes(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i); return i;}
|
||||||
|
public int Pos_chars_to_codes(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i); return i;}
|
||||||
}
|
}
|
||||||
|
@ -18,6 +18,7 @@ class Unicode_string_multi implements Unicode_string {
|
|||||||
private final int[] codes;
|
private final int[] codes;
|
||||||
private final int[] codes_to_bytes;
|
private final int[] codes_to_bytes;
|
||||||
private final int[] codes_to_chars;
|
private final int[] codes_to_chars;
|
||||||
|
private final int[] bytes_to_chars;
|
||||||
private final int[] bytes_to_codes;
|
private final int[] bytes_to_codes;
|
||||||
private final int[] chars_to_codes;
|
private final int[] chars_to_codes;
|
||||||
|
|
||||||
@ -34,6 +35,7 @@ class Unicode_string_multi implements Unicode_string {
|
|||||||
this.codes_to_bytes = new int[codes_len + Adj_end];
|
this.codes_to_bytes = new int[codes_len + Adj_end];
|
||||||
this.codes_to_chars = new int[codes_len + Adj_end];
|
this.codes_to_chars = new int[codes_len + Adj_end];
|
||||||
this.bytes_to_codes = New_int_ary(bytes_len);
|
this.bytes_to_codes = New_int_ary(bytes_len);
|
||||||
|
this.bytes_to_chars = New_int_ary(bytes_len);
|
||||||
this.chars_to_codes = New_int_ary(chars_len);
|
this.chars_to_codes = New_int_ary(chars_len);
|
||||||
|
|
||||||
// init loop
|
// init loop
|
||||||
@ -46,6 +48,7 @@ class Unicode_string_multi implements Unicode_string {
|
|||||||
// update
|
// update
|
||||||
codes_to_bytes[codes_pos] = bytes_pos;
|
codes_to_bytes[codes_pos] = bytes_pos;
|
||||||
codes_to_chars[codes_pos] = chars_pos;
|
codes_to_chars[codes_pos] = chars_pos;
|
||||||
|
bytes_to_chars[bytes_pos] = chars_pos;
|
||||||
bytes_to_codes[bytes_pos] = codes_pos;
|
bytes_to_codes[bytes_pos] = codes_pos;
|
||||||
chars_to_codes[chars_pos] = codes_pos;
|
chars_to_codes[chars_pos] = codes_pos;
|
||||||
|
|
||||||
@ -67,6 +70,7 @@ class Unicode_string_multi implements Unicode_string {
|
|||||||
public int Val_codes(int i) {return codes[i];}
|
public int Val_codes(int i) {return codes[i];}
|
||||||
public int Pos_codes_to_bytes(int i) {return codes_to_bytes[i];}
|
public int Pos_codes_to_bytes(int i) {return codes_to_bytes[i];}
|
||||||
public int Pos_codes_to_chars(int i) {return codes_to_chars[i];}
|
public int Pos_codes_to_chars(int i) {return codes_to_chars[i];}
|
||||||
|
public int Pos_bytes_to_chars(int i) {int rv = bytes_to_chars[i]; if (rv == Invalid) throw Err_.new_wo_type("invalid i", "src", src, "type", "bytes_to_chars", "i", i); return rv;}
|
||||||
public int Pos_bytes_to_codes(int i) {int rv = bytes_to_codes[i]; if (rv == Invalid) throw Err_.new_wo_type("invalid i", "src", src, "type", "bytes_to_codes", "i", i); return rv;}
|
public int Pos_bytes_to_codes(int i) {int rv = bytes_to_codes[i]; if (rv == Invalid) throw Err_.new_wo_type("invalid i", "src", src, "type", "bytes_to_codes", "i", i); return rv;}
|
||||||
public int Pos_chars_to_codes(int i) {int rv = chars_to_codes[i]; if (rv == Invalid) throw Err_.new_wo_type("invalid i", "src", src, "type", "chars_to_codes", "i", i); return rv;}
|
public int Pos_chars_to_codes(int i) {int rv = chars_to_codes[i]; if (rv == Invalid) throw Err_.new_wo_type("invalid i", "src", src, "type", "chars_to_codes", "i", i); return rv;}
|
||||||
|
|
||||||
|
@ -17,6 +17,7 @@ package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import
|
|||||||
import gplx.core.intls.*; import gplx.langs.regxs.*;
|
import gplx.core.intls.*; import gplx.langs.regxs.*;
|
||||||
import gplx.xowa.parsers.*;
|
import gplx.xowa.parsers.*;
|
||||||
import gplx.xowa.xtns.scribunto.procs.*;
|
import gplx.xowa.xtns.scribunto.procs.*;
|
||||||
|
import gplx.xowa.xtns.scribunto.libs.patterns.*;
|
||||||
public class Scrib_lib_ustring implements Scrib_lib {
|
public class Scrib_lib_ustring implements Scrib_lib {
|
||||||
public Scrib_lib_ustring(Scrib_core core) {this.core = core;} private Scrib_core core;
|
public Scrib_lib_ustring(Scrib_core core) {this.core = core;} private Scrib_core core;
|
||||||
public Scrib_lua_mod Mod() {return mod;} private Scrib_lua_mod mod;
|
public Scrib_lua_mod Mod() {return mod;} private Scrib_lua_mod mod;
|
||||||
@ -98,7 +99,7 @@ public class Scrib_lib_ustring implements Scrib_lib {
|
|||||||
|
|
||||||
// run regex
|
// run regex
|
||||||
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
|
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
|
||||||
Regx_match[] regx_rslts = Run_regex_or_null(text_ucs, regx_converter, find_str, bgn_as_codes);
|
Regx_match[] regx_rslts = Scrib_pattern_matcher_.Instance().Match(core.Ctx().Page().Url(), text_ucs, regx_converter, find_str, bgn_as_codes);
|
||||||
if (regx_rslts.length == 0) return rslt.Init_ary_empty();
|
if (regx_rslts.length == 0) return rslt.Init_ary_empty();
|
||||||
|
|
||||||
// add to tmp_list
|
// add to tmp_list
|
||||||
@ -123,7 +124,7 @@ public class Scrib_lib_ustring implements Scrib_lib {
|
|||||||
|
|
||||||
// run regex
|
// run regex
|
||||||
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
|
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
|
||||||
Regx_match[] regx_rslts = Run_regex_or_null(text_ucs, regx_converter, find_str, bgn_as_codes);
|
Regx_match[] regx_rslts = Scrib_pattern_matcher_.Instance().Match(core.Ctx().Page().Url(), text_ucs, regx_converter, find_str, bgn_as_codes);
|
||||||
if (regx_rslts.length == 0) return rslt.Init_null(); // return null if no matches found; EX:w:Mount_Gambier_(volcano); DATE:2014-04-02; confirmed with en.d:民; DATE:2015-01-30
|
if (regx_rslts.length == 0) return rslt.Init_null(); // return null if no matches found; EX:w:Mount_Gambier_(volcano); DATE:2014-04-02; confirmed with en.d:民; DATE:2015-01-30
|
||||||
|
|
||||||
// TOMBSTONE: add 1st match only; do not add all; PAGE:en.d:действительное_причастие_настоящего_времени DATE:2017-04-23
|
// TOMBSTONE: add 1st match only; do not add all; PAGE:en.d:действительное_причастие_настоящего_времени DATE:2017-04-23
|
||||||
@ -140,7 +141,7 @@ public class Scrib_lib_ustring implements Scrib_lib {
|
|||||||
// String text = Scrib_kv_utl_.Val_to_str(values, 0);
|
// String text = Scrib_kv_utl_.Val_to_str(values, 0);
|
||||||
String regx = args.Pull_str(1);
|
String regx = args.Pull_str(1);
|
||||||
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
|
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
|
||||||
String pcre = regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_null);
|
String pcre = regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_null, true);
|
||||||
return rslt.Init_many_objs(pcre, regx_converter.Capt_ary());
|
return rslt.Init_many_objs(pcre, regx_converter.Capt_ary());
|
||||||
}
|
}
|
||||||
public boolean Gmatch_callback(Scrib_proc_args args, Scrib_proc_rslt rslt) {
|
public boolean Gmatch_callback(Scrib_proc_args args, Scrib_proc_rslt rslt) {
|
||||||
@ -148,7 +149,7 @@ public class Scrib_lib_ustring implements Scrib_lib {
|
|||||||
String regx = args.Pull_str(1);
|
String regx = args.Pull_str(1);
|
||||||
Keyval[] capt = args.Cast_kv_ary_or_null(2);
|
Keyval[] capt = args.Cast_kv_ary_or_null(2);
|
||||||
int pos = args.Pull_int(3);
|
int pos = args.Pull_int(3);
|
||||||
Regx_adp regx_adp = Scrib_lib_ustring.RegxAdp_new_(core.Ctx(), regx);
|
Regx_adp regx_adp = Scrib_lib_ustring.RegxAdp_new_(core.Ctx().Page().Url(), regx);
|
||||||
Regx_match[] regx_rslts = regx_adp.Match_all(text, pos);
|
Regx_match[] regx_rslts = regx_adp.Match_all(text, pos);
|
||||||
int len = regx_rslts.length;
|
int len = regx_rslts.length;
|
||||||
if (len == 0) return rslt.Init_many_objs(pos, Keyval_.Ary_empty);
|
if (len == 0) return rslt.Init_many_objs(pos, Keyval_.Ary_empty);
|
||||||
@ -178,14 +179,6 @@ public class Scrib_lib_ustring implements Scrib_lib {
|
|||||||
bgn_as_codes = 0;
|
bgn_as_codes = 0;
|
||||||
return bgn_as_codes;
|
return bgn_as_codes;
|
||||||
}
|
}
|
||||||
private Regx_match[] Run_regex_or_null(Unicode_string text_ucs, Scrib_regx_converter regx_converter, String find_str, int bgn_as_codes) {
|
|
||||||
// convert regex from lua to java
|
|
||||||
find_str = regx_converter.patternToRegex(find_str, Scrib_regx_converter.Anchor_G);
|
|
||||||
|
|
||||||
// run regex
|
|
||||||
Regx_adp regx_adp = Scrib_lib_ustring.RegxAdp_new_(core.Ctx(), find_str);
|
|
||||||
return regx_adp.Match_all(text_ucs.Src_string(), text_ucs.Pos_codes_to_chars(bgn_as_codes)); // NOTE: MW calculates an offset to handle mb strings. however, java's regex always takes offset in chars (not bytes like PHP preg_match); DATE:2014-03-04
|
|
||||||
}
|
|
||||||
private void AddCapturesFromMatch(List_adp tmp_list, Regx_match rslt, String text, Keyval[] capts, boolean op_is_match) {// NOTE: this matches behavior in UstringLibrary.php!addCapturesFromMatch
|
private void AddCapturesFromMatch(List_adp tmp_list, Regx_match rslt, String text, Keyval[] capts, boolean op_is_match) {// NOTE: this matches behavior in UstringLibrary.php!addCapturesFromMatch
|
||||||
int capts_len = capts == null ? 0 : capts.length;
|
int capts_len = capts == null ? 0 : capts.length;
|
||||||
if (capts_len > 0) { // NOTE: changed from "grps_len > 0"; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02
|
if (capts_len > 0) { // NOTE: changed from "grps_len > 0"; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02
|
||||||
@ -205,12 +198,12 @@ public class Scrib_lib_ustring implements Scrib_lib {
|
|||||||
&& tmp_list.Count() == 0) // only add match once; EX: "aaaa", "a" will have four matches; get 1st; DATE:2014-04-02
|
&& tmp_list.Count() == 0) // only add match once; EX: "aaaa", "a" will have four matches; get 1st; DATE:2014-04-02
|
||||||
tmp_list.Add(String_.Mid(text, rslt.Find_bgn(), rslt.Find_end()));
|
tmp_list.Add(String_.Mid(text, rslt.Find_bgn(), rslt.Find_end()));
|
||||||
}
|
}
|
||||||
public static Regx_adp RegxAdp_new_(Xop_ctx ctx, String regx) {
|
public static Regx_adp RegxAdp_new_(Xoa_url url, String regx) {
|
||||||
Regx_adp rv = Regx_adp_.new_(regx);
|
Regx_adp rv = Regx_adp_.new_(regx);
|
||||||
if (rv.Pattern_is_invalid()) {
|
if (rv.Pattern_is_invalid()) {
|
||||||
// try to identify [z-a] errors; PAGE:https://en.wiktionary.org/wiki/Module:scripts/data; DATE:2017-04-23
|
// try to identify [z-a] errors; PAGE:https://en.wiktionary.org/wiki/Module:scripts/data; DATE:2017-04-23
|
||||||
Exception exc = rv.Pattern_is_invalid_exception();
|
Exception exc = rv.Pattern_is_invalid_exception();
|
||||||
ctx.App().Usr_dlg().Log_many("", "", "regx is invalid: regx=~{0} page=~{1} exc=~{2}", regx, ctx.Page().Ttl().Page_db(), Err_.Message_gplx_log(exc));
|
Gfo_usr_dlg_.Instance.Log_many("", "", "regx is invalid: regx=~{0} page=~{1} exc=~{2}", regx, url.To_bry(), Err_.Message_gplx_log(exc));
|
||||||
}
|
}
|
||||||
return rv;
|
return rv;
|
||||||
}
|
}
|
||||||
|
@ -27,7 +27,7 @@ public class Scrib_lib_ustring__match__tst {
|
|||||||
Exec_match("abcd" , "a" , 2, String_.Null_mark); // bgn
|
Exec_match("abcd" , "a" , 2, String_.Null_mark); // bgn
|
||||||
Exec_match("abcd" , "b(c)" , 1, "c"); // group
|
Exec_match("abcd" , "b(c)" , 1, "c"); // group
|
||||||
Exec_match(" a b " , "^%s*(.-)%s*$" , 1, "a b"); // trim; NOTE: changed back from "a b;" to "a b"; DATE:2017-04-23; changed from "a b" to "a b;"; DATE:2015-01-30
|
Exec_match(" a b " , "^%s*(.-)%s*$" , 1, "a b"); // trim; NOTE: changed back from "a b;" to "a b"; DATE:2017-04-23; changed from "a b" to "a b;"; DATE:2015-01-30
|
||||||
Exec_match("abcd" , "a" , 0, "a"); // handle 0; note that php/lua is super-1, but some modules pass in 0; ru.w:Module:Infocards; DATE:2013-11-08
|
Exec_match("abcd" , "a" , 0, "a"); // handle 0; note that php/lua is BASE_1, but some modules pass in 0; ru.w:Module:Infocards; DATE:2013-11-08
|
||||||
Exec_match("abcd" , "." , -1, "d"); // -1
|
Exec_match("abcd" , "." , -1, "d"); // -1
|
||||||
Exec_match("aaa" , "a" , 1, "a"); // should return 1st match not many
|
Exec_match("aaa" , "a" , 1, "a"); // should return 1st match not many
|
||||||
Exec_match("aaa" , "(a)" , 1, "a"); // should return 1st match only; PAGE:en.d:действительное_причастие_настоящего_времени DATE:2017-04-23
|
Exec_match("aaa" , "(a)" , 1, "a"); // should return 1st match only; PAGE:en.d:действительное_причастие_настоящего_времени DATE:2017-04-23
|
||||||
|
@ -32,7 +32,7 @@ class Scrib_lib_ustring_gsub_mgr {
|
|||||||
|
|
||||||
// get @pattern; NOTE: sometimes int; PAGE:en.d:λύω; DATE:2014-09-02
|
// get @pattern; NOTE: sometimes int; PAGE:en.d:λύω; DATE:2014-09-02
|
||||||
String regx = args.Xstr_str_or_null(1);
|
String regx = args.Xstr_str_or_null(1);
|
||||||
regx = regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_pow);
|
regx = regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_pow, true);
|
||||||
|
|
||||||
// get @repl
|
// get @repl
|
||||||
Object repl_obj = args.Cast_obj_or_null(2);
|
Object repl_obj = args.Cast_obj_or_null(2);
|
||||||
@ -82,7 +82,7 @@ class Scrib_lib_ustring_gsub_mgr {
|
|||||||
}
|
}
|
||||||
private String Exec_repl(byte repl_tid, String text, String regx, int limit) {
|
private String Exec_repl(byte repl_tid, String text, String regx, int limit) {
|
||||||
// parse regx
|
// parse regx
|
||||||
Regx_adp regx_mgr = Scrib_lib_ustring.RegxAdp_new_(core.Ctx(), regx);
|
Regx_adp regx_mgr = Scrib_lib_ustring.RegxAdp_new_(core.Ctx().Page().Url(), regx);
|
||||||
if (regx_mgr.Pattern_is_invalid()) return text; // NOTE: invalid patterns should return self; EX:[^]; DATE:2014-09-02)
|
if (regx_mgr.Pattern_is_invalid()) return text; // NOTE: invalid patterns should return self; EX:[^]; DATE:2014-09-02)
|
||||||
|
|
||||||
// exec regx
|
// exec regx
|
||||||
|
@ -30,7 +30,7 @@ public class Scrib_regx_converter {
|
|||||||
public Keyval[] Capt_ary() {return grp_mgr.Capt__to_ary();}
|
public Keyval[] Capt_ary() {return grp_mgr.Capt__to_ary();}
|
||||||
public boolean Any_pos() {return any_pos;} private boolean any_pos;
|
public boolean Any_pos() {return any_pos;} private boolean any_pos;
|
||||||
public Regx_match[] Adjust_balanced(Regx_match[] rslts) {return grp_mgr.Adjust_balanced(rslts);}
|
public Regx_match[] Adjust_balanced(Regx_match[] rslts) {return grp_mgr.Adjust_balanced(rslts);}
|
||||||
public String patternToRegex(String pat_str, byte[] anchor) {
|
public String patternToRegex(String pat_str, byte[] anchor, boolean mode_is_regx) {
|
||||||
Unicode_string pat_ucs = Unicode_string_.New(pat_str);
|
Unicode_string pat_ucs = Unicode_string_.New(pat_str);
|
||||||
// TODO.CACHE: if (!$this->patternRegexCache->has($cacheKey))
|
// TODO.CACHE: if (!$this->patternRegexCache->has($cacheKey))
|
||||||
grp_mgr.Clear();
|
grp_mgr.Clear();
|
||||||
@ -49,10 +49,18 @@ public class Scrib_regx_converter {
|
|||||||
int cur = pat_ucs.Val_codes(i);
|
int cur = pat_ucs.Val_codes(i);
|
||||||
switch (cur) {
|
switch (cur) {
|
||||||
case Byte_ascii.Pow:
|
case Byte_ascii.Pow:
|
||||||
|
if (!mode_is_regx) {
|
||||||
|
bfr.Add_byte(Byte_ascii.Pow);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
q_flag = i != 0;
|
q_flag = i != 0;
|
||||||
bfr.Add((anchor == Anchor_null || q_flag) ? Bry_pow_escaped : anchor); // NOTE: must add anchor \G when using offsets; EX:cs.n:Category:1._zárí_2008; DATE:2014-05-07
|
bfr.Add((anchor == Anchor_null || q_flag) ? Bry_pow_escaped : anchor); // NOTE: must add anchor \G when using offsets; EX:cs.n:Category:1._zárí_2008; DATE:2014-05-07
|
||||||
break;
|
break;
|
||||||
case Byte_ascii.Dollar:
|
case Byte_ascii.Dollar:
|
||||||
|
if (!mode_is_regx) {
|
||||||
|
bfr.Add_byte(Byte_ascii.Dollar);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
q_flag = i < len - 1;
|
q_flag = i < len - 1;
|
||||||
bfr.Add(q_flag ? Bry_dollar_escaped : Bry_dollar_literal);
|
bfr.Add(q_flag ? Bry_dollar_escaped : Bry_dollar_literal);
|
||||||
break;
|
break;
|
||||||
@ -78,6 +86,10 @@ public class Scrib_regx_converter {
|
|||||||
bfr.Add_byte(Byte_ascii.Paren_end);
|
bfr.Add_byte(Byte_ascii.Paren_end);
|
||||||
break;
|
break;
|
||||||
case Byte_ascii.Percent:
|
case Byte_ascii.Percent:
|
||||||
|
if (!mode_is_regx) {
|
||||||
|
bfr.Add_byte(Byte_ascii.Percent);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
i++;
|
i++;
|
||||||
if (i >= len)
|
if (i >= len)
|
||||||
throw Err_.new_wo_type("malformed pattern (ends with '%')");
|
throw Err_.new_wo_type("malformed pattern (ends with '%')");
|
||||||
@ -114,7 +126,8 @@ public class Scrib_regx_converter {
|
|||||||
++bct;
|
++bct;
|
||||||
int balanced_idx = grp_mgr.Full__len();
|
int balanced_idx = grp_mgr.Full__len();
|
||||||
fmtr_balanced.Bld_bfr(bfr_balanced, Int_.To_bry(bct), Utf16_.Encode_int_to_bry(char_0), Utf16_.Encode_int_to_bry(char_1), Int_.To_bry(balanced_idx + 1), Int_.To_bry(balanced_idx + 2));
|
fmtr_balanced.Bld_bfr(bfr_balanced, Int_.To_bry(bct), Utf16_.Encode_int_to_bry(char_0), Utf16_.Encode_int_to_bry(char_1), Int_.To_bry(balanced_idx + 1), Int_.To_bry(balanced_idx + 2));
|
||||||
grp_mgr.Capt__add__fake(2);
|
if (mode_is_regx)
|
||||||
|
grp_mgr.Capt__add__fake(2);
|
||||||
bfr.Add(bfr_balanced.To_bry_and_clear());
|
bfr.Add(bfr_balanced.To_bry_and_clear());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -152,16 +165,32 @@ public class Scrib_regx_converter {
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case Byte_ascii.Brack_bgn:
|
case Byte_ascii.Brack_bgn:
|
||||||
|
if (!mode_is_regx) {
|
||||||
|
bfr.Add_byte(Byte_ascii.Brack_bgn);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
i = bracketedCharSetToRegex(bfr, pat_ucs, i, len);
|
i = bracketedCharSetToRegex(bfr, pat_ucs, i, len);
|
||||||
q_flag = true;
|
q_flag = true;
|
||||||
break;
|
break;
|
||||||
case Byte_ascii.Brack_end:
|
case Byte_ascii.Brack_end:
|
||||||
|
if (!mode_is_regx) {
|
||||||
|
bfr.Add_byte(Byte_ascii.Brack_end);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
throw Err_.new_wo_type("Unmatched close-bracket at pattern character " + Int_.To_str(i_end));
|
throw Err_.new_wo_type("Unmatched close-bracket at pattern character " + Int_.To_str(i_end));
|
||||||
case Byte_ascii.Dot:
|
case Byte_ascii.Dot:
|
||||||
|
if (!mode_is_regx) {
|
||||||
|
bfr.Add_byte(Byte_ascii.Dot);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
bfr.Add_byte(Byte_ascii.Dot);
|
bfr.Add_byte(Byte_ascii.Dot);
|
||||||
q_flag = true;
|
q_flag = true;
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
|
if (!mode_is_regx) {
|
||||||
|
bfr.Add_u8_int(cur);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
Regx_quote(bfr, cur);
|
Regx_quote(bfr, cur);
|
||||||
q_flag = true;
|
q_flag = true;
|
||||||
break;
|
break;
|
||||||
|
@ -64,11 +64,11 @@ class Scrib_regx_converter_fxt {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
public void Test_parse(String raw, String expd) {
|
public void Test_parse(String raw, String expd) {
|
||||||
under.patternToRegex(raw, Scrib_regx_converter.Anchor_G);
|
under.patternToRegex(raw, Scrib_regx_converter.Anchor_G, true);
|
||||||
Tfds.Eq(expd, under.Regx());
|
Tfds.Eq(expd, under.Regx());
|
||||||
}
|
}
|
||||||
public void Test_replace(String text, String find, String replace, String expd) {
|
public void Test_replace(String text, String find, String replace, String expd) {
|
||||||
String regex_str = under.patternToRegex(find, Scrib_regx_converter.Anchor_G);
|
String regex_str = under.patternToRegex(find, Scrib_regx_converter.Anchor_G, true);
|
||||||
String actl = Regx_adp_.Replace(text, regex_str, replace);
|
String actl = Regx_adp_.Replace(text, regex_str, replace);
|
||||||
Tfds.Eq(expd, actl);
|
Tfds.Eq(expd, actl);
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,21 @@
|
|||||||
|
/*
|
||||||
|
XOWA: the XOWA Offline Wiki Application
|
||||||
|
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||||
|
|
||||||
|
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||||
|
or alternatively under the terms of the Apache License Version 2.0.
|
||||||
|
|
||||||
|
You may use XOWA according to either of these licenses as is most appropriate
|
||||||
|
for your project on a case-by-case basis.
|
||||||
|
|
||||||
|
The terms of each license can be found in the source code repository:
|
||||||
|
|
||||||
|
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||||
|
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||||
|
*/
|
||||||
|
package gplx.xowa.xtns.scribunto.libs.patterns; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*; import gplx.xowa.xtns.scribunto.libs.*;
|
||||||
|
import gplx.core.intls.*;
|
||||||
|
import gplx.langs.regxs.*;
|
||||||
|
public interface Scrib_pattern_matcher {
|
||||||
|
Regx_match[] Match(Xoa_url url, Unicode_string text_ucs, Scrib_regx_converter regx_converter, String find_str, int bgn_as_codes);
|
||||||
|
}
|
@ -0,0 +1,36 @@
|
|||||||
|
/*
|
||||||
|
XOWA: the XOWA Offline Wiki Application
|
||||||
|
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||||
|
|
||||||
|
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||||
|
or alternatively under the terms of the Apache License Version 2.0.
|
||||||
|
|
||||||
|
You may use XOWA according to either of these licenses as is most appropriate
|
||||||
|
for your project on a case-by-case basis.
|
||||||
|
|
||||||
|
The terms of each license can be found in the source code repository:
|
||||||
|
|
||||||
|
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||||
|
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||||
|
*/
|
||||||
|
package gplx.xowa.xtns.scribunto.libs.patterns; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*; import gplx.xowa.xtns.scribunto.libs.*;
|
||||||
|
import gplx.core.intls.*;
|
||||||
|
import gplx.langs.regxs.*;
|
||||||
|
public class Scrib_pattern_matcher_ {
|
||||||
|
private static final Scrib_pattern_matcher instance = New();
|
||||||
|
private static Scrib_pattern_matcher New() {
|
||||||
|
return new Scrib_pattern_matcher__regx();
|
||||||
|
// return new Scrib_pattern_matcher__luaj();
|
||||||
|
}
|
||||||
|
public static Scrib_pattern_matcher Instance() {return instance;}
|
||||||
|
}
|
||||||
|
class Scrib_pattern_matcher__regx implements Scrib_pattern_matcher {
|
||||||
|
public Regx_match[] Match(Xoa_url url, Unicode_string text_ucs, Scrib_regx_converter regx_converter, String find_str, int bgn_as_codes) {
|
||||||
|
// convert regex from lua to java
|
||||||
|
find_str = regx_converter.patternToRegex(find_str, Scrib_regx_converter.Anchor_G, true);
|
||||||
|
|
||||||
|
// run regex
|
||||||
|
Regx_adp regx_adp = Scrib_lib_ustring.RegxAdp_new_(url, find_str);
|
||||||
|
return regx_adp.Match_all(text_ucs.Src_string(), text_ucs.Pos_codes_to_chars(bgn_as_codes)); // NOTE: MW calculates an offset to handle mb strings. however, java's regex always takes offset in chars (not bytes like PHP preg_match); DATE:2014-03-04
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,49 @@
|
|||||||
|
/*
|
||||||
|
XOWA: the XOWA Offline Wiki Application
|
||||||
|
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||||
|
|
||||||
|
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||||
|
or alternatively under the terms of the Apache License Version 2.0.
|
||||||
|
|
||||||
|
You may use XOWA according to either of these licenses as is most appropriate
|
||||||
|
for your project on a case-by-case basis.
|
||||||
|
|
||||||
|
The terms of each license can be found in the source code repository:
|
||||||
|
|
||||||
|
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||||
|
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||||
|
*/
|
||||||
|
package gplx.xowa.xtns.scribunto.libs.patterns; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*; import gplx.xowa.xtns.scribunto.libs.*;
|
||||||
|
import gplx.core.intls.*;
|
||||||
|
import gplx.langs.regxs.*;
|
||||||
|
import org.luaj.vm2.lib.StringLib;
|
||||||
|
import org.luaj.vm2.lib.Str_find_mgr;
|
||||||
|
import org.luaj.vm2.lib.Str_find_mgr__regx;
|
||||||
|
class Scrib_pattern_matcher__luaj implements Scrib_pattern_matcher {
|
||||||
|
public Regx_match[] Match(Xoa_url url, Unicode_string text_ucs, Scrib_regx_converter regx_converter, String find_str, int bgn_as_codes) {
|
||||||
|
// int src_bgn = bgn_as_codes < 0 ? bgn_as_codes : text_ucs.Pos_codes_to_bytes(bgn_as_codes);
|
||||||
|
int src_bgn = bgn_as_codes < 0 ? Int_.Base1 : bgn_as_codes + Int_.Base1;
|
||||||
|
src_bgn = src_bgn >= text_ucs.Len_codes() ? text_ucs.Len_codes() : text_ucs.Pos_codes_to_bytes(src_bgn);
|
||||||
|
Str_find_mgr__regx mgr = new Str_find_mgr__regx(text_ucs.Src_string(), find_str, src_bgn, false, true);
|
||||||
|
mgr.Process();
|
||||||
|
|
||||||
|
// convert to Regx_match
|
||||||
|
int find_bgn = mgr.Bgn() == -1 ? -1 : text_ucs.Pos_bytes_to_chars(mgr.Bgn());
|
||||||
|
int find_end = mgr.End() == -1 ? -1 : text_ucs.Pos_bytes_to_chars(mgr.End());
|
||||||
|
boolean found = find_bgn != -1;
|
||||||
|
if (!found) {
|
||||||
|
return Regx_match.Ary_empty;
|
||||||
|
}
|
||||||
|
int[] captures = mgr.Capture_ints();
|
||||||
|
Regx_group[] groups = null;
|
||||||
|
if (found && captures != null) {
|
||||||
|
int captures_len = captures.length;
|
||||||
|
groups = new Regx_group[captures_len / 2];
|
||||||
|
for (int i = 0; i < captures_len; i += 2) {
|
||||||
|
groups[i / 2] = new Regx_group(true, captures[i], captures[i + 1], String_.Mid(text_ucs.Src_string(), text_ucs.Pos_bytes_to_chars(captures[i]), text_ucs.Pos_bytes_to_chars(captures[i + 1])));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Regx_match rv = new Regx_match(found, find_bgn, find_end, groups);
|
||||||
|
return new Regx_match[] {rv};
|
||||||
|
}
|
||||||
|
}
|
Binary file not shown.
Loading…
Reference in New Issue
Block a user