1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2024-09-28 14:30:51 +00:00

Scribunto: Add initial support for LuaJ StringLib as replacement for Regex [#413]

This commit is contained in:
gnosygnu 2019-04-01 22:34:45 -04:00
parent 2fc03f6211
commit 31c7604f03
11 changed files with 158 additions and 24 deletions

View File

@ -24,6 +24,7 @@ public interface Unicode_string {
int Val_codes(int i);
int Pos_codes_to_bytes(int i);
int Pos_codes_to_chars(int i);
int Pos_bytes_to_chars(int i);
int Pos_bytes_to_codes(int i);
int Pos_chars_to_codes(int i);
}
@ -43,7 +44,8 @@ class Unicode_string_single implements Unicode_string { // 1 byte == 1 codepoint
public int Len_bytes() {return codes_len;}
public int Val_codes(int i) {return codes[i];}
public int Pos_codes_to_bytes(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i); return i;}
public int Pos_codes_to_chars(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i);return i;}
public int Pos_bytes_to_codes(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i);return i;}
public int Pos_chars_to_codes(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i);return i;}
public int Pos_codes_to_chars(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i); return i;}
public int Pos_bytes_to_chars(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i); return i;}
public int Pos_bytes_to_codes(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i); return i;}
public int Pos_chars_to_codes(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i); return i;}
}

View File

@ -18,6 +18,7 @@ class Unicode_string_multi implements Unicode_string {
private final int[] codes;
private final int[] codes_to_bytes;
private final int[] codes_to_chars;
private final int[] bytes_to_chars;
private final int[] bytes_to_codes;
private final int[] chars_to_codes;
@ -34,6 +35,7 @@ class Unicode_string_multi implements Unicode_string {
this.codes_to_bytes = new int[codes_len + Adj_end];
this.codes_to_chars = new int[codes_len + Adj_end];
this.bytes_to_codes = New_int_ary(bytes_len);
this.bytes_to_chars = New_int_ary(bytes_len);
this.chars_to_codes = New_int_ary(chars_len);
// init loop
@ -46,6 +48,7 @@ class Unicode_string_multi implements Unicode_string {
// update
codes_to_bytes[codes_pos] = bytes_pos;
codes_to_chars[codes_pos] = chars_pos;
bytes_to_chars[bytes_pos] = chars_pos;
bytes_to_codes[bytes_pos] = codes_pos;
chars_to_codes[chars_pos] = codes_pos;
@ -67,6 +70,7 @@ class Unicode_string_multi implements Unicode_string {
public int Val_codes(int i) {return codes[i];}
public int Pos_codes_to_bytes(int i) {return codes_to_bytes[i];}
public int Pos_codes_to_chars(int i) {return codes_to_chars[i];}
public int Pos_bytes_to_chars(int i) {int rv = bytes_to_chars[i]; if (rv == Invalid) throw Err_.new_wo_type("invalid i", "src", src, "type", "bytes_to_chars", "i", i); return rv;}
public int Pos_bytes_to_codes(int i) {int rv = bytes_to_codes[i]; if (rv == Invalid) throw Err_.new_wo_type("invalid i", "src", src, "type", "bytes_to_codes", "i", i); return rv;}
public int Pos_chars_to_codes(int i) {int rv = chars_to_codes[i]; if (rv == Invalid) throw Err_.new_wo_type("invalid i", "src", src, "type", "chars_to_codes", "i", i); return rv;}

View File

@ -17,6 +17,7 @@ package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import
import gplx.core.intls.*; import gplx.langs.regxs.*;
import gplx.xowa.parsers.*;
import gplx.xowa.xtns.scribunto.procs.*;
import gplx.xowa.xtns.scribunto.libs.patterns.*;
public class Scrib_lib_ustring implements Scrib_lib {
public Scrib_lib_ustring(Scrib_core core) {this.core = core;} private Scrib_core core;
public Scrib_lua_mod Mod() {return mod;} private Scrib_lua_mod mod;
@ -98,7 +99,7 @@ public class Scrib_lib_ustring implements Scrib_lib {
// run regex
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
Regx_match[] regx_rslts = Run_regex_or_null(text_ucs, regx_converter, find_str, bgn_as_codes);
Regx_match[] regx_rslts = Scrib_pattern_matcher_.Instance().Match(core.Ctx().Page().Url(), text_ucs, regx_converter, find_str, bgn_as_codes);
if (regx_rslts.length == 0) return rslt.Init_ary_empty();
// add to tmp_list
@ -123,7 +124,7 @@ public class Scrib_lib_ustring implements Scrib_lib {
// run regex
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
Regx_match[] regx_rslts = Run_regex_or_null(text_ucs, regx_converter, find_str, bgn_as_codes);
Regx_match[] regx_rslts = Scrib_pattern_matcher_.Instance().Match(core.Ctx().Page().Url(), text_ucs, regx_converter, find_str, bgn_as_codes);
if (regx_rslts.length == 0) return rslt.Init_null(); // return null if no matches found; EX:w:Mount_Gambier_(volcano); DATE:2014-04-02; confirmed with en.d:; DATE:2015-01-30
// TOMBSTONE: add 1st match only; do not add all; PAGE:en.d:действительное_причастиеастоящегоремени DATE:2017-04-23
@ -140,7 +141,7 @@ public class Scrib_lib_ustring implements Scrib_lib {
// String text = Scrib_kv_utl_.Val_to_str(values, 0);
String regx = args.Pull_str(1);
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
String pcre = regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_null);
String pcre = regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_null, true);
return rslt.Init_many_objs(pcre, regx_converter.Capt_ary());
}
public boolean Gmatch_callback(Scrib_proc_args args, Scrib_proc_rslt rslt) {
@ -148,7 +149,7 @@ public class Scrib_lib_ustring implements Scrib_lib {
String regx = args.Pull_str(1);
Keyval[] capt = args.Cast_kv_ary_or_null(2);
int pos = args.Pull_int(3);
Regx_adp regx_adp = Scrib_lib_ustring.RegxAdp_new_(core.Ctx(), regx);
Regx_adp regx_adp = Scrib_lib_ustring.RegxAdp_new_(core.Ctx().Page().Url(), regx);
Regx_match[] regx_rslts = regx_adp.Match_all(text, pos);
int len = regx_rslts.length;
if (len == 0) return rslt.Init_many_objs(pos, Keyval_.Ary_empty);
@ -178,14 +179,6 @@ public class Scrib_lib_ustring implements Scrib_lib {
bgn_as_codes = 0;
return bgn_as_codes;
}
private Regx_match[] Run_regex_or_null(Unicode_string text_ucs, Scrib_regx_converter regx_converter, String find_str, int bgn_as_codes) {
// convert regex from lua to java
find_str = regx_converter.patternToRegex(find_str, Scrib_regx_converter.Anchor_G);
// run regex
Regx_adp regx_adp = Scrib_lib_ustring.RegxAdp_new_(core.Ctx(), find_str);
return regx_adp.Match_all(text_ucs.Src_string(), text_ucs.Pos_codes_to_chars(bgn_as_codes)); // NOTE: MW calculates an offset to handle mb strings. however, java's regex always takes offset in chars (not bytes like PHP preg_match); DATE:2014-03-04
}
private void AddCapturesFromMatch(List_adp tmp_list, Regx_match rslt, String text, Keyval[] capts, boolean op_is_match) {// NOTE: this matches behavior in UstringLibrary.php!addCapturesFromMatch
int capts_len = capts == null ? 0 : capts.length;
if (capts_len > 0) { // NOTE: changed from "grps_len > 0"; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02
@ -205,12 +198,12 @@ public class Scrib_lib_ustring implements Scrib_lib {
&& tmp_list.Count() == 0) // only add match once; EX: "aaaa", "a" will have four matches; get 1st; DATE:2014-04-02
tmp_list.Add(String_.Mid(text, rslt.Find_bgn(), rslt.Find_end()));
}
public static Regx_adp RegxAdp_new_(Xop_ctx ctx, String regx) {
public static Regx_adp RegxAdp_new_(Xoa_url url, String regx) {
Regx_adp rv = Regx_adp_.new_(regx);
if (rv.Pattern_is_invalid()) {
// try to identify [z-a] errors; PAGE:https://en.wiktionary.org/wiki/Module:scripts/data; DATE:2017-04-23
Exception exc = rv.Pattern_is_invalid_exception();
ctx.App().Usr_dlg().Log_many("", "", "regx is invalid: regx=~{0} page=~{1} exc=~{2}", regx, ctx.Page().Ttl().Page_db(), Err_.Message_gplx_log(exc));
Gfo_usr_dlg_.Instance.Log_many("", "", "regx is invalid: regx=~{0} page=~{1} exc=~{2}", regx, url.To_bry(), Err_.Message_gplx_log(exc));
}
return rv;
}

View File

@ -27,7 +27,7 @@ public class Scrib_lib_ustring__match__tst {
Exec_match("abcd" , "a" , 2, String_.Null_mark); // bgn
Exec_match("abcd" , "b(c)" , 1, "c"); // group
Exec_match(" a b " , "^%s*(.-)%s*$" , 1, "a b"); // trim; NOTE: changed back from "a b;" to "a b"; DATE:2017-04-23; changed from "a b" to "a b;"; DATE:2015-01-30
Exec_match("abcd" , "a" , 0, "a"); // handle 0; note that php/lua is super-1, but some modules pass in 0; ru.w:Module:Infocards; DATE:2013-11-08
Exec_match("abcd" , "a" , 0, "a"); // handle 0; note that php/lua is BASE_1, but some modules pass in 0; ru.w:Module:Infocards; DATE:2013-11-08
Exec_match("abcd" , "." , -1, "d"); // -1
Exec_match("aaa" , "a" , 1, "a"); // should return 1st match not many
Exec_match("aaa" , "(a)" , 1, "a"); // should return 1st match only; PAGE:en.d:действительное_причастиеастоящегоремени DATE:2017-04-23

View File

@ -32,7 +32,7 @@ class Scrib_lib_ustring_gsub_mgr {
// get @pattern; NOTE: sometimes int; PAGE:en.d:λύω; DATE:2014-09-02
String regx = args.Xstr_str_or_null(1);
regx = regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_pow);
regx = regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_pow, true);
// get @repl
Object repl_obj = args.Cast_obj_or_null(2);
@ -82,7 +82,7 @@ class Scrib_lib_ustring_gsub_mgr {
}
private String Exec_repl(byte repl_tid, String text, String regx, int limit) {
// parse regx
Regx_adp regx_mgr = Scrib_lib_ustring.RegxAdp_new_(core.Ctx(), regx);
Regx_adp regx_mgr = Scrib_lib_ustring.RegxAdp_new_(core.Ctx().Page().Url(), regx);
if (regx_mgr.Pattern_is_invalid()) return text; // NOTE: invalid patterns should return self; EX:[^]; DATE:2014-09-02)
// exec regx

View File

@ -30,7 +30,7 @@ public class Scrib_regx_converter {
public Keyval[] Capt_ary() {return grp_mgr.Capt__to_ary();}
public boolean Any_pos() {return any_pos;} private boolean any_pos;
public Regx_match[] Adjust_balanced(Regx_match[] rslts) {return grp_mgr.Adjust_balanced(rslts);}
public String patternToRegex(String pat_str, byte[] anchor) {
public String patternToRegex(String pat_str, byte[] anchor, boolean mode_is_regx) {
Unicode_string pat_ucs = Unicode_string_.New(pat_str);
// TODO.CACHE: if (!$this->patternRegexCache->has($cacheKey))
grp_mgr.Clear();
@ -49,10 +49,18 @@ public class Scrib_regx_converter {
int cur = pat_ucs.Val_codes(i);
switch (cur) {
case Byte_ascii.Pow:
if (!mode_is_regx) {
bfr.Add_byte(Byte_ascii.Pow);
continue;
}
q_flag = i != 0;
bfr.Add((anchor == Anchor_null || q_flag) ? Bry_pow_escaped : anchor); // NOTE: must add anchor \G when using offsets; EX:cs.n:Category:1._zárí_2008; DATE:2014-05-07
break;
case Byte_ascii.Dollar:
if (!mode_is_regx) {
bfr.Add_byte(Byte_ascii.Dollar);
continue;
}
q_flag = i < len - 1;
bfr.Add(q_flag ? Bry_dollar_escaped : Bry_dollar_literal);
break;
@ -78,6 +86,10 @@ public class Scrib_regx_converter {
bfr.Add_byte(Byte_ascii.Paren_end);
break;
case Byte_ascii.Percent:
if (!mode_is_regx) {
bfr.Add_byte(Byte_ascii.Percent);
continue;
}
i++;
if (i >= len)
throw Err_.new_wo_type("malformed pattern (ends with '%')");
@ -114,7 +126,8 @@ public class Scrib_regx_converter {
++bct;
int balanced_idx = grp_mgr.Full__len();
fmtr_balanced.Bld_bfr(bfr_balanced, Int_.To_bry(bct), Utf16_.Encode_int_to_bry(char_0), Utf16_.Encode_int_to_bry(char_1), Int_.To_bry(balanced_idx + 1), Int_.To_bry(balanced_idx + 2));
grp_mgr.Capt__add__fake(2);
if (mode_is_regx)
grp_mgr.Capt__add__fake(2);
bfr.Add(bfr_balanced.To_bry_and_clear());
}
}
@ -152,16 +165,32 @@ public class Scrib_regx_converter {
}
break;
case Byte_ascii.Brack_bgn:
if (!mode_is_regx) {
bfr.Add_byte(Byte_ascii.Brack_bgn);
continue;
}
i = bracketedCharSetToRegex(bfr, pat_ucs, i, len);
q_flag = true;
break;
case Byte_ascii.Brack_end:
if (!mode_is_regx) {
bfr.Add_byte(Byte_ascii.Brack_end);
continue;
}
throw Err_.new_wo_type("Unmatched close-bracket at pattern character " + Int_.To_str(i_end));
case Byte_ascii.Dot:
if (!mode_is_regx) {
bfr.Add_byte(Byte_ascii.Dot);
continue;
}
bfr.Add_byte(Byte_ascii.Dot);
q_flag = true;
break;
default:
if (!mode_is_regx) {
bfr.Add_u8_int(cur);
continue;
}
Regx_quote(bfr, cur);
q_flag = true;
break;

View File

@ -64,11 +64,11 @@ class Scrib_regx_converter_fxt {
}
}
public void Test_parse(String raw, String expd) {
under.patternToRegex(raw, Scrib_regx_converter.Anchor_G);
under.patternToRegex(raw, Scrib_regx_converter.Anchor_G, true);
Tfds.Eq(expd, under.Regx());
}
public void Test_replace(String text, String find, String replace, String expd) {
String regex_str = under.patternToRegex(find, Scrib_regx_converter.Anchor_G);
String regex_str = under.patternToRegex(find, Scrib_regx_converter.Anchor_G, true);
String actl = Regx_adp_.Replace(text, regex_str, replace);
Tfds.Eq(expd, actl);
}

View File

@ -0,0 +1,21 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.xtns.scribunto.libs.patterns; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*; import gplx.xowa.xtns.scribunto.libs.*;
import gplx.core.intls.*;
import gplx.langs.regxs.*;
// Abstraction over the engine which runs a Lua pattern (find_str) against a source String (text_ucs) and reports the results as Regx_match items
// Scrib_pattern_matcher_.Instance() supplies the implementation used by Scrib_lib_ustring
public interface Scrib_pattern_matcher {
// url             : page url; the regx implementation hands it to Scrib_lib_ustring.RegxAdp_new_, which logs it when the pattern is invalid
// text_ucs        : source text; implementations read Src_string() and use the Pos_* methods to translate offsets
// regx_converter  : lua-to-java pattern converter; used by the regx implementation, not referenced by the luaj implementation
// find_str        : pattern to search for, as passed in by the caller (lua syntax)
// bgn_as_codes    : offset at which the search starts; implementations translate it with Pos_codes_to_chars / Pos_codes_to_bytes
// returns         : array of matches; callers test "length == 0" for "nothing found"
Regx_match[] Match(Xoa_url url, Unicode_string text_ucs, Scrib_regx_converter regx_converter, String find_str, int bgn_as_codes);
}

View File

@ -0,0 +1,36 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.xtns.scribunto.libs.patterns; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*; import gplx.xowa.xtns.scribunto.libs.*;
import gplx.core.intls.*;
import gplx.langs.regxs.*;
// Static holder for the Scrib_pattern_matcher implementation; the instance is created once, when this class is initialized
public class Scrib_pattern_matcher_ {
	private static final Scrib_pattern_matcher instance = Create_matcher();
	// returns the shared matcher
	public static Scrib_pattern_matcher Instance() {
		return instance;
	}
	// picks the implementation: the regx-based matcher is active; the luaj-based matcher is kept below as a manual toggle
	private static Scrib_pattern_matcher Create_matcher() {
		Scrib_pattern_matcher rv = new Scrib_pattern_matcher__regx();
		// Scrib_pattern_matcher rv = new Scrib_pattern_matcher__luaj();
		return rv;
	}
}
// Matcher which translates the lua pattern to a java regex and runs it through Regx_adp
class Scrib_pattern_matcher__regx implements Scrib_pattern_matcher {
	public Regx_match[] Match(Xoa_url url, Unicode_string text_ucs, Scrib_regx_converter regx_converter, String find_str, int bgn_as_codes) {
		// lua pattern -> java regex, anchored with Anchor_G
		String java_regx = regx_converter.patternToRegex(find_str, Scrib_regx_converter.Anchor_G, true);

		// build adapter; url is passed through to RegxAdp_new_
		Regx_adp adp = Scrib_lib_ustring.RegxAdp_new_(url, java_regx);

		// NOTE: MW calculates an offset to handle mb strings. however, java's regex always takes offset in chars (not bytes like PHP preg_match); DATE:2014-03-04
		String src = text_ucs.Src_string();
		int bgn_as_chars = text_ucs.Pos_codes_to_chars(bgn_as_codes);
		return adp.Match_all(src, bgn_as_chars);
	}
}

View File

@ -0,0 +1,49 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.xtns.scribunto.libs.patterns; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*; import gplx.xowa.xtns.scribunto.libs.*;
import gplx.core.intls.*;
import gplx.langs.regxs.*;
import org.luaj.vm2.lib.StringLib;
import org.luaj.vm2.lib.Str_find_mgr;
import org.luaj.vm2.lib.Str_find_mgr__regx;
// Matcher which delegates to LuaJ's Str_find_mgr__regx and converts its result to a Regx_match
// Offsets coming back from the mgr are passed through Pos_bytes_to_chars, so every position placed in the returned Regx_match / Regx_group is a char offset
// url and regx_converter are part of the Scrib_pattern_matcher signature but are not used here
class Scrib_pattern_matcher__luaj implements Scrib_pattern_matcher {
	public Regx_match[] Match(Xoa_url url, Unicode_string text_ucs, Scrib_regx_converter regx_converter, String find_str, int bgn_as_codes) {
		// int src_bgn = bgn_as_codes < 0 ? bgn_as_codes : text_ucs.Pos_codes_to_bytes(bgn_as_codes);
		// negative offsets start at Base1; all others are shifted by Base1
		int src_bgn = bgn_as_codes < 0 ? Int_.Base1 : bgn_as_codes + Int_.Base1;
		// NOTE(review): the clamp compares against / falls back to Len_codes, while the other branch yields a byte offset; units look mixed for multi-byte text -- confirm against what Str_find_mgr__regx expects
		src_bgn = src_bgn >= text_ucs.Len_codes() ? text_ucs.Len_codes() : text_ucs.Pos_codes_to_bytes(src_bgn);

		// run the find
		String src = text_ucs.Src_string();
		Str_find_mgr__regx mgr = new Str_find_mgr__regx(src, find_str, src_bgn, false, true);
		mgr.Process();

		// convert to Regx_match; -1 is passed through unconverted
		int find_bgn = mgr.Bgn() == -1 ? -1 : text_ucs.Pos_bytes_to_chars(mgr.Bgn());
		int find_end = mgr.End() == -1 ? -1 : text_ucs.Pos_bytes_to_chars(mgr.End());
		if (find_bgn == -1) {
			return Regx_match.Ary_empty;
		}

		// captures are laid out as pairs: [bgn_0, end_0, bgn_1, end_1, ...]
		int[] captures = mgr.Capture_ints();
		Regx_group[] groups = null;
		if (captures != null) {
			int captures_len = captures.length;
			groups = new Regx_group[captures_len / 2];
			for (int i = 0; i < captures_len; i += 2) {
				// convert once and use the char offsets for both the group's positions and its text, so that group positions are in the same unit as find_bgn / find_end
				int grp_bgn = text_ucs.Pos_bytes_to_chars(captures[i]);
				int grp_end = text_ucs.Pos_bytes_to_chars(captures[i + 1]);
				groups[i / 2] = new Regx_group(true, grp_bgn, grp_end, String_.Mid(src, grp_bgn, grp_end));
			}
		}
		Regx_match rv = new Regx_match(true, find_bgn, find_end, groups);
		return new Regx_match[] {rv};
	}
}

Binary file not shown.