1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2026-03-02 03:49:30 +00:00

Scribunto: Use Luaj for pattern-matching (instead of Java Regex) [#413]

This commit is contained in:
gnosygnu
2019-04-28 17:31:33 -04:00
parent 4a1b2e25c0
commit f860edf064
51 changed files with 2045 additions and 729 deletions

View File

@@ -1,51 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.core.intls; import gplx.*; import gplx.core.*;
// Read-only view of a String that can be addressed in three position systems:
//   "codes" = Unicode code points, "chars" = Java (UTF-16) chars, "bytes" = encoded bytes (built via Bry_.new_u8)
// and that converts a position in one system to the matching position in another.
public interface Unicode_string {
// true for the implementation where 1 byte == 1 codepoint (Unicode_string_single); false for Unicode_string_multi
boolean Tid_is_single();
// original String passed to Unicode_string_.New; null when the view was built from a null String
String Src_string();
// byte form of the source String; null when the view was built from a null String
byte[] Src_bytes();
// number of code points
int Len_codes();
// number of chars
int Len_chars();
// number of bytes
int Len_bytes();
// code point value at code-point index i
int Val_codes(int i);
// Pos_x_to_y: convert position i (measured in x) to the equivalent position measured in y.
// Both implementations in this file also accept the end position (i == length in x).
// Unicode_string_multi throws for a bytes / chars position that does not start a code point.
int Pos_codes_to_bytes(int i);
int Pos_codes_to_chars(int i);
int Pos_bytes_to_chars(int i);
int Pos_bytes_to_codes(int i);
int Pos_chars_to_codes(int i);
}
// Unicode_string for text in which every code point occupies exactly one byte,
// so code / char / byte positions are all the same number and no lookup tables are needed.
class Unicode_string_single implements Unicode_string { // 1 byte == 1 codepoint
  private final String src_string;
  private final byte[] src_bytes;
  private final int[] codes;
  private final int codes_len;

  public Unicode_string_single(String src_string, byte[] src_bytes, int[] codes, int codes_len) {
    this.src_string = src_string;
    this.src_bytes = src_bytes;
    this.codes = codes;
    this.codes_len = codes_len;
  }

  public boolean Tid_is_single() {return true;}
  public String Src_string() {return src_string;}
  public byte[] Src_bytes() {return src_bytes;}

  // all three lengths are identical b/c each code point is one byte and one char
  public int Len_codes() {return codes_len;}
  public int Len_chars() {return codes_len;}
  public int Len_bytes() {return codes_len;}

  public int Val_codes(int i) {return codes[i];}

  // every conversion is the identity; only the range check is needed
  public int Pos_codes_to_bytes(int i) {return Same_pos_or_fail(i);}
  public int Pos_codes_to_chars(int i) {return Same_pos_or_fail(i);}
  public int Pos_bytes_to_chars(int i) {return Same_pos_or_fail(i);}
  public int Pos_bytes_to_codes(int i) {return Same_pos_or_fail(i);}
  public int Pos_chars_to_codes(int i) {return Same_pos_or_fail(i);}

  // returns i unchanged when 0 <= i <= codes_len (the end position is allowed); else throws
  private int Same_pos_or_fail(int i) {
    boolean in_range = 0 <= i && i <= codes_len;
    if (!in_range)
      throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i);
    return i;
  }
}

View File

@@ -1,48 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.core.intls; import gplx.*; import gplx.core.*;
// Factory for Unicode_string instances.
public class Unicode_string_ {
  // Builds a Unicode_string for orig.
  // - null input         -> empty Unicode_string_single with null source / bytes / codes
  // - 1 byte per code    -> Unicode_string_single
  // - anything else      -> Unicode_string_multi (with position-mapping tables)
  public static Unicode_string New(String orig) {
    if (orig == null)
      return new Unicode_string_single(null, null, null, 0);

    byte[] utf8 = Bry_.new_u8(orig);
    int utf8_len = utf8.length;

    // sized by byte count, which is an upper bound for the number of code points
    int[] code_buf = new int[utf8_len];
    int code_count = 0;
    int char_count = 0;

    // walk the bytes one encoded character at a time
    for (int byte_idx = 0; byte_idx < utf8_len; code_count++) {
      code_buf[code_count] = Utf16_.Decode_to_int(utf8, byte_idx);

      int width = Utf8_.Len_of_char_by_1st_byte(utf8[byte_idx]);
      byte_idx += width;
      char_count += Utf8_.Len_of_char_by_bytes_len(width);
    }

    // as many code points as bytes means every character was a single byte
    if (code_count == utf8_len)
      return new Unicode_string_single(orig, utf8, code_buf, code_count);
    return new Unicode_string_multi(orig, utf8, utf8_len, code_buf, code_count, char_count);
  }
}

View File

@@ -1,85 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.core.intls; import gplx.*; import gplx.core.*;
// Unicode_string for text containing at least one multi-byte character.
// Precomputes lookup tables so that positions can be converted between codes, chars and bytes.
class Unicode_string_multi implements Unicode_string {
  private static final int Invalid = -1; // marks a bytes / chars position that does not start a code point
  private static final int Adj_end = 1;  // +1 to store last pos as len of String; needed for regex which returns match.Find_end() which will be len of String; EX: abc -> [0, 1, 2, 3]

  private final String src;
  private final byte[] bytes;
  private final int bytes_len;
  private final int[] codes;
  private final int codes_len;
  private final int chars_len;
  private final int[] codes_to_bytes;
  private final int[] codes_to_chars;
  private final int[] bytes_to_chars;
  private final int[] bytes_to_codes;
  private final int[] chars_to_codes;

  public Unicode_string_multi(String src, byte[] bytes, int bytes_len, int[] codes, int codes_len, int chars_len) {
    this.src = src;
    this.bytes = bytes;
    this.bytes_len = bytes_len;
    this.codes = codes;
    this.codes_len = codes_len;
    this.chars_len = chars_len;

    // tables indexed by code never have gaps; tables indexed by byte / char start as all-Invalid
    this.codes_to_bytes = new int[codes_len + Adj_end];
    this.codes_to_chars = new int[codes_len + Adj_end];
    this.bytes_to_codes = New_int_ary(bytes_len);
    this.bytes_to_chars = New_int_ary(bytes_len);
    this.chars_to_codes = New_int_ary(chars_len);

    // record the current (code, byte, char) triple, then step over one encoded character;
    // the end-of-string triple is recorded too before the loop exits
    int byte_idx = 0;
    int char_idx = 0;
    for (int code_idx = 0; ; code_idx++) {
      codes_to_bytes[code_idx] = byte_idx;
      codes_to_chars[code_idx] = char_idx;
      bytes_to_chars[byte_idx] = char_idx;
      bytes_to_codes[byte_idx] = code_idx;
      chars_to_codes[char_idx] = code_idx;
      if (byte_idx == bytes_len) break;

      int width = Utf8_.Len_of_char_by_1st_byte(bytes[byte_idx]);
      byte_idx += width;
      char_idx += Utf8_.Len_of_char_by_bytes_len(width);
    }
  }

  public boolean Tid_is_single() {return false;}
  public String Src_string() {return src;}
  public byte[] Src_bytes() {return bytes;}
  public int Len_codes() {return codes_len;}
  public int Len_chars() {return chars_len;}
  public int Len_bytes() {return bytes_len;}
  public int Val_codes(int i) {return codes[i];}

  public int Pos_codes_to_bytes(int i) {return codes_to_bytes[i];}
  public int Pos_codes_to_chars(int i) {return codes_to_chars[i];}
  public int Pos_bytes_to_chars(int i) {return Mapped_or_fail(bytes_to_chars[i], "bytes_to_chars", i);}
  public int Pos_bytes_to_codes(int i) {return Mapped_or_fail(bytes_to_codes[i], "bytes_to_codes", i);}
  public int Pos_chars_to_codes(int i) {return Mapped_or_fail(chars_to_codes[i], "chars_to_codes", i);}

  // returns the looked-up value, or throws when the table slot still holds the Invalid sentinel
  private int Mapped_or_fail(int rv, String type, int i) {
    if (rv == Invalid)
      throw Err_.new_wo_type("invalid i", "src", src, "type", type, "i", i);
    return rv;
  }

  // creates an array of (len + Adj_end) slots, each preset to Invalid
  private static int[] New_int_ary(int len) {
    int[] ary = new int[len + Adj_end];
    for (int idx = ary.length - 1; idx >= 0; idx--)
      ary[idx] = Invalid;
    return ary;
  }
}

View File

@@ -1,110 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.core.intls; import gplx.*; import gplx.core.*;
import org.junit.*; import gplx.core.tests.*;
// Tests for Unicode_string_.New and the two Unicode_string implementations.
// Test__Len args are (codes, chars, bytes); Test__Pos_* args list the expected result for every position from 0 through the length.
public class Unicode_string_tst {
private final Unicode_string_fxt fxt = new Unicode_string_fxt();
// null source should behave as an empty string
@Test public void Null() {
fxt.Init(null);
fxt.Test__Len(0, 0, 0);
}
@Test public void Blank() {
fxt.Init("");
fxt.Test__Len(0, 0, 0);
}
// all characters are 1 byte: every position system is identical; note the extra trailing entry for the end position
@Test public void Single() {
fxt.Init("Abc");
fxt.Test__Len(3, 3, 3);
fxt.Test__Val_codes(65, 98, 99);
fxt.Test__Pos_codes_to_bytes(0, 1, 2, 3);
fxt.Test__Pos_codes_to_chars(0, 1, 2, 3);
fxt.Test__Pos_chars_to_codes(0, 1, 2, 3);
fxt.Test__Pos_bytes_to_codes(0, 1, 2, 3);
}
// one character each of 1, 2, 3 and 4 bytes; the 4-byte character is a supplementary code point that takes 2 chars
// -1 entries are positions inside a character (the fixture reports a thrown exception as -1)
@Test public void Multi() {
fxt.Init("a¢€𤭢");
fxt.Test__Len(4, 5, 10);
fxt.Test__Val_codes(97, 162, 8364, 150370);
fxt.Test__Pos_codes_to_bytes(0, 1, 3, 6, 10);
fxt.Test__Pos_codes_to_chars(0, 1, 2, 3, 5);
fxt.Test__Pos_chars_to_codes( 0, 1, 2, 3, -1, 4);
fxt.Test__Pos_bytes_to_codes( 0, 1, -1, 2, -1, -1, 3, -1, -1, -1, 4);
}
}
// Test fixture: wraps one Unicode_string and compares its lengths / values / position maps against expected arrays.
class Unicode_string_fxt {
  private Unicode_string under;

  // builds the Unicode_string under test
  public void Init(String src) {
    under = Unicode_string_.New(src);
  }

  public void Test__Len(int expd_codes, int expd_chars, int expd_bytes) {
    Gftest.Eq__int(expd_codes, under.Len_codes(), "codes");
    Gftest.Eq__int(expd_chars, under.Len_chars(), "chars");
    Gftest.Eq__int(expd_bytes, under.Len_bytes(), "bytes");
  }

  // compares every code point value
  public void Test__Val_codes(int... expd) {
    int[] actl = new int[under.Len_codes()];
    for (int i = 0; i < actl.length; i++)
      actl[i] = under.Val_codes(i);
    Gftest.Eq__ary(expd, actl);
  }

  // the Pos tests below probe positions 0 through len inclusive (hence the + 1)
  public void Test__Pos_codes_to_bytes(int... expd) {
    int[] actl = new int[under.Len_codes() + 1];
    for (int i = 0; i < actl.length; i++)
      actl[i] = under.Pos_codes_to_bytes(i);
    Gftest.Eq__ary(expd, actl);
  }

  public void Test__Pos_codes_to_chars(int... expd) {
    int[] actl = new int[under.Len_codes() + 1];
    for (int i = 0; i < actl.length; i++)
      actl[i] = under.Pos_codes_to_chars(i);
    Gftest.Eq__ary(expd, actl);
  }

  // a position that makes the lookup throw is reported as -1
  public void Test__Pos_bytes_to_codes(int... expd) {
    int[] actl = new int[under.Len_bytes() + 1];
    for (int i = 0; i < actl.length; i++) {
      try {
        actl[i] = under.Pos_bytes_to_codes(i);
      }
      catch (Exception exc) {
        actl[i] = -1;
        Err_.Noop(exc);
      }
    }
    Gftest.Eq__ary(expd, actl);
  }

  // a position that makes the lookup throw is reported as -1
  public void Test__Pos_chars_to_codes(int... expd) {
    int[] actl = new int[under.Len_chars() + 1];
    for (int i = 0; i < actl.length; i++) {
      try {
        actl[i] = under.Pos_chars_to_codes(i);
      }
      catch (Exception exc) {
        actl[i] = -1;
        Err_.Noop(exc);
      }
    }
    Gftest.Eq__ary(expd, actl);
  }
}

View File

@@ -14,6 +14,7 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*;
import gplx.objects.strings.unicodes.*;
import gplx.core.intls.*; import gplx.langs.regxs.*;
import gplx.xowa.parsers.*;
import gplx.xowa.xtns.scribunto.procs.*;
@@ -55,10 +56,10 @@ public class Scrib_lib_ustring implements Scrib_lib {
boolean plain = args.Cast_bool_or_n(3);
// init text vars
Unicode_string text_ucs = Unicode_string_.New(text_str); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23
Ustring text_ucs = Ustring_.New_codepoints(text_str); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23
// convert bgn from base_1 to base_0
int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_ucs.Len_codes());
int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_ucs.Len_in_data());
/*
int offset = 0;
@@ -80,34 +81,35 @@ public class Scrib_lib_ustring implements Scrib_lib {
// if plain, just do literal match of find and exit
if (plain) {
// find pos by literal match
Unicode_string find_ucs = Unicode_string_.New(find_str);
byte[] find_bry = find_ucs.Src_bytes();
int pos = Bry_find_.Find_fwd(text_ucs.Src_bytes(), find_bry, text_ucs.Pos_codes_to_bytes(bgn_as_codes));
Ustring find_ucs = Ustring_.New_codepoints(find_str);
int pos = String_.FindFwd(text_str, find_str, bgn_as_codes);
// nothing found; return empty
if (pos == Bry_find_.Not_found)
// if nothing found, return empty
if (pos == String_.Find_none)
return rslt.Init_ary_empty();
// else, convert char_idx to code_idx
else
pos = text_ucs.Map_char_to_data(pos);
// bgn: convert pos from bytes back to codes; also adjust for base1
int bgn = text_ucs.Pos_bytes_to_codes(pos) + Base1;
int bgn = pos + Base1;
// end: add find.Len_in_codes and adjust end for PHP/LUA
int end = bgn + find_ucs.Len_codes() - End_adj;
int end = bgn + find_ucs.Len_in_data() - End_adj;
return rslt.Init_many_objs(bgn, end);
}
// run regex
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
Regx_match[] regx_rslts = Scrib_pattern_matcher_.Instance().Match(core.Ctx().Page().Url(), text_ucs, regx_converter, find_str, bgn_as_codes);
if (regx_rslts.length == 0) return rslt.Init_ary_empty();
// run regex; NOTE: take only 1st result; DATE:2014-08-27
Scrib_pattern_matcher matcher = Scrib_pattern_matcher.New(core.Page_url());
Regx_match match = matcher.Match_one(text_ucs, find_str, bgn_as_codes, true);
if (match.Rslt_none()) return rslt.Init_null(); // null verified on MW; EX: =mw.ustring.find("abc", "z"); DATE:2019-04-11
// add to tmp_list
Regx_match match = regx_rslts[0]; // NOTE: take only 1st result; DATE:2014-08-27
List_adp tmp_list = List_adp_.New();
tmp_list.Add(text_ucs.Pos_chars_to_codes(match.Find_bgn()) + Scrib_lib_ustring.Base1);
tmp_list.Add(text_ucs.Pos_chars_to_codes(match.Find_end()) + Scrib_lib_ustring.Base1 - Scrib_lib_ustring.End_adj);
AddCapturesFromMatch(tmp_list, match, text_str, regx_converter.Capt_ary(), false);
tmp_list.Add(text_ucs.Map_char_to_data(match.Find_bgn()) + Scrib_lib_ustring.Base1);
tmp_list.Add(text_ucs.Map_char_to_data(match.Find_end()) + Scrib_lib_ustring.Base1 - Scrib_lib_ustring.End_adj);
AddCapturesFromMatch(tmp_list, match, text_str, matcher.Capt_ary(), false);
return rslt.Init_many_list(tmp_list);
}
public boolean Match(Scrib_proc_args args, Scrib_proc_rslt rslt) {
@@ -119,41 +121,42 @@ public class Scrib_lib_ustring implements Scrib_lib {
// validate / adjust
if (text_str == null) // if no text_str is passed, do not fail; return empty; EX:d:changed; DATE:2014-02-06
return rslt.Init_many_list(List_adp_.Noop);
Unicode_string text_ucs = Unicode_string_.New(text_str); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23
int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_ucs.Len_codes());
Ustring text_ucs = Ustring_.New_codepoints(text_str); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23
int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_ucs.Len_in_data());
// run regex
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
Regx_match[] regx_rslts = Scrib_pattern_matcher_.Instance().Match(core.Ctx().Page().Url(), text_ucs, regx_converter, find_str, bgn_as_codes);
if (regx_rslts.length == 0) return rslt.Init_null(); // return null if no matches found; EX:w:Mount_Gambier_(volcano); DATE:2014-04-02; confirmed with en.d:民; DATE:2015-01-30
// run regex; NOTE add 1st match only; do not add all; PAGE:en.d:действительное_причастиеастоящегоремени DATE:2017-04-23
Scrib_pattern_matcher matcher = Scrib_pattern_matcher.New(core.Page_url());
Regx_match match = matcher.Match_one(text_ucs, find_str, bgn_as_codes, true);
if (match.Rslt_none()) return rslt.Init_null(); // return null if no matches found; EX:w:Mount_Gambier_(volcano); DATE:2014-04-02; confirmed with en.d:民; DATE:2015-01-30
// TOMBSTONE: add 1st match only; do not add all; PAGE:en.d:действительное_причастиеастоящегоремени DATE:2017-04-23
regx_rslts = regx_converter.Adjust_balanced(regx_rslts);
List_adp tmp_list = List_adp_.New();
AddCapturesFromMatch(tmp_list, regx_rslts[0], text_str, regx_converter.Capt_ary(), true);
AddCapturesFromMatch(tmp_list, match, text_str, matcher.Capt_ary(), true);
return rslt.Init_many_list(tmp_list);
}
public boolean Gsub(Scrib_proc_args args, Scrib_proc_rslt rslt) {
Scrib_lib_ustring_gsub_mgr gsub_mgr = new Scrib_lib_ustring_gsub_mgr(core, new Scrib_regx_converter());
Scrib_lib_ustring_gsub_mgr gsub_mgr = new Scrib_lib_ustring_gsub_mgr(core);
return gsub_mgr.Exec(args, rslt);
}
public boolean Gmatch_init(Scrib_proc_args args, Scrib_proc_rslt rslt) {
// String text = Scrib_kv_utl_.Val_to_str(values, 0);
String regx = args.Pull_str(1);
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
String pcre = regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_null, true);
return rslt.Init_many_objs(pcre, regx_converter.Capt_ary());
if (Scrib_pattern_matcher.Mode_is_xowa())
regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_null, true);
else
regx = regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_null, true);
return rslt.Init_many_objs(regx, regx_converter.Capt_ary());
}
public boolean Gmatch_callback(Scrib_proc_args args, Scrib_proc_rslt rslt) {
String text = args.Xstr_str_or_null(0); // NOTE: UstringLibrary.php!ustringGmatchCallback calls preg_match directly; $s can be any type, and php casts automatically;
String regx = args.Pull_str(1);
Keyval[] capt = args.Cast_kv_ary_or_null(2);
int pos = args.Pull_int(3);
Regx_adp regx_adp = Scrib_lib_ustring.RegxAdp_new_(core.Ctx().Page().Url(), regx);
Regx_match[] regx_rslts = regx_adp.Match_all(text, pos);
int len = regx_rslts.length;
if (len == 0) return rslt.Init_many_objs(pos, Keyval_.Ary_empty);
Regx_match match = regx_rslts[0]; // NOTE: take only 1st result
Ustring text_ucs = Ustring_.New_codepoints(text);
// int pos_as_codes = To_java_by_lua(pos, text_ucs.Len_in_data());
Regx_match match = Scrib_pattern_matcher.New(core.Page_url()).Match_one(text_ucs, regx, pos, false);
if (match.Rslt_none()) return rslt.Init_many_objs(pos, Keyval_.Ary_empty);
List_adp tmp_list = List_adp_.New();
AddCapturesFromMatch(tmp_list, match, text, capt, true); // NOTE: was incorrectly set as false; DATE:2014-04-23
return rslt.Init_many_objs(match.Find_end(), Scrib_kv_utl_.base1_list_(tmp_list));
@@ -198,12 +201,12 @@ public class Scrib_lib_ustring implements Scrib_lib {
&& tmp_list.Count() == 0) // only add match once; EX: "aaaa", "a" will have four matches; get 1st; DATE:2014-04-02
tmp_list.Add(String_.Mid(text, rslt.Find_bgn(), rslt.Find_end()));
}
public static Regx_adp RegxAdp_new_(Xoa_url url, String regx) {
public static Regx_adp RegxAdp_new_(byte[] page_url, String regx) {
Regx_adp rv = Regx_adp_.new_(regx);
if (rv.Pattern_is_invalid()) {
// try to identify [z-a] errors; PAGE:https://en.wiktionary.org/wiki/Module:scripts/data; DATE:2017-04-23
Exception exc = rv.Pattern_is_invalid_exception();
Gfo_usr_dlg_.Instance.Log_many("", "", "regx is invalid: regx=~{0} page=~{1} exc=~{2}", regx, url.To_bry(), Err_.Message_gplx_log(exc));
Gfo_usr_dlg_.Instance.Log_many("", "", "regx is invalid: regx=~{0} page=~{1} exc=~{2}", regx, page_url, Err_.Message_gplx_log(exc));
}
return rv;
}

View File

@@ -40,7 +40,7 @@ public class Scrib_lib_ustring__find__tst {
fxt.Test__find("𤭢𤭢b𤭢𤭢b" , "b" , 2, Bool_.N, "3;3"); // bytes=4
fxt.Test__find("abcd" , "b" , 1, Bool_.N, "2;2"); // basic
fxt.Test__find("abad" , "a" , 2, Bool_.N, "3;3"); // bgn
fxt.Test__find("abcd" , "x" , 1, Bool_.N, ""); // no-match
fxt.Test__find("abcd" , "x" , 1, Bool_.N, String_.Null_mark); // no-match
fxt.Test__find("abcd" , "" , 2, Bool_.N, "2;1"); // empty regx should return values; regx; EX:w:Fool's_mate; DATE:2014-03-04
}
@Test public void Regx__int() { // PURPOSE: allow int find; PAGE:ro.w:Innsbruck DATE:2015-09-12
@@ -64,6 +64,9 @@ public class Scrib_lib_ustring__find__tst {
fxt.Test__find("aé𡼾\nbî𡼾\n" , "" , 1, Bool_.N, "1;0"); // 4 b/c \n starts at pos 4 (super 1)
fxt.Test__find("aé𡼾\nbî𡼾\n" , "" , 5, Bool_.N, "5;4"); // 8 b/c \n starts at pos 8 (super 1)
}
@Test public void Balanced__numbered_1() { // PURPOSE: handle mix of balanced and regular capture; PAGE:en.w:Bahamas
// pattern combines a balanced match (%b[]), a capture group ((%a)) and a back-reference to that group (%1)
// expected "1;9;c" presumably reads as bgn;end;capture (whole 9-char text matched, "c" captured) -- TODO confirm format against the fixture
fxt.Test__find("[[5]]XccY", "%b[]X(%a)%1Y", 1, Bool_.N, "1;9;c");
}
}
class Scrib_lib_ustring__find__fxt {
private boolean dbg = false;

View File

@@ -16,7 +16,7 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*;
import org.junit.*; import gplx.xowa.xtns.scribunto.engines.mocks.*;
public class Scrib_lib_ustring__gmatch__tst {
private final Mock_scrib_fxt fxt = new Mock_scrib_fxt(); private Scrib_lib lib;
private final Mock_scrib_fxt fxt = new Mock_scrib_fxt(); private Scrib_lib lib;
@Before public void init() {
fxt.Clear();
lib = fxt.Core().Lib_ustring().Init();
@@ -48,4 +48,7 @@ public class Scrib_lib_ustring__gmatch__tst {
, " 1=2"
));
}
// Calls gmatch_callback with text "a", the Lua pattern "%a+", a one-item capture array holding false, and position 1.
// NOTE(review): expected "1=1\n2=" looks like a first return value of 1 and an empty second value -- confirm against Test__proc__objs__nest formatting
@Test public void Callback__pattern() {
fxt.Test__proc__objs__nest(lib, Scrib_lib_ustring.Invk_gmatch_callback, Object_.Ary("a", "%a+", Scrib_kv_utl_.base1_many_(false), 1) , "1=1\n2="); // fails if "a" is returned; note that 1 should be eos
}
}

View File

@@ -43,6 +43,15 @@ public class Scrib_lib_ustring__gsub__tst {
@Test public void Replace__double() { // PURPOSE: do not fail if double is passed in for @replace; PAGE:de.v:Wikivoyage:Wikidata/Test_Modul:Wikidata2 DATE:2016-04-21
Exec_gsub("abcd", 1 , -1, 1.23d , "abcd;0");
}
@Test public void Replace__anypos() { // PURPOSE:LUAJ_PATTERN_REPLACEMENT; DATE:2019-04-16
// pattern starts with an empty "()" group followed by a regular capture; limit is 1
// expected "zb;1" presumably reads as result-text;replacement-count -- TODO confirm format against the fixture
Exec_gsub("'''a'''b", "()'''(.-'*)'''", 1, "z", "zb;1");
}
@Test public void Replace__balanced_and_grouping() { // PURPOSE:LUAJ_PATTERN_REPLACEMENT; DATE:2019-04-16
// a balanced match (%b[]) wrapped in a capture group and surrounded by literal brackets;
// the replacement must cover the entire match (outer brackets included), not just the captured group
Exec_gsub("[[b]]", "%[(%b[])%]" , -1, "z" , "z;1"); // NOTE: not "[z]"
}
@Test public void Replace__initial() { // PURPOSE:whitespace being replaced during gsub replacement; DATE:2019-04-21
// "^%s*" is anchored to the start of the text, so only one (empty) match at position 0 should be replaced;
// the spaces later in the text must be left alone
Exec_gsub("a b c", "^%s*", -1, "x", "xa b c;1"); // fails if xabxc
}
@Test public void Replace__table() {
Exec_gsub("abcd", "[ac]" , -1, Scrib_kv_utl_.flat_many_("a", "A", "c", "C") , "AbCd;2");
Exec_gsub("abc" , "[ab]" , -1, Scrib_kv_utl_.flat_many_("a", "A") , "Abc;2"); // PURPOSE: match not in regex should still print itself; in this case [c] is not in tbl regex; DATE:2014-03-31
@@ -122,6 +131,17 @@ public class Scrib_lib_ustring__gsub__tst {
fxt.Init__cbk(proc);
Exec_gsub(text, regx, -1, proc.To_scrib_lua_proc(), "aBYz;2");
}
@Test public void Luacbk__balanced() { // PURPOSE:LUAJ_PATTERN_REPLACEMENT; DATE:2019-04-16
// text begins with an unmatched "}" which the expected result leaves untouched
String text = "}a{{b}}c{{d}}";
String regx = "%b{}"; // balanced-brace match; NOTE(review): earlier comment here described "()" anypos, but this pattern has no "()" -- looked copy-pasted
// presumably each Object[] is {value returned by the callback, argument the callback expects}; inferred from expected "}axcy" -- TODO confirm against Mock_proc__verify_args
Mock_proc__verify_args proc = new Mock_proc__verify_args(0, new Object[]{"x", "{{b}}"}, new Object[]{"y", "{{d}}"});
fxt.Init__cbk(proc);
Exec_gsub(text, regx, -1, proc.To_scrib_lua_proc(), "}axcy;2");
}
// Mock_proc__verify_args proc = new Mock_proc__verify_args(0, new Object[]{"x", "{{yes2}}"}, new Object[]{"x", "{{flagicon|USA}}"});
// fxt.Init__cbk(proc);
// Exec_gsub("}\n|-\n|28\n|{{yes2}}Win\n|280\n|style=\"text-align:left;\"|{{flagicon|USA}}", "%b{}", -1, proc.To_scrib_lua_proc(), "}axbx;2"); }
//
// Runs Scrib_lib_ustring.Invk_gsub through the fixture and compares the flattened result with expd.
// NOTE: parameter order here is (text, regx, limit, repl), but the proc args are passed as (text, regx, repl, limit)
private void Exec_gsub(String text, Object regx, int limit, Object repl, String expd) {
fxt.Test__proc__kvps__flat(lib, Scrib_lib_ustring.Invk_gsub, Scrib_kv_utl_.base1_many_(text, regx, repl, limit), expd);
}

View File

@@ -15,35 +15,39 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*;
import gplx.langs.regxs.*;
import gplx.objects.strings.unicodes.*;
import gplx.xowa.xtns.scribunto.libs.patterns.*;
import gplx.xowa.xtns.scribunto.procs.*;
class Scrib_lib_ustring_gsub_mgr {
public class Scrib_lib_ustring_gsub_mgr { // THREAD.UNSAFE:LOCAL_VALUES
private final Scrib_core core;
private final Scrib_regx_converter regx_converter;
private String src_str;
private String pat_str;
private int limit;
private byte repl_tid;
private byte[] repl_bry; private Hash_adp repl_hash; private Scrib_lua_proc repl_func;
private int repl_count = 0;
public Scrib_lib_ustring_gsub_mgr(Scrib_core core, Scrib_regx_converter regx_converter) {
public int repl_count = 0;
public Scrib_lib_ustring_gsub_mgr(Scrib_core core) {
this.core = core;
this.regx_converter = regx_converter;
}
public void Repl_count__add() {repl_count++;}
public boolean Repl_count__done() {return repl_count == limit;}
public boolean Exec(Scrib_proc_args args, Scrib_proc_rslt rslt) {
// get @text; NOTE: sometimes int; DATE:2013-11-06
String text = args.Xstr_str_or_null(0);
if (args.Len() == 2) return rslt.Init_obj(text); // if no @replace, return @text; PAGE:en.d:'orse; DATE:2013-10-13
// get @src_str; NOTE: sometimes int; DATE:2013-11-06
this.src_str = args.Xstr_str_or_null(0);
if (args.Len() == 2) return rslt.Init_obj(src_str); // if no @replace, return @src_str; PAGE:en.d:'orse; DATE:2013-10-13
// get @pattern; NOTE: sometimes int; PAGE:en.d:λύω; DATE:2014-09-02
String regx = args.Xstr_str_or_null(1);
regx = regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_pow, true);
this.pat_str = args.Xstr_str_or_null(1);
// get @repl
Object repl_obj = args.Cast_obj_or_null(2);
byte repl_tid = Identify_repl(repl_obj);
this.repl_tid = Identify_repl(repl_obj);
// get @limit; reset repl_count
int limit = args.Cast_int_or(3, -1);
repl_count = 0;
this.limit = args.Cast_int_or(3, -1);
// do repl
String repl = Exec_repl(repl_tid, text, regx, limit);
String repl = Scrib_pattern_matcher.New(core.Page_url()).Gsub(this, Ustring_.New_codepoints(src_str), pat_str, 0);
return rslt.Init_many_objs(repl, repl_count);
}
private byte Identify_repl(Object repl_obj) {
@@ -80,44 +84,7 @@ class Scrib_lib_ustring_gsub_mgr {
throw Err_.new_unhandled(Type_.Name(repl_type));
return repl_tid;
}
private String Exec_repl(byte repl_tid, String text, String regx, int limit) {
// parse regx
Regx_adp regx_mgr = Scrib_lib_ustring.RegxAdp_new_(core.Ctx().Page().Url(), regx);
if (regx_mgr.Pattern_is_invalid()) return text; // NOTE: invalid patterns should return self; EX:[^]; DATE:2014-09-02)
// exec regx
Regx_match[] rslts = regx_mgr.Match_all(text, 0);
if (rslts.length == 0) return text; // PHP: If matches are found, the new subject will be returned, otherwise subject will be returned unchanged.; http://php.net/manual/en/function.preg-replace-callback.php
rslts = regx_converter.Adjust_balanced(rslts);
Bry_bfr tmp_bfr = Bry_bfr_.New();
int rslts_len = rslts.length;
int text_pos = 0;
for (int i = 0; i < rslts_len; i++) {
if (repl_count == limit) break; // stop if repl_count reaches limit; note that limit = -1 by default, unless specified
// add text up to find.bgn
Regx_match rslt = rslts[i];
tmp_bfr.Add_str_u8(String_.Mid(text, text_pos, rslt.Find_bgn())); // NOTE: regx returns char text_pos (not bry); must add as String, not bry; DATE:2013-07-17
// replace result
if (!Exec_repl_itm(tmp_bfr, repl_tid, text, rslt)) {
// will be false when gsub_proc returns nothing; PAGE:en.d:tracer PAGE:en.d:שלום DATE:2017-04-22;
tmp_bfr.Add_str_u8(String_.Mid(text, rslt.Find_bgn(), rslt.Find_end()));
}
// update
text_pos = rslt.Find_end();
repl_count++;
}
// add rest of String
int text_len = String_.Len(text);
if (text_pos < text_len)
tmp_bfr.Add_str_u8(String_.Mid(text, text_pos, text_len)); // NOTE: regx returns char text_pos (not bry); must add as String, not bry; DATE:2013-07-17
return tmp_bfr.To_str_and_clear();
}
private boolean Exec_repl_itm(Bry_bfr tmp_bfr, byte repl_tid, String text, Regx_match match) {
public boolean Exec_repl_itm(Bry_bfr tmp_bfr, Scrib_regx_converter regx_converter, Regx_match match) {
switch (repl_tid) {
case Repl_tid_string:
int len = repl_bry.length;
@@ -137,15 +104,15 @@ class Scrib_lib_ustring_gsub_mgr {
// REF.MW: https://github.com/wikimedia/mediawiki-extensions-Scribunto/blob/master/includes/engines/LuaCommon/UstringLibrary.php#L785-L796
// NOTE: 0 means take result; REF.MW:if ($x === '0'); return $m[0]; PAGE:Wikipedia:Wikipedia_Signpost/Templates/Voter/testcases; DATE:2015-08-02
if (idx == 0)
tmp_bfr.Add_str_u8(String_.Mid(text, match.Find_bgn(), match.Find_end()));
tmp_bfr.Add_str_u8(String_.Mid(src_str, match.Find_bgn(), match.Find_end()));
// NOTE: > 0 means get from groups if it exists; REF.MW:elseif (isset($m["m$x"])) return $m["m$x"]; PAGE:Wikipedia:Wikipedia_Signpost/Templates/Voter/testcases; DATE:2015-08-02
else if (idx - 1 < match.Groups().length) { // retrieve numbered capture; TODO_OLD: support more than 9 captures
Regx_group grp = match.Groups()[idx - 1];
tmp_bfr.Add_str_u8(String_.Mid(text, grp.Bgn(), grp.End())); // NOTE: grp.Bgn() / .End() is for String pos (bry pos will fail for utf8 strings)
tmp_bfr.Add_str_u8(String_.Mid(src_str, grp.Bgn(), grp.End())); // NOTE: grp.Bgn() / .End() is for String pos (bry pos will fail for utf8 strings)
}
// NOTE: 1 per MW "Match undocumented Lua String.gsub behavior"; PAGE:en.d:Wiktionary:Scripts ISSUE#:393; DATE:2019-03-20
else if (idx == 1) {
tmp_bfr.Add_str_u8(String_.Mid(text, match.Find_bgn(), match.Find_end()));
tmp_bfr.Add_str_u8(String_.Mid(src_str, match.Find_bgn(), match.Find_end()));
}
else {
throw Err_.new_wo_type("invalid capture index %" + Char_.To_str(b) + " in replacement String");
@@ -180,7 +147,7 @@ class Scrib_lib_ustring_gsub_mgr {
match_bgn = grp.Bgn();
match_end = grp.End();
}
String find_str = String_.Mid(text, match_bgn, match_end); // NOTE: rslt.Bgn() / .End() is for String pos (bry pos will fail for utf8 strings)
String find_str = String_.Mid(src_str, match_bgn, match_end); // NOTE: rslt.Bgn() / .End() is for String pos (bry pos will fail for utf8 strings)
Object actl_repl_obj = repl_hash.Get_by(find_str);
if (actl_repl_obj == null) // match found, but no replacement specified; EX:"abc", "[ab]", "a:A"; "b" in regex but not in tbl; EX:d:DVD; DATE:2014-03-31
tmp_bfr.Add_str_u8(find_str);
@@ -194,7 +161,7 @@ class Scrib_lib_ustring_gsub_mgr {
int grps_len = grps.length;
// no grps; pass 1 arg based on @match: EX: ("ace", "[b-d]"); args -> ("c")
if (grps_len == 0) {
String find_str = String_.Mid(text, match.Find_bgn(), match.Find_end());
String find_str = String_.Mid(src_str, match.Find_bgn(), match.Find_end());
luacbk_args = Scrib_kv_utl_.base1_obj_(find_str);
}
// grps exist; pass n args based on grp[n].match; EX: ("acfg", "([b-d])([e-g])"); args -> ("c", "f")
@@ -202,7 +169,7 @@ class Scrib_lib_ustring_gsub_mgr {
// memoize any_pos args for loop
boolean any_pos = regx_converter.Any_pos();
Keyval[] capt_ary = regx_converter.Capt_ary();
int capt_ary_len = capt_ary.length;
int capt_ary_len = capt_ary == null ? 0 : capt_ary.length; // capt_ary can be null b/c xowa_gsub will always create one group;
// loop grps; for each grp, create corresponding arg in luacbk
luacbk_args = new Keyval[grps_len];
@@ -212,7 +179,7 @@ class Scrib_lib_ustring_gsub_mgr {
// anypos will create @offset arg; everything else creates a @match arg based on grp
Object val = any_pos && i < capt_ary_len && Bool_.Cast(capt_ary[i].Val())
? (Object)grp.Bgn()
: (Object)String_.Mid(text, grp.Bgn(), grp.End());
: (Object)String_.Mid(src_str, grp.Bgn(), grp.End());
luacbk_args[i] = Keyval_.int_(i + Scrib_core.Base_1, val);
}
}

View File

@@ -14,31 +14,33 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*;
import gplx.core.brys.fmtrs.*; import gplx.core.intls.*;
import gplx.objects.strings.unicodes.*;
import gplx.core.intls.*;
import gplx.core.brys.fmtrs.*;
import gplx.langs.regxs.*;
public class Scrib_regx_converter {
public class Scrib_regx_converter {// THREAD.UNSAFE:MULTIPLE_RETURN_VALUES
private final Scrib_regx_grp_mgr grp_mgr = new Scrib_regx_grp_mgr();
private final Bry_bfr bfr = Bry_bfr_.New();
private Bry_bfr tmp_bfr;
private Bry_fmtr fmtr_balanced; private Bry_bfr bfr_balanced;
private final Lua_cls_to_regx_map percent_map, brack_map;
public Scrib_regx_converter() {
percent_map = Lua_cls_matcher.Instance.Percent();
brack_map = Lua_cls_matcher.Instance.Brack();
}
public String Regx() {return regx;} private String regx;
public Keyval[] Capt_ary() {return grp_mgr.Capt__to_ary();}
public boolean Any_pos() {return any_pos;} private boolean any_pos;
public Regx_match[] Adjust_balanced(Regx_match[] rslts) {return grp_mgr.Adjust_balanced(rslts);}
public Regx_match[] Adjust_balanced(Regx_match[] rslts) {return grp_mgr.Adjust_balanced_many(rslts);}
public Regx_match Adjust_balanced_one(Regx_match rslt) {return grp_mgr.Adjust_balanced_one(rslt);}
public String patternToRegex(String pat_str, byte[] anchor, boolean mode_is_regx) {
Unicode_string pat_ucs = Unicode_string_.New(pat_str);
Ustring pat_ucs = Ustring_.New_codepoints(pat_str);
// TODO.CACHE: if (!$this->patternRegexCache->has($cacheKey))
grp_mgr.Clear();
any_pos = false;
boolean q_flag = false;
Bry_bfr bfr = Bry_bfr_.New();
Bry_bfr tmp_bfr = null;
Bry_fmtr fmtr_balanced = null;
Bry_bfr bfr_balanced = null;
Lua_cls_to_regx_map percent_map = Lua_cls_matcher.Instance.Percent();
Lua_cls_to_regx_map brack_map = Lua_cls_matcher.Instance.Brack();
// bfr.Add_byte(Byte_ascii.Slash); // TOMBSTONE: do not add PHP "/" at start
int len = pat_ucs.Len_codes();
int len = pat_ucs.Len_in_data();
int grps_len = 0;
int bct = 0;
@@ -46,7 +48,7 @@ public class Scrib_regx_converter {
for (int i = 0; i < len; i++) {
int i_end = i + 1;
q_flag = false; // must be reset; REF.MW:UstringLibrary.php|patternToRegex; DATE:2014-02-08
int cur = pat_ucs.Val_codes(i);
int cur = pat_ucs.Get_data(i);
switch (cur) {
case Byte_ascii.Pow:
if (!mode_is_regx) {
@@ -71,7 +73,7 @@ public class Scrib_regx_converter {
int grp_idx = grp_mgr.Capt__len() + 1;
// check for "()"; enables anypos flag
boolean is_empty_capture = pat_ucs.Val_codes(i + 1) == Byte_ascii.Paren_end;
boolean is_empty_capture = pat_ucs.Get_data(i + 1) == Byte_ascii.Paren_end;
if (is_empty_capture)
any_pos = true;
grp_mgr.Capt__add__real(grp_idx, is_empty_capture);
@@ -93,19 +95,19 @@ public class Scrib_regx_converter {
i++;
if (i >= len)
throw Err_.new_wo_type("malformed pattern (ends with '%')");
byte[] percent_bry = percent_map.Get_or_null(pat_ucs.Val_codes(i));
byte[] percent_bry = percent_map.Get_or_null(pat_ucs.Get_data(i));
if (percent_bry != null) {
bfr.Add(percent_bry);
q_flag = true;
}
else {
int nxt = pat_ucs.Val_codes(i);
int nxt = pat_ucs.Get_data(i);
switch (nxt) {
case Byte_ascii.Ltr_b: // EX: "%b()"
i += 2;
if (i >= len) throw Err_.new_wo_type("malformed pattern (missing arguments to '%b')");
int char_0 = pat_ucs.Val_codes(i - 1);
int char_1 = pat_ucs.Val_codes(i);
int char_0 = pat_ucs.Get_data(i - 1);
int char_1 = pat_ucs.Get_data(i);
if (char_0 == char_1) { // same char: easier regex; REF.MW: $bfr .= "{$d1}[^$d1]*$d1";
bfr.Add(Bry_bf0_seg_0);
Regx_quote(bfr, char_0);
@@ -133,11 +135,11 @@ public class Scrib_regx_converter {
}
break;
case Byte_ascii.Ltr_f: { // EX: lua frontier pattern; "%f[%a]"; DATE:2015-07-21
if (i + 1 >= len || pat_ucs.Val_codes(++i) != Byte_ascii.Brack_bgn)
if (i + 1 >= len || pat_ucs.Get_data(++i) != Byte_ascii.Brack_bgn)
throw Err_.new_("scribunto", "missing '[' after %f in pattern at pattern character " + Int_.To_str(i_end));
// %f always followed by bracketed term; convert lua bracketed term to regex
if (tmp_bfr == null) tmp_bfr = Bry_bfr_.New();
i = bracketedCharSetToRegex(tmp_bfr, pat_ucs, i, len);
i = bracketedCharSetToRegex(tmp_bfr, brack_map, pat_ucs, i, len);
byte[] re2 = tmp_bfr.To_bry_and_clear();
// scrib has following comment: 'Because %f considers the beginning and end of the String to be \0, determine if $re2 matches that and take it into account with "^" and "$".'
@@ -169,7 +171,7 @@ public class Scrib_regx_converter {
bfr.Add_byte(Byte_ascii.Brack_bgn);
continue;
}
i = bracketedCharSetToRegex(bfr, pat_ucs, i, len);
i = bracketedCharSetToRegex(bfr, brack_map, pat_ucs, i, len);
q_flag = true;
break;
case Byte_ascii.Brack_end:
@@ -196,7 +198,7 @@ public class Scrib_regx_converter {
break;
}
if (q_flag && i + 1 < len) {
int tmp_b = pat_ucs.Val_codes(i + 1);
int tmp_b = pat_ucs.Get_data(i + 1);
switch (tmp_b) {
case Byte_ascii.Star:
case Byte_ascii.Plus:
@@ -217,35 +219,35 @@ public class Scrib_regx_converter {
regx = bfr.To_str_and_clear();
return regx;
}
private int bracketedCharSetToRegex(Bry_bfr bfr, Unicode_string pat_ucs, int i, int len) {
private int bracketedCharSetToRegex(Bry_bfr bfr, Lua_cls_to_regx_map brack_map, Ustring pat_ucs, int i, int len) {
bfr.Add_byte(Byte_ascii.Brack_bgn);
i++;
if (i < len && pat_ucs.Val_codes(i) == Byte_ascii.Pow) { // ^
if (i < len && pat_ucs.Get_data(i) == Byte_ascii.Pow) { // ^
bfr.Add_byte(Byte_ascii.Pow);
i++;
}
for (int j = i; i < len && (j == i || pat_ucs.Val_codes(i) != Byte_ascii.Brack_end); i++) {
if (pat_ucs.Val_codes(i) == Byte_ascii.Percent) {
for (int j = i; i < len && (j == i || pat_ucs.Get_data(i) != Byte_ascii.Brack_end); i++) {
if (pat_ucs.Get_data(i) == Byte_ascii.Percent) {
i++;
if (i >= len) {
break;
}
byte[] brack_bry = brack_map.Get_or_null(pat_ucs.Val_codes(i));
byte[] brack_bry = brack_map.Get_or_null(pat_ucs.Get_data(i));
if (brack_bry != null)
bfr.Add(brack_bry);
else
Regx_quote(bfr, pat_ucs.Val_codes(i));
Regx_quote(bfr, pat_ucs.Get_data(i));
}
else if (i + 2 < len && pat_ucs.Val_codes(i + 1) == Byte_ascii.Dash && pat_ucs.Val_codes(i + 2) != Byte_ascii.Brack_end && pat_ucs.Val_codes(i + 2) != Byte_ascii.Hash) {
if (pat_ucs.Val_codes(i) <= pat_ucs.Val_codes(i + 2)) {
Regx_quote(bfr, pat_ucs.Val_codes(i));
else if (i + 2 < len && pat_ucs.Get_data(i + 1) == Byte_ascii.Dash && pat_ucs.Get_data(i + 2) != Byte_ascii.Brack_end && pat_ucs.Get_data(i + 2) != Byte_ascii.Hash) {
if (pat_ucs.Get_data(i) <= pat_ucs.Get_data(i + 2)) {
Regx_quote(bfr, pat_ucs.Get_data(i));
bfr.Add_byte(Byte_ascii.Dash);
Regx_quote(bfr, pat_ucs.Val_codes(i + 2));
Regx_quote(bfr, pat_ucs.Get_data(i + 2));
}
i += 2;
}
else {
Regx_quote(bfr, pat_ucs.Val_codes(i));
Regx_quote(bfr, pat_ucs.Get_data(i));
}
}
if (i > len) throw Err_.new_wo_type("Missing close-bracket for character set beginning at pattern character $nxt_pos");

View File

@@ -58,25 +58,29 @@ class Scrib_regx_grp_mgr {
int actl_idx = Int_.Cast(idx_list.Get_by(regx_idx));
bfr.Add_int_variable(actl_idx);
}
public Regx_match[] Adjust_balanced(Regx_match[] matches) {
public Regx_match[] Adjust_balanced_many(Regx_match[] matches) {
if (fake_count == 0) return matches;
int matches_len = matches.length;
Regx_match[] rv = new Regx_match[matches_len];
for (int i = 0; i < matches_len; i++) {
Regx_match match = matches[i];
Regx_group[] old_groups = match.Groups();
Regx_group[] new_groups = new Regx_group[full_list.Len() - fake_count];
int group_idx = 0;
for (int j = 0; j < old_groups.length; j++) {
Scrib_regx_grp_itm itm = (Scrib_regx_grp_itm)full_list.Get_at(j);
if (itm.Is_fake()) continue;
new_groups[group_idx++] = old_groups[j];
}
rv[i] = new Regx_match(match.Rslt(), match.Find_bgn(), match.Find_end(), new_groups);
rv[i] = Adjust_balanced_one(matches[i]);
}
return rv;
}
/**
 * Rebuilds a single match so that its group array holds only the groups whose
 * registered item is not flagged fake; groups are paired with full_list entries by index.
 *
 * @param match match whose groups line up positionally with the entries of full_list
 * @return the same instance when no groups are registered; otherwise a new Regx_match
 *         with the same result flag / find range and the filtered groups
 */
public Regx_match Adjust_balanced_one(Regx_match match) {
	int registered_len = full_list.Len();
	if (registered_len == 0) return match; // no capture groups, so don't bother adjusting for balanced; DATE:2019-04-16

	Regx_group[] src_grps = match.Groups();
	Regx_group[] real_grps = new Regx_group[registered_len - fake_count];
	int real_len = 0;
	int src_grps_len = src_grps.length;
	for (int i = 0; i < src_grps_len; i++) {
		boolean is_fake = ((Scrib_regx_grp_itm)full_list.Get_at(i)).Is_fake();
		if (!is_fake) {
			// keep real groups in their original relative order
			real_grps[real_len] = src_grps[i];
			real_len++;
		}
	}
	return new Regx_match(match.Rslt(), match.Find_bgn(), match.Find_end(), real_grps);
}
}
class Scrib_regx_grp_itm {
public Scrib_regx_grp_itm(boolean is_fake, boolean is_empty_capture, int idx) {

View File

@@ -14,8 +14,19 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.xtns.scribunto.libs.patterns; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*; import gplx.xowa.xtns.scribunto.libs.*;
import gplx.core.intls.*;
import gplx.objects.strings.unicodes.*;
import gplx.langs.regxs.*;
public interface Scrib_pattern_matcher {
Regx_match[] Match(Xoa_url url, Unicode_string text_ucs, Scrib_regx_converter regx_converter, String find_str, int bgn_as_codes);
/**
 * Base type for the Scribunto pattern matchers. Each instance owns its own
 * Scrib_regx_converter, which subclasses use when matching.
 */
public abstract class Scrib_pattern_matcher {
	/** Per-matcher converter; also the source of the capture metadata returned by Capt_ary. */
	protected final Scrib_regx_converter regx_converter = new Scrib_regx_converter();

	/** Returns the capture array currently held by this matcher's converter. */
	public Keyval[] Capt_ary() {
		return regx_converter.Capt_ary();
	}

	/** Finds one match of pat_str in src_ucs, starting at code-point offset bgn_as_codes. */
	public abstract Regx_match Match_one(Ustring src_ucs, String pat_str, int bgn_as_codes, boolean replace);

	/** Runs a gsub over src_ucs, delegating each replacement to gsub_mgr, and returns the resulting text. */
	public abstract String Gsub(Scrib_lib_ustring_gsub_mgr gsub_mgr, Ustring src_ucs, String pat_str, int bgn_as_codes);

	/** Implementation switch; currently hard-wired to false, so New builds the regex-backed matcher. */
	public static boolean Mode_is_xowa() {
		return false;
	}

	/** Creates the matcher implementation selected by Mode_is_xowa. */
	public static Scrib_pattern_matcher New(byte[] page_url) {
		if (Mode_is_xowa())
			return new Scrib_pattern_matcher__xowa(page_url);
		return new Scrib_pattern_matcher__regx(page_url);
	}
}

View File

@@ -1,36 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.xtns.scribunto.libs.patterns; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*; import gplx.xowa.xtns.scribunto.libs.*;
import gplx.core.intls.*;
import gplx.langs.regxs.*;
/** Holder for the single shared Scrib_pattern_matcher instance. */
public class Scrib_pattern_matcher_ {
	// The regex-backed matcher is the active implementation.
	// Alternative that was left switched off: new Scrib_pattern_matcher__luaj()
	private static final Scrib_pattern_matcher instance = new Scrib_pattern_matcher__regx();

	/** Returns the shared matcher created when this class is initialized. */
	public static Scrib_pattern_matcher Instance() {
		return instance;
	}
}
/** Matcher that converts the Lua pattern to a Java regex and returns every match. */
class Scrib_pattern_matcher__regx implements Scrib_pattern_matcher {
	/**
	 * @param url passed through to Scrib_lib_ustring.RegxAdp_new_
	 * @param text_ucs text to search
	 * @param regx_converter converter used to turn the Lua pattern into a regex
	 * @param find_str Lua pattern
	 * @param bgn_as_codes start offset in code points
	 * @return all matches reported by the regex adapter
	 */
	public Regx_match[] Match(Xoa_url url, Unicode_string text_ucs, Scrib_regx_converter regx_converter, String find_str, int bgn_as_codes) {
		// lua pattern -> java regex
		String regx_str = regx_converter.patternToRegex(find_str, Scrib_regx_converter.Anchor_G, true);
		Regx_adp matcher = Scrib_lib_ustring.RegxAdp_new_(url, regx_str);

		// NOTE: MW calculates an offset to handle mb strings. however, java's regex always takes offset in chars (not bytes like PHP preg_match); DATE:2014-03-04
		String src_str = text_ucs.Src_string();
		int bgn_as_chars = text_ucs.Pos_codes_to_chars(bgn_as_codes);
		return matcher.Match_all(src_str, bgn_as_chars);
	}
}

View File

@@ -1,50 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.xtns.scribunto.libs.patterns; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*; import gplx.xowa.xtns.scribunto.libs.*;
import gplx.core.intls.*;
import gplx.langs.regxs.*;
import org.luaj.vm2.lib.StringLib;
//import org.luaj.vm2.lib.Str_find_mgr;
//import org.luaj.vm2.lib.Str_find_mgr__regx;
/**
 * Placeholder for a Luaj-backed matcher. The real implementation is still parked in the
 * commented-out draft below, so this matcher never reports a match.
 */
class Scrib_pattern_matcher__luaj implements Scrib_pattern_matcher {
	/**
	 * Not implemented yet: always reports "no matches".
	 *
	 * @return an empty array (never null), so callers can read {@code .length} on the result
	 *         without a null check; this is also what the parked draft returns for "not found"
	 */
	public Regx_match[] Match(Xoa_url url, Unicode_string text_ucs, Scrib_regx_converter regx_converter, String find_str, int bgn_as_codes) {
//		int src_bgn = bgn_as_codes < 0 ? bgn_as_codes : text_ucs.Pos_codes_to_bytes(bgn_as_codes);
//		int src_bgn = bgn_as_codes < 0 ? Int_.Base1 : bgn_as_codes + Int_.Base1;
//		src_bgn = src_bgn >= text_ucs.Len_codes() ? text_ucs.Len_codes() : text_ucs.Pos_codes_to_bytes(src_bgn);
//		Str_find_mgr__regx mgr = new Str_find_mgr__regx(text_ucs.Src_string(), find_str, src_bgn, false, true);
//		mgr.Process();
//
//		// convert to Regx_match
//		int find_bgn = mgr.Bgn() == -1 ? -1 : text_ucs.Pos_bytes_to_chars(mgr.Bgn());
//		int find_end = mgr.End() == -1 ? -1 : text_ucs.Pos_bytes_to_chars(mgr.End());
//		boolean found = find_bgn != -1;
//		if (!found) {
//			return Regx_match.Ary_empty;
//		}
//		int[] captures = mgr.Capture_ints();
//		Regx_group[] groups = null;
//		if (found && captures != null) {
//			int captures_len = captures.length;
//			groups = new Regx_group[captures_len / 2];
//			for (int i = 0; i < captures_len; i += 2) {
//				groups[i / 2] = new Regx_group(true, captures[i], captures[i + 1], String_.Mid(text_ucs.Src_string(), text_ucs.Pos_bytes_to_chars(captures[i]), text_ucs.Pos_bytes_to_chars(captures[i + 1])));
//			}
//		}
//		Regx_match rv = new Regx_match(found, find_bgn, find_end, groups);
//		return new Regx_match[] {rv};
		// was "return null": an array-returning API should hand back an empty array rather than null
		return new Regx_match[0];
	}
}

View File

@@ -0,0 +1,74 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.xtns.scribunto.libs.patterns; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*; import gplx.xowa.xtns.scribunto.libs.*;
import gplx.objects.strings.unicodes.*;
import gplx.langs.regxs.*;
/** Pattern matcher that converts Lua patterns to Java regexes and runs them through Regx_adp. */
class Scrib_pattern_matcher__regx extends Scrib_pattern_matcher {
	private final byte[] page_url;

	public Scrib_pattern_matcher__regx(byte[] page_url) {
		this.page_url = page_url;
	}

	/**
	 * Runs a single regex match.
	 *
	 * @param src_ucs text to search
	 * @param pat_str Lua pattern when replace is true; otherwise used as-is
	 * @param bgn_as_codes start offset in code points
	 * @param replace whether pat_str must first be converted from a Lua pattern to a regex
	 * @return the match after balanced-group adjustment
	 */
	@Override public Regx_match Match_one(Ustring src_ucs, String pat_str, int bgn_as_codes, boolean replace) {
		// note that replace will be false for Gmatch_callback (b/c Gmatch_init already converted)
		String regx_str = replace
			? regx_converter.patternToRegex(pat_str, Scrib_regx_converter.Anchor_G, true)
			: pat_str;

		// NOTE: MW calculates an offset to handle mb strings. however, java's regex always takes offset in chars (not bytes like PHP preg_match); DATE:2014-03-04
		Regx_adp adp = Scrib_lib_ustring.RegxAdp_new_(page_url, regx_str);
		Regx_match found = adp.Match(src_ucs.Src(), src_ucs.Map_data_to_char(bgn_as_codes));
		return regx_converter.Adjust_balanced_one(found);
	}

	/**
	 * Replaces every regex match in the source text, delegating each replacement to gsub_mgr.
	 *
	 * @param gsub_mgr produces each replacement and tracks the replacement count
	 * @param src_ucs text to search
	 * @param pat_str Lua pattern
	 * @param bgn_as_codes start offset in code points
	 * @return the rewritten text, or the unchanged source when the pattern is invalid or nothing matches
	 */
	@Override public String Gsub(Scrib_lib_ustring_gsub_mgr gsub_mgr, Ustring src_ucs, String pat_str, int bgn_as_codes) {
		// lua pattern -> java regex
		String regx_str = regx_converter.patternToRegex(pat_str, Scrib_regx_converter.Anchor_pow, true);
		String src_str = src_ucs.Src();
		Regx_adp adp = Scrib_lib_ustring.RegxAdp_new_(page_url, regx_str);

		// NOTE: invalid patterns should return self; EX:[^]; DATE:2014-09-02
		if (regx_adp_is_invalid(adp)) return src_str;

		// NOTE: MW calculates an offset to handle mb strings. however, java's regex always takes offset in chars (not bytes like PHP preg_match); DATE:2014-03-04
		Regx_match[] found = adp.Match_all(src_str, src_ucs.Map_data_to_char(bgn_as_codes));

		// PHP: If matches are found, the new subject will be returned, otherwise subject will be returned unchanged.; http://php.net/manual/en/function.preg-replace-callback.php
		if (found.length == 0) return src_str;
		Regx_match[] matches = regx_converter.Adjust_balanced(found);

		Bry_bfr out = Bry_bfr_.New();
		int copied_upto = 0; // char offset in src_str up to which text has already been emitted
		for (Regx_match cur : matches) {
			// stop if repl_count reaches limit; note that limit = -1 by default, unless specified
			if (gsub_mgr.Repl_count__done()) break;

			// emit the unmatched text before this match
			// NOTE: regx returns char text_pos (not bry); must add as String, not bry; DATE:2013-07-17
			out.Add_str_u8(String_.Mid(src_str, copied_upto, cur.Find_bgn()));

			// emit the replacement; keep the matched text when no replacement was produced
			// will be false when gsub_proc returns nothing; PAGE:en.d:tracer PAGE:en.d:שלום DATE:2017-04-22;
			boolean replaced = gsub_mgr.Exec_repl_itm(out, regx_converter, cur);
			if (!replaced)
				out.Add_str_u8(String_.Mid(src_str, cur.Find_bgn(), cur.Find_end()));

			copied_upto = cur.Find_end();
			gsub_mgr.Repl_count__add();
		}

		// emit whatever follows the last processed match
		int src_str_len = String_.Len(src_str);
		if (copied_upto < src_str_len) {
			// NOTE: regx returns char text_pos (not bry); must add as String, not bry; DATE:2013-07-17
			out.Add_str_u8(String_.Mid(src_str, copied_upto, src_str_len));
		}
		return out.To_str_and_clear();
	}

	/** Thin wrapper so the invalid-pattern guard reads as a single condition. */
	private static boolean regx_adp_is_invalid(Regx_adp adp) {
		return adp.Pattern_is_invalid();
	}
}

View File

@@ -0,0 +1,123 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.xtns.scribunto.libs.patterns; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*; import gplx.xowa.xtns.scribunto.libs.*;
import gplx.objects.strings.unicodes.*;
import gplx.langs.regxs.*;
import gplx.objects.strings.unicodes.*;
import org.luaj.vm2.lib.StringLib;
import org.luaj.vm2.Buffer;
import org.luaj.vm2.LuaValue;
import org.luaj.vm2.lib.Match_state;
import org.luaj.vm2.lib.Str_find_mgr;
import org.luaj.vm2.lib.Str_find_mgr__xowa;
/**
 * Pattern matcher that runs Lua patterns directly through the Luaj-derived
 * Str_find_mgr__xowa / Match_state classes instead of converting them to Java regexes.
 *
 * Offsets come in two units here: "data" offsets (indexes used with Ustring.Get_data /
 * Len_in_data / Substring) and "char" offsets (indexes into the Java String from
 * Ustring.Src()). Every Regx_match / Regx_group built by this class carries char offsets.
 */
class Scrib_pattern_matcher__xowa extends Scrib_pattern_matcher {
	// page_url is accepted for signature parity with the regex matcher but is not used
	public Scrib_pattern_matcher__xowa(byte[] page_url) {}

	/**
	 * Finds one match of pat_str in src_ucs.
	 *
	 * @param src_ucs text to search
	 * @param pat_str Lua pattern
	 * @param bgn_as_codes start offset in data units, passed to Str_find_mgr__xowa
	 * @param replace not read by this implementation
	 * @return a Regx_match whose result flag is false (and whose offsets are left unmapped)
	 *         when the manager reports a begin of -1; otherwise offsets are char offsets
	 */
	@Override public Regx_match Match_one(Ustring src_ucs, String pat_str, int bgn_as_codes, boolean replace) {
		// return value is discarded; presumably called only to refresh the converter's
		// capture metadata for the callers of Capt_ary -- TODO confirm
		regx_converter.patternToRegex(pat_str, Scrib_regx_converter.Anchor_pow, true);
		Str_find_mgr__xowa mgr = new Str_find_mgr__xowa(src_ucs, Ustring_.New_codepoints(pat_str), bgn_as_codes, false, false);
		mgr.Process(false);

		// convert to Regx_match
		int find_bgn = mgr.Bgn();
		int find_end = mgr.End();
		boolean found = find_bgn != -1;
		if (found) {
			find_bgn = src_ucs.Map_data_to_char(find_bgn);
			find_end = src_ucs.Map_data_to_char(find_end);
		}
		Regx_group[] groups = Make_groups(src_ucs, mgr.Captures_ary());
		return new Regx_match(found, find_bgn, find_end, groups);
	}

	/**
	 * Replaces matches of pat_str in src_ucs, delegating each replacement to gsub_mgr.
	 *
	 * @param gsub_mgr produces each replacement and tracks the replacement count
	 * @param src_ucs text to search
	 * @param pat_str Lua pattern; a leading '^' anchors it, so at most one match attempt is made
	 * @param bgn_as_codes passed to Str_find_mgr__xowa; scanning in this method always starts at 0
	 * @return the rewritten text; the source String itself when the source is empty
	 */
	@Override public String Gsub(Scrib_lib_ustring_gsub_mgr gsub_mgr, Ustring src_ucs, String pat_str, int bgn_as_codes) {
		// get src vars
		String src_str = src_ucs.Src();
		int src_len = src_ucs.Len_in_data();
		if (src_len == 0) {
			return src_str;
		}
		int src_max = src_len + 1; // upper bound on the number of matches processed

		// get pat vars
		// return value is discarded; presumably called only to refresh the converter's
		// capture metadata, which is handed to Exec_repl_itm below -- TODO confirm
		regx_converter.patternToRegex(pat_str, Scrib_regx_converter.Anchor_G, true);
		Ustring pat = Ustring_.New_codepoints(pat_str);
		int pat_len = pat.Len_in_data();
		final boolean pat_is_anchored = pat_len > 0 && pat.Get_data(0) == '^';

		// get match vars
		Bry_bfr tmp_bfr = Bry_bfr_.New();
		Str_find_mgr__xowa match_mgr = new Str_find_mgr__xowa(src_ucs, pat, bgn_as_codes, false, false);
		Match_state ms = new Match_state(match_mgr);
		int src_pos = 0; // current scan position, in data units
		int src_idx = 0; // number of matches processed so far
		while (src_idx < src_max) {
			ms.reset();
			int res = ms.match(src_pos, pat_is_anchored ? 1 : 0); // skip the '^' when anchored

			// match found
			if (res != -1) {
				if (gsub_mgr.Repl_count__done()) break;
				src_idx++;
				ms.push_captures(true, src_pos, res);
				Regx_group[] groups = Make_groups(src_ucs, match_mgr.Captures_ary());

				// FIX: Regx_match offsets must be char offsets (as in Match_one / Make_groups), b/c the gsub mgr
				// slices the Java String with them; src_pos / res are data offsets and differ from char offsets
				// once the text contains a surrogate pair
				Regx_match match = new Regx_match(true, src_ucs.Map_data_to_char(src_pos), src_ucs.Map_data_to_char(res), groups);
				if (!gsub_mgr.Exec_repl_itm(tmp_bfr, regx_converter, match)) {
					// no replacement produced: keep the matched text; Ustring.Substring takes data offsets
					tmp_bfr.Add_str_u8(src_ucs.Substring(src_pos, res));
				}
				gsub_mgr.Repl_count__add();
			}

			// non-empty match; resume scanning at match_end
			if (res != -1 && res > src_pos)
				src_pos = res;
			// no match, or empty match; copy the current code point through and advance by one
			else if (src_pos < src_len) {
				// lbuf.append( (byte) src.Get_data( src_pos++ ) );
				tmp_bfr.Add_u8_int(src_ucs.Get_data(src_pos++));
			}
			else
				break;

			if (pat_is_anchored)
				break;
			if (src_pos > src_len) // XOWA:assert src_pos is in bounds, else will throw ArrayIndexOutOfBounds exception; DATE:2016-09-20
				break;
		}

		// add rest of String
		tmp_bfr.Add_str_u8(src_ucs.Substring(src_pos, src_len));
		return tmp_bfr.To_str_and_clear();
	}

	/**
	 * Converts a flat array of capture offsets into Regx_group objects.
	 *
	 * @param src_ucs text the captures refer to
	 * @param captures consecutive (bgn, end) pairs in data units; may be null
	 * @return one group per pair with char offsets and the captured text; Regx_group.Ary_empty when captures is null
	 */
	private Regx_group[] Make_groups(Ustring src_ucs, int[] captures) {
		if (captures == null) {
			return Regx_group.Ary_empty;
		}
		int captures_len = captures.length;
		Regx_group[] groups = new Regx_group[captures_len / 2];
		for (int i = 0; i < captures_len; i += 2) {
			int capture_bgn = captures[i];
			int capture_end = captures[i + 1];
			// data offsets -> char offsets, so they can index the Java String
			capture_bgn = src_ucs.Map_data_to_char(capture_bgn);
			capture_end = src_ucs.Map_data_to_char(capture_end);
			groups[i / 2] = new Regx_group(true, capture_bgn, capture_end, String_.Mid(src_ucs.Src(), capture_bgn, capture_end));
		}
		return groups;
	}
}