mirror of
https://github.com/gnosygnu/xowa.git
synced 2026-03-02 03:49:30 +00:00
Scribunto: Use Luaj for pattern-matching (instead of Java Regex) [#413]
This commit is contained in:
@@ -1,16 +1,17 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<classpath>
|
||||
<classpathentry kind="lib" path="lib/luaj_xowa.jar"/>
|
||||
<classpathentry exported="true" kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
|
||||
<classpathentry combineaccessrules="false" exported="true" kind="src" path="/100_core"/>
|
||||
<classpathentry combineaccessrules="false" exported="true" kind="src" path="/140_dbs"/>
|
||||
<classpathentry combineaccessrules="false" exported="true" kind="src" path="/150_gfui"/>
|
||||
<classpathentry kind="src" path="src"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/luaj_xowa.jar"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/jtidy_xowa.jar"/>
|
||||
<classpathentry exported="true" kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
|
||||
<classpathentry combineaccessrules="false" kind="src" path="/gplx.gflucene"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/icu4j-57_1.jar"/>
|
||||
<classpathentry kind="lib" path="lib/vnu.jar"/>
|
||||
<classpathentry kind="lib" path="lib/Saxon-HE-9.9.1-2.jar"/>
|
||||
<classpathentry combineaccessrules="false" kind="src" path="/baselib"/>
|
||||
<classpathentry kind="output" path="bin"/>
|
||||
</classpath>
|
||||
|
||||
@@ -1,51 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.core.intls; import gplx.*; import gplx.core.*;
|
||||
/**
 * Read-only view of a string that can be addressed in three units:
 * "codes" (Unicode code points), "chars" (Java {@code char} units of the source String)
 * and "bytes" (the encoded byte form returned by {@link #Src_bytes()}).
 *
 * The {@code Pos_x_to_y} methods translate a position expressed in unit x to unit y.
 * Positions equal to the length are accepted so that an end-of-string position can be mapped.
 */
public interface Unicode_string {
  /** @return true when the implementation stores one byte per code point; false otherwise */
  boolean Tid_is_single();

  /** @return the original String passed at construction (may be null) */
  String Src_string();

  /** @return the encoded bytes of the source (may be null) */
  byte[] Src_bytes();

  /** @return number of code points */
  int Len_codes();

  /** @return number of chars */
  int Len_chars();

  /** @return number of bytes */
  int Len_bytes();

  /** @return the code point stored at code-point index {@code i} */
  int Val_codes(int i);

  /** Maps a code-point position to a byte position. */
  int Pos_codes_to_bytes(int i);

  /** Maps a code-point position to a char position. */
  int Pos_codes_to_chars(int i);

  /** Maps a byte position to a char position. */
  int Pos_bytes_to_chars(int i);

  /** Maps a byte position to a code-point position. */
  int Pos_bytes_to_codes(int i);

  /** Maps a char position to a code-point position. */
  int Pos_chars_to_codes(int i);
}
|
||||
class Unicode_string_single implements Unicode_string { // 1 byte == 1 codepoint
|
||||
private final int[] codes;
|
||||
public Unicode_string_single(String src_string, byte[] src_bytes, int[] codes, int codes_len) {
|
||||
this.src_string = src_string;
|
||||
this.src_bytes = src_bytes;
|
||||
this.codes = codes;
|
||||
this.codes_len = codes_len;
|
||||
}
|
||||
public boolean Tid_is_single() {return true;}
|
||||
public String Src_string() {return src_string;} private final String src_string;
|
||||
public byte[] Src_bytes() {return src_bytes;} private final byte[] src_bytes;
|
||||
public int Len_codes() {return codes_len;} private final int codes_len;
|
||||
public int Len_chars() {return codes_len;}
|
||||
public int Len_bytes() {return codes_len;}
|
||||
public int Val_codes(int i) {return codes[i];}
|
||||
public int Pos_codes_to_bytes(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i); return i;}
|
||||
public int Pos_codes_to_chars(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i); return i;}
|
||||
public int Pos_bytes_to_chars(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i); return i;}
|
||||
public int Pos_bytes_to_codes(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i); return i;}
|
||||
public int Pos_chars_to_codes(int i) {if (i < 0 || i > codes_len) throw Err_.new_wo_type("invalid idx", "src", src_string, "idx", i); return i;}
|
||||
}
|
||||
@@ -1,48 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.core.intls; import gplx.*; import gplx.core.*;
|
||||
public class Unicode_string_ {
|
||||
public static Unicode_string New(String orig) {
|
||||
// null
|
||||
if (orig == null)
|
||||
return new Unicode_string_single(null, null, null, 0);
|
||||
|
||||
// init bytes
|
||||
byte[] bytes = Bry_.new_u8(orig);
|
||||
int bytes_len = bytes.length;
|
||||
|
||||
// init codes
|
||||
int[] codes = new int[bytes_len];
|
||||
int codes_len = 0;
|
||||
|
||||
// loop
|
||||
int bytes_pos = 0;
|
||||
int chars_pos = 0;
|
||||
while (bytes_pos < bytes_len) {
|
||||
// set codes
|
||||
codes[codes_len] = Utf16_.Decode_to_int(bytes, bytes_pos);
|
||||
|
||||
// increment
|
||||
int cur_byte_len = Utf8_.Len_of_char_by_1st_byte(bytes[bytes_pos]);
|
||||
bytes_pos += cur_byte_len;
|
||||
chars_pos += Utf8_.Len_of_char_by_bytes_len(cur_byte_len);
|
||||
codes_len += 1;
|
||||
}
|
||||
return codes_len == bytes_len
|
||||
? (Unicode_string)new Unicode_string_single(orig, bytes, codes, codes_len)
|
||||
: (Unicode_string)new Unicode_string_multi (orig, bytes, bytes_len, codes, codes_len, chars_pos);
|
||||
}
|
||||
}
|
||||
@@ -1,85 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.core.intls; import gplx.*; import gplx.core.*;
|
||||
class Unicode_string_multi implements Unicode_string {
|
||||
private final int[] codes;
|
||||
private final int[] codes_to_bytes;
|
||||
private final int[] codes_to_chars;
|
||||
private final int[] bytes_to_chars;
|
||||
private final int[] bytes_to_codes;
|
||||
private final int[] chars_to_codes;
|
||||
|
||||
public Unicode_string_multi(String src, byte[] bytes, int bytes_len, int[] codes, int codes_len, int chars_len) {
|
||||
// set member vars
|
||||
this.src = src;
|
||||
this.bytes = bytes;
|
||||
this.bytes_len = bytes_len;
|
||||
this.codes = codes;
|
||||
this.codes_len = codes_len;
|
||||
this.chars_len = chars_len;
|
||||
|
||||
// init maps
|
||||
this.codes_to_bytes = new int[codes_len + Adj_end];
|
||||
this.codes_to_chars = new int[codes_len + Adj_end];
|
||||
this.bytes_to_codes = New_int_ary(bytes_len);
|
||||
this.bytes_to_chars = New_int_ary(bytes_len);
|
||||
this.chars_to_codes = New_int_ary(chars_len);
|
||||
|
||||
// init loop
|
||||
int codes_pos = 0;
|
||||
int bytes_pos = 0;
|
||||
int chars_pos = 0;
|
||||
|
||||
// loop till EOS
|
||||
while (true) {
|
||||
// update
|
||||
codes_to_bytes[codes_pos] = bytes_pos;
|
||||
codes_to_chars[codes_pos] = chars_pos;
|
||||
bytes_to_chars[bytes_pos] = chars_pos;
|
||||
bytes_to_codes[bytes_pos] = codes_pos;
|
||||
chars_to_codes[chars_pos] = codes_pos;
|
||||
|
||||
if (bytes_pos == bytes_len) break;
|
||||
|
||||
// increment
|
||||
int cur_byte_len = Utf8_.Len_of_char_by_1st_byte(bytes[bytes_pos]);
|
||||
bytes_pos += cur_byte_len;
|
||||
chars_pos += Utf8_.Len_of_char_by_bytes_len(cur_byte_len);
|
||||
codes_pos += 1;
|
||||
}
|
||||
}
|
||||
public boolean Tid_is_single() {return false;}
|
||||
public String Src_string() {return src;} private final String src;
|
||||
public byte[] Src_bytes() {return bytes;} private final byte[] bytes;
|
||||
public int Len_codes() {return codes_len;} private final int codes_len;
|
||||
public int Len_chars() {return chars_len;} private final int chars_len;
|
||||
public int Len_bytes() {return bytes_len;} private final int bytes_len;
|
||||
public int Val_codes(int i) {return codes[i];}
|
||||
public int Pos_codes_to_bytes(int i) {return codes_to_bytes[i];}
|
||||
public int Pos_codes_to_chars(int i) {return codes_to_chars[i];}
|
||||
public int Pos_bytes_to_chars(int i) {int rv = bytes_to_chars[i]; if (rv == Invalid) throw Err_.new_wo_type("invalid i", "src", src, "type", "bytes_to_chars", "i", i); return rv;}
|
||||
public int Pos_bytes_to_codes(int i) {int rv = bytes_to_codes[i]; if (rv == Invalid) throw Err_.new_wo_type("invalid i", "src", src, "type", "bytes_to_codes", "i", i); return rv;}
|
||||
public int Pos_chars_to_codes(int i) {int rv = chars_to_codes[i]; if (rv == Invalid) throw Err_.new_wo_type("invalid i", "src", src, "type", "chars_to_codes", "i", i); return rv;}
|
||||
|
||||
private static final int Invalid = -1, Adj_end = 1; // +1 to store last pos as len of String; needed for regex which returns match.Find_end() which will be len of String; EX: abc -> [0, 1, 2, 3]
|
||||
private static int[] New_int_ary(int len) {
|
||||
int rv_len = len + Adj_end;
|
||||
int[] rv = new int[rv_len];
|
||||
for (int i = 0; i < rv_len; i++)
|
||||
rv[i] = Invalid;
|
||||
return rv;
|
||||
}
|
||||
}
|
||||
@@ -1,110 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.core.intls; import gplx.*; import gplx.core.*;
|
||||
import org.junit.*; import gplx.core.tests.*;
|
||||
public class Unicode_string_tst {
|
||||
private final Unicode_string_fxt fxt = new Unicode_string_fxt();
|
||||
@Test public void Null() {
|
||||
fxt.Init(null);
|
||||
fxt.Test__Len(0, 0, 0);
|
||||
}
|
||||
@Test public void Blank() {
|
||||
fxt.Init("");
|
||||
fxt.Test__Len(0, 0, 0);
|
||||
}
|
||||
@Test public void Single() {
|
||||
fxt.Init("Abc");
|
||||
fxt.Test__Len(3, 3, 3);
|
||||
fxt.Test__Val_codes(65, 98, 99);
|
||||
fxt.Test__Pos_codes_to_bytes(0, 1, 2, 3);
|
||||
fxt.Test__Pos_codes_to_chars(0, 1, 2, 3);
|
||||
fxt.Test__Pos_chars_to_codes(0, 1, 2, 3);
|
||||
fxt.Test__Pos_bytes_to_codes(0, 1, 2, 3);
|
||||
}
|
||||
@Test public void Multi() {
|
||||
fxt.Init("a¢€𤭢");
|
||||
fxt.Test__Len(4, 5, 10);
|
||||
fxt.Test__Val_codes(97, 162, 8364, 150370);
|
||||
fxt.Test__Pos_codes_to_bytes(0, 1, 3, 6, 10);
|
||||
fxt.Test__Pos_codes_to_chars(0, 1, 2, 3, 5);
|
||||
fxt.Test__Pos_chars_to_codes( 0, 1, 2, 3, -1, 4);
|
||||
fxt.Test__Pos_bytes_to_codes( 0, 1, -1, 2, -1, -1, 3, -1, -1, -1, 4);
|
||||
}
|
||||
}
|
||||
class Unicode_string_fxt {
|
||||
private Unicode_string under;
|
||||
public void Init(String src) {
|
||||
this.under = Unicode_string_.New(src);
|
||||
}
|
||||
public void Test__Len(int expd_codes, int expd_chars, int expd_bytes) {
|
||||
Gftest.Eq__int(expd_codes, under.Len_codes(), "codes");
|
||||
Gftest.Eq__int(expd_chars, under.Len_chars(), "chars");
|
||||
Gftest.Eq__int(expd_bytes, under.Len_bytes(), "bytes");
|
||||
}
|
||||
public void Test__Val_codes(int... expd) {
|
||||
int actl_len = under.Len_codes();
|
||||
int[] actl = new int[actl_len];
|
||||
for (int i = 0; i < actl_len; i++)
|
||||
actl[i] = under.Val_codes(i);
|
||||
Gftest.Eq__ary(expd, actl);
|
||||
}
|
||||
public void Test__Pos_codes_to_bytes(int... expd) {
|
||||
int actl_len = under.Len_codes() + 1;
|
||||
int[] actl = new int[actl_len];
|
||||
for (int i = 0; i < actl_len; i++)
|
||||
actl[i] = under.Pos_codes_to_bytes(i);
|
||||
Gftest.Eq__ary(expd, actl);
|
||||
}
|
||||
public void Test__Pos_codes_to_chars(int... expd) {
|
||||
int actl_len = under.Len_codes() + 1;
|
||||
int[] actl = new int[actl_len];
|
||||
for (int i = 0; i < actl_len; i++)
|
||||
actl[i] = under.Pos_codes_to_chars(i);
|
||||
Gftest.Eq__ary(expd, actl);
|
||||
}
|
||||
public void Test__Pos_bytes_to_codes(int... expd) {
|
||||
int actl_len = under.Len_bytes() + 1;
|
||||
int[] actl = new int[actl_len];
|
||||
for (int i = 0; i < actl_len; i++) {
|
||||
int val = 0;
|
||||
try {
|
||||
val = under.Pos_bytes_to_codes(i);
|
||||
}
|
||||
catch (Exception exc) {
|
||||
val = -1;
|
||||
Err_.Noop(exc);
|
||||
}
|
||||
actl[i] = val;
|
||||
}
|
||||
Gftest.Eq__ary(expd, actl);
|
||||
}
|
||||
public void Test__Pos_chars_to_codes(int... expd) {
|
||||
int actl_len = under.Len_chars() + 1;
|
||||
int[] actl = new int[actl_len];
|
||||
for (int i = 0; i < actl_len; i++) {
|
||||
int val = 0;
|
||||
try {
|
||||
val = under.Pos_chars_to_codes(i);
|
||||
}
|
||||
catch (Exception exc) {
|
||||
val = -1;
|
||||
Err_.Noop(exc);
|
||||
}
|
||||
actl[i] = val;
|
||||
}
|
||||
Gftest.Eq__ary(expd, actl);
|
||||
}
|
||||
}
|
||||
@@ -14,6 +14,7 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*;
|
||||
import gplx.objects.strings.unicodes.*;
|
||||
import gplx.core.intls.*; import gplx.langs.regxs.*;
|
||||
import gplx.xowa.parsers.*;
|
||||
import gplx.xowa.xtns.scribunto.procs.*;
|
||||
@@ -55,10 +56,10 @@ public class Scrib_lib_ustring implements Scrib_lib {
|
||||
boolean plain = args.Cast_bool_or_n(3);
|
||||
|
||||
// init text vars
|
||||
Unicode_string text_ucs = Unicode_string_.New(text_str); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23
|
||||
Ustring text_ucs = Ustring_.New_codepoints(text_str); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23
|
||||
|
||||
// convert bgn from base_1 to base_0
|
||||
int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_ucs.Len_codes());
|
||||
int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_ucs.Len_in_data());
|
||||
|
||||
/*
|
||||
int offset = 0;
|
||||
@@ -80,34 +81,35 @@ public class Scrib_lib_ustring implements Scrib_lib {
|
||||
// if plain, just do literal match of find and exit
|
||||
if (plain) {
|
||||
// find pos by literal match
|
||||
Unicode_string find_ucs = Unicode_string_.New(find_str);
|
||||
byte[] find_bry = find_ucs.Src_bytes();
|
||||
int pos = Bry_find_.Find_fwd(text_ucs.Src_bytes(), find_bry, text_ucs.Pos_codes_to_bytes(bgn_as_codes));
|
||||
Ustring find_ucs = Ustring_.New_codepoints(find_str);
|
||||
int pos = String_.FindFwd(text_str, find_str, bgn_as_codes);
|
||||
|
||||
// nothing found; return empty
|
||||
if (pos == Bry_find_.Not_found)
|
||||
// if nothing found, return empty
|
||||
if (pos == String_.Find_none)
|
||||
return rslt.Init_ary_empty();
|
||||
// else, convert char_idx to code_idx
|
||||
else
|
||||
pos = text_ucs.Map_char_to_data(pos);
|
||||
|
||||
// bgn: convert pos from bytes back to codes; also adjust for base1
|
||||
int bgn = text_ucs.Pos_bytes_to_codes(pos) + Base1;
|
||||
int bgn = pos + Base1;
|
||||
|
||||
// end: add find.Len_in_codes and adjust end for PHP/LUA
|
||||
int end = bgn + find_ucs.Len_codes() - End_adj;
|
||||
int end = bgn + find_ucs.Len_in_data() - End_adj;
|
||||
|
||||
return rslt.Init_many_objs(bgn, end);
|
||||
}
|
||||
|
||||
// run regex
|
||||
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
|
||||
Regx_match[] regx_rslts = Scrib_pattern_matcher_.Instance().Match(core.Ctx().Page().Url(), text_ucs, regx_converter, find_str, bgn_as_codes);
|
||||
if (regx_rslts.length == 0) return rslt.Init_ary_empty();
|
||||
// run regex; NOTE: take only 1st result; DATE:2014-08-27
|
||||
Scrib_pattern_matcher matcher = Scrib_pattern_matcher.New(core.Page_url());
|
||||
Regx_match match = matcher.Match_one(text_ucs, find_str, bgn_as_codes, true);
|
||||
if (match.Rslt_none()) return rslt.Init_null(); // null verified on MW; EX: =mw.ustring.find("abc", "z"); DATE:2019-04-11
|
||||
|
||||
// add to tmp_list
|
||||
Regx_match match = regx_rslts[0]; // NOTE: take only 1st result; DATE:2014-08-27
|
||||
List_adp tmp_list = List_adp_.New();
|
||||
tmp_list.Add(text_ucs.Pos_chars_to_codes(match.Find_bgn()) + Scrib_lib_ustring.Base1);
|
||||
tmp_list.Add(text_ucs.Pos_chars_to_codes(match.Find_end()) + Scrib_lib_ustring.Base1 - Scrib_lib_ustring.End_adj);
|
||||
AddCapturesFromMatch(tmp_list, match, text_str, regx_converter.Capt_ary(), false);
|
||||
tmp_list.Add(text_ucs.Map_char_to_data(match.Find_bgn()) + Scrib_lib_ustring.Base1);
|
||||
tmp_list.Add(text_ucs.Map_char_to_data(match.Find_end()) + Scrib_lib_ustring.Base1 - Scrib_lib_ustring.End_adj);
|
||||
AddCapturesFromMatch(tmp_list, match, text_str, matcher.Capt_ary(), false);
|
||||
return rslt.Init_many_list(tmp_list);
|
||||
}
|
||||
public boolean Match(Scrib_proc_args args, Scrib_proc_rslt rslt) {
|
||||
@@ -119,41 +121,42 @@ public class Scrib_lib_ustring implements Scrib_lib {
|
||||
// validate / adjust
|
||||
if (text_str == null) // if no text_str is passed, do not fail; return empty; EX:d:changed; DATE:2014-02-06
|
||||
return rslt.Init_many_list(List_adp_.Noop);
|
||||
Unicode_string text_ucs = Unicode_string_.New(text_str); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23
|
||||
int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_ucs.Len_codes());
|
||||
Ustring text_ucs = Ustring_.New_codepoints(text_str); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23
|
||||
int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_ucs.Len_in_data());
|
||||
|
||||
// run regex
|
||||
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
|
||||
Regx_match[] regx_rslts = Scrib_pattern_matcher_.Instance().Match(core.Ctx().Page().Url(), text_ucs, regx_converter, find_str, bgn_as_codes);
|
||||
if (regx_rslts.length == 0) return rslt.Init_null(); // return null if no matches found; EX:w:Mount_Gambier_(volcano); DATE:2014-04-02; confirmed with en.d:民; DATE:2015-01-30
|
||||
// run regex; NOTE add 1st match only; do not add all; PAGE:en.d:действительное_причастие_настоящего_времени DATE:2017-04-23
|
||||
Scrib_pattern_matcher matcher = Scrib_pattern_matcher.New(core.Page_url());
|
||||
Regx_match match = matcher.Match_one(text_ucs, find_str, bgn_as_codes, true);
|
||||
if (match.Rslt_none()) return rslt.Init_null(); // return null if no matches found; EX:w:Mount_Gambier_(volcano); DATE:2014-04-02; confirmed with en.d:民; DATE:2015-01-30
|
||||
|
||||
// TOMBSTONE: add 1st match only; do not add all; PAGE:en.d:действительное_причастие_настоящего_времени DATE:2017-04-23
|
||||
regx_rslts = regx_converter.Adjust_balanced(regx_rslts);
|
||||
List_adp tmp_list = List_adp_.New();
|
||||
AddCapturesFromMatch(tmp_list, regx_rslts[0], text_str, regx_converter.Capt_ary(), true);
|
||||
AddCapturesFromMatch(tmp_list, match, text_str, matcher.Capt_ary(), true);
|
||||
return rslt.Init_many_list(tmp_list);
|
||||
}
|
||||
public boolean Gsub(Scrib_proc_args args, Scrib_proc_rslt rslt) {
|
||||
Scrib_lib_ustring_gsub_mgr gsub_mgr = new Scrib_lib_ustring_gsub_mgr(core, new Scrib_regx_converter());
|
||||
Scrib_lib_ustring_gsub_mgr gsub_mgr = new Scrib_lib_ustring_gsub_mgr(core);
|
||||
return gsub_mgr.Exec(args, rslt);
|
||||
}
|
||||
public boolean Gmatch_init(Scrib_proc_args args, Scrib_proc_rslt rslt) {
|
||||
// String text = Scrib_kv_utl_.Val_to_str(values, 0);
|
||||
String regx = args.Pull_str(1);
|
||||
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
|
||||
String pcre = regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_null, true);
|
||||
return rslt.Init_many_objs(pcre, regx_converter.Capt_ary());
|
||||
if (Scrib_pattern_matcher.Mode_is_xowa())
|
||||
regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_null, true);
|
||||
else
|
||||
regx = regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_null, true);
|
||||
return rslt.Init_many_objs(regx, regx_converter.Capt_ary());
|
||||
}
|
||||
public boolean Gmatch_callback(Scrib_proc_args args, Scrib_proc_rslt rslt) {
|
||||
String text = args.Xstr_str_or_null(0); // NOTE: UstringLibrary.php!ustringGmatchCallback calls preg_match directly; $s can be any type, and php casts automatically;
|
||||
String regx = args.Pull_str(1);
|
||||
Keyval[] capt = args.Cast_kv_ary_or_null(2);
|
||||
int pos = args.Pull_int(3);
|
||||
Regx_adp regx_adp = Scrib_lib_ustring.RegxAdp_new_(core.Ctx().Page().Url(), regx);
|
||||
Regx_match[] regx_rslts = regx_adp.Match_all(text, pos);
|
||||
int len = regx_rslts.length;
|
||||
if (len == 0) return rslt.Init_many_objs(pos, Keyval_.Ary_empty);
|
||||
Regx_match match = regx_rslts[0]; // NOTE: take only 1st result
|
||||
|
||||
Ustring text_ucs = Ustring_.New_codepoints(text);
|
||||
// int pos_as_codes = To_java_by_lua(pos, text_ucs.Len_in_data());
|
||||
Regx_match match = Scrib_pattern_matcher.New(core.Page_url()).Match_one(text_ucs, regx, pos, false);
|
||||
if (match.Rslt_none()) return rslt.Init_many_objs(pos, Keyval_.Ary_empty);
|
||||
List_adp tmp_list = List_adp_.New();
|
||||
AddCapturesFromMatch(tmp_list, match, text, capt, true); // NOTE: was incorrectly set as false; DATE:2014-04-23
|
||||
return rslt.Init_many_objs(match.Find_end(), Scrib_kv_utl_.base1_list_(tmp_list));
|
||||
@@ -198,12 +201,12 @@ public class Scrib_lib_ustring implements Scrib_lib {
|
||||
&& tmp_list.Count() == 0) // only add match once; EX: "aaaa", "a" will have four matches; get 1st; DATE:2014-04-02
|
||||
tmp_list.Add(String_.Mid(text, rslt.Find_bgn(), rslt.Find_end()));
|
||||
}
|
||||
public static Regx_adp RegxAdp_new_(Xoa_url url, String regx) {
|
||||
public static Regx_adp RegxAdp_new_(byte[] page_url, String regx) {
|
||||
Regx_adp rv = Regx_adp_.new_(regx);
|
||||
if (rv.Pattern_is_invalid()) {
|
||||
// try to identify [z-a] errors; PAGE:https://en.wiktionary.org/wiki/Module:scripts/data; DATE:2017-04-23
|
||||
Exception exc = rv.Pattern_is_invalid_exception();
|
||||
Gfo_usr_dlg_.Instance.Log_many("", "", "regx is invalid: regx=~{0} page=~{1} exc=~{2}", regx, url.To_bry(), Err_.Message_gplx_log(exc));
|
||||
Gfo_usr_dlg_.Instance.Log_many("", "", "regx is invalid: regx=~{0} page=~{1} exc=~{2}", regx, page_url, Err_.Message_gplx_log(exc));
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
|
||||
@@ -40,7 +40,7 @@ public class Scrib_lib_ustring__find__tst {
|
||||
fxt.Test__find("𤭢𤭢b𤭢𤭢b" , "b" , 2, Bool_.N, "3;3"); // bytes=4
|
||||
fxt.Test__find("abcd" , "b" , 1, Bool_.N, "2;2"); // basic
|
||||
fxt.Test__find("abad" , "a" , 2, Bool_.N, "3;3"); // bgn
|
||||
fxt.Test__find("abcd" , "x" , 1, Bool_.N, ""); // no-match
|
||||
fxt.Test__find("abcd" , "x" , 1, Bool_.N, String_.Null_mark); // no-match
|
||||
fxt.Test__find("abcd" , "" , 2, Bool_.N, "2;1"); // empty regx should return values; regx; EX:w:Fool's_mate; DATE:2014-03-04
|
||||
}
|
||||
@Test public void Regx__int() { // PURPOSE: allow int find; PAGE:ro.w:Innsbruck DATE:2015-09-12
|
||||
@@ -64,6 +64,9 @@ public class Scrib_lib_ustring__find__tst {
|
||||
fxt.Test__find("aé𡼾\nbî𡼾\n" , "" , 1, Bool_.N, "1;0"); // 4 b/c \n starts at pos 4 (super 1)
|
||||
fxt.Test__find("aé𡼾\nbî𡼾\n" , "" , 5, Bool_.N, "5;4"); // 8 b/c \n starts at pos 8 (super 1)
|
||||
}
|
||||
@Test public void Balanced__numbered_1() { // PURPOSE: handle mix of balanced and regular capture; PAGE:en.w:Bahamas
|
||||
fxt.Test__find("[[5]]XccY", "%b[]X(%a)%1Y", 1, Bool_.N, "1;9;c");
|
||||
}
|
||||
}
|
||||
class Scrib_lib_ustring__find__fxt {
|
||||
private boolean dbg = false;
|
||||
|
||||
@@ -16,7 +16,7 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*;
|
||||
import org.junit.*; import gplx.xowa.xtns.scribunto.engines.mocks.*;
|
||||
public class Scrib_lib_ustring__gmatch__tst {
|
||||
private final Mock_scrib_fxt fxt = new Mock_scrib_fxt(); private Scrib_lib lib;
|
||||
private final Mock_scrib_fxt fxt = new Mock_scrib_fxt(); private Scrib_lib lib;
|
||||
@Before public void init() {
|
||||
fxt.Clear();
|
||||
lib = fxt.Core().Lib_ustring().Init();
|
||||
@@ -48,4 +48,7 @@ public class Scrib_lib_ustring__gmatch__tst {
|
||||
, " 1=2"
|
||||
));
|
||||
}
|
||||
@Test public void Callback__pattern() {
|
||||
fxt.Test__proc__objs__nest(lib, Scrib_lib_ustring.Invk_gmatch_callback, Object_.Ary("a", "%a+", Scrib_kv_utl_.base1_many_(false), 1) , "1=1\n2="); // fails if "a" is returned; note that 1 should be eos
|
||||
}
|
||||
}
|
||||
|
||||
@@ -43,6 +43,15 @@ public class Scrib_lib_ustring__gsub__tst {
|
||||
@Test public void Replace__double() { // PURPOSE: do not fail if double is passed in for @replace; PAGE:de.v:Wikivoyage:Wikidata/Test_Modul:Wikidata2 DATE:2016-04-21
|
||||
Exec_gsub("abcd", 1 , -1, 1.23d , "abcd;0");
|
||||
}
|
||||
@Test public void Replace__anypos() { // PURPOSE:LUAJ_PATTERN_REPLACEMENT; DATE:2019-04-16
|
||||
Exec_gsub("'''a'''b", "()'''(.-'*)'''", 1, "z", "zb;1");
|
||||
}
|
||||
@Test public void Replace__balanced_and_grouping() { // PURPOSE:LUAJ_PATTERN_REPLACEMENT; DATE:2019-04-16
|
||||
Exec_gsub("[[b]]", "%[(%b[])%]" , -1, "z" , "z;1"); // NOTE: not "[z]"
|
||||
}
|
||||
@Test public void Replace__initial() { // PURPOSE:whitespace being replaced during gsub replacement; DATE:2019-04-21
|
||||
Exec_gsub("a b c", "^%s*", -1, "x", "xa b c;1"); // fails if xabxc
|
||||
}
|
||||
@Test public void Replace__table() {
|
||||
Exec_gsub("abcd", "[ac]" , -1, Scrib_kv_utl_.flat_many_("a", "A", "c", "C") , "AbCd;2");
|
||||
Exec_gsub("abc" , "[ab]" , -1, Scrib_kv_utl_.flat_many_("a", "A") , "Abc;2"); // PURPOSE: match not in regex should still print itself; in this case [c] is not in tbl regex; DATE:2014-03-31
|
||||
@@ -122,6 +131,17 @@ public class Scrib_lib_ustring__gsub__tst {
|
||||
fxt.Init__cbk(proc);
|
||||
Exec_gsub(text, regx, -1, proc.To_scrib_lua_proc(), "aBYz;2");
|
||||
}
|
||||
@Test public void Luacbk__balanced() { // PURPOSE:LUAJ_PATTERN_REPLACEMENT; DATE:2019-04-16
|
||||
String text = "}a{{b}}c{{d}}";
|
||||
String regx = "%b{}"; // "()" is anypos, which inserts find_pos to results
|
||||
Mock_proc__verify_args proc = new Mock_proc__verify_args(0, new Object[]{"x", "{{b}}"}, new Object[]{"y", "{{d}}"});
|
||||
fxt.Init__cbk(proc);
|
||||
Exec_gsub(text, regx, -1, proc.To_scrib_lua_proc(), "}axcy;2");
|
||||
}
|
||||
// Mock_proc__verify_args proc = new Mock_proc__verify_args(0, new Object[]{"x", "{{yes2}}"}, new Object[]{"x", "{{flagicon|USA}}"});
|
||||
// fxt.Init__cbk(proc);
|
||||
// Exec_gsub("}\n|-\n|28\n|{{yes2}}Win\n|28–0\n|style=\"text-align:left;\"|{{flagicon|USA}}", "%b{}", -1, proc.To_scrib_lua_proc(), "}axbx;2"); }
|
||||
//
|
||||
private void Exec_gsub(String text, Object regx, int limit, Object repl, String expd) {
|
||||
fxt.Test__proc__kvps__flat(lib, Scrib_lib_ustring.Invk_gsub, Scrib_kv_utl_.base1_many_(text, regx, repl, limit), expd);
|
||||
}
|
||||
|
||||
@@ -15,35 +15,39 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*;
|
||||
import gplx.langs.regxs.*;
|
||||
import gplx.objects.strings.unicodes.*;
|
||||
import gplx.xowa.xtns.scribunto.libs.patterns.*;
|
||||
import gplx.xowa.xtns.scribunto.procs.*;
|
||||
class Scrib_lib_ustring_gsub_mgr {
|
||||
public class Scrib_lib_ustring_gsub_mgr { // THREAD.UNSAFE:LOCAL_VALUES
|
||||
private final Scrib_core core;
|
||||
private final Scrib_regx_converter regx_converter;
|
||||
private String src_str;
|
||||
private String pat_str;
|
||||
private int limit;
|
||||
private byte repl_tid;
|
||||
private byte[] repl_bry; private Hash_adp repl_hash; private Scrib_lua_proc repl_func;
|
||||
private int repl_count = 0;
|
||||
public Scrib_lib_ustring_gsub_mgr(Scrib_core core, Scrib_regx_converter regx_converter) {
|
||||
public int repl_count = 0;
|
||||
public Scrib_lib_ustring_gsub_mgr(Scrib_core core) {
|
||||
this.core = core;
|
||||
this.regx_converter = regx_converter;
|
||||
}
|
||||
public void Repl_count__add() {repl_count++;}
|
||||
public boolean Repl_count__done() {return repl_count == limit;}
|
||||
public boolean Exec(Scrib_proc_args args, Scrib_proc_rslt rslt) {
|
||||
// get @text; NOTE: sometimes int; DATE:2013-11-06
|
||||
String text = args.Xstr_str_or_null(0);
|
||||
if (args.Len() == 2) return rslt.Init_obj(text); // if no @replace, return @text; PAGE:en.d:'orse; DATE:2013-10-13
|
||||
// get @src_str; NOTE: sometimes int; DATE:2013-11-06
|
||||
this.src_str = args.Xstr_str_or_null(0);
|
||||
if (args.Len() == 2) return rslt.Init_obj(src_str); // if no @replace, return @src_str; PAGE:en.d:'orse; DATE:2013-10-13
|
||||
|
||||
// get @pattern; NOTE: sometimes int; PAGE:en.d:λύω; DATE:2014-09-02
|
||||
String regx = args.Xstr_str_or_null(1);
|
||||
regx = regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_pow, true);
|
||||
this.pat_str = args.Xstr_str_or_null(1);
|
||||
|
||||
// get @repl
|
||||
Object repl_obj = args.Cast_obj_or_null(2);
|
||||
byte repl_tid = Identify_repl(repl_obj);
|
||||
this.repl_tid = Identify_repl(repl_obj);
|
||||
|
||||
// get @limit; reset repl_count
|
||||
int limit = args.Cast_int_or(3, -1);
|
||||
repl_count = 0;
|
||||
this.limit = args.Cast_int_or(3, -1);
|
||||
|
||||
// do repl
|
||||
String repl = Exec_repl(repl_tid, text, regx, limit);
|
||||
String repl = Scrib_pattern_matcher.New(core.Page_url()).Gsub(this, Ustring_.New_codepoints(src_str), pat_str, 0);
|
||||
return rslt.Init_many_objs(repl, repl_count);
|
||||
}
|
||||
private byte Identify_repl(Object repl_obj) {
|
||||
@@ -80,44 +84,7 @@ class Scrib_lib_ustring_gsub_mgr {
|
||||
throw Err_.new_unhandled(Type_.Name(repl_type));
|
||||
return repl_tid;
|
||||
}
|
||||
private String Exec_repl(byte repl_tid, String text, String regx, int limit) {
|
||||
// parse regx
|
||||
Regx_adp regx_mgr = Scrib_lib_ustring.RegxAdp_new_(core.Ctx().Page().Url(), regx);
|
||||
if (regx_mgr.Pattern_is_invalid()) return text; // NOTE: invalid patterns should return self; EX:[^]; DATE:2014-09-02)
|
||||
|
||||
// exec regx
|
||||
Regx_match[] rslts = regx_mgr.Match_all(text, 0);
|
||||
if (rslts.length == 0) return text; // PHP: If matches are found, the new subject will be returned, otherwise subject will be returned unchanged.; http://php.net/manual/en/function.preg-replace-callback.php
|
||||
rslts = regx_converter.Adjust_balanced(rslts);
|
||||
|
||||
Bry_bfr tmp_bfr = Bry_bfr_.New();
|
||||
int rslts_len = rslts.length;
|
||||
int text_pos = 0;
|
||||
for (int i = 0; i < rslts_len; i++) {
|
||||
if (repl_count == limit) break; // stop if repl_count reaches limit; note that limit = -1 by default, unless specified
|
||||
|
||||
// add text up to find.bgn
|
||||
Regx_match rslt = rslts[i];
|
||||
tmp_bfr.Add_str_u8(String_.Mid(text, text_pos, rslt.Find_bgn())); // NOTE: regx returns char text_pos (not bry); must add as String, not bry; DATE:2013-07-17
|
||||
|
||||
// replace result
|
||||
if (!Exec_repl_itm(tmp_bfr, repl_tid, text, rslt)) {
|
||||
// will be false when gsub_proc returns nothing; PAGE:en.d:tracer PAGE:en.d:שלום DATE:2017-04-22;
|
||||
tmp_bfr.Add_str_u8(String_.Mid(text, rslt.Find_bgn(), rslt.Find_end()));
|
||||
}
|
||||
|
||||
// update
|
||||
text_pos = rslt.Find_end();
|
||||
repl_count++;
|
||||
}
|
||||
|
||||
// add rest of String
|
||||
int text_len = String_.Len(text);
|
||||
if (text_pos < text_len)
|
||||
tmp_bfr.Add_str_u8(String_.Mid(text, text_pos, text_len)); // NOTE: regx returns char text_pos (not bry); must add as String, not bry; DATE:2013-07-17
|
||||
return tmp_bfr.To_str_and_clear();
|
||||
}
|
||||
private boolean Exec_repl_itm(Bry_bfr tmp_bfr, byte repl_tid, String text, Regx_match match) {
|
||||
public boolean Exec_repl_itm(Bry_bfr tmp_bfr, Scrib_regx_converter regx_converter, Regx_match match) {
|
||||
switch (repl_tid) {
|
||||
case Repl_tid_string:
|
||||
int len = repl_bry.length;
|
||||
@@ -137,15 +104,15 @@ class Scrib_lib_ustring_gsub_mgr {
|
||||
// REF.MW: https://github.com/wikimedia/mediawiki-extensions-Scribunto/blob/master/includes/engines/LuaCommon/UstringLibrary.php#L785-L796
|
||||
// NOTE: 0 means take result; REF.MW:if ($x === '0'); return $m[0]; PAGE:Wikipedia:Wikipedia_Signpost/Templates/Voter/testcases; DATE:2015-08-02
|
||||
if (idx == 0)
|
||||
tmp_bfr.Add_str_u8(String_.Mid(text, match.Find_bgn(), match.Find_end()));
|
||||
tmp_bfr.Add_str_u8(String_.Mid(src_str, match.Find_bgn(), match.Find_end()));
|
||||
// NOTE: > 0 means get from groups if it exists; REF.MW:elseif (isset($m["m$x"])) return $m["m$x"]; PAGE:Wikipedia:Wikipedia_Signpost/Templates/Voter/testcases; DATE:2015-08-02
|
||||
else if (idx - 1 < match.Groups().length) { // retrieve numbered capture; TODO_OLD: support more than 9 captures
|
||||
Regx_group grp = match.Groups()[idx - 1];
|
||||
tmp_bfr.Add_str_u8(String_.Mid(text, grp.Bgn(), grp.End())); // NOTE: grp.Bgn() / .End() is for String pos (bry pos will fail for utf8 strings)
|
||||
tmp_bfr.Add_str_u8(String_.Mid(src_str, grp.Bgn(), grp.End())); // NOTE: grp.Bgn() / .End() is for String pos (bry pos will fail for utf8 strings)
|
||||
}
|
||||
// NOTE: 1 per MW "Match undocumented Lua String.gsub behavior"; PAGE:en.d:Wiktionary:Scripts ISSUE#:393; DATE:2019-03-20
|
||||
else if (idx == 1) {
|
||||
tmp_bfr.Add_str_u8(String_.Mid(text, match.Find_bgn(), match.Find_end()));
|
||||
tmp_bfr.Add_str_u8(String_.Mid(src_str, match.Find_bgn(), match.Find_end()));
|
||||
}
|
||||
else {
|
||||
throw Err_.new_wo_type("invalid capture index %" + Char_.To_str(b) + " in replacement String");
|
||||
@@ -180,7 +147,7 @@ class Scrib_lib_ustring_gsub_mgr {
|
||||
match_bgn = grp.Bgn();
|
||||
match_end = grp.End();
|
||||
}
|
||||
String find_str = String_.Mid(text, match_bgn, match_end); // NOTE: rslt.Bgn() / .End() is for String pos (bry pos will fail for utf8 strings)
|
||||
String find_str = String_.Mid(src_str, match_bgn, match_end); // NOTE: rslt.Bgn() / .End() is for String pos (bry pos will fail for utf8 strings)
|
||||
Object actl_repl_obj = repl_hash.Get_by(find_str);
|
||||
if (actl_repl_obj == null) // match found, but no replacement specified; EX:"abc", "[ab]", "a:A"; "b" in regex but not in tbl; EX:d:DVD; DATE:2014-03-31
|
||||
tmp_bfr.Add_str_u8(find_str);
|
||||
@@ -194,7 +161,7 @@ class Scrib_lib_ustring_gsub_mgr {
|
||||
int grps_len = grps.length;
|
||||
// no grps; pass 1 arg based on @match: EX: ("ace", "[b-d]"); args -> ("c")
|
||||
if (grps_len == 0) {
|
||||
String find_str = String_.Mid(text, match.Find_bgn(), match.Find_end());
|
||||
String find_str = String_.Mid(src_str, match.Find_bgn(), match.Find_end());
|
||||
luacbk_args = Scrib_kv_utl_.base1_obj_(find_str);
|
||||
}
|
||||
// grps exist; pass n args based on grp[n].match; EX: ("acfg", "([b-d])([e-g])"); args -> ("c", "f")
|
||||
@@ -202,7 +169,7 @@ class Scrib_lib_ustring_gsub_mgr {
|
||||
// memoize any_pos args for loop
|
||||
boolean any_pos = regx_converter.Any_pos();
|
||||
Keyval[] capt_ary = regx_converter.Capt_ary();
|
||||
int capt_ary_len = capt_ary.length;
|
||||
int capt_ary_len = capt_ary == null ? 0 : capt_ary.length; // capt_ary can be null b/c xowa_gsub will always create one group;
|
||||
|
||||
// loop grps; for each grp, create corresponding arg in luacbk
|
||||
luacbk_args = new Keyval[grps_len];
|
||||
@@ -212,7 +179,7 @@ class Scrib_lib_ustring_gsub_mgr {
|
||||
// anypos will create @offset arg; everything else creates a @match arg based on grp
|
||||
Object val = any_pos && i < capt_ary_len && Bool_.Cast(capt_ary[i].Val())
|
||||
? (Object)grp.Bgn()
|
||||
: (Object)String_.Mid(text, grp.Bgn(), grp.End());
|
||||
: (Object)String_.Mid(src_str, grp.Bgn(), grp.End());
|
||||
luacbk_args[i] = Keyval_.int_(i + Scrib_core.Base_1, val);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,31 +14,33 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*;
|
||||
import gplx.core.brys.fmtrs.*; import gplx.core.intls.*;
|
||||
import gplx.objects.strings.unicodes.*;
|
||||
import gplx.core.intls.*;
|
||||
import gplx.core.brys.fmtrs.*;
|
||||
import gplx.langs.regxs.*;
|
||||
public class Scrib_regx_converter {
|
||||
public class Scrib_regx_converter {// THREAD.UNSAFE:MULTIPLE_RETURN_VALUES
|
||||
private final Scrib_regx_grp_mgr grp_mgr = new Scrib_regx_grp_mgr();
|
||||
private final Bry_bfr bfr = Bry_bfr_.New();
|
||||
private Bry_bfr tmp_bfr;
|
||||
private Bry_fmtr fmtr_balanced; private Bry_bfr bfr_balanced;
|
||||
private final Lua_cls_to_regx_map percent_map, brack_map;
|
||||
public Scrib_regx_converter() {
|
||||
percent_map = Lua_cls_matcher.Instance.Percent();
|
||||
brack_map = Lua_cls_matcher.Instance.Brack();
|
||||
}
|
||||
|
||||
public String Regx() {return regx;} private String regx;
|
||||
public Keyval[] Capt_ary() {return grp_mgr.Capt__to_ary();}
|
||||
public boolean Any_pos() {return any_pos;} private boolean any_pos;
|
||||
public Regx_match[] Adjust_balanced(Regx_match[] rslts) {return grp_mgr.Adjust_balanced(rslts);}
|
||||
public Regx_match[] Adjust_balanced(Regx_match[] rslts) {return grp_mgr.Adjust_balanced_many(rslts);}
|
||||
public Regx_match Adjust_balanced_one(Regx_match rslt) {return grp_mgr.Adjust_balanced_one(rslt);}
|
||||
public String patternToRegex(String pat_str, byte[] anchor, boolean mode_is_regx) {
|
||||
Unicode_string pat_ucs = Unicode_string_.New(pat_str);
|
||||
Ustring pat_ucs = Ustring_.New_codepoints(pat_str);
|
||||
// TODO.CACHE: if (!$this->patternRegexCache->has($cacheKey))
|
||||
grp_mgr.Clear();
|
||||
any_pos = false;
|
||||
boolean q_flag = false;
|
||||
Bry_bfr bfr = Bry_bfr_.New();
|
||||
Bry_bfr tmp_bfr = null;
|
||||
Bry_fmtr fmtr_balanced = null;
|
||||
Bry_bfr bfr_balanced = null;
|
||||
Lua_cls_to_regx_map percent_map = Lua_cls_matcher.Instance.Percent();
|
||||
Lua_cls_to_regx_map brack_map = Lua_cls_matcher.Instance.Brack();
|
||||
|
||||
// bfr.Add_byte(Byte_ascii.Slash); // TOMBSTONE: do not add PHP "/" at start
|
||||
int len = pat_ucs.Len_codes();
|
||||
int len = pat_ucs.Len_in_data();
|
||||
int grps_len = 0;
|
||||
int bct = 0;
|
||||
|
||||
@@ -46,7 +48,7 @@ public class Scrib_regx_converter {
|
||||
for (int i = 0; i < len; i++) {
|
||||
int i_end = i + 1;
|
||||
q_flag = false; // must be reset; REF.MW:UstringLibrary.php|patternToRegex; DATE:2014-02-08
|
||||
int cur = pat_ucs.Val_codes(i);
|
||||
int cur = pat_ucs.Get_data(i);
|
||||
switch (cur) {
|
||||
case Byte_ascii.Pow:
|
||||
if (!mode_is_regx) {
|
||||
@@ -71,7 +73,7 @@ public class Scrib_regx_converter {
|
||||
int grp_idx = grp_mgr.Capt__len() + 1;
|
||||
|
||||
// check for "()"; enables anypos flag
|
||||
boolean is_empty_capture = pat_ucs.Val_codes(i + 1) == Byte_ascii.Paren_end;
|
||||
boolean is_empty_capture = pat_ucs.Get_data(i + 1) == Byte_ascii.Paren_end;
|
||||
if (is_empty_capture)
|
||||
any_pos = true;
|
||||
grp_mgr.Capt__add__real(grp_idx, is_empty_capture);
|
||||
@@ -93,19 +95,19 @@ public class Scrib_regx_converter {
|
||||
i++;
|
||||
if (i >= len)
|
||||
throw Err_.new_wo_type("malformed pattern (ends with '%')");
|
||||
byte[] percent_bry = percent_map.Get_or_null(pat_ucs.Val_codes(i));
|
||||
byte[] percent_bry = percent_map.Get_or_null(pat_ucs.Get_data(i));
|
||||
if (percent_bry != null) {
|
||||
bfr.Add(percent_bry);
|
||||
q_flag = true;
|
||||
}
|
||||
else {
|
||||
int nxt = pat_ucs.Val_codes(i);
|
||||
int nxt = pat_ucs.Get_data(i);
|
||||
switch (nxt) {
|
||||
case Byte_ascii.Ltr_b: // EX: "%b()"
|
||||
i += 2;
|
||||
if (i >= len) throw Err_.new_wo_type("malformed pattern (missing arguments to '%b')");
|
||||
int char_0 = pat_ucs.Val_codes(i - 1);
|
||||
int char_1 = pat_ucs.Val_codes(i);
|
||||
int char_0 = pat_ucs.Get_data(i - 1);
|
||||
int char_1 = pat_ucs.Get_data(i);
|
||||
if (char_0 == char_1) { // same char: easier regex; REF.MW: $bfr .= "{$d1}[^$d1]*$d1";
|
||||
bfr.Add(Bry_bf0_seg_0);
|
||||
Regx_quote(bfr, char_0);
|
||||
@@ -133,11 +135,11 @@ public class Scrib_regx_converter {
|
||||
}
|
||||
break;
|
||||
case Byte_ascii.Ltr_f: { // EX: lua frontier pattern; "%f[%a]"; DATE:2015-07-21
|
||||
if (i + 1 >= len || pat_ucs.Val_codes(++i) != Byte_ascii.Brack_bgn)
|
||||
if (i + 1 >= len || pat_ucs.Get_data(++i) != Byte_ascii.Brack_bgn)
|
||||
throw Err_.new_("scribunto", "missing '[' after %f in pattern at pattern character " + Int_.To_str(i_end));
|
||||
// %f always followed by bracketed term; convert lua bracketed term to regex
|
||||
if (tmp_bfr == null) tmp_bfr = Bry_bfr_.New();
|
||||
i = bracketedCharSetToRegex(tmp_bfr, pat_ucs, i, len);
|
||||
i = bracketedCharSetToRegex(tmp_bfr, brack_map, pat_ucs, i, len);
|
||||
byte[] re2 = tmp_bfr.To_bry_and_clear();
|
||||
|
||||
// scrib has following comment: 'Because %f considers the beginning and end of the String to be \0, determine if $re2 matches that and take it into account with "^" and "$".'
|
||||
@@ -169,7 +171,7 @@ public class Scrib_regx_converter {
|
||||
bfr.Add_byte(Byte_ascii.Brack_bgn);
|
||||
continue;
|
||||
}
|
||||
i = bracketedCharSetToRegex(bfr, pat_ucs, i, len);
|
||||
i = bracketedCharSetToRegex(bfr, brack_map, pat_ucs, i, len);
|
||||
q_flag = true;
|
||||
break;
|
||||
case Byte_ascii.Brack_end:
|
||||
@@ -196,7 +198,7 @@ public class Scrib_regx_converter {
|
||||
break;
|
||||
}
|
||||
if (q_flag && i + 1 < len) {
|
||||
int tmp_b = pat_ucs.Val_codes(i + 1);
|
||||
int tmp_b = pat_ucs.Get_data(i + 1);
|
||||
switch (tmp_b) {
|
||||
case Byte_ascii.Star:
|
||||
case Byte_ascii.Plus:
|
||||
@@ -217,35 +219,35 @@ public class Scrib_regx_converter {
|
||||
regx = bfr.To_str_and_clear();
|
||||
return regx;
|
||||
}
|
||||
private int bracketedCharSetToRegex(Bry_bfr bfr, Unicode_string pat_ucs, int i, int len) {
|
||||
private int bracketedCharSetToRegex(Bry_bfr bfr, Lua_cls_to_regx_map brack_map, Ustring pat_ucs, int i, int len) {
|
||||
bfr.Add_byte(Byte_ascii.Brack_bgn);
|
||||
i++;
|
||||
if (i < len && pat_ucs.Val_codes(i) == Byte_ascii.Pow) { // ^
|
||||
if (i < len && pat_ucs.Get_data(i) == Byte_ascii.Pow) { // ^
|
||||
bfr.Add_byte(Byte_ascii.Pow);
|
||||
i++;
|
||||
}
|
||||
for (int j = i; i < len && (j == i || pat_ucs.Val_codes(i) != Byte_ascii.Brack_end); i++) {
|
||||
if (pat_ucs.Val_codes(i) == Byte_ascii.Percent) {
|
||||
for (int j = i; i < len && (j == i || pat_ucs.Get_data(i) != Byte_ascii.Brack_end); i++) {
|
||||
if (pat_ucs.Get_data(i) == Byte_ascii.Percent) {
|
||||
i++;
|
||||
if (i >= len) {
|
||||
break;
|
||||
}
|
||||
byte[] brack_bry = brack_map.Get_or_null(pat_ucs.Val_codes(i));
|
||||
byte[] brack_bry = brack_map.Get_or_null(pat_ucs.Get_data(i));
|
||||
if (brack_bry != null)
|
||||
bfr.Add(brack_bry);
|
||||
else
|
||||
Regx_quote(bfr, pat_ucs.Val_codes(i));
|
||||
Regx_quote(bfr, pat_ucs.Get_data(i));
|
||||
}
|
||||
else if (i + 2 < len && pat_ucs.Val_codes(i + 1) == Byte_ascii.Dash && pat_ucs.Val_codes(i + 2) != Byte_ascii.Brack_end && pat_ucs.Val_codes(i + 2) != Byte_ascii.Hash) {
|
||||
if (pat_ucs.Val_codes(i) <= pat_ucs.Val_codes(i + 2)) {
|
||||
Regx_quote(bfr, pat_ucs.Val_codes(i));
|
||||
else if (i + 2 < len && pat_ucs.Get_data(i + 1) == Byte_ascii.Dash && pat_ucs.Get_data(i + 2) != Byte_ascii.Brack_end && pat_ucs.Get_data(i + 2) != Byte_ascii.Hash) {
|
||||
if (pat_ucs.Get_data(i) <= pat_ucs.Get_data(i + 2)) {
|
||||
Regx_quote(bfr, pat_ucs.Get_data(i));
|
||||
bfr.Add_byte(Byte_ascii.Dash);
|
||||
Regx_quote(bfr, pat_ucs.Val_codes(i + 2));
|
||||
Regx_quote(bfr, pat_ucs.Get_data(i + 2));
|
||||
}
|
||||
i += 2;
|
||||
}
|
||||
else {
|
||||
Regx_quote(bfr, pat_ucs.Val_codes(i));
|
||||
Regx_quote(bfr, pat_ucs.Get_data(i));
|
||||
}
|
||||
}
|
||||
if (i > len) throw Err_.new_wo_type("Missing close-bracket for character set beginning at pattern character $nxt_pos");
|
||||
|
||||
@@ -58,25 +58,29 @@ class Scrib_regx_grp_mgr {
|
||||
int actl_idx = Int_.Cast(idx_list.Get_by(regx_idx));
|
||||
bfr.Add_int_variable(actl_idx);
|
||||
}
|
||||
public Regx_match[] Adjust_balanced(Regx_match[] matches) {
|
||||
public Regx_match[] Adjust_balanced_many(Regx_match[] matches) {
|
||||
if (fake_count == 0) return matches;
|
||||
|
||||
int matches_len = matches.length;
|
||||
Regx_match[] rv = new Regx_match[matches_len];
|
||||
for (int i = 0; i < matches_len; i++) {
|
||||
Regx_match match = matches[i];
|
||||
Regx_group[] old_groups = match.Groups();
|
||||
Regx_group[] new_groups = new Regx_group[full_list.Len() - fake_count];
|
||||
int group_idx = 0;
|
||||
for (int j = 0; j < old_groups.length; j++) {
|
||||
Scrib_regx_grp_itm itm = (Scrib_regx_grp_itm)full_list.Get_at(j);
|
||||
if (itm.Is_fake()) continue;
|
||||
new_groups[group_idx++] = old_groups[j];
|
||||
}
|
||||
rv[i] = new Regx_match(match.Rslt(), match.Find_bgn(), match.Find_end(), new_groups);
|
||||
rv[i] = Adjust_balanced_one(matches[i]);
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
public Regx_match Adjust_balanced_one(Regx_match match) {
|
||||
if (full_list.Len() == 0) return match; // no capture groups, so don't bother adjusting for balanced; DATE:2019-04-16
|
||||
|
||||
Regx_group[] old_groups = match.Groups();
|
||||
Regx_group[] new_groups = new Regx_group[full_list.Len() - fake_count];
|
||||
int group_idx = 0;
|
||||
for (int j = 0; j < old_groups.length; j++) {
|
||||
Scrib_regx_grp_itm itm = (Scrib_regx_grp_itm)full_list.Get_at(j);
|
||||
if (itm.Is_fake()) continue;
|
||||
new_groups[group_idx++] = old_groups[j];
|
||||
}
|
||||
return new Regx_match(match.Rslt(), match.Find_bgn(), match.Find_end(), new_groups);
|
||||
}
|
||||
}
|
||||
class Scrib_regx_grp_itm {
|
||||
public Scrib_regx_grp_itm(boolean is_fake, boolean is_empty_capture, int idx) {
|
||||
|
||||
@@ -14,8 +14,19 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.xowa.xtns.scribunto.libs.patterns; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*; import gplx.xowa.xtns.scribunto.libs.*;
|
||||
import gplx.core.intls.*;
|
||||
import gplx.objects.strings.unicodes.*;
|
||||
import gplx.langs.regxs.*;
|
||||
public interface Scrib_pattern_matcher {
|
||||
Regx_match[] Match(Xoa_url url, Unicode_string text_ucs, Scrib_regx_converter regx_converter, String find_str, int bgn_as_codes);
|
||||
public abstract class Scrib_pattern_matcher {
|
||||
protected final Scrib_regx_converter regx_converter = new Scrib_regx_converter();
|
||||
public Keyval[] Capt_ary() {return regx_converter.Capt_ary();}
|
||||
public abstract Regx_match Match_one(Ustring src_ucs, String pat_str, int bgn_as_codes, boolean replace);
|
||||
public abstract String Gsub(Scrib_lib_ustring_gsub_mgr gsub_mgr, Ustring src_ucs, String pat_str, int bgn_as_codes);
|
||||
|
||||
public static boolean Mode_is_xowa() {return false;}
|
||||
public static Scrib_pattern_matcher New(byte[] page_url) {
|
||||
return Mode_is_xowa()
|
||||
? (Scrib_pattern_matcher)new Scrib_pattern_matcher__xowa(page_url)
|
||||
: (Scrib_pattern_matcher)new Scrib_pattern_matcher__regx(page_url)
|
||||
;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,36 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.xowa.xtns.scribunto.libs.patterns; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*; import gplx.xowa.xtns.scribunto.libs.*;
|
||||
import gplx.core.intls.*;
|
||||
import gplx.langs.regxs.*;
|
||||
public class Scrib_pattern_matcher_ {
|
||||
private static final Scrib_pattern_matcher instance = New();
|
||||
private static Scrib_pattern_matcher New() {
|
||||
return new Scrib_pattern_matcher__regx();
|
||||
// return new Scrib_pattern_matcher__luaj();
|
||||
}
|
||||
public static Scrib_pattern_matcher Instance() {return instance;}
|
||||
}
|
||||
class Scrib_pattern_matcher__regx implements Scrib_pattern_matcher {
|
||||
public Regx_match[] Match(Xoa_url url, Unicode_string text_ucs, Scrib_regx_converter regx_converter, String find_str, int bgn_as_codes) {
|
||||
// convert regex from lua to java
|
||||
find_str = regx_converter.patternToRegex(find_str, Scrib_regx_converter.Anchor_G, true);
|
||||
|
||||
// run regex
|
||||
Regx_adp regx_adp = Scrib_lib_ustring.RegxAdp_new_(url, find_str);
|
||||
return regx_adp.Match_all(text_ucs.Src_string(), text_ucs.Pos_codes_to_chars(bgn_as_codes)); // NOTE: MW calculates an offset to handle mb strings. however, java's regex always takes offset in chars (not bytes like PHP preg_match); DATE:2014-03-04
|
||||
}
|
||||
}
|
||||
@@ -1,50 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.xowa.xtns.scribunto.libs.patterns; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*; import gplx.xowa.xtns.scribunto.libs.*;
|
||||
import gplx.core.intls.*;
|
||||
import gplx.langs.regxs.*;
|
||||
import org.luaj.vm2.lib.StringLib;
|
||||
//import org.luaj.vm2.lib.Str_find_mgr;
|
||||
//import org.luaj.vm2.lib.Str_find_mgr__regx;
|
||||
class Scrib_pattern_matcher__luaj implements Scrib_pattern_matcher {
|
||||
public Regx_match[] Match(Xoa_url url, Unicode_string text_ucs, Scrib_regx_converter regx_converter, String find_str, int bgn_as_codes) {
|
||||
// int src_bgn = bgn_as_codes < 0 ? bgn_as_codes : text_ucs.Pos_codes_to_bytes(bgn_as_codes);
|
||||
// int src_bgn = bgn_as_codes < 0 ? Int_.Base1 : bgn_as_codes + Int_.Base1;
|
||||
// src_bgn = src_bgn >= text_ucs.Len_codes() ? text_ucs.Len_codes() : text_ucs.Pos_codes_to_bytes(src_bgn);
|
||||
// Str_find_mgr__regx mgr = new Str_find_mgr__regx(text_ucs.Src_string(), find_str, src_bgn, false, true);
|
||||
// mgr.Process();
|
||||
//
|
||||
// // convert to Regx_match
|
||||
// int find_bgn = mgr.Bgn() == -1 ? -1 : text_ucs.Pos_bytes_to_chars(mgr.Bgn());
|
||||
// int find_end = mgr.End() == -1 ? -1 : text_ucs.Pos_bytes_to_chars(mgr.End());
|
||||
// boolean found = find_bgn != -1;
|
||||
// if (!found) {
|
||||
// return Regx_match.Ary_empty;
|
||||
// }
|
||||
// int[] captures = mgr.Capture_ints();
|
||||
// Regx_group[] groups = null;
|
||||
// if (found && captures != null) {
|
||||
// int captures_len = captures.length;
|
||||
// groups = new Regx_group[captures_len / 2];
|
||||
// for (int i = 0; i < captures_len; i += 2) {
|
||||
// groups[i / 2] = new Regx_group(true, captures[i], captures[i + 1], String_.Mid(text_ucs.Src_string(), text_ucs.Pos_bytes_to_chars(captures[i]), text_ucs.Pos_bytes_to_chars(captures[i + 1])));
|
||||
// }
|
||||
// }
|
||||
// Regx_match rv = new Regx_match(found, find_bgn, find_end, groups);
|
||||
// return new Regx_match[] {rv};
|
||||
return null;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,74 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.xowa.xtns.scribunto.libs.patterns; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*; import gplx.xowa.xtns.scribunto.libs.*;
|
||||
import gplx.objects.strings.unicodes.*;
|
||||
import gplx.langs.regxs.*;
|
||||
class Scrib_pattern_matcher__regx extends Scrib_pattern_matcher { private final byte[] page_url;
|
||||
public Scrib_pattern_matcher__regx(byte[] page_url) {
|
||||
this.page_url = page_url;
|
||||
}
|
||||
@Override public Regx_match Match_one(Ustring src_ucs, String pat_str, int bgn_as_codes, boolean replace) {
|
||||
// convert lua pattern to java regex
|
||||
if (replace) // note that replace will be false for Gmatch_callback (b/c Gmatch_init already converted)
|
||||
pat_str = regx_converter.patternToRegex(pat_str, Scrib_regx_converter.Anchor_G, true);
|
||||
|
||||
// run regex
|
||||
Regx_adp regx_adp = Scrib_lib_ustring.RegxAdp_new_(page_url, pat_str);
|
||||
Regx_match match = regx_adp.Match(src_ucs.Src(), src_ucs.Map_data_to_char(bgn_as_codes)); // NOTE: MW calculates an offset to handle mb strings. however, java's regex always takes offset in chars (not bytes like PHP preg_match); DATE:2014-03-04
|
||||
match = regx_converter.Adjust_balanced_one(match);
|
||||
return match;
|
||||
}
|
||||
@Override public String Gsub(Scrib_lib_ustring_gsub_mgr gsub_mgr, Ustring src_ucs, String pat_str, int bgn_as_codes) {
|
||||
// convert lua pattern to java regex
|
||||
pat_str = regx_converter.patternToRegex(pat_str, Scrib_regx_converter.Anchor_pow, true);
|
||||
String src_str = src_ucs.Src();
|
||||
Regx_adp regx_adp = Scrib_lib_ustring.RegxAdp_new_(page_url, pat_str);
|
||||
if (regx_adp.Pattern_is_invalid()) return src_str; // NOTE: invalid patterns should return self; EX:[^]; DATE:2014-09-02
|
||||
|
||||
// run regex
|
||||
Regx_match[] rslts = regx_adp.Match_all(src_str, src_ucs.Map_data_to_char(bgn_as_codes)); // NOTE: MW calculates an offset to handle mb strings. however, java's regex always takes offset in chars (not bytes like PHP preg_match); DATE:2014-03-04
|
||||
if (rslts.length == 0) return src_str; // PHP: If matches are found, the new subject will be returned, otherwise subject will be returned unchanged.; http://php.net/manual/en/function.preg-replace-callback.php
|
||||
rslts = regx_converter.Adjust_balanced(rslts);
|
||||
|
||||
// replace results
|
||||
Bry_bfr tmp_bfr = Bry_bfr_.New();
|
||||
int rslts_len = rslts.length;
|
||||
int text_pos = 0;
|
||||
for (int i = 0; i < rslts_len; i++) {
|
||||
if (gsub_mgr.Repl_count__done()) break; // stop if repl_count reaches limit; note that limit = -1 by default, unless specified
|
||||
|
||||
// add text up to find.bgn
|
||||
Regx_match rslt = rslts[i];
|
||||
tmp_bfr.Add_str_u8(String_.Mid(src_str, text_pos, rslt.Find_bgn())); // NOTE: regx returns char text_pos (not bry); must add as String, not bry; DATE:2013-07-17
|
||||
|
||||
// replace result
|
||||
if (!gsub_mgr.Exec_repl_itm(tmp_bfr, regx_converter, rslt)) {
|
||||
// will be false when gsub_proc returns nothing; PAGE:en.d:tracer PAGE:en.d:שלום DATE:2017-04-22;
|
||||
tmp_bfr.Add_str_u8(String_.Mid(src_str, rslt.Find_bgn(), rslt.Find_end()));
|
||||
}
|
||||
|
||||
// update
|
||||
text_pos = rslt.Find_end();
|
||||
gsub_mgr.Repl_count__add();
|
||||
}
|
||||
|
||||
// add rest of String
|
||||
int text_len = String_.Len(src_str);
|
||||
if (text_pos < text_len)
|
||||
tmp_bfr.Add_str_u8(String_.Mid(src_str, text_pos, text_len)); // NOTE: regx returns char text_pos (not bry); must add as String, not bry; DATE:2013-07-17
|
||||
return tmp_bfr.To_str_and_clear();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,123 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.xowa.xtns.scribunto.libs.patterns; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*; import gplx.xowa.xtns.scribunto.libs.*;
|
||||
import gplx.objects.strings.unicodes.*;
|
||||
import gplx.langs.regxs.*;
|
||||
import gplx.objects.strings.unicodes.*;
|
||||
import org.luaj.vm2.lib.StringLib;
|
||||
import org.luaj.vm2.Buffer;
|
||||
import org.luaj.vm2.LuaValue;
|
||||
import org.luaj.vm2.lib.Match_state;
|
||||
import org.luaj.vm2.lib.Str_find_mgr;
|
||||
import org.luaj.vm2.lib.Str_find_mgr__xowa;
|
||||
class Scrib_pattern_matcher__xowa extends Scrib_pattern_matcher { public Scrib_pattern_matcher__xowa(byte[] page_url) {}
|
||||
@Override public Regx_match Match_one(Ustring src_ucs, String pat_str, int bgn_as_codes, boolean replace) {
|
||||
regx_converter.patternToRegex(pat_str, Scrib_regx_converter.Anchor_pow, true);
|
||||
Str_find_mgr__xowa mgr = new Str_find_mgr__xowa(src_ucs, Ustring_.New_codepoints(pat_str), bgn_as_codes, false, false);
|
||||
mgr.Process(false);
|
||||
|
||||
// convert to Regx_match
|
||||
int find_bgn = mgr.Bgn();
|
||||
int find_end = mgr.End();
|
||||
boolean found = find_bgn != -1;
|
||||
if (found) {
|
||||
find_bgn = src_ucs.Map_data_to_char(find_bgn);
|
||||
find_end = src_ucs.Map_data_to_char(find_end);
|
||||
}
|
||||
|
||||
Regx_group[] groups = Make_groups(src_ucs, mgr.Captures_ary());
|
||||
return new Regx_match(found, find_bgn, find_end, groups);
|
||||
}
|
||||
@Override public String Gsub(Scrib_lib_ustring_gsub_mgr gsub_mgr, Ustring src_ucs, String pat_str, int bgn_as_codes) {
|
||||
// get src vars
|
||||
String src_str = src_ucs.Src();
|
||||
int src_len = src_ucs.Len_in_data();
|
||||
if (src_len == 0) {
|
||||
return src_str;
|
||||
}
|
||||
int src_max = src_len + 1;
|
||||
|
||||
// get pat vars
|
||||
regx_converter.patternToRegex(pat_str, Scrib_regx_converter.Anchor_G, true);
|
||||
Ustring pat = Ustring_.New_codepoints(pat_str);
|
||||
int pat_len = pat.Len_in_data();
|
||||
final boolean pat_is_anchored = pat_len > 0 && pat.Get_data(0) == '^';
|
||||
|
||||
// get match vars
|
||||
Bry_bfr tmp_bfr = Bry_bfr_.New();
|
||||
Str_find_mgr__xowa match_mgr = new Str_find_mgr__xowa(src_ucs, pat, bgn_as_codes, false, false);
|
||||
Match_state ms = new Match_state(match_mgr);
|
||||
|
||||
int src_pos = 0;
|
||||
int src_idx = 0;
|
||||
while (src_idx < src_max) {
|
||||
ms.reset();
|
||||
int res = ms.match(src_pos, pat_is_anchored ? 1 : 0);
|
||||
|
||||
// match found
|
||||
if (res != -1) {
|
||||
if (gsub_mgr.Repl_count__done()) break;
|
||||
src_idx++;
|
||||
|
||||
ms.push_captures(true, src_pos, res);
|
||||
|
||||
Regx_group[] groups = Make_groups(src_ucs, match_mgr.Captures_ary());
|
||||
Regx_match match = new Regx_match(true, src_pos, res, groups);
|
||||
if (!gsub_mgr.Exec_repl_itm(tmp_bfr, regx_converter, match)) {
|
||||
tmp_bfr.Add_str_u8(src_ucs.Substring(match.Find_bgn(), match.Find_end()));
|
||||
}
|
||||
|
||||
gsub_mgr.Repl_count__add();
|
||||
}
|
||||
|
||||
// match found; set src_pos to match_end
|
||||
if (res != -1 && res > src_pos)
|
||||
src_pos = res;
|
||||
// no match; add current byte
|
||||
else if (src_pos < src_len) {
|
||||
// lbuf.append( (byte) src.Get_data( src_pos++ ) );
|
||||
tmp_bfr.Add_u8_int(src_ucs.Get_data(src_pos++));
|
||||
}
|
||||
else
|
||||
break;
|
||||
|
||||
if (pat_is_anchored)
|
||||
break;
|
||||
|
||||
if (src_pos > src_len) // XOWA:assert src_pos is in bounds, else will throw ArrayIndexOutOfBounds exception; DATE:2016-09-20
|
||||
break;
|
||||
}
|
||||
|
||||
tmp_bfr.Add_str_u8(src_ucs.Substring(src_pos, src_len));
|
||||
return tmp_bfr.To_str_and_clear();
|
||||
}
|
||||
private Regx_group[] Make_groups(Ustring src_ucs, int[] captures) {
|
||||
if (captures == null) {
|
||||
return Regx_group.Ary_empty;
|
||||
}
|
||||
|
||||
int captures_len = captures.length;
|
||||
Regx_group[] groups = new Regx_group[captures_len / 2];
|
||||
for (int i = 0; i < captures_len; i += 2) {
|
||||
int capture_bgn = captures[i];
|
||||
int capture_end = captures[i + 1];
|
||||
capture_bgn = src_ucs.Map_data_to_char(capture_bgn);
|
||||
capture_end = src_ucs.Map_data_to_char(capture_end);
|
||||
groups[i / 2] = new Regx_group(true, capture_bgn, capture_end, String_.Mid(src_ucs.Src(), capture_bgn, capture_end));
|
||||
}
|
||||
return groups;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user