Scribunto: Allow int for `find` parameter [#802]

staging
gnosygnu 4 years ago
parent 59d8a42b22
commit c801e3a20b

@ -1,6 +1,6 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
Copyright (C) 2012-2020 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
@ -13,202 +13,224 @@ The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*;
import gplx.objects.strings.unicodes.*;
import gplx.core.intls.*; import gplx.langs.regxs.*;
import gplx.xowa.parsers.*;
import gplx.xowa.xtns.scribunto.procs.*;
import gplx.xowa.xtns.scribunto.libs.patterns.*;
public class Scrib_lib_ustring implements Scrib_lib {
public Scrib_lib_ustring(Scrib_core core) {this.core = core;} private Scrib_core core;
public String Key() {return "mw.ustring";}
public Scrib_lua_mod Mod() {return mod;} private Scrib_lua_mod mod;
public int String_len_max() {return string_len_max;} public Scrib_lib_ustring String_len_max_(int v) {string_len_max = v; return this;} private int string_len_max = Xoa_page_.Page_len_max;
public int Pattern_len_max() {return pattern_len_max;} public Scrib_lib_ustring Pattern_len_max_(int v) {pattern_len_max = v; return this;} private int pattern_len_max = 10000;
public Scrib_lib Init() {procs.Init_by_lib(this, Proc_names); return this;}
public Scrib_lib Clone_lib(Scrib_core core) {return new Scrib_lib_ustring(core);}
public Scrib_lua_mod Register(Scrib_core core, Io_url script_dir) {
Init();
mod = core.RegisterInterface(this, script_dir.GenSubFil("mw.ustring.lua")
, Keyval_.new_("stringLengthLimit", string_len_max)
, Keyval_.new_("patternLengthLimit", pattern_len_max)
);
return mod;
}
public Scrib_proc_mgr Procs() {return procs;} private Scrib_proc_mgr procs = new Scrib_proc_mgr();
public boolean Procs_exec(int key, Scrib_proc_args args, Scrib_proc_rslt rslt) {
switch (key) {
case Proc_find: return Find(args, rslt);
case Proc_match: return Match(args, rslt);
case Proc_gmatch_init: return Gmatch_init(args, rslt);
case Proc_gmatch_callback: return Gmatch_callback(args, rslt);
case Proc_gsub: return Gsub(args, rslt);
default: throw Err_.new_unhandled(key);
}
}
private static final int Proc_find = 0, Proc_match = 1, Proc_gmatch_init = 2, Proc_gmatch_callback = 3, Proc_gsub = 4;
public static final String Invk_find = "find", Invk_match = "match", Invk_gmatch_init = "gmatch_init", Invk_gmatch_callback = "gmatch_callback", Invk_gsub = "gsub";
private static final String[] Proc_names = String_.Ary(Invk_find, Invk_match, Invk_gmatch_init, Invk_gmatch_callback, Invk_gsub);
public boolean Find(Scrib_proc_args args, Scrib_proc_rslt rslt) {
// get args
String text_str = args.Xstr_str_or_null(0);
String find_str = args.Pull_str(1);
int bgn_as_codes_base1 = args.Cast_int_or(2, 1);
boolean plain = args.Cast_bool_or_n(3);
// init text vars
Ustring text_ucs = Ustring_.New_codepoints(text_str); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23
// convert bgn from base_1 to base_0
int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_ucs.Len_in_data());
/*
int offset = 0;
if (bgn_as_codes > 0) { // NOTE: MW.BASE
// $offset = strlen( mb_substr( $s, 0, $init - 1, 'UTF-8' ) );
}
else {
bgn_as_codes_base1 = 0; // NOTE: MW.BASE1
offset = 0; // -1?
}
*/
// find_str of "" should return (bgn, bgn - 1) regardless of whether plain is true or false;
// NOTE: do not include surrogate calc; PAGE:en.d:佻 DATE:2017-04-24
// NOTE: not in MW; is this needed? DATE:2019-02-24
if (String_.Len_eq_0(find_str))
return rslt.Init_many_objs(bgn_as_codes_base1, bgn_as_codes_base1 - 1);
// if plain, just do literal match of find and exit
if (plain) {
// find pos by literal match
Ustring find_ucs = Ustring_.New_codepoints(find_str);
int pos = text_ucs.Index_of(find_ucs, bgn_as_codes);
// if nothing found, return empty
if (pos == String_.Find_none)
return rslt.Init_ary_empty();
// bgn: adjust for base1
int bgn = pos + Base1;
// end: add find.Len_in_codes and adjust end for PHP/LUA
int end = bgn + find_ucs.Len_in_data() - End_adj;
return rslt.Init_many_objs(bgn, end);
}
// run regex; NOTE: take only 1st result; DATE:2014-08-27
Scrib_pattern_matcher matcher = Scrib_pattern_matcher.New(core.Page_url());
Regx_match match = matcher.Match_one(text_ucs, find_str, bgn_as_codes, true);
if (match.Rslt_none()) return rslt.Init_null(); // null verified on MW; EX: =mw.ustring.find("abc", "z"); DATE:2019-04-11
// add to tmp_list
List_adp tmp_list = List_adp_.New();
tmp_list.Add(text_ucs.Map_char_to_data(match.Find_bgn()) + Scrib_lib_ustring.Base1);
tmp_list.Add(text_ucs.Map_char_to_data(match.Find_end()) + Scrib_lib_ustring.Base1 - Scrib_lib_ustring.End_adj);
AddCapturesFromMatch(tmp_list, match, text_str, matcher.Capt_ary(), false);
return rslt.Init_many_list(tmp_list);
}
public boolean Match(Scrib_proc_args args, Scrib_proc_rslt rslt) {
// get args
String text_str = args.Xstr_str_or_null(0); // Module can pass raw ints; PAGE:en.w:Budget_of_the_European_Union; DATE:2015-01-22
String find_str = args.Cast_str_or_null(1);
int bgn_as_codes_base1 = args.Cast_int_or(2, 1);
// validate / adjust
if (text_str == null) // if no text_str is passed, do not fail; return empty; EX:d:changed; DATE:2014-02-06
return rslt.Init_many_list(List_adp_.Noop);
Ustring text_ucs = Ustring_.New_codepoints(text_str); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23
int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_ucs.Len_in_data());
// run regex; NOTE add 1st match only; do not add all; PAGE:en.d:действительное_причастиеастоящегоремени DATE:2017-04-23
Scrib_pattern_matcher matcher = Scrib_pattern_matcher.New(core.Page_url());
Regx_match match = matcher.Match_one(text_ucs, find_str, bgn_as_codes, true);
if (match.Rslt_none()) return rslt.Init_null(); // return null if no matches found; EX:w:Mount_Gambier_(volcano); DATE:2014-04-02; confirmed with en.d:民; DATE:2015-01-30
List_adp tmp_list = List_adp_.New();
AddCapturesFromMatch(tmp_list, match, text_str, matcher.Capt_ary(), true);
return rslt.Init_many_list(tmp_list);
}
public boolean Gsub(Scrib_proc_args args, Scrib_proc_rslt rslt) {
Scrib_lib_ustring_gsub_mgr gsub_mgr = new Scrib_lib_ustring_gsub_mgr(core);
return gsub_mgr.Exec(args, rslt);
}
public boolean Gmatch_init(Scrib_proc_args args, Scrib_proc_rslt rslt) {
// String text = Scrib_kv_utl_.Val_to_str(values, 0);
String regx = args.Pull_str(1);
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
if (Scrib_pattern_matcher.Mode_is_xowa())
regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_null, true);
else
regx = regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_null, true);
return rslt.Init_many_objs(regx, regx_converter.Capt_ary());
}
public boolean Gmatch_callback(Scrib_proc_args args, Scrib_proc_rslt rslt) {
String text = args.Xstr_str_or_null(0); // NOTE: UstringLibrary.php!ustringGmatchCallback calls preg_match directly; $s can be any type, and php casts automatically;
String regx = args.Pull_str(1);
Keyval[] capt = args.Cast_kv_ary_or_null(2);
int pos = args.Pull_int(3);
Ustring text_ucs = Ustring_.New_codepoints(text);
// int pos_as_codes = To_java_by_lua(pos, text_ucs.Len_in_data());
Regx_match match = Scrib_pattern_matcher.New(core.Page_url()).Match_one(text_ucs, regx, pos, false);
if (match.Rslt_none()) return rslt.Init_many_objs(pos, Keyval_.Ary_empty);
List_adp tmp_list = List_adp_.New();
AddCapturesFromMatch(tmp_list, match, text, capt, true); // NOTE: was incorrectly set as false; DATE:2014-04-23
return rslt.Init_many_objs(match.Find_end(), Scrib_kv_utl_.base1_list_(tmp_list));
}
private int To_java_by_lua(int bgn_as_codes_base1, int len_in_codes) {
// convert bgn from base_1 to base_0
int bgn_as_codes = bgn_as_codes_base1;
if (bgn_as_codes > 0)
bgn_as_codes -= Scrib_lib_ustring.Base1;
// TOMBSTONE: do not adjust negative numbers for base1; fails tests
// else if (bgn_as_codes < 0) bgn_as_codes += Scrib_lib_ustring.Base1;
// adjust bgn for negative-numbers and large positive-numbers
// NOTE: MW uses mb_strlen which returns len of mb chars as 1; REF.PHP: http://php.net/manual/en/function.mb-strlen.php
// NOTE: MW does additional +1 for PHP.base_1. This is not needed for JAVA; noted below as IGNORE_BASE_1_ADJ
if (bgn_as_codes < 0) // negative number means search from rear of String
bgn_as_codes += len_in_codes; // NOTE:IGNORE_BASE_1_ADJ
else if (bgn_as_codes > len_in_codes) // bgn_as_codes > text_len; confine to text_len; NOTE:IGNORE_BASE_1_ADJ
bgn_as_codes = len_in_codes; // NOTE:IGNORE_BASE_1_ADJ
// will be negative if Abs(bgn_as_codes) > text.length; ISSUE#:366; DATE:2019-02-23
if (bgn_as_codes < 0)
bgn_as_codes = 0;
return bgn_as_codes;
}
private void AddCapturesFromMatch(List_adp tmp_list, Regx_match rslt, String text, Keyval[] capts, boolean op_is_match) {// NOTE: this matches behavior in UstringLibrary.php!addCapturesFromMatch
int capts_len = capts == null ? 0 : capts.length;
if (capts_len > 0) { // NOTE: changed from "grps_len > 0"; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02
Regx_group[] grps = rslt.Groups();
int grps_len = grps.length;
for (int j = 0; j < grps_len; j++) {
Regx_group grp = grps[j];
if ( j < capts_len // bounds check b/c null can be passed
&& Bool_.Cast(capts[j].Val()) // check if true; indicates that group is "()" or "anypos" see regex converter; DATE:2014-04-23
)
tmp_list.Add(grp.Bgn() + Scrib_lib_ustring.Base1); // return index only for "()"; NOTE: do not return as String; callers expect int and will fail typed comparisons; DATE:2016-01-21
else
tmp_list.Add(grp.Val()); // return match
}
}
else if ( op_is_match // if op_is_match, and no captures, extract find_txt; note that UstringLibrary.php says "$arr[] = $m[0][0];" which means get the 1st match;
&& tmp_list.Count() == 0) // only add match once; EX: "aaaa", "a" will have four matches; get 1st; DATE:2014-04-02
tmp_list.Add(String_.Mid(text, rslt.Find_bgn(), rslt.Find_end()));
}
public static Regx_adp RegxAdp_new_(byte[] page_url, String regx) {
Regx_adp rv = Regx_adp_.new_(regx);
if (rv.Pattern_is_invalid()) {
// try to identify [z-a] errors; PAGE:https://en.wiktionary.org/wiki/Module:scripts/data; DATE:2017-04-23
Exception exc = rv.Pattern_is_invalid_exception();
Gfo_usr_dlg_.Instance.Log_many("", "", "regx is invalid: regx=~{0} page=~{1} exc=~{2}", regx, page_url, Err_.Message_gplx_log(exc));
}
return rv;
}
private static final int
Base1 = 1
, End_adj = 1; // lua / php uses "end" as <= not <; EX: "abc" and bgn=0, end= 1; for XOWA, this is "a"; for MW / PHP it is "ab"
}
package gplx.xowa.xtns.scribunto.libs;
import gplx.Bool_;
import gplx.Err_;
import gplx.Gfo_usr_dlg_;
import gplx.Io_url;
import gplx.Keyval;
import gplx.Keyval_;
import gplx.List_adp;
import gplx.List_adp_;
import gplx.String_;
import gplx.langs.regxs.Regx_adp;
import gplx.langs.regxs.Regx_adp_;
import gplx.langs.regxs.Regx_group;
import gplx.langs.regxs.Regx_match;
import gplx.objects.strings.unicodes.Ustring;
import gplx.objects.strings.unicodes.Ustring_;
import gplx.xowa.Xoa_page_;
import gplx.xowa.xtns.scribunto.Scrib_core;
import gplx.xowa.xtns.scribunto.Scrib_kv_utl_;
import gplx.xowa.xtns.scribunto.Scrib_lib;
import gplx.xowa.xtns.scribunto.Scrib_lua_mod;
import gplx.xowa.xtns.scribunto.libs.patterns.Scrib_pattern_matcher;
import gplx.xowa.xtns.scribunto.procs.Scrib_proc_args;
import gplx.xowa.xtns.scribunto.procs.Scrib_proc_mgr;
import gplx.xowa.xtns.scribunto.procs.Scrib_proc_rslt;
public class Scrib_lib_ustring implements Scrib_lib {
public Scrib_lib_ustring(Scrib_core core) {this.core = core;} private Scrib_core core;
public String Key() {return "mw.ustring";}
public Scrib_lua_mod Mod() {return mod;} private Scrib_lua_mod mod;
public int String_len_max() {return string_len_max;} public Scrib_lib_ustring String_len_max_(int v) {string_len_max = v; return this;} private int string_len_max = Xoa_page_.Page_len_max;
public int Pattern_len_max() {return pattern_len_max;} public Scrib_lib_ustring Pattern_len_max_(int v) {pattern_len_max = v; return this;} private int pattern_len_max = 10000;
public Scrib_lib Init() {procs.Init_by_lib(this, Proc_names); return this;}
public Scrib_lib Clone_lib(Scrib_core core) {return new Scrib_lib_ustring(core);}
public Scrib_lua_mod Register(Scrib_core core, Io_url script_dir) {
Init();
mod = core.RegisterInterface(this, script_dir.GenSubFil("mw.ustring.lua")
, Keyval_.new_("stringLengthLimit", string_len_max)
, Keyval_.new_("patternLengthLimit", pattern_len_max)
);
return mod;
}
public Scrib_proc_mgr Procs() {return procs;} private Scrib_proc_mgr procs = new Scrib_proc_mgr();
public boolean Procs_exec(int key, Scrib_proc_args args, Scrib_proc_rslt rslt) {
switch (key) {
case Proc_find: return Find(args, rslt);
case Proc_match: return Match(args, rslt);
case Proc_gmatch_init: return Gmatch_init(args, rslt);
case Proc_gmatch_callback: return Gmatch_callback(args, rslt);
case Proc_gsub: return Gsub(args, rslt);
default: throw Err_.new_unhandled(key);
}
}
private static final int Proc_find = 0, Proc_match = 1, Proc_gmatch_init = 2, Proc_gmatch_callback = 3, Proc_gsub = 4;
public static final String Invk_find = "find", Invk_match = "match", Invk_gmatch_init = "gmatch_init", Invk_gmatch_callback = "gmatch_callback", Invk_gsub = "gsub";
private static final String[] Proc_names = String_.Ary(Invk_find, Invk_match, Invk_gmatch_init, Invk_gmatch_callback, Invk_gsub);
public boolean Find(Scrib_proc_args args, Scrib_proc_rslt rslt) {
// get args
String text_str = args.Xstr_str_or_null(0);
String find_str = args.Pull_str(1);
int bgn_as_codes_base1 = args.Cast_int_or(2, 1);
boolean plain = args.Cast_bool_or_n(3);
// init text vars
Ustring text_ucs = Ustring_.New_codepoints(text_str); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23
// convert bgn from base_1 to base_0
int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_ucs.Len_in_data());
/*
int offset = 0;
if (bgn_as_codes > 0) { // NOTE: MW.BASE
// $offset = strlen( mb_substr( $s, 0, $init - 1, 'UTF-8' ) );
}
else {
bgn_as_codes_base1 = 0; // NOTE: MW.BASE1
offset = 0; // -1?
}
*/
// find_str of "" should return (bgn, bgn - 1) regardless of whether plain is true or false;
// NOTE: do not include surrogate calc; PAGE:en.d:佻 DATE:2017-04-24
// NOTE: not in MW; is this needed? DATE:2019-02-24
if (String_.Len_eq_0(find_str))
return rslt.Init_many_objs(bgn_as_codes_base1, bgn_as_codes_base1 - 1);
// if plain, just do literal match of find and exit
if (plain) {
// find pos by literal match
Ustring find_ucs = Ustring_.New_codepoints(find_str);
int pos = text_ucs.Index_of(find_ucs, bgn_as_codes);
// if nothing found, return empty
if (pos == String_.Find_none)
return rslt.Init_ary_empty();
// bgn: adjust for base1
int bgn = pos + Base1;
// end: add find.Len_in_codes and adjust end for PHP/LUA
int end = bgn + find_ucs.Len_in_data() - End_adj;
return rslt.Init_many_objs(bgn, end);
}
// run regex; NOTE: take only 1st result; DATE:2014-08-27
Scrib_pattern_matcher matcher = Scrib_pattern_matcher.New(core.Page_url());
Regx_match match = matcher.Match_one(text_ucs, find_str, bgn_as_codes, true);
if (match.Rslt_none()) return rslt.Init_null(); // null verified on MW; EX: =mw.ustring.find("abc", "z"); DATE:2019-04-11
// add to tmp_list
List_adp tmp_list = List_adp_.New();
tmp_list.Add(text_ucs.Map_char_to_data(match.Find_bgn()) + Scrib_lib_ustring.Base1);
tmp_list.Add(text_ucs.Map_char_to_data(match.Find_end()) + Scrib_lib_ustring.Base1 - Scrib_lib_ustring.End_adj);
AddCapturesFromMatch(tmp_list, match, text_str, matcher.Capt_ary(), false);
return rslt.Init_many_list(tmp_list);
}
public boolean Match(Scrib_proc_args args, Scrib_proc_rslt rslt) {
// get args
String text_str = args.Xstr_str_or_null(0); // Module can pass raw ints; PAGE:en.w:Budget_of_the_European_Union; DATE:2015-01-22
// 2019-20-01|ISSUE#:802|passing integer should return NULL, not throw error
String find_str = args.Xstr_str_or_null(1);
int bgn_as_codes_base1 = args.Cast_int_or(2, 1);
// validate / adjust
if (text_str == null) // if no text_str is passed, do not fail; return empty; EX:d:changed; DATE:2014-02-06
return rslt.Init_many_list(List_adp_.Noop);
Ustring text_ucs = Ustring_.New_codepoints(text_str); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23
int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_ucs.Len_in_data());
// run regex; NOTE add 1st match only; do not add all; PAGE:en.d:действительное_причастиеастоящегоремени DATE:2017-04-23
Scrib_pattern_matcher matcher = Scrib_pattern_matcher.New(core.Page_url());
Regx_match match = matcher.Match_one(text_ucs, find_str, bgn_as_codes, true);
if (match.Rslt_none()) return rslt.Init_null(); // return null if no matches found; EX:w:Mount_Gambier_(volcano); DATE:2014-04-02; confirmed with en.d:民; DATE:2015-01-30
List_adp tmp_list = List_adp_.New();
AddCapturesFromMatch(tmp_list, match, text_str, matcher.Capt_ary(), true);
return rslt.Init_many_list(tmp_list);
}
public boolean Gsub(Scrib_proc_args args, Scrib_proc_rslt rslt) {
Scrib_lib_ustring_gsub_mgr gsub_mgr = new Scrib_lib_ustring_gsub_mgr(core);
return gsub_mgr.Exec(args, rslt);
}
public boolean Gmatch_init(Scrib_proc_args args, Scrib_proc_rslt rslt) {
// String text = Scrib_kv_utl_.Val_to_str(values, 0);
String regx = args.Pull_str(1);
Scrib_regx_converter regx_converter = new Scrib_regx_converter();
if (Scrib_pattern_matcher.Mode_is_xowa())
regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_null, true);
else
regx = regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_null, true);
return rslt.Init_many_objs(regx, regx_converter.Capt_ary());
}
public boolean Gmatch_callback(Scrib_proc_args args, Scrib_proc_rslt rslt) {
String text = args.Xstr_str_or_null(0); // NOTE: UstringLibrary.php!ustringGmatchCallback calls preg_match directly; $s can be any type, and php casts automatically;
String regx = args.Pull_str(1);
Keyval[] capt = args.Cast_kv_ary_or_null(2);
int pos = args.Pull_int(3);
Ustring text_ucs = Ustring_.New_codepoints(text);
// int pos_as_codes = To_java_by_lua(pos, text_ucs.Len_in_data());
Regx_match match = Scrib_pattern_matcher.New(core.Page_url()).Match_one(text_ucs, regx, pos, false);
if (match.Rslt_none()) return rslt.Init_many_objs(pos, Keyval_.Ary_empty);
List_adp tmp_list = List_adp_.New();
AddCapturesFromMatch(tmp_list, match, text, capt, true); // NOTE: was incorrectly set as false; DATE:2014-04-23
return rslt.Init_many_objs(match.Find_end(), Scrib_kv_utl_.base1_list_(tmp_list));
}
private int To_java_by_lua(int bgn_as_codes_base1, int len_in_codes) {
// convert bgn from base_1 to base_0
int bgn_as_codes = bgn_as_codes_base1;
if (bgn_as_codes > 0)
bgn_as_codes -= Scrib_lib_ustring.Base1;
// TOMBSTONE: do not adjust negative numbers for base1; fails tests
// else if (bgn_as_codes < 0) bgn_as_codes += Scrib_lib_ustring.Base1;
// adjust bgn for negative-numbers and large positive-numbers
// NOTE: MW uses mb_strlen which returns len of mb chars as 1; REF.PHP: http://php.net/manual/en/function.mb-strlen.php
// NOTE: MW does additional +1 for PHP.base_1. This is not needed for JAVA; noted below as IGNORE_BASE_1_ADJ
if (bgn_as_codes < 0) // negative number means search from rear of String
bgn_as_codes += len_in_codes; // NOTE:IGNORE_BASE_1_ADJ
else if (bgn_as_codes > len_in_codes) // bgn_as_codes > text_len; confine to text_len; NOTE:IGNORE_BASE_1_ADJ
bgn_as_codes = len_in_codes; // NOTE:IGNORE_BASE_1_ADJ
// will be negative if Abs(bgn_as_codes) > text.length; ISSUE#:366; DATE:2019-02-23
if (bgn_as_codes < 0)
bgn_as_codes = 0;
return bgn_as_codes;
}
private void AddCapturesFromMatch(List_adp tmp_list, Regx_match rslt, String text, Keyval[] capts, boolean op_is_match) {// NOTE: this matches behavior in UstringLibrary.php!addCapturesFromMatch
int capts_len = capts == null ? 0 : capts.length;
if (capts_len > 0) { // NOTE: changed from "grps_len > 0"; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02
Regx_group[] grps = rslt.Groups();
int grps_len = grps.length;
for (int j = 0; j < grps_len; j++) {
Regx_group grp = grps[j];
if ( j < capts_len // bounds check b/c null can be passed
&& Bool_.Cast(capts[j].Val()) // check if true; indicates that group is "()" or "anypos" see regex converter; DATE:2014-04-23
)
tmp_list.Add(grp.Bgn() + Scrib_lib_ustring.Base1); // return index only for "()"; NOTE: do not return as String; callers expect int and will fail typed comparisons; DATE:2016-01-21
else
tmp_list.Add(grp.Val()); // return match
}
}
else if ( op_is_match // if op_is_match, and no captures, extract find_txt; note that UstringLibrary.php says "$arr[] = $m[0][0];" which means get the 1st match;
&& tmp_list.Count() == 0) // only add match once; EX: "aaaa", "a" will have four matches; get 1st; DATE:2014-04-02
tmp_list.Add(String_.Mid(text, rslt.Find_bgn(), rslt.Find_end()));
}
public static Regx_adp RegxAdp_new_(byte[] page_url, String regx) {
Regx_adp rv = Regx_adp_.new_(regx);
if (rv.Pattern_is_invalid()) {
// try to identify [z-a] errors; PAGE:https://en.wiktionary.org/wiki/Module:scripts/data; DATE:2017-04-23
Exception exc = rv.Pattern_is_invalid_exception();
Gfo_usr_dlg_.Instance.Log_many("", "", "regx is invalid: regx=~{0} page=~{1} exc=~{2}", regx, page_url, Err_.Message_gplx_log(exc));
}
return rv;
}
private static final int
Base1 = 1
, End_adj = 1; // lua / php uses "end" as <= not <; EX: "abc" and bgn=0, end= 1; for XOWA, this is "a"; for MW / PHP it is "ab"
}

@ -1,27 +1,35 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*;
import org.junit.*; import gplx.xowa.xtns.scribunto.engines.mocks.*;
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2020 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.xtns.scribunto.libs;
import gplx.Keyval_;
import gplx.String_;
import gplx.xowa.xtns.scribunto.Scrib_kv_utl_;
import gplx.xowa.xtns.scribunto.Scrib_lib;
import gplx.xowa.xtns.scribunto.engines.mocks.Mock_scrib_fxt;
import org.junit.Before;
import org.junit.Test;
public class Scrib_lib_ustring__match__tst {
private final Mock_scrib_fxt fxt = new Mock_scrib_fxt(); private Scrib_lib lib;
private final Mock_scrib_fxt fxt = new Mock_scrib_fxt(); private Scrib_lib lib;
@Before public void init() {
fxt.Clear();
lib = fxt.Core().Lib_ustring().Init();
}
@Test public void Basic() {
@Test public void Basic() {
Exec_match("abcd" , "bc" , 1, "bc"); // basic
Exec_match("abcd" , "x" , 1, String_.Null_mark); // empty
Exec_match("abcd" , "a" , 2, String_.Null_mark); // bgn
@ -35,24 +43,26 @@ public class Scrib_lib_ustring__match__tst {
Exec_match(1 , "a" , 1, String_.Null_mark); // Module can pass raw ints; PAGE:en.w:Budget_of_the_European_Union; DATE:2015-01-22
Exec_match("" , "a?" , 1, ""); // no results with ? should return "" not nil; PAGE:en.d:民; DATE:2015-01-30
}
@Test public void Args_out_of_order() {
@Test public void Args_out_of_order() {
fxt.Test__proc__kvps__empty(lib, Scrib_lib_ustring.Invk_match, Keyval_.Ary(Keyval_.int_(2, "[a]")));
}
@Test public void Balanced__trailing_whitespace() { // PURPOSE: match trailing whitespace; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02
@Test public void Balanced__trailing_whitespace() { // PURPOSE: match trailing whitespace; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02
Exec_match("[[a]] b", "%b[]%s*", 1, "[[a]] ");
}
@Test public void Balanced__numbered_1() { // PURPOSE: handle mix of balanced and regular capture; PAGE:en.w:Bahamas
@Test public void Balanced__numbered_1() { // PURPOSE: handle mix of balanced and regular capture; PAGE:en.w:Bahamas
Exec_match("[[5]]X99Y", "%b[]X(%d)%1Y", 1, "9");
}
@Test public void Balanced__numbered_2() {
@Test public void Balanced__numbered_2() {
Exec_match("A88B[[5]]X99Y", "A(%d)%1B%b[]X(%d)%2Y", 1, "8;9");
}
@Test public void Unicode_alpha() {// ISSUE#:502; DATE:2019-07-01
@Test public void Unicode_alpha() {// ISSUE#:502; DATE:2019-07-01
Exec_match("ä" , "%a", 1, "ä");
}
@Test public void Number() {// 2019-20-01|ISSUE#:802|passing integer should return NULL, not throw error
Exec_match_obj("A" , 0, 0, String_.Null_mark);
}
// @Test public void Match_viwiktionary() {
// @Test public void Match_viwiktionary() {
// fxt.Init_cbk(Scrib_core.Key_mw_interface, fxt.Core().Lib_ustring(), Scrib_lib_ustring.Invk_match);
// Exec_match("tr" , "()(r)", 1, ";"); // should return all matches
// Exec_match("tr" , "^([b]*).-([c]*)$", 1, ";"); // should return all matches
@ -60,4 +70,7 @@ public class Scrib_lib_ustring__match__tst {
private void Exec_match(Object text, String regx, int bgn, String expd) {
fxt.Test__proc__kvps__flat(lib, Scrib_lib_ustring.Invk_match, Scrib_kv_utl_.base1_many_(text, regx, bgn), expd);
}
private void Exec_match_obj(Object text, Object regx, int bgn, String expd) {
fxt.Test__proc__kvps__flat(lib, Scrib_lib_ustring.Invk_match, Scrib_kv_utl_.base1_many_(text, regx, bgn), expd);
}
}

Loading…
Cancel
Save