1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2025-06-13 12:54:14 +00:00

Scribunto: Allow int for find parameter [#802]

This commit is contained in:
gnosygnu 2020-09-23 08:43:47 -04:00
parent 59d8a42b22
commit c801e3a20b
2 changed files with 261 additions and 226 deletions

View File

@ -1,6 +1,6 @@
/* /*
XOWA: the XOWA Offline Wiki Application XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com Copyright (C) 2012-2020 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3, XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0. or alternatively under the terms of the Apache License Version 2.0.
@ -13,202 +13,224 @@ The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/ */
package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*; package gplx.xowa.xtns.scribunto.libs;
import gplx.objects.strings.unicodes.*;
import gplx.core.intls.*; import gplx.langs.regxs.*; import gplx.Bool_;
import gplx.xowa.parsers.*; import gplx.Err_;
import gplx.xowa.xtns.scribunto.procs.*; import gplx.Gfo_usr_dlg_;
import gplx.xowa.xtns.scribunto.libs.patterns.*; import gplx.Io_url;
public class Scrib_lib_ustring implements Scrib_lib { import gplx.Keyval;
public Scrib_lib_ustring(Scrib_core core) {this.core = core;} private Scrib_core core; import gplx.Keyval_;
public String Key() {return "mw.ustring";} import gplx.List_adp;
public Scrib_lua_mod Mod() {return mod;} private Scrib_lua_mod mod; import gplx.List_adp_;
public int String_len_max() {return string_len_max;} public Scrib_lib_ustring String_len_max_(int v) {string_len_max = v; return this;} private int string_len_max = Xoa_page_.Page_len_max; import gplx.String_;
public int Pattern_len_max() {return pattern_len_max;} public Scrib_lib_ustring Pattern_len_max_(int v) {pattern_len_max = v; return this;} private int pattern_len_max = 10000; import gplx.langs.regxs.Regx_adp;
public Scrib_lib Init() {procs.Init_by_lib(this, Proc_names); return this;} import gplx.langs.regxs.Regx_adp_;
public Scrib_lib Clone_lib(Scrib_core core) {return new Scrib_lib_ustring(core);} import gplx.langs.regxs.Regx_group;
public Scrib_lua_mod Register(Scrib_core core, Io_url script_dir) { import gplx.langs.regxs.Regx_match;
Init(); import gplx.objects.strings.unicodes.Ustring;
mod = core.RegisterInterface(this, script_dir.GenSubFil("mw.ustring.lua") import gplx.objects.strings.unicodes.Ustring_;
, Keyval_.new_("stringLengthLimit", string_len_max) import gplx.xowa.Xoa_page_;
, Keyval_.new_("patternLengthLimit", pattern_len_max) import gplx.xowa.xtns.scribunto.Scrib_core;
); import gplx.xowa.xtns.scribunto.Scrib_kv_utl_;
return mod; import gplx.xowa.xtns.scribunto.Scrib_lib;
} import gplx.xowa.xtns.scribunto.Scrib_lua_mod;
public Scrib_proc_mgr Procs() {return procs;} private Scrib_proc_mgr procs = new Scrib_proc_mgr(); import gplx.xowa.xtns.scribunto.libs.patterns.Scrib_pattern_matcher;
public boolean Procs_exec(int key, Scrib_proc_args args, Scrib_proc_rslt rslt) { import gplx.xowa.xtns.scribunto.procs.Scrib_proc_args;
switch (key) { import gplx.xowa.xtns.scribunto.procs.Scrib_proc_mgr;
case Proc_find: return Find(args, rslt); import gplx.xowa.xtns.scribunto.procs.Scrib_proc_rslt;
case Proc_match: return Match(args, rslt);
case Proc_gmatch_init: return Gmatch_init(args, rslt); public class Scrib_lib_ustring implements Scrib_lib {
case Proc_gmatch_callback: return Gmatch_callback(args, rslt); public Scrib_lib_ustring(Scrib_core core) {this.core = core;} private Scrib_core core;
case Proc_gsub: return Gsub(args, rslt); public String Key() {return "mw.ustring";}
default: throw Err_.new_unhandled(key); public Scrib_lua_mod Mod() {return mod;} private Scrib_lua_mod mod;
} public int String_len_max() {return string_len_max;} public Scrib_lib_ustring String_len_max_(int v) {string_len_max = v; return this;} private int string_len_max = Xoa_page_.Page_len_max;
} public int Pattern_len_max() {return pattern_len_max;} public Scrib_lib_ustring Pattern_len_max_(int v) {pattern_len_max = v; return this;} private int pattern_len_max = 10000;
private static final int Proc_find = 0, Proc_match = 1, Proc_gmatch_init = 2, Proc_gmatch_callback = 3, Proc_gsub = 4; public Scrib_lib Init() {procs.Init_by_lib(this, Proc_names); return this;}
public static final String Invk_find = "find", Invk_match = "match", Invk_gmatch_init = "gmatch_init", Invk_gmatch_callback = "gmatch_callback", Invk_gsub = "gsub"; public Scrib_lib Clone_lib(Scrib_core core) {return new Scrib_lib_ustring(core);}
private static final String[] Proc_names = String_.Ary(Invk_find, Invk_match, Invk_gmatch_init, Invk_gmatch_callback, Invk_gsub); public Scrib_lua_mod Register(Scrib_core core, Io_url script_dir) {
public boolean Find(Scrib_proc_args args, Scrib_proc_rslt rslt) { Init();
// get args mod = core.RegisterInterface(this, script_dir.GenSubFil("mw.ustring.lua")
String text_str = args.Xstr_str_or_null(0); , Keyval_.new_("stringLengthLimit", string_len_max)
String find_str = args.Pull_str(1); , Keyval_.new_("patternLengthLimit", pattern_len_max)
int bgn_as_codes_base1 = args.Cast_int_or(2, 1); );
boolean plain = args.Cast_bool_or_n(3); return mod;
}
// init text vars public Scrib_proc_mgr Procs() {return procs;} private Scrib_proc_mgr procs = new Scrib_proc_mgr();
Ustring text_ucs = Ustring_.New_codepoints(text_str); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23 public boolean Procs_exec(int key, Scrib_proc_args args, Scrib_proc_rslt rslt) {
switch (key) {
// convert bgn from base_1 to base_0 case Proc_find: return Find(args, rslt);
int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_ucs.Len_in_data()); case Proc_match: return Match(args, rslt);
case Proc_gmatch_init: return Gmatch_init(args, rslt);
/* case Proc_gmatch_callback: return Gmatch_callback(args, rslt);
int offset = 0; case Proc_gsub: return Gsub(args, rslt);
if (bgn_as_codes > 0) { // NOTE: MW.BASE default: throw Err_.new_unhandled(key);
// $offset = strlen( mb_substr( $s, 0, $init - 1, 'UTF-8' ) ); }
} }
else { private static final int Proc_find = 0, Proc_match = 1, Proc_gmatch_init = 2, Proc_gmatch_callback = 3, Proc_gsub = 4;
bgn_as_codes_base1 = 0; // NOTE: MW.BASE1 public static final String Invk_find = "find", Invk_match = "match", Invk_gmatch_init = "gmatch_init", Invk_gmatch_callback = "gmatch_callback", Invk_gsub = "gsub";
offset = 0; // -1? private static final String[] Proc_names = String_.Ary(Invk_find, Invk_match, Invk_gmatch_init, Invk_gmatch_callback, Invk_gsub);
} public boolean Find(Scrib_proc_args args, Scrib_proc_rslt rslt) {
*/ // get args
String text_str = args.Xstr_str_or_null(0);
// find_str of "" should return (bgn, bgn - 1) regardless of whether plain is true or false; String find_str = args.Pull_str(1);
// NOTE: do not include surrogate calc; PAGE:en.d: DATE:2017-04-24 int bgn_as_codes_base1 = args.Cast_int_or(2, 1);
// NOTE: not in MW; is this needed? DATE:2019-02-24 boolean plain = args.Cast_bool_or_n(3);
if (String_.Len_eq_0(find_str))
return rslt.Init_many_objs(bgn_as_codes_base1, bgn_as_codes_base1 - 1); // init text vars
Ustring text_ucs = Ustring_.New_codepoints(text_str); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23
// if plain, just do literal match of find and exit
if (plain) { // convert bgn from base_1 to base_0
// find pos by literal match int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_ucs.Len_in_data());
Ustring find_ucs = Ustring_.New_codepoints(find_str);
int pos = text_ucs.Index_of(find_ucs, bgn_as_codes); /*
int offset = 0;
// if nothing found, return empty if (bgn_as_codes > 0) { // NOTE: MW.BASE
if (pos == String_.Find_none) // $offset = strlen( mb_substr( $s, 0, $init - 1, 'UTF-8' ) );
return rslt.Init_ary_empty(); }
else {
// bgn: adjust for base1 bgn_as_codes_base1 = 0; // NOTE: MW.BASE1
int bgn = pos + Base1; offset = 0; // -1?
}
// end: add find.Len_in_codes and adjust end for PHP/LUA */
int end = bgn + find_ucs.Len_in_data() - End_adj;
// find_str of "" should return (bgn, bgn - 1) regardless of whether plain is true or false;
return rslt.Init_many_objs(bgn, end); // NOTE: do not include surrogate calc; PAGE:en.d: DATE:2017-04-24
} // NOTE: not in MW; is this needed? DATE:2019-02-24
if (String_.Len_eq_0(find_str))
// run regex; NOTE: take only 1st result; DATE:2014-08-27 return rslt.Init_many_objs(bgn_as_codes_base1, bgn_as_codes_base1 - 1);
Scrib_pattern_matcher matcher = Scrib_pattern_matcher.New(core.Page_url());
Regx_match match = matcher.Match_one(text_ucs, find_str, bgn_as_codes, true); // if plain, just do literal match of find and exit
if (match.Rslt_none()) return rslt.Init_null(); // null verified on MW; EX: =mw.ustring.find("abc", "z"); DATE:2019-04-11 if (plain) {
// find pos by literal match
// add to tmp_list Ustring find_ucs = Ustring_.New_codepoints(find_str);
List_adp tmp_list = List_adp_.New(); int pos = text_ucs.Index_of(find_ucs, bgn_as_codes);
tmp_list.Add(text_ucs.Map_char_to_data(match.Find_bgn()) + Scrib_lib_ustring.Base1);
tmp_list.Add(text_ucs.Map_char_to_data(match.Find_end()) + Scrib_lib_ustring.Base1 - Scrib_lib_ustring.End_adj); // if nothing found, return empty
AddCapturesFromMatch(tmp_list, match, text_str, matcher.Capt_ary(), false); if (pos == String_.Find_none)
return rslt.Init_many_list(tmp_list); return rslt.Init_ary_empty();
}
public boolean Match(Scrib_proc_args args, Scrib_proc_rslt rslt) { // bgn: adjust for base1
// get args int bgn = pos + Base1;
String text_str = args.Xstr_str_or_null(0); // Module can pass raw ints; PAGE:en.w:Budget_of_the_European_Union; DATE:2015-01-22
String find_str = args.Cast_str_or_null(1); // end: add find.Len_in_codes and adjust end for PHP/LUA
int bgn_as_codes_base1 = args.Cast_int_or(2, 1); int end = bgn + find_ucs.Len_in_data() - End_adj;
// validate / adjust return rslt.Init_many_objs(bgn, end);
if (text_str == null) // if no text_str is passed, do not fail; return empty; EX:d:changed; DATE:2014-02-06 }
return rslt.Init_many_list(List_adp_.Noop);
Ustring text_ucs = Ustring_.New_codepoints(text_str); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23 // run regex; NOTE: take only 1st result; DATE:2014-08-27
int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_ucs.Len_in_data()); Scrib_pattern_matcher matcher = Scrib_pattern_matcher.New(core.Page_url());
Regx_match match = matcher.Match_one(text_ucs, find_str, bgn_as_codes, true);
// run regex; NOTE add 1st match only; do not add all; PAGE:en.d:действительное_причастие_настоящего_времени DATE:2017-04-23 if (match.Rslt_none()) return rslt.Init_null(); // null verified on MW; EX: =mw.ustring.find("abc", "z"); DATE:2019-04-11
Scrib_pattern_matcher matcher = Scrib_pattern_matcher.New(core.Page_url());
Regx_match match = matcher.Match_one(text_ucs, find_str, bgn_as_codes, true); // add to tmp_list
if (match.Rslt_none()) return rslt.Init_null(); // return null if no matches found; EX:w:Mount_Gambier_(volcano); DATE:2014-04-02; confirmed with en.d:; DATE:2015-01-30 List_adp tmp_list = List_adp_.New();
tmp_list.Add(text_ucs.Map_char_to_data(match.Find_bgn()) + Scrib_lib_ustring.Base1);
List_adp tmp_list = List_adp_.New(); tmp_list.Add(text_ucs.Map_char_to_data(match.Find_end()) + Scrib_lib_ustring.Base1 - Scrib_lib_ustring.End_adj);
AddCapturesFromMatch(tmp_list, match, text_str, matcher.Capt_ary(), true); AddCapturesFromMatch(tmp_list, match, text_str, matcher.Capt_ary(), false);
return rslt.Init_many_list(tmp_list); return rslt.Init_many_list(tmp_list);
} }
public boolean Gsub(Scrib_proc_args args, Scrib_proc_rslt rslt) { public boolean Match(Scrib_proc_args args, Scrib_proc_rslt rslt) {
Scrib_lib_ustring_gsub_mgr gsub_mgr = new Scrib_lib_ustring_gsub_mgr(core); // get args
return gsub_mgr.Exec(args, rslt); String text_str = args.Xstr_str_or_null(0); // Module can pass raw ints; PAGE:en.w:Budget_of_the_European_Union; DATE:2015-01-22
} // 2020-09-23|ISSUE#:802|passing integer should return NULL, not throw error
public boolean Gmatch_init(Scrib_proc_args args, Scrib_proc_rslt rslt) { String find_str = args.Xstr_str_or_null(1);
// String text = Scrib_kv_utl_.Val_to_str(values, 0); int bgn_as_codes_base1 = args.Cast_int_or(2, 1);
String regx = args.Pull_str(1);
Scrib_regx_converter regx_converter = new Scrib_regx_converter(); // validate / adjust
if (Scrib_pattern_matcher.Mode_is_xowa()) if (text_str == null) // if no text_str is passed, do not fail; return empty; EX:d:changed; DATE:2014-02-06
regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_null, true); return rslt.Init_many_list(List_adp_.Noop);
else Ustring text_ucs = Ustring_.New_codepoints(text_str); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23
regx = regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_null, true); int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_ucs.Len_in_data());
return rslt.Init_many_objs(regx, regx_converter.Capt_ary());
} // run regex; NOTE add 1st match only; do not add all; PAGE:en.d:действительное_причастие_настоящего_времени DATE:2017-04-23
public boolean Gmatch_callback(Scrib_proc_args args, Scrib_proc_rslt rslt) { Scrib_pattern_matcher matcher = Scrib_pattern_matcher.New(core.Page_url());
String text = args.Xstr_str_or_null(0); // NOTE: UstringLibrary.php!ustringGmatchCallback calls preg_match directly; $s can be any type, and php casts automatically; Regx_match match = matcher.Match_one(text_ucs, find_str, bgn_as_codes, true);
String regx = args.Pull_str(1); if (match.Rslt_none()) return rslt.Init_null(); // return null if no matches found; EX:w:Mount_Gambier_(volcano); DATE:2014-04-02; confirmed with en.d:; DATE:2015-01-30
Keyval[] capt = args.Cast_kv_ary_or_null(2);
int pos = args.Pull_int(3); List_adp tmp_list = List_adp_.New();
AddCapturesFromMatch(tmp_list, match, text_str, matcher.Capt_ary(), true);
Ustring text_ucs = Ustring_.New_codepoints(text); return rslt.Init_many_list(tmp_list);
// int pos_as_codes = To_java_by_lua(pos, text_ucs.Len_in_data()); }
Regx_match match = Scrib_pattern_matcher.New(core.Page_url()).Match_one(text_ucs, regx, pos, false); public boolean Gsub(Scrib_proc_args args, Scrib_proc_rslt rslt) {
if (match.Rslt_none()) return rslt.Init_many_objs(pos, Keyval_.Ary_empty); Scrib_lib_ustring_gsub_mgr gsub_mgr = new Scrib_lib_ustring_gsub_mgr(core);
List_adp tmp_list = List_adp_.New(); return gsub_mgr.Exec(args, rslt);
AddCapturesFromMatch(tmp_list, match, text, capt, true); // NOTE: was incorrectly set as false; DATE:2014-04-23 }
return rslt.Init_many_objs(match.Find_end(), Scrib_kv_utl_.base1_list_(tmp_list)); public boolean Gmatch_init(Scrib_proc_args args, Scrib_proc_rslt rslt) {
} // String text = Scrib_kv_utl_.Val_to_str(values, 0);
private int To_java_by_lua(int bgn_as_codes_base1, int len_in_codes) { String regx = args.Pull_str(1);
// convert bgn from base_1 to base_0 Scrib_regx_converter regx_converter = new Scrib_regx_converter();
int bgn_as_codes = bgn_as_codes_base1; if (Scrib_pattern_matcher.Mode_is_xowa())
if (bgn_as_codes > 0) regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_null, true);
bgn_as_codes -= Scrib_lib_ustring.Base1; else
// TOMBSTONE: do not adjust negative numbers for base1; fails tests regx = regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_null, true);
// else if (bgn_as_codes < 0) bgn_as_codes += Scrib_lib_ustring.Base1; return rslt.Init_many_objs(regx, regx_converter.Capt_ary());
}
// adjust bgn for negative-numbers and large positive-numbers public boolean Gmatch_callback(Scrib_proc_args args, Scrib_proc_rslt rslt) {
// NOTE: MW uses mb_strlen which returns len of mb chars as 1; REF.PHP: http://php.net/manual/en/function.mb-strlen.php String text = args.Xstr_str_or_null(0); // NOTE: UstringLibrary.php!ustringGmatchCallback calls preg_match directly; $s can be any type, and php casts automatically;
// NOTE: MW does additional +1 for PHP.base_1. This is not needed for JAVA; noted below as IGNORE_BASE_1_ADJ String regx = args.Pull_str(1);
if (bgn_as_codes < 0) // negative number means search from rear of String Keyval[] capt = args.Cast_kv_ary_or_null(2);
bgn_as_codes += len_in_codes; // NOTE:IGNORE_BASE_1_ADJ int pos = args.Pull_int(3);
else if (bgn_as_codes > len_in_codes) // bgn_as_codes > text_len; confine to text_len; NOTE:IGNORE_BASE_1_ADJ
bgn_as_codes = len_in_codes; // NOTE:IGNORE_BASE_1_ADJ Ustring text_ucs = Ustring_.New_codepoints(text);
// int pos_as_codes = To_java_by_lua(pos, text_ucs.Len_in_data());
// will be negative if Abs(bgn_as_codes) > text.length; ISSUE#:366; DATE:2019-02-23 Regx_match match = Scrib_pattern_matcher.New(core.Page_url()).Match_one(text_ucs, regx, pos, false);
if (bgn_as_codes < 0) if (match.Rslt_none()) return rslt.Init_many_objs(pos, Keyval_.Ary_empty);
bgn_as_codes = 0; List_adp tmp_list = List_adp_.New();
return bgn_as_codes; AddCapturesFromMatch(tmp_list, match, text, capt, true); // NOTE: was incorrectly set as false; DATE:2014-04-23
} return rslt.Init_many_objs(match.Find_end(), Scrib_kv_utl_.base1_list_(tmp_list));
private void AddCapturesFromMatch(List_adp tmp_list, Regx_match rslt, String text, Keyval[] capts, boolean op_is_match) {// NOTE: this matches behavior in UstringLibrary.php!addCapturesFromMatch }
int capts_len = capts == null ? 0 : capts.length; private int To_java_by_lua(int bgn_as_codes_base1, int len_in_codes) {
if (capts_len > 0) { // NOTE: changed from "grps_len > 0"; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02 // convert bgn from base_1 to base_0
Regx_group[] grps = rslt.Groups(); int bgn_as_codes = bgn_as_codes_base1;
int grps_len = grps.length; if (bgn_as_codes > 0)
for (int j = 0; j < grps_len; j++) { bgn_as_codes -= Scrib_lib_ustring.Base1;
Regx_group grp = grps[j]; // TOMBSTONE: do not adjust negative numbers for base1; fails tests
if ( j < capts_len // bounds check b/c null can be passed // else if (bgn_as_codes < 0) bgn_as_codes += Scrib_lib_ustring.Base1;
&& Bool_.Cast(capts[j].Val()) // check if true; indicates that group is "()" or "anypos" see regex converter; DATE:2014-04-23
) // adjust bgn for negative-numbers and large positive-numbers
tmp_list.Add(grp.Bgn() + Scrib_lib_ustring.Base1); // return index only for "()"; NOTE: do not return as String; callers expect int and will fail typed comparisons; DATE:2016-01-21 // NOTE: MW uses mb_strlen which returns len of mb chars as 1; REF.PHP: http://php.net/manual/en/function.mb-strlen.php
else // NOTE: MW does additional +1 for PHP.base_1. This is not needed for JAVA; noted below as IGNORE_BASE_1_ADJ
tmp_list.Add(grp.Val()); // return match if (bgn_as_codes < 0) // negative number means search from rear of String
} bgn_as_codes += len_in_codes; // NOTE:IGNORE_BASE_1_ADJ
} else if (bgn_as_codes > len_in_codes) // bgn_as_codes > text_len; confine to text_len; NOTE:IGNORE_BASE_1_ADJ
else if ( op_is_match // if op_is_match, and no captures, extract find_txt; note that UstringLibrary.php says "$arr[] = $m[0][0];" which means get the 1st match; bgn_as_codes = len_in_codes; // NOTE:IGNORE_BASE_1_ADJ
&& tmp_list.Count() == 0) // only add match once; EX: "aaaa", "a" will have four matches; get 1st; DATE:2014-04-02
tmp_list.Add(String_.Mid(text, rslt.Find_bgn(), rslt.Find_end())); // will be negative if Abs(bgn_as_codes) > text.length; ISSUE#:366; DATE:2019-02-23
} if (bgn_as_codes < 0)
public static Regx_adp RegxAdp_new_(byte[] page_url, String regx) { bgn_as_codes = 0;
Regx_adp rv = Regx_adp_.new_(regx); return bgn_as_codes;
if (rv.Pattern_is_invalid()) { }
// try to identify [z-a] errors; PAGE:https://en.wiktionary.org/wiki/Module:scripts/data; DATE:2017-04-23 private void AddCapturesFromMatch(List_adp tmp_list, Regx_match rslt, String text, Keyval[] capts, boolean op_is_match) {// NOTE: this matches behavior in UstringLibrary.php!addCapturesFromMatch
Exception exc = rv.Pattern_is_invalid_exception(); int capts_len = capts == null ? 0 : capts.length;
Gfo_usr_dlg_.Instance.Log_many("", "", "regx is invalid: regx=~{0} page=~{1} exc=~{2}", regx, page_url, Err_.Message_gplx_log(exc)); if (capts_len > 0) { // NOTE: changed from "grps_len > 0"; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02
} Regx_group[] grps = rslt.Groups();
return rv; int grps_len = grps.length;
} for (int j = 0; j < grps_len; j++) {
private static final int Regx_group grp = grps[j];
Base1 = 1 if ( j < capts_len // bounds check b/c null can be passed
, End_adj = 1; // lua / php uses "end" as <= not <; EX: "abc" and bgn=0, end= 1; for XOWA, this is "a"; for MW / PHP it is "ab" && Bool_.Cast(capts[j].Val()) // check if true; indicates that group is "()" or "anypos" see regex converter; DATE:2014-04-23
} )
tmp_list.Add(grp.Bgn() + Scrib_lib_ustring.Base1); // return index only for "()"; NOTE: do not return as String; callers expect int and will fail typed comparisons; DATE:2016-01-21
else
tmp_list.Add(grp.Val()); // return match
}
}
else if ( op_is_match // if op_is_match, and no captures, extract find_txt; note that UstringLibrary.php says "$arr[] = $m[0][0];" which means get the 1st match;
&& tmp_list.Count() == 0) // only add match once; EX: "aaaa", "a" will have four matches; get 1st; DATE:2014-04-02
tmp_list.Add(String_.Mid(text, rslt.Find_bgn(), rslt.Find_end()));
}
// Creates a regex adapter for `regx`; when the adapter reports the pattern as invalid,
// writes a log entry containing the pattern, the page url and the failure message.
// The adapter is returned in both cases; callers are not stopped by an invalid pattern here.
public static Regx_adp RegxAdp_new_(byte[] page_url, String regx) {
	Regx_adp adp = Regx_adp_.new_(regx);

	// valid pattern: nothing to report
	if (!adp.Pattern_is_invalid()) {
		return adp;
	}

	// invalid pattern: log it to help identify [z-a] errors; PAGE:https://en.wiktionary.org/wiki/Module:scripts/data; DATE:2017-04-23
	Exception cause = adp.Pattern_is_invalid_exception();
	String cause_msg = Err_.Message_gplx_log(cause);
	Gfo_usr_dlg_.Instance.Log_many("", "", "regx is invalid: regx=~{0} page=~{1} exc=~{2}", regx, page_url, cause_msg);
	return adp;
}
// offset between a base-0 (java) position and the base-1 position returned to lua
private static final int Base1 = 1;
// lua / php uses "end" as <= not <; EX: "abc" and bgn=0, end= 1; for XOWA, this is "a"; for MW / PHP it is "ab"
private static final int End_adj = 1;
}

View File

@ -1,27 +1,35 @@
/* /*
XOWA: the XOWA Offline Wiki Application XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com Copyright (C) 2012-2020 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3, XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0. or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis. for your project on a case-by-case basis.
The terms of each license can be found in the source code repository: The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/ */
package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*; package gplx.xowa.xtns.scribunto.libs;
import org.junit.*; import gplx.xowa.xtns.scribunto.engines.mocks.*;
import gplx.Keyval_;
import gplx.String_;
import gplx.xowa.xtns.scribunto.Scrib_kv_utl_;
import gplx.xowa.xtns.scribunto.Scrib_lib;
import gplx.xowa.xtns.scribunto.engines.mocks.Mock_scrib_fxt;
import org.junit.Before;
import org.junit.Test;
public class Scrib_lib_ustring__match__tst { public class Scrib_lib_ustring__match__tst {
private final Mock_scrib_fxt fxt = new Mock_scrib_fxt(); private Scrib_lib lib; private final Mock_scrib_fxt fxt = new Mock_scrib_fxt(); private Scrib_lib lib;
@Before public void init() { @Before public void init() {
fxt.Clear(); fxt.Clear();
lib = fxt.Core().Lib_ustring().Init(); lib = fxt.Core().Lib_ustring().Init();
} }
@Test public void Basic() { @Test public void Basic() {
Exec_match("abcd" , "bc" , 1, "bc"); // basic Exec_match("abcd" , "bc" , 1, "bc"); // basic
Exec_match("abcd" , "x" , 1, String_.Null_mark); // empty Exec_match("abcd" , "x" , 1, String_.Null_mark); // empty
Exec_match("abcd" , "a" , 2, String_.Null_mark); // bgn Exec_match("abcd" , "a" , 2, String_.Null_mark); // bgn
@ -35,24 +43,26 @@ public class Scrib_lib_ustring__match__tst {
Exec_match(1 , "a" , 1, String_.Null_mark); // Module can pass raw ints; PAGE:en.w:Budget_of_the_European_Union; DATE:2015-01-22 Exec_match(1 , "a" , 1, String_.Null_mark); // Module can pass raw ints; PAGE:en.w:Budget_of_the_European_Union; DATE:2015-01-22
Exec_match("" , "a?" , 1, ""); // no results with ? should return "" not nil; PAGE:en.d:; DATE:2015-01-30 Exec_match("" , "a?" , 1, ""); // no results with ? should return "" not nil; PAGE:en.d:; DATE:2015-01-30
} }
@Test public void Args_out_of_order() { @Test public void Args_out_of_order() {
fxt.Test__proc__kvps__empty(lib, Scrib_lib_ustring.Invk_match, Keyval_.Ary(Keyval_.int_(2, "[a]"))); fxt.Test__proc__kvps__empty(lib, Scrib_lib_ustring.Invk_match, Keyval_.Ary(Keyval_.int_(2, "[a]")));
} }
@Test public void Balanced__trailing_whitespace() { // PURPOSE: match trailing whitespace; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02 @Test public void Balanced__trailing_whitespace() { // PURPOSE: match trailing whitespace; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02
Exec_match("[[a]] b", "%b[]%s*", 1, "[[a]] "); Exec_match("[[a]] b", "%b[]%s*", 1, "[[a]] ");
} }
@Test public void Balanced__numbered_1() { // PURPOSE: handle mix of balanced and regular capture; PAGE:en.w:Bahamas @Test public void Balanced__numbered_1() { // PURPOSE: handle mix of balanced and regular capture; PAGE:en.w:Bahamas
Exec_match("[[5]]X99Y", "%b[]X(%d)%1Y", 1, "9"); Exec_match("[[5]]X99Y", "%b[]X(%d)%1Y", 1, "9");
} }
@Test public void Balanced__numbered_2() { @Test public void Balanced__numbered_2() {
Exec_match("A88B[[5]]X99Y", "A(%d)%1B%b[]X(%d)%2Y", 1, "8;9"); Exec_match("A88B[[5]]X99Y", "A(%d)%1B%b[]X(%d)%2Y", 1, "8;9");
} }
@Test public void Unicode_alpha() {// ISSUE#:502; DATE:2019-07-01 @Test public void Unicode_alpha() {// ISSUE#:502; DATE:2019-07-01
Exec_match("ä" , "%a", 1, "ä"); Exec_match("ä" , "%a", 1, "ä");
} }
@Test public void Number() {
	// ISSUE#:802 (commit dated 2020-09-23): an integer passed as the pattern should produce NULL, not throw an error
	Object text = "A";
	Object int_pattern = 0;	// deliberately not a String
	int bgn = 0;
	String expd = String_.Null_mark;
	Exec_match_obj(text, int_pattern, bgn, expd);
}
// @Test public void Match_viwiktionary() {
// @Test public void Match_viwiktionary() {
// fxt.Init_cbk(Scrib_core.Key_mw_interface, fxt.Core().Lib_ustring(), Scrib_lib_ustring.Invk_match); // fxt.Init_cbk(Scrib_core.Key_mw_interface, fxt.Core().Lib_ustring(), Scrib_lib_ustring.Invk_match);
// Exec_match("tr" , "()(r)", 1, ";"); // should return all matches // Exec_match("tr" , "()(r)", 1, ";"); // should return all matches
// Exec_match("tr" , "^([b]*).-([c]*)$", 1, ";"); // should return all matches // Exec_match("tr" , "^([b]*).-([c]*)$", 1, ";"); // should return all matches
@ -60,4 +70,7 @@ public class Scrib_lib_ustring__match__tst {
private void Exec_match(Object text, String regx, int bgn, String expd) { private void Exec_match(Object text, String regx, int bgn, String expd) {
fxt.Test__proc__kvps__flat(lib, Scrib_lib_ustring.Invk_match, Scrib_kv_utl_.base1_many_(text, regx, bgn), expd); fxt.Test__proc__kvps__flat(lib, Scrib_lib_ustring.Invk_match, Scrib_kv_utl_.base1_many_(text, regx, bgn), expd);
} }
// Variant of Exec_match whose pattern argument is typed as Object, so a test can pass a non-String value (EX: an int).
// Packs (text, regx, bgn) with Scrib_kv_utl_.base1_many_, invokes the "match" proc on the ustring lib through the fixture,
// and hands `expd` to the fixture as the expected flattened result.
private void Exec_match_obj(Object text, Object regx, int bgn, String expd) {
fxt.Test__proc__kvps__flat(lib, Scrib_lib_ustring.Invk_match, Scrib_kv_utl_.base1_many_(text, regx, bgn), expd);
}
} }