diff --git a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring.java b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring.java index c1349f098..7ee80d8b1 100644 --- a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring.java +++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring.java @@ -1,6 +1,6 @@ /* XOWA: the XOWA Offline Wiki Application -Copyright (C) 2012-2017 gnosygnu@gmail.com +Copyright (C) 2012-2020 gnosygnu@gmail.com XOWA is licensed under the terms of the General Public License (GPL) Version 3, or alternatively under the terms of the Apache License Version 2.0. @@ -13,202 +13,224 @@ The terms of each license can be found in the source code repository: GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt */ -package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*; -import gplx.objects.strings.unicodes.*; -import gplx.core.intls.*; import gplx.langs.regxs.*; -import gplx.xowa.parsers.*; -import gplx.xowa.xtns.scribunto.procs.*; -import gplx.xowa.xtns.scribunto.libs.patterns.*; -public class Scrib_lib_ustring implements Scrib_lib { - public Scrib_lib_ustring(Scrib_core core) {this.core = core;} private Scrib_core core; - public String Key() {return "mw.ustring";} - public Scrib_lua_mod Mod() {return mod;} private Scrib_lua_mod mod; - public int String_len_max() {return string_len_max;} public Scrib_lib_ustring String_len_max_(int v) {string_len_max = v; return this;} private int string_len_max = Xoa_page_.Page_len_max; - public int Pattern_len_max() {return pattern_len_max;} public Scrib_lib_ustring Pattern_len_max_(int v) {pattern_len_max = v; return this;} private int pattern_len_max = 10000; - public Scrib_lib Init() {procs.Init_by_lib(this, Proc_names); return this;} - public Scrib_lib Clone_lib(Scrib_core core) {return new Scrib_lib_ustring(core);} - public Scrib_lua_mod Register(Scrib_core core, Io_url script_dir) { - Init(); - mod = core.RegisterInterface(this, script_dir.GenSubFil("mw.ustring.lua") - , Keyval_.new_("stringLengthLimit", string_len_max) - , Keyval_.new_("patternLengthLimit", pattern_len_max) - ); - return mod; - } - public Scrib_proc_mgr Procs() {return procs;} private Scrib_proc_mgr procs = new Scrib_proc_mgr(); - public boolean Procs_exec(int key, Scrib_proc_args args, Scrib_proc_rslt rslt) { - switch (key) { - case Proc_find: return Find(args, rslt); - case Proc_match: return Match(args, rslt); - case Proc_gmatch_init: return Gmatch_init(args, rslt); - case Proc_gmatch_callback: return Gmatch_callback(args, rslt); - case Proc_gsub: return Gsub(args, rslt); - default: throw Err_.new_unhandled(key); - } - } - private static final int Proc_find = 0, Proc_match = 1, Proc_gmatch_init = 2, Proc_gmatch_callback = 3, Proc_gsub = 4; - public static final String Invk_find = "find", Invk_match = "match", Invk_gmatch_init = "gmatch_init", Invk_gmatch_callback = "gmatch_callback", Invk_gsub = "gsub"; - private static final String[] Proc_names = String_.Ary(Invk_find, Invk_match, Invk_gmatch_init, Invk_gmatch_callback, Invk_gsub); - public boolean Find(Scrib_proc_args args, Scrib_proc_rslt rslt) { - // get args - String text_str = args.Xstr_str_or_null(0); - String find_str = args.Pull_str(1); - int bgn_as_codes_base1 = args.Cast_int_or(2, 1); - boolean plain = args.Cast_bool_or_n(3); - - // init text vars - Ustring text_ucs = Ustring_.New_codepoints(text_str); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23 - - // convert bgn from base_1 to base_0 - int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_ucs.Len_in_data()); - - /* - int offset = 0; - if (bgn_as_codes > 0) { // NOTE: MW.BASE - // $offset = strlen( mb_substr( $s, 0, $init - 1, 'UTF-8' ) ); - } - else { - bgn_as_codes_base1 = 0; // NOTE: MW.BASE1 - offset = 0; // -1? - } - */ - - // find_str of "" should return (bgn, bgn - 1) regardless of whether plain is true or false; - // NOTE: do not include surrogate calc; PAGE:en.d:佻 DATE:2017-04-24 - // NOTE: not in MW; is this needed? DATE:2019-02-24 - if (String_.Len_eq_0(find_str)) - return rslt.Init_many_objs(bgn_as_codes_base1, bgn_as_codes_base1 - 1); - - // if plain, just do literal match of find and exit - if (plain) { - // find pos by literal match - Ustring find_ucs = Ustring_.New_codepoints(find_str); - int pos = text_ucs.Index_of(find_ucs, bgn_as_codes); - - // if nothing found, return empty - if (pos == String_.Find_none) - return rslt.Init_ary_empty(); - - // bgn: adjust for base1 - int bgn = pos + Base1; - - // end: add find.Len_in_codes and adjust end for PHP/LUA - int end = bgn + find_ucs.Len_in_data() - End_adj; - - return rslt.Init_many_objs(bgn, end); - } - - // run regex; NOTE: take only 1st result; DATE:2014-08-27 - Scrib_pattern_matcher matcher = Scrib_pattern_matcher.New(core.Page_url()); - Regx_match match = matcher.Match_one(text_ucs, find_str, bgn_as_codes, true); - if (match.Rslt_none()) return rslt.Init_null(); // null verified on MW; EX: =mw.ustring.find("abc", "z"); DATE:2019-04-11 - - // add to tmp_list - List_adp tmp_list = List_adp_.New(); - tmp_list.Add(text_ucs.Map_char_to_data(match.Find_bgn()) + Scrib_lib_ustring.Base1); - tmp_list.Add(text_ucs.Map_char_to_data(match.Find_end()) + Scrib_lib_ustring.Base1 - Scrib_lib_ustring.End_adj); - AddCapturesFromMatch(tmp_list, match, text_str, matcher.Capt_ary(), false); - return rslt.Init_many_list(tmp_list); - } - public boolean Match(Scrib_proc_args args, Scrib_proc_rslt rslt) { - // get args - String text_str = args.Xstr_str_or_null(0); // Module can pass raw ints; PAGE:en.w:Budget_of_the_European_Union; DATE:2015-01-22 - String find_str = args.Cast_str_or_null(1); - int bgn_as_codes_base1 = args.Cast_int_or(2, 1); - - // validate / adjust - if (text_str == null) // if no text_str is passed, do not fail; return empty; EX:d:changed; DATE:2014-02-06 - return rslt.Init_many_list(List_adp_.Noop); - Ustring text_ucs = Ustring_.New_codepoints(text_str); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23 - int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_ucs.Len_in_data()); - - // run regex; NOTE add 1st match only; do not add all; PAGE:en.d:действительное_причастие_настоящего_времени DATE:2017-04-23 - Scrib_pattern_matcher matcher = Scrib_pattern_matcher.New(core.Page_url()); - Regx_match match = matcher.Match_one(text_ucs, find_str, bgn_as_codes, true); - if (match.Rslt_none()) return rslt.Init_null(); // return null if no matches found; EX:w:Mount_Gambier_(volcano); DATE:2014-04-02; confirmed with en.d:民; DATE:2015-01-30 - - List_adp tmp_list = List_adp_.New(); - AddCapturesFromMatch(tmp_list, match, text_str, matcher.Capt_ary(), true); - return rslt.Init_many_list(tmp_list); - } - public boolean Gsub(Scrib_proc_args args, Scrib_proc_rslt rslt) { - Scrib_lib_ustring_gsub_mgr gsub_mgr = new Scrib_lib_ustring_gsub_mgr(core); - return gsub_mgr.Exec(args, rslt); - } - public boolean Gmatch_init(Scrib_proc_args args, Scrib_proc_rslt rslt) { - // String text = Scrib_kv_utl_.Val_to_str(values, 0); - String regx = args.Pull_str(1); - Scrib_regx_converter regx_converter = new Scrib_regx_converter(); - if (Scrib_pattern_matcher.Mode_is_xowa()) - regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_null, true); - else - regx = regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_null, true); - return rslt.Init_many_objs(regx, regx_converter.Capt_ary()); - } - public boolean Gmatch_callback(Scrib_proc_args args, Scrib_proc_rslt rslt) { - String text = args.Xstr_str_or_null(0); // NOTE: UstringLibrary.php!ustringGmatchCallback calls preg_match directly; $s can be any type, and php casts automatically; - String regx = args.Pull_str(1); - Keyval[] capt = args.Cast_kv_ary_or_null(2); - int pos = args.Pull_int(3); - - Ustring text_ucs = Ustring_.New_codepoints(text); - // int pos_as_codes = To_java_by_lua(pos, text_ucs.Len_in_data()); - Regx_match match = Scrib_pattern_matcher.New(core.Page_url()).Match_one(text_ucs, regx, pos, false); - if (match.Rslt_none()) return rslt.Init_many_objs(pos, Keyval_.Ary_empty); - List_adp tmp_list = List_adp_.New(); - AddCapturesFromMatch(tmp_list, match, text, capt, true); // NOTE: was incorrectly set as false; DATE:2014-04-23 - return rslt.Init_many_objs(match.Find_end(), Scrib_kv_utl_.base1_list_(tmp_list)); - } - private int To_java_by_lua(int bgn_as_codes_base1, int len_in_codes) { - // convert bgn from base_1 to base_0 - int bgn_as_codes = bgn_as_codes_base1; - if (bgn_as_codes > 0) - bgn_as_codes -= Scrib_lib_ustring.Base1; - // TOMBSTONE: do not adjust negative numbers for base1; fails tests - // else if (bgn_as_codes < 0) bgn_as_codes += Scrib_lib_ustring.Base1; - - // adjust bgn for negative-numbers and large positive-numbers - // NOTE: MW uses mb_strlen which returns len of mb chars as 1; REF.PHP: http://php.net/manual/en/function.mb-strlen.php - // NOTE: MW does additional +1 for PHP.base_1. This is not needed for JAVA; noted below as IGNORE_BASE_1_ADJ - if (bgn_as_codes < 0) // negative number means search from rear of String - bgn_as_codes += len_in_codes; // NOTE:IGNORE_BASE_1_ADJ - else if (bgn_as_codes > len_in_codes) // bgn_as_codes > text_len; confine to text_len; NOTE:IGNORE_BASE_1_ADJ - bgn_as_codes = len_in_codes; // NOTE:IGNORE_BASE_1_ADJ - - // will be negative if Abs(bgn_as_codes) > text.length; ISSUE#:366; DATE:2019-02-23 - if (bgn_as_codes < 0) - bgn_as_codes = 0; - return bgn_as_codes; - } - private void AddCapturesFromMatch(List_adp tmp_list, Regx_match rslt, String text, Keyval[] capts, boolean op_is_match) {// NOTE: this matches behavior in UstringLibrary.php!addCapturesFromMatch - int capts_len = capts == null ? 0 : capts.length; - if (capts_len > 0) { // NOTE: changed from "grps_len > 0"; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02 - Regx_group[] grps = rslt.Groups(); - int grps_len = grps.length; - for (int j = 0; j < grps_len; j++) { - Regx_group grp = grps[j]; - if ( j < capts_len // bounds check b/c null can be passed - && Bool_.Cast(capts[j].Val()) // check if true; indicates that group is "()" or "anypos" see regex converter; DATE:2014-04-23 - ) - tmp_list.Add(grp.Bgn() + Scrib_lib_ustring.Base1); // return index only for "()"; NOTE: do not return as String; callers expect int and will fail typed comparisons; DATE:2016-01-21 - else - tmp_list.Add(grp.Val()); // return match - } - } - else if ( op_is_match // if op_is_match, and no captures, extract find_txt; note that UstringLibrary.php says "$arr[] = $m[0][0];" which means get the 1st match; - && tmp_list.Count() == 0) // only add match once; EX: "aaaa", "a" will have four matches; get 1st; DATE:2014-04-02 - tmp_list.Add(String_.Mid(text, rslt.Find_bgn(), rslt.Find_end())); - } - public static Regx_adp RegxAdp_new_(byte[] page_url, String regx) { - Regx_adp rv = Regx_adp_.new_(regx); - if (rv.Pattern_is_invalid()) { - // try to identify [z-a] errors; PAGE:https://en.wiktionary.org/wiki/Module:scripts/data; DATE:2017-04-23 - Exception exc = rv.Pattern_is_invalid_exception(); - Gfo_usr_dlg_.Instance.Log_many("", "", "regx is invalid: regx=~{0} page=~{1} exc=~{2}", regx, page_url, Err_.Message_gplx_log(exc)); - } - return rv; - } - private static final int - Base1 = 1 - , End_adj = 1; // lua / php uses "end" as <= not <; EX: "abc" and bgn=0, end= 1; for XOWA, this is "a"; for MW / PHP it is "ab" -} +package gplx.xowa.xtns.scribunto.libs; + +import gplx.Bool_; +import gplx.Err_; +import gplx.Gfo_usr_dlg_; +import gplx.Io_url; +import gplx.Keyval; +import gplx.Keyval_; +import gplx.List_adp; +import gplx.List_adp_; +import gplx.String_; +import gplx.langs.regxs.Regx_adp; +import gplx.langs.regxs.Regx_adp_; +import gplx.langs.regxs.Regx_group; +import gplx.langs.regxs.Regx_match; +import gplx.objects.strings.unicodes.Ustring; +import gplx.objects.strings.unicodes.Ustring_; +import gplx.xowa.Xoa_page_; +import gplx.xowa.xtns.scribunto.Scrib_core; +import gplx.xowa.xtns.scribunto.Scrib_kv_utl_; +import gplx.xowa.xtns.scribunto.Scrib_lib; +import gplx.xowa.xtns.scribunto.Scrib_lua_mod; +import gplx.xowa.xtns.scribunto.libs.patterns.Scrib_pattern_matcher; +import gplx.xowa.xtns.scribunto.procs.Scrib_proc_args; +import gplx.xowa.xtns.scribunto.procs.Scrib_proc_mgr; +import gplx.xowa.xtns.scribunto.procs.Scrib_proc_rslt; + +public class Scrib_lib_ustring implements Scrib_lib { + public Scrib_lib_ustring(Scrib_core core) {this.core = core;} private Scrib_core core; + public String Key() {return "mw.ustring";} + public Scrib_lua_mod Mod() {return mod;} private Scrib_lua_mod mod; + public int String_len_max() {return string_len_max;} public Scrib_lib_ustring String_len_max_(int v) {string_len_max = v; return this;} private int string_len_max = Xoa_page_.Page_len_max; + public int Pattern_len_max() {return pattern_len_max;} public Scrib_lib_ustring Pattern_len_max_(int v) {pattern_len_max = v; return this;} private int pattern_len_max = 10000; + public Scrib_lib Init() {procs.Init_by_lib(this, Proc_names); return this;} + public Scrib_lib Clone_lib(Scrib_core core) {return new Scrib_lib_ustring(core);} + public Scrib_lua_mod Register(Scrib_core core, Io_url script_dir) { + Init(); + mod = core.RegisterInterface(this, script_dir.GenSubFil("mw.ustring.lua") + , Keyval_.new_("stringLengthLimit", string_len_max) + , Keyval_.new_("patternLengthLimit", pattern_len_max) + ); + return mod; + } + public Scrib_proc_mgr Procs() {return procs;} private Scrib_proc_mgr procs = new Scrib_proc_mgr(); + public boolean Procs_exec(int key, Scrib_proc_args args, Scrib_proc_rslt rslt) { + switch (key) { + case Proc_find: return Find(args, rslt); + case Proc_match: return Match(args, rslt); + case Proc_gmatch_init: return Gmatch_init(args, rslt); + case Proc_gmatch_callback: return Gmatch_callback(args, rslt); + case Proc_gsub: return Gsub(args, rslt); + default: throw Err_.new_unhandled(key); + } + } + private static final int Proc_find = 0, Proc_match = 1, Proc_gmatch_init = 2, Proc_gmatch_callback = 3, Proc_gsub = 4; + public static final String Invk_find = "find", Invk_match = "match", Invk_gmatch_init = "gmatch_init", Invk_gmatch_callback = "gmatch_callback", Invk_gsub = "gsub"; + private static final String[] Proc_names = String_.Ary(Invk_find, Invk_match, Invk_gmatch_init, Invk_gmatch_callback, Invk_gsub); + public boolean Find(Scrib_proc_args args, Scrib_proc_rslt rslt) { + // get args + String text_str = args.Xstr_str_or_null(0); + String find_str = args.Pull_str(1); + int bgn_as_codes_base1 = args.Cast_int_or(2, 1); + boolean plain = args.Cast_bool_or_n(3); + + // init text vars + Ustring text_ucs = Ustring_.New_codepoints(text_str); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23 + + // convert bgn from base_1 to base_0 + int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_ucs.Len_in_data()); + + /* + int offset = 0; + if (bgn_as_codes > 0) { // NOTE: MW.BASE + // $offset = strlen( mb_substr( $s, 0, $init - 1, 'UTF-8' ) ); + } + else { + bgn_as_codes_base1 = 0; // NOTE: MW.BASE1 + offset = 0; // -1? + } + */ + + // find_str of "" should return (bgn, bgn - 1) regardless of whether plain is true or false; + // NOTE: do not include surrogate calc; PAGE:en.d:佻 DATE:2017-04-24 + // NOTE: not in MW; is this needed? DATE:2019-02-24 + if (String_.Len_eq_0(find_str)) + return rslt.Init_many_objs(bgn_as_codes_base1, bgn_as_codes_base1 - 1); + + // if plain, just do literal match of find and exit + if (plain) { + // find pos by literal match + Ustring find_ucs = Ustring_.New_codepoints(find_str); + int pos = text_ucs.Index_of(find_ucs, bgn_as_codes); + + // if nothing found, return empty + if (pos == String_.Find_none) + return rslt.Init_ary_empty(); + + // bgn: adjust for base1 + int bgn = pos + Base1; + + // end: add find.Len_in_codes and adjust end for PHP/LUA + int end = bgn + find_ucs.Len_in_data() - End_adj; + + return rslt.Init_many_objs(bgn, end); + } + + // run regex; NOTE: take only 1st result; DATE:2014-08-27 + Scrib_pattern_matcher matcher = Scrib_pattern_matcher.New(core.Page_url()); + Regx_match match = matcher.Match_one(text_ucs, find_str, bgn_as_codes, true); + if (match.Rslt_none()) return rslt.Init_null(); // null verified on MW; EX: =mw.ustring.find("abc", "z"); DATE:2019-04-11 + + // add to tmp_list + List_adp tmp_list = List_adp_.New(); + tmp_list.Add(text_ucs.Map_char_to_data(match.Find_bgn()) + Scrib_lib_ustring.Base1); + tmp_list.Add(text_ucs.Map_char_to_data(match.Find_end()) + Scrib_lib_ustring.Base1 - Scrib_lib_ustring.End_adj); + AddCapturesFromMatch(tmp_list, match, text_str, matcher.Capt_ary(), false); + return rslt.Init_many_list(tmp_list); + } + public boolean Match(Scrib_proc_args args, Scrib_proc_rslt rslt) { + // get args + String text_str = args.Xstr_str_or_null(0); // Module can pass raw ints; PAGE:en.w:Budget_of_the_European_Union; DATE:2015-01-22 + // 2019-20-01|ISSUE#:802|passing integer should return NULL, not throw error + String find_str = args.Xstr_str_or_null(1); + int bgn_as_codes_base1 = args.Cast_int_or(2, 1); + + // validate / adjust + if (text_str == null) // if no text_str is passed, do not fail; return empty; EX:d:changed; DATE:2014-02-06 + return rslt.Init_many_list(List_adp_.Noop); + Ustring text_ucs = Ustring_.New_codepoints(text_str); // NOTE: must count codes for supplementaries; PAGE:en.d:iglesia DATE:2017-04-23 + int bgn_as_codes = To_java_by_lua(bgn_as_codes_base1, text_ucs.Len_in_data()); + + // run regex; NOTE add 1st match only; do not add all; PAGE:en.d:действительное_причастие_настоящего_времени DATE:2017-04-23 + Scrib_pattern_matcher matcher = Scrib_pattern_matcher.New(core.Page_url()); + Regx_match match = matcher.Match_one(text_ucs, find_str, bgn_as_codes, true); + if (match.Rslt_none()) return rslt.Init_null(); // return null if no matches found; EX:w:Mount_Gambier_(volcano); DATE:2014-04-02; confirmed with en.d:民; DATE:2015-01-30 + + List_adp tmp_list = List_adp_.New(); + AddCapturesFromMatch(tmp_list, match, text_str, matcher.Capt_ary(), true); + return rslt.Init_many_list(tmp_list); + } + public boolean Gsub(Scrib_proc_args args, Scrib_proc_rslt rslt) { + Scrib_lib_ustring_gsub_mgr gsub_mgr = new Scrib_lib_ustring_gsub_mgr(core); + return gsub_mgr.Exec(args, rslt); + } + public boolean Gmatch_init(Scrib_proc_args args, Scrib_proc_rslt rslt) { + // String text = Scrib_kv_utl_.Val_to_str(values, 0); + String regx = args.Pull_str(1); + Scrib_regx_converter regx_converter = new Scrib_regx_converter(); + if (Scrib_pattern_matcher.Mode_is_xowa()) + regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_null, true); + else + regx = regx_converter.patternToRegex(regx, Scrib_regx_converter.Anchor_null, true); + return rslt.Init_many_objs(regx, regx_converter.Capt_ary()); + } + public boolean Gmatch_callback(Scrib_proc_args args, Scrib_proc_rslt rslt) { + String text = args.Xstr_str_or_null(0); // NOTE: UstringLibrary.php!ustringGmatchCallback calls preg_match directly; $s can be any type, and php casts automatically; + String regx = args.Pull_str(1); + Keyval[] capt = args.Cast_kv_ary_or_null(2); + int pos = args.Pull_int(3); + + Ustring text_ucs = Ustring_.New_codepoints(text); + // int pos_as_codes = To_java_by_lua(pos, text_ucs.Len_in_data()); + Regx_match match = Scrib_pattern_matcher.New(core.Page_url()).Match_one(text_ucs, regx, pos, false); + if (match.Rslt_none()) return rslt.Init_many_objs(pos, Keyval_.Ary_empty); + List_adp tmp_list = List_adp_.New(); + AddCapturesFromMatch(tmp_list, match, text, capt, true); // NOTE: was incorrectly set as false; DATE:2014-04-23 + return rslt.Init_many_objs(match.Find_end(), Scrib_kv_utl_.base1_list_(tmp_list)); + } + private int To_java_by_lua(int bgn_as_codes_base1, int len_in_codes) { + // convert bgn from base_1 to base_0 + int bgn_as_codes = bgn_as_codes_base1; + if (bgn_as_codes > 0) + bgn_as_codes -= Scrib_lib_ustring.Base1; + // TOMBSTONE: do not adjust negative numbers for base1; fails tests + // else if (bgn_as_codes < 0) bgn_as_codes += Scrib_lib_ustring.Base1; + + // adjust bgn for negative-numbers and large positive-numbers + // NOTE: MW uses mb_strlen which returns len of mb chars as 1; REF.PHP: http://php.net/manual/en/function.mb-strlen.php + // NOTE: MW does additional +1 for PHP.base_1. This is not needed for JAVA; noted below as IGNORE_BASE_1_ADJ + if (bgn_as_codes < 0) // negative number means search from rear of String + bgn_as_codes += len_in_codes; // NOTE:IGNORE_BASE_1_ADJ + else if (bgn_as_codes > len_in_codes) // bgn_as_codes > text_len; confine to text_len; NOTE:IGNORE_BASE_1_ADJ + bgn_as_codes = len_in_codes; // NOTE:IGNORE_BASE_1_ADJ + + // will be negative if Abs(bgn_as_codes) > text.length; ISSUE#:366; DATE:2019-02-23 + if (bgn_as_codes < 0) + bgn_as_codes = 0; + return bgn_as_codes; + } + private void AddCapturesFromMatch(List_adp tmp_list, Regx_match rslt, String text, Keyval[] capts, boolean op_is_match) {// NOTE: this matches behavior in UstringLibrary.php!addCapturesFromMatch + int capts_len = capts == null ? 0 : capts.length; + if (capts_len > 0) { // NOTE: changed from "grps_len > 0"; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02 + Regx_group[] grps = rslt.Groups(); + int grps_len = grps.length; + for (int j = 0; j < grps_len; j++) { + Regx_group grp = grps[j]; + if ( j < capts_len // bounds check b/c null can be passed + && Bool_.Cast(capts[j].Val()) // check if true; indicates that group is "()" or "anypos" see regex converter; DATE:2014-04-23 + ) + tmp_list.Add(grp.Bgn() + Scrib_lib_ustring.Base1); // return index only for "()"; NOTE: do not return as String; callers expect int and will fail typed comparisons; DATE:2016-01-21 + else + tmp_list.Add(grp.Val()); // return match + } + } + else if ( op_is_match // if op_is_match, and no captures, extract find_txt; note that UstringLibrary.php says "$arr[] = $m[0][0];" which means get the 1st match; + && tmp_list.Count() == 0) // only add match once; EX: "aaaa", "a" will have four matches; get 1st; DATE:2014-04-02 + tmp_list.Add(String_.Mid(text, rslt.Find_bgn(), rslt.Find_end())); + } + public static Regx_adp RegxAdp_new_(byte[] page_url, String regx) { + Regx_adp rv = Regx_adp_.new_(regx); + if (rv.Pattern_is_invalid()) { + // try to identify [z-a] errors; PAGE:https://en.wiktionary.org/wiki/Module:scripts/data; DATE:2017-04-23 + Exception exc = rv.Pattern_is_invalid_exception(); + Gfo_usr_dlg_.Instance.Log_many("", "", "regx is invalid: regx=~{0} page=~{1} exc=~{2}", regx, page_url, Err_.Message_gplx_log(exc)); + } + return rv; + } + private static final int + Base1 = 1 + , End_adj = 1; // lua / php uses "end" as <= not <; EX: "abc" and bgn=0, end= 1; for XOWA, this is "a"; for MW / PHP it is "ab" +} diff --git a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__match__tst.java b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__match__tst.java index 2d3a4e5de..7415244b4 100644 --- a/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__match__tst.java +++ b/400_xowa/src/gplx/xowa/xtns/scribunto/libs/Scrib_lib_ustring__match__tst.java @@ -1,27 +1,35 @@ -/* -XOWA: the XOWA Offline Wiki Application -Copyright (C) 2012-2017 gnosygnu@gmail.com - -XOWA is licensed under the terms of the General Public License (GPL) Version 3, -or alternatively under the terms of the Apache License Version 2.0. - -You may use XOWA according to either of these licenses as is most appropriate -for your project on a case-by-case basis. - -The terms of each license can be found in the source code repository: - -GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt -Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt -*/ -package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*; -import org.junit.*; import gplx.xowa.xtns.scribunto.engines.mocks.*; +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012-2020 gnosygnu@gmail.com + +XOWA is licensed under the terms of the General Public License (GPL) Version 3, +or alternatively under the terms of the Apache License Version 2.0. + +You may use XOWA according to either of these licenses as is most appropriate +for your project on a case-by-case basis. + +The terms of each license can be found in the source code repository: + +GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt +Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt +*/ +package gplx.xowa.xtns.scribunto.libs; + +import gplx.Keyval_; +import gplx.String_; +import gplx.xowa.xtns.scribunto.Scrib_kv_utl_; +import gplx.xowa.xtns.scribunto.Scrib_lib; +import gplx.xowa.xtns.scribunto.engines.mocks.Mock_scrib_fxt; +import org.junit.Before; +import org.junit.Test; + public class Scrib_lib_ustring__match__tst { - private final Mock_scrib_fxt fxt = new Mock_scrib_fxt(); private Scrib_lib lib; + private final Mock_scrib_fxt fxt = new Mock_scrib_fxt(); private Scrib_lib lib; @Before public void init() { fxt.Clear(); lib = fxt.Core().Lib_ustring().Init(); } - @Test public void Basic() { + @Test public void Basic() { Exec_match("abcd" , "bc" , 1, "bc"); // basic Exec_match("abcd" , "x" , 1, String_.Null_mark); // empty Exec_match("abcd" , "a" , 2, String_.Null_mark); // bgn @@ -35,24 +43,26 @@ public class Scrib_lib_ustring__match__tst { Exec_match(1 , "a" , 1, String_.Null_mark); // Module can pass raw ints; PAGE:en.w:Budget_of_the_European_Union; DATE:2015-01-22 Exec_match("" , "a?" , 1, ""); // no results with ? should return "" not nil; PAGE:en.d:民; DATE:2015-01-30 } - @Test public void Args_out_of_order() { + @Test public void Args_out_of_order() { fxt.Test__proc__kvps__empty(lib, Scrib_lib_ustring.Invk_match, Keyval_.Ary(Keyval_.int_(2, "[a]"))); } - @Test public void Balanced__trailing_whitespace() { // PURPOSE: match trailing whitespace; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02 + @Test public void Balanced__trailing_whitespace() { // PURPOSE: match trailing whitespace; PAGE:en.w:Portal:Constructed_languages/Intro DATE:2018-07-02 Exec_match("[[a]] b", "%b[]%s*", 1, "[[a]] "); } - @Test public void Balanced__numbered_1() { // PURPOSE: handle mix of balanced and regular capture; PAGE:en.w:Bahamas + @Test public void Balanced__numbered_1() { // PURPOSE: handle mix of balanced and regular capture; PAGE:en.w:Bahamas Exec_match("[[5]]X99Y", "%b[]X(%d)%1Y", 1, "9"); } - @Test public void Balanced__numbered_2() { + @Test public void Balanced__numbered_2() { Exec_match("A88B[[5]]X99Y", "A(%d)%1B%b[]X(%d)%2Y", 1, "8;9"); } - @Test public void Unicode_alpha() {// ISSUE#:502; DATE:2019-07-01 + @Test public void Unicode_alpha() {// ISSUE#:502; DATE:2019-07-01 Exec_match("ä" , "%a", 1, "ä"); } + @Test public void Number() {// 2019-20-01|ISSUE#:802|passing integer should return NULL, not throw error + Exec_match_obj("A" , 0, 0, String_.Null_mark); + } - -// @Test public void Match_viwiktionary() { +// @Test public void Match_viwiktionary() { // fxt.Init_cbk(Scrib_core.Key_mw_interface, fxt.Core().Lib_ustring(), Scrib_lib_ustring.Invk_match); // Exec_match("tr" , "()(r)", 1, ";"); // should return all matches // Exec_match("tr" , "^([b]*).-([c]*)$", 1, ";"); // should return all matches @@ -60,4 +70,7 @@ public class Scrib_lib_ustring__match__tst { private void Exec_match(Object text, String regx, int bgn, String expd) { fxt.Test__proc__kvps__flat(lib, Scrib_lib_ustring.Invk_match, Scrib_kv_utl_.base1_many_(text, regx, bgn), expd); } + private void Exec_match_obj(Object text, Object regx, int bgn, String expd) { + fxt.Test__proc__kvps__flat(lib, Scrib_lib_ustring.Invk_match, Scrib_kv_utl_.base1_many_(text, regx, bgn), expd); + } }