1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2024-10-27 20:34:16 +00:00

Scribunto.Regex: Reinstate codepoint calculations for ustring.find

This commit is contained in:
gnosygnu 2017-04-24 21:16:20 -04:00
parent 0e3af23c87
commit ea3cb238fd
2 changed files with 17 additions and 11 deletions

View File

@ -56,13 +56,16 @@ public class Scrib_lib_ustring implements Scrib_lib {
synchronized (surrogate_utl) {
byte[] text_bry = Bry_.new_u8(text_str); int text_bry_len = text_bry.length;
bgn_char_idx = Bgn_adjust(text_str, bgn_char_idx);
// TOMBSTONE: do not adjust for 2-len chars (surrogates); lua always iterates correctly by chars; DATE:2017-04-23
// int bgn_adj = surrogate_utl.Count_surrogates__char_idx(text_bry, text_bry_len, 0, bgn_char_idx); // NOTE: convert from lua / php charidx to java regex codepoint; PAGE:zh.w:南北鐵路 (越南) DATE:2014-08-27
int bgn_adj = 0;
// regx of "" should return (bgn, bgn - 1) regardless of whether plain is true or false;
// NOTE: do not include surrogate calc; PAGE:en.d: DATE:2017-04-24
if (String_.Len_eq_0(regx)) // regx of "" should return (bgn, bgn - 1) regardless of whether plain is true or false
return rslt.Init_many_objs(bgn_char_idx + Scrib_lib_ustring.Base1, bgn_char_idx + Scrib_lib_ustring.Base1 - 1);
// NOTE: adjust for 2-len chars (surrogates); PAGE:en.d:iglesia DATE:2017-04-23
int bgn_adj = surrogate_utl.Count_surrogates__char_idx(text_bry, text_bry_len, 0, bgn_char_idx); // NOTE: convert from lua / php charidx to java regex codepoint; PAGE:zh.w:南北鐵路 (越南) DATE:2014-08-27
int bgn_codepoint_idx = bgn_char_idx + bgn_adj;
int bgn_byte_pos = surrogate_utl.Byte_pos();
if (String_.Len_eq_0(regx)) // regx of "" should return (bgn, bgn - 1) regardless of whether plain is true or false
return rslt.Init_many_objs(bgn_codepoint_idx + Scrib_lib_ustring.Base1, bgn_codepoint_idx + Scrib_lib_ustring.Base1 - 1);
if (plain) {
int pos = String_.FindFwd(text_str, regx, bgn_codepoint_idx);
boolean found = pos != Bry_find_.Not_found;
@ -255,7 +258,7 @@ class Scrib_lib_ustring_gsub_mgr {
if (limit > -1 && repl_count == limit) break;
Regx_match rslt = rslts[i];
tmp_bfr.Add_str_u8(String_.Mid(text, pos, rslt.Find_bgn())); // NOTE: regx returns char pos (not bry); must add as String, not bry; DATE:2013-07-17
if (!Exec_repl_itm(tmp_bfr, repl_tid, repl_bry, text, rslt)) { // will be false when gsub_proc returns nothing; PAGE:en.d:tracer DATE:2017-04-22
if (!Exec_repl_itm(tmp_bfr, repl_tid, repl_bry, text, rslt)) { // will be false when gsub_proc returns nothing; PAGE:en.d:tracer PAGE:en.d:שלום DATE:2017-04-22;
tmp_bfr.Add_str_u8(String_.Mid(text, rslt.Find_bgn(), rslt.Find_end()));
}
pos = rslt.Find_end();

View File

@ -38,11 +38,14 @@ public class Scrib_lib_ustring__find__tst {
@Test public void Return_int() { // PURPOSE: run find on text "a" with pattern "()" starting at position 2 (plain = N); the fixture is handed 2, 1, 2 as trailing values
fxt.Test__proc__kvps__vals(lib, Scrib_lib_ustring.Invk_find, Scrib_kv_utl_.base1_many_("a", "()", 2, Bool_.N), 2, 1, 2); // NOTE(review): presumably (bgn, end, position-capture) with the capture returned as an int, per the test name -- confirm against Test__proc__kvps__vals
}
// DELETE: no longer needed after tombstoning surrogate logic; DATE:2017-04-23
// @Test public void Surrogate() { // PURPOSE: handle surrogates in Find PAGE:zh.w:南北鐵路_(越南); DATE:2014-08-28
// Exec_find("aé𡼾\nbî𡼾\n" , "\n" , 1, Bool_.N, "4;4"); // 4 b/c \n starts at pos 4 (super 1)
// Exec_find("aé𡼾\nbî𡼾\n" , "\n" , 5, Bool_.N, "8;8"); // 8 b/c \n starts at pos 8 (super 1)
// }
@Test public void Surrogate__find__value() { // PURPOSE: find must report char positions when text contains a surrogate pair; PAGE:zh.w:南北鐵路_(越南); DATE:2014-08-28
	String text = "aé𡼾\nbî𡼾\n";	// each line is 3 chars + \n when 𡼾 is counted as one char
	Exec_find(text, "\n", 1, Bool_.N, "4;4");	// searching from char 1: first \n is the 4th char (base 1)
	Exec_find(text, "\n", 5, Bool_.N, "8;8");	// searching from char 5: next \n is the 8th char (base 1)
}
@Test public void Surrogate__find__empty() { // PURPOSE: empty regx on text containing surrogate pairs; expects (bgn, bgn - 1)
	String text = "aé𡼾\nbî𡼾\n";
	Exec_find(text, "", 1, Bool_.N, "1;0");	// bgn = 1, so expected is "1;0"
	// NOTE(review): disabled case below; its "8;8" expectation looks copied from the "\n" test and presumably should follow (bgn, bgn - 1) -- confirm before enabling
	// Exec_find(text, "", 5, Bool_.N, "8;8");
}
private void Exec_find(String text, String regx, int bgn, boolean plain, String expd) { // PURPOSE: forward (text, regx, bgn, plain) to Invk_find through the fixture and check the result against expd
	fxt.Test__proc__kvps__flat
		( lib
		, Scrib_lib_ustring.Invk_find
		, Scrib_kv_utl_.base1_many_(text, regx, bgn, plain)
		, expd
		);
}