1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2024-09-28 14:30:51 +00:00

Scribunto: Classify 3 byte UTF-8 sequences as 1 Java char, not 2 Java chars [#377]

This commit is contained in:
gnosygnu 2019-03-04 23:16:43 -05:00
parent beab14117e
commit 790e82ac9e
4 changed files with 14 additions and 9 deletions

View File

@ -55,7 +55,7 @@ public class Utf16_mapper {
// get lengths
int cur_len_in_bytes = Utf8_.Len_of_char_by_1st_byte(src_bry[pos_in_bytes]);
int cur_len_in_chars = cur_len_in_bytes > 2 ? 2 : 1;
int cur_len_in_chars = cur_len_in_bytes == 4 ? 2 : 1; // NOTE: 3 bytes represent up to U+FFFF (65,536) which will fit in 1 char; REF:en.w:UTF-8; ISSUE#:377; DATE:2019-03-04
// increment
pos_in_bytes += cur_len_in_bytes;

View File

@ -21,8 +21,8 @@ public class Utf16_mapper_tst {
fxt.Test__map("a¢€𤭢"
, Int_ary_.New( 0, 1, -1, 2, -1, -1, 3, -1, -1, -1, 4)
, Int_ary_.New( 0, 1, 3, 6, 10, -1, -1, -1, -1, -1, -1)
, Int_ary_.New( 0, 1, 2, -1, 3, -1, 4, -1, -1, -1, -1)
, Int_ary_.New( 0, 1, 2, 4, 6, -1, -1, -1, -1, -1, -1)
, Int_ary_.New( 0, 1, 2, 3, -1, 4, -1, -1, -1, -1, -1)
, Int_ary_.New( 0, 1, 2, 3, 5, -1, -1, -1, -1, -1, -1)
);
}
}

View File

@ -106,8 +106,8 @@ public class Scrib_lib_ustring implements Scrib_lib {
// add to tmp_list
Regx_match match = regx_rslts[0]; // NOTE: take only 1st result; DATE:2014-08-27
List_adp tmp_list = List_adp_.New();
tmp_list.Add(text_map.Get_code_for_char_or_neg1(match.Find_bgn()) + Scrib_lib_ustring.Base1);
tmp_list.Add(text_map.Get_code_for_char_or_neg1(match.Find_end()) + Scrib_lib_ustring.Base1 - Scrib_lib_ustring.End_adj);
tmp_list.Add(text_map.Get_code_for_char_or_fail(match.Find_bgn()) + Scrib_lib_ustring.Base1);
tmp_list.Add(text_map.Get_code_for_char_or_fail(match.Find_end()) + Scrib_lib_ustring.Base1 - Scrib_lib_ustring.End_adj);
AddCapturesFromMatch(tmp_list, match, text_str, regx_converter.Capt_ary(), false);
return rslt.Init_many_list(tmp_list);
}

View File

@ -21,6 +21,7 @@ public class Scrib_lib_ustring__find__tst {
private final Scrib_lib_ustring__find__fxt fxt = new Scrib_lib_ustring__find__fxt();
@Test public void Plain() {
fxt.Test__find("aabaab" , "b" , 2, Bool_.Y, "3;3"); // bytes=1
fxt.Test__find("¢¢b¢¢b" , "b" , 2, Bool_.Y, "3;3"); // bytes=2
fxt.Test__find("€€b€€b" , "b" , 2, Bool_.Y, "3;3"); // bytes=3
fxt.Test__find("𤭢𤭢b𤭢𤭢b" , "b" , 2, Bool_.Y, "3;3"); // bytes=4
fxt.Test__find("()()" , "(" , 2, Bool_.Y, "3;3"); // exact match; note that "(" is invalid regx
@ -33,10 +34,14 @@ public class Scrib_lib_ustring__find__tst {
fxt.Test__find("𤭢" , "𤭢" , -1, Bool_.Y, "1;1"); // fails if "" b/c it would have counted -1 as -1 char instead of -1 codepoint
}
@Test public void Regx__simple() {
fxt.Test__find("abcd" , "b" , 1, Bool_.N, "2;2"); // basic
fxt.Test__find("abad" , "a" , 2, Bool_.N, "3;3"); // bgn
fxt.Test__find("abcd" , "x" , 1, Bool_.N, ""); // no-match
fxt.Test__find("abcd" , "" , 2, Bool_.N, "2;1"); // empty regx should return values; regx; EX:w:Fool's_mate; DATE:2014-03-04
fxt.Test__find("aabaab" , "b" , 2, Bool_.N, "3;3"); // bytes=1
fxt.Test__find("¢¢b¢¢b" , "b" , 2, Bool_.N, "3;3"); // bytes=2
fxt.Test__find("€€b€€b" , "b" , 2, Bool_.N, "3;3"); // bytes=3
fxt.Test__find("𤭢𤭢b𤭢𤭢b" , "b" , 2, Bool_.N, "3;3"); // bytes=4
fxt.Test__find("abcd" , "b" , 1, Bool_.N, "2;2"); // basic
fxt.Test__find("abad" , "a" , 2, Bool_.N, "3;3"); // bgn
fxt.Test__find("abcd" , "x" , 1, Bool_.N, ""); // no-match
fxt.Test__find("abcd" , "" , 2, Bool_.N, "2;1"); // empty regx should return values; regx; EX:w:Fool's_mate; DATE:2014-03-04
}
@Test public void Regx__int() { // PURPOSE: allow int find; PAGE:ro.w:Innsbruck DATE:2015-09-12
fxt.Test__find(123 , "2" , 1, Bool_.N, "2;2");