mirror of
https://github.com/gnosygnu/xowa.git
synced 2024-10-27 20:34:16 +00:00
Scribunto: Classify 3-byte UTF-8 sequences as 1 Java char, not 2 Java chars [#377]
This commit is contained in:
parent
beab14117e
commit
790e82ac9e
@ -55,7 +55,7 @@ public class Utf16_mapper {
|
||||
|
||||
// get lengths
|
||||
int cur_len_in_bytes = Utf8_.Len_of_char_by_1st_byte(src_bry[pos_in_bytes]);
|
||||
int cur_len_in_chars = cur_len_in_bytes > 2 ? 2 : 1;
|
||||
int cur_len_in_chars = cur_len_in_bytes == 4 ? 2 : 1; // NOTE: 3 bytes represent up to U+FFFF (65,536) which will fit in 1 char; REF:en.w:UTF-8; ISSUE#:377; DATE:2019-03-04
|
||||
|
||||
// increment
|
||||
pos_in_bytes += cur_len_in_bytes;
|
||||
|
@ -21,8 +21,8 @@ public class Utf16_mapper_tst {
|
||||
fxt.Test__map("a¢€𤭢"
|
||||
, Int_ary_.New( 0, 1, -1, 2, -1, -1, 3, -1, -1, -1, 4)
|
||||
, Int_ary_.New( 0, 1, 3, 6, 10, -1, -1, -1, -1, -1, -1)
|
||||
, Int_ary_.New( 0, 1, 2, -1, 3, -1, 4, -1, -1, -1, -1)
|
||||
, Int_ary_.New( 0, 1, 2, 4, 6, -1, -1, -1, -1, -1, -1)
|
||||
, Int_ary_.New( 0, 1, 2, 3, -1, 4, -1, -1, -1, -1, -1)
|
||||
, Int_ary_.New( 0, 1, 2, 3, 5, -1, -1, -1, -1, -1, -1)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
@ -106,8 +106,8 @@ public class Scrib_lib_ustring implements Scrib_lib {
|
||||
// add to tmp_list
|
||||
Regx_match match = regx_rslts[0]; // NOTE: take only 1st result; DATE:2014-08-27
|
||||
List_adp tmp_list = List_adp_.New();
|
||||
tmp_list.Add(text_map.Get_code_for_char_or_neg1(match.Find_bgn()) + Scrib_lib_ustring.Base1);
|
||||
tmp_list.Add(text_map.Get_code_for_char_or_neg1(match.Find_end()) + Scrib_lib_ustring.Base1 - Scrib_lib_ustring.End_adj);
|
||||
tmp_list.Add(text_map.Get_code_for_char_or_fail(match.Find_bgn()) + Scrib_lib_ustring.Base1);
|
||||
tmp_list.Add(text_map.Get_code_for_char_or_fail(match.Find_end()) + Scrib_lib_ustring.Base1 - Scrib_lib_ustring.End_adj);
|
||||
AddCapturesFromMatch(tmp_list, match, text_str, regx_converter.Capt_ary(), false);
|
||||
return rslt.Init_many_list(tmp_list);
|
||||
}
|
||||
|
@ -21,6 +21,7 @@ public class Scrib_lib_ustring__find__tst {
|
||||
private final Scrib_lib_ustring__find__fxt fxt = new Scrib_lib_ustring__find__fxt();
|
||||
@Test public void Plain() {
|
||||
fxt.Test__find("aabaab" , "b" , 2, Bool_.Y, "3;3"); // bytes=1
|
||||
fxt.Test__find("¢¢b¢¢b" , "b" , 2, Bool_.Y, "3;3"); // bytes=2
|
||||
fxt.Test__find("€€b€€b" , "b" , 2, Bool_.Y, "3;3"); // bytes=3
|
||||
fxt.Test__find("𤭢𤭢b𤭢𤭢b" , "b" , 2, Bool_.Y, "3;3"); // bytes=4
|
||||
fxt.Test__find("()()" , "(" , 2, Bool_.Y, "3;3"); // exact match; note that "(" is invalid regx
|
||||
@ -33,10 +34,14 @@ public class Scrib_lib_ustring__find__tst {
|
||||
fxt.Test__find("𤭢" , "𤭢" , -1, Bool_.Y, "1;1"); // fails if "" b/c it would have counted -1 as -1 char instead of -1 codepoint
|
||||
}
|
||||
@Test public void Regx__simple() {
|
||||
fxt.Test__find("abcd" , "b" , 1, Bool_.N, "2;2"); // basic
|
||||
fxt.Test__find("abad" , "a" , 2, Bool_.N, "3;3"); // bgn
|
||||
fxt.Test__find("abcd" , "x" , 1, Bool_.N, ""); // no-match
|
||||
fxt.Test__find("abcd" , "" , 2, Bool_.N, "2;1"); // empty regx should return values; regx; EX:w:Fool's_mate; DATE:2014-03-04
|
||||
fxt.Test__find("aabaab" , "b" , 2, Bool_.N, "3;3"); // bytes=1
|
||||
fxt.Test__find("¢¢b¢¢b" , "b" , 2, Bool_.N, "3;3"); // bytes=2
|
||||
fxt.Test__find("€€b€€b" , "b" , 2, Bool_.N, "3;3"); // bytes=3
|
||||
fxt.Test__find("𤭢𤭢b𤭢𤭢b" , "b" , 2, Bool_.N, "3;3"); // bytes=4
|
||||
fxt.Test__find("abcd" , "b" , 1, Bool_.N, "2;2"); // basic
|
||||
fxt.Test__find("abad" , "a" , 2, Bool_.N, "3;3"); // bgn
|
||||
fxt.Test__find("abcd" , "x" , 1, Bool_.N, ""); // no-match
|
||||
fxt.Test__find("abcd" , "" , 2, Bool_.N, "2;1"); // empty regx should return values; regx; EX:w:Fool's_mate; DATE:2014-03-04
|
||||
}
|
||||
@Test public void Regx__int() { // PURPOSE: allow int find; PAGE:ro.w:Innsbruck DATE:2015-09-12
|
||||
fxt.Test__find(123 , "2" , 1, Bool_.N, "2;2");
|
||||
|
Loading…
Reference in New Issue
Block a user