1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2026-03-02 03:49:30 +00:00

Scribunto: Classify 3 byte UTF-8 sequences as 1 Java char, not 2 Java chars [#377]

This commit is contained in:
gnosygnu
2019-03-04 23:16:43 -05:00
parent beab14117e
commit 790e82ac9e
4 changed files with 14 additions and 9 deletions

View File

@@ -55,7 +55,7 @@ public class Utf16_mapper {
// get lengths
int cur_len_in_bytes = Utf8_.Len_of_char_by_1st_byte(src_bry[pos_in_bytes]);
- int cur_len_in_chars = cur_len_in_bytes > 2 ? 2 : 1;
+ int cur_len_in_chars = cur_len_in_bytes == 4 ? 2 : 1; // NOTE: 3 bytes represent up to U+FFFF (65,536) which will fit in 1 char; REF:en.w:UTF-8; ISSUE#:377; DATE:2019-03-04
// increment
pos_in_bytes += cur_len_in_bytes;

View File

@@ -21,8 +21,8 @@ public class Utf16_mapper_tst {
fxt.Test__map("a¢€𤭢"
, Int_ary_.New( 0, 1, -1, 2, -1, -1, 3, -1, -1, -1, 4)
, Int_ary_.New( 0, 1, 3, 6, 10, -1, -1, -1, -1, -1, -1)
- , Int_ary_.New( 0, 1, 2, -1, 3, -1, 4, -1, -1, -1, -1)
- , Int_ary_.New( 0, 1, 2, 4, 6, -1, -1, -1, -1, -1, -1)
+ , Int_ary_.New( 0, 1, 2, 3, -1, 4, -1, -1, -1, -1, -1)
+ , Int_ary_.New( 0, 1, 2, 3, 5, -1, -1, -1, -1, -1, -1)
);
}
}