mirror of
https://github.com/gnosygnu/xowa.git
synced 2024-10-27 20:34:16 +00:00
Json: Parse surrogate-pairs correctly in unicode-escaped strings [#487]
This commit is contained in:
parent
641a17621f
commit
43cc4b15e8
@ -132,4 +132,13 @@ public class Utf16_ {
|
||||
else if (c < 65536) return 3; // 1 << 16
|
||||
else throw Err_.new_wo_type("UTF-16 int must be between 0 and 65536", "char", c);
|
||||
}
|
||||
// UTF-16 surrogate boundaries (inclusive) and the code-point range that surrogate pairs encode.
public static final int // REF: https://en.wikipedia.org/wiki/Universal_Character_Set_characters
|
||||
Surrogate_hi_bgn = 0xD800 // 55,296: Surrogate high start
|
||||
, Surrogate_hi_end = 0xDBFF // 56,319: Surrogate high end
|
||||
, Surrogate_lo_bgn = 0xDC00 // 56,320: Surrogate low start
|
||||
, Surrogate_lo_end = 0xDFFF // 57,343: Surrogate low end
|
||||
, Surrogate_cp_bgn = 0x010000 // 65,536: Surrogate codepoint start
|
||||
, Surrogate_cp_end = 0x10FFFF // 1,114,111: Surrogate codepoint end
|
||||
, Surrogate_range = 0x400 // 1,024: number of values in each of the high / low surrogate ranges (end - start + 1)
|
||||
;
|
||||
}
|
||||
|
@ -14,6 +14,7 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.langs.jsons; import gplx.*; import gplx.langs.*;
|
||||
import gplx.core.intls.*;
|
||||
public class Json_itm_str extends Json_itm_base {
|
||||
private final boolean exact; private final Json_doc doc;
|
||||
private String data_str; private byte[] data_bry = null;
|
||||
@ -56,11 +57,28 @@ public class Json_itm_str extends Json_itm_base {
|
||||
case Byte_ascii.Ltr_b: bfr.Add_byte(Byte_ascii.Backfeed); break;
|
||||
case Byte_ascii.Ltr_f: bfr.Add_byte(Byte_ascii.Formfeed); break;
|
||||
case Byte_ascii.Ltr_u:
|
||||
int utf8_val = gplx.core.encoders.Hex_utl_.Parse_or(src, i + 1, i + 5, -1);
|
||||
i += 1; // +1 to skip "u"
|
||||
int utf8_val = gplx.core.encoders.Hex_utl_.Parse_or(src, i, i + 4, -1);
|
||||
// check for UTF surrogate-pairs; ISSUE#:487; DATE:2019-06-02
|
||||
// hi: 0xD800-0xDBFF; 55,296-56,319
|
||||
if (utf8_val >= Utf16_.Surrogate_hi_bgn && utf8_val <= Utf16_.Surrogate_hi_end) {
|
||||
int lo_bgn = i + 4; // +4 to skip 4 hex-dec chars
|
||||
if (lo_bgn + 6 <= end // +6 to handle encoded String; EX: '\u0022'
|
||||
&& src[lo_bgn] == Byte_ascii.Backslash
|
||||
&& src[lo_bgn + 1] == Byte_ascii.Ltr_u) {
|
||||
lo_bgn = lo_bgn + 2; // +2 to skip '\' and 'u'
|
||||
int lo = gplx.core.encoders.Hex_utl_.Parse_or(src, lo_bgn, lo_bgn + 4, -1);
|
||||
// lo: 0xDC00-0xDFFF; 56,320-57,343
|
||||
if (lo >= Utf16_.Surrogate_lo_bgn && lo <= Utf16_.Surrogate_lo_end) {
|
||||
utf8_val = Utf16_.Surrogate_merge(utf8_val, lo);
|
||||
i += 6; // +6 to skip entire lo-String; EX: '\u0022'
|
||||
}
|
||||
}
|
||||
}
|
||||
int len = gplx.core.intls.Utf16_.Encode_int(utf8_val, utf8_bry, 0);
|
||||
bfr.Add_mid(utf8_bry, 0, len);
|
||||
i += 4;
|
||||
break; // \uFFFF 4 hex-dec
|
||||
i += 3; // +3 b/c for-loop will do another +1 to bring total to 4; EX: '0022'
|
||||
break;
|
||||
case Byte_ascii.Backslash:
|
||||
case Byte_ascii.Slash:
|
||||
default:
|
||||
|
@ -25,7 +25,8 @@ public class Json_parser_tst {
|
||||
@Test public void Num_neg() {fxt.Test_parse_val0("{'k0':-123}" , -123);}
|
||||
@Test public void Str() {fxt.Test_parse_val0("{'k0':'v0'}" , "v0");}
|
||||
@Test public void Str_esc_quote() {fxt.Test_parse_val0("{'k0':'a\\\"b'}" , "a\"b");}
|
||||
@Test public void Str_esc_hex4() {fxt.Test_parse_val0("{'k0':'a\\u0021b'}" , "a!b");}
|
||||
@Test public void Str_encoded_basic() {fxt.Test_parse_val0("{'k0':'a\\u0021b'}" , "a!b");}
|
||||
@Test public void Str_encoded_surrogate() {fxt.Test_parse_val0("{'k0':'a\\ud83c\\udf0eb'}", "a🌎b");} // check for UTF surrogate-pairs; symbol is earth globe americas (U+1F30E); ISSUE#:487; DATE:2019-06-02
|
||||
@Test public void Num_dec() {fxt.Test_parse("{'k0':1.23}" , fxt.itm_nde_().Add_many(fxt.itm_kv_dec_("k0", "1.23")));}
|
||||
@Test public void Num_exp() {fxt.Test_parse("{'k0':1e+2}" , fxt.itm_nde_().Add_many(fxt.itm_kv_dec_("k0", "1e+2")));}
|
||||
@Test public void Num_mix() {fxt.Test_parse("{'k0':-1.23e-1}" , fxt.itm_nde_().Add_many(fxt.itm_kv_dec_("k0", "-1.23e-1")));}
|
||||
|
Loading…
Reference in New Issue
Block a user