Json: Parse surrogate-pairs correctly in unicode-escaped strings [#487]

pull/620/head
gnosygnu 5 years ago
parent 641a17621f
commit 43cc4b15e8

@ -132,4 +132,13 @@ public class Utf16_ {
else if (c < 65536) return 3; // 1 << 16
else throw Err_.new_wo_type("UTF-16 int must be between 0 and 65536", "char", c);
}
// UTF-16 surrogate boundaries used when detecting and merging surrogate pairs.
// High surrogates span 0xD800-0xDBFF and low surrogates span 0xDC00-0xDFFF;
// a high / low pair together represents one code point in 0x010000-0x10FFFF.
public static final int // REF: https://en.wikipedia.org/wiki/Universal_Character_Set_characters
Surrogate_hi_bgn = 0xD800 // 55,296: Surrogate high start (inclusive)
, Surrogate_hi_end = 0xDBFF // 56,319: Surrogate high end (inclusive)
, Surrogate_lo_bgn = 0xDC00 // 56,320: Surrogate low start (inclusive)
, Surrogate_lo_end = 0xDFFF // 57,343: Surrogate low end (inclusive)
, Surrogate_cp_bgn = 0x010000 // 65,536: Surrogate codepoint start; first code point that needs a surrogate pair
, Surrogate_cp_end = 0x10FFFF // 1,114,111: Surrogate codepoint end; last code point reachable by a surrogate pair
, Surrogate_range = 0x400 // 1,024: Surrogate range; count of values in each of the high / low ranges (end - start + 1)
;
}

@ -14,6 +14,7 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.langs.jsons; import gplx.*; import gplx.langs.*;
import gplx.core.intls.*;
public class Json_itm_str extends Json_itm_base {
private final boolean exact; private final Json_doc doc;
private String data_str; private byte[] data_bry = null;
@ -56,11 +57,28 @@ public class Json_itm_str extends Json_itm_base {
case Byte_ascii.Ltr_b: bfr.Add_byte(Byte_ascii.Backfeed); break;
case Byte_ascii.Ltr_f: bfr.Add_byte(Byte_ascii.Formfeed); break;
case Byte_ascii.Ltr_u:
int utf8_val = gplx.core.encoders.Hex_utl_.Parse_or(src, i + 1, i + 5, -1);
i += 1; // +1 to skip "u"
int utf8_val = gplx.core.encoders.Hex_utl_.Parse_or(src, i, i + 4, -1);
// check for UTF surrogate-pairs; ISSUE#:487; DATE:2019-06-02
// hi: 0xD800-0xDBFF; 55,296-56,319
if (utf8_val >= Utf16_.Surrogate_hi_bgn && utf8_val <= Utf16_.Surrogate_hi_end) {
int lo_bgn = i + 4; // +4 to skip 4 hex-dec chars
if (lo_bgn + 6 <= end // +6 to handle encoded String; EX: '\u0022'
&& src[lo_bgn] == Byte_ascii.Backslash
&& src[lo_bgn + 1] == Byte_ascii.Ltr_u) {
lo_bgn = lo_bgn + 2; // +2 to skip '\' and 'u'
int lo = gplx.core.encoders.Hex_utl_.Parse_or(src, lo_bgn, lo_bgn + 4, -1);
// lo: 0xDC00-0xDFFF; 56,320-57,343
if (lo >= Utf16_.Surrogate_lo_bgn && lo <= Utf16_.Surrogate_lo_end) {
utf8_val = Utf16_.Surrogate_merge(utf8_val, lo);
i += 6; // +6 to skip entire lo-String; EX: '\u0022'
}
}
}
int len = gplx.core.intls.Utf16_.Encode_int(utf8_val, utf8_bry, 0);
bfr.Add_mid(utf8_bry, 0, len);
i += 4;
break; // \uFFFF 4 hex-dec
i += 3; // +3 b/c for-loop will do another +1 to bring total to 4; EX: '0022'
break;
case Byte_ascii.Backslash:
case Byte_ascii.Slash:
default:

@ -25,7 +25,8 @@ public class Json_parser_tst {
@Test public void Num_neg() {fxt.Test_parse_val0("{'k0':-123}" , -123);}
@Test public void Str() {fxt.Test_parse_val0("{'k0':'v0'}" , "v0");}
@Test public void Str_esc_quote() {fxt.Test_parse_val0("{'k0':'a\\\"b'}" , "a\"b");}
@Test public void Str_esc_hex4() {fxt.Test_parse_val0("{'k0':'a\\u0021b'}" , "a!b");}
@Test public void Str_encoded_basic() {fxt.Test_parse_val0("{'k0':'a\\u0021b'}" , "a!b");}
@Test public void Str_encoded_surrogate() {fxt.Test_parse_val0("{'k0':'a\\ud83c\\udf0eb'}", "a🌎b");} // check for UTF surrogate-pairs; symbol is earth globe americas (U+1F30E); ISSUE#:487; DATE:2019-06-02
@Test public void Num_dec() {fxt.Test_parse("{'k0':1.23}" , fxt.itm_nde_().Add_many(fxt.itm_kv_dec_("k0", "1.23")));}
@Test public void Num_exp() {fxt.Test_parse("{'k0':1e+2}" , fxt.itm_nde_().Add_many(fxt.itm_kv_dec_("k0", "1e+2")));}
@Test public void Num_mix() {fxt.Test_parse("{'k0':-1.23e-1}" , fxt.itm_nde_().Add_many(fxt.itm_kv_dec_("k0", "-1.23e-1")));}

@ -46,6 +46,6 @@ public class Ustring_ {
, Surrogate_lo_end = 0xDFFF // 57,343: Surrogate low end
, Surrogate_cp_bgn = 0x010000 // 65,536: Surrogate codepoint start
, Surrogate_cp_end = 0x10FFFF // 1,114,111: Surrogate codepoint end
, Surrogate_range = 0x400 // 1,024: Surrogate range (end - start) for high / low
, Surrogate_range = 0x400 // 1,024: Surrogate range (end - start) for high / low
;
}

Loading…
Cancel
Save