From 5d886501e89a5a90b68d2dea69d5af4487938deb Mon Sep 17 00:00:00 2001 From: gnosygnu Date: Sun, 23 Jun 2019 21:35:21 -0400 Subject: [PATCH] Language: Escape left-to-right / right-to-left marks in names.json ('\xE2\x80\x8E' to '\u200E') [#501] --- 100_core/src/gplx/Byte_ascii.java | 2 +- 100_core/src/gplx/core/encoders/Hex_utl_.java | 22 +-- 100_core/src/gplx/core/encoders/Oct_utl_.java | 34 +++++ .../src/gplx/langs/jsons/Json_doc_wtr.java | 53 ++++++- .../gplx/langs/jsons/Json_doc_wtr_tst.java | 21 +++ .../src/gplx/langs/phps/Php_evaluator.java | 12 +- .../src/gplx/langs/phps/Php_parser_fxt.java | 9 ++ .../src/gplx/langs/phps/Php_parser_tst.java | 26 +++- .../src/gplx/langs/phps/Php_quote_parser.java | 132 ++++++++++++++++++ .../src/gplx/langs/phps/Php_tkn_base.java | 5 +- .../xtns/cldrs/Language_names_converter.java | 1 + res/bin/any/xowa/cfg/lang/data/names.json | 54 +++---- 12 files changed, 326 insertions(+), 45 deletions(-) create mode 100644 100_core/src/gplx/core/encoders/Oct_utl_.java create mode 100644 400_xowa/src/gplx/langs/phps/Php_quote_parser.java diff --git a/100_core/src/gplx/Byte_ascii.java b/100_core/src/gplx/Byte_ascii.java index 4ed86de51..721e63fd7 100644 --- a/100_core/src/gplx/Byte_ascii.java +++ b/100_core/src/gplx/Byte_ascii.java @@ -17,7 +17,7 @@ package gplx; public class Byte_ascii { public static final byte Null = 0 , Backfeed = 8, Tab = 9 - , Nl = 10, Formfeed = 12, Cr = 13 + , Nl = 10, Vertical_tab = 11, Formfeed = 12, Cr = 13 , Escape = 27 , Space = 32, Bang = 33, Quote = 34 , Hash = 35, Dollar = 36, Percent = 37, Amp = 38, Apos = 39 diff --git a/100_core/src/gplx/core/encoders/Hex_utl_.java b/100_core/src/gplx/core/encoders/Hex_utl_.java index ee8741835..859a90f55 100644 --- a/100_core/src/gplx/core/encoders/Hex_utl_.java +++ b/100_core/src/gplx/core/encoders/Hex_utl_.java @@ -128,18 +128,22 @@ public class Hex_utl_ { } public static boolean Is_hex_many(byte... 
ary) { for (byte itm : ary) { - switch (itm) { - case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4: - case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9: - case Byte_ascii.Ltr_A: case Byte_ascii.Ltr_B: case Byte_ascii.Ltr_C: case Byte_ascii.Ltr_D: case Byte_ascii.Ltr_E: case Byte_ascii.Ltr_F: - case Byte_ascii.Ltr_a: case Byte_ascii.Ltr_b: case Byte_ascii.Ltr_c: case Byte_ascii.Ltr_d: case Byte_ascii.Ltr_e: case Byte_ascii.Ltr_f: - break; - default: - return false; - } + if (!Is_hex(itm)) + return false; } return true; } + public static boolean Is_hex(byte itm) { + switch (itm) { + case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4: + case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9: + case Byte_ascii.Ltr_A: case Byte_ascii.Ltr_B: case Byte_ascii.Ltr_C: case Byte_ascii.Ltr_D: case Byte_ascii.Ltr_E: case Byte_ascii.Ltr_F: + case Byte_ascii.Ltr_a: case Byte_ascii.Ltr_b: case Byte_ascii.Ltr_c: case Byte_ascii.Ltr_d: case Byte_ascii.Ltr_e: case Byte_ascii.Ltr_f: + return true; + default: + return false; + } + } private static int To_int(char c) { switch (c) { case '0': return 0; case '1': return 1; case '2': return 2; case '3': return 3; case '4': return 4; diff --git a/100_core/src/gplx/core/encoders/Oct_utl_.java b/100_core/src/gplx/core/encoders/Oct_utl_.java new file mode 100644 index 000000000..db60d1e29 --- /dev/null +++ b/100_core/src/gplx/core/encoders/Oct_utl_.java @@ -0,0 +1,34 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012-2017 gnosygnu@gmail.com + +XOWA is licensed under the terms of the General Public License (GPL) Version 3, +or alternatively under the terms of the Apache License Version 2.0. 
+ +You may use XOWA according to either of these licenses as is most appropriate +for your project on a case-by-case basis. + +The terms of each license can be found in the source code repository: + +GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt +Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt +*/ +package gplx.core.encoders; import gplx.*; import gplx.core.*;
+public class Oct_utl_ { // parses runs of ASCII octal digits ('0'-'7') into an int
+    // parses all of src as octal; returns "or" if any byte is not an octal digit
+    public static int Parse_or(byte[] src, int or) {
+        return Parse_or(src, 0, src.length, or);
+    }
+    // parses src[bgn..end) as octal; bgn is inclusive, end is exclusive
+    // returns "or" at the first non-octal byte found while scanning right to left; an empty range returns 0
+    public static int Parse_or(byte[] src, int bgn, int end, int or) {
+        int total = 0;
+        int place = 1; // weight of the digit being read: 1, 8, 64, ...
+        int idx = end;
+        while (--idx >= bgn) { // least-significant digit first
+            int digit = To_digit(src[idx]);
+            if (digit == -1) return or;
+            total += digit * place;
+            place *= 8;
+        }
+        return total;
+    }
+    // maps an ASCII octal digit to its value; -1 for any other byte
+    private static int To_digit(byte b) {
+        switch (b) {
+            case Byte_ascii.Num_0: return 0;
+            case Byte_ascii.Num_1: return 1;
+            case Byte_ascii.Num_2: return 2;
+            case Byte_ascii.Num_3: return 3;
+            case Byte_ascii.Num_4: return 4;
+            case Byte_ascii.Num_5: return 5;
+            case Byte_ascii.Num_6: return 6;
+            case Byte_ascii.Num_7: return 7;
+            default: return -1;
+        }
+    }
+}
diff --git a/400_xowa/src/gplx/langs/jsons/Json_doc_wtr.java b/400_xowa/src/gplx/langs/jsons/Json_doc_wtr.java index b2793ce6a..63658aeb9 100644 --- a/400_xowa/src/gplx/langs/jsons/Json_doc_wtr.java +++ b/400_xowa/src/gplx/langs/jsons/Json_doc_wtr.java @@ -14,9 +14,12 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt */ package gplx.langs.jsons; import gplx.*; import gplx.langs.*; +import gplx.objects.strings.unicodes.*; +import gplx.core.encoders.*; public class Json_doc_wtr { private int indent = -2; private Bry_bfr bfr = Bry_bfr_.Reset(255); + public void Opt_unicode_y_() {opt_unicode = true;} private boolean opt_unicode; public Json_doc_wtr Indent() {return Indent(indent);} private 
Json_doc_wtr Indent(int v) {if (v > 0) bfr.Add_byte_repeat(Byte_ascii.Space, v); return this;} public Json_doc_wtr Indent_add() {indent += 2; return this;} @@ -31,11 +34,59 @@ public class Json_doc_wtr { bfr.Add(Object_.Bry__null); else { bfr.Add_byte(Byte_ascii.Quote); - bfr.Add_bry_escape(Byte_ascii.Quote, Escaped__quote, v, 0, v.length); + if (opt_unicode) { + Ustring ustr = Ustring_.New_codepoints(String_.new_u8(v)); + int ustr_len = ustr.Len_in_data(); + for (int i = 0; i < ustr_len; i++) { + int cp = ustr.Get_data(i); + Write_str_codepoint(bfr, cp); + } + } + else { + bfr.Add_bry_escape(Byte_ascii.Quote, Escaped__quote, v, 0, v.length); + } bfr.Add_byte(Byte_ascii.Quote); } return this; }
+    // writes one code point of a JSON string literal to bfr; REF: https://www.json.org/
+    // - quote, backslash, backspace, formfeed, newline, carriage-return and tab use their two-character escape
+    // - other control characters (below space), nbsp and the left-to-right / right-to-left marks use the 4-digit hex escape
+    // - everything else is written as UTF-8
+    private void Write_str_codepoint(Bry_bfr bfr, int val) {
+        // step 1: look for a two-character escape; Byte_ascii.Null means "none"
+        byte escape_ltr;
+        switch (val) {
+            case Byte_ascii.Quote:     escape_ltr = Byte_ascii.Quote; break;
+            case Byte_ascii.Backslash: escape_ltr = Byte_ascii.Backslash; break;
+            case Byte_ascii.Backfeed:  escape_ltr = Byte_ascii.Ltr_b; break;
+            case Byte_ascii.Formfeed:  escape_ltr = Byte_ascii.Ltr_f; break;
+            case Byte_ascii.Nl:        escape_ltr = Byte_ascii.Ltr_n; break;
+            case Byte_ascii.Cr:        escape_ltr = Byte_ascii.Ltr_r; break;
+            case Byte_ascii.Tab:       escape_ltr = Byte_ascii.Ltr_t; break;
+            default:                   escape_ltr = Byte_ascii.Null; break;
+        }
+        if (escape_ltr != Byte_ascii.Null) {
+            bfr.Add_byte_backslash().Add_byte(escape_ltr);
+            return;
+        }
+
+        // step 2: invisible characters are written as backslash + "u" + 4 hex digits so they stay visible in the file
+        boolean write_as_hex
+            =  val <  Byte_ascii.Space // control characters
+            || val == 0x00A0           // nbsp (160)
+            || val == 0x200E           // left to right (8206)
+            || val == 0x200F;          // right to left (8207)
+        if (write_as_hex) {
+            bfr.Add_byte_backslash().Add_byte(Byte_ascii.Ltr_u).Add_str_a7(Hex_utl_.To_str(val, 4));
+        }
+        else {
+            bfr.Add_u8_int(val);
+        }
+    }
public Json_doc_wtr Int(int v) {bfr.Add_int_variable(v); return this;} public Json_doc_wtr Double(double v) {bfr.Add_double(v); return this;} public Json_doc_wtr Comma() {Indent(); bfr.Add_byte(Byte_ascii.Comma).Add_byte_nl(); 
return this;} diff --git a/400_xowa/src/gplx/langs/jsons/Json_doc_wtr_tst.java b/400_xowa/src/gplx/langs/jsons/Json_doc_wtr_tst.java index daa663e40..c15465b55 100644 --- a/400_xowa/src/gplx/langs/jsons/Json_doc_wtr_tst.java +++ b/400_xowa/src/gplx/langs/jsons/Json_doc_wtr_tst.java @@ -25,6 +25,19 @@ public class Json_doc_wtr_tst { , " 'k1':'v\\\"1'" , "}")); } + @Test public void Quotes() { + fxt.Test__string__quotes("a\"z" , "a\\\"z"); + fxt.Test__string__quotes("a\u0008z" , "a\\bz"); + fxt.Test__string__quotes("a\fz" , "a\\fz"); + fxt.Test__string__quotes("a\nz" , "a\\nz"); + fxt.Test__string__quotes("a\rz" , "a\\rz"); + fxt.Test__string__quotes("a\tz" , "a\\tz"); + fxt.Test__string__quotes("aēz" , "aēz"); + fxt.Test__string__quotes("az" , "a\\u000Fz"); + fxt.Test__string__quotes("a z" , "a\\u00A0z"); + fxt.Test__string__quotes("a‎z" , "a\\u200Ez"); + fxt.Test__string__quotes("a‏z" , "a\\u200Fz"); + } } class Json_doc_wtr_fxt { public Json_doc_wtr Exec__Kv_simple(String key, String val) { @@ -40,4 +53,12 @@ class Json_doc_wtr_fxt { public String Exec__Concat_apos(String... 
ary) { return Json_doc.Make_str_by_apos(ary); } + public void Test__string__quotes(String raw, String expd) { + Json_doc_wtr doc_wtr = new Json_doc_wtr(); + doc_wtr.Opt_unicode_y_(); + doc_wtr.Str(Bry_.new_u8(raw)); + String actl = doc_wtr.Bld_as_str(); + actl = String_.Mid(actl, 1, String_.Len(actl) - 1); + Gftest.Eq__str(expd, actl); + } } diff --git a/400_xowa/src/gplx/langs/phps/Php_evaluator.java b/400_xowa/src/gplx/langs/phps/Php_evaluator.java index a1738b83b..f5523f0be 100644 --- a/400_xowa/src/gplx/langs/phps/Php_evaluator.java +++ b/400_xowa/src/gplx/langs/phps/Php_evaluator.java @@ -25,7 +25,11 @@ public class Php_evaluator implements Php_tkn_wkr { private byte mode = Mode_key_bgn, next_tid = 0, next_mode = 0; private Php_line_assign cur_line; private Php_itm_ary cur_ary; private Php_key cur_kv_key; private final List_adp frame_stack = List_adp_.New(); - public Php_evaluator(Gfo_msg_log msg_log) {this.msg_log = msg_log;} private Gfo_msg_log msg_log; + private final Php_quote_parser quote_parser = new Php_quote_parser(); + private final Gfo_msg_log msg_log; + public Php_evaluator(Gfo_msg_log msg_log) { + this.msg_log = msg_log; + } public void Init(Php_ctx ctx) {src = ctx.Src(); frame_stack.Clear();} private byte[] src; public List_adp List() {return lines;} private final List_adp lines = List_adp_.New(); public Gfo_msg_log Msg_log() {return msg_log;} @@ -101,7 +105,7 @@ public class Php_evaluator implements Php_tkn_wkr { switch (tkn_tid) { case Php_tkn_.Tid_quote: Php_tkn_quote tkn_quote = (Php_tkn_quote)tkn; - Php_itm_quote key_sub = new Php_itm_quote(tkn_quote.Quote_text(src)); + Php_itm_quote key_sub = new Php_itm_quote(tkn_quote.Quote_text(quote_parser, src)); cur_line.Key_subs_(new Php_key[] {key_sub}); mode = Mode_key_end; break; @@ -121,7 +125,7 @@ public class Php_evaluator implements Php_tkn_wkr { case Php_tkn_.Tid_quote: Expect(Php_tkn_.Tid_semic, Mode_key_bgn); Php_tkn_quote tkn_quote = (Php_tkn_quote)tkn; - line_val = new 
Php_itm_quote(tkn_quote.Quote_text(src)); + line_val = new Php_itm_quote(tkn_quote.Quote_text(quote_parser, src)); break; case Php_tkn_.Tid_ary: case Php_tkn_.Tid_brack_bgn: @@ -161,7 +165,7 @@ public class Php_evaluator implements Php_tkn_wkr { case Php_tkn_.Tid_true: Ary_add_itm(Php_itm_bool_true.Instance); break; case Php_tkn_.Tid_quote: Php_tkn_quote tkn_quote = (Php_tkn_quote)tkn; - Ary_add_itm(new Php_itm_quote(tkn_quote.Quote_text(src))); + Ary_add_itm(new Php_itm_quote(tkn_quote.Quote_text(quote_parser, src))); break; case Php_tkn_.Tid_num: Php_tkn_num tkn_num = (Php_tkn_num)tkn; diff --git a/400_xowa/src/gplx/langs/phps/Php_parser_fxt.java b/400_xowa/src/gplx/langs/phps/Php_parser_fxt.java index e91276415..50853f5dc 100644 --- a/400_xowa/src/gplx/langs/phps/Php_parser_fxt.java +++ b/400_xowa/src/gplx/langs/phps/Php_parser_fxt.java @@ -67,6 +67,15 @@ class Php_parser_fxt { tst_mgr.Tst_ary("", expd, actl); log_mgr_chkr.tst(tst_mgr, line_wkr.Msg_log()); } + public void Test__string__quotes(String raw, String expd) { + line_wkr.Clear(); + byte[] raw_bry = Bry_.new_u8("$var =\"" + raw +"\";"); + parser.Parse_tkns(raw_bry, line_wkr); + Php_line[] actl_lines = (Php_line[])line_wkr.List().To_ary(Php_line.class); + Php_line_assign actl_line = (Php_line_assign)actl_lines[0]; + Php_itm_quote actl = (Php_itm_quote)actl_line.Val(); + Tfds.Eq_str(expd, String_.new_u8(actl.Val_obj_bry())); + } } abstract class Php_tkn_chkr_base implements Tst_chkr { public abstract byte Tkn_tid(); diff --git a/400_xowa/src/gplx/langs/phps/Php_parser_tst.java b/400_xowa/src/gplx/langs/phps/Php_parser_tst.java index 354f33a88..e3d9300d2 100644 --- a/400_xowa/src/gplx/langs/phps/Php_parser_tst.java +++ b/400_xowa/src/gplx/langs/phps/Php_parser_tst.java @@ -48,10 +48,10 @@ public class Php_parser_tst { @Test public void Ary_flat() {fxt.tst_lines("$a = array('b', 'c', 'd');" , fxt.line_assign("a", fxt.itm_ary().Subs_(fxt.itm_quote("b"), fxt.itm_quote("c"), fxt.itm_quote("d"))));} @Test public 
void Brack_flat() {fxt.tst_lines("$a = ['b', 'c', 'd'];" , fxt.line_assign("a", fxt.itm_ary().Subs_(fxt.itm_quote("b"), fxt.itm_quote("c"), fxt.itm_quote("d"))));} @Test public void Ary_flat_escape() { // PURPOSE.fix: \\' was being interpreted incorrectly; \\ should escape \, but somehow \' was being escaped - fxt.tst_lines("$a = array('b\\\\', 'c');", fxt.line_assign("a", fxt.itm_ary().Subs_(fxt.itm_quote("b\\\\"), fxt.itm_quote("c")))); + fxt.tst_lines("$a = array('b\\\\', 'c');", fxt.line_assign("a", fxt.itm_ary().Subs_(fxt.itm_quote("b\\"), fxt.itm_quote("c")))); } @Test public void Ary_flat_escape2() { // PURPOSE.fix: \\' was being interpreted incorrectly; \\ should escape \, but somehow \' was being escaped - fxt.tst_lines("$a = array('b\\\\\\'c', 'd');", fxt.line_assign("a", fxt.itm_ary().Subs_(fxt.itm_quote("b\\\\\\'c"), fxt.itm_quote("d")))); + fxt.tst_lines("$a = array('b\\\\\\'c', 'd');", fxt.line_assign("a", fxt.itm_ary().Subs_(fxt.itm_quote("b\\'c"), fxt.itm_quote("d")))); } @Test public void Ary_kv() {fxt.tst_lines("$a = array(k0 => 'v0', k1 => 'v1', k2 => 'v2');", fxt.line_assign("a", fxt.itm_ary().Subs_(fxt.itm_kv_quote("k0", "v0"), fxt.itm_kv_quote("k1", "v1"), fxt.itm_kv_quote("k2", "v2"))));} @Test public void Brack_kv() {fxt.tst_lines("$a = [k0 => 'v0', k1 => 'v1', k2 => 'v2'];" , fxt.line_assign("a", fxt.itm_ary().Subs_(fxt.itm_kv_quote("k0", "v0"), fxt.itm_kv_quote("k1", "v1"), fxt.itm_kv_quote("k2", "v2"))));} @@ -79,4 +79,26 @@ public class Php_parser_tst { , fxt.itm_kv_itm("i20", fxt.itm_ary().Subs_(fxt.itm_quote("21"), fxt.itm_quote("22"))) ))); } + @Test public void Quoted() { + fxt.Test__string__quotes("a\\\"z" , "a\"z"); + fxt.Test__string__quotes("a\\\\z" , "a\\z"); + fxt.Test__string__quotes("a\\u0008z" , "a\bz"); + fxt.Test__string__quotes("a\\fz" , "a\fz"); + fxt.Test__string__quotes("a\\nz" , "a\nz"); + fxt.Test__string__quotes("a\\rz" , "a\rz"); + fxt.Test__string__quotes("a\\tz" , "a\tz"); + fxt.Test__string__quotes("a\\vz" , 
"a\u000bz"); + fxt.Test__string__quotes("a\\ez" , "a\u001bz"); + fxt.Test__string__quotes("a\\$z" , "a$z"); + fxt.Test__string__quotes("a\\7z" , "a\u0007z"); + fxt.Test__string__quotes("a\\41z" , "a!z"); + fxt.Test__string__quotes("a\\111z" , "aIz"); + fxt.Test__string__quotes("a\\x9z" , "a\tz"); + fxt.Test__string__quotes("a\\x21z" , "a!z"); + fxt.Test__string__quotes("a\\xE2\\x80\\x8Ez" , "a\u200Ez"); + fxt.Test__string__quotes("a\\u9z" , "a\tz"); + fxt.Test__string__quotes("a\\u21z" , "a!z"); + fxt.Test__string__quotes("a\\u113z" , "aēz"); + fxt.Test__string__quotes("a\\u{0008}z" , "a\bz"); + } } diff --git a/400_xowa/src/gplx/langs/phps/Php_quote_parser.java b/400_xowa/src/gplx/langs/phps/Php_quote_parser.java new file mode 100644 index 000000000..da9e57c9c --- /dev/null +++ b/400_xowa/src/gplx/langs/phps/Php_quote_parser.java @@ -0,0 +1,132 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012-2017 gnosygnu@gmail.com + +XOWA is licensed under the terms of the General Public License (GPL) Version 3, +or alternatively under the terms of the Apache License Version 2.0. + +You may use XOWA according to either of these licenses as is most appropriate +for your project on a case-by-case basis. 
+ +The terms of each license can be found in the source code repository: + +GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt +Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt +*/ +package gplx.langs.phps; import gplx.*; import gplx.langs.*; +import gplx.core.encoders.*;
+// Decodes the text between the quotes of a PHP string literal into raw bytes
+// REF: https://www.php.net/manual/en/language.types.String.php
+//
+// Escapes handled (each one follows a backslash):
+//   n r t v e f      -> the matching control character
+//   $ backslash "    -> the character itself
+//   [0-7]{1,3}       -> octal; written as one byte (values above 0377 wrap to fit a byte, per REF)
+//   x[0-9A-Fa-f]{1,2}-> hex; written as one byte (so UTF-8 can be spelled out byte by byte)
+//   u{hex} or uhex   -> code point (max 8 hex digits); written as UTF-8
+// Any other escaped byte is written without its backslash; this is what turns an escaped apostrophe into an apostrophe.
+// An "x" or "u" with no hex digits after it is treated like any other unknown escape.
+// NOTE(review): PHP itself keeps the backslash for unknown escapes in double-quoted text; behavior left as-is. Confirm before changing.
+class Php_quote_parser {
+    private final Bry_bfr bfr = Bry_bfr_.New(); // reused across calls; emptied before Parse returns or throws
+    // src: full source; src_pos: first byte after the opening quote; src_end: position of the closing quote (exclusive)
+    // returns the decoded bytes; throws an Err if decoding fails
+    public byte[] Parse(byte[] src, int src_pos, int src_end) {
+        try {
+            while (src_pos < src_end) {
+                byte b = src[src_pos++];
+                if (b != Byte_ascii.Backslash || src_pos == src_end) { // ordinary byte, or a lone backslash at the very end
+                    bfr.Add_byte(b);
+                    continue;
+                }
+                b = src[src_pos++]; // byte after the backslash
+                switch (b) {
+                    case Byte_ascii.Ltr_n: bfr.Add_byte(Byte_ascii.Nl); break;
+                    case Byte_ascii.Ltr_r: bfr.Add_byte(Byte_ascii.Cr); break;
+                    case Byte_ascii.Ltr_t: bfr.Add_byte(Byte_ascii.Tab); break;
+                    case Byte_ascii.Ltr_v: bfr.Add_byte(Byte_ascii.Vertical_tab); break;
+                    case Byte_ascii.Ltr_e: bfr.Add_byte(Byte_ascii.Escape); break;
+                    case Byte_ascii.Ltr_f: bfr.Add_byte(Byte_ascii.Formfeed); break;
+                    case Byte_ascii.Dollar:
+                    case Byte_ascii.Backslash:
+                    case Byte_ascii.Quote:
+                        bfr.Add_byte(b);
+                        break;
+                    // octal; 8 and 9 are not octal digits, so they fall through to default
+                    case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3:
+                    case Byte_ascii.Num_4: case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: {
+                        int num_bgn = src_pos - 1; // - 1 b/c the first digit was already consumed by pos++ above
+                        src_pos = Scan_oct(src, src_pos, Math.min(src_end, num_bgn + 3)); // per REF, octal is {1,3}
+                        bfr.Add_byte((byte)Oct_utl_.Parse_or(src, num_bgn, src_pos, 0)); // cast keeps the low 8 bits
+                        break;
+                    }
+                    // hexdec
+                    case Byte_ascii.Ltr_x: {
+                        // REF: changed from xFF form to u1234 form; https://github.com/wikimedia/mediawiki/commit/0313128b1038de8f2ee52a181eafdee8c5e430f7#diff-1b04277d170b32db7f92ce812744ef6b
+                        int num_end = Scan_hex(src, src_pos, Math.min(src_end, src_pos + 2)); // per REF, hex is {1,2}
+                        if (num_end == src_pos) { // no hex digits; unknown escape
+                            bfr.Add_byte(b);
+                        }
+                        else {
+                            bfr.Add_byte((byte)Hex_utl_.Parse_or(src, src_pos, num_end, 0));
+                            src_pos = num_end;
+                        }
+                        break;
+                    }
+                    // unicode
+                    case Byte_ascii.Ltr_u: {
+                        boolean braced = src_pos < src_end && src[src_pos] == Byte_ascii.Curly_bgn; // braces are optional: u{1234} or u1234
+                        int num_bgn = braced ? src_pos + 1 : src_pos;
+                        int num_end = Scan_hex(src, num_bgn, Math.min(src_end, num_bgn + 8)); // assume max of 8 hexdecimals
+                        if (num_end == num_bgn) { // no hex digits; unknown escape; any "{" is left for the next iteration
+                            bfr.Add_byte(b);
+                            break;
+                        }
+                        bfr.Add_u8_int(Hex_utl_.Parse_or(src, num_bgn, num_end, -1)); // always UTF-8, even below 256
+                        src_pos = num_end;
+                        if (braced && src_pos < src_end && src[src_pos] == Byte_ascii.Curly_end) { // only skip "}" if "{" was skipped
+                            ++src_pos;
+                        }
+                        break;
+                    }
+                    default: // unknown escape: drop the backslash, keep the byte
+                        bfr.Add_byte(b);
+                        break;
+                }
+            }
+            return bfr.To_bry_and_clear();
+        } catch (Exception e) {
+            bfr.To_bry_and_clear(); // discard partial output so it cannot leak into the next Parse
+            throw Err_.new_exc(e, "Ustring_parser", "unable to parse ustring", "src", Bry_.Mid(src, src_pos, src_end));
+        }
+    }
+    // returns the position after the run of octal digits that starts at bgn; never goes past max
+    private static int Scan_oct(byte[] src, int bgn, int max) {
+        int rv = bgn;
+        while (rv < max && Is_oct(src[rv])) rv++;
+        return rv;
+    }
+    // returns the position after the run of hex digits that starts at bgn; never goes past max
+    private static int Scan_hex(byte[] src, int bgn, int max) {
+        int rv = bgn;
+        while (rv < max && Hex_utl_.Is_hex(src[rv])) rv++;
+        return rv;
+    }
+    private static boolean Is_oct(byte b) {
+        switch (b) {
+            case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3:
+            case Byte_ascii.Num_4: case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7:
+                return true;
+            default:
+                return false;
+        }
+    }
+}
\ No newline at end of file diff --git a/400_xowa/src/gplx/langs/phps/Php_tkn_base.java b/400_xowa/src/gplx/langs/phps/Php_tkn_base.java index ab250b850..91c2434ee 100644 --- a/400_xowa/src/gplx/langs/phps/Php_tkn_base.java +++ b/400_xowa/src/gplx/langs/phps/Php_tkn_base.java @@ -14,6 +14,7 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt */ package gplx.langs.phps; import gplx.*; import gplx.langs.*; +import gplx.core.encoders.*; public abstract class Php_tkn_base implements Php_tkn { public abstract byte Tkn_tid(); public int Src_bgn() {return src_bgn;} private int src_bgn; @@ -48,7 +49,9 @@ class Php_tkn_quote extends Php_tkn_base { public Php_tkn_quote(int src_bgn, int src_end, byte quote_tid) {this.Src_rng_(src_bgn, src_end); this.quote_tid = quote_tid;} @Override public byte Tkn_tid() {return 
Php_tkn_.Tid_quote;} public byte Quote_tid() {return quote_tid;} private byte quote_tid; - public byte[] Quote_text(byte[] src) {return Bry_.Mid(src, this.Src_bgn() + 1, this.Src_end() - 1);} // NOTE: assume quote are of form 'abc'; +1, -1 to skip flanking chars + public byte[] Quote_text(Php_quote_parser quote_parser, byte[] src) { + return quote_parser.Parse(src, this.Src_bgn() + 1, this.Src_end() - 1); + } public static final byte Tid_null = 0, Tid_mult = 1, Tid_slash = 2, Tid_hash = 3; } class Php_tkn_declaration extends Php_tkn_base { diff --git a/400_xowa/src/gplx/xowa/xtns/cldrs/Language_names_converter.java b/400_xowa/src/gplx/xowa/xtns/cldrs/Language_names_converter.java index 8d552e8a0..4cc8a359b 100644 --- a/400_xowa/src/gplx/xowa/xtns/cldrs/Language_names_converter.java +++ b/400_xowa/src/gplx/xowa/xtns/cldrs/Language_names_converter.java @@ -73,6 +73,7 @@ class Language_names_converter { } public String To_json(Language_name[] ary) { Json_doc_wtr doc_wtr = new Json_doc_wtr(); + doc_wtr.Opt_unicode_y_(); doc_wtr.Ary_bgn(); int len = ary.length; byte[] key_code = Bry_.new_a7("code"); diff --git a/res/bin/any/xowa/cfg/lang/data/names.json b/res/bin/any/xowa/cfg/lang/data/names.json index 24fa5e769..8211f9205 100644 --- a/res/bin/any/xowa/cfg/lang/data/names.json +++ b/res/bin/any/xowa/cfg/lang/data/names.json @@ -241,13 +241,13 @@ , { "code":"be-tarask" - , "name":"беларуская (тарашкевіца)\xE2\x80\x8E" + , "name":"беларуская (тарашкевіца)\u200E" , "note":"Belarusian in Taraskievica orthography" } , { "code":"be-x-old" - , "name":"беларуская (тарашкевіца)\xE2\x80\x8E" + , "name":"беларуская (тарашкевіца)\u200E" , "note":"(be-tarask compat)" } , @@ -439,13 +439,13 @@ , { "code":"crh-latn" - , "name":"qırımtatarca (Latin)\xE2\x80\x8E" + , "name":"qırımtatarca (Latin)\u200E" , "note":"Crimean Tatar (Latin)" } , { "code":"crh-cyrl" - , "name":"къырымтатарджа (Кирилл)\xE2\x80\x8E" + , "name":"къырымтатарджа (Кирилл)\u200E" , "note":"Crimean Tatar (Cyrillic)" } 
, @@ -505,7 +505,7 @@ , { "code":"de-formal" - , "name":"Deutsch (Sie-Form)\xE2\x80\x8E" + , "name":"Deutsch (Sie-Form)\u200E" , "note":"German - formal address (\"Sie\")" } , @@ -715,13 +715,13 @@ , { "code":"gan-hans" - , "name":"赣语(简体)\xE2\x80\x8E" + , "name":"赣语(简体)\u200E" , "note":"Gan (Simplified Han)" } , { "code":"gan-hant" - , "name":"贛語(繁體)\xE2\x80\x8E" + , "name":"贛語(繁體)\u200E" , "note":"Gan (Traditional Han)" } , @@ -1081,37 +1081,37 @@ , { "code":"kk-arab" - , "name":"قازاقشا (تٴوتە)\xE2\x80\x8F" + , "name":"قازاقشا (تٴوتە)\u200F" , "note":"Kazakh Arabic" } , { "code":"kk-cyrl" - , "name":"қазақша (кирил)\xE2\x80\x8E" + , "name":"қазақша (кирил)\u200E" , "note":"Kazakh Cyrillic" } , { "code":"kk-latn" - , "name":"qazaqşa (latın)\xE2\x80\x8E" + , "name":"qazaqşa (latın)\u200E" , "note":"Kazakh Latin" } , { "code":"kk-cn" - , "name":"قازاقشا (جۇنگو)\xE2\x80\x8F" + , "name":"قازاقشا (جۇنگو)\u200F" , "note":"Kazakh (China)" } , { "code":"kk-kz" - , "name":"қазақша (Қазақстан)\xE2\x80\x8E" + , "name":"қазақша (Қазақстан)\u200E" , "note":"Kazakh (Kazakhstan)" } , { "code":"kk-tr" - , "name":"qazaqşa (Türkïya)\xE2\x80\x8E" + , "name":"qazaqşa (Türkïya)\u200E" , "note":"Kazakh (Turkey)" } , @@ -1213,13 +1213,13 @@ , { "code":"ku-latn" - , "name":"Kurdî (latînî)\xE2\x80\x8E" + , "name":"Kurdî (latînî)\u200E" , "note":"Northern Kurdish (Latin script)" } , { "code":"ku-arab" - , "name":"كوردي (عەرەبی)\xE2\x80\x8F" + , "name":"كوردي (عەرەبی)\u200F" , "note":"Northern Kurdish (Arabic script) (falls back to ckb)" } , @@ -1303,7 +1303,7 @@ , { "code":"lki" - , "name":"لەکی‎" + , "name":"لەکی\u200E" , "note":"Laki" } , @@ -1579,7 +1579,7 @@ , { "code":"nl-informal" - , "name":"Nederlands (informeel)\xE2\x80\x8E" + , "name":"Nederlands (informeel)\u200E" , "note":"Dutch (informal address (\"je\"))" } , @@ -2047,13 +2047,13 @@ , { "code":"sr-ec" - , "name":"српски (ћирилица)\xE2\x80\x8E" + , "name":"српски (ћирилица)\u200E" , "note":"Serbian Cyrillic ekavian" } , { 
"code":"sr-el" - , "name":"srpski (latinica)\xE2\x80\x8E" + , "name":"srpski (latinica)\u200E" , "note":"Serbian Latin ekavian" } , @@ -2461,25 +2461,25 @@ , { "code":"zh-cn" - , "name":"中文(中国大陆)\xE2\x80\x8E" + , "name":"中文(中国大陆)\u200E" , "note":"Chinese (PRC)" } , { "code":"zh-hans" - , "name":"中文(简体)\xE2\x80\x8E" + , "name":"中文(简体)\u200E" , "note":"Mandarin Chinese (Simplified Chinese script) (cmn-hans)" } , { "code":"zh-hant" - , "name":"中文(繁體)\xE2\x80\x8E" + , "name":"中文(繁體)\u200E" , "note":"Mandarin Chinese (Traditional Chinese script) (cmn-hant)" } , { "code":"zh-hk" - , "name":"中文(香港)\xE2\x80\x8E" + , "name":"中文(香港)\u200E" , "note":"Chinese (Hong Kong)" } , @@ -2491,25 +2491,25 @@ , { "code":"zh-mo" - , "name":"中文(澳門)\xE2\x80\x8E" + , "name":"中文(澳門)\u200E" , "note":"Chinese (Macau)" } , { "code":"zh-my" - , "name":"中文(马来西亚)\xE2\x80\x8E" + , "name":"中文(马来西亚)\u200E" , "note":"Chinese (Malaysia)" } , { "code":"zh-sg" - , "name":"中文(新加坡)\xE2\x80\x8E" + , "name":"中文(新加坡)\u200E" , "note":"Chinese (Singapore)" } , { "code":"zh-tw" - , "name":"中文(台灣)\xE2\x80\x8E" + , "name":"中文(台灣)\u200E" , "note":"Chinese (Taiwan)" } ,