mirror of
https://github.com/gnosygnu/xowa.git
synced 2026-03-02 03:49:30 +00:00
Language: Escape left-to-right / right-to-left marks in names.json ('\xE2\x80\x8E' to '\u200E') [#501]
This commit is contained in:
@@ -14,9 +14,12 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.langs.jsons; import gplx.*; import gplx.langs.*;
|
||||
import gplx.objects.strings.unicodes.*;
|
||||
import gplx.core.encoders.*;
|
||||
public class Json_doc_wtr {
|
||||
private int indent = -2;
|
||||
private Bry_bfr bfr = Bry_bfr_.Reset(255);
|
||||
// When set, string values are written codepoint-by-codepoint with JSON escapes instead of the quote-only escape.
private boolean opt_unicode;
public void Opt_unicode_y_() {
	this.opt_unicode = true;
}
|
||||
// Writes the current indent level; returns this writer for chaining.
public Json_doc_wtr Indent() {
	return this.Indent(this.indent);
}
|
||||
// Appends v spaces to the buffer; zero or negative widths write nothing.
private Json_doc_wtr Indent(int v) {
	if (v <= 0) return this;
	bfr.Add_byte_repeat(Byte_ascii.Space, v);
	return this;
}
|
||||
// Deepens the indent level by two columns; returns this writer for chaining.
public Json_doc_wtr Indent_add() {
	this.indent = this.indent + 2;
	return this;
}
|
||||
@@ -31,11 +34,59 @@ public class Json_doc_wtr {
|
||||
bfr.Add(Object_.Bry__null);
|
||||
else {
|
||||
bfr.Add_byte(Byte_ascii.Quote);
|
||||
bfr.Add_bry_escape(Byte_ascii.Quote, Escaped__quote, v, 0, v.length);
|
||||
if (opt_unicode) {
|
||||
Ustring ustr = Ustring_.New_codepoints(String_.new_u8(v));
|
||||
int ustr_len = ustr.Len_in_data();
|
||||
for (int i = 0; i < ustr_len; i++) {
|
||||
int cp = ustr.Get_data(i);
|
||||
Write_str_codepoint(bfr, cp);
|
||||
}
|
||||
}
|
||||
else {
|
||||
bfr.Add_bry_escape(Byte_ascii.Quote, Escaped__quote, v, 0, v.length);
|
||||
}
|
||||
bfr.Add_byte(Byte_ascii.Quote);
|
||||
}
|
||||
return this;
|
||||
}
|
||||
private void Write_str_codepoint(Bry_bfr bfr, int val) {
|
||||
switch (val) { // REF: https://www.json.org/
|
||||
case Byte_ascii.Quote:
|
||||
bfr.Add_byte_backslash().Add_byte(Byte_ascii.Quote);
|
||||
break;
|
||||
case Byte_ascii.Backslash:
|
||||
bfr.Add_byte_backslash().Add_byte(Byte_ascii.Backslash);
|
||||
break;
|
||||
case Byte_ascii.Backfeed:
|
||||
bfr.Add_byte_backslash().Add_byte(Byte_ascii.Ltr_b);
|
||||
break;
|
||||
case Byte_ascii.Formfeed:
|
||||
bfr.Add_byte_backslash().Add_byte(Byte_ascii.Ltr_f);
|
||||
break;
|
||||
case Byte_ascii.Nl:
|
||||
bfr.Add_byte_backslash().Add_byte(Byte_ascii.Ltr_n);
|
||||
break;
|
||||
case Byte_ascii.Cr:
|
||||
bfr.Add_byte_backslash().Add_byte(Byte_ascii.Ltr_r);
|
||||
break;
|
||||
case Byte_ascii.Tab:
|
||||
bfr.Add_byte_backslash().Add_byte(Byte_ascii.Ltr_t);
|
||||
break;
|
||||
default:
|
||||
if ( val < Byte_ascii.Space // control characters
|
||||
|| val == 160 // nbsp
|
||||
|| val == 8206 // left to right
|
||||
|| val == 8207 // right to left
|
||||
) {
|
||||
// convert to \u1234
|
||||
bfr.Add_byte_backslash().Add_byte(Byte_ascii.Ltr_u).Add_str_a7(Hex_utl_.To_str(val, 4));
|
||||
}
|
||||
else {
|
||||
bfr.Add_u8_int(val);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Appends an int value to the buffer; returns this writer for chaining.
public Json_doc_wtr Int(int v) {
	bfr.Add_int_variable(v);
	return this;
}
|
||||
// Appends a double value to the buffer; returns this writer for chaining.
public Json_doc_wtr Double(double v) {
	bfr.Add_double(v);
	return this;
}
|
||||
// Writes the current indent, then a comma, then a newline; returns this writer for chaining.
public Json_doc_wtr Comma() {
	this.Indent();
	bfr.Add_byte(Byte_ascii.Comma).Add_byte_nl();
	return this;
}
|
||||
|
||||
@@ -25,6 +25,19 @@ public class Json_doc_wtr_tst {
|
||||
, " 'k1':'v\\\"1'"
|
||||
, "}"));
|
||||
}
|
||||
@Test public void Quotes() {
|
||||
fxt.Test__string__quotes("a\"z" , "a\\\"z");
|
||||
fxt.Test__string__quotes("a\u0008z" , "a\\bz");
|
||||
fxt.Test__string__quotes("a\fz" , "a\\fz");
|
||||
fxt.Test__string__quotes("a\nz" , "a\\nz");
|
||||
fxt.Test__string__quotes("a\rz" , "a\\rz");
|
||||
fxt.Test__string__quotes("a\tz" , "a\\tz");
|
||||
fxt.Test__string__quotes("aēz" , "aēz");
|
||||
fxt.Test__string__quotes("az" , "a\\u000Fz");
|
||||
fxt.Test__string__quotes("a z" , "a\\u00A0z");
|
||||
fxt.Test__string__quotes("az" , "a\\u200Ez");
|
||||
fxt.Test__string__quotes("az" , "a\\u200Fz");
|
||||
}
|
||||
}
|
||||
class Json_doc_wtr_fxt {
|
||||
public Json_doc_wtr Exec__Kv_simple(String key, String val) {
|
||||
@@ -40,4 +53,12 @@ class Json_doc_wtr_fxt {
|
||||
public String Exec__Concat_apos(String... ary) {
|
||||
return Json_doc.Make_str_by_apos(ary);
|
||||
}
|
||||
public void Test__string__quotes(String raw, String expd) {
|
||||
Json_doc_wtr doc_wtr = new Json_doc_wtr();
|
||||
doc_wtr.Opt_unicode_y_();
|
||||
doc_wtr.Str(Bry_.new_u8(raw));
|
||||
String actl = doc_wtr.Bld_as_str();
|
||||
actl = String_.Mid(actl, 1, String_.Len(actl) - 1);
|
||||
Gftest.Eq__str(expd, actl);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -25,7 +25,11 @@ public class Php_evaluator implements Php_tkn_wkr {
|
||||
private byte mode = Mode_key_bgn, next_tid = 0, next_mode = 0;
|
||||
private Php_line_assign cur_line; private Php_itm_ary cur_ary; private Php_key cur_kv_key;
|
||||
private final List_adp frame_stack = List_adp_.New();
|
||||
public Php_evaluator(Gfo_msg_log msg_log) {this.msg_log = msg_log;} private Gfo_msg_log msg_log;
|
||||
private final Php_quote_parser quote_parser = new Php_quote_parser();
|
||||
private final Gfo_msg_log msg_log;
|
||||
// Stores the log that is later handed back through Msg_log().
public Php_evaluator(Gfo_msg_log msg_log) {this.msg_log = msg_log;}
|
||||
// Takes the source bytes from ctx and empties the frame stack.
private byte[] src;
public void Init(Php_ctx ctx) {
	this.src = ctx.Src();
	frame_stack.Clear();
}
|
||||
// Lines collected by this evaluator; the same list instance is returned on every call.
private final List_adp lines = List_adp_.New();
public List_adp List() {
	return this.lines;
}
|
||||
// Log passed to the constructor.
public Gfo_msg_log Msg_log() {
	return this.msg_log;
}
|
||||
@@ -101,7 +105,7 @@ public class Php_evaluator implements Php_tkn_wkr {
|
||||
switch (tkn_tid) {
|
||||
case Php_tkn_.Tid_quote:
|
||||
Php_tkn_quote tkn_quote = (Php_tkn_quote)tkn;
|
||||
Php_itm_quote key_sub = new Php_itm_quote(tkn_quote.Quote_text(src));
|
||||
Php_itm_quote key_sub = new Php_itm_quote(tkn_quote.Quote_text(quote_parser, src));
|
||||
cur_line.Key_subs_(new Php_key[] {key_sub});
|
||||
mode = Mode_key_end;
|
||||
break;
|
||||
@@ -121,7 +125,7 @@ public class Php_evaluator implements Php_tkn_wkr {
|
||||
case Php_tkn_.Tid_quote:
|
||||
Expect(Php_tkn_.Tid_semic, Mode_key_bgn);
|
||||
Php_tkn_quote tkn_quote = (Php_tkn_quote)tkn;
|
||||
line_val = new Php_itm_quote(tkn_quote.Quote_text(src));
|
||||
line_val = new Php_itm_quote(tkn_quote.Quote_text(quote_parser, src));
|
||||
break;
|
||||
case Php_tkn_.Tid_ary:
|
||||
case Php_tkn_.Tid_brack_bgn:
|
||||
@@ -161,7 +165,7 @@ public class Php_evaluator implements Php_tkn_wkr {
|
||||
case Php_tkn_.Tid_true: Ary_add_itm(Php_itm_bool_true.Instance); break;
|
||||
case Php_tkn_.Tid_quote:
|
||||
Php_tkn_quote tkn_quote = (Php_tkn_quote)tkn;
|
||||
Ary_add_itm(new Php_itm_quote(tkn_quote.Quote_text(src)));
|
||||
Ary_add_itm(new Php_itm_quote(tkn_quote.Quote_text(quote_parser, src)));
|
||||
break;
|
||||
case Php_tkn_.Tid_num:
|
||||
Php_tkn_num tkn_num = (Php_tkn_num)tkn;
|
||||
|
||||
@@ -67,6 +67,15 @@ class Php_parser_fxt {
|
||||
tst_mgr.Tst_ary("", expd, actl);
|
||||
log_mgr_chkr.tst(tst_mgr, line_wkr.Msg_log());
|
||||
}
|
||||
public void Test__string__quotes(String raw, String expd) {
|
||||
line_wkr.Clear();
|
||||
byte[] raw_bry = Bry_.new_u8("$var =\"" + raw +"\";");
|
||||
parser.Parse_tkns(raw_bry, line_wkr);
|
||||
Php_line[] actl_lines = (Php_line[])line_wkr.List().To_ary(Php_line.class);
|
||||
Php_line_assign actl_line = (Php_line_assign)actl_lines[0];
|
||||
Php_itm_quote actl = (Php_itm_quote)actl_line.Val();
|
||||
Tfds.Eq_str(expd, String_.new_u8(actl.Val_obj_bry()));
|
||||
}
|
||||
}
|
||||
abstract class Php_tkn_chkr_base implements Tst_chkr {
|
||||
public abstract byte Tkn_tid();
|
||||
|
||||
@@ -48,10 +48,10 @@ public class Php_parser_tst {
|
||||
// array(...) syntax with three quoted items
@Test public void Ary_flat() {
	fxt.tst_lines
	( "$a = array('b', 'c', 'd');"
	, fxt.line_assign("a", fxt.itm_ary().Subs_(fxt.itm_quote("b"), fxt.itm_quote("c"), fxt.itm_quote("d")))
	);
}
|
||||
// [...] syntax with three quoted items
@Test public void Brack_flat() {
	fxt.tst_lines
	( "$a = ['b', 'c', 'd'];"
	, fxt.line_assign("a", fxt.itm_ary().Subs_(fxt.itm_quote("b"), fxt.itm_quote("c"), fxt.itm_quote("d")))
	);
}
|
||||
@Test public void Ary_flat_escape() { // PURPOSE.fix: \\' was being interpreted incorrectly; \\ should escape \, but somehow \' was being escaped
|
||||
fxt.tst_lines("$a = array('b\\\\', 'c');", fxt.line_assign("a", fxt.itm_ary().Subs_(fxt.itm_quote("b\\\\"), fxt.itm_quote("c"))));
|
||||
fxt.tst_lines("$a = array('b\\\\', 'c');", fxt.line_assign("a", fxt.itm_ary().Subs_(fxt.itm_quote("b\\"), fxt.itm_quote("c"))));
|
||||
}
|
||||
@Test public void Ary_flat_escape2() { // PURPOSE.fix: \\' was being interpreted incorrectly; \\ should escape \, but somehow \' was being escaped
|
||||
fxt.tst_lines("$a = array('b\\\\\\'c', 'd');", fxt.line_assign("a", fxt.itm_ary().Subs_(fxt.itm_quote("b\\\\\\'c"), fxt.itm_quote("d"))));
|
||||
fxt.tst_lines("$a = array('b\\\\\\'c', 'd');", fxt.line_assign("a", fxt.itm_ary().Subs_(fxt.itm_quote("b\\'c"), fxt.itm_quote("d"))));
|
||||
}
|
||||
// array(...) syntax with three key => quoted-value pairs
@Test public void Ary_kv() {
	fxt.tst_lines
	( "$a = array(k0 => 'v0', k1 => 'v1', k2 => 'v2');"
	, fxt.line_assign("a", fxt.itm_ary().Subs_(fxt.itm_kv_quote("k0", "v0"), fxt.itm_kv_quote("k1", "v1"), fxt.itm_kv_quote("k2", "v2")))
	);
}
|
||||
// [...] syntax with three key => quoted-value pairs
@Test public void Brack_kv() {
	fxt.tst_lines
	( "$a = [k0 => 'v0', k1 => 'v1', k2 => 'v2'];"
	, fxt.line_assign("a", fxt.itm_ary().Subs_(fxt.itm_kv_quote("k0", "v0"), fxt.itm_kv_quote("k1", "v1"), fxt.itm_kv_quote("k2", "v2")))
	);
}
|
||||
@@ -79,4 +79,26 @@ public class Php_parser_tst {
|
||||
, fxt.itm_kv_itm("i20", fxt.itm_ary().Subs_(fxt.itm_quote("21"), fxt.itm_quote("22")))
|
||||
)));
|
||||
}
|
||||
@Test public void Quoted() {
|
||||
fxt.Test__string__quotes("a\\\"z" , "a\"z");
|
||||
fxt.Test__string__quotes("a\\\\z" , "a\\z");
|
||||
fxt.Test__string__quotes("a\\u0008z" , "a\bz");
|
||||
fxt.Test__string__quotes("a\\fz" , "a\fz");
|
||||
fxt.Test__string__quotes("a\\nz" , "a\nz");
|
||||
fxt.Test__string__quotes("a\\rz" , "a\rz");
|
||||
fxt.Test__string__quotes("a\\tz" , "a\tz");
|
||||
fxt.Test__string__quotes("a\\vz" , "a\u000bz");
|
||||
fxt.Test__string__quotes("a\\ez" , "a\u001bz");
|
||||
fxt.Test__string__quotes("a\\$z" , "a$z");
|
||||
fxt.Test__string__quotes("a\\7z" , "a\u0007z");
|
||||
fxt.Test__string__quotes("a\\41z" , "a!z");
|
||||
fxt.Test__string__quotes("a\\111z" , "aIz");
|
||||
fxt.Test__string__quotes("a\\x9z" , "a\tz");
|
||||
fxt.Test__string__quotes("a\\x21z" , "a!z");
|
||||
fxt.Test__string__quotes("a\\xE2\\x80\\x8Ez" , "a\u200Ez");
|
||||
fxt.Test__string__quotes("a\\u9z" , "a\tz");
|
||||
fxt.Test__string__quotes("a\\u21z" , "a!z");
|
||||
fxt.Test__string__quotes("a\\u113z" , "aēz");
|
||||
fxt.Test__string__quotes("a\\u{0008}z" , "a\bz");
|
||||
}
|
||||
}
|
||||
|
||||
132
400_xowa/src/gplx/langs/phps/Php_quote_parser.java
Normal file
132
400_xowa/src/gplx/langs/phps/Php_quote_parser.java
Normal file
@@ -0,0 +1,132 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.langs.phps; import gplx.*; import gplx.langs.*;
|
||||
import gplx.core.encoders.*;
|
||||
class Php_quote_parser { // REF: https://www.php.net/manual/en/language.types.String.php
|
||||
private final Bry_bfr bfr = Bry_bfr_.New();
|
||||
public byte[] Parse(byte[] src, int src_pos, int src_end) {
|
||||
try {
|
||||
while (src_pos < src_end) {
|
||||
int val = 0;
|
||||
byte b = src[src_pos++];
|
||||
if (b == Byte_ascii.Backslash) {
|
||||
b = src[src_pos++];
|
||||
switch(b) {
|
||||
case Byte_ascii.Ltr_n:
|
||||
val = Byte_ascii.Nl;
|
||||
break;
|
||||
case Byte_ascii.Ltr_r:
|
||||
val = Byte_ascii.Cr;
|
||||
break;
|
||||
case Byte_ascii.Ltr_t:
|
||||
val = Byte_ascii.Tab;
|
||||
break;
|
||||
case Byte_ascii.Ltr_v:
|
||||
val = Byte_ascii.Vertical_tab;
|
||||
break;
|
||||
case Byte_ascii.Ltr_e:
|
||||
val = Byte_ascii.Escape;
|
||||
break;
|
||||
case Byte_ascii.Ltr_f:
|
||||
val = Byte_ascii.Formfeed;
|
||||
break;
|
||||
case Byte_ascii.Dollar:
|
||||
case Byte_ascii.Backslash:
|
||||
case Byte_ascii.Quote:
|
||||
val = b;
|
||||
break;
|
||||
// octal
|
||||
case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4:
|
||||
case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9: {
|
||||
int num_bgn = src_pos - 1; // - 1 b/c pos++ above
|
||||
int num_end = src_pos;
|
||||
for (int i = 0; i < 3; i++) {// per REF, octal is {1,3}
|
||||
byte n = src[src_pos];
|
||||
num_end = src_pos;
|
||||
if (Byte_ascii.Is_num(n)) {
|
||||
++src_pos;
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
val = Oct_utl_.Parse_or(src, num_bgn, num_end, -1);
|
||||
break;
|
||||
}
|
||||
// hexdec
|
||||
case Byte_ascii.Ltr_x: {
|
||||
// REF: changed from \xFF to \u1234; https://github.com/wikimedia/mediawiki/commit/0313128b1038de8f2ee52a181eafdee8c5e430f7#diff-1b04277d170b32db7f92ce812744ef6b
|
||||
int num_bgn = src_pos;
|
||||
int num_end = src_pos++;
|
||||
for (int i = 0; i < 2; i++) { // per REF, hex is {1,2}
|
||||
byte n = src[src_pos];
|
||||
num_end = src_pos;
|
||||
if (Hex_utl_.Is_hex(n)) {
|
||||
++src_pos;
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
val = Hex_utl_.Parse_or(src, num_bgn, num_end, -1);
|
||||
break;
|
||||
}
|
||||
// unicode
|
||||
case Byte_ascii.Ltr_u: {
|
||||
if (src[src_pos] == Byte_ascii.Curly_bgn) { // ignore braces in u{1234}
|
||||
src_pos++;
|
||||
}
|
||||
|
||||
int num_bgn = src_pos;
|
||||
int num_end = src_pos;
|
||||
for (int i = 0; i < 8; i++) { // assume max of 8 hexdecimals
|
||||
byte n = src[src_pos];
|
||||
num_end = src_pos;
|
||||
if (Byte_ascii.Is_num(n)) {
|
||||
++src_pos;
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (src[src_pos] == Byte_ascii.Curly_end) { // ignore braces in u{1234}
|
||||
++src_pos;
|
||||
}
|
||||
|
||||
val = Hex_utl_.Parse_or(src, num_bgn, num_end, -1);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
val = b;
|
||||
break;
|
||||
}
|
||||
}
|
||||
else {
|
||||
val = b;
|
||||
}
|
||||
if (val < 255)
|
||||
bfr.Add_byte((byte)val);
|
||||
else
|
||||
bfr.Add_u8_int(val);
|
||||
}
|
||||
return bfr.To_bry_and_clear();
|
||||
} catch (Exception e) {
|
||||
throw Err_.new_exc(e, "Ustring_parser", "unable to parse ustring", "src", Bry_.Mid(src, src_pos, src_end));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -14,6 +14,7 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.langs.phps; import gplx.*; import gplx.langs.*;
|
||||
import gplx.core.encoders.*;
|
||||
public abstract class Php_tkn_base implements Php_tkn {
|
||||
public abstract byte Tkn_tid();
|
||||
public int Src_bgn() {return src_bgn;} private int src_bgn;
|
||||
@@ -48,7 +49,9 @@ class Php_tkn_quote extends Php_tkn_base {
|
||||
public Php_tkn_quote(int src_bgn, int src_end, byte quote_tid) {this.Src_rng_(src_bgn, src_end); this.quote_tid = quote_tid;}
|
||||
@Override public byte Tkn_tid() {return Php_tkn_.Tid_quote;}
|
||||
public byte Quote_tid() {return quote_tid;} private byte quote_tid;
|
||||
// Returns the raw bytes between the flanking quote characters, without any unescaping.
// NOTE: assume quote are of form 'abc'; +1, -1 to skip flanking chars
public byte[] Quote_text(byte[] src) {
	int text_bgn = this.Src_bgn() + 1;
	int text_end = this.Src_end() - 1;
	return Bry_.Mid(src, text_bgn, text_end);
}
|
||||
public byte[] Quote_text(Php_quote_parser quote_parser, byte[] src) {
|
||||
return quote_parser.Parse(src, this.Src_bgn() + 1, this.Src_end() - 1);
|
||||
}
|
||||
public static final byte Tid_null = 0, Tid_mult = 1, Tid_slash = 2, Tid_hash = 3;
|
||||
}
|
||||
class Php_tkn_declaration extends Php_tkn_base {
|
||||
|
||||
@@ -73,6 +73,7 @@ class Language_names_converter {
|
||||
}
|
||||
public String To_json(Language_name[] ary) {
|
||||
Json_doc_wtr doc_wtr = new Json_doc_wtr();
|
||||
doc_wtr.Opt_unicode_y_();
|
||||
doc_wtr.Ary_bgn();
|
||||
int len = ary.length;
|
||||
byte[] key_code = Bry_.new_a7("code");
|
||||
|
||||
Reference in New Issue
Block a user