mirror of
https://github.com/gnosygnu/xowa.git
synced 2026-03-02 03:49:30 +00:00
Parser: Change encoding of html id to encode fewer characters [#462]
This commit is contained in:
@@ -17,12 +17,14 @@ package gplx.langs.htmls.encoders; import gplx.*; import gplx.langs.*; import gp
|
||||
import gplx.core.btries.*;
|
||||
import gplx.langs.htmls.entitys.*;
|
||||
public class Gfo_url_encoder_ {
|
||||
public static Gfo_url_encoder New__id() {return Gfo_url_encoder_.New__html_id().Make();}
|
||||
public static Gfo_url_encoder_mkr New__html_id() { // EX: "<a id='a<>b'>" -> "<a id='a.3C.3Eb'>"
|
||||
return new Gfo_url_encoder_mkr().Init(Byte_ascii.Dot).Init_common(Bool_.Y)
|
||||
public static Gfo_url_encoder_mkr New__html_id() { // EX: "<a id='a<>b'>" -> "<a id='a.3C.3Eb'>"
|
||||
return new Gfo_url_encoder_mkr()
|
||||
.Init(Byte_ascii.Dot)
|
||||
.Init__same__rng(0, 255) // clear everything and set to do-not-encode
|
||||
.Init__encode_hex(Byte_ascii.Angle_bgn, Byte_ascii.Angle_end) // NOTE: should not be encoded, but will break existing tests; EX:{{#tag:pre|a|id='<br/>'}}; DATE:2019-05-12
|
||||
.Init__decode_mark(Byte_ascii.Dot)
|
||||
.Init__diff__one(Byte_ascii.Space, Byte_ascii.Underline)
|
||||
.Init__html_ent(Byte_ascii.Amp, Gfh_entity_trie.Instance);
|
||||
.Init__html_ent(Byte_ascii.Amp, Gfh_entity_trie.Instance, false);
|
||||
}
|
||||
public static Gfo_url_encoder_mkr New__html_href_mw(boolean use_anchor_encoder) { // EX: "<a href='^#^'>" -> "<a href='%5E#.5E'>"; REF.MW: ";:@$!*(),/"
|
||||
return new Gfo_url_encoder_mkr().Init(Byte_ascii.Percent).Init_common(Bool_.Y)
|
||||
@@ -84,9 +86,10 @@ public class Gfo_url_encoder_ {
|
||||
.Init__same__many(Byte_ascii.Paren_bgn, Byte_ascii.Paren_end, Byte_ascii.Apos, Byte_ascii.Semic);
|
||||
}
|
||||
public static Gfo_url_encoder_mkr New__mw_ttl() {
|
||||
return new Gfo_url_encoder_mkr().Init(Byte_ascii.Percent)
|
||||
return new Gfo_url_encoder_mkr()
|
||||
.Init(Byte_ascii.Percent)
|
||||
.Init__same__rng(0, 255)
|
||||
.Init__diff__many(Byte_ascii.Percent, Byte_ascii.Amp, Byte_ascii.Apos, Byte_ascii.Eq, Byte_ascii.Plus)
|
||||
.Init__diff__many(Byte_ascii.Amp, Byte_ascii.Apos, Byte_ascii.Eq, Byte_ascii.Plus)
|
||||
.Init__diff__one(Byte_ascii.Space, Byte_ascii.Underline)
|
||||
;
|
||||
}
|
||||
|
||||
@@ -76,7 +76,11 @@ class Gfo_url_encoder_itm_hex implements Gfo_url_encoder_itm {
|
||||
}
|
||||
class Gfo_url_encoder_itm_html_ent implements Gfo_url_encoder_itm {
|
||||
private final Btrie_slim_mgr amp_trie;
|
||||
public Gfo_url_encoder_itm_html_ent(Btrie_slim_mgr amp_trie) {this.amp_trie = amp_trie;}
|
||||
private final boolean encode_unknown_amp;
|
||||
public Gfo_url_encoder_itm_html_ent(Btrie_slim_mgr amp_trie, boolean encode_unknown_amp) {
|
||||
this.encode_unknown_amp = encode_unknown_amp;
|
||||
this.amp_trie = amp_trie;
|
||||
}
|
||||
public int Encode(Bry_bfr bfr, byte[] src, int end, int idx, byte b) {
|
||||
++idx; // b is &; get next character afterwards
|
||||
if (idx == end) { // & is last char; return
|
||||
@@ -86,7 +90,10 @@ class Gfo_url_encoder_itm_html_ent implements Gfo_url_encoder_itm {
|
||||
b = src[idx];
|
||||
Object o = amp_trie.Match_bgn_w_byte(b, src, idx, end);
|
||||
if (o == null) { // unknown entity (EX:&unknown;); return &;
|
||||
Gfo_url_encoder_itm_hex.Encode_byte(Byte_ascii.Amp, bfr, Byte_ascii.Dot);
|
||||
if (encode_unknown_amp)
|
||||
Gfo_url_encoder_itm_hex.Encode_byte(Byte_ascii.Amp, bfr, Byte_ascii.Dot);
|
||||
else
|
||||
bfr.Add_byte(Byte_ascii.Amp);
|
||||
return 0;
|
||||
}
|
||||
else {
|
||||
|
||||
@@ -16,15 +16,26 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
package gplx.langs.htmls.encoders; import gplx.*; import gplx.langs.*; import gplx.langs.htmls.*;
|
||||
import gplx.core.btries.*;
|
||||
public class Gfo_url_encoder_mkr {
|
||||
private Gfo_url_encoder_itm[] encode_ary, decode_ary; private Gfo_url_encoder anchor_encoder;
|
||||
private Gfo_url_encoder_itm[] encode_ary, decode_ary;
|
||||
private Gfo_url_encoder anchor_encoder;
|
||||
private Gfo_url_encoder_itm_hex encoder_hex;
|
||||
private byte bicode_mark;
|
||||
public Gfo_url_encoder_mkr Init(byte bicode_mark) {
|
||||
this.bicode_mark = bicode_mark;
|
||||
encode_ary = new Gfo_url_encoder_itm[256]; decode_ary = new Gfo_url_encoder_itm[256];
|
||||
Gfo_url_encoder_itm_hex hex = new Gfo_url_encoder_itm_hex(bicode_mark);
|
||||
encoder_hex = new Gfo_url_encoder_itm_hex(bicode_mark);
|
||||
for (int i = 0; i < 256; ++i) {
|
||||
encode_ary[i] = hex; // default encode to hex
|
||||
decode_ary[i] = Gfo_url_encoder_itm_same.Instance; // default decode to same; needed for files; EX: A!%21.png -> A!!.png;
|
||||
encode_ary[i] = encoder_hex; // default encode to hex
|
||||
decode_ary[i] = Gfo_url_encoder_itm_same.Instance; // default decode to same; needed for files; EX: A!%21.png -> A!!.png;
|
||||
}
|
||||
decode_ary[bicode_mark] = encoder_hex;
|
||||
return this;
|
||||
}
|
||||
public Gfo_url_encoder_mkr Init__encode_hex(int... ary) {
|
||||
for (int i = 0; i < ary.length; i++) {
|
||||
int idx = ary[i];
|
||||
encode_ary[idx] = encoder_hex;
|
||||
}
|
||||
decode_ary[bicode_mark] = hex;
|
||||
return this;
|
||||
}
|
||||
public Gfo_url_encoder_mkr Init__same__rng(int bgn, int end) {
|
||||
@@ -49,7 +60,7 @@ public class Gfo_url_encoder_mkr {
|
||||
return this;
|
||||
}
|
||||
public Gfo_url_encoder_mkr Init__decode_mark(byte decode_mark) {
|
||||
decode_ary[decode_mark & 0xff] = new Gfo_url_encoder_itm_hex(decode_mark);// PATCH.JAVA:need to convert to unsigned byte
|
||||
decode_ary[decode_mark & 0xff] = encoder_hex;// PATCH.JAVA:need to convert to unsigned byte
|
||||
return this;
|
||||
}
|
||||
public Gfo_url_encoder_mkr Init__diff__one(byte src, byte trg) {
|
||||
@@ -57,18 +68,17 @@ public class Gfo_url_encoder_mkr {
|
||||
encode_ary[src] = decode_ary[trg] = itm;
|
||||
return this;
|
||||
}
|
||||
public Gfo_url_encoder_mkr Init__diff__many(byte bicode_mark, int... ary) {
|
||||
Gfo_url_encoder_itm_hex hex = new Gfo_url_encoder_itm_hex(bicode_mark);
|
||||
public Gfo_url_encoder_mkr Init__diff__many(int... ary) {
|
||||
int len = ary.length;
|
||||
for (int i = 0; i < len; i++) {
|
||||
int idx = ary[i];
|
||||
encode_ary[idx] = decode_ary[idx] = hex;
|
||||
encode_ary[idx] = decode_ary[idx] = encoder_hex;
|
||||
}
|
||||
decode_ary[bicode_mark] = hex;
|
||||
decode_ary[bicode_mark] = encoder_hex;
|
||||
return this;
|
||||
}
|
||||
public Gfo_url_encoder_mkr Init__html_ent(byte src, Btrie_slim_mgr trie) {
|
||||
Gfo_url_encoder_itm_html_ent itm = new Gfo_url_encoder_itm_html_ent(trie);
|
||||
public Gfo_url_encoder_mkr Init__html_ent(byte src, Btrie_slim_mgr trie, boolean encode_unknown_amp) {
|
||||
Gfo_url_encoder_itm_html_ent itm = new Gfo_url_encoder_itm_html_ent(trie, encode_unknown_amp);
|
||||
encode_ary[src] = itm;
|
||||
return this;
|
||||
}
|
||||
|
||||
@@ -17,13 +17,29 @@ package gplx.langs.htmls.encoders; import gplx.*; import gplx.langs.*; import gp
|
||||
import org.junit.*;
|
||||
public class Gfo_url_encoder_tst {
|
||||
private final Gfo_url_encoder_fxt fxt = new Gfo_url_encoder_fxt();
|
||||
@Test public void Id__nums() {fxt.Encoder_id().Test__bicode("0123456789" , "0123456789");}
|
||||
@Test public void Id__ltrs_lower() {fxt.Encoder_id().Test__bicode("abcdefghijklmnopqrstuvwxyz" , "abcdefghijklmnopqrstuvwxyz");}
|
||||
@Test public void Id__ltrs_upper() {fxt.Encoder_id().Test__bicode("ABCDEFGHIJKLMNOPQRSTUVWXYZ" , "ABCDEFGHIJKLMNOPQRSTUVWXYZ");}
|
||||
@Test public void Id__syms() {fxt.Encoder_id().Test__encode("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", ".21.22.23.24.25.26.27.28.29.2A.2B.2C-..2F:.3B.3C.3D.3E.3F.40.5B.5C.5D.5E_.60.7B.7C.7D.7E");} // NOTE: not reversible since "." is encode_marker but not encoded
|
||||
@Test public void Id__foreign() {fxt.Encoder_id().Test__bicode("aéb", "a.C3.A9b");}
|
||||
@Test public void Id__nbsp() {fxt.Encoder_id().Test__encode("a b", "a.C2.A0b");} // NOTE: not just .A0 (160) but utf8-encoded .C2.A0
|
||||
@Test public void Id__nums() {fxt.Encoder_id().Test__bicode("0123456789");}
|
||||
@Test public void Id__ltrs_lower() {fxt.Encoder_id().Test__bicode("abcdefghijklmnopqrstuvwxyz");}
|
||||
@Test public void Id__ltrs_upper() {fxt.Encoder_id().Test__bicode("ABCDEFGHIJKLMNOPQRSTUVWXYZ");}
|
||||
@Test public void Id__syms_0() {fxt.Encoder_id().Test__bicode("!\"#$%&'()*+,-./");} // ISSUE#:462; DATE:2019-05-12
|
||||
@Test public void Id__syms_1() {fxt.Encoder_id().Test__bicode(":;=?@");} // ISSUE#:462; DATE:2019-05-12
|
||||
@Test public void Id__syms_2() {fxt.Encoder_id().Test__bicode("[\\]^`");} // ISSUE#:462; DATE:2019-05-12
|
||||
@Test public void Id__syms_3() {fxt.Encoder_id().Test__bicode("{|}~");}// ISSUE#:462; DATE:2019-05-12
|
||||
@Test public void Id__foreign() {fxt.Encoder_id().Test__bicode("aéb");}
|
||||
@Test public void Id__space() {fxt.Encoder_id().Test__bicode("a b", "a_b");}
|
||||
@Test public void Id__syms_1_angles() { // NOTE: should not be encoded, but will break existing tests; EX:{{#tag:pre|a|id='<br/>'}}; DATE:2019-05-12;
|
||||
fxt.Encoder_id().Test__encode("<", ".3C");
|
||||
fxt.Encoder_id().Test__decode(".3C", "<");
|
||||
fxt.Encoder_id().Test__encode(">", ".3E");
|
||||
fxt.Encoder_id().Test__decode(".3E", ">");
|
||||
}
|
||||
@Test public void Id__syms_2_lodash() { // ISSUE#:462; DATE:2019-05-12
|
||||
fxt.Encoder_id().Test__encode("_", "_");
|
||||
fxt.Encoder_id().Test__decode("_", " ");
|
||||
}
|
||||
@Test public void Id__nbsp() {
|
||||
fxt.Encoder_id().Test__encode("a b", "a.C2.A0b"); // NOTE: not just .A0 (160) but utf8-encoded .C2.A0
|
||||
fxt.Encoder_id().Test__decode("a.C2.A0b", "a b"); // WS is nbsp
|
||||
}
|
||||
@Test public void Id__err() {
|
||||
byte[] raw = Bry_.new_a7("0%.jpg");
|
||||
Bry_bfr tmp_bfr = Bry_bfr_.New();
|
||||
@@ -39,7 +55,7 @@ public class Gfo_url_encoder_tst {
|
||||
fxt.Encoder_href().Test__encode("a b", "a_b");
|
||||
}
|
||||
@Test public void Href__special_and_anchor() { // PURPOSE: MediaWiki encodes with % for ttls, but . for anchors; REF:Title.php!(before-anchor)getLocalUrl;wfUrlencode (after-anchor)escapeFragmentForURL
|
||||
fxt.Encoder_href().Test__bicode("^#^", "%5E#.5E");
|
||||
fxt.Encoder_href().Test__bicode("^#^", "%5E#^");
|
||||
fxt.Encoder_href().Test__encode("A#", "A#");
|
||||
}
|
||||
@Test public void Href__invalid() { // PURPOSE: check that invalid url decodings are rendered literally; DATE:2014-04-10
|
||||
@@ -56,6 +72,7 @@ class Gfo_url_encoder_fxt {
|
||||
public Gfo_url_encoder_fxt Encoder_url() {encoder = Gfo_url_encoder_.Http_url; return this;}
|
||||
public Gfo_url_encoder_fxt Encoder_ttl() {encoder = Gfo_url_encoder_.Mw_ttl; return this;}
|
||||
public Gfo_url_encoder_fxt Encoder_fsys_safe() {encoder = Gfo_url_encoder_.New__fsys_wnt().Make(); return this;}
|
||||
public void Test__bicode(String raw) {Test__bicode(raw, raw);}
|
||||
public void Test__bicode(String raw, String encoded) {
|
||||
Test__encode(raw, encoded);
|
||||
Test__decode(encoded, raw);
|
||||
|
||||
Reference in New Issue
Block a user