diff --git a/100_core/src/gplx/Bry_bfr.java b/100_core/src/gplx/Bry_bfr.java index fe9bb8257..195be3313 100644 --- a/100_core/src/gplx/Bry_bfr.java +++ b/100_core/src/gplx/Bry_bfr.java @@ -17,6 +17,7 @@ along with this program. If not, see . */ package gplx; import gplx.core.primitives.*; import gplx.core.brys.*; import gplx.core.encoders.*; +import gplx.langs.htmls.entitys.*; public class Bry_bfr { private Bry_bfr_mkr_mgr mkr_mgr; private int reset; public byte[] Bfr() {return bfr;} private byte[] bfr; @@ -293,7 +294,38 @@ public class Bry_bfr { } } if (clean) - Add(val); + Add_mid(val, bgn, end); + return this; + } + public Bry_bfr Add_bry_escape_html(byte[] val) {return Add_bry_escape_html(val, 0, val.length);} + public Bry_bfr Add_bry_escape_html(byte[] val, int bgn, int end) { // uses PHP rules for htmlspecialchars; REF.PHP:http://php.net/manual/en/function.htmlspecialchars.php + boolean clean = true; + for (int i = bgn; i < end; ++i) { + byte[] escaped = null; + byte b = val[i]; + switch (b) { + case Byte_ascii.Amp: escaped = Gfh_entity_.Amp_bry; break; + case Byte_ascii.Quote: escaped = Gfh_entity_.Quote_bry; break; + case Byte_ascii.Apos: escaped = Gfh_entity_.Apos_num_bry; break; + case Byte_ascii.Lt: escaped = Gfh_entity_.Lt_bry; break; + case Byte_ascii.Gt: escaped = Gfh_entity_.Gt_bry; break; + } + if (escaped == null && clean) { + continue; + } + else { + if (clean) { + clean = false; + this.Add_mid(val, bgn, i); + } + if (escaped == null) + this.Add_byte(b); + else + this.Add(escaped); + } + } + if (clean) + Add_mid(val, bgn, end); return this; } public Bry_bfr Add_str_u8_w_nl(String s) {Add_str_u8(s); return Add_byte_nl();} diff --git a/100_core/src/gplx/Bry_bfr_tst.java b/100_core/src/gplx/Bry_bfr_tst.java index e3a7b474c..aa5114a0d 100644 --- a/100_core/src/gplx/Bry_bfr_tst.java +++ b/100_core/src/gplx/Bry_bfr_tst.java @@ -16,7 +16,7 @@ You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ package gplx; -import org.junit.*; +import org.junit.*; import gplx.core.tests.*; public class Bry_bfr_tst { private Bry_bfr bb = Bry_bfr_.New(); @Before public void setup() {bb.Clear();} private ByteAryBfr_fxt fxt = new ByteAryBfr_fxt(); @@ -187,9 +187,16 @@ public class Bry_bfr_tst { fxt.Test_Add_int_pad_bgn(Byte_ascii.Num_0, 3, 1000, "1000"); } @Test public void Add_bry_escape() { - fxt.Test_Add_bry_escape("abc" , "abc"); // nothing to escape - fxt.Test_Add_bry_escape("a'bc" , "a''bc"); // single escape (code handles first quote differently) - fxt.Test_Add_bry_escape("a'b'c" , "a''b''c"); // double escape (code handles subsequent quotes different than first) + fxt.Test__add_bry_escape("abc" , "abc"); // nothing to escape + fxt.Test__add_bry_escape("a'bc" , "a''bc"); // single escape (code handles first quote differently) + fxt.Test__add_bry_escape("a'b'c" , "a''b''c"); // double escape (code handles subsequent quotes different than first) + fxt.Test__add_bry_escape("abc", 1, 2 , "b"); // nothing to escape + } + @Test public void Add_bry_escape_html() { + fxt.Test__add_bry_escape_html("abc" , "abc"); // escape=none + fxt.Test__add_bry_escape_html("a&\"'<>b" , "a&"'<>b"); // escape=all; code handles first escape differently + fxt.Test__add_bry_escape_html("a&b&c" , "a&b&c"); // staggered; code handles subsequent escapes differently + fxt.Test__add_bry_escape_html("abc", 1, 2 , "b"); // by index; fixes bug in initial implementation } @Test public void Insert_at() { fxt.Test_Insert_at("abcd", 0, "xyz" , "xyzabcd"); // bgn @@ -217,13 +224,15 @@ public class Bry_bfr_tst { } class ByteAryBfr_fxt { private final Bry_bfr bfr = Bry_bfr_.Reset(16); + public Bry_bfr Bfr() {return bfr;} public void Clear() { bfr.ClearAndReset(); } public void Test_Add_int_pad_bgn(byte pad_byte, int str_len, int val, String expd) {Tfds.Eq(expd, bfr.Add_int_pad_bgn(pad_byte, str_len, val).To_str_and_clear());} - public void Test_Add_bry_escape(String val, String expd) { - byte[] val_bry = Bry_.new_u8(val); - Tfds.Eq(expd, bfr.Add_bry_escape(Byte_ascii.Apos, Byte_.Ary(Byte_ascii.Apos, Byte_ascii.Apos), val_bry, 0, val_bry.length).To_str_and_clear()); + public void Test__add_bry_escape(String src, String expd) {Test__add_bry_escape(src, 0, String_.Len(src), expd);} + public void Test__add_bry_escape(String src, int src_bgn, int src_end, String expd) { + byte[] val_bry = Bry_.new_u8(src); + Tfds.Eq(expd, bfr.Add_bry_escape(Byte_ascii.Apos, Byte_.Ary(Byte_ascii.Apos, Byte_ascii.Apos), val_bry, src_bgn, src_end).To_str_and_clear()); } public void Test_Insert_at(String init, int pos, String val, String expd) {Tfds.Eq(expd, bfr.Add_str_u8(init).Insert_at(pos, Bry_.new_u8(val)).To_str_and_clear());} public void Test_Insert_at(String init, int pos, String val, int val_bgn, int val_end, String expd) {Tfds.Eq(expd, bfr.Add_str_u8(init).Insert_at(pos, Bry_.new_u8(val), val_bgn, val_end).To_str_and_clear());} @@ -233,4 +242,8 @@ class ByteAryBfr_fxt { public void Test__to_bry_ary_and_clear(String bfr_str, String... expd) { Tfds.Eq_ary(expd, String_.Ary(bfr.Add_str_u8(bfr_str).To_bry_ary_and_clear())); } + public void Test__add_bry_escape_html(String src, String expd) {Test__add_bry_escape_html(src, 0, String_.Len(src), expd);} + public void Test__add_bry_escape_html(String src, int src_bgn, int src_end, String expd) { + Gftest.Eq__bry(Bry_.new_u8(expd), bfr.Add_bry_escape_html(Bry_.new_u8(src), src_bgn, src_end).To_bry_and_clear()); + } } diff --git a/400_xowa/src/gplx/xowa/addons/bldrs/mass_parses/parses/wkrs/Xomp_parse_wkr.java b/400_xowa/src/gplx/xowa/addons/bldrs/mass_parses/parses/wkrs/Xomp_parse_wkr.java index 4f9ec1ed8..5ad6491cb 100644 --- a/400_xowa/src/gplx/xowa/addons/bldrs/mass_parses/parses/wkrs/Xomp_parse_wkr.java +++ b/400_xowa/src/gplx/xowa/addons/bldrs/mass_parses/parses/wkrs/Xomp_parse_wkr.java @@ -111,7 +111,7 @@ public class Xomp_parse_wkr implements Gfo_invk { // if ns changed and prv_ns is main if (cur_ns != prv_ns) { if (prv_ns == gplx.xowa.wikis.nss.Xow_ns_.Tid__main) - wiki.Cache_mgr().Free_mem__all(); // NOTE: clears page and wbase cache only; needed else OutOfMemory error for en.w in 25th hour; DATE:2017-01-07 + wiki.Cache_mgr().Free_mem__all(); // NOTE: clears page and wbase cache only; needed else OutOfMemory error for en.w in 25th hour; DATE:2017-01-11 prv_ns = cur_ns; } Xoae_page wpg = Xoae_page.New(wiki, ttl); diff --git a/400_xowa/src/gplx/xowa/parsers/mws/prepros/Xomw_prepro_wkr.java b/400_xowa/src/gplx/xowa/parsers/mws/prepros/Xomw_prepro_wkr.java index a43c06148..8dda808c2 100644 --- a/400_xowa/src/gplx/xowa/parsers/mws/prepros/Xomw_prepro_wkr.java +++ b/400_xowa/src/gplx/xowa/parsers/mws/prepros/Xomw_prepro_wkr.java @@ -132,11 +132,11 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls int start_pos = Bry_find_.Find_fwd(src, Bry__only_include_bgn, i, src_len); if (start_pos == Bry_find_.Not_found) { // Ignored section runs to the end - accum.Add_str_a7("").Add(htmlspecialchars(Bry_.Mid(src, i))).Add_str_a7(""); + accum.Add_str_a7("").Add_bry_escape_html(src, i, src_len).Add_str_a7(""); break; } int tag_end_pos = start_pos + Bry__only_include_bgn.length; // past-the-end - accum.Add_str_a7("").Add(htmlspecialchars(Bry_.Mid(src, i, tag_end_pos))).Add_str_a7(""); + accum.Add_str_a7("").Add_bry_escape_html(src, i, tag_end_pos).Add_str_a7(""); i = tag_end_pos; find_only_include = false; } @@ -205,7 +205,7 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls literal_len++; } if (literal_len > 0) { - accum.Add(htmlspecialchars(Bry_.Mid(src, i, i + literal_len))); + accum.Add_bry_escape_html(src, i, i + literal_len); i += literal_len; } if (i >= src_len) { @@ -277,7 +277,7 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls int end_pos = Bry_find_.Find_fwd(src, Bry__comment_end, i + 4, src_len); if (end_pos == Bry_find_.Not_found) { // Unclosed comment in input, runs to end - accum.Add_str_a7("").Add(htmlspecialchars(Bry_.Mid(src, i))).Add_str_a7(""); + accum.Add_str_a7("").Add_bry_escape_html(src, i, src_len).Add_str_a7(""); i = src_len; } else { @@ -331,7 +331,7 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls break; } inner = Bry_.Mid(src, bgn_pos, end_pos); - accum.Add_str_a7("").Add(htmlspecialchars(inner)).Add_str_a7(""); + accum.Add_str_a7("").Add_bry_escape_html(inner).Add_str_a7(""); } // Do a line-start run next time to look for headings after the comment @@ -353,7 +353,7 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls } i = end_pos + 1; inner = Bry_.Mid(src, bgn_pos, end_pos + 1); - accum.Add_str_a7("").Add(htmlspecialchars(inner)).Add_str_a7(""); + accum.Add_str_a7("").Add_bry_escape_html(inner).Add_str_a7(""); continue; } } @@ -375,7 +375,7 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls // Handle ignored tags if (ignored_tags.Has(name)) { - accum.Add_str_a7("").Add(htmlspecialchars(Bry_.Mid(src, i, tag_end_pos + 1))).Add_str_a7(""); + accum.Add_str_a7("").Add_bry_escape_html(src, i, tag_end_pos + 1).Add_str_a7(""); i = tag_end_pos + 1; continue; } @@ -426,7 +426,7 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls && elem_end_found) { inner = Bry_.Mid(src, tag_end_pos + 1, elem_end_lhs); i = elem_end_rhs; - tmp_bfr.Add_str_a7("").Add(htmlspecialchars(Bry_.Mid(src, elem_end_lhs, elem_end_rhs))).Add_str_a7(""); + tmp_bfr.Add_str_a7("").Add_bry_escape_html(src, elem_end_lhs, elem_end_rhs).Add_str_a7(""); close = tmp_bfr.To_bry_and_clear(); } else { @@ -440,7 +440,7 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls else { // Don't match the tag, treat opening tag as literal and resume parsing. i = tag_end_pos + 1; - accum.Add(htmlspecialchars(Bry_.Mid(src, tag_bgn_pos, tag_end_pos + 1))); + accum.Add_bry_escape_html(src, tag_bgn_pos, tag_end_pos + 1); // Cache results, otherwise we have O(N^2) performance for input like ... no_more_closing_tag.Add_if_dupe_use_nth(name, name); continue; @@ -450,18 +450,26 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls // and just become tags if (ignored_elements.Has(name)) { - accum.Add_str_a7("").Add(htmlspecialchars(Bry_.Mid(src, tag_bgn_pos, i))).Add_str_a7(""); + accum.Add_str_a7("").Add_bry_escape_html(src, tag_bgn_pos, i).Add_str_a7(""); continue; } accum.Add_str_a7(""); - byte[] atr_bry = atr_end <= atr_bgn ? Bry_.Empty : Bry_.Mid(src, atr_bgn, atr_end); + // PORTED: + // if ( $attrEnd <= $attrStart ) { + // $attr = ''; + // } else { + // $attr = substr( $text, $attrStart, $attrEnd - $attrStart ); + // } accum.Add_str_a7("").Add(name).Add_str_a7(""); // Note that the attr element contains the whitespace between name and attribute, // this is necessary for precise reconstruction during pre-save transform. - accum.Add_str_a7("").Add(htmlspecialchars(atr_bry)).Add_str_a7(""); + accum.Add_str_a7(""); + if (atr_end > atr_bgn) + accum.Add_bry_escape_html(src, atr_bgn, atr_end); + accum.Add_str_a7(""); if (inner != null) { - accum.Add_str_a7("").Add(htmlspecialchars(inner)).Add_str_a7(""); + accum.Add_str_a7("").Add_bry_escape_html(inner).Add_str_a7(""); } accum.Add(close).Add_str_a7(""); } @@ -582,7 +590,8 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls } else { // Add literal brace(s) - accum.Add(htmlspecialchars(Bry_.Repeat_bry(cur_char, count))); + for (int j = 0; j < count; j++) + accum.Add_bry_escape_html(cur_char); } i += count; } @@ -613,7 +622,8 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls if (matching_count <= 0) { // No matching element found in callback array // Output a literal closing brace and continue - accum.Add(htmlspecialchars(Bry_.Repeat_bry(cur_char, count))); + for (int j = 0; j < count; j++) + accum.Add_bry_escape_html(cur_char); i += count; continue; } @@ -718,15 +728,6 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls root_accum.Add_str_a7(""); return root_accum.To_bry_and_clear(); } - private byte[] htmlspecialchars(byte[] bry) { -// http://php.net/manual/en/function.htmlspecialchars.php -//& (ampersand) & -//" (double quote) ", unless ENT_NOQUOTES is set -//' (single quote) ' (for ENT_HTML401) or ' (for ENT_XML1, ENT_XHTML or ENT_HTML5), but only when ENT_QUOTES is set -//< (less than) < -//> (greater than) > - return bry; - } private Xomw_prepro_rule Get_rule(byte[] bry) { if (Bry_.Eq(bry, rule_curly.bgn)) return rule_curly; else if (Bry_.Eq(bry, rule_brack.bgn)) return rule_brack; diff --git a/400_xowa/src/gplx/xowa/parsers/mws/prepros/Xomw_prepro_wkr__tst.java b/400_xowa/src/gplx/xowa/parsers/mws/prepros/Xomw_prepro_wkr__tst.java index d34d645e0..fddd546f0 100644 --- a/400_xowa/src/gplx/xowa/parsers/mws/prepros/Xomw_prepro_wkr__tst.java +++ b/400_xowa/src/gplx/xowa/parsers/mws/prepros/Xomw_prepro_wkr__tst.java @@ -32,15 +32,13 @@ public class Xomw_prepro_wkr__tst { fxt.Test__parse("a{{{b}}}c", "abc"); } @Test public void Comment() { - fxt.Test__parse("ac", "ac"); + fxt.Test__parse("ac", "a<!--b-->c"); } @Test public void Ext__pre() { - fxt.Test__parse("a
b
c", "apre id=\"1\"bc"); + fxt.Test__parse("a
b
c", "apre id="1"b</pre>c"); } /* TODO: -* htmlspecialchars - * for_inclusion; in String * heading.general * heading.EOS: "==a" (no closing ==)