Table_of_contents: Skip whitespace after slash; Add <mark> as valid HTML tag [#542]

staging
gnosygnu 4 years ago
parent bc65e9fa61
commit ed010f76a8

@ -81,8 +81,8 @@ public class Gfh_tag_ { // NOTE: not serialized; used by tag_rdr
, Id__rp = 59
, Id__rt = 60
, Id__form = 61
, Id__mark = 62
;
// private static final int Id__ary_max = 60;
public static final byte[]
Bry__a = Bry_.new_a7("a")
, Bry__ul = Bry_.new_a7("ul")
@ -167,6 +167,7 @@ public class Gfh_tag_ { // NOTE: not serialized; used by tag_rdr
.Add_str_int("rp" , Id__rp)
.Add_str_int("rt" , Id__rt)
.Add_str_int("form" , Id__form)
.Add_str_int("mark" , Id__mark)
;
public static String To_str(int tid) {
switch (tid) {
@ -235,6 +236,7 @@ public class Gfh_tag_ { // NOTE: not serialized; used by tag_rdr
case Id__rp: return "rp";
case Id__rt: return "rt";
case Id__form: return "form";
case Id__mark: return "mark";
default: throw Err_.new_unhandled(tid);
}
}

@ -25,6 +25,7 @@ public class Gfh_tag_rdr {
// source bytes being read; assigned by Init
public byte[] Src() {return src;} private byte[] src;
// end position of the readable range in src; assigned by Init; reaching it while reading a tag is reported as EOS
public int Src_end() {return src_end;} private int src_end;
// error worker owned by this reader; one instance is created per reader and never replaced (final)
public Bry_err_wkr Err_wkr() {return err_wkr;} private final Bry_err_wkr err_wkr = new Bry_err_wkr();
// opt-in flag (off by default): when set, ws between the slash and the angle_end of an inline tag is skipped; EX:"<br/ >"; returns this for chaining; ISSUE#:542
public Gfh_tag_rdr Skip_ws_after_slash_y_() {skip_ws_after_slash_y = true; return this;} private boolean skip_ws_after_slash_y;
// registers tag_name -> tag_id in name_hash; returns this for chaining
public Gfh_tag_rdr Reg(String tag_name, int tag_id) {name_hash.Add_str_int(tag_name, tag_id); return this;}
public Gfh_tag_rdr Init(byte[] ctx_name, byte[] src, int src_bgn, int src_end) {
this.src = src; this.pos = src_bgn; this.src_end = src_end;
@ -205,6 +206,9 @@ public class Gfh_tag_rdr {
case Byte_ascii.Slash: // EX: "<a/>"
name_end = name_pos;
tag_end = name_pos + 1; if (tag_end == src_end) return Tag__eos(tag_bgn);// EX: "<a/EOS"
if (skip_ws_after_slash_y) {// opt-in: skip ws between slash and angle_end; EX:"<br/ >"; ISSUE#:542: DATE:2020-03-09
	tag_end = Bry_find_.Find_fwd_while_ws(src, tag_end, src_end);
	// ws can run all the way to src_end; src[tag_end] is read right after this block, so bail out as EOS first (same handling as "<a/EOS")
	if (tag_end == src_end) return Tag__eos(tag_bgn);// EX: "<a/ EOS"
}
if (src[tag_end] == Byte_ascii.Angle_end) {
atrs_end = name_end;
inline = true;
@ -267,6 +271,9 @@ public class Gfh_tag_rdr {
case Byte_ascii.Slash: // EX: "<a/>"
name_end = name_pos;
tag_end = name_pos + 1; if (tag_end == src_end) return Tag__eos(tag_bgn);// EX: "<a/EOS"
if (skip_ws_after_slash_y) {// opt-in: skip ws between slash and angle_end; EX:"<br/ >"; ISSUE#:542: DATE:2020-03-09
	tag_end = Bry_find_.Find_fwd_while_ws(src, tag_end, src_end);
	// ws can run all the way to src_end; src[tag_end] is read right after this block, so bail out as EOS first (same handling as "<a/EOS")
	if (tag_end == src_end) return Tag__eos(tag_bgn);// EX: "<a/ EOS"
}
if (src[tag_end] == Byte_ascii.Angle_end) {
atrs_end = name_end;
inline = true;

@ -17,7 +17,7 @@ package gplx.xowa.addons.htmls.tocs; import gplx.*; import gplx.xowa.*; import g
import gplx.langs.htmls.*; import gplx.langs.htmls.docs.*; import gplx.langs.htmls.encoders.*; import gplx.xowa.htmls.core.htmls.tidy.*;
import gplx.xowa.parsers.amps.*; import gplx.core.primitives.*;
class Xoh_toc_wkr__txt {
private final Gfh_tag_rdr tag_rdr = Gfh_tag_rdr.New__html();
private final Gfh_tag_rdr tag_rdr = Gfh_tag_rdr.New__html().Skip_ws_after_slash_y_();
private final Bry_bfr anch_bfr = Bry_bfr_.New(), text_bfr = Bry_bfr_.New();
private final Gfo_url_encoder anch_encoder = Gfo_url_encoder_.New__html_id().Make();
private final Xop_amp_mgr amp_mgr = Xop_amp_mgr.Instance;
@ -105,6 +105,7 @@ class Xoh_toc_wkr__txt {
case Gfh_tag_.Id__i:
case Gfh_tag_.Id__b:
case Gfh_tag_.Id__bdi:
case Gfh_tag_.Id__mark:// include mark; ISSUE#:542: DATE:2020-03-09
print_tag = true;
break;
case Gfh_tag_.Id__span: // print span only if it has a dir attribute

@ -67,6 +67,9 @@ class Xoh_toc_wkr__txt__fxt {
if (expd_anch != null) Gftest.Eq__str(expd_anch, itm.Anch(), "anch");
if (expd_text != null) Gftest.Eq__str(expd_text, itm.Text(), "text");
}
// convenience form of the 3-arg Test__both: the single expd is used for both expected values
// NOTE(review): the two expected values look like anch and text (see the expd_anch / expd_text checks in this fixture) -- confirm
public void Test__both2(String html, String expd) {
	String expd_1st = expd;
	String expd_2nd = expd;
	this.Test__both(html, expd_1st, expd_2nd);
}
public void Test__remove_comment(String html, String expd) {
byte[] html_bry = Bry_.new_u8(html);
Gftest.Eq__str(expd, Gfh_utl.Del_comments(tmp, html_bry, 0, html_bry.length));

@ -23,6 +23,7 @@ public class Xoh_toc_wkr__txt__xnde__tst {
// inline formatting tags: the 2nd expected value has the tag stripped; the 3rd keeps it
// NOTE(review): 2nd / 3rd args look like expd_anch / expd_text -- confirm against fxt.Test__both
@Test public void Sup() {fxt.Test__both("<sup>a</sup>" , "a", "<sup>a</sup>");}
@Test public void Sub() {fxt.Test__both("<sub>a</sub>" , "a", "<sub>a</sub>");}
@Test public void Bdi() {fxt.Test__both("<bdi>a</bdi>" , "a", "<bdi>a</bdi>");}
// <mark> is expected to be kept the same way as <sup> / <sub> / <bdi>; ISSUE#:542
@Test public void Mark() {fxt.Test__both("<mark>a</mark>" , "a", "<mark>a</mark>");}
// span: kept only when it carries a dir attribute; otherwise only its content is expected
@Test public void Span() {fxt.Test__both("<span>a</span>" , "a", "a");}
@Test public void Span__id() {fxt.Test__both("<span id='1'>a</span>" , "a", "a");}
@Test public void Span__dir() {fxt.Test__both("<span dir=\"ltr\">a</span>" , "a", "<span dir=\"ltr\">a</span>");}
@ -31,6 +32,7 @@ public class Xoh_toc_wkr__txt__xnde__tst {
// tags that are expected to be dropped while their content is kept
@Test public void A() {fxt.Test__both("<a href=\"/wiki/A\">b</a>" , "b");}
@Test public void A__nest() {fxt.Test__both("<a href=\"/wiki/A\">b<i>c</i>d</a>" , "bcd", "b<i>c</i>d");}
@Test public void Br() {fxt.Test__both("a<br/>b" , "ab");}
// ws between slash and angle_end; expected result is the same as for Br; ISSUE#:542
@Test public void Br__ws() {fxt.Test__both2("a<br/ >b", "ab");}
@Test public void Br__dangling() {fxt.Test__both("a<br>b" , "ab");}
@Test public void Wbr__dangling() {fxt.Test__both("a<wbr>b" , "ab");}
@Test public void H2() {fxt.Test__both("a<h2>b</h2>c" , "abc");} // NOTE: not a valid test; MW actually generates "ab" b/c of tidy; see corresponding edit test; DATE:2016-06-28
@ -38,6 +40,9 @@ public class Xoh_toc_wkr__txt__xnde__tst {
// table markup is expected to be dropped with only the cell content kept; unknown tags are expected to be kept in the 3rd expected value
@Test public void Table() {fxt.Test__text("a<table><tr><td>b</td></tr></table>c" , "abc");}
@Test public void Unknown__i() {fxt.Test__both("a<unknown>b<i>c</i>d</unknown>e" , "abcde", "a<unknown>b<i>c</i>d</unknown>e");} // NOTE: technically, anch should be href_encoded a<unknown>b<i>c</i>d</unknown>e b/c <unknown> is not a valid tag; compare with known tags like <li> / <table> which are just stripped
@Test public void Unknown__a() {fxt.Test__both("a<unknown>b<a>c</a>d</unknown>e" , "abcde", "a<unknown>bcd</unknown>e");}
// @Test public void Br_w_space() {
// fxt.Test__remove_comment("1<!--2-->3<!--4->5", "13");
// }
@Test public void Fail() {
String html = "<i><a href='b'>c</i></a>";
fxt.Init__tidy(html, "<i><a href='b'>c</a></i>");

Loading…
Cancel
Save