mirror of
https://github.com/gnosygnu/xowa.git
synced 2024-10-27 20:34:16 +00:00
Mw_parse: Support ext xml nodes
This commit is contained in:
parent
7813e41e34
commit
bac00076b5
@ -54,12 +54,12 @@ class Xomw_prepro_elem {
|
||||
public final int type;
|
||||
public final byte[] name;
|
||||
public final byte[] tag_end_lhs;
|
||||
public static final int Type__comment = 0;
|
||||
public static final int Type__comment = 0, Type__other = 1;
|
||||
}
|
||||
class Xomw_prepro_curchar_itm {
|
||||
public Xomw_prepro_curchar_itm(byte[] bry) {
|
||||
public Xomw_prepro_curchar_itm(byte[] bry, byte type) {
|
||||
this.bry = bry;
|
||||
this.type = bry[0];
|
||||
this.type = type;
|
||||
}
|
||||
public byte[] bry;
|
||||
public byte type;
|
||||
|
@ -38,17 +38,21 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls
|
||||
, ignored_elements__includeonly = Hash_adp_bry.cs().Add_many_str("includeonly");
|
||||
private static Btrie_slim_mgr Cur_char_trie__new() {
|
||||
Btrie_slim_mgr rv = Btrie_slim_mgr.ci_a7();
|
||||
String[] ary = new String[] {"|", "=", "<", "\n", "{", "[", "-{"};
|
||||
String[] ary = new String[] {"|", "=", "<", "\n", "{", "[", "-{", "}", "]"};
|
||||
for (String str : ary) {
|
||||
byte[] bry = Bry_.new_a7(str);
|
||||
rv.Add_obj(bry, new Xomw_prepro_curchar_itm(bry));
|
||||
rv.Add_obj(bry, new Xomw_prepro_curchar_itm(bry, bry[0]));
|
||||
}
|
||||
|
||||
// handle "}-" separately
|
||||
byte[] langv_end = Bry_.new_a7("}-");
|
||||
rv.Add_obj(langv_end, new Xomw_prepro_curchar_itm(langv_end, Byte_ascii.Bang));
|
||||
return rv;
|
||||
}
|
||||
|
||||
public byte[] Preprocess_to_xml(byte[] src, boolean for_inclusion) {
|
||||
xmlish_elems.Clear(); // TODO.XO: parser->getStripList();
|
||||
// PERF: xmlish_allow_missing_end_tag.Add_many_str("includeonly", "noinclude", "onlyinclude")
|
||||
xmlish_elems.Clear(); // TODO.XO: parser->getStripList(); pre|nowiki|gallery|indicator|ref|reference
|
||||
// RELIC: $xmlishAllowMissingEndTag = [ 'includeonly', 'noinclude', 'onlyinclude' ];
|
||||
boolean enable_only_include = false;
|
||||
|
||||
Hash_adp_bry ignored_tags, ignored_elements;
|
||||
@ -67,12 +71,19 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls
|
||||
xmlish_elems.Add_many_str("includeonly");
|
||||
}
|
||||
|
||||
// RELIC:
|
||||
// $xmlishRegex = implode( '|', array_merge( $xmlishElements, $ignoredTags ) );
|
||||
// Use "A" modifier (anchored) instead of "^", because ^ doesn't work with an offset
|
||||
// $elementsRegex = "~($xmlishRegex)(?:\s|\/>|>)|(!--)~iA";
|
||||
|
||||
stack.Clear();
|
||||
|
||||
// RELIC:
|
||||
// $searchBase = "[{<\n"; # }
|
||||
// For fast reverse searches
|
||||
// $revText = strrev( $text );
|
||||
// $lengthText = strlen( $text );
|
||||
|
||||
// Input pointer, starts out pointing to a pseudo-newline before the start
|
||||
int i = 0;
|
||||
|
||||
@ -102,17 +113,18 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls
|
||||
// Do a line-start run without outputting an LF character
|
||||
boolean fake_line_start = true;
|
||||
|
||||
// XOWA-related init
|
||||
// XOWA: init
|
||||
int src_len = src.length;
|
||||
int found = -1;
|
||||
|
||||
elements_trie.Clear();
|
||||
elements_trie.Add_obj("!--", new Xomw_prepro_elem(Xomw_prepro_elem.Type__comment, Bry_.new_a7("comment")));
|
||||
Btrie_slim_mgr elements_end_trie = Btrie_slim_mgr.ci_a7();
|
||||
|
||||
byte[] cur_char = Bry_.Empty;
|
||||
byte[] cur_closing = Bry_.Empty;
|
||||
byte[] inner = null;
|
||||
Xomw_prepro_rule rule = null;
|
||||
|
||||
// XOWA: xml elements
|
||||
elements_trie.Clear();
|
||||
elements_trie.Add_obj("pre", new Xomw_prepro_elem(Xomw_prepro_elem.Type__other, Bry_.new_a7("pre")));
|
||||
elements_trie.Add_obj("!--", new Xomw_prepro_elem(Xomw_prepro_elem.Type__comment, Bry_.new_a7("comment")));
|
||||
|
||||
while (true) {
|
||||
if (find_only_include) {
|
||||
@ -129,34 +141,33 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls
|
||||
find_only_include = false;
|
||||
}
|
||||
|
||||
Xomw_prepro_rule rule = null;
|
||||
if (fake_line_start) {
|
||||
found = Found__line_bgn;
|
||||
cur_char = Bry_.Empty;
|
||||
}
|
||||
else {
|
||||
// Find next opening brace, closing brace or pipe
|
||||
|
||||
// $search = $searchBase;
|
||||
// Find next opening brace, closing brace or pipe
|
||||
// PORTED: $search = $searchBase;
|
||||
if (stack.top == null) {
|
||||
cur_closing = Bry_.Empty;
|
||||
}
|
||||
else {
|
||||
cur_closing = stack.top.close;
|
||||
// $search .= $currentClosing;
|
||||
// RELIC: $search .= $currentClosing;
|
||||
}
|
||||
if (find_pipe) {
|
||||
// $search .= '|';
|
||||
// RELIC: $search .= '|';
|
||||
}
|
||||
if (find_equals) {
|
||||
// First equals will be for the template
|
||||
// $search .= '=';
|
||||
// RELIC: $search .= '=';
|
||||
}
|
||||
|
||||
// Output literal section, advance input counter
|
||||
// PORTED: "$literalLength = strcspn(src, $search, i)"; NOTE: no trie b/c of frequent changes to $search
|
||||
int literal_len = 0;
|
||||
// NOTE: hard-coded translation of "strcspn(src, $search, i)"; no trie b/c of frequent additions / deletions
|
||||
boolean loop_stop = false;
|
||||
// read String until search_char is found
|
||||
for (int j = i; j < src_len; j++) {
|
||||
byte b = src[j];
|
||||
switch (b) { // handle '$searchBase = "[{<\n";'
|
||||
@ -197,7 +208,6 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls
|
||||
accum.Add(htmlspecialchars(Bry_.Mid(src, i, i + literal_len)));
|
||||
i += literal_len;
|
||||
}
|
||||
|
||||
if (i >= src_len) {
|
||||
if (Bry_.Eq(cur_closing, Byte_ascii.Nl_bry)) {
|
||||
// Do a past-the-end run to finish off the heading
|
||||
@ -210,31 +220,28 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls
|
||||
}
|
||||
}
|
||||
else {
|
||||
boolean match = false;
|
||||
if (cur_closing != Bry_.Empty) {
|
||||
if (Bry_.Match(src, i, i + cur_closing.length, cur_closing)) {
|
||||
match = true;
|
||||
found = Found__close;
|
||||
cur_char = cur_closing;
|
||||
// PORTED: corresponding block of MW code; note complexity to handle 2 char byte[]
|
||||
Xomw_prepro_curchar_itm cur_char_itm = (Xomw_prepro_curchar_itm)cur_char_trie.Match_at(trv, src, i, src_len);
|
||||
if (cur_char_itm != null) {
|
||||
cur_char = cur_char_itm.bry;
|
||||
switch (cur_char_itm.type) {
|
||||
case Byte_ascii.Pipe: found = Found__pipe; break;
|
||||
case Byte_ascii.Eq: found = Found__equals; break;
|
||||
case Byte_ascii.Angle_bgn: found = Found__angle; break;
|
||||
case Byte_ascii.Nl: found = in_heading ? Found__line_end : Found__line_bgn; break;
|
||||
|
||||
// PORT:"elseif ( $curChar == $currentClosing )"
|
||||
case Byte_ascii.Curly_end: found = Found__close; break;
|
||||
case Byte_ascii.Brack_end: found = Found__close; break;
|
||||
case Byte_ascii.Bang: found = Found__close; break;
|
||||
|
||||
// PORT:"elseif ( isset( $this->rules[$curChar] ) )"
|
||||
case Byte_ascii.Curly_bgn: {found = Found__open; rule = rule_curly; break;}
|
||||
case Byte_ascii.Brack_bgn: {found = Found__open; rule = rule_brack; break;}
|
||||
case Byte_ascii.Dash: {found = Found__open; rule = rule_langv; break;}
|
||||
}
|
||||
}
|
||||
else {
|
||||
Xomw_prepro_curchar_itm cur_char_itm = (Xomw_prepro_curchar_itm)cur_char_trie.Match_at(trv, src, i, src_len);
|
||||
if (cur_char_itm != null) {
|
||||
match = true;
|
||||
cur_char = cur_char_itm.bry;
|
||||
switch (cur_char_itm.type) {
|
||||
case Byte_ascii.Pipe: found = Found__pipe; break;
|
||||
case Byte_ascii.Eq: found = Found__equals; break;
|
||||
case Byte_ascii.Angle_bgn: found = Found__angle; break;
|
||||
case Byte_ascii.Nl: found = in_heading ? Found__line_end : Found__line_bgn; break;
|
||||
case Byte_ascii.Curly_bgn: {found = Found__open; rule = rule_curly; break;}
|
||||
case Byte_ascii.Brack_bgn: {found = Found__open; rule = rule_brack; break;}
|
||||
case Byte_ascii.Dash: {found = Found__open; rule = rule_langv; break;}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!match) {
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
@ -249,7 +256,8 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls
|
||||
continue;
|
||||
}
|
||||
|
||||
// Determine element name; $elementsRegex = "~($xmlishRegex)(?:\s|\/>|>)|(!--)~iA"; EX: "(span|div)(?:\s|\/>|>)|(!--)
|
||||
// Determine element name
|
||||
// PORT: $elementsRegex = "~($xmlishRegex)(?:\s|\/>|>)|(!--)~iA"; EX: "(span|div)(?:\s|\/>|>)|(!--)
|
||||
Xomw_prepro_elem element = (Xomw_prepro_elem)elements_trie.Match_at(trv, src, i + 1, src_len);
|
||||
if (element == null) {
|
||||
// Element name missing or not listed
|
||||
@ -260,12 +268,14 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls
|
||||
|
||||
// Handle comments
|
||||
if (element.type == Xomw_prepro_elem.Type__comment) {
|
||||
// To avoid leaving blank lines, when a sequence of space-separated comments is both preceded and followed by a newline
|
||||
// (ignoring spaces), then trim leading and trailing spaces and the trailing newline.
|
||||
// To avoid leaving blank lines, when a sequence of
|
||||
// space-separated comments is both preceded and followed by
|
||||
// a newline (ignoring spaces), then
|
||||
// trim leading and trailing spaces and the trailing newline.
|
||||
|
||||
// Find the end
|
||||
int comment_end_pos = Bry_find_.Find_fwd(src, Bry__comment_end, i + 4, src_len);
|
||||
if (comment_end_pos == Bry_find_.Not_found) {
|
||||
int end_pos = Bry_find_.Find_fwd(src, Bry__comment_end, i + 4, src_len);
|
||||
if (end_pos == Bry_find_.Not_found) {
|
||||
// Unclosed comment in input, runs to end
|
||||
accum.Add_str_a7("<comment>").Add(htmlspecialchars(Bry_.Mid(src, i))).Add_str_a7("</comment>");
|
||||
i = src_len;
|
||||
@ -276,9 +286,10 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls
|
||||
|
||||
// Search forwards for trailing whitespace
|
||||
// $wsEnd will be the position of the last space (or the '>' if there's none)
|
||||
int ws_end = Bry_find_.Find_fwd_while_space_or_tab(src, comment_end_pos + 3, src_len);
|
||||
int ws_end = Bry_find_.Find_fwd_while_space_or_tab(src, end_pos + 3, src_len);
|
||||
|
||||
// Keep looking forward as long as we're finding more comments.
|
||||
// Keep looking forward as long as we're finding more
|
||||
// comments.
|
||||
comments_list.Clear();
|
||||
comments_list.Add(new int[] {ws_bgn, ws_end});
|
||||
while (Bry_.Eq(src, ws_end + 1, ws_end + 5, Bry__comment_bgn)) {
|
||||
@ -292,9 +303,10 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls
|
||||
}
|
||||
|
||||
// Eat the line if possible
|
||||
// TODO: This could theoretically be done if $wsStart == 0, i.e. for comments at the overall start.
|
||||
// That's not how Sanitizer::removeHTMLcomments() did it, but it's a possible beneficial b/c break.
|
||||
int comment_bgn_pos = -1;
|
||||
// TODO: This could theoretically be done if $wsStart == 0, i.e. for comments at
|
||||
// the overall start. That's not how Sanitizer::removeHTMLcomments() did it, but
|
||||
// it's a possible beneficial b/c break.
|
||||
int bgn_pos = -1;
|
||||
if ( ws_bgn > 0
|
||||
&& Bry_.Eq(src, ws_bgn - 1, ws_bgn , Byte_ascii.Nl_bry)
|
||||
&& Bry_.Eq(src, ws_end + 1, ws_end + 2, Byte_ascii.Nl_bry)
|
||||
@ -313,12 +325,12 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls
|
||||
int comments_list_len = comments_list.Len();
|
||||
for (int j = 0; j < comments_list_len; j++) {
|
||||
int[] com = (int[])comments_list.Get_at(j);
|
||||
comment_bgn_pos = com[0];
|
||||
comment_end_pos = com[1] + 1;
|
||||
bgn_pos = com[0];
|
||||
end_pos = com[1] + 1;
|
||||
if (j == comments_list_len - 1) {
|
||||
break;
|
||||
}
|
||||
inner = Bry_.Mid(src, comment_bgn_pos, comment_end_pos);
|
||||
inner = Bry_.Mid(src, bgn_pos, end_pos);
|
||||
accum.Add_str_a7("<comment>").Add(htmlspecialchars(inner)).Add_str_a7("</comment>");
|
||||
}
|
||||
|
||||
@ -327,8 +339,8 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls
|
||||
}
|
||||
else {
|
||||
// No line to eat, just take the comment itself
|
||||
comment_bgn_pos = i;
|
||||
comment_end_pos += 2;
|
||||
bgn_pos = i;
|
||||
end_pos += 2;
|
||||
}
|
||||
|
||||
if (stack.top != null) {
|
||||
@ -337,22 +349,24 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls
|
||||
part.visual_end = ws_bgn;
|
||||
}
|
||||
// Else comments abutting, no change in visual end
|
||||
part.comment_end = comment_end_pos;
|
||||
part.comment_end = end_pos;
|
||||
}
|
||||
i = comment_end_pos + 1;
|
||||
inner = Bry_.Mid(src, comment_bgn_pos, comment_end_pos + 1);
|
||||
i = end_pos + 1;
|
||||
inner = Bry_.Mid(src, bgn_pos, end_pos + 1);
|
||||
accum.Add_str_a7("<comment>").Add(htmlspecialchars(inner)).Add_str_a7("</comment>");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
byte[] name = element.name;
|
||||
// RELIC:$lowerName = strtolower( $name );
|
||||
int atr_bgn = i + name.length + 1;
|
||||
|
||||
// Find end of tag
|
||||
int tag_end_pos = no_more_gt ? Bry_find_.Not_found : Bry_find_.Find_fwd(src, Byte_ascii.Angle_end, atr_bgn);
|
||||
if (tag_end_pos == Bry_find_.Not_found) {
|
||||
// Infinite backtrack; Disable tag search to prevent worst-case O(N^2) performance
|
||||
// Infinite backtrack
|
||||
// Disable tag search to prevent worst-case O(N^2) performance
|
||||
no_more_gt = true;
|
||||
accum.Add(Bry__escaped_lt);
|
||||
i++;
|
||||
@ -378,32 +392,38 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls
|
||||
else {
|
||||
atr_end = tag_end_pos;
|
||||
// Find closing tag
|
||||
// NOTE: translation of `preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",`
|
||||
Xomw_prepro_elem elem_end = null;
|
||||
// PORTED: `preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",`
|
||||
boolean elem_end_found = false;
|
||||
int elem_end_lhs = -1, elem_end_rhs = -1;
|
||||
for (int j = tag_end_pos + 1; j < src_len; j++) {
|
||||
elem_end = (Xomw_prepro_elem)elements_end_trie.Match_at(trv, src, j, src_len);
|
||||
elem_end_lhs = elem_end_rhs = trv.Pos();
|
||||
|
||||
// found a possible elem_end tag; validate "\s*>"
|
||||
if (elem_end != null) {
|
||||
elem_end_rhs = Bry_find_.Find_fwd_while(src, elem_end_rhs, src_len, Byte_ascii.Space);
|
||||
if (elem_end_rhs == src_len) {
|
||||
elem_end = null;
|
||||
}
|
||||
else {
|
||||
if (src[elem_end_rhs] == Byte_ascii.Gt)
|
||||
elem_end_rhs = elem_end_rhs + 1;
|
||||
else
|
||||
elem_end = null;
|
||||
}
|
||||
}
|
||||
if (elem_end != null)
|
||||
int elem_end_cur = tag_end_pos + 1;
|
||||
while (true) {
|
||||
// search for "</"
|
||||
elem_end_lhs = Bry_find_.Find_fwd(src, Bry__end_lhs, elem_end_cur, src_len);
|
||||
if (elem_end_lhs == Bry_find_.Not_found) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// verify $name
|
||||
elem_end_cur = elem_end_lhs + 2; // 2="</"
|
||||
int elem_end_tmp = elem_end_cur + name.length;
|
||||
if (!Bry_.Eq_ci_a7(name, src, elem_end_cur, elem_end_tmp)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// verify "\s*>"
|
||||
elem_end_cur = elem_end_tmp;
|
||||
elem_end_cur = Bry_find_.Find_fwd_while(src, elem_end_cur, src_len, Byte_ascii.Space);
|
||||
if (elem_end_cur == src_len) { // just "\s", but no ">"
|
||||
break;
|
||||
}
|
||||
if (src[elem_end_cur] == Byte_ascii.Gt) {
|
||||
elem_end_rhs = elem_end_cur + 1;
|
||||
elem_end_found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ( !no_more_closing_tag.Has(name)
|
||||
&& elem_end != null) {
|
||||
&& elem_end_found) {
|
||||
inner = Bry_.Mid(src, tag_end_pos + 1, elem_end_lhs);
|
||||
i = elem_end_rhs;
|
||||
tmp_bfr.Add_str_a7("<close>").Add(htmlspecialchars(Bry_.Mid(src, elem_end_lhs, elem_end_rhs))).Add_str_a7("</close>");
|
||||
@ -446,7 +466,8 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls
|
||||
accum.Add(close).Add_str_a7("</ext>");
|
||||
}
|
||||
else if (found == Found__line_bgn) {
|
||||
// Is this the start of a heading?; Line break belongs before the heading element in any case
|
||||
// Is this the start of a heading?
|
||||
// Line break belongs before the heading element in any case
|
||||
if (fake_line_start) {
|
||||
fake_line_start = false;
|
||||
} else {
|
||||
@ -454,12 +475,14 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls
|
||||
i++;
|
||||
}
|
||||
|
||||
int eq_end = Bry_find_.Find_fwd_while(src, i, i + 6, Byte_ascii.Eq); // strspn( $src, '=', $i, 6 );
|
||||
int eq_end = Bry_find_.Find_fwd_while(src, i, i + 6, Byte_ascii.Eq); // PORTED:strspn( $src, '=', $i, 6 );
|
||||
int count = i - eq_end;
|
||||
if (count == 1 && find_equals) {
|
||||
// DWIM: This looks kind of like a name/value separator.
|
||||
// Let's let the equals handler have it and break the potential heading.
|
||||
// This is heuristic, but AFAICT the methods for completely correct disambiguation are very complex.
|
||||
// Let's let the equals handler have it and break the
|
||||
// potential heading. This is heuristic, but AFAICT the
|
||||
// methods for completely correct disambiguation are very
|
||||
// complex.
|
||||
}
|
||||
else if (count > 0) {
|
||||
Xomw_prepro_piece piece = new Xomw_prepro_piece(Byte_ascii.Nl_bry, Byte_ascii.Nl_bry, count, i, false);
|
||||
@ -480,10 +503,12 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls
|
||||
Xomw_prepro_part part = piece.Get_current_part();
|
||||
|
||||
// Search back through the input to see if it has a proper close.
|
||||
// Do this using the reversed String since the other solutions (end anchor, etc.) are inefficient.
|
||||
// Do this using the reversed String since the other solutions
|
||||
// (end anchor, etc.) are inefficient.
|
||||
int search_bgn = Bry_find_.Find_bwd__while_space_or_tab(src, i, 0);
|
||||
if (part.comment_end != -1 && search_bgn -1 == part.comment_end) {
|
||||
// Comment found at line end; Search for equals signs before the comment
|
||||
// Comment found at line end
|
||||
// Search for equals signs before the comment
|
||||
search_bgn = part.visual_end;
|
||||
search_bgn -= Bry_find_.Find_bwd__while_space_or_tab(src, search_bgn, 0);
|
||||
}
|
||||
@ -499,14 +524,14 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls
|
||||
count = eq_len;
|
||||
if (count < 3) {
|
||||
count = 0;
|
||||
} else {
|
||||
}
|
||||
else {
|
||||
count = (count - 1) / 2;
|
||||
if (count > 6) count = 6;
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (eq_len < count)
|
||||
count = eq_len;
|
||||
if (eq_len < count) count = eq_len; // PORTED: $count = min( $equalsLength, $count );
|
||||
}
|
||||
if (count > 0) {
|
||||
// Normal match, output <h>
|
||||
@ -533,17 +558,21 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls
|
||||
|
||||
// Append the result to the enclosing accumulator
|
||||
accum.Add(element);
|
||||
// Note that we do NOT increment the input pointer. This is because the closing linebreak could be the opening linebreak of another heading.
|
||||
// Infinite loops are avoided because the next iteration MUST hit the heading open case above, which unconditionally increments the input pointer.
|
||||
// Note that we do NOT increment the input pointer.
|
||||
// This is because the closing linebreak could be the opening linebreak of
|
||||
// another heading. Infinite loops are avoided because the next iteration MUST
|
||||
// hit the heading open case above, which unconditionally increments the
|
||||
// input pointer.
|
||||
}
|
||||
else if (found == Found__open) {
|
||||
// count opening brace characters
|
||||
int count = Bry_find_.Find_fwd_while(src, i, src_len, cur_char) - i;
|
||||
int count = Bry_find_.Find_fwd_while(src, i, src_len, cur_char) - i; // PORTED: $count = strspn( $text, $curChar, $i );
|
||||
|
||||
// we need to add to stack only if opening brace count is enough for one of the rules
|
||||
if (count >= rule.min) {
|
||||
// Add it to the stack
|
||||
Xomw_prepro_piece piece = new Xomw_prepro_piece(cur_char, rule.end, count, -1, i > 0 && src[i - 1] == Byte_ascii.Nl);
|
||||
|
||||
stack.Push(piece);
|
||||
accum = stack.Get_accum();
|
||||
Xomw_prepro_flags flags = stack.Get_flags();
|
||||
@ -560,15 +589,15 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls
|
||||
else if (found == Found__close) {
|
||||
Xomw_prepro_piece piece = stack.top;
|
||||
// lets check if there are enough characters for closing brace
|
||||
int count = Bry_find_.Find_fwd_while(src, i, src_len, cur_char) - i;
|
||||
int max_count = piece.count;
|
||||
if (count > max_count) count = max_count;
|
||||
int count = Bry_find_.Find_fwd_while(src, i, i + max_count, cur_char) - i; // $count = strspn( $text, $curChar, $i, $maxCount );
|
||||
|
||||
// check for maximum matching characters (if there are 5 closing characters, we will probably need only 3 - depending on the rules)
|
||||
rule = Get_rule(piece.open);
|
||||
int matching_count = -1;
|
||||
if (count > rule.max) {
|
||||
// The specified maximum exists in the callback array, unless the caller has made an error
|
||||
// The specified maximum exists in the callback array, unless the caller
|
||||
// has made an error
|
||||
matching_count = rule.max;
|
||||
}
|
||||
else {
|
||||
@ -596,12 +625,14 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls
|
||||
element = tmp_bfr.Add(Bry_.Repeat_bry(rule.end, matching_count)).To_bry_and_clear();
|
||||
}
|
||||
else {
|
||||
// Create XML element; Note: $parts is already XML, does not need to be encoded further
|
||||
// Create XML element
|
||||
// Note: $parts is already XML, does not need to be encoded further
|
||||
List_adp parts = piece.parts;
|
||||
byte[] title = ((Xomw_prepro_part)parts.Get_at(0)).bfr.To_bry_and_clear();
|
||||
parts.Del_at(0);
|
||||
|
||||
// The invocation is at the start of the line if lineStart is set in the stack, and all opening brackets are used up.
|
||||
// The invocation is at the start of the line if lineStart is set in
|
||||
// the stack, and all opening brackets are used up.
|
||||
byte[] attr = null;
|
||||
if (max_count == matching_count && !piece.line_start) {
|
||||
attr = Bry_.new_a7(" lineStart=\"1\"");
|
||||
@ -625,7 +656,7 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls
|
||||
tmp_bfr.Add_str_a7("<part><name>").Add(arg_key).Add_str_a7("</name>=<value>").Add(arg_val).Add_str_a7("</value></part>");
|
||||
}
|
||||
else {
|
||||
tmp_bfr.Add_str_a7("<part><name index=\"").Add_int_variable(arg_idx).Add_str_a7("\" /><value>{").Add(part.bfr.To_bry()).Add_str_a7("}</value></part>");
|
||||
tmp_bfr.Add_str_a7("<part><name index=\"").Add_int_variable(arg_idx).Add_str_a7("\" /><value>").Add(part.bfr.To_bry()).Add_str_a7("</value></part>");
|
||||
arg_idx++;
|
||||
}
|
||||
}
|
||||
@ -641,7 +672,7 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls
|
||||
|
||||
// Re-add the old stack element if it still has unmatched opening characters remaining
|
||||
if (matching_count < piece.count) {
|
||||
piece.parts.Clear(); // piece.parts = [ new PPDPart ];
|
||||
piece.parts.Clear(); // PORTED: piece.parts = [ new PPDPart ];
|
||||
piece.count -= matching_count;
|
||||
|
||||
// do we still qualify for any callback with remaining count?
|
||||
@ -688,6 +719,12 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls
|
||||
return root_accum.To_bry_and_clear();
|
||||
}
|
||||
private byte[] htmlspecialchars(byte[] bry) {
|
||||
// http://php.net/manual/en/function.htmlspecialchars.php
|
||||
//& (ampersand) &
|
||||
//" (double quote) ", unless ENT_NOQUOTES is set
|
||||
//' (single quote) ' (for ENT_HTML401) or ' (for ENT_XML1, ENT_XHTML or ENT_HTML5), but only when ENT_QUOTES is set
|
||||
//< (less than) <
|
||||
//> (greater than) >
|
||||
return bry;
|
||||
}
|
||||
private Xomw_prepro_rule Get_rule(byte[] bry) {
|
||||
@ -704,9 +741,10 @@ public class Xomw_prepro_wkr { // TS.UNSAFE:caching for repeated calls
|
||||
private static final byte[]
|
||||
Bry__only_include_bgn = Bry_.new_a7("<onlyinclude>")
|
||||
, Bry__only_include_end = Bry_.new_a7("</onlyinclude>")
|
||||
, Bry__comment_bgn = Bry_.new_a7("<!--")
|
||||
, Bry__comment_end = Bry_.new_a7("-->")
|
||||
, Bry__escaped_lt = Bry_.new_a7("<")
|
||||
, Bry__comment_bgn = Bry_.new_a7("<!--")
|
||||
, Bry__comment_end = Bry_.new_a7("-->")
|
||||
, Bry__escaped_lt = Bry_.new_a7("<")
|
||||
, Bry__end_lhs = Bry_.new_a7("</")
|
||||
;
|
||||
private static final int Len__only_include_end = Bry__only_include_end.length;
|
||||
private static final int
|
||||
|
@ -34,6 +34,25 @@ public class Xomw_prepro_wkr__tst {
|
||||
@Test public void Comment() {
|
||||
fxt.Test__parse("a<!--b-->c", "<root>a<comment><!--b--></comment>c</root>");
|
||||
}
|
||||
@Test public void Ext__pre() {
|
||||
fxt.Test__parse("a<pre id=\"1\">b</pre>c", "<root>a<ext><name>pre</name><attr> id=\"1\"</attr><inner>b</inner><close></pre></close></ext>c</root>");
|
||||
}
|
||||
/*
|
||||
TODO:
|
||||
* htmlspecialchars
|
||||
|
||||
* for_inclusion; <onlyinclude> in String
|
||||
* heading.general
|
||||
* heading.EOS: "==a" (no closing ==)
|
||||
* ignored tags
|
||||
* FIX:
|
||||
if ( ws_len > 0
|
||||
&& Bry_find_.Find_fwd_while_space_or_tab(accum_bry, -ws_len, src_len) == ws_len
|
||||
) {
|
||||
accum.Clear().Add(Bry_.Mid(accum_bry, 0, -ws_len));
|
||||
}
|
||||
|
||||
*/
|
||||
}
|
||||
class Xomw_prepro_wkr__fxt {
|
||||
private final Xomw_prepro_wkr wkr = new Xomw_prepro_wkr();
|
||||
|
Loading…
Reference in New Issue
Block a user