1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2024-10-27 20:34:16 +00:00

Xomw: Convert XomwBlockLevelPass

This commit is contained in:
gnosygnu 2017-02-25 09:31:07 -05:00
parent 0f92bb55db
commit 4781529d12
5 changed files with 675 additions and 592 deletions

View File

@ -0,0 +1,664 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import gplx.core.btries.*;
import gplx.langs.htmls.*;
/**
* This is the part of the wikitext parser which handles automatic paragraphs
* and conversion of start-of-line prefixes to HTML lists.
*/
public class XomwBlockLevelPass {
// true while a <dt> opened by openList / nextItem has not yet been closed
private boolean DTopen = false;
// true while the current position is inside a <pre> element
private boolean inPre = false;
// one of LAST_SECTION_*: which paragraph-level element (none, <p>, <pre>) is currently open
private int lastSection = LAST_SECTION_NONE;
// whether the text handed to the pass begins at the start of a line; set by doBlockLevels and read by execute
private boolean linestart;
// private $text; // XO: MW keeps the text as a member; XO reads it from the XomwParserBfr passed to execute
// scratch buffer; used by closeParagraph to build the close tag
private final Bry_bfr tmp = Bry_bfr_.New();
// match-result holder passed to the trie matchers in execute
private final Btrie_rv trv = new Btrie_rv();
// "out" values of findColonNoLinks: the text before and after the colon that was found
private byte[] find_colon_no_links__before, find_colon_no_links__after;
// State constants for the definition list colon extraction (see findColonNoLinks)
private static final int
COLON_STATE_TEXT = 0 // plain text, outside of any tag or comment
, COLON_STATE_TAG = 1 // inside a start tag, after its first character
, COLON_STATE_TAGSTART = 2 // just after '<'; next char decides between start tag, close tag and comment
, COLON_STATE_CLOSETAG = 3 // inside a close tag, after "</"
, COLON_STATE_TAGSLASH = 4 // saw '/' inside a start tag; may be a self-closing "/>"
, COLON_STATE_COMMENT = 5 // inside a comment, after "<!"
, COLON_STATE_COMMENTDASH = 6 // inside a comment, just saw one '-'
, COLON_STATE_COMMENTDASHDASH = 7 // inside a comment, just saw two consecutive '-'
;
/**
* Make lists from lines starting with ':', '*', '#', etc.
*
* XO: unlike MW, the text is not passed in and the HTML is not returned;
* both travel through pbfr (see execute).
*
* @param pctx parser context; forwarded to execute
* @param pbfr buffer pair holding the text to process and receiving the rendered HTML
* @param linestart Whether or not this is at the start of a line.
*/
// MW original, kept for reference:
// public static function doBlockLevels($text, $linestart) {
// $pass = new self($text, $linestart);
// return $pass->execute();
// }
public void doBlockLevels(XomwParserCtx pctx, XomwParserBfr pbfr, boolean linestart) {
this.linestart = linestart;
execute(pctx, pbfr, linestart);
}
// /**
// * Private constructor
// */
// private function __construct($text, $linestart) {
// $this->text = $text;
// $this->linestart = $linestart;
// }
/**
 * Close whichever paragraph-level element is currently open.
 *
 * When a p or pre is open, its close tag followed by a newline is returned;
 * otherwise the empty byte array is returned. In both cases the paragraph
 * state (lastSection, inPre) is reset.
 *
 * @return the close tag plus "\n", or Bry_.Empty if nothing was open
 */
private byte[] closeParagraph() {
  byte[] closeTag;
  if (lastSection == LAST_SECTION_NONE) {
    closeTag = Bry_.Empty;
  }
  else {
    // $result = '</' . $this->lastSection . ">\n";
    tmp.Add(lastSection == LAST_SECTION_PARA ? Gfh_tag_.P_rhs : Gfh_tag_.Pre_rhs);
    tmp.Add_byte_nl();
    closeTag = tmp.To_bry_and_clear();
  }
  lastSection = LAST_SECTION_NONE;
  inPre = false;
  return closeTag;
}
/**
 * Return the length of the longest common prefix of the two arguments,
 * i.e. the number of leading bytes that are identical in both.
 *
 * @param st1 first byte sequence
 * @param st2 second byte sequence
 *
 * @return number of leading bytes shared by st1 and st2
 */
private int getCommon(byte[] st1, byte[] st2) {
  int limit = Math.min(st1.length, st2.length);
  int common = 0;
  while (common < limit && st1[common] == st2[common]) {
    common++;
  }
  return common;
}
/**
 * Open the list and its first item for the given prefix character.
 *
 * Any open paragraph is closed first and its close tag is prepended to the
 * result. Opening a ';' item also marks a dt as open.
 *
 * @param c prefix character: '*', '#', ':' or ';'
 *
 * @return the markup to emit; an "ERR 1" HTML comment for any other character
 */
private byte[] openList(byte c) {
  byte[] closedPara = this.closeParagraph();
  byte[] openTags;
  if (c == Byte_ascii.Semic) {
    this.DTopen = true;
    openTags = Bry_.new_a7("<dl><dt>");
  }
  else if (c == Byte_ascii.Colon) {
    openTags = Bry_.new_a7("<dl><dd>");
  }
  else if (c == Byte_ascii.Hash) {
    openTags = Bry_.new_a7("<ol><li>");
  }
  else if (c == Byte_ascii.Star) {
    openTags = Bry_.new_a7("<ul><li>");
  }
  else {
    // unknown prefix: the close-paragraph text is dropped, exactly as in MW
    return Bry_.new_a7("<!-- ERR 1 -->");
  }
  return Bry_.Add(closedPara, openTags);
}
/**
 * Close the current list item and open the next one.
 *
 * For definition lists the close tag depends on whether a dt is currently
 * open, and the dt-open flag is updated to reflect the item being opened.
 *
 * @param c prefix character: '*', '#', ':' or ';'
 *
 * @return the markup to emit; an "ERR 2" HTML comment for any other character
 */
private byte[] nextItem(byte c) {
  boolean opensTerm = c == Byte_ascii.Semic;
  if (opensTerm || c == Byte_ascii.Colon) {
    String closeTag = this.DTopen ? "</dt>\n" : "</dd>\n";
    String openTag = opensTerm ? "<dt>" : "<dd>";
    this.DTopen = opensTerm;
    return Bry_.Add(Bry_.new_a7(closeTag), Bry_.new_a7(openTag));
  }
  if (c == Byte_ascii.Star || c == Byte_ascii.Hash) {
    return Bry_.new_a7("</li>\n<li>");
  }
  return Bry_.new_a7("<!-- ERR 2 -->");
}
/**
 * Close the current list item and its list for the given prefix character.
 *
 * For ':' the item tag closed is dt when a dt is open (the flag is then
 * cleared) and dd otherwise.
 *
 * @param c prefix character: '*', '#' or ':'
 *
 * @return the markup to emit; an "ERR 3" HTML comment for any other character
 */
private byte[] closeList(byte c) {
  if (c == Byte_ascii.Colon) {
    boolean termWasOpen = this.DTopen;
    this.DTopen = false;
    return Bry_.new_a7(termWasOpen ? "</dt></dl>" : "</dd></dl>");
  }
  if (c == Byte_ascii.Hash) {
    return Bry_.new_a7("</li></ol>");
  }
  if (c == Byte_ascii.Star) {
    return Bry_.new_a7("</li></ul>");
  }
  return Bry_.new_a7("<!-- ERR 3 -->");
}
/**
 * Execute the pass.
 *
 * Reads the text from {@code pbfr.Src()}, writes the generated HTML to
 * {@code pbfr.Trg()} (which is cleared first) and calls {@code pbfr.Switch()}.
 * The text is walked line by line: leading runs of '*', '#', ':' and ';' are
 * turned into (nested) lists, and lines without a prefix are handled in
 * paragraph mode (p / pre).
 *
 * @param pctx parser context; not read by this method
 * @param pbfr buffer pair holding the source text and receiving the output
 * @param linestart whether the text begins at the start of a line; when false
 *   the first line is copied to the output as is and processing starts with
 *   the second line
 */
public void execute(XomwParserCtx pctx, XomwParserBfr pbfr, boolean linestart) {
  // XO: honor the argument; otherwise a direct call would depend on whatever the field held before
  this.linestart = linestart;

  // XO.PBFR
  Bry_bfr src_bfr = pbfr.Src();
  byte[] src = src_bfr.Bfr();
  int src_bgn = 0;
  int src_end = src_bfr.Len();
  Bry_bfr bfr = pbfr.Trg();
  pbfr.Switch();

  // XO.STATIC
  if (block_chars_ary == null) {
    synchronized (Type_adp_.ClassOf_obj(this)) {
      if (block_chars_ary == null) { // re-check: another thread may have finished the init while this one waited
        openMatchTrie = Btrie_slim_mgr.ci_a7().Add_many_str
        ( "<table", "<h1", "<h2", "<h3", "<h4", "<h5", "<h6", "<pre", "<tr"
        , "<p", "<ul", "<ol", "<dl", "<li", "</tr", "</td", "</th");
        closeMatchTrie = Btrie_slim_mgr.ci_a7().Add_many_str
        ( "</table", "</h1", "</h2", "</h3", "</h4", "</h5", "</h6"
        , "<td", "<th", "<blockquote", "</blockquote", "<div", "</div", "<hr", "</pre", "</p", "</mw:"
        , XomwParser.MARKER_PREFIX_STR + "-pre"
        , "</li", "</ul", "</ol", "</dl", "<center", "</center");
        blockquoteTrie = Btrie_slim_mgr.ci_a7().Add_many_str("<blockquote", "</blockquote");
        pre_trie = Btrie_slim_mgr.ci_a7().Add_str_int("<pre", PRE_BGN).Add_str_int("</pre", PRE_END);
        // block_chars_ary is the guard tested above, so it is assigned last: a thread that
        // sees it non-null must not find the tries still unset.
        // NOTE(review): the static fields are not volatile, so this remains best-effort; confirm whether concurrent first use can happen
        block_chars_ary = Block_chars_ary__new();
      }
    }
  }

  // clear state
  this.inPre = false;
  this.lastSection = LAST_SECTION_NONE;
  byte[] prefix2 = null;
  bfr.Clear();

  // Parsing through the text line by line. The main thing
  // happening here is handling of block-level elements p, pre,
  // and making lists from lines starting with * # : etc.
  byte[] lastPrefix = Bry_.Empty;
  this.DTopen = false;
  boolean inBlockElem = false;
  int prefixLen = 0;
  byte pendingPTag = PARA_STACK_NONE;
  boolean inBlockquote = false;

  // PORTED.SPLIT: $textLines = StringUtils::explode("\n", $text);
  int lineBgn = src_bgn;
  while (lineBgn < src_end) {
    int lineEnd = Bry_find_.Find_fwd(src, Byte_ascii.Nl, lineBgn);
    if (lineEnd == Bry_find_.Not_found)
      lineEnd = src_end;

    // Fix up linestart
    if (!this.linestart) {
      bfr.Add_mid(src, lineBgn, lineEnd);
      this.linestart = true;
      // XO: MW's "continue" sits in a foreach and so moves on to the next line;
      // here the cursor must be advanced by hand or the same line is processed a second time
      lineBgn = lineEnd + 1;
      continue;
    }

    // * = ul
    // # = ol
    // ; = dt
    // : = dd
    int lastPrefixLen = lastPrefix.length;

    // PORTED.BGN: preCloseMatch = preg_match('/<\\/pre/i', $oLine); preOpenMatch = preg_match('/<pre/i', $oLine);
    boolean preCloseMatch = false;
    boolean preOpenMatch = false;
    int preCur = lineBgn;
    while (preCur < lineEnd) {
      Object o = pre_trie.Match_at(trv, src, preCur, lineEnd);
      if (o == null) {
        preCur++;
      }
      else {
        int pre_tid = Int_.cast(o);
        if (pre_tid == PRE_BGN)
          preOpenMatch = true;
        else if (pre_tid == PRE_END)
          preCloseMatch = true;
        preCur = trv.Pos();
      }
    }
    // PORTED.END

    byte[] prefix = null, t = null;
    // If not in a <pre> element, scan for and figure out what prefixes are there.
    if (!this.inPre) {
      // Multiple prefixes may abut each other for nested lists.
      prefixLen = XophpString.strspn_fwd__ary(src, block_chars_ary, lineBgn, lineEnd, lineEnd); // strspn($oLine, '*#:;');
      prefix = XophpString.substr(src, lineBgn, prefixLen);

      // eh?
      // ; and : are both from definition-lists, so they're equivalent
      // for the purposes of determining whether or not we need to open/close
      // elements.
      // substr($inputLine, $prefixLength);
      prefix2 = Bry_.Replace(prefix, Byte_ascii.Semic, Byte_ascii.Colon);
      t = Bry_.Mid(src, lineBgn + prefixLen, lineEnd);
      this.inPre = preOpenMatch;
    }
    else {
      // Don't interpret any other prefixes in preformatted text
      prefixLen = 0;
      prefix = prefix2 = Bry_.Empty;
      t = Bry_.Mid(src, lineBgn, lineEnd);
    }

    // List generation
    byte[] term = null, t2 = null;
    int commonPrefixLen = -1;
    if (prefixLen > 0 && Bry_.Eq(lastPrefix, prefix2)) {
      // Same as the last item, so no need to deal with nesting or opening stuff
      bfr.Add(this.nextItem(XophpString.substr_byte(prefix, -1)));
      pendingPTag = PARA_STACK_NONE;

      if (prefix[prefixLen - 1] == Byte_ascii.Semic) {
        // The one nasty exception: definition lists work like this:
        // ; title : definition text
        // So we check for : in the remainder text to split up the
        // title and definition, without b0rking links.
        term = t2 = Bry_.Empty;
        if (this.findColonNoLinks(t, term, t2) != Bry_find_.Not_found) {
          term = find_colon_no_links__before;
          t2 = find_colon_no_links__after;
          t = t2;
          bfr.Add(term).Add(nextItem(Byte_ascii.Colon));
        }
      }
    }
    else if (prefixLen > 0 || lastPrefixLen > 0) {
      // We need to open or close prefixes, or both.

      // Either open or close a level...
      commonPrefixLen = this.getCommon(prefix, lastPrefix);
      pendingPTag = PARA_STACK_NONE;

      // Close all the prefixes which aren't shared.
      while (commonPrefixLen < lastPrefixLen) {
        bfr.Add(this.closeList(lastPrefix[lastPrefixLen - 1]));
        --lastPrefixLen;
      }

      // Continue the current prefix if appropriate.
      if (prefixLen <= commonPrefixLen && commonPrefixLen > 0) {
        bfr.Add(this.nextItem(prefix[commonPrefixLen - 1]));
      }

      // Open prefixes where appropriate.
      if (Bry_.Len_gt_0(lastPrefix) && prefixLen > commonPrefixLen) {
        bfr.Add_byte_nl();
      }
      while (prefixLen > commonPrefixLen) {
        byte c = XophpString.substr_byte(prefix, commonPrefixLen, 1);
        bfr.Add(this.openList(c));

        if (c == Byte_ascii.Semic) {
          // @todo FIXME: This is dupe of code above
          if (findColonNoLinks(t, term, t2) != Bry_find_.Not_found) {
            term = find_colon_no_links__before;
            t2 = find_colon_no_links__after;
            t = t2;
            bfr.Add(term).Add(nextItem(Byte_ascii.Colon));
          }
        }
        ++commonPrefixLen;
      }
      if (prefixLen == 0 && Bry_.Len_gt_0(lastPrefix)) {
        bfr.Add_byte_nl();
      }
      lastPrefix = prefix2;
    }

    // If we have no prefixes, go to paragraph mode.
    if (0 == prefixLen) {
      // No prefix (not in list)--go to paragraph mode
      // @todo consider using a stack for nestable elements like span, table and div
      int tLen = t.length;

      // XO.MW.PORTED.BGN:
      boolean openMatch = XophpPreg.match(openMatchTrie, trv, t, 0, tLen) != null;
      boolean closeMatch = XophpPreg.match(closeMatchTrie, trv, t, 0, tLen) != null;
      // XO.MW.PORTED.END

      if (openMatch || closeMatch) {
        pendingPTag = PARA_STACK_NONE;
        // @todo bug 5718: paragraph closed
        bfr.Add(this.closeParagraph());
        if (preOpenMatch && !preCloseMatch) {
          this.inPre = true;
        }
        int bqOffset = 0;
        // PORTED:preg_match('/<(\\/?)blockquote[\s>]/i', t, $bqMatch, PREG_OFFSET_CAPTURE, $bqOffset)
        while (true) {
          Object o = XophpPreg.match(blockquoteTrie, trv, t, bqOffset, tLen);
          if (o == null) { // no more blockquotes found; exit
            break;
          }
          else {
            byte[] bq_bry = (byte[])o;
            inBlockquote = bq_bry[1] != Byte_ascii.Slash; // is this a close tag?
            bqOffset = trv.Pos();
          }
        }
        // PORTED:END
        inBlockElem = !closeMatch;
      }
      else if (!inBlockElem && !this.inPre) {
        // XO: test emptiness by length; comparing the array reference against Bry_.Empty
        // only works if Trim happens to return that exact instance
        boolean tIsBlank = Bry_.Trim(t).length == 0;
        if (XophpString.substr_byte(t, 0) == Byte_ascii.Space
          && (this.lastSection == LAST_SECTION_PRE || !tIsBlank)
          && !inBlockquote
        ) {
          // pre
          if (this.lastSection != LAST_SECTION_PRE) {
            pendingPTag = PARA_STACK_NONE;
            bfr.Add(closeParagraph()).Add(Gfh_tag_.Pre_lhs);
            this.lastSection = LAST_SECTION_PRE;
          }
          t = Bry_.Mid(t, 1);
        }
        else {
          // paragraph
          if (tIsBlank) {
            if (pendingPTag != PARA_STACK_NONE) {
              ParaStackAdd(bfr, pendingPTag);
              bfr.Add_str_a7("<br />");
              pendingPTag = PARA_STACK_NONE;
              this.lastSection = LAST_SECTION_PARA;
            }
            else {
              if (this.lastSection != LAST_SECTION_PARA) {
                bfr.Add(this.closeParagraph());
                this.lastSection = LAST_SECTION_NONE;
                pendingPTag = PARA_STACK_BGN;
              }
              else {
                pendingPTag = PARA_STACK_MID;
              }
            }
          }
          else {
            if (pendingPTag != PARA_STACK_NONE) {
              ParaStackAdd(bfr, pendingPTag);
              pendingPTag = PARA_STACK_NONE;
              this.lastSection = LAST_SECTION_PARA;
            }
            else if (lastSection != LAST_SECTION_PARA) {
              bfr.Add(this.closeParagraph()).Add(Gfh_tag_.P_lhs);
              this.lastSection = LAST_SECTION_PARA;
            }
          }
        }
      }
    }

    // somewhere above we forget to get out of pre block (bug 785)
    if (preCloseMatch && this.inPre) {
      this.inPre = false;
    }
    if (pendingPTag == PARA_STACK_NONE) {
      bfr.Add(t);
      if (prefixLen == 0) {
        bfr.Add_byte_nl();
      }
    }
    lineBgn = lineEnd + 1;
  }

  // close whatever lists are still open after the last line
  while (prefixLen > 0) {
    bfr.Add(this.closeList(prefix2[prefixLen - 1]));
    --prefixLen;
    if (prefixLen > 0) {
      bfr.Add_byte_nl();
    }
  }
  // close a still-open p / pre
  if (this.lastSection != LAST_SECTION_NONE) {
    bfr.Add(this.lastSection == LAST_SECTION_PARA ? Gfh_tag_.P_rhs : Gfh_tag_.Pre_rhs);
    this.lastSection = LAST_SECTION_NONE;
  }
}
/**
 * Split up a String on ':', ignoring any occurrences inside tags
 * to prevent illegal overlapping.
 *
 * XOMW: MW passes before / after by reference; XO cannot, so the two halves
 * are stored in find_colon_no_links__before / find_colon_no_links__after and
 * callers must read them from there. The before / after arguments are not
 * written to.
 *
 * @param str The String to split
 * @param before unused; kept to mirror the MW signature
 * @param after unused; kept to mirror the MW signature
 * @return The position of the ':', or Bry_find_.Not_found if none found
 */
private int findColonNoLinks(byte[] str, byte[] before, byte[] after) {
  int len = str.length;
  int colonPos = XophpString.strpos(str, Byte_ascii.Colon, 0, len);
  if (colonPos == Bry_find_.Not_found) {
    // Nothing to find!
    return Bry_find_.Not_found;
  }

  int ltPos = XophpString.strpos(str, Byte_ascii.Angle_bgn, 0, len);
  if (ltPos == Bry_find_.Not_found || ltPos > colonPos) {
    // Easy; no tag nesting to worry about
    find_colon_no_links__before = XophpString.substr(str, 0, colonPos);
    find_colon_no_links__after = XophpString.substr(str, colonPos + 1);
    return colonPos;
  }

  // Ugly state machine to walk through avoiding tags.
  int state = COLON_STATE_TEXT;
  int level = 0; // nesting depth: +1 for each start tag, -1 for each close tag
  scan:
  for (int i = 0; i < len; i++) {
    byte c = str[i];
    switch (state) {
      case COLON_STATE_TEXT:
        switch (c) {
          case Byte_ascii.Angle_bgn:
            // Could be either a <start> tag or an </end> tag
            state = COLON_STATE_TAGSTART;
            break;
          case Byte_ascii.Colon:
            if (level == 0) {
              // We found it!
              find_colon_no_links__before = XophpString.substr(str, 0, i);
              find_colon_no_links__after = XophpString.substr(str, i + 1);
              return i;
            }
            // Embedded in a tag; don't break it.
            break;
          default:
            // Skip ahead looking for something interesting
            colonPos = XophpString.strpos(str, Byte_ascii.Colon, i, len);
            if (colonPos == Bry_find_.Not_found) {
              // Nothing else interesting
              return Bry_find_.Not_found;
            }
            ltPos = XophpString.strpos(str, Byte_ascii.Angle_bgn, i, len);
            if (level == 0) {
              if (ltPos == Bry_find_.Not_found || colonPos < ltPos) {
                // We found it!
                find_colon_no_links__before = XophpString.substr(str, 0, colonPos);
                find_colon_no_links__after = XophpString.substr(str, colonPos + 1);
                // XO: return the colon position as documented (was: i, the scan position)
                return colonPos;
              }
            }
            if (ltPos == Bry_find_.Not_found) {
              // Nothing else interesting to find; abort!
              // We're nested, but there's no close tags left. Abort!
              break scan; // break 2
            }
            // Skip ahead to next tag start; the loop increment then steps past the '<'
            i = ltPos;
            state = COLON_STATE_TAGSTART;
            break;
        }
        break;
      case COLON_STATE_TAG:
        // In a <tag>
        switch (c) {
          case Byte_ascii.Angle_end:
            level++;
            state = COLON_STATE_TEXT;
            break;
          case Byte_ascii.Slash:
            // Slash may be followed by >?
            state = COLON_STATE_TAGSLASH;
            break;
          default:
            // ignore
            break;
        }
        break;
      case COLON_STATE_TAGSTART:
        switch (c) {
          case Byte_ascii.Slash:
            state = COLON_STATE_CLOSETAG;
            break;
          case Byte_ascii.Bang:
            state = COLON_STATE_COMMENT;
            break;
          case Byte_ascii.Angle_end:
            // Illegal early close? This shouldn't happen D:
            state = COLON_STATE_TEXT;
            break;
          default:
            state = COLON_STATE_TAG;
            break;
        }
        break;
      case COLON_STATE_CLOSETAG:
        // In a </tag>
        if (c == Byte_ascii.Angle_end) {
          level--;
          if (level < 0) {
            Gfo_usr_dlg_.Instance.Warn_many("", "", "Invalid input; too many close tags");
            return Bry_find_.Not_found;
          }
          state = COLON_STATE_TEXT;
        }
        break;
      case COLON_STATE_TAGSLASH:
        if (c == Byte_ascii.Angle_end) {
          // Yes, a self-closed tag <blah/>
          state = COLON_STATE_TEXT;
        }
        else {
          // Probably we're jumping the gun, and this is an attribute
          state = COLON_STATE_TAG;
        }
        break;
      case COLON_STATE_COMMENT:
        if (c == Byte_ascii.Dash) {
          state = COLON_STATE_COMMENTDASH;
        }
        break;
      case COLON_STATE_COMMENTDASH:
        if (c == Byte_ascii.Dash) {
          state = COLON_STATE_COMMENTDASHDASH;
        }
        else {
          state = COLON_STATE_COMMENT;
        }
        break;
      case COLON_STATE_COMMENTDASHDASH:
        // a comment ends with "-->"; testing for '<' here (as before) meant a comment never ended
        if (c == Byte_ascii.Angle_end) {
          state = COLON_STATE_TEXT;
        }
        else {
          state = COLON_STATE_COMMENT;
        }
        break;
      default:
        throw Err_.new_wo_type("State machine error");
    }
  }
  if (level > 0) {
    Gfo_usr_dlg_.Instance.Warn_many("", "", "Invalid input; not enough close tags (level ~{0}, state ~{1})", level, state);
    return Bry_find_.Not_found;
  }
  return Bry_find_.Not_found;
}
// XO: values of lastSection; MW stores the tag name itself ('' / 'p' / 'pre')
private static final byte
LAST_SECTION_NONE = 0 // ''
, LAST_SECTION_PARA = 1 // p
, LAST_SECTION_PRE = 2 // pre
;
// XO: values of the pending paragraph tag in execute; MW stores false or the tag text itself
private static final byte
PARA_STACK_NONE = 0 // false
, PARA_STACK_BGN = 1 // <p>
, PARA_STACK_MID = 2 // </p><p>
;
// values stored in pre_trie: PRE_BGN for "<pre", PRE_END for "</pre"
private static final int PRE_BGN = 0, PRE_END = 1;
// lazily built on the first call to execute
private static Btrie_slim_mgr pre_trie;
// lookup of the list-prefix chars; also serves as the "statics are initialized" guard in execute
private static boolean[] block_chars_ary;
// Builds a 256-entry lookup that is true only at the list-prefix chars '*', '#', ':' and ';'.
private static boolean[] Block_chars_ary__new() {
  boolean[] flags = new boolean[256];
  byte[] prefixChars = {Byte_ascii.Star, Byte_ascii.Hash, Byte_ascii.Colon, Byte_ascii.Semic};
  for (byte prefixChar : prefixChars) {
    flags[prefixChar] = true;
  }
  return flags;
}
// tag-prefix tries used in paragraph mode; lazily built on the first call to execute
private static Btrie_slim_mgr openMatchTrie, closeMatchTrie, blockquoteTrie;
// Appends the markup for a pending paragraph tag: "<p>" for PARA_STACK_BGN, "</p><p>" for PARA_STACK_MID.
// Any other id (including PARA_STACK_NONE) is an error.
private static void ParaStackAdd(Bry_bfr bfr, int id) {
  if (id == PARA_STACK_BGN) {
    bfr.Add_str_a7("<p>");
  }
  else if (id == PARA_STACK_MID) {
    bfr.Add_str_a7("</p><p>");
  }
  else {
    throw Err_.new_unhandled_default(id);
  }
}
}

View File

@ -16,8 +16,8 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import org.junit.*; import gplx.core.tests.*;
import gplx.xowa.mediawiki.includes.linkers.*;
public class Xomw_block_level_pass__tst {
private final Xomw_block_level_pass__fxt fxt = new Xomw_block_level_pass__fxt();
public class XomwBlockLevelPassTest {
private final XomwBlockLevelPassFxt fxt = new XomwBlockLevelPassFxt();
@Test public void Basic() {
fxt.Test__do_block_levels(String_.Concat_lines_nl_skip_last
( "a"
@ -27,8 +27,8 @@ public class Xomw_block_level_pass__tst {
));
}
}
class Xomw_block_level_pass__fxt {
private final Xomw_block_level_pass block_level_pass = new Xomw_block_level_pass();
class XomwBlockLevelPassFxt {
private final XomwBlockLevelPass block_level_pass = new XomwBlockLevelPass();
private final XomwParserCtx pctx = new XomwParserCtx();
private final XomwParserBfr pbfr = new XomwParserBfr();
private boolean apos = true;

View File

@ -271,7 +271,7 @@ public class XomwParser implements XomwParserIface {
private final Xomw_lnke_wkr lnkeWkr;
private final Xomw_magiclinks_wkr magiclinksWkr;
private final Xomw_nbsp_wkr nbspWkr = new Xomw_nbsp_wkr();
private final Xomw_block_level_pass blockWkr = new Xomw_block_level_pass();
private final XomwBlockLevelPass blockWkr = new XomwBlockLevelPass();
private final Xomw_doubleunder_data doubleunder_data = new Xomw_doubleunder_data();
private static Xomw_regex_space regex_space;
private static Xomw_regex_boundary regex_boundary;
@ -1692,7 +1692,7 @@ public class XomwParser implements XomwParserIface {
dirty = true;
byte[] protocol_bry = (byte[])protocol_obj;
if (called_by_bry) trg = Bry_bfr_.New();
trg.Add_bry_many(XomwStripState.Bry__marker__bgn, Bry__noparse, protocol_bry);
trg.Add_bry_many(XomwParser.MARKER_PREFIX, Bry__noparse, protocol_bry);
cur += protocol_bry.length;
prv = cur;
}

View File

@ -327,11 +327,11 @@ public class XomwStripState {
// public function killMarkers($text) {
// return preg_replace(this.regex, '', $text);
// }
public static final String Str__marker_bgn = "\u007f'\"`UNIQ-";
public static final byte[]
Bry__marker__bgn = Bry_.new_a7(Str__marker_bgn)
, Bry__marker__end = Bry_.new_a7("-QINU`\"'\u007f")
;
// public static final String Str__marker_bgn = "\u007f'\"`UNIQ-";
// public static final byte[]
// Bry__marker__bgn = Bry_.new_a7(Str__marker_bgn)
// , Bry__marker__end = Bry_.new_a7("-QINU`\"'\u007f")
// ;
public static final byte TYPE_GENERAL = 1, TYPE_NOWIKI = 2, TYPE_BOTH = 3;
}
class XomwStripItem {

View File

@ -1,581 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import gplx.core.btries.*;
import gplx.langs.htmls.*;
public class Xomw_block_level_pass {
// scratch buffer used by the helper methods to assemble short tag strings
private final Bry_bfr tmp = Bry_bfr_.New();
// match-result holder passed to the trie matchers in doBlockLevels
private final Btrie_rv trv = new Btrie_rv();
// in_pre: currently inside a <pre>; dt_open: a <dt> has been opened and not yet closed
private boolean in_pre, dt_open;
// one of Last_section__*: which paragraph-level element is currently open
private int last_section;
// "out" values of Find_colon_no_links: the text before and after the colon that was found
private byte[] find_colon_no_links__before, find_colon_no_links__after;
// Make lists from lines starting with ':', '*', '#', etc., and wrap unprefixed lines in p / pre.
// Reads the text from pbfr.Src(), writes the HTML to pbfr.Trg() (cleared first) and calls pbfr.Switch().
// line_start: whether the text begins at the start of a line; when false, the first line is copied
// to the output as is and processing starts with the second line.
public void doBlockLevels(XomwParserCtx pctx, XomwParserBfr pbfr, boolean line_start) {
  // XO.PBFR
  Bry_bfr src_bfr = pbfr.Src();
  byte[] src = src_bfr.Bfr();
  int src_bgn = 0;
  int src_end = src_bfr.Len();
  Bry_bfr bfr = pbfr.Trg();
  pbfr.Switch();

  // XO.STATIC
  if (block_chars_ary == null) {
    synchronized (Type_adp_.ClassOf_obj(this)) {
      if (block_chars_ary == null) { // re-check: another thread may have finished the init while this one waited
        open_match_trie = Btrie_slim_mgr.ci_a7().Add_many_str
        ("<table", "<h1", "<h2", "<h3", "<h4", "<h5", "<h6", "<pre", "<tr", "<p", "<ul", "<ol", "<dl", "<li", "</tr", "</td", "</th");
        close_match_trie = Btrie_slim_mgr.ci_a7().Add_many_str
        ( "</table", "</h1", "</h2", "</h3", "</h4", "</h5", "</h6", "<td", "<th", "<blockquote", "</blockquote", "<div", "</div", "<hr"
        , "</pre", "</p", "</mw:", XomwStripState.Str__marker_bgn + "-pre", "</li", "</ul", "</ol", "</dl", "<center", "</center");
        blockquote_trie = Btrie_slim_mgr.ci_a7().Add_many_str("<blockquote", "</blockquote");
        pre_trie = Btrie_slim_mgr.ci_a7().Add_str_int("<pre", Pre__bgn).Add_str_int("</pre", Pre__end);
        // block_chars_ary is the guard tested above, so it is assigned last: a thread that
        // sees it non-null must not find the tries still unset.
        // NOTE(review): whether the static fields are volatile is not visible here; confirm safe publication
        block_chars_ary = Block_chars_ary__new();
      }
    }
  }

  // Parsing through the text line by line. The main thing
  // happening here is handling of block-level elements p, pre,
  // and making lists from lines starting with * # : etc.
  byte[] last_prefix = Bry_.Empty;
  bfr.Clear();
  this.dt_open = false;
  boolean in_block_elem = false;
  int prefix_len = 0;
  byte para_stack = Para_stack__none;
  boolean in_blockquote = false;
  this.in_pre = false;
  this.last_section = Last_section__none;
  byte[] prefix2 = null;

  // PORTED.SPLIT: $textLines = StringUtils::explode("\n", $text);
  int line_bgn = src_bgn;
  while (line_bgn < src_end) {
    int line_end = Bry_find_.Find_fwd(src, Byte_ascii.Nl, line_bgn);
    if (line_end == Bry_find_.Not_found)
      line_end = src_end;

    // Fix up line_start
    if (!line_start) {
      bfr.Add_mid(src, line_bgn, line_end);
      line_start = true;
      // XO: MW's "continue" sits in a foreach and so moves on to the next line;
      // here the cursor must be advanced by hand or the same line is processed a second time
      line_bgn = line_end + 1;
      continue;
    }

    // * = ul
    // # = ol
    // ; = dt
    // : = dd
    int last_prefix_len = last_prefix.length;

    // PORTED: pre_close_match = preg_match('/<\\/pre/i', $oLine); pre_open_match = preg_match('/<pre/i', $oLine);
    boolean pre_close_match = false;
    boolean pre_open_match = false;
    int pre_cur = line_bgn;
    while (pre_cur < line_end) {
      Object o = pre_trie.Match_at(trv, src, pre_cur, line_end);
      if (o == null) {
        pre_cur++;
      }
      else {
        int pre_tid = Int_.cast(o);
        if (pre_tid == Pre__bgn)
          pre_open_match = true;
        else if (pre_tid == Pre__end)
          pre_close_match = true;
        pre_cur = trv.Pos();
      }
    }

    byte[] prefix = null, t = null;
    // If not in a <pre> element, scan for and figure out what prefixes are there.
    if (!in_pre) {
      // Multiple prefixes may abut each other for nested lists.
      prefix_len = XophpString.strspn_fwd__ary(src, block_chars_ary, line_bgn, line_end, line_end); // strspn($oLine, '*#:;');
      prefix = XophpString.substr(src, line_bgn, prefix_len);

      // eh?
      // ; and : are both from definition-lists, so they're equivalent
      // for the purposes of determining whether or not we need to open/close
      // elements.
      // substr( $inputLine, $prefixLength );
      prefix2 = Bry_.Replace(prefix, Byte_ascii.Semic, Byte_ascii.Colon);
      t = Bry_.Mid(src, line_bgn + prefix_len, line_end);
      in_pre = pre_open_match;
    }
    else {
      // Don't interpret any other prefixes in preformatted text
      prefix_len = 0;
      prefix = prefix2 = Bry_.Empty;
      t = Bry_.Mid(src, line_bgn, line_end);
    }

    // List generation
    byte[] term = null, t2 = null;
    int common_prefix_len = -1;
    if (prefix_len > 0 && Bry_.Eq(last_prefix, prefix2)) {
      // Same as the last item, so no need to deal with nesting or opening stuff
      bfr.Add(Next_item(XophpString.substr_byte(prefix, -1)));
      para_stack = Para_stack__none;

      if (prefix[prefix_len - 1] == Byte_ascii.Semic) {
        // The one nasty exception: definition lists work like this:
        // ; title : definition text
        // So we check for : in the remainder text to split up the
        // title and definition, without b0rking links.
        term = t2 = Bry_.Empty;
        if (Find_colon_no_links(t, term, t2) != Bry_find_.Not_found) {
          term = find_colon_no_links__before;
          t2 = find_colon_no_links__after;
          t = t2;
          bfr.Add(term).Add(Next_item(Byte_ascii.Colon));
        }
      }
    }
    else if (prefix_len > 0 || last_prefix_len > 0) {
      // We need to open or close prefixes, or both.

      // Either open or close a level...
      common_prefix_len = Get_common(prefix, last_prefix);
      para_stack = Para_stack__none;

      // Close all the prefixes which aren't shared.
      while (common_prefix_len < last_prefix_len) {
        bfr.Add(Close_list(last_prefix[last_prefix_len - 1]));
        last_prefix_len--;
      }

      // Continue the current prefix if appropriate.
      if (prefix_len <= common_prefix_len && common_prefix_len > 0) {
        bfr.Add(Next_item(prefix[common_prefix_len - 1]));
      }

      // Open prefixes where appropriate.
      if (Bry_.Len_gt_0(last_prefix) && prefix_len > common_prefix_len) {
        bfr.Add_byte_nl();
      }
      while (prefix_len > common_prefix_len) {
        byte c = XophpString.substr_byte(prefix, common_prefix_len, 1);
        bfr.Add(Open_list(c));

        if (c == Byte_ascii.Semic) {
          // @todo FIXME: This is dupe of code above
          if (Find_colon_no_links(t, term, t2) != Bry_find_.Not_found) {
            term = find_colon_no_links__before;
            t2 = find_colon_no_links__after;
            t = t2;
            bfr.Add(term).Add(Next_item(Byte_ascii.Colon));
          }
        }
        ++common_prefix_len;
      }
      if (prefix_len == 0 && Bry_.Len_gt_0(last_prefix)) {
        bfr.Add_byte_nl();
      }
      last_prefix = prefix2;
    }

    // If we have no prefixes, go to paragraph mode.
    if (0 == prefix_len) {
      // No prefix (not in list)--go to paragraph mode
      // XXX: use a stack for nestable elements like span, table and div
      int t_len = t.length;
      boolean open_match = XophpPreg.match(open_match_trie, trv, t, 0, t_len) != null;
      boolean close_match = XophpPreg.match(close_match_trie, trv, t, 0, t_len) != null;
      if (open_match || close_match) {
        para_stack = Para_stack__none;
        // @todo bug 5718: paragraph closed
        bfr.Add(Close_paragraph());
        if (pre_open_match && !pre_close_match) {
          in_pre = true;
        }
        int bq_offset = 0;
        // PORTED:preg_match('/<(\\/?)blockquote[\s>]/i', t, $bqMatch, PREG_OFFSET_CAPTURE, $bq_offset)
        while (true) {
          Object o = XophpPreg.match(blockquote_trie, trv, t, bq_offset, t_len);
          if (o == null) { // no more blockquotes found; exit
            break;
          }
          else {
            byte[] bq_bry = (byte[])o;
            in_blockquote = bq_bry[1] != Byte_ascii.Slash; // is this a close tag?
            bq_offset = trv.Pos();
          }
        }
        in_block_elem = !close_match;
      }
      else if (!in_block_elem && !in_pre) {
        // XO: test emptiness by length; comparing the array reference against Bry_.Empty
        // only works if Trim happens to return that exact instance
        boolean t_is_blank = Bry_.Trim(t).length == 0;
        if ( XophpString.substr_byte(t, 0) == Byte_ascii.Space
          && (last_section == Last_section__pre || !t_is_blank)
          && !in_blockquote
        ) {
          // pre
          if (last_section != Last_section__pre) {
            para_stack = Para_stack__none;
            bfr.Add(Close_paragraph()).Add(Gfh_tag_.Pre_lhs);
            last_section = Last_section__pre;
          }
          t = Bry_.Mid(t, 1);
        }
        else {
          // paragraph
          if (t_is_blank) {
            if (para_stack != Para_stack__none) {
              Para_stack_bfr(bfr, para_stack);
              bfr.Add_str_a7("<br />");
              para_stack = Para_stack__none;
              last_section = Last_section__para;
            }
            else {
              if (last_section != Last_section__para) {
                bfr.Add(Close_paragraph());
                last_section = Last_section__none;
                para_stack = Para_stack__bgn;
              }
              else {
                para_stack = Para_stack__mid;
              }
            }
          }
          else {
            if (para_stack != Para_stack__none) {
              Para_stack_bfr(bfr, para_stack);
              para_stack = Para_stack__none;
              last_section = Last_section__para;
            }
            else if (last_section != Last_section__para) {
              bfr.Add(Close_paragraph()).Add(Gfh_tag_.P_lhs);
              this.last_section = Last_section__para;
            }
          }
        }
      }
    }

    // somewhere above we forget to get out of pre block (bug 785)
    if (pre_close_match && in_pre) {
      in_pre = false;
    }
    if (para_stack == Para_stack__none) {
      bfr.Add(t);
      if (prefix_len == 0) {
        bfr.Add_byte_nl();
      }
    }
    line_bgn = line_end + 1;
  }

  // close whatever lists are still open after the last line
  while (prefix_len > 0) {
    bfr.Add(Close_list(prefix2[prefix_len - 1]));
    prefix_len--;
    if (prefix_len > 0) {
      bfr.Add_byte_nl();
    }
  }
  // close a still-open p / pre
  if (last_section != Last_section__none) {
    bfr.Add(last_section == Last_section__para ? Gfh_tag_.P_rhs : Gfh_tag_.Pre_rhs);
    last_section = Last_section__none;
  }
}
// Closes whichever of p / pre is currently open: returns its close tag followed by a newline,
// or the empty byte array when nothing is open. The paragraph state is reset either way.
public byte[] Close_paragraph() {
  byte[] close_tag;
  if (last_section == Last_section__none) {
    close_tag = Bry_.Empty;
  }
  else {
    tmp.Add(last_section == Last_section__para ? Gfh_tag_.P_rhs : Gfh_tag_.Pre_rhs);
    tmp.Add_byte_nl();
    close_tag = tmp.To_bry_and_clear();
  }
  last_section = Last_section__none;
  in_pre = false;
  return close_tag;
}
// Returns the length of the longest common prefix of both arguments,
// i.e. the number of leading bytes that are identical in st1 and st2.
private int Get_common(byte[] st1, byte[] st2) {
  int limit = Math.min(st1.length, st2.length);
  int common = 0;
  while (common < limit && st1[common] == st2[common]) {
    common++;
  }
  return common;
}
// Open the list item element identified by the prefix character.
// Any open paragraph is closed first and its close tag is prepended to the result.
// c is one of '*', '#', ':' or ';'; any other char yields an "ERR 1" HTML comment.
// Opening a ';' item also marks a dt as open.
private byte[] Open_list(byte c) {
  byte[] result = Close_paragraph();
  if (c == Byte_ascii.Star)
    result = tmp.Add(result).Add_str_a7("<ul><li>").To_bry_and_clear();
  else if (c == Byte_ascii.Hash)
    result = tmp.Add(result).Add_str_a7("<ol><li>").To_bry_and_clear();
  else if (c == Byte_ascii.Colon) // was a second test for Hash, which made this branch unreachable and sent ':' to ERR 1
    result = tmp.Add(result).Add_str_a7("<dl><dd>").To_bry_and_clear();
  else if (c == Byte_ascii.Semic) {
    result = tmp.Add(result).Add_str_a7("<dl><dt>").To_bry_and_clear();
    dt_open = true;
  }
  else
    result = tmp.Add_str_a7("<!-- ERR 1 -->").To_bry_and_clear();
  return result;
}
// Closes the current list item and opens the next one.
// For ':' / ';' the close tag depends on whether a dt is open, and dt_open is updated
// to reflect the item being opened. Any other char than '*', '#', ':' or ';' yields an "ERR 2" HTML comment.
private byte[] Next_item(byte c) {
  boolean opens_term = c == Byte_ascii.Semic;
  if (opens_term || c == Byte_ascii.Colon) {
    tmp.Add_str_a7(dt_open ? "</dt>\n" : "</dd>\n");
    dt_open = opens_term;
    return tmp.Add_str_a7(opens_term ? "<dt>" : "<dd>").To_bry_and_clear();
  }
  if (c == Byte_ascii.Star || c == Byte_ascii.Hash) {
    return tmp.Add_str_a7("</li>\n<li>").To_bry_and_clear();
  }
  return tmp.Add_str_a7("<!-- ERR 2 -->").To_bry_and_clear();
}
// Closes the current list item and its list for the given prefix character.
// For ':' the item tag closed is dt when a dt is open (the flag is then cleared) and dd otherwise.
// Any other char than '*', '#' or ':' yields an "ERR 3" HTML comment.
private byte[] Close_list(byte c) {
  if (c == Byte_ascii.Colon) {
    boolean term_was_open = dt_open;
    dt_open = false;
    return Bry_.new_a7(term_was_open ? "</dt></dl>" : "</dd></dl>");
  }
  if (c == Byte_ascii.Hash) {
    return Bry_.new_a7("</li></ol>");
  }
  if (c == Byte_ascii.Star) {
    return Bry_.new_a7("</li></ul>");
  }
  return Bry_.new_a7("<!-- ERR 3 -->");
}
// Split up a String on ':', ignoring any occurrences inside tags
// to prevent illegal overlapping.
//
// On success, the text before the colon is stored in find_colon_no_links__before
// and the text after it in find_colon_no_links__after, and a value other than
// Bry_find_.Not_found is returned. On failure (no colon, or no colon outside of
// tags / nested elements), Bry_find_.Not_found is returned and the two fields
// are left untouched.
//
// The "before" and "after" parameters are not read or written by this method;
// results are only published through the two fields above.
//
// FIX: tag and comment terminators are now matched against '>' (Byte_ascii.Angle_end).
// They were previously matched against '<', so tags never closed on '>' and the
// nesting level was miscounted.
private int Find_colon_no_links(byte[] str, byte[] before, byte[] after) {
	int len = str.length;
	int colon_pos = XophpString.strpos(str, Byte_ascii.Colon, 0, len);
	if (colon_pos == Bry_find_.Not_found) {
		// Nothing to find!
		return Bry_find_.Not_found;
	}

	int lt_pos = XophpString.strpos(str, Byte_ascii.Angle_bgn, 0, len);
	if (lt_pos == Bry_find_.Not_found || lt_pos > colon_pos) {
		// Easy; no tag nesting to worry about
		find_colon_no_links__before = XophpString.substr(str, 0, colon_pos);
		find_colon_no_links__after = XophpString.substr(str, colon_pos + 1);
		return colon_pos;
	}

	// Ugly state machine to walk through avoiding tags.
	int state = COLON_STATE_TEXT;
	int level = 0;	// element nesting depth; a colon only counts at depth 0
	for (int i = 0; i < len; i++) {
		byte c = str[i];
		switch (state) {
			case COLON_STATE_TEXT:
				switch (c) {
					case Byte_ascii.Angle_bgn:
						// Could be either a <start> tag or an </end> tag
						state = COLON_STATE_TAGSTART;
						break;
					case Byte_ascii.Colon:
						if (level == 0) {
							// We found it!
							find_colon_no_links__before = XophpString.substr(str, 0, i);
							find_colon_no_links__after = XophpString.substr(str, i + 1);
							return i;
						}
						// Embedded in a tag; don't break it.
						break;
					default:
						// Skip ahead looking for something interesting
						colon_pos = XophpString.strpos(str, Byte_ascii.Colon, i, len);
						if (colon_pos == Bry_find_.Not_found) {
							// Nothing else interesting
							return Bry_find_.Not_found;
						}
						lt_pos = XophpString.strpos(str, Byte_ascii.Angle_bgn, i, len);
						if (level == 0) {
							if (lt_pos == Bry_find_.Not_found || colon_pos < lt_pos) {
								// We found it!
								find_colon_no_links__before = XophpString.substr(str, 0, colon_pos);
								find_colon_no_links__after = XophpString.substr(str, colon_pos + 1);
								// NOTE(review): returns i (scan position), not colon_pos; left unchanged
								// on the assumption that callers only compare against Not_found -- confirm
								return i;
							}
						}
						if (lt_pos == Bry_find_.Not_found) {
							// Nothing else interesting to find; abort!
							// We're nested, but there's no close tags left. Abort!
							i = len; // break 2; forces the enclosing for loop to terminate
							break;
						}
						// Skip ahead to next tag start
						i = lt_pos;
						state = COLON_STATE_TAGSTART;
						break;
				}
				break;
			case COLON_STATE_TAG:
				// In a <tag>
				switch (c) {
					case Byte_ascii.Angle_end:
						// end of an opening tag; everything after it is one level deeper
						level++;
						state = COLON_STATE_TEXT;
						break;
					case Byte_ascii.Slash:
						// Slash may be followed by >?
						state = COLON_STATE_TAGSLASH;
						break;
					default:
						// ignore
						break;
				}
				break;
			case COLON_STATE_TAGSTART:
				switch (c) {
					case Byte_ascii.Slash:
						state = COLON_STATE_CLOSETAG;
						break;
					case Byte_ascii.Bang:
						state = COLON_STATE_COMMENT;
						break;
					case Byte_ascii.Angle_end:
						// Illegal early close? This shouldn't happen D:
						state = COLON_STATE_TEXT;
						break;
					default:
						state = COLON_STATE_TAG;
						break;
				}
				break;
			case COLON_STATE_CLOSETAG:
				// In a </tag>
				if (c == Byte_ascii.Angle_end) {
					level--;
					if (level < 0) {
						Gfo_usr_dlg_.Instance.Warn_many("", "", "Invalid input; too many close tags");
						return Bry_find_.Not_found;
					}
					state = COLON_STATE_TEXT;
				}
				break;
			case COLON_STATE_TAGSLASH:
				if (c == Byte_ascii.Angle_end) {
					// Yes, a self-closed tag <blah/>; nesting level is unchanged
					state = COLON_STATE_TEXT;
				}
				else {
					// Probably we're jumping the gun, and this is an attribute
					state = COLON_STATE_TAG;
				}
				break;
			case COLON_STATE_COMMENT:
				if (c == Byte_ascii.Dash) {
					state = COLON_STATE_COMMENTDASH;
				}
				break;
			case COLON_STATE_COMMENTDASH:
				if (c == Byte_ascii.Dash) {
					state = COLON_STATE_COMMENTDASHDASH;
				}
				else {
					state = COLON_STATE_COMMENT;
				}
				break;
			case COLON_STATE_COMMENTDASHDASH:
				// "--" has been seen; only a following '>' ends the comment
				if (c == Byte_ascii.Angle_end) {
					state = COLON_STATE_TEXT;
				}
				else {
					state = COLON_STATE_COMMENT;
				}
				break;
			default:
				throw Err_.new_wo_type("State machine error");
		}
	}
	if (level > 0) {
		Gfo_usr_dlg_.Instance.Warn_many("", "", "Invalid input; not enough close tags (level ~{0}, state ~{1})", level, state);
		return Bry_find_.Not_found;
	}
	return Bry_find_.Not_found;
}
// State constants for the definition-list colon extraction
// (the state machine in Find_colon_no_links)
private static final int
COLON_STATE_TEXT = 0	// in plain text, outside of any tag or comment
, COLON_STATE_TAG = 1	// inside a <tag>
, COLON_STATE_TAGSTART = 2	// immediately after a '<'; could become a start tag, end tag or comment
, COLON_STATE_CLOSETAG = 3	// inside a </tag>
, COLON_STATE_TAGSLASH = 4	// saw a '/' inside a tag; may be a self-closing tag or an attribute
, COLON_STATE_COMMENT = 5	// inside a comment (entered when '!' follows the tag start)
, COLON_STATE_COMMENTDASH = 6	// saw one '-' inside a comment
, COLON_STATE_COMMENTDASHDASH = 7	// saw two consecutive '-' inside a comment
;
// Values for last_section: which paragraph-level element is currently open.
// Close_paragraph emits the matching end tag for para / pre, and nothing for none.
private static final byte
Last_section__none = 0 // ''
, Last_section__para = 1 // p
, Last_section__pre = 2 // pre
;
// Ids for pending paragraph markup. Para_stack_bfr converts bgn / mid to the
// html shown at right; it throws for none (and any other value).
private static final byte
Para_stack__none = 0 // false
, Para_stack__bgn = 1 // <p>
, Para_stack__mid = 2 // </p><p>
;
// NOTE(review): usage is not visible in this part of the file; presumably Pre__bgn / Pre__end
// are the values registered in pre_trie for opening / closing pre tags -- confirm
private static final int Pre__bgn = 0, Pre__end = 1;
private static Btrie_slim_mgr pre_trie;
// NOTE(review): presumably assigned from Block_chars_ary__new (lookup by byte value) -- confirm
private static boolean[] block_chars_ary;
// Build a 256-slot lookup table, indexed by byte value, that is true only
// for the four list-prefix characters: '*', '#', ':' and ';'.
private static boolean[] Block_chars_ary__new() {
	boolean[] flags = new boolean[256];
	byte[] prefix_chars = new byte[] {Byte_ascii.Star, Byte_ascii.Hash, Byte_ascii.Colon, Byte_ascii.Semic};
	for (byte prefix_char : prefix_chars) {
		flags[prefix_char] = true;
	}
	return flags;
}
// NOTE(review): initialization and usage are not visible in this part of the file; presumably
// these tries match block-level open tags, close tags and blockquote tags -- confirm
private static Btrie_slim_mgr open_match_trie, close_match_trie, blockquote_trie;
// Append the html for a pending paragraph id to bfr:
//   Para_stack__bgn -> "<p>"
//   Para_stack__mid -> "</p><p>"
// Any other id (including Para_stack__none) throws an unhandled-default error.
private static void Para_stack_bfr(Bry_bfr bfr, int id) {
	if (id == Para_stack__bgn) {
		bfr.Add_str_a7("<p>");
		return;
	}
	if (id == Para_stack__mid) {
		bfr.Add_str_a7("</p><p>");
		return;
	}
	throw Err_.new_unhandled_default(id);
}
}