mirror of
https://github.com/gnosygnu/xowa.git
synced 2026-03-02 03:49:30 +00:00
Mw_parse: Mass checkin of various mediawiki parse files
This commit is contained in:
@@ -16,10 +16,10 @@ You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.hdrs.sections; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.hdrs.*;
|
||||
import gplx.xowa.parsers.mws.*; import gplx.xowa.parsers.mws.wkrs.*;
|
||||
import gplx.xowa.mws.parsers.*; import gplx.xowa.mws.parsers.headings.*;
|
||||
import gplx.xowa.addons.htmls.tocs.*; import gplx.xowa.htmls.core.htmls.tidy.*;
|
||||
class Xop_section_list implements Xomw_hdr_cbk {
|
||||
private final Xomw_hdr_wkr hdr_wkr = new Xomw_hdr_wkr();
|
||||
class Xop_section_list implements Xomw_heading_cbk {
|
||||
private final Xomw_heading_wkr hdr_wkr = new Xomw_heading_wkr();
|
||||
private final Ordered_hash hash = Ordered_hash_.New_bry();
|
||||
private final Xoh_toc_mgr toc_mgr = new Xoh_toc_mgr();
|
||||
private byte[] src;
|
||||
@@ -92,7 +92,7 @@ class Xop_section_list implements Xomw_hdr_cbk {
|
||||
|
||||
return new int[] {src_bgn, src_end};
|
||||
}
|
||||
public void On_hdr_seen(Xomw_parser_ctx pctx, Xomw_hdr_wkr wkr) {
|
||||
public void On_hdr_seen(Xomw_parser_ctx pctx, Xomw_heading_wkr wkr) {
|
||||
// get key by taking everything between ==; EX: "== abc ==" -> " abc "
|
||||
byte[] src = wkr.Src();
|
||||
int hdr_txt_bgn = wkr.Hdr_lhs_end();
|
||||
@@ -117,5 +117,5 @@ class Xop_section_list implements Xomw_hdr_cbk {
|
||||
Xop_section_itm itm = new Xop_section_itm(hash.Count(), num, key, wkr.Hdr_bgn(), wkr.Hdr_end());
|
||||
hash.Add(key, itm);
|
||||
}
|
||||
public void On_src_done(Xomw_parser_ctx pctx, Xomw_hdr_wkr wkr) {}
|
||||
public void On_src_done(Xomw_parser_ctx pctx, Xomw_heading_wkr wkr) {}
|
||||
}
|
||||
|
||||
@@ -17,7 +17,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.hdrs.sections; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.hdrs.*;
|
||||
import gplx.langs.htmls.*;
|
||||
import gplx.xowa.parsers.mws.*; import gplx.xowa.parsers.mws.wkrs.*; import gplx.xowa.parsers.hdrs.*; import gplx.xowa.htmls.core.htmls.tidy.*;
|
||||
import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*; import gplx.xowa.parsers.hdrs.*; import gplx.xowa.htmls.core.htmls.tidy.*;
|
||||
public class Xop_section_mgr implements Gfo_invk {
|
||||
private Xoae_app app; private Xowe_wiki wiki;
|
||||
private Xow_tidy_mgr_interface tidy_mgr;
|
||||
|
||||
@@ -1,27 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.mws; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
|
||||
import gplx.xowa.parsers.htmls.*;
|
||||
import gplx.xowa.parsers.mws.utils.*;
|
||||
import gplx.xowa.parsers.uniqs.*;
|
||||
public class Xomw_parser_ctx {
|
||||
public Xomw_sanitizer_mgr Sanitizer() {return sanitizer;} private final Xomw_sanitizer_mgr sanitizer = new Xomw_sanitizer_mgr();
|
||||
public Xop_uniq_mgr Uniq_mgr() {return uniq_mgr;} private final Xop_uniq_mgr uniq_mgr = new Xop_uniq_mgr();
|
||||
|
||||
public static final int Pos__bos = -1;
|
||||
}
|
||||
@@ -1,261 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.mws.blocks; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
|
||||
import gplx.langs.phps.utls.*;
|
||||
public class Xomw_block_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.UNSAFE: caching for repeated calls
|
||||
private final Bry_bfr bfr = Bry_bfr_.New();
|
||||
private byte[] last_prefix, last_section;
|
||||
private boolean line_start, dt_open, in_block_elem, para_stack, in_blockquote, in_pre = false;
|
||||
private int prefix_len;
|
||||
private int src_len;
|
||||
public byte[] Do_block_levels(byte[] src, boolean line_start) {
|
||||
this.src_len = src.length;
|
||||
this.line_start = line_start;
|
||||
// Parsing through the text line by line. The main thing
|
||||
// happening here is handling of block-level elements p, pre,
|
||||
// and making lists from lines starting with * # : etc.
|
||||
this.last_prefix = Bry_.Empty;
|
||||
bfr.Clear();
|
||||
this.dt_open = this.in_block_elem = false;
|
||||
this.prefix_len = 0;
|
||||
this.para_stack = false;
|
||||
this.in_blockquote = false;
|
||||
|
||||
// PORTED.SPLIT: $textLines = StringUtils::explode("\n", $text);
|
||||
Bry_split_.Split(src, 0, src_len, Byte_ascii.Nl, Bool_.N, this);
|
||||
|
||||
while (prefix_len > 0) {
|
||||
// bfr .= this.closeList(prefix2[prefix_len - 1]);
|
||||
prefix_len--;
|
||||
if (prefix_len > 0) {
|
||||
bfr.Add_byte_nl();
|
||||
}
|
||||
}
|
||||
if (Bry_.Len_gt_0(last_section)) {
|
||||
bfr.Add_str_a7("</").Add(last_section).Add_str_a7(">");
|
||||
this.last_section = Bry_.Empty;
|
||||
}
|
||||
|
||||
if (dt_open || in_block_elem || para_stack || in_blockquote || in_pre) {
|
||||
}
|
||||
return bfr.To_bry_and_clear();
|
||||
}
|
||||
public int Split(byte[] src, int itm_bgn, int itm_end) {
|
||||
// Fix up line_start
|
||||
if (!line_start) {
|
||||
bfr.Add_mid(src, itm_bgn, itm_end);
|
||||
line_start = true;
|
||||
return Bry_split_.Rv__ok;
|
||||
}
|
||||
|
||||
// * = ul
|
||||
// # = ol
|
||||
// ; = dt
|
||||
// : = dd
|
||||
int last_prefix_len = last_prefix.length;
|
||||
boolean pre_close_match = false; //preg_match('/<\\/pre/i', $oLine);
|
||||
boolean pre_open_match = false; //preg_match('/<pre/i', $oLine);
|
||||
byte[] prefix = null, prefix2 = null, t = null;
|
||||
// If not in a <pre> element, scan for and figure out what prefixes are there.
|
||||
if (!in_pre) {
|
||||
// Multiple prefixes may abut each other for nested lists.
|
||||
prefix_len = 0;// strspn($oLine, '*#:;');
|
||||
prefix = Php_str_.Substr(src, itm_bgn, prefix_len);
|
||||
|
||||
// eh?
|
||||
// ; and : are both from definition-lists, so they're equivalent
|
||||
// for the purposes of determining whether or not we need to open/close
|
||||
// elements.
|
||||
prefix2 = Bry_.Replace(prefix, Byte_ascii.Semic, Byte_ascii.Colon);
|
||||
t = Bry_.Mid(src, itm_bgn + prefix_len, itm_end);
|
||||
// this.in_pre = (boolean)pre_open_match;
|
||||
}
|
||||
else {
|
||||
// Don't interpret any other prefixes in preformatted text
|
||||
prefix_len = 0;
|
||||
prefix = prefix2 = Bry_.Empty;
|
||||
t = Bry_.Mid(src, itm_bgn, itm_end);
|
||||
}
|
||||
|
||||
// List generation
|
||||
byte[] term = null, t2 = null;
|
||||
int common_prefix_len = -1;
|
||||
if (prefix_len > 0 && Bry_.Eq(last_prefix, prefix2)) {
|
||||
// Same as the last item, so no need to deal with nesting or opening stuff
|
||||
// bfr .= this.nextItem(substr(prefix, -1));
|
||||
para_stack = false;
|
||||
|
||||
if (prefix_len > 0 && prefix[prefix_len - 1] == Byte_ascii.Semic) {
|
||||
// The one nasty exception: definition lists work like this:
|
||||
// ; title : definition text
|
||||
// So we check for : in the remainder text to split up the
|
||||
// title and definition, without b0rking links.
|
||||
term = t2 = Bry_.Empty;
|
||||
// if (this.findColonNoLinks(t, term, t2) !== false) {
|
||||
t = t2;
|
||||
bfr.Add(term); // . this.nextItem(':');
|
||||
// }
|
||||
}
|
||||
}
|
||||
else if (prefix_len > 0 || last_prefix_len > 0) {
|
||||
// We need to open or close prefixes, or both.
|
||||
|
||||
// Either open or close a level...
|
||||
// common_prefix_len = this.getCommon(prefix, last_prefix);
|
||||
para_stack = false;
|
||||
|
||||
// Close all the prefixes which aren't shared.
|
||||
while (common_prefix_len < last_prefix_len) {
|
||||
// bfr .= this.closeList(last_prefix[last_prefix_len - 1]);
|
||||
last_prefix_len--;
|
||||
}
|
||||
//
|
||||
// Continue the current prefix if appropriate.
|
||||
if (prefix_len <= common_prefix_len && common_prefix_len > 0) {
|
||||
// bfr .= this.nextItem(prefix[common_prefix_len - 1]);
|
||||
}
|
||||
|
||||
// Open prefixes where appropriate.
|
||||
if (Bry_.Len_gt_0(last_prefix) && prefix_len > common_prefix_len) {
|
||||
bfr.Add_byte_nl();
|
||||
}
|
||||
while (prefix_len > common_prefix_len) {
|
||||
// $char = substr(prefix, common_prefix_len, 1);
|
||||
// bfr .= this.openList($char);
|
||||
//
|
||||
// if (';' == $char) {
|
||||
// // @todo FIXME: This is dupe of code above
|
||||
// if (this.findColonNoLinks(t, term, t2) !== false) {
|
||||
// t = t2;
|
||||
// bfr .= term . this.nextItem(':');
|
||||
// }
|
||||
// }
|
||||
++common_prefix_len;
|
||||
}
|
||||
if (prefix_len == 0 && Bry_.Len_gt_0(last_prefix)) {
|
||||
bfr.Add_byte_nl();
|
||||
}
|
||||
last_prefix = prefix2;
|
||||
}
|
||||
|
||||
// If we have no prefixes, go to paragraph mode.
|
||||
if (0 == prefix_len) {
|
||||
// No prefix (not in list)--go to paragraph mode
|
||||
// XXX: use a stack for nestable elements like span, table and div
|
||||
boolean open_match = false, close_match = false;
|
||||
// open_match = preg_match(
|
||||
// '/(?:<table|<h1|<h2|<h3|<h4|<h5|<h6|<pre|<tr|'
|
||||
// . '<p|<ul|<ol|<dl|<li|<\\/tr|<\\/td|<\\/th)/iS',
|
||||
// t
|
||||
// );
|
||||
// close_match = preg_match(
|
||||
// '/(?:<\\/table|<\\/h1|<\\/h2|<\\/h3|<\\/h4|<\\/h5|<\\/h6|'
|
||||
// . '<td|<th|<\\/?blockquote|<\\/?div|<hr|<\\/pre|<\\/p|<\\/mw:|'
|
||||
// . self::MARKER_PREFIX
|
||||
// . '-pre|<\\/li|<\\/ul|<\\/ol|<\\/dl|<\\/?center)/iS',
|
||||
// t
|
||||
// );
|
||||
|
||||
if (open_match || close_match) {
|
||||
para_stack = false;
|
||||
// @todo bug 5718: paragraph closed
|
||||
// bfr .= this.closeParagraph();
|
||||
if (pre_open_match && !pre_close_match) {
|
||||
this.in_pre = true;
|
||||
}
|
||||
// $bqOffset = 0;
|
||||
// while (preg_match('/<(\\/?)blockquote[\s>]/i', t,
|
||||
// $bqMatch, PREG_OFFSET_CAPTURE, $bqOffset)
|
||||
// ) {
|
||||
// in_blockquote = !$bqMatch[1][0]; // is this a close tag?
|
||||
// $bqOffset = $bqMatch[0][1] + strlen($bqMatch[0][0]);
|
||||
// }
|
||||
in_block_elem = !close_match;
|
||||
}
|
||||
else if (!in_block_elem && !this.in_pre) {
|
||||
if ( Byte_ascii.Space == t[0]
|
||||
// && (last_section == 'pre' || trim(t) != '')
|
||||
&& !in_blockquote
|
||||
) {
|
||||
// pre
|
||||
// if (this.last_section !== 'pre') {
|
||||
para_stack = false;
|
||||
// bfr .= this.closeParagraph() . '<pre>';
|
||||
// this.last_section = 'pre';
|
||||
// }
|
||||
t = Bry_.Mid(t, 1);
|
||||
}
|
||||
else {
|
||||
// paragraph
|
||||
// if (trim(t) == '') {
|
||||
if (para_stack) {
|
||||
// bfr .= para_stack . '<br />';
|
||||
para_stack = false;
|
||||
// this.last_section = 'p';
|
||||
}
|
||||
else {
|
||||
// if (this.last_section !== 'p') {
|
||||
// bfr .= this.closeParagraph();
|
||||
// this.last_section = '';
|
||||
// para_stack = '<p>';
|
||||
// }
|
||||
// else {
|
||||
// para_stack = '</p><p>';
|
||||
// }
|
||||
}
|
||||
// }
|
||||
// else {
|
||||
if (para_stack) {
|
||||
// bfr .= para_stack;
|
||||
para_stack = false;
|
||||
// this.last_section = 'p';
|
||||
}
|
||||
// else if (this.last_section !== 'p') {
|
||||
// bfr .= this.closeParagraph() . '<p>';
|
||||
// this.last_section = 'p';
|
||||
// }
|
||||
// }
|
||||
}
|
||||
}
|
||||
}
|
||||
// somewhere above we forget to get out of pre block (bug 785)
|
||||
if (pre_close_match && this.in_pre) {
|
||||
this.in_pre = false;
|
||||
}
|
||||
if (para_stack == false) {
|
||||
bfr.Add(t);
|
||||
if (prefix_len == 0) {
|
||||
bfr.Add_byte_nl();
|
||||
}
|
||||
}
|
||||
|
||||
if (last_prefix_len == -1 || common_prefix_len == -1) {
|
||||
}
|
||||
return Bry_split_.Rv__ok;
|
||||
}
|
||||
// private static final int
|
||||
// Para_stack_none = 0 // false
|
||||
// , Para_stack_bgn = 1 // <p>
|
||||
// , Para_stack_mid = 2 // </p><p>
|
||||
// ;
|
||||
// private static final byte
|
||||
// Mode_none = 0 // ''
|
||||
// , Mode_para = 1 // p
|
||||
// , Mode_pre = 2 // pre
|
||||
// ;
|
||||
}
|
||||
@@ -1,66 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.mws.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
|
||||
class Xomw_prepro_rule {
|
||||
public Xomw_prepro_rule(byte[] bgn, byte[] end, int min, int max, int[] names) {
|
||||
this.bgn = bgn;
|
||||
this.end = end;
|
||||
this.min = min;
|
||||
this.max = max;
|
||||
this.names = names;
|
||||
}
|
||||
public final byte[] bgn;
|
||||
public final byte[] end;
|
||||
public final int min;
|
||||
public final int max;
|
||||
public final int[] names;
|
||||
public boolean Names_exist(int idx) {
|
||||
return idx < names.length && names[idx] != Name__invalid;
|
||||
}
|
||||
private static final byte[] Name__tmpl_bry = Bry_.new_a7("template"), Name__targ_bry = Bry_.new_a7("tplarg");
|
||||
public static final int Name__invalid = -1, Name__null = 0, Name__tmpl = 1, Name__targ = 2;
|
||||
public static byte[] Name(int type) {
|
||||
switch (type) {
|
||||
case Name__tmpl: return Name__tmpl_bry;
|
||||
case Name__targ: return Name__targ_bry;
|
||||
default:
|
||||
case Name__invalid: return null;
|
||||
case Name__null: return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
class Xomw_prepro_elem {
|
||||
private static final byte[] Bry__tag_end = Bry_.new_a7("</");
|
||||
public Xomw_prepro_elem(int type, byte[] name) {
|
||||
this.type = type;
|
||||
this.name = name;
|
||||
this.tag_end_lhs = Bry_.Add(Bry__tag_end, name);
|
||||
}
|
||||
public final int type;
|
||||
public final byte[] name;
|
||||
public final byte[] tag_end_lhs;
|
||||
public static final int Type__comment = 0, Type__other = 1;
|
||||
}
|
||||
/** Simple mutable pair: the matched bytes and a one-byte type tag. */
class Xomw_prepro_curchar_itm {
	public byte[] bry;
	public byte type;

	public Xomw_prepro_curchar_itm(byte[] bry, byte type) {
		this.type = type;
		this.bry = bry;
	}
}
|
||||
@@ -1,170 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.mws.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
|
||||
class Xomw_prepro_stack {
|
||||
public List_adp stack = List_adp_.New();
|
||||
public Xomw_prepro_piece top;
|
||||
private Bry_bfr root_accum = Bry_bfr_.New(), accum;
|
||||
private final Xomw_prepro_flags flags = new Xomw_prepro_flags();
|
||||
|
||||
public Xomw_prepro_stack() {
|
||||
accum = root_accum;
|
||||
}
|
||||
public void Clear() {
|
||||
stack.Clear();
|
||||
accum.Clear();
|
||||
top = null;
|
||||
}
|
||||
public int Count() {return stack.Len();}
|
||||
public Bry_bfr Get_accum() {return accum;}
|
||||
public Bry_bfr Get_root_accum() {return root_accum;}
|
||||
|
||||
public Xomw_prepro_part Get_current_part() {
|
||||
if (top == null) {
|
||||
return null;
|
||||
}
|
||||
else {
|
||||
return top.Get_current_part();
|
||||
}
|
||||
}
|
||||
|
||||
public void Push(Xomw_prepro_piece item) {
|
||||
stack.Add(item);
|
||||
this.top = (Xomw_prepro_piece)stack.Get_at(stack.Len() - 1);
|
||||
accum = top.Get_accum();
|
||||
}
|
||||
|
||||
public Xomw_prepro_piece Pop() {
|
||||
int len = stack.Count();
|
||||
if (len == 0) {
|
||||
throw Err_.new_wo_type("Xomw_prepro_stack: no elements remaining");
|
||||
}
|
||||
|
||||
Xomw_prepro_piece rv = (Xomw_prepro_piece)stack.Get_at(len - 1);
|
||||
stack.Del_at(len - 1);
|
||||
len--;
|
||||
|
||||
if (len > 0) {
|
||||
this.top = (Xomw_prepro_piece)stack.Get_at(stack.Len() - 1);
|
||||
accum = top.Get_accum();
|
||||
} else {
|
||||
this.top = null;
|
||||
this.accum = root_accum;
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
|
||||
public void Add_part(byte[] bry) {
|
||||
top.Add_part(bry);
|
||||
accum = top.Get_accum();
|
||||
}
|
||||
|
||||
public Xomw_prepro_flags Get_flags() {
|
||||
if (stack.Count() == 0) {
|
||||
flags.Find_eq = false;
|
||||
flags.Find_pipe = false;
|
||||
flags.In_heading = false;
|
||||
return flags;
|
||||
}
|
||||
else {
|
||||
top.Set_flags(flags);
|
||||
return flags;
|
||||
}
|
||||
}
|
||||
}
|
||||
/** Mutable holder for the three search flags filled in by Xomw_prepro_stack.Get_flags / Xomw_prepro_piece.Set_flags. */
class Xomw_prepro_flags {
	public boolean Find_pipe, Find_eq, In_heading;
}
|
||||
class Xomw_prepro_piece {
|
||||
public final byte[] open; // Opening character (\n for heading)
|
||||
public final byte[] close; // Matching closing char;
|
||||
public int count; // Number of opening characters found (number of "=" for heading)
|
||||
public final boolean line_start; // True if the open char appeared at the start of the input line; Not set for headings.
|
||||
public final int start_pos;
|
||||
public List_adp parts = List_adp_.New();
|
||||
public Xomw_prepro_piece(byte[] open, byte[] close, int count, int start_pos, boolean line_start) {
|
||||
this.open = open;
|
||||
this.close = close;
|
||||
this.count = count;
|
||||
this.start_pos = start_pos;
|
||||
this.line_start = line_start;
|
||||
parts.Add(new Xomw_prepro_part(Bry_.Empty));
|
||||
}
|
||||
public void Parts__renew() {
|
||||
parts.Clear();
|
||||
this.Add_part(Bry_.Empty);
|
||||
}
|
||||
public Xomw_prepro_part Get_current_part() {
|
||||
return (Xomw_prepro_part)parts.Get_at(parts.Len() - 1);
|
||||
}
|
||||
public Bry_bfr Get_accum() {
|
||||
return Get_current_part().bfr;
|
||||
}
|
||||
public void Add_part(byte[] bry) {
|
||||
parts.Add(new Xomw_prepro_part(bry));
|
||||
}
|
||||
public static final byte[] Brack_bgn_bry = Bry_.new_a7("[");
|
||||
public void Set_flags(Xomw_prepro_flags flags) {
|
||||
int parts_len = parts.Len();
|
||||
boolean open_is_nl = Bry_.Eq(open, Byte_ascii.Nl_bry);
|
||||
boolean find_pipe = !open_is_nl && !Bry_.Eq(open, Brack_bgn_bry);
|
||||
flags.Find_pipe = find_pipe;
|
||||
flags.Find_eq = find_pipe && parts_len > 1 && ((Xomw_prepro_part)parts.Get_at(parts_len - 1)).Eqpos != -1;
|
||||
flags.In_heading = open_is_nl;
|
||||
}
|
||||
// Get the output String that would result if the close is not found.
|
||||
public byte[] Break_syntax(Bry_bfr tmp_bfr, int opening_count) {
|
||||
byte[] rv = Bry_.Empty;
|
||||
if (Bry_.Eq(open, Byte_ascii.Nl_bry)) {
|
||||
rv = ((Xomw_prepro_part)parts.Get_at(0)).bfr.To_bry();
|
||||
}
|
||||
else {
|
||||
if (opening_count == -1) {
|
||||
opening_count = count;
|
||||
}
|
||||
tmp_bfr.Add(Bry_.Repeat_bry(open, opening_count));
|
||||
|
||||
// concat parts with "|"
|
||||
boolean first = true;
|
||||
int len = parts.Len();
|
||||
for (int i = 0; i < len; i++) {
|
||||
Xomw_prepro_part part = (Xomw_prepro_part)parts.Get_at(i);
|
||||
if (first) {
|
||||
first = false;
|
||||
}
|
||||
else {
|
||||
tmp_bfr.Add_byte_pipe();
|
||||
}
|
||||
tmp_bfr.Add(part.bfr.To_bry());
|
||||
}
|
||||
rv = tmp_bfr.To_bry_and_clear();
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
}
|
||||
class Xomw_prepro_part {
|
||||
public Xomw_prepro_part(byte[] bry) {
|
||||
bfr.Add(bry);
|
||||
}
|
||||
public final Bry_bfr bfr = Bry_bfr_.New();
|
||||
public int Eqpos = -1;
|
||||
public int comment_end = -1;
|
||||
public int visual_end = -1;
|
||||
}
|
||||
@@ -1,789 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.mws.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
|
||||
import gplx.core.btries.*;
|
||||
import gplx.langs.phps.utls.*;
|
||||
public class Xomw_prepro_wkr { // THREAD.UNSAFE: caching for repeated calls
|
||||
private final Bry_bfr tmp_bfr = Bry_bfr_.New();
|
||||
private final List_adp comments_list = List_adp_.New();
|
||||
private final Btrie_slim_mgr elements_trie__y = Btrie_slim_mgr.ci_a7(), elements_trie__n = Btrie_slim_mgr.ci_a7();
|
||||
private final Hash_adp_bry xmlish_allow_missing_end_tag = Hash_adp_bry.cs().Add_many_str("includeonly", "noinclude", "onlyinclude");
|
||||
private final Hash_adp_bry no_more_closing_tag = Hash_adp_bry.cs();
|
||||
private final Xomw_prepro_stack stack = new Xomw_prepro_stack();
|
||||
private final Btrie_rv trv = new Btrie_rv();
|
||||
private Bry_bfr accum = Bry_bfr_.New();
|
||||
|
||||
public void Init_by_wiki(String... xmlish_elems_ary) {
|
||||
Elements_trie__init_by_wiki(elements_trie__y, ignored_tags_y, xmlish_elems_ary, "noinclude");
|
||||
Elements_trie__init_by_wiki(elements_trie__n, ignored_tags_n, xmlish_elems_ary, "includeonly");
|
||||
}
|
||||
private void Elements_trie__init_by_wiki(Btrie_slim_mgr trie, Ordered_hash ignored_tags, String[] strip_list_ary, String xmlish_elem) {
|
||||
trie.Clear();
|
||||
Elements_trie__add(trie, Bool_.Y, "!--", "comment");
|
||||
// PORTED: $xmlishElements = parser->getStripList();
|
||||
for (String itm : strip_list_ary) {
|
||||
Elements_trie__add(trie, Bool_.N, itm, itm);
|
||||
}
|
||||
// PORTED: "$xmlishElements[] = 'noinclude';" or "$xmlishElements[] = 'includeonly';"
|
||||
Elements_trie__add(trie, Bool_.N, xmlish_elem, xmlish_elem);
|
||||
|
||||
// PORTED: $xmlishRegex = implode( '|', array_merge( $xmlishElements, $ignoredTags ) );
|
||||
int ignored_tags_len = ignored_tags.Count();
|
||||
for (int j = 0; j < ignored_tags_len; j++) {
|
||||
byte[] bry = (byte[])ignored_tags.Get_at(j);
|
||||
String str = String_.new_u8(bry);
|
||||
Elements_trie__add(trie, Bool_.N, str, str);
|
||||
}
|
||||
}
|
||||
private static void Elements_trie__add(Btrie_slim_mgr trie, boolean type_is_comment, String hook, String name) {
|
||||
trie.Add_obj(hook, new Xomw_prepro_elem(type_is_comment ? Xomw_prepro_elem.Type__comment : Xomw_prepro_elem.Type__other, Bry_.new_a7(name)));
|
||||
}
|
||||
public byte[] Preprocess_to_xml(byte[] src, boolean for_inclusion) {
|
||||
// RELIC.PROC_VAR: forInclusion = $flags & Parser::PTD_FOR_INCLUSION;
|
||||
// RELIC.INIT_BY_WIKI: $xmlishElements = parser->getStripList();
|
||||
// RELIC.CLASS_VAR: $xmlishAllowMissingEndTag = [ 'includeonly', 'noinclude', 'onlyinclude' ];
|
||||
boolean enable_only_include = false;
|
||||
|
||||
// PORTED: rewritten so that all add / del is done in INIT_BY_WIKI
|
||||
Ordered_hash ignored_tags;
|
||||
Hash_adp ignored_elements;
|
||||
Btrie_slim_mgr elements_trie;
|
||||
if (for_inclusion) {
|
||||
ignored_tags = ignored_tags_y; // RELIC: $ignoredTags = [ 'includeonly', '/includeonly' ];
|
||||
ignored_elements = ignored_elements__y; // RELIC: $ignoredElements = [ 'noinclude' ];
|
||||
// RELIC.INIT_BY_WIKI: $xmlishElements[] = 'noinclude';
|
||||
if ( Bry_.Has(src, Bry__only_include_bgn)
|
||||
&& Bry_.Has(src, Bry__only_include_end)) {
|
||||
enable_only_include = true;
|
||||
}
|
||||
elements_trie = elements_trie__y;
|
||||
}
|
||||
else {
|
||||
ignored_tags = ignored_tags_n; // $ignoredTags = [ 'noinclude', '/noinclude', 'onlyinclude', '/onlyinclude' ];
|
||||
ignored_elements = ignored_elements__n; // $ignoredElements = [ 'includeonly' ];
|
||||
// RELIC.INIT_BY_WIKI: $xmlishElements[] = 'includeonly';
|
||||
elements_trie = elements_trie__n;
|
||||
}
|
||||
|
||||
// RELIC.INIT_BY_WIKI: $xmlishRegex = implode( '|', array_merge( $xmlishElements, $ignoredTags ) );
|
||||
|
||||
// RELIC.REGEX
|
||||
// Use "A" modifier (anchored) instead of "^", because ^ doesn't work with an offset
|
||||
// $elementsRegex = "~($xmlishRegex)(?:\s|\/>|>)|(!--)~iA";
|
||||
|
||||
stack.Clear();
|
||||
|
||||
// RELIC.REGEX:
|
||||
// $searchBase = "[{<\n"; # }
|
||||
|
||||
// RELIC.BRY_FIND
|
||||
// For fast reverse searches
|
||||
// $revText = strrev( $text );
|
||||
// $lengthText = strlen( $text );
|
||||
|
||||
// Input pointer, starts out pointing to a pseudo-newline before the start
|
||||
int i = 0;
|
||||
|
||||
// Current accumulator
|
||||
accum = stack.Get_accum();
|
||||
accum.Add_str_a7("<root>");
|
||||
|
||||
// True to find equals signs in arguments
|
||||
boolean find_equals = false;
|
||||
|
||||
// True to take notice of pipe characters
|
||||
boolean find_pipe = false;
|
||||
int heading_index = 1;
|
||||
|
||||
// True if $i is inside a possible heading
|
||||
boolean in_heading = false;
|
||||
|
||||
// True if there are no more greater-than (>) signs right of $i
|
||||
boolean no_more_gt = false;
|
||||
|
||||
// Map of tag name => true if there are no more closing tags of given type right of $i
|
||||
no_more_closing_tag.Clear();
|
||||
|
||||
// True to ignore all input up to the next <onlyinclude>
|
||||
boolean find_only_include = enable_only_include;
|
||||
|
||||
// Do a line-start run without outputting an LF character
|
||||
boolean fake_line_start = true;
|
||||
|
||||
// XOWA: init
|
||||
int src_len = src.length;
|
||||
int found = -1;
|
||||
byte[] cur_char = Bry_.Empty;
|
||||
byte[] cur_closing = Bry_.Empty;
|
||||
byte[] inner = null;
|
||||
Xomw_prepro_rule rule = null;
|
||||
|
||||
while (true) {
|
||||
if (find_only_include) {
|
||||
// Ignore all input up to the next <onlyinclude>
|
||||
int start_pos = Bry_find_.Find_fwd(src, Bry__only_include_bgn, i, src_len);
|
||||
if (start_pos == Bry_find_.Not_found) {
|
||||
// Ignored section runs to the end
|
||||
accum.Add_str_a7("<ignore>").Add_bry_escape_html(src, i, src_len).Add_str_a7("</ignore>");
|
||||
break;
|
||||
}
|
||||
int tag_end_pos = start_pos + Bry__only_include_bgn.length; // past-the-end
|
||||
accum.Add_str_a7("<ignore>").Add_bry_escape_html(src, i, tag_end_pos).Add_str_a7("</ignore>");
|
||||
i = tag_end_pos;
|
||||
find_only_include = false;
|
||||
}
|
||||
|
||||
if (fake_line_start) {
|
||||
found = Found__line_bgn;
|
||||
cur_char = Bry_.Empty;
|
||||
}
|
||||
else {
|
||||
// Find next opening brace, closing brace or pipe
|
||||
// RELIC.REGEX: $search = $searchBase;
|
||||
if (stack.top == null) {
|
||||
cur_closing = Bry_.Empty;
|
||||
}
|
||||
else {
|
||||
cur_closing = stack.top.close;
|
||||
// RELIC.REGEX: $search .= $currentClosing;
|
||||
}
|
||||
if (find_pipe) {
|
||||
// RELIC.REGEX: $search .= '|';
|
||||
}
|
||||
if (find_equals) {
|
||||
// First equals will be for the template
|
||||
// RELIC.REGEX: $search .= '=';
|
||||
}
|
||||
|
||||
// Output literal section, advance input counter
|
||||
// PORTED: "$literalLength = strcspn(src, $search, i)"; NOTE: no trie b/c of frequent changes to $search
|
||||
int literal_len = 0;
|
||||
boolean loop_stop = false;
|
||||
// loop chars until search_char is found
|
||||
for (int j = i; j < src_len; j++) {
|
||||
byte b = src[j];
|
||||
switch (b) { // handle '$searchBase = "[{<\n";'
|
||||
case Byte_ascii.Brack_bgn:
|
||||
case Byte_ascii.Curly_bgn:
|
||||
case Byte_ascii.Angle_bgn:
|
||||
case Byte_ascii.Nl:
|
||||
loop_stop = true;
|
||||
break;
|
||||
case Byte_ascii.Pipe: // handle "find_pipe"
|
||||
if (find_pipe) loop_stop = true;
|
||||
break;
|
||||
case Byte_ascii.Eq: // handle "find_equals"
|
||||
if (find_equals) loop_stop = true;
|
||||
break;
|
||||
default: // handle "cur_closing"; specified by piece.close and rule.close, so "\n", "}", "]" and "}-"
|
||||
if (cur_closing != Bry_.Empty) {
|
||||
byte cur_closing_0 = cur_closing[0];
|
||||
if (b == cur_closing_0) {
|
||||
if (cur_closing.length == 1) { // handle "\n", "}", "]"
|
||||
loop_stop = true;
|
||||
}
|
||||
else {// handle "}-"
|
||||
int nxt_idx = j + 1;
|
||||
if (nxt_idx < src_len && src[nxt_idx] == Byte_ascii.Dash)
|
||||
loop_stop = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (loop_stop)
|
||||
break;
|
||||
else
|
||||
literal_len++;
|
||||
}
|
||||
if (literal_len > 0) {
|
||||
accum.Add_bry_escape_html(src, i, i + literal_len);
|
||||
i += literal_len;
|
||||
}
|
||||
if (i >= src_len) {
|
||||
if (Bry_.Eq(cur_closing, Byte_ascii.Nl_bry)) {
|
||||
// Do a past-the-end run to finish off the heading
|
||||
cur_char = Bry_.Empty;
|
||||
found = Found__line_end;
|
||||
}
|
||||
else {
|
||||
// All done
|
||||
break;
|
||||
}
|
||||
}
|
||||
else {
|
||||
// PORTED: "if ( $curChar == '|' ) {", etc..
|
||||
Xomw_prepro_curchar_itm cur_char_itm = (Xomw_prepro_curchar_itm)cur_char_trie.Match_at(trv, src, i, src_len);
|
||||
if (cur_char_itm != null) {
|
||||
cur_char = cur_char_itm.bry;
|
||||
switch (cur_char_itm.type) {
|
||||
case Byte_ascii.Pipe: found = Found__pipe; break;
|
||||
case Byte_ascii.Eq: found = Found__equals; break;
|
||||
case Byte_ascii.Angle_bgn: found = Found__angle; break;
|
||||
case Byte_ascii.Nl: found = in_heading ? Found__line_end : Found__line_bgn; break;
|
||||
|
||||
// PORTED: "elseif ( $curChar == $currentClosing )"
|
||||
case Byte_ascii.Curly_end: found = Found__close; break;
|
||||
case Byte_ascii.Brack_end: found = Found__close; break;
|
||||
case Byte_ascii.At: found = Found__close; break; // NOTE: At is type for "}-"
|
||||
|
||||
// PORTED: "elseif ( isset( $this->rules[$curChar] ) )"
|
||||
case Byte_ascii.Curly_bgn: {found = Found__open; rule = rule_curly; break;}
|
||||
case Byte_ascii.Brack_bgn: {found = Found__open; rule = rule_brack; break;}
|
||||
case Byte_ascii.Dash: {found = Found__open; rule = rule_langv; break;}
|
||||
}
|
||||
}
|
||||
else {
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (found == Found__angle) {
|
||||
// Handle </onlyinclude>
|
||||
if ( enable_only_include
|
||||
&& Bry_.Eq(src, i, i + Len__only_include_end, Bry__only_include_end)) {
|
||||
find_only_include = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Determine element name
|
||||
// PORTED: $elementsRegex = "~($xmlishRegex)(?:\s|\/>|>)|(!--)~iA"; EX: "(pre|ref)(?:\s|\/>|>)|(!--)
|
||||
Xomw_prepro_elem element = (Xomw_prepro_elem)elements_trie.Match_at(trv, src, i + 1, src_len);
|
||||
if (element == null) {
|
||||
// Element name missing or not listed
|
||||
accum.Add(Bry__escaped_lt);
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Handle comments
|
||||
if (element.type == Xomw_prepro_elem.Type__comment) {
|
||||
// To avoid leaving blank lines, when a sequence of
|
||||
// space-separated comments is both preceded and followed by
|
||||
// a newline (ignoring spaces), then
|
||||
// trim leading and trailing spaces and the trailing newline.
|
||||
|
||||
// Find the end
|
||||
int end_pos = Bry_find_.Find_fwd(src, Bry__comment_end, i + 4, src_len);
|
||||
if (end_pos == Bry_find_.Not_found) {
|
||||
// Unclosed comment in input, runs to end
|
||||
accum.Add_str_a7("<comment>").Add_bry_escape_html(src, i, src_len).Add_str_a7("</comment>");
|
||||
i = src_len;
|
||||
}
|
||||
else {
|
||||
// Search backwards for leading whitespace
|
||||
int ws_bgn = i > 0 ? i - Php_str_.Strspn_bwd__space_or_tab(src, i, -1) : 0;
|
||||
|
||||
// Search forwards for trailing whitespace
|
||||
// $wsEnd will be the position of the last space (or the '>' if there's none)
|
||||
int ws_end = end_pos + 2 + Php_str_.Strspn_fwd__space_or_tab(src, end_pos + 3, -1, src_len);
|
||||
|
||||
// Keep looking forward as long as we're finding more
|
||||
// comments.
|
||||
comments_list.Clear();
|
||||
comments_list.Add(new int[] {ws_bgn, ws_end});
|
||||
while (ws_end + 5 < src_len && Bry_.Eq(src, ws_end + 1, ws_end + 5, Bry__comment_bgn)) {
|
||||
int cur_char_pos = Bry_find_.Find_fwd(src, Bry__comment_end, ws_end + 4);
|
||||
if (cur_char_pos == Bry_find_.Not_found) {
|
||||
break;
|
||||
}
|
||||
cur_char_pos = cur_char_pos + 2 + Php_str_.Strspn_fwd__space_or_tab(src, cur_char_pos + 3, -1, src_len);
|
||||
comments_list.Add(new int[] {ws_end + 1, cur_char_pos});
|
||||
ws_end = cur_char_pos;
|
||||
}
|
||||
|
||||
// Eat the line if possible
|
||||
// TODO: This could theoretically be done if $wsStart == 0, i.e. for comments at
|
||||
// the overall start. That's not how Sanitizer::removeHTMLcomments() did it, but
|
||||
// it's a possible beneficial b/c break.
|
||||
int bgn_pos = -1;
|
||||
if ( ws_bgn > 0
|
||||
&& Bry_.Eq(src, ws_bgn - 1, ws_bgn , Byte_ascii.Nl_bry)
|
||||
&& Bry_.Eq(src, ws_end + 1, ws_end + 2, Byte_ascii.Nl_bry)
|
||||
) {
|
||||
// Remove leading whitespace from the end of the accumulator
|
||||
// Sanity check first though
|
||||
int ws_len = i - ws_bgn;
|
||||
int accum_len = accum.Len();
|
||||
if ( ws_len > 0
|
||||
&& Php_str_.Strspn_fwd__space_or_tab(accum.Bfr(), accum_len - ws_len, -1, accum_len) == ws_len) {
|
||||
accum.Del_by(ws_len);
|
||||
}
|
||||
|
||||
// Dump all but the last comment to the accumulator
|
||||
int comments_list_len = comments_list.Len();
|
||||
for (int j = 0; j < comments_list_len; j++) {
|
||||
int[] com = (int[])comments_list.Get_at(j);
|
||||
bgn_pos = com[0];
|
||||
end_pos = com[1] + 1;
|
||||
if (j == comments_list_len - 1) {
|
||||
break;
|
||||
}
|
||||
inner = Bry_.Mid(src, bgn_pos, end_pos);
|
||||
accum.Add_str_a7("<comment>").Add_bry_escape_html(inner).Add_str_a7("</comment>");
|
||||
}
|
||||
|
||||
// Do a line-start run next time to look for headings after the comment
|
||||
fake_line_start = true;
|
||||
}
|
||||
else {
|
||||
// No line to eat, just take the comment itself
|
||||
bgn_pos = i;
|
||||
end_pos += 2;
|
||||
}
|
||||
|
||||
if (stack.top != null) {
|
||||
Xomw_prepro_part part = stack.top.Get_current_part();
|
||||
if (!(part.comment_end != -1 && part.comment_end == ws_bgn - 1)) {
|
||||
part.visual_end = ws_bgn;
|
||||
}
|
||||
// Else comments abutting, no change in visual end
|
||||
part.comment_end = end_pos;
|
||||
}
|
||||
i = end_pos + 1;
|
||||
inner = Bry_.Mid(src, bgn_pos, end_pos + 1);
|
||||
accum.Add_str_a7("<comment>").Add_bry_escape_html(inner).Add_str_a7("</comment>");
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
byte[] name = element.name;
|
||||
// RELIC.BTRIE_CI: $lowerName = strtolower( $name );
|
||||
int atr_bgn = i + name.length + 1;
|
||||
|
||||
// Find end of tag
|
||||
int tag_end_pos = no_more_gt ? Bry_find_.Not_found : Bry_find_.Find_fwd(src, Byte_ascii.Angle_end, atr_bgn);
|
||||
if (tag_end_pos == Bry_find_.Not_found) {
|
||||
// Infinite backtrack
|
||||
// Disable tag search to prevent worst-case O(N^2) performance
|
||||
no_more_gt = true;
|
||||
accum.Add(Bry__escaped_lt);
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Handle ignored tags
|
||||
if (ignored_tags.Has(name)) {
|
||||
accum.Add_str_a7("<ignore>").Add_bry_escape_html(src, i, tag_end_pos + 1).Add_str_a7("</ignore>");
|
||||
i = tag_end_pos + 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
int tag_bgn_pos = i;
|
||||
int atr_end = -1;
|
||||
byte[] close = null;
|
||||
if (src[tag_end_pos - 1] == Byte_ascii.Slash) {
|
||||
atr_end = tag_end_pos - 1;
|
||||
inner = null;
|
||||
i = tag_end_pos + 1;
|
||||
close = Bry_.Empty;
|
||||
}
|
||||
else {
|
||||
atr_end = tag_end_pos;
|
||||
// Find closing tag
|
||||
// PORTED: `preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",`
|
||||
boolean elem_end_found = false;
|
||||
int elem_end_lhs = -1, elem_end_rhs = -1;
|
||||
int elem_end_cur = tag_end_pos + 1;
|
||||
while (true) {
|
||||
// search for "</"
|
||||
elem_end_lhs = Bry_find_.Find_fwd(src, Bry__end_lhs, elem_end_cur, src_len);
|
||||
if (elem_end_lhs == Bry_find_.Not_found) {
|
||||
break;
|
||||
}
|
||||
|
||||
// verify $name
|
||||
elem_end_cur = elem_end_lhs + 2; // 2="</"
|
||||
int elem_end_tmp = elem_end_cur + name.length;
|
||||
if (!Bry_.Eq_ci_a7(name, src, elem_end_cur, elem_end_tmp)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// verify "\s*>"
|
||||
elem_end_cur = elem_end_tmp;
|
||||
elem_end_cur = Bry_find_.Find_fwd_while(src, elem_end_cur, src_len, Byte_ascii.Space);
|
||||
if (elem_end_cur == src_len) { // just "\s", but no ">"
|
||||
break;
|
||||
}
|
||||
if (src[elem_end_cur] == Byte_ascii.Gt) {
|
||||
elem_end_rhs = elem_end_cur + 1;
|
||||
elem_end_found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ( !no_more_closing_tag.Has(name)
|
||||
&& elem_end_found) {
|
||||
inner = Bry_.Mid(src, tag_end_pos + 1, elem_end_lhs);
|
||||
i = elem_end_rhs;
|
||||
tmp_bfr.Add_str_a7("<close>").Add_bry_escape_html(src, elem_end_lhs, elem_end_rhs).Add_str_a7("</close>");
|
||||
close = tmp_bfr.To_bry_and_clear();
|
||||
}
|
||||
else {
|
||||
// No end tag
|
||||
if (xmlish_allow_missing_end_tag.Has(name)) {
|
||||
// Let it run out to the end of the src.
|
||||
inner = Bry_.Mid(src, tag_end_pos + 1);
|
||||
i = src_len;
|
||||
close = Bry_.Empty;
|
||||
}
|
||||
else {
|
||||
// Don't match the tag, treat opening tag as literal and resume parsing.
|
||||
i = tag_end_pos + 1;
|
||||
accum.Add_bry_escape_html(src, tag_bgn_pos, tag_end_pos + 1);
|
||||
// Cache results, otherwise we have O(N^2) performance for input like <foo><foo><foo>...
|
||||
no_more_closing_tag.Add_if_dupe_use_nth(name, name);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// <includeonly> and <noinclude> just become <ignore> tags
|
||||
if (ignored_elements.Has(name)) {
|
||||
accum.Add_str_a7("<ignore>").Add_bry_escape_html(src, tag_bgn_pos, i).Add_str_a7("</ignore>");
|
||||
continue;
|
||||
}
|
||||
|
||||
accum.Add_str_a7("<ext>");
|
||||
// PORTED:
|
||||
// if ( $attrEnd <= $attrStart ) {
|
||||
// $attr = '';
|
||||
// } else {
|
||||
// $attr = substr( $text, $attrStart, $attrEnd - $attrStart );
|
||||
// }
|
||||
accum.Add_str_a7("<name>").Add(name).Add_str_a7("</name>");
|
||||
// Note that the attr element contains the whitespace between name and attribute,
|
||||
// this is necessary for precise reconstruction during pre-save transform.
|
||||
accum.Add_str_a7("<attr>");
|
||||
if (atr_end > atr_bgn)
|
||||
accum.Add_bry_escape_html(src, atr_bgn, atr_end);
|
||||
accum.Add_str_a7("</attr>");
|
||||
if (inner != null) {
|
||||
accum.Add_str_a7("<inner>").Add_bry_escape_html(inner).Add_str_a7("</inner>");
|
||||
}
|
||||
accum.Add(close).Add_str_a7("</ext>");
|
||||
}
|
||||
else if (found == Found__line_bgn) {
|
||||
// Is this the start of a heading?
|
||||
// Line break belongs before the heading element in any case
|
||||
if (fake_line_start) {
|
||||
fake_line_start = false;
|
||||
} else {
|
||||
accum.Add(cur_char);
|
||||
i++;
|
||||
}
|
||||
|
||||
int count = Php_str_.Strspn_fwd__byte(src, Byte_ascii.Eq, i, 6, src_len);
|
||||
if (count == 1 && find_equals) { // EX: "{{a|\n=b=\n"
|
||||
// DWIM: This looks kind of like a name/value separator.
|
||||
// Let's let the equals handler have it and break the
|
||||
// potential heading. This is heuristic, but AFAICT the
|
||||
// methods for completely correct disambiguation are very
|
||||
// complex.
|
||||
}
|
||||
else if (count > 0) {
|
||||
Xomw_prepro_piece piece = new Xomw_prepro_piece(Byte_ascii.Nl_bry, Byte_ascii.Nl_bry, count, i, false);
|
||||
piece.Add_part(Bry_.Repeat(Byte_ascii.Eq, count));
|
||||
stack.Push(piece);
|
||||
accum = stack.Get_accum();
|
||||
Xomw_prepro_flags flags = stack.Get_flags();
|
||||
find_pipe = flags.Find_pipe;
|
||||
find_equals = flags.Find_eq;
|
||||
in_heading = flags.In_heading;
|
||||
i += count;
|
||||
}
|
||||
}
|
||||
else if (found == Found__line_end) {
|
||||
Xomw_prepro_piece piece = stack.top;
|
||||
// A heading must be open, otherwise \n wouldn't have been in the search list
|
||||
if (!Bry_.Eq(piece.open, Byte_ascii.Nl_bry)) throw Err_.new_wo_type("assertion:piece must start with \\n");
|
||||
Xomw_prepro_part part = piece.Get_current_part();
|
||||
|
||||
// Search back through the input to see if it has a proper close.
|
||||
// Do this using the reversed String since the other solutions
|
||||
// (end anchor, etc.) are inefficient.
|
||||
int ws_len = Php_str_.Strspn_bwd__space_or_tab(src, src_len - i, -1);
|
||||
int search_bgn = i - ws_len;
|
||||
|
||||
if (part.comment_end != -1 && search_bgn -1 == part.comment_end) {
|
||||
// Comment found at line end
|
||||
// Search for equals signs before the comment
|
||||
search_bgn = part.visual_end;
|
||||
search_bgn = Bry_find_.Find_bwd__while_space_or_tab(src, search_bgn, 0);
|
||||
search_bgn -= Php_str_.Strspn_bwd__space_or_tab(src, search_bgn, -1);
|
||||
}
|
||||
int count = piece.count;
|
||||
int eq_len = Php_str_.Strspn_bwd__byte(src, Byte_ascii.Eq, search_bgn, -1);
|
||||
|
||||
byte[] element = Bry_.Empty;
|
||||
if (eq_len > 0) {
|
||||
if (search_bgn - eq_len == piece.start_pos) {
|
||||
// This is just a single String of equals signs on its own line
|
||||
// Replicate the doHeadings behavior /={count}(.+)={count}/
|
||||
// First find out how many equals signs there really are (don't stop at 6)
|
||||
count = eq_len;
|
||||
if (count < 3) {
|
||||
count = 0;
|
||||
}
|
||||
else {
|
||||
count = (count - 1) / 2;
|
||||
if (count > 6) count = 6;
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (eq_len < count) count = eq_len; // PORTED: $count = min( $equalsLength, $count );
|
||||
}
|
||||
if (count > 0) {
|
||||
// Normal match, output <h>
|
||||
element = tmp_bfr.Add_str_a7("<h level=\"").Add_int_variable(count).Add_str_a7("\" i=\"").Add_int_variable(heading_index).Add_str_a7("\">").Add_bfr_and_preserve(accum).Add_str_a7("</h>").To_bry_and_clear();
|
||||
heading_index++;
|
||||
} else {
|
||||
// Single equals sign on its own line, count=0
|
||||
element = accum.To_bry();
|
||||
}
|
||||
}
|
||||
else {
|
||||
// No match, no <h>, just pass down the inner src
|
||||
element = accum.To_bry();
|
||||
}
|
||||
|
||||
// Unwind the stack
|
||||
stack.Pop();
|
||||
accum = stack.Get_accum();
|
||||
|
||||
Xomw_prepro_flags flags = stack.Get_flags();
|
||||
find_pipe = flags.Find_pipe;
|
||||
find_equals = flags.Find_eq;
|
||||
in_heading = flags.In_heading;
|
||||
|
||||
// Append the result to the enclosing accumulator
|
||||
accum.Add(element);
|
||||
// Note that we do NOT increment the input pointer.
|
||||
// This is because the closing linebreak could be the opening linebreak of
|
||||
// another heading. Infinite loops are avoided because the next iteration MUST
|
||||
// hit the heading open case above, which unconditionally increments the
|
||||
// input pointer.
|
||||
}
|
||||
else if (found == Found__open) {
|
||||
// count opening brace characters
|
||||
int count = Php_str_.Strspn_fwd__byte(src, cur_char[0], i, -1, src_len); // NOTE: don't know how MediaWiki will handle "-{"
|
||||
|
||||
// we need to add to stack only if opening brace count is enough for one of the rules
|
||||
if (count >= rule.min) {
|
||||
// Add it to the stack
|
||||
Xomw_prepro_piece piece = new Xomw_prepro_piece(cur_char, rule.end, count, -1, i > 0 && src[i - 1] == Byte_ascii.Nl);
|
||||
|
||||
stack.Push(piece);
|
||||
accum = stack.Get_accum();
|
||||
Xomw_prepro_flags flags = stack.Get_flags();
|
||||
find_pipe = flags.Find_pipe;
|
||||
find_equals = flags.Find_eq;
|
||||
in_heading = flags.In_heading;
|
||||
}
|
||||
else {
|
||||
// Add literal brace(s)
|
||||
for (int j = 0; j < count; j++)
|
||||
accum.Add_bry_escape_html(cur_char);
|
||||
}
|
||||
i += count;
|
||||
}
|
||||
else if (found == Found__close) {
|
||||
Xomw_prepro_piece piece = stack.top;
|
||||
// lets check if there are enough characters for closing brace
|
||||
int max_count = piece.count;
|
||||
int count = Php_str_.Strspn_fwd__byte(src, cur_char[0], i, max_count, src_len);
|
||||
|
||||
// check for maximum matching characters (if there are 5 closing characters, we will probably need only 3 - depending on the rules)
|
||||
rule = Get_rule(piece.open);
|
||||
int matching_count = -1;
|
||||
if (count > rule.max) {
|
||||
// The specified maximum exists in the callback array, unless the caller
|
||||
// has made an error
|
||||
matching_count = rule.max;
|
||||
}
|
||||
else {
|
||||
// Count is less than the maximum
|
||||
// Skip any gaps in the callback array to find the true largest match
|
||||
// Need to use array_key_exists not isset because the callback can be null
|
||||
matching_count = count;
|
||||
while (matching_count > 0 && !rule.Names_exist(matching_count)) {
|
||||
matching_count--;
|
||||
}
|
||||
}
|
||||
|
||||
if (matching_count <= 0) {
|
||||
// No matching element found in callback array
|
||||
// Output a literal closing brace and continue
|
||||
for (int j = 0; j < count; j++)
|
||||
accum.Add_bry_escape_html(cur_char);
|
||||
i += count;
|
||||
continue;
|
||||
}
|
||||
int name_type = rule.names[matching_count];
|
||||
byte[] element = null;
|
||||
if (name_type == Xomw_prepro_rule.Name__null) {
|
||||
// No element, just literal text
|
||||
tmp_bfr.Add(piece.Break_syntax(tmp_bfr, matching_count));
|
||||
element = tmp_bfr.Add(Bry_.Repeat_bry(rule.end, matching_count)).To_bry_and_clear();
|
||||
}
|
||||
else {
|
||||
// Create XML element
|
||||
// Note: $parts is already XML, does not need to be encoded further
|
||||
List_adp parts = piece.parts;
|
||||
byte[] title = ((Xomw_prepro_part)parts.Get_at(0)).bfr.To_bry_and_clear();
|
||||
parts.Del_at(0);
|
||||
|
||||
// The invocation is at the start of the line if lineStart is set in
|
||||
// the stack, and all opening brackets are used up.
|
||||
byte[] attr = null;
|
||||
if (max_count == matching_count && piece.line_start) { // RELIC:!empty( $piece->lineStart )
|
||||
attr = Bry_.new_a7(" lineStart=\"1\"");
|
||||
}
|
||||
else {
|
||||
attr = Bry_.Empty;
|
||||
}
|
||||
|
||||
byte[] name_bry = Xomw_prepro_rule.Name(name_type);
|
||||
tmp_bfr.Add_str_a7("<").Add(name_bry).Add(attr).Add_str_a7(">");
|
||||
tmp_bfr.Add_str_a7("<title>").Add(title).Add_str_a7("</title>");
|
||||
|
||||
int arg_idx = 1;
|
||||
int parts_len = parts.Len();
|
||||
for (int j = 0; j < parts_len; j++) {
|
||||
Xomw_prepro_part part = (Xomw_prepro_part)parts.Get_at(j);
|
||||
if (part.Eqpos != -1) {
|
||||
Bry_bfr part_bfr = part.bfr;
|
||||
byte[] part_bfr_bry = part_bfr.Bfr();
|
||||
tmp_bfr.Add_str_a7("<part><name>").Add_mid(part_bfr_bry, 0, part.Eqpos);
|
||||
tmp_bfr.Add_str_a7("</name>=<value>").Add_mid(part_bfr_bry, part.Eqpos + 1, part_bfr.Len());
|
||||
tmp_bfr.Add_str_a7("</value></part>");
|
||||
}
|
||||
else {
|
||||
tmp_bfr.Add_str_a7("<part><name index=\"").Add_int_variable(arg_idx).Add_str_a7("\" /><value>").Add(part.bfr.To_bry()).Add_str_a7("</value></part>");
|
||||
arg_idx++;
|
||||
}
|
||||
}
|
||||
element = tmp_bfr.Add_str_a7("</").Add(name_bry).Add_str_a7(">").To_bry_and_clear();
|
||||
}
|
||||
|
||||
// Advance input pointer
|
||||
i += matching_count;
|
||||
|
||||
// Unwind the stack
|
||||
stack.Pop();
|
||||
accum = stack.Get_accum();
|
||||
|
||||
// Re-add the old stack element if it still has unmatched opening characters remaining
|
||||
if (matching_count < piece.count) {
|
||||
piece.Parts__renew(); // PORTED: piece.parts = [ new PPDPart ];
|
||||
piece.count -= matching_count;
|
||||
|
||||
// do we still qualify for any callback with remaining count?
|
||||
int min = Get_rule(piece.open).min;
|
||||
if (piece.count >= min) {
|
||||
stack.Push(piece);
|
||||
accum = stack.Get_accum();
|
||||
}
|
||||
else {
|
||||
accum.Add(Bry_.Repeat_bry(piece.open, piece.count));
|
||||
}
|
||||
}
|
||||
|
||||
Xomw_prepro_flags flags = stack.Get_flags();
|
||||
find_pipe = flags.Find_pipe;
|
||||
find_equals = flags.Find_eq;
|
||||
in_heading = flags.In_heading;
|
||||
|
||||
// Add XML element to the enclosing accumulator
|
||||
accum.Add(element);
|
||||
}
|
||||
else if (found == Found__pipe) {
|
||||
find_equals = true; // shortcut for getFlags()
|
||||
stack.Add_part(Bry_.Empty);
|
||||
accum = stack.Get_accum();
|
||||
i++;
|
||||
}
|
||||
else if (found == Found__equals) {
|
||||
find_equals = false; // shortcut for getFlags()
|
||||
stack.Get_current_part().Eqpos = accum.Len();
|
||||
accum.Add_byte(Byte_ascii.Eq);
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
// Output any remaining unclosed brackets
|
||||
Bry_bfr root_accum = stack.Get_root_accum();
|
||||
int stack_len = stack.stack.Len();
|
||||
for (int j = 0; j < stack_len; j++) {
|
||||
Xomw_prepro_piece piece = (Xomw_prepro_piece)stack.stack.Get_at(j);
|
||||
root_accum.Add(piece.Break_syntax(tmp_bfr, -1));
|
||||
}
|
||||
root_accum.Add_str_a7("</root>");
|
||||
return root_accum.To_bry_and_clear();
|
||||
}
|
||||
private Xomw_prepro_rule Get_rule(byte[] bry) {
|
||||
if (Bry_.Eq(bry, rule_curly.bgn)) return rule_curly;
|
||||
else if (Bry_.Eq(bry, rule_brack.bgn)) return rule_brack;
|
||||
else if (Bry_.Eq(bry, rule_langv.bgn)) return rule_langv;
|
||||
else throw Err_.new_unhandled(bry);
|
||||
}
|
||||
private static final int
|
||||
Found__line_bgn = 0
|
||||
, Found__line_end = 1
|
||||
, Found__pipe = 2
|
||||
, Found__equals = 3
|
||||
, Found__angle = 4
|
||||
, Found__close = 5
|
||||
, Found__open = 6
|
||||
;
|
||||
private static final Xomw_prepro_rule
|
||||
rule_curly = new Xomw_prepro_rule(Bry_.new_a7("{"), Bry_.new_a7("}") , 2, 3, new int[] {Xomw_prepro_rule.Name__invalid, Xomw_prepro_rule.Name__invalid, Xomw_prepro_rule.Name__tmpl, Xomw_prepro_rule.Name__targ})
|
||||
, rule_brack = new Xomw_prepro_rule(Bry_.new_a7("["), Bry_.new_a7("]") , 2, 2, new int[] {Xomw_prepro_rule.Name__invalid, Xomw_prepro_rule.Name__invalid, Xomw_prepro_rule.Name__null})
|
||||
, rule_langv = new Xomw_prepro_rule(Bry_.new_a7("-{"), Bry_.new_a7("}-"), 1, 1, new int[] {Xomw_prepro_rule.Name__invalid, Xomw_prepro_rule.Name__null})
|
||||
;
|
||||
private static final byte[]
|
||||
Bry__only_include_bgn = Bry_.new_a7("<onlyinclude>")
|
||||
, Bry__only_include_end = Bry_.new_a7("</onlyinclude>")
|
||||
, Bry__comment_bgn = Bry_.new_a7("<!--")
|
||||
, Bry__comment_end = Bry_.new_a7("-->")
|
||||
, Bry__escaped_lt = Bry_.new_a7("<")
|
||||
, Bry__end_lhs = Bry_.new_a7("</")
|
||||
;
|
||||
private static final int Len__only_include_end = Bry__only_include_end.length;
|
||||
private static final Btrie_slim_mgr cur_char_trie = Cur_char_trie__new();
|
||||
private static final Ordered_hash
|
||||
ignored_tags_y = Ordered_hash_.New_bry().Add_many_str("includeonly", "/includeonly")
|
||||
, ignored_tags_n = Ordered_hash_.New_bry().Add_many_str("noinclude", "/noinclude", "onlyinclude", "/onlyinclude");
|
||||
private static final Hash_adp_bry
|
||||
ignored_elements__y = Hash_adp_bry.cs().Add_many_str("noinclude")
|
||||
, ignored_elements__n = Hash_adp_bry.cs().Add_many_str("includeonly");
|
||||
private static Btrie_slim_mgr Cur_char_trie__new() {
|
||||
Btrie_slim_mgr rv = Btrie_slim_mgr.ci_a7();
|
||||
String[] ary = new String[] {"|", "=", "<", "\n", "{", "[", "-{", "}", "]"};
|
||||
for (String str : ary) {
|
||||
byte[] bry = Bry_.new_a7(str);
|
||||
rv.Add_obj(bry, new Xomw_prepro_curchar_itm(bry, bry[0]));
|
||||
}
|
||||
|
||||
// handle "}-" separately
|
||||
byte[] langv_end = Bry_.new_a7("}-");
|
||||
rv.Add_obj(langv_end, new Xomw_prepro_curchar_itm(langv_end, Byte_ascii.At));
|
||||
return rv;
|
||||
}
|
||||
}
|
||||
@@ -1,232 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.mws.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
|
||||
import org.junit.*;
|
||||
public class Xomw_prepro_wkr__tst {
|
||||
private final Xomw_prepro_wkr__fxt fxt = new Xomw_prepro_wkr__fxt();
|
||||
@Test public void Text() {
|
||||
fxt.Test__parse("abc", "<root>abc</root>");
|
||||
}
|
||||
@Test public void Brack() {
|
||||
fxt.Test__parse("a[[b]]c", "<root>a[[b]]c</root>");
|
||||
}
|
||||
@Test public void Brack__one() { // COVERS: "Add literal brace(s)"
|
||||
fxt.Test__parse("a[b]c", "<root>a[b]c</root>");
|
||||
}
|
||||
@Test public void Brack__max() { // COVERS: "The specified maximum exists in the callback array, unless the caller"
|
||||
fxt.Test__parse("a[[[[[b]]]]]c", "<root>a[[[[[b]]]]]c</root>");
|
||||
}
|
||||
@Test public void Template() {
|
||||
fxt.Test__parse("a{{b}}c", "<root>a<template><title>b</title></template>c</root>");
|
||||
}
|
||||
@Test public void Template__args__idx() {
|
||||
fxt.Test__parse("a{{b|c|d}}e", "<root>a<template><title>b</title><part><name index=\"1\" /><value>c</value></part><part><name index=\"2\" /><value>d</value></part></template>e</root>");
|
||||
}
|
||||
@Test public void Template__args__key() {
|
||||
fxt.Test__parse("a{{b|c=d}}e", "<root>a<template><title>b</title><part><name>c</name>=<value>d</value></part></template>e</root>");
|
||||
}
|
||||
@Test public void Template__line_start() { // COVERS: "The invocation is at the start of the line if lineStart is set in"
|
||||
fxt.Test__parse(String_.Concat_lines_nl_skip_last
|
||||
( "a"
|
||||
, "{{b}}"
|
||||
), String_.Concat_lines_nl_skip_last
|
||||
( "<root>a"
|
||||
, "<template lineStart=\"1\"><title>b</title></template></root>"
|
||||
));
|
||||
}
|
||||
@Test public void Template__max() { // COVERS: "do we still qualify for any callback with remaining count?"
|
||||
fxt.Test__parse("a{{{{{b}}}}}c", "<root>a<template><title><tplarg><title>b</title></tplarg></title></template>c</root>");
|
||||
}
|
||||
@Test public void Tplarg() {
|
||||
fxt.Test__parse("a{{{b}}}c", "<root>a<tplarg><title>b</title></tplarg>c</root>");
|
||||
}
|
||||
@Test public void Comment() {
|
||||
fxt.Test__parse("a<!--b-->c", "<root>a<comment><!--b--></comment>c</root>");
|
||||
}
|
||||
@Test public void Comment__dangling() {// COVERS: "Unclosed comment in input, runs to end"
|
||||
fxt.Test__parse("a<!--b", "<root>a<comment><!--b</comment></root>");
|
||||
}
|
||||
@Test public void Comment__ws() { // COVERS: "Search backwards for leading whitespace"
|
||||
fxt.Test__parse("a <!--b--> c", "<root>a <comment><!--b--></comment> c</root>"); // NOTE: space is outside comment
|
||||
}
|
||||
@Test public void Comment__many__ws() {// COVERS: "Dump all but the last comment to the accumulator"
|
||||
fxt.Test__parse("a <!--1--> <!--2--> z", "<root>a <comment><!--1--></comment> <comment><!--2--></comment> z</root>"); // NOTE: space is outside comment;
|
||||
}
|
||||
@Test public void Comment__nl__ws() { // COVERS: "Eat the line if possible"
|
||||
fxt.Test__parse(String_.Concat_lines_nl_skip_last
|
||||
( "a"
|
||||
, " <!--1--> "
|
||||
, " <!--2--> "
|
||||
, "z"
|
||||
), String_.Concat_lines_nl_skip_last
|
||||
( "<root>a"
|
||||
, "<comment> <!--1--> " // NOTE: space is inside </comment> if flanked by nl;
|
||||
, "</comment><comment> <!--2--> "
|
||||
, "</comment>z</root>"
|
||||
));
|
||||
}
|
||||
@Test public void Ext() { // COVERS.ALSO: "Note that the attr element contains the whitespace between name and attribute,"
|
||||
fxt.Test__parse("a<pre id=\"1\">b</pre>c", "<root>a<ext><name>pre</name><attr> id="1"</attr><inner>b</inner><close></pre></close></ext>c</root>");
|
||||
}
|
||||
@Test public void Ext__inline() { // COVERS: "if ( $text[$tagEndPos - 1] == '/' ) {"
|
||||
fxt.Test__parse("a<pre/>b" , "<root>a<ext><name>pre</name><attr></attr></ext>b</root>");
|
||||
fxt.Test__parse("a<pre />b" , "<root>a<ext><name>pre</name><attr> </attr></ext>b</root>");
|
||||
}
|
||||
@Test public void Ext__end__pass__space() {// COVERS: "\s*" in `preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",`
|
||||
fxt.Test__parse("a<pre>b</pre >c", "<root>a<ext><name>pre</name><attr></attr><inner>b</inner><close></pre ></close></ext>c</root>");
|
||||
}
|
||||
@Test public void Ext__end__pass__name() { // COVERS: "\s*" in `preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",`
|
||||
fxt.Test__parse("a<pre>b</pro></pre>c", "<root>a<ext><name>pre</name><attr></attr><inner>b</pro></inner><close></pre></close></ext>c</root>");
|
||||
}
|
||||
@Test public void Ext__end__fail__angle() {// COVERS: "\s*" in `preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",`
|
||||
fxt.Test__parse("a<pre>b</pre c", "<root>a<pre>b</pre c</root>");
|
||||
}
|
||||
@Test public void Ext__dangling() { // COVERS: "Let it run out to the end of the text."
|
||||
fxt.Test__parse("a<pre>bc", "<root>a<pre>bc</root>");
|
||||
}
|
||||
@Test public void Ext__dangling__many() { // COVERS: "Cache results, otherwise we have O(N^2) performance for input like <foo><foo><foo>..."
|
||||
fxt.Test__parse("a<pre><pre><pre>bc", "<root>a<pre><pre><pre>bc</root>");
|
||||
}
|
||||
@Test public void Ext__unclosed() { // COVERS: "Infinite backtrack"
|
||||
fxt.Test__parse("a<pre bcd", "<root>a<pre bcd</root>");
|
||||
}
|
||||
@Test public void Ext__noinclude() { // COVERS: "<includeonly> and <noinclude> just become <ignore> tags"
|
||||
fxt.Init__for_inclusion_(Bool_.N);
|
||||
fxt.Test__parse("a<includeonly>b<noinclude>c</noinclude>d</includeonly>e", "<root>a<ignore><includeonly>b<noinclude>c</noinclude>d</includeonly></ignore>e</root>");
|
||||
}
|
||||
@Test public void Heading() {
|
||||
fxt.Test__parse(String_.Concat_lines_nl_skip_last
|
||||
( "a"
|
||||
, "== b1 =="
|
||||
, "z"
|
||||
), String_.Concat_lines_nl_skip_last
|
||||
( "<root>a"
|
||||
, "<h level=\"2\" i=\"1\">== b1 ==</h>"
|
||||
, "z</root>"
|
||||
));
|
||||
}
|
||||
@Test public void Heading__eos__no_nl() {
|
||||
fxt.Test__parse(String_.Concat_lines_nl_skip_last
|
||||
( "a"
|
||||
, "== b1 =="
|
||||
), String_.Concat_lines_nl_skip_last
|
||||
( "<root>a"
|
||||
, "<h level=\"2\" i=\"1\">== b1 ==</h></root>"
|
||||
));
|
||||
}
|
||||
@Test public void Heading__bos__implied_nl() { // COVERS: "Is this the start of a heading?"
|
||||
fxt.Test__parse(String_.Concat_lines_nl_skip_last
|
||||
( "== b1 =="
|
||||
, "z"
|
||||
), String_.Concat_lines_nl_skip_last
|
||||
( "<root><h level=\"2\" i=\"1\">== b1 ==</h>"
|
||||
, "z</root>"
|
||||
));
|
||||
}
|
||||
@Test public void Heading__dwim__y() { // COVERS: "DWIM: This looks kind of like a name/value separator."
|
||||
fxt.Test__parse(String_.Concat_lines_nl_skip_last
|
||||
( "a{{b|"
|
||||
, "=c="
|
||||
, "}}d"
|
||||
), String_.Concat_lines_nl_skip_last
|
||||
( "<root>a<template><title>b</title><part><name>"
|
||||
, "</name>=<value>c="
|
||||
, "</value></part></template>d</root>"
|
||||
));
|
||||
}
|
||||
@Test public void Heading__dwim__n() { // COVERS: "DWIM: This looks kind of like a name/value separator."
|
||||
fxt.Test__parse(String_.Concat_lines_nl_skip_last
|
||||
( "a{{b|"
|
||||
, "==c=="
|
||||
, "}}d"
|
||||
), String_.Concat_lines_nl_skip_last
|
||||
( "<root>a<template><title>b</title><part><name index=\"1\" /><value>"
|
||||
, "<h level=\"2\" i=\"1\">==c==</h>"
|
||||
, "</value></part></template>d</root>"
|
||||
));
|
||||
}
|
||||
@Test public void Heading__comment() { // COVERS: "Comment found at line end"
|
||||
fxt.Test__parse(String_.Concat_lines_nl_skip_last
|
||||
( "a"
|
||||
, "==b== <!--c-->"
|
||||
, ""
|
||||
), String_.Concat_lines_nl_skip_last
|
||||
( "<root>a"
|
||||
, "<h level=\"2\" i=\"1\">==b== <comment><!--c--></comment></h>"
|
||||
, "</root>"
|
||||
));
|
||||
}
|
||||
@Test public void Heading__consecutive__5() { // COVERS: "This is just a single String of equals signs on its own line"
|
||||
fxt.Test__parse(String_.Concat_lines_nl_skip_last
|
||||
( "a"
|
||||
, "====="
|
||||
, ""
|
||||
), String_.Concat_lines_nl_skip_last
|
||||
( "<root>a"
|
||||
, "<h level=\"2\" i=\"1\">=====</h>"
|
||||
, "</root>"
|
||||
));
|
||||
}
|
||||
@Test public void Heading__consecutive__1() { // COVERS: "Single equals sign on its own line, count=0"
|
||||
fxt.Test__parse(String_.Concat_lines_nl_skip_last
|
||||
( "a"
|
||||
, "="
|
||||
, ""
|
||||
), String_.Concat_lines_nl_skip_last
|
||||
( "<root>a"
|
||||
, "="
|
||||
, "</root>"
|
||||
));
|
||||
}
|
||||
@Test public void Heading__unclosed() { // COVERS: "No match, no <h>, just pass down the inner src"
|
||||
fxt.Test__parse(String_.Concat_lines_nl_skip_last
|
||||
( "a"
|
||||
, "===b"
|
||||
, ""
|
||||
), String_.Concat_lines_nl_skip_last
|
||||
( "<root>a"
|
||||
, "===b"
|
||||
, "</root>"
|
||||
));
|
||||
}
|
||||
@Test public void Inclusion__n() {
|
||||
fxt.Init__for_inclusion_(Bool_.N);
|
||||
fxt.Test__parse("a<onlyinclude>b</onlyinclude>c", "<root>a<ignore><onlyinclude></ignore>b<ignore></onlyinclude></ignore>c</root>");
|
||||
}
|
||||
@Test public void Inclusion__y() {
|
||||
fxt.Init__for_inclusion_(Bool_.Y);
|
||||
fxt.Test__parse("a<onlyinclude>b</onlyinclude>c", "<root><ignore>a<onlyinclude></ignore>b<ignore></onlyinclude>c</ignore></root>");
|
||||
}
|
||||
@Test public void Ignored__noinclude() { // COVERS: "Handle ignored tags"
|
||||
fxt.Init__for_inclusion_(Bool_.N);
|
||||
fxt.Test__parse("a<noinclude>b</noinclude>c", "<root>a<ignore><noinclude></ignore>b<ignore></noinclude></ignore>c</root>");
|
||||
}
|
||||
}
|
||||
class Xomw_prepro_wkr__fxt {
|
||||
private final Xomw_prepro_wkr wkr = new Xomw_prepro_wkr();
|
||||
private boolean for_inclusion = false;
|
||||
public Xomw_prepro_wkr__fxt() {
|
||||
wkr.Init_by_wiki("pre");
|
||||
}
|
||||
public void Init__for_inclusion_(boolean v) {for_inclusion = v;}
|
||||
public void Test__parse(String src_str, String expd) {
|
||||
byte[] src_bry = Bry_.new_u8(src_str);
|
||||
byte[] actl = wkr.Preprocess_to_xml(src_bry, for_inclusion);
|
||||
Tfds.Eq_str_lines(expd, String_.new_u8(actl), src_str);
|
||||
}
|
||||
}
|
||||
@@ -1,239 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.mws.quotes; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
|
||||
import gplx.langs.phps.utls.*;
|
||||
import gplx.xowa.parsers.htmls.*;
|
||||
import gplx.core.primitives.*;
|
||||
public class Xomw_quote_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.UNSAFE: caching for repeated calls
|
||||
private final Bry_bfr bfr = Bry_bfr_.New();
|
||||
private final Bry_bfr tmp = Bry_bfr_.New();
|
||||
private final Int_list apos_pos_ary = new Int_list(32);
|
||||
public byte[] Do_all_quotes(byte[] src) {
|
||||
Bry_split_.Split(src, 0, src.length, Byte_ascii.Nl, Bool_.N, this); // PORTED.SPLIT: $lines = StringUtils::explode( "\n", $text );
|
||||
bfr.Del_by_1(); // REF.MW: $outtext = substr( $outtext, 0, -1 );
|
||||
apos_pos_ary.Clear();
|
||||
return bfr.To_bry_and_clear();
|
||||
}
|
||||
private static final byte[] Wtxt__apos = Bry_.new_a7("''");
|
||||
public int Split(byte[] src, int itm_bgn, int itm_end) {
|
||||
byte[][] arr = Php_preg_.Split(apos_pos_ary, src, itm_bgn, itm_end, Wtxt__apos, Bool_.Y); // PORTED.REGX: arr = preg_split("/(''+)/", text, -1, PREG_SPLIT_DELIM_CAPTURE);
|
||||
if (arr == null) {
|
||||
bfr.Add_mid(src, itm_bgn, itm_end).Add_byte_nl();
|
||||
return Bry_split_.Rv__ok;
|
||||
}
|
||||
int arr_len = arr.length;
|
||||
|
||||
// First, do some preliminary work. This may shift some apostrophes from
|
||||
// being mark-up to being text. It also counts the number of occurrences
|
||||
// of bold and italics mark-ups.
|
||||
int num_bold = 0;
|
||||
int num_italics = 0;
|
||||
for (int i = 1; i < arr_len; i += 2) {
|
||||
int apos_len = arr[i].length;
|
||||
// If there are ever four apostrophes, assume the first is supposed to
|
||||
// be text, and the remaining three constitute mark-up for bold text.
|
||||
// (bug 13227: ''''foo'''' turns into ' ''' foo ' ''')
|
||||
if (apos_len == 4) {
|
||||
arr[i - 1] = Bry_.Add(arr[i - 1], Byte_ascii.Apos_bry);
|
||||
arr[i] = Bry_.new_a7("'''");
|
||||
apos_len = 3;
|
||||
}
|
||||
else if (apos_len > 5) {
|
||||
// If there are more than 5 apostrophes in a row, assume they're all
|
||||
// text except for the last 5.
|
||||
// (bug 13227: ''''''foo'''''' turns into ' ''''' foo ' ''''')
|
||||
arr[i - 1] = Bry_.Add(arr[i - 1], Bry_.Repeat(Byte_ascii.Apos, apos_len - 5));
|
||||
arr[i] = Bry_.new_a7("'''''");
|
||||
apos_len = 5;
|
||||
}
|
||||
// Count the number of occurrences of bold and italics mark-ups.
|
||||
if (apos_len == 2) {
|
||||
num_italics++;
|
||||
}
|
||||
else if (apos_len == 3) {
|
||||
num_bold++;
|
||||
}
|
||||
else if (apos_len == 5) {
|
||||
num_italics++;
|
||||
num_bold++;
|
||||
}
|
||||
}
|
||||
|
||||
// If there is an odd number of both bold and italics, it is likely
|
||||
// that one of the bold ones was meant to be an apostrophe followed
|
||||
// by italics. Which one we cannot know for certain, but it is more
|
||||
// likely to be one that has a single-letter word before it.
|
||||
// NOTE: this code primarily handles italicized possessives; EX: The ''[[Main Page]]'''s talk page.
|
||||
if ((num_bold % 2 == 1) && (num_italics % 2 == 1)) {
|
||||
int prv_ends_w_word_1char = -1;
|
||||
int prv_ends_w_word_nchar = -1;
|
||||
int prv_ends_w_space = -1;
|
||||
for (int i = 1; i < arr_len; i += 2) {
|
||||
if (arr[i].length == 3) {
|
||||
byte[] prv = arr[i - 1];
|
||||
byte prv__last_char = Php_str_.Substr_byte(prv, -1);
|
||||
byte prv__last_minus_1_char = Php_str_.Substr_byte(prv, -2, 1);
|
||||
if (prv__last_char == Byte_ascii.Space) { // NOTE: prv ends in space; EX: "''prv '''"
|
||||
if (prv_ends_w_space == -1) {
|
||||
prv_ends_w_space = i;
|
||||
}
|
||||
}
|
||||
else if (prv__last_minus_1_char == Byte_ascii.Space) { // NOTE: prv ends in 1-char word; EX: "''prv a'''"
|
||||
prv_ends_w_word_1char = i;
|
||||
// if $firstsingleletterword is set, we don't
|
||||
// look at the other options, so we can bail early.
|
||||
break;
|
||||
}
|
||||
else {
|
||||
if (prv_ends_w_word_nchar == -1) {
|
||||
prv_ends_w_word_nchar = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If there is a single-letter word, use it!
|
||||
if (prv_ends_w_word_1char > -1) {
|
||||
arr[prv_ends_w_word_1char] = Wtxt__apos;
|
||||
arr[prv_ends_w_word_1char - 1] = Bry_.Add(arr[prv_ends_w_word_1char - 1], Byte_ascii.Apos);
|
||||
}
|
||||
else if (prv_ends_w_word_nchar > -1) {
|
||||
// If not, but there's a multi-letter word, use that one.
|
||||
arr[prv_ends_w_word_nchar] = Wtxt__apos;
|
||||
arr[prv_ends_w_word_nchar - 1] = Bry_.Add(arr[prv_ends_w_word_nchar - 1], Byte_ascii.Apos);
|
||||
}
|
||||
else if (prv_ends_w_space > -1) {
|
||||
// ... otherwise use the first one that has neither.
|
||||
// (notice that it is possible for all three to be -1 if, for example,
|
||||
// there is only one pentuple-apostrophe in the line)
|
||||
arr[prv_ends_w_space] = Wtxt__apos;
|
||||
arr[prv_ends_w_space - 1] = Bry_.Add(arr[prv_ends_w_space - 1], Byte_ascii.Apos);
|
||||
}
|
||||
}
|
||||
|
||||
// Now let's actually convert our apostrophic mush to HTML!
|
||||
int state = State__empty;
|
||||
for (int j = 0; j < arr_len; j++) {
|
||||
if ((j % 2) == 0) {
|
||||
if (state == State__both) {
|
||||
tmp.Add(arr[j]);
|
||||
}
|
||||
else {
|
||||
bfr.Add(arr[j]);
|
||||
}
|
||||
}
|
||||
else {
|
||||
int apos_len = arr[j].length;
|
||||
if (apos_len == 2) {
|
||||
if (state == State__i) {
|
||||
bfr.Add_str_a7("</i>");
|
||||
state = State__empty;
|
||||
}
|
||||
else if (state == State__bi) {
|
||||
bfr.Add_str_a7("</i>");
|
||||
state = State__b;
|
||||
}
|
||||
else if (state == State__ib) {
|
||||
bfr.Add_str_a7("</b></i><b>");
|
||||
state = State__b;
|
||||
}
|
||||
else if (state == State__both) {
|
||||
bfr.Add_str_a7("<b><i>").Add_bfr_and_preserve(tmp).Add_str_a7("</i>");
|
||||
state = State__b;
|
||||
}
|
||||
else { // state can be 'b' or ''
|
||||
bfr.Add_str_a7("<i>");
|
||||
state = state == State__b ? State__bi : State__i;
|
||||
}
|
||||
}
|
||||
else if (apos_len == 3) {
|
||||
if (state == State__b) {
|
||||
bfr.Add_str_a7("</b>");
|
||||
state = State__empty;
|
||||
}
|
||||
else if (state == State__bi) {
|
||||
bfr.Add_str_a7("</i></b><i>");
|
||||
state = State__i;
|
||||
}
|
||||
else if (state == State__ib) {
|
||||
bfr.Add_str_a7("</b>");
|
||||
state = State__i;
|
||||
}
|
||||
else if (state == State__both) {
|
||||
bfr.Add_str_a7("<i><b>").Add_bfr_and_preserve(tmp).Add_str_a7("</b>");
|
||||
state = State__i;
|
||||
}
|
||||
else { // state can be 'i' or ''
|
||||
bfr.Add_str_a7("<b>");
|
||||
state = state == State__i ? State__ib : State__b;
|
||||
}
|
||||
}
|
||||
else if (apos_len == 5) {
|
||||
if (state == State__b) {
|
||||
bfr.Add_str_a7("</b><i>");
|
||||
state = State__i;
|
||||
}
|
||||
else if (state == State__i) {
|
||||
bfr.Add_str_a7("</i><b>");
|
||||
state = State__b;
|
||||
}
|
||||
else if (state == State__bi) {
|
||||
bfr.Add_str_a7("</i></b>");
|
||||
state = State__empty;
|
||||
}
|
||||
else if (state == State__ib) {
|
||||
bfr.Add_str_a7("</b></i>");
|
||||
state = State__empty;
|
||||
}
|
||||
else if (state == State__both) {
|
||||
bfr.Add_str_a7("<i><b>").Add_bfr_and_preserve(tmp).Add_str_a7("</b></i>");
|
||||
state = State__empty;
|
||||
}
|
||||
else { // (state == '')
|
||||
tmp.Clear();
|
||||
state = State__both;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Now close all remaining tags. Notice that the order is important.
|
||||
if (state == State__b || state == State__ib) {
|
||||
bfr.Add_str_a7("</b>");
|
||||
}
|
||||
if (state == State__i || state == State__bi || state == State__ib) {
|
||||
bfr.Add_str_a7("</i>");
|
||||
}
|
||||
if (state == State__bi) {
|
||||
bfr.Add_str_a7("</b>");
|
||||
}
|
||||
// There might be lonely ''''', so make sure we have a buffer
|
||||
if (state == State__both && tmp.Len_gt_0()) {
|
||||
bfr.Add_str_a7("<b><i>").Add_bfr_and_clear(tmp).Add_str_a7("</i></b>");
|
||||
}
|
||||
bfr.Add_byte_nl();
|
||||
return Bry_split_.Rv__ok;
|
||||
}
|
||||
private static final int
|
||||
State__empty = 0
|
||||
, State__b = 1
|
||||
, State__i = 2
|
||||
, State__bi = 3
|
||||
, State__ib = 4
|
||||
, State__both = 5
|
||||
;
|
||||
}
|
||||
@@ -1,43 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.mws.quotes; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
|
||||
import org.junit.*;
|
||||
public class Xomw_quote_wkr__tst {
|
||||
private final Xomw_quote_wkr__fxt fxt = new Xomw_quote_wkr__fxt();
|
||||
@Test public void Apos__0() {fxt.Test__parse("abc" , "abc");}
|
||||
@Test public void Apos__1() {fxt.Test__parse("a'b'c" , "a'b'c");}
|
||||
@Test public void Apos__2() {fxt.Test__parse("a''b''c" , "a<i>b</i>c");}
|
||||
@Test public void Apos__3() {fxt.Test__parse("a'''b'''c" , "a<b>b</b>c");}
|
||||
@Test public void Apos__4() {fxt.Test__parse("a''''b''''c" , "a'<b>b'</b>c");} // COVERS: "If there are ever four apostrophes"
|
||||
@Test public void Apos__5() {fxt.Test__parse("a'''''b'''''c" , "a<i><b>b</b></i>c");}
|
||||
@Test public void Apos__7() {fxt.Test__parse("a'''''''b'''''''c" , "a''<i><b>b''</b></i>c");} // COVERS: "If there are more than 5 apostrophes in a row"
|
||||
@Test public void Mix__single() {fxt.Test__parse("''a ''' ''b b''' ''cc'''" , "<i>a <b> </b></i><b>b b'<i> </i>cc</b>");} // COVERS: "If there is a single-letter word, use it!"
|
||||
@Test public void Mix__multi() {fxt.Test__parse("''a ''' ''b ''' ''cc'''" , "<i>a <b> </b></i><b>b </b> <i>cc'</i>");} // COVERS: "If not, but there's a multi-letter word, use that one."
|
||||
@Test public void Mix__space() {fxt.Test__parse("''a ''' ''b ''' ''c '''" , "<i>a '</i> <i>b <b> </b></i><b>c </b>");} // COVERS: "... otherwise use the first one that has neither."
|
||||
@Test public void Dangling__b() {fxt.Test__parse("a'''b" , "a<b>b</b>");} // COVERS: "if (state == State__b || state == State__ib)"
|
||||
@Test public void Dangling__i() {fxt.Test__parse("a''b" , "a<i>b</i>");} // COVERS: "if (state == State__i || state == State__bi || state == State__ib)"
|
||||
@Test public void Dangling__lone(){fxt.Test__parse("a'''''b" , "a<b><i>b</i></b>");} // COVERS: "There might be lonely ''''', so make sure we have a buffer"
|
||||
}
|
||||
class Xomw_quote_wkr__fxt {
|
||||
private final Xomw_quote_wkr wkr = new Xomw_quote_wkr();
|
||||
public void Test__parse(String src_str, String expd) {
|
||||
byte[] src_bry = Bry_.new_u8(src_str);
|
||||
byte[] actl = wkr.Do_all_quotes(src_bry);
|
||||
Tfds.Eq_str_lines(expd, String_.new_u8(actl), src_str);
|
||||
}
|
||||
}
|
||||
@@ -1,281 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.mws.tables; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
|
||||
import gplx.langs.phps.utls.*;
|
||||
import gplx.xowa.parsers.htmls.*;
|
||||
import gplx.xowa.parsers.mws.utils.*; import gplx.xowa.parsers.uniqs.*;
|
||||
public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.UNSAFE: caching for repeated calls
|
||||
private final Bry_bfr bfr = Bry_bfr_.New(), tmp_bfr = Bry_bfr_.New();
|
||||
private final List_adp
|
||||
td_history = List_adp_.New() // Is currently a td tag open?
|
||||
, last_tag_history = List_adp_.New() // Save history of last lag activated (td, th or caption)
|
||||
, tr_history = List_adp_.New() // Is currently a tr tag open?
|
||||
, tr_attributes = List_adp_.New() // history of tr attributes
|
||||
, has_opened_tr = List_adp_.New() // Did this table open a <tr> element?
|
||||
;
|
||||
private int indent_level = 0; // indent level of the table
|
||||
private byte[] first_2 = new byte[2];
|
||||
private Xomw_sanitizer_mgr sanitizer;
|
||||
private Xop_uniq_mgr uniq_mgr;
|
||||
public byte[] Do_table_stuff(Xomw_parser_ctx ctx, byte[] src) {
|
||||
this.sanitizer = ctx.Sanitizer();
|
||||
this.uniq_mgr = ctx.Uniq_mgr();
|
||||
indent_level = 0;
|
||||
|
||||
Bry_split_.Split(src, 0, src.length, Byte_ascii.Nl, Bool_.N, this); // PORTED.SPLIT: $lines = StringUtils::explode("\n", $text);
|
||||
|
||||
// Closing open td, tr && table
|
||||
while (td_history.Len() > 0) {
|
||||
if (Php_ary_.Pop_bool_or_n(td_history)) {
|
||||
bfr.Add_str_a7("</td>\n");
|
||||
}
|
||||
if (Php_ary_.Pop_bool_or_n(tr_history)) {
|
||||
bfr.Add_str_a7("</tr>\n");
|
||||
}
|
||||
if (!Php_ary_.Pop_bool_or_n(has_opened_tr)) {
|
||||
bfr.Add_str_a7("<tr><td></td></tr>\n");
|
||||
}
|
||||
bfr.Add_str_a7("</table>\n");
|
||||
}
|
||||
|
||||
// Remove trailing line-ending (b/c)
|
||||
if (bfr.Get_at_last_or_nil_if_empty() == Byte_ascii.Nl) {
|
||||
bfr.Del_by_1();
|
||||
}
|
||||
|
||||
// special case: don't return empty table
|
||||
if ( bfr.Len() == Len__tb__empty
|
||||
&& Bry_.Eq(bfr.Bfr(), 0, Len__tb__empty, Html__tb__empty)) {
|
||||
bfr.Clear();
|
||||
return Bry_.Empty;
|
||||
}
|
||||
return bfr.To_bry_and_clear();
|
||||
}
|
||||
public int Split(byte[] src, int itm_bgn, int itm_end) {
|
||||
byte[] out_line = Bry_.Mid(src, itm_bgn, itm_end); // MW: "$outLine"
|
||||
byte[] line = Bry_.Trim(out_line); // MW: "$line"
|
||||
|
||||
int line_len = line.length;
|
||||
if (line_len == 0) { // empty line, go to next line
|
||||
bfr.Add(out_line).Add_byte_nl();
|
||||
return Bry_split_.Rv__ok;
|
||||
}
|
||||
|
||||
byte first_char = line[0];
|
||||
first_2[0] = line[0];
|
||||
if (line_len > 1) first_2[1] = line[1];
|
||||
|
||||
// PORTED: preg_match('/^(:*)\s*\{\|(.*)$/', $line, $matches)
|
||||
byte[] tblw_atrs = null;
|
||||
boolean tblw_bgn_found = false;
|
||||
int colons_end = Bry_find_.Find_fwd_while(src, 0, line_len, Byte_ascii.Colon);
|
||||
int tblw_bgn = Bry_find_.Find_fwd_while(line, colons_end, line_len, Byte_ascii.Space);
|
||||
int tblw_atrs_bgn = tblw_bgn + 2;
|
||||
if (Bry_.Eq(line, tblw_bgn, tblw_atrs_bgn, Wtxt__tb__bgn)) {
|
||||
tblw_bgn_found = true;
|
||||
tblw_atrs = (tblw_atrs_bgn == line_len) ? Bry_.Empty : Bry_.Mid(line, tblw_atrs_bgn, line_len);
|
||||
}
|
||||
if (tblw_bgn_found) {
|
||||
// First check if we are starting a new table
|
||||
indent_level = colons_end;
|
||||
|
||||
tblw_atrs = uniq_mgr.Convert(tblw_atrs);
|
||||
|
||||
// PORTED: out_line = str_repeat('<dl><dd>', $indent_level) . "<table{atrs}>";
|
||||
for (int j = 0; j < indent_level; j++)
|
||||
tmp_bfr.Add(Html__dl__bgn);
|
||||
tmp_bfr.Add_str_a7("<table");
|
||||
sanitizer.Fix_tag_attributes(tmp_bfr, Name__table, tblw_atrs);
|
||||
tmp_bfr.Add_byte(Byte_ascii.Angle_end);
|
||||
out_line = tmp_bfr.To_bry_and_clear();
|
||||
td_history.Add(false);
|
||||
last_tag_history.Add(Bry_.Empty);
|
||||
tr_history.Add(false);
|
||||
tr_attributes.Add(Bry_.Empty);
|
||||
has_opened_tr.Add(false);
|
||||
}
|
||||
else if (td_history.Len() == 0) {
|
||||
// Don't do any of the following
|
||||
bfr.Add(out_line).Add_byte_nl();
|
||||
return Bry_split_.Rv__ok;
|
||||
}
|
||||
else if (Bry_.Eq(first_2, Wtxt__tb__end)) {
|
||||
// We are ending a table
|
||||
line = tmp_bfr.Add_str_a7("</table>").Add_mid(line, 2, line.length).To_bry_and_clear();
|
||||
byte[] last_tag = Php_ary_.Pop_bry_or_null(last_tag_history);
|
||||
|
||||
if (!Php_ary_.Pop_bool_or_n(has_opened_tr)) {
|
||||
line = tmp_bfr.Add_str_a7("<tr><td></td></tr>").Add(line).To_bry_and_clear();
|
||||
}
|
||||
|
||||
if (Php_ary_.Pop_bool_or_n(tr_history)) {
|
||||
line = tmp_bfr.Add_str_a7("</tr>").Add(line).To_bry_and_clear();
|
||||
}
|
||||
|
||||
if (Php_ary_.Pop_bool_or_n(td_history)) {
|
||||
line = tmp_bfr.Add_str_a7("</").Add(last_tag).Add_byte(Byte_ascii.Angle_end).Add(line).To_bry_and_clear();
|
||||
}
|
||||
Php_ary_.Pop_bry_or_null(tr_attributes);
|
||||
// PORTED:$outLine = $line . str_repeat( '</dd></dl>', $indent_level );
|
||||
tmp_bfr.Add(line);
|
||||
for (int j = 0; j < indent_level; j++)
|
||||
tmp_bfr.Add(Html__dl__end);
|
||||
out_line = tmp_bfr.To_bry_and_clear();
|
||||
}
|
||||
else if (Bry_.Eq(first_2, Wtxt__tr)) {
|
||||
// Now we have a table row
|
||||
line = Bry_.Mid(line, 2); // PORTED: $line = preg_replace('#^\|-+#', '', $line);
|
||||
|
||||
// Whats after the tag is now only attributes
|
||||
byte[] atrs = uniq_mgr.Unstrip_both(line);
|
||||
sanitizer.Fix_tag_attributes(tmp_bfr, Name__tr, atrs);
|
||||
atrs = tmp_bfr.To_bry_and_clear();
|
||||
|
||||
Php_ary_.Pop_bry_or_null(tr_attributes);
|
||||
tr_attributes.Add(atrs);
|
||||
|
||||
line = Bry_.Empty;
|
||||
byte[] last_tag = Php_ary_.Pop_bry_or_null(last_tag_history);
|
||||
Php_ary_.Pop_bool_or_n(has_opened_tr);
|
||||
has_opened_tr.Add(true);
|
||||
|
||||
if (Php_ary_.Pop_bool_or_n(tr_history)) {
|
||||
line = Html__tr__end;
|
||||
}
|
||||
|
||||
if (Php_ary_.Pop_bool_or_n(td_history)) {
|
||||
line = tmp_bfr.Add_str_a7("</").Add(last_tag).Add_byte(Byte_ascii.Gt).Add(line).To_bry_and_clear();
|
||||
}
|
||||
|
||||
out_line = line;
|
||||
tr_history.Add(false);
|
||||
td_history.Add(false);
|
||||
last_tag_history.Add(Bry_.Empty);
|
||||
}
|
||||
else if ( first_char == Byte_ascii.Pipe
|
||||
|| first_char == Byte_ascii.Bang
|
||||
|| Bry_.Eq(first_2, Wtxt__caption)
|
||||
) {
|
||||
// This might be cell elements, td, th or captions
|
||||
if (Bry_.Eq(first_2, Wtxt__caption)) {
|
||||
first_char = Byte_ascii.Plus;
|
||||
line = Bry_.Mid(line, 2);
|
||||
} else {
|
||||
line = Bry_.Mid(line, 1);
|
||||
}
|
||||
|
||||
// Implies both are valid for table headings.
|
||||
if (first_char == Byte_ascii.Bang) {
|
||||
Xomw_string_utils.Replace_markup(line, 0, line.length, Wtxt__th2, Wtxt__td2); // $line = StringUtils::replaceMarkup('!!', '||', $line);
|
||||
}
|
||||
|
||||
// Split up multiple cells on the same line.
|
||||
// FIXME : This can result in improper nesting of tags processed
|
||||
// by earlier parser steps.
|
||||
byte[][] cells = Bry_split_.Split(line, Wtxt__td2);
|
||||
|
||||
out_line = Bry_.Empty;
|
||||
|
||||
byte[] previous = null;
|
||||
// Loop through each table cell
|
||||
int cells_len = cells.length;
|
||||
for (int j = 0; j < cells_len; j++) {
|
||||
byte[] cell = cells[j];
|
||||
previous = Bry_.Empty;
|
||||
if (first_char != Byte_ascii.Plus) {
|
||||
byte[] tr_after = Php_ary_.Pop_bry_or_null(tr_attributes);
|
||||
if (!Php_ary_.Pop_bool_or_n(tr_history)) {
|
||||
previous = tmp_bfr.Add_str_a7("<tr").Add(tr_after).Add_str_a7(">\n").To_bry_and_clear();
|
||||
}
|
||||
tr_history.Add(true);
|
||||
tr_attributes.Add(Bry_.Empty);
|
||||
Php_ary_.Pop_bool_or_n(has_opened_tr);
|
||||
has_opened_tr.Add(true);
|
||||
}
|
||||
|
||||
byte[] last_tag = Php_ary_.Pop_bry_or_null(last_tag_history);
|
||||
|
||||
if (Php_ary_.Pop_bool_or_n(td_history)) {
|
||||
previous = tmp_bfr.Add_str_a7("</").Add(last_tag).Add_str_a7(">\n").Add(previous).To_bry_and_clear();
|
||||
}
|
||||
|
||||
if (first_char == Byte_ascii.Pipe) {
|
||||
last_tag = Name__td;
|
||||
}
|
||||
else if (first_char == Byte_ascii.Bang) {
|
||||
last_tag = Name__th;
|
||||
}
|
||||
else if (first_char == Byte_ascii.Plus) {
|
||||
last_tag = Name__caption;
|
||||
}
|
||||
else {
|
||||
last_tag = Bry_.Empty;
|
||||
}
|
||||
|
||||
last_tag_history.Add(last_tag);
|
||||
|
||||
// A cell could contain both parameters and data
|
||||
byte[][] cell_data = Bry_split_.Split_w_max(cell, Byte_ascii.Pipe, 2);
|
||||
|
||||
// Bug 553: Note that a '|' inside an invalid link should not
|
||||
// be mistaken as delimiting cell parameters
|
||||
byte[] cell_data_0 = cell_data[0];
|
||||
byte[] cell_data_1 = cell_data[1];
|
||||
if (Bry_find_.Find_fwd(cell_data_0, Wtxt__lnki__bgn) != Bry_find_.Not_found) {
|
||||
cell = tmp_bfr.Add(previous).Add_byte(Byte_ascii.Angle_bgn).Add(last_tag).Add_byte(Byte_ascii.Angle_end).Add(cell).To_bry_and_clear();
|
||||
}
|
||||
else if (cell_data_1 == null) {
|
||||
cell = tmp_bfr.Add(previous).Add_byte(Byte_ascii.Angle_bgn).Add(last_tag).Add_byte(Byte_ascii.Angle_end).Add(cell_data_0).To_bry_and_clear();
|
||||
}
|
||||
else {
|
||||
byte[] atrs = uniq_mgr.Unstrip_both(cell_data_0);
|
||||
tmp_bfr.Add(previous).Add_byte(Byte_ascii.Angle_bgn).Add(last_tag);
|
||||
sanitizer.Fix_tag_attributes(tmp_bfr, last_tag, atrs);
|
||||
tmp_bfr.Add_byte(Byte_ascii.Angle_end).Add(cell_data_1);
|
||||
cell = tmp_bfr.To_bry_and_clear();
|
||||
}
|
||||
|
||||
out_line = Bry_.Add(out_line, cell);
|
||||
td_history.Add(true);
|
||||
}
|
||||
}
|
||||
bfr.Add(out_line).Add_byte_nl();
|
||||
return Bry_split_.Rv__ok;
|
||||
}
|
||||
private static final byte[]
|
||||
Wtxt__tb__bgn = Bry_.new_a7("{|")
|
||||
, Wtxt__tb__end = Bry_.new_a7("|}")
|
||||
, Wtxt__tr = Bry_.new_a7("|-")
|
||||
, Wtxt__caption = Bry_.new_a7("|+")
|
||||
, Wtxt__th2 = Bry_.new_a7("!!")
|
||||
, Wtxt__td2 = Bry_.new_a7("||")
|
||||
, Wtxt__lnki__bgn = Bry_.new_a7("[[")
|
||||
|
||||
, Name__table = Bry_.new_a7("table")
|
||||
, Name__tr = Bry_.new_a7("tr")
|
||||
, Name__td = Bry_.new_a7("td")
|
||||
, Name__th = Bry_.new_a7("th")
|
||||
, Name__caption = Bry_.new_a7("caption")
|
||||
|
||||
, Html__tr__end = Bry_.new_a7("</tr>")
|
||||
, Html__dl__bgn = Bry_.new_a7("<dl><dd>")
|
||||
, Html__dl__end = Bry_.new_a7("</dd></dl>")
|
||||
, Html__tb__empty = Bry_.new_a7("<table>\n<tr><td></td></tr>\n</table>")
|
||||
;
|
||||
private static final int Len__tb__empty = Html__tb__empty.length;
|
||||
}
|
||||
@@ -1,113 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.mws.tables; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
|
||||
import org.junit.*;
|
||||
public class Xomw_table_wkr__tst {
|
||||
private final Xomw_table_wkr__fxt fxt = new Xomw_table_wkr__fxt();
|
||||
@Test public void Basic() {
|
||||
fxt.Test__parse(String_.Concat_lines_nl_skip_last
|
||||
( "{|"
|
||||
, "|-"
|
||||
, "|a"
|
||||
, "|}"
|
||||
), String_.Concat_lines_nl_skip_last
|
||||
( "<table>"
|
||||
, ""
|
||||
, "<tr>"
|
||||
, "<td>a"
|
||||
, "</td></tr></table>"
|
||||
));
|
||||
}
|
||||
@Test public void Tb__atrs() {
|
||||
fxt.Test__parse(String_.Concat_lines_nl_skip_last
|
||||
( "{|id='1'"
|
||||
, "|-"
|
||||
, "|a"
|
||||
, "|}"
|
||||
), String_.Concat_lines_nl_skip_last
|
||||
( "<table id=\"1\">"
|
||||
, ""
|
||||
, "<tr>"
|
||||
, "<td>a"
|
||||
, "</td></tr></table>"
|
||||
));
|
||||
}
|
||||
@Test public void Tc__atrs() {
|
||||
fxt.Test__parse(String_.Concat_lines_nl_skip_last
|
||||
( "{|"
|
||||
, "|+id='1'|a"
|
||||
, "|}"
|
||||
), String_.Concat_lines_nl_skip_last
|
||||
( "<table>"
|
||||
, "<caption id=\"1\">a"
|
||||
, "</caption><tr><td></td></tr></table>"
|
||||
));
|
||||
}
|
||||
@Test public void Th__double() {
|
||||
fxt.Test__parse(String_.Concat_lines_nl_skip_last
|
||||
( "{|"
|
||||
, "!a!!b"
|
||||
, "|}"
|
||||
), String_.Concat_lines_nl_skip_last
|
||||
( "<table>"
|
||||
, "<tr>"
|
||||
, "<th>a</th>"
|
||||
, "<th>b"
|
||||
, "</th></tr></table>"
|
||||
));
|
||||
}
|
||||
@Test public void Blank() { // COVERS: "empty line, go to next line"
|
||||
fxt.Test__parse(String_.Concat_lines_nl_skip_last
|
||||
( " "
|
||||
), String_.Concat_lines_nl_skip_last
|
||||
( " "
|
||||
));
|
||||
}
|
||||
@Test public void Tb__indent() {
|
||||
fxt.Test__parse(String_.Concat_lines_nl_skip_last
|
||||
( "::{|"
|
||||
, "|-"
|
||||
, "|a"
|
||||
, "|}"
|
||||
), String_.Concat_lines_nl_skip_last
|
||||
( "<dl><dd><dl><dd><table>"
|
||||
, ""
|
||||
, "<tr>"
|
||||
, "<td>a"
|
||||
, "</td></tr></table></dd></dl></dd></dl>"
|
||||
));
|
||||
}
|
||||
@Test public void Tb__empty() { // COVERS: "if (has_opened_tr.Len() == 0) {"
|
||||
fxt.Test__parse(String_.Concat_lines_nl_skip_last
|
||||
( "{|"
|
||||
, "|}"
|
||||
), String_.Concat_lines_nl_skip_last
|
||||
( "<table>"
|
||||
, "<tr><td></td></tr></table>"
|
||||
));
|
||||
}
|
||||
}
|
||||
class Xomw_table_wkr__fxt {
|
||||
private final Xomw_parser_ctx ctx = new Xomw_parser_ctx();
|
||||
private final Xomw_table_wkr wkr = new Xomw_table_wkr();
|
||||
public void Test__parse(String src_str, String expd) {
|
||||
byte[] src_bry = Bry_.new_u8(src_str);
|
||||
byte[] actl = wkr.Do_table_stuff(ctx, src_bry);
|
||||
Tfds.Eq_str_lines(expd, String_.new_u8(actl), src_str);
|
||||
}
|
||||
}
|
||||
@@ -1,41 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.mws.utils; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
|
||||
import gplx.xowa.parsers.htmls.*;
|
||||
public class Xomw_sanitizer_mgr {
|
||||
private final Mwh_doc_wkr__atr_bldr atr_bldr = new Mwh_doc_wkr__atr_bldr();
|
||||
private final Mwh_atr_parser atr_parser = new Mwh_atr_parser();
|
||||
public void Fix_tag_attributes(Bry_bfr bfr, byte[] tag_name, byte[] atrs) {
|
||||
atr_bldr.Atrs__clear();
|
||||
atr_parser.Parse(atr_bldr, -1, -1, atrs, 0, atrs.length);
|
||||
int len = atr_bldr.Atrs__len();
|
||||
|
||||
// PORTED: Sanitizer.php|safeEncodeTagAttributes
|
||||
for (int i = 0; i < len; i++) {
|
||||
// $encAttribute = htmlspecialchars( $attribute );
|
||||
// $encValue = Sanitizer::safeEncodeAttribute( $value );
|
||||
// $attribs[] = "$encAttribute=\"$encValue\"";
|
||||
Mwh_atr_itm itm = atr_bldr.Atrs__get_at(i);
|
||||
bfr.Add_byte_space(); // "return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';"
|
||||
bfr.Add_bry_escape_html(itm.Key_bry(), itm.Key_bgn(), itm.Key_end());
|
||||
bfr.Add_byte_eq().Add_byte_quote();
|
||||
bfr.Add(itm.Val_as_bry()); // TODO.XO:Sanitizer::encode
|
||||
bfr.Add_byte_quote();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,62 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.mws.utils; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
|
||||
public class Xomw_string_utils {
|
||||
public static void Replace_markup(byte[] src, int src_bgn, int src_end, byte[] find, byte[] repl) { // REF:/includes/libs/StringUtils.php|replaceMarkup
|
||||
// PORTED: avoiding multiple regex calls / String creations
|
||||
// $placeholder = "\x00";
|
||||
|
||||
// Remove placeholder instances
|
||||
// $text = str_replace( $placeholder, '', $text );
|
||||
|
||||
// Replace instances of the separator inside HTML-like tags with the placeholder
|
||||
// $replacer = new DoubleReplacer( $search, $placeholder );
|
||||
// $cleaned = StringUtils::delimiterReplaceCallback( '<', '>', $replacer->cb(), $text );
|
||||
|
||||
// Explode, then put the replaced separators back in
|
||||
// $cleaned = str_replace( $search, $replace, $cleaned );
|
||||
// $text = str_replace( $placeholder, $search, $cleaned );
|
||||
|
||||
// if same length find / repl, do in-place replacement; EX: "!!" -> "||"
|
||||
int find_len = find.length;
|
||||
int repl_len = repl.length;
|
||||
if (find_len != repl_len) throw Err_.new_wo_type("find and repl should be same length");
|
||||
|
||||
byte find_0 = find[0];
|
||||
byte dlm_bgn = Byte_ascii.Angle_bgn;
|
||||
byte dlm_end = Byte_ascii.Angle_end;
|
||||
boolean repl_active = true;
|
||||
|
||||
// loop every char in array
|
||||
for (int i = src_bgn; i < src_end; i++) {
|
||||
byte b = src[i];
|
||||
if ( b == find_0
|
||||
&& Bry_.Match(src, i + 1, i + find_len, find, 1, find_len)
|
||||
&& repl_active
|
||||
) {
|
||||
Bry_.Set(src, i, i + find_len, repl);
|
||||
}
|
||||
else if (b == dlm_bgn) {
|
||||
repl_active = false;
|
||||
}
|
||||
else if (b == dlm_end) {
|
||||
repl_active = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,47 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.mws.utils; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
|
||||
import org.junit.*;
|
||||
public class Xomw_string_utils__tst {
|
||||
private final Xomw_string_utils__fxt fxt = new Xomw_string_utils__fxt();
|
||||
@Test public void Basic() {
|
||||
fxt.Test__replace_markup("a!!b" , "!!", "||", "a||b");
|
||||
}
|
||||
@Test public void Missing() {
|
||||
fxt.Test__replace_markup("abcd" , "!!", "||", "abcd");
|
||||
}
|
||||
@Test public void Eos() {
|
||||
fxt.Test__replace_markup("a!!" , "!!", "||", "a||");
|
||||
}
|
||||
@Test public void Ignore() {
|
||||
fxt.Test__replace_markup("a!!b<!!>!!c" , "!!", "||", "a||b<!!>||c");
|
||||
}
|
||||
@Test public void Ignore__asym__lhs() {
|
||||
fxt.Test__replace_markup("a!!b<!!<!!>!!c" , "!!", "||", "a||b<!!<!!>||c");
|
||||
}
|
||||
@Test public void Ignore__asym__rhs() {
|
||||
fxt.Test__replace_markup("a!!b<!!>!!>!!c" , "!!", "||", "a||b<!!>||>||c"); // NOTE: should probably be "!!>!!>", but unmatched ">" are escaped to ">"
|
||||
}
|
||||
}
|
||||
class Xomw_string_utils__fxt {
|
||||
public void Test__replace_markup(String src_str, String find, String repl, String expd) {
|
||||
byte[] src_bry = Bry_.new_u8(src_str);
|
||||
Xomw_string_utils.Replace_markup(src_bry, 0, src_bry.length, Bry_.new_a7(find), Bry_.new_a7(repl));
|
||||
Tfds.Eq_str(expd, src_bry);
|
||||
}
|
||||
}
|
||||
@@ -1,22 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.mws.wkrs; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
|
||||
// Callback invoked by Xomw_hdr_wkr.Parse while it scans text for "=" headings.
public interface Xomw_hdr_cbk {
|
||||
// called once for each heading found; the heading's positions are read from wkr's getters (Hdr_bgn, Hdr_lhs_end, Hdr_num, ...)
void On_hdr_seen(Xomw_parser_ctx pctx, Xomw_hdr_wkr wkr);
|
||||
// called once when the scan position reaches src_end, just before the scan loop exits
void On_src_done(Xomw_parser_ctx pctx, Xomw_hdr_wkr wkr);
|
||||
}
|
||||
@@ -1,48 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.mws.wkrs; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
|
||||
public class Xomw_hdr_cbk__html implements Xomw_hdr_cbk {
|
||||
public Bry_bfr Bfr() {return bfr;} private final Bry_bfr bfr = Bry_bfr_.New();
|
||||
public void On_hdr_seen(Xomw_parser_ctx pctx, Xomw_hdr_wkr wkr) {
|
||||
// add from txt_bgn to hdr_bgn; EX: "abc\n==A==\n"; "\n==" seen -> add "abc"
|
||||
byte[] src = wkr.Src();
|
||||
int hdr_bgn = wkr.Hdr_bgn(), txt_bgn = wkr.Txt_bgn();
|
||||
if (hdr_bgn > txt_bgn)
|
||||
bfr.Add_mid(src, txt_bgn, hdr_bgn);
|
||||
|
||||
// add "\n" unless BOS
|
||||
if (hdr_bgn != Xomw_parser_ctx.Pos__bos) bfr.Add_byte_nl();
|
||||
|
||||
// add <h2>...</h2>
|
||||
int hdr_num = wkr.Hdr_num();
|
||||
bfr.Add(Tag__lhs).Add_int_digits(1, hdr_num).Add(Byte_ascii.Angle_end_bry); // <h2>
|
||||
bfr.Add_mid(wkr.Src(), wkr.Hdr_lhs_end(), wkr.Hdr_rhs_bgn());
|
||||
bfr.Add(Tag__rhs).Add_int_digits(1, hdr_num).Add(Byte_ascii.Angle_end_bry); // </h2>
|
||||
}
|
||||
public void On_src_done(Xomw_parser_ctx pctx, Xomw_hdr_wkr wkr) {
|
||||
// add from txt_bgn to EOS;
|
||||
byte[] src = wkr.Src();
|
||||
int txt_bgn = wkr.Txt_bgn(), src_end = wkr.Src_end();
|
||||
if (txt_bgn != src_end) // PERF: don't call Add_mid() if hdr is at end of EOS
|
||||
bfr.Add_mid(src, txt_bgn, src_end);
|
||||
}
|
||||
private static final byte[]
|
||||
Tag__lhs = Bry_.new_a7("<h")
|
||||
, Tag__rhs = Bry_.new_a7("</h")
|
||||
;
|
||||
}
|
||||
@@ -1,98 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.mws.wkrs; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
|
||||
import gplx.core.btries.*; import gplx.xowa.langs.*;
|
||||
public class Xomw_hdr_wkr {
|
||||
private Xomw_parser_ctx pctx;
|
||||
private Xomw_hdr_cbk cbk;
|
||||
public byte[] Src() {return src;} private byte[] src;
|
||||
public int Src_end() {return src_end;} private int src_end;
|
||||
public int Txt_bgn() {return txt_bgn;} private int txt_bgn;
|
||||
public int Hdr_bgn() {return hdr_bgn;} private int hdr_bgn;
|
||||
public int Hdr_end() {return hdr_end;} private int hdr_end;
|
||||
public int Hdr_num() {return hdr_num;} private int hdr_num;
|
||||
public int Hdr_lhs_bgn() {return hdr_lhs_bgn;} private int hdr_lhs_bgn;
|
||||
public int Hdr_lhs_end() {return hdr_lhs_end;} private int hdr_lhs_end;
|
||||
public int Hdr_rhs_bgn() {return hdr_rhs_bgn;} private int hdr_rhs_bgn;
|
||||
public int Hdr_rhs_end() {return hdr_rhs_end;} private int hdr_rhs_end;
|
||||
public void Parse(Xomw_parser_ctx pctx, byte[] src, int src_bgn, int src_end, Xomw_hdr_cbk cbk) { // REF.MW: /includes/parser/Parser.php|doHeadings
|
||||
// init members
|
||||
this.pctx = pctx;
|
||||
this.src = src;
|
||||
this.src_end = src_end;
|
||||
this.cbk = cbk;
|
||||
|
||||
// do loop
|
||||
int pos = src_bgn;
|
||||
this.txt_bgn = pos == Xomw_parser_ctx.Pos__bos ? 0 : pos;
|
||||
byte b = Byte_ascii.Nl;
|
||||
while (true) {
|
||||
int nxt = pos + 1;
|
||||
// check if (a) cur is \n; (b) nxt is '='
|
||||
if ( b == Byte_ascii.Nl
|
||||
&& nxt < src_end
|
||||
&& src[nxt] == Byte_ascii.Eq
|
||||
) {
|
||||
pos = Parse_hdr_nl(txt_bgn, pos, nxt + 1);
|
||||
this.txt_bgn = pos;
|
||||
}
|
||||
else
|
||||
++pos;
|
||||
|
||||
// EOS; add all text after last "==\n"
|
||||
if (pos == src_end) {
|
||||
cbk.On_src_done(pctx, this);
|
||||
break;
|
||||
}
|
||||
b = src[pos];
|
||||
}
|
||||
}
|
||||
private int Parse_hdr_nl(int txt_bgn, int nl_lhs, int pos) {
|
||||
// calc lhs vars
|
||||
this.hdr_bgn = nl_lhs;
|
||||
this.hdr_lhs_bgn = nl_lhs == 0 ? 0 : nl_lhs + 1; // set pos of 1st "="; note that "==" can be at BOS;
|
||||
this.hdr_lhs_end = Bry_find_.Find_fwd_while(src, pos, src_end, Byte_ascii.Eq);
|
||||
|
||||
// calc rhs vars
|
||||
int nl_rhs = Bry_find_.Find_fwd_or(src, Byte_ascii.Nl, hdr_lhs_end + 1, src_end, src_end); // if no "\n", src_end is rest of text; EX: "\n==<text>EOS
|
||||
this.hdr_end = nl_rhs;
|
||||
this.hdr_rhs_end = Bry_find_.Find_bwd__skip_ws(src, nl_rhs, hdr_lhs_end);
|
||||
this.hdr_rhs_bgn = Bry_find_.Find_bwd__skip(src, hdr_rhs_end - 1, hdr_lhs_end, Byte_ascii.Eq);
|
||||
|
||||
int hdr_lhs_len = hdr_lhs_end - hdr_lhs_bgn;
|
||||
int hdr_rhs_len = hdr_rhs_end - hdr_rhs_bgn;
|
||||
|
||||
// handle rare situations like "\n====\n"
|
||||
if (hdr_rhs_len == 0) {
|
||||
int hdr_lhs_len_half = hdr_lhs_len / 2;
|
||||
hdr_rhs_len = hdr_lhs_len - hdr_lhs_len_half;
|
||||
hdr_lhs_len = hdr_lhs_len_half;
|
||||
this.hdr_lhs_end = hdr_lhs_bgn + hdr_lhs_len;
|
||||
this.hdr_rhs_bgn = hdr_lhs_end;
|
||||
}
|
||||
|
||||
this.hdr_num = hdr_lhs_len < hdr_rhs_len ? hdr_lhs_len : hdr_rhs_len;
|
||||
|
||||
cbk.On_hdr_seen(pctx, this);
|
||||
return nl_rhs;
|
||||
}
|
||||
}
|
||||
// for ( $i = 6; $i >= 1; --$i ) {
|
||||
// $h = str_repeat( '=', $i );
|
||||
// $text = preg_replace( "/^$h(.+)$h\\s*$/m", "<h$i>\\1</h$i>", $text );
|
||||
// }
|
||||
@@ -1,40 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.mws.wkrs; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
|
||||
import org.junit.*;
|
||||
// Tests for Xomw_hdr_wkr: each case parses wikitext and checks the html produced through the fixture's callback.
public class Xomw_hdr_wkr_tst {
|
||||
private final Xomw_hdr_wkr_fxt fxt = new Xomw_hdr_wkr_fxt();
|
||||
@Test public void Basic() {
|
||||
// heading that is the entire text
fxt.Test__parse("==A==" , "<h2>A</h2>");
|
||||
// heading between two lines of text; surrounding text and newlines are kept
fxt.Test__parse("abc\n==A==\ndef" , "abc\n<h2>A</h2>\ndef");
|
||||
|
||||
// no heading: text is output unchanged
fxt.Test__parse("abc" , "abc");
|
||||
fxt.Test__parse("abc\ndef" , "abc\ndef");
|
||||
// "==" with no text at end of input: expected output is an empty h1
fxt.Test__parse("abc\n==" , "abc\n<h1></h1>");
|
||||
}
|
||||
}
|
||||
class Xomw_hdr_wkr_fxt {
|
||||
private final Xomw_hdr_wkr wkr = new Xomw_hdr_wkr();
|
||||
private final Xomw_hdr_cbk__html cbk = new Xomw_hdr_cbk__html();
|
||||
private final Xomw_parser_ctx pctx = new Xomw_parser_ctx();
|
||||
public void Test__parse(String src_str, String expd) {
|
||||
byte[] src_bry = Bry_.new_u8(src_str);
|
||||
wkr.Parse(pctx, src_bry, -1, src_bry.length, cbk);
|
||||
Tfds.Eq_str_lines(expd, cbk.Bfr().To_str_and_clear(), src_str);
|
||||
}
|
||||
}
|
||||
@@ -22,6 +22,7 @@ public class Xop_uniq_mgr { // REF.MW:/parser/StripState.php
|
||||
private final Bry_bfr key_bfr = Bry_bfr_.New_w_size(32);
|
||||
private int idx = -1;
|
||||
// resets idx to its initial value of -1 and empties general_trie
public void Clear() {idx = -1; general_trie.Clear();}
|
||||
// exact-match lookup of the whole key in general_trie; result is cast to byte[] (presumably null when key is absent -- TODO confirm against Match_exact)
public byte[] Get(byte[] key) {return (byte[])general_trie.Match_exact(key, 0, key.length);}
|
||||
public byte[] Add(byte[] val) { // "<b>" -> "\u007fUNIQ-item-1--QINU\u007f"
|
||||
byte[] key = key_bfr
|
||||
.Add(Bry__uniq__add__bgn)
|
||||
@@ -30,10 +31,6 @@ public class Xop_uniq_mgr { // REF.MW:/parser/StripState.php
|
||||
general_trie.Add_bry_bry(key, val);
|
||||
return key;
|
||||
}
|
||||
// exact-match lookup of the whole key in general_trie; result is cast to byte[] (presumably null when key is absent -- TODO confirm against Match_exact)
public byte[] Get(byte[] key) {return (byte[])general_trie.Match_exact(key, 0, key.length);}
|
||||
// thin wrapper: returns the result of Convert(src) unchanged
public byte[] Unstrip_both(byte[] src) {
|
||||
return Convert(src);
|
||||
}
|
||||
public byte[] Convert(byte[] src) {
|
||||
if (general_trie.Count() == 0) return src;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user