1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2026-03-02 03:49:30 +00:00

Mw_parse: Mass checkin of various mediawiki parse files

This commit is contained in:
gnosygnu
2017-01-25 01:27:18 -05:00
parent 6a5c114998
commit cef2d7e2f6
81 changed files with 6723 additions and 485 deletions

View File

@@ -16,10 +16,10 @@ You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.hdrs.sections; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.hdrs.*;
import gplx.xowa.parsers.mws.*; import gplx.xowa.parsers.mws.wkrs.*;
import gplx.xowa.mws.parsers.*; import gplx.xowa.mws.parsers.headings.*;
import gplx.xowa.addons.htmls.tocs.*; import gplx.xowa.htmls.core.htmls.tidy.*;
class Xop_section_list implements Xomw_hdr_cbk {
private final Xomw_hdr_wkr hdr_wkr = new Xomw_hdr_wkr();
class Xop_section_list implements Xomw_heading_cbk {
private final Xomw_heading_wkr hdr_wkr = new Xomw_heading_wkr();
private final Ordered_hash hash = Ordered_hash_.New_bry();
private final Xoh_toc_mgr toc_mgr = new Xoh_toc_mgr();
private byte[] src;
@@ -92,7 +92,7 @@ class Xop_section_list implements Xomw_hdr_cbk {
return new int[] {src_bgn, src_end};
}
public void On_hdr_seen(Xomw_parser_ctx pctx, Xomw_hdr_wkr wkr) {
public void On_hdr_seen(Xomw_parser_ctx pctx, Xomw_heading_wkr wkr) {
// get key by taking everything between ==; EX: "== abc ==" -> " abc "
byte[] src = wkr.Src();
int hdr_txt_bgn = wkr.Hdr_lhs_end();
@@ -117,5 +117,5 @@ class Xop_section_list implements Xomw_hdr_cbk {
Xop_section_itm itm = new Xop_section_itm(hash.Count(), num, key, wkr.Hdr_bgn(), wkr.Hdr_end());
hash.Add(key, itm);
}
public void On_src_done(Xomw_parser_ctx pctx, Xomw_hdr_wkr wkr) {}
public void On_src_done(Xomw_parser_ctx pctx, Xomw_heading_wkr wkr) {}
}

View File

@@ -17,7 +17,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.hdrs.sections; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.hdrs.*;
import gplx.langs.htmls.*;
import gplx.xowa.parsers.mws.*; import gplx.xowa.parsers.mws.wkrs.*; import gplx.xowa.parsers.hdrs.*; import gplx.xowa.htmls.core.htmls.tidy.*;
import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*; import gplx.xowa.parsers.hdrs.*; import gplx.xowa.htmls.core.htmls.tidy.*;
public class Xop_section_mgr implements Gfo_invk {
private Xoae_app app; private Xowe_wiki wiki;
private Xow_tidy_mgr_interface tidy_mgr;

View File

@@ -1,27 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.mws; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*;
import gplx.xowa.parsers.htmls.*;
import gplx.xowa.parsers.mws.utils.*;
import gplx.xowa.parsers.uniqs.*;
/**
 * Per-parse context handed to the Xomw workers; bundles the helper objects they share.
 */
public class Xomw_parser_ctx {
  /** Sentinel position value; presumably "beginning of source" (bos) -- TODO confirm against callers. */
  public static final int Pos__bos = -1;

  private final Xomw_sanitizer_mgr sanitizer = new Xomw_sanitizer_mgr();
  private final Xop_uniq_mgr uniq_mgr = new Xop_uniq_mgr();

  /** Returns the sanitizer owned by this context (one instance per context). */
  public Xomw_sanitizer_mgr Sanitizer() {
    return sanitizer;
  }

  /** Returns the uniq manager owned by this context (one instance per context). */
  public Xop_uniq_mgr Uniq_mgr() {
    return uniq_mgr;
  }
}

View File

@@ -1,261 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.mws.blocks; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
import gplx.langs.phps.utls.*;
/**
 * Block-level pass over wikitext: walks the source line by line to handle the block-level elements
 * p / pre and the lists that are built from lines starting with * # : ;
 *
 * NOTE: this is a partial port of PHP code. Most of the emitting calls (openList, closeList, nextItem,
 * closeParagraph) and the preg_match checks are still commented out, so several locals below are
 * placeholders that keep the control flow of the PHP original in place.
 *
 * THREAD.UNSAFE: per-call state is cached in fields.
 */
public class Xomw_block_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.UNSAFE: caching for repeated calls
  private final Bry_bfr bfr = Bry_bfr_.New();
  private byte[] last_prefix, last_section;
  private boolean line_start, dt_open, in_block_elem, para_stack, in_blockquote, in_pre = false;
  private int prefix_len;
  private int src_len;

  /**
   * Runs the block-level pass over {@code src} and returns the accumulated output.
   *
   * @param src        wikitext to process; split on "\n" and fed to {@link #Split} one line at a time
   * @param line_start false if the first line should be copied through verbatim (it does not begin at a line start)
   * @return contents of the internal buffer, which is cleared as part of the call
   */
  public byte[] Do_block_levels(byte[] src, boolean line_start) {
    this.src_len = src.length;
    this.line_start = line_start;
    // Parsing through the text line by line. The main thing
    // happening here is handling of block-level elements p, pre,
    // and making lists from lines starting with * # : etc.
    this.last_prefix = Bry_.Empty;
    bfr.Clear();
    this.dt_open = this.in_block_elem = false;
    this.prefix_len = 0;
    this.para_stack = false;
    this.in_blockquote = false;
    // PORTED.SPLIT: $textLines = StringUtils::explode("\n", $text);
    Bry_split_.Split(src, 0, src_len, Byte_ascii.Nl, Bool_.N, this);
    // unwind any list levels still open after the last line
    while (prefix_len > 0) {
      // bfr .= this.closeList(prefix2[prefix_len - 1]);
      prefix_len--;
      if (prefix_len > 0) {
        bfr.Add_byte_nl();
      }
    }
    // close the section (p / pre) left open by the last line
    if (Bry_.Len_gt_0(last_section)) {
      bfr.Add_str_a7("</").Add(last_section).Add_str_a7(">");
      this.last_section = Bry_.Empty;
    }
    // intentionally empty: only references fields that the unported code will need
    if (dt_open || in_block_elem || para_stack || in_blockquote || in_pre) {
    }
    return bfr.To_bry_and_clear();
  }

  /**
   * Callback invoked by {@code Bry_split_.Split} for each line; appends the processed line to the buffer.
   *
   * @param src     full source being split
   * @param itm_bgn start of the current line in {@code src}
   * @param itm_end end of the current line in {@code src} (exclusive)
   * @return always {@code Bry_split_.Rv__ok}
   */
  public int Split(byte[] src, int itm_bgn, int itm_end) {
    // Fix up line_start
    if (!line_start) {
      bfr.Add_mid(src, itm_bgn, itm_end);
      line_start = true;
      return Bry_split_.Rv__ok;
    }
    // * = ul
    // # = ol
    // ; = dt
    // : = dd
    int last_prefix_len = last_prefix.length;
    boolean pre_close_match = false; //preg_match('/<\\/pre/i', $oLine);
    boolean pre_open_match = false; //preg_match('/<pre/i', $oLine);
    byte[] prefix = null, prefix2 = null, t = null;
    // If not in a <pre> element, scan for and figure out what prefixes are there.
    if (!in_pre) {
      // Multiple prefixes may abut each other for nested lists.
      prefix_len = 0;// strspn($oLine, '*#:;');
      prefix = Php_str_.Substr(src, itm_bgn, prefix_len);
      // eh?
      // ; and : are both from definition-lists, so they're equivalent
      // for the purposes of determining whether or not we need to open/close
      // elements.
      prefix2 = Bry_.Replace(prefix, Byte_ascii.Semic, Byte_ascii.Colon);
      t = Bry_.Mid(src, itm_bgn + prefix_len, itm_end);
      // this.in_pre = (boolean)pre_open_match;
    }
    else {
      // Don't interpret any other prefixes in preformatted text
      prefix_len = 0;
      prefix = prefix2 = Bry_.Empty;
      t = Bry_.Mid(src, itm_bgn, itm_end);
    }
    // List generation
    byte[] term = null, t2 = null;
    int common_prefix_len = -1;
    if (prefix_len > 0 && Bry_.Eq(last_prefix, prefix2)) {
      // Same as the last item, so no need to deal with nesting or opening stuff
      // bfr .= this.nextItem(substr(prefix, -1));
      para_stack = false;
      if (prefix_len > 0 && prefix[prefix_len - 1] == Byte_ascii.Semic) {
        // The one nasty exception: definition lists work like this:
        // ; title : definition text
        // So we check for : in the remainder text to split up the
        // title and definition, without b0rking links.
        term = t2 = Bry_.Empty;
        // if (this.findColonNoLinks(t, term, t2) !== false) {
        t = t2;
        bfr.Add(term); // . this.nextItem(':');
        // }
      }
    }
    else if (prefix_len > 0 || last_prefix_len > 0) {
      // We need to open or close prefixes, or both.
      // Either open or close a level...
      // common_prefix_len = this.getCommon(prefix, last_prefix);
      para_stack = false;
      // Close all the prefixes which aren't shared.
      while (common_prefix_len < last_prefix_len) {
        // bfr .= this.closeList(last_prefix[last_prefix_len - 1]);
        last_prefix_len--;
      }
      //
      // Continue the current prefix if appropriate.
      if (prefix_len <= common_prefix_len && common_prefix_len > 0) {
        // bfr .= this.nextItem(prefix[common_prefix_len - 1]);
      }
      // Open prefixes where appropriate.
      if (Bry_.Len_gt_0(last_prefix) && prefix_len > common_prefix_len) {
        bfr.Add_byte_nl();
      }
      while (prefix_len > common_prefix_len) {
        // $char = substr(prefix, common_prefix_len, 1);
        // bfr .= this.openList($char);
        //
        // if (';' == $char) {
        //   // @todo FIXME: This is dupe of code above
        //   if (this.findColonNoLinks(t, term, t2) !== false) {
        //     t = t2;
        //     bfr .= term . this.nextItem(':');
        //   }
        // }
        ++common_prefix_len;
      }
      if (prefix_len == 0 && Bry_.Len_gt_0(last_prefix)) {
        bfr.Add_byte_nl();
      }
      last_prefix = prefix2;
    }
    // If we have no prefixes, go to paragraph mode.
    if (0 == prefix_len) {
      // No prefix (not in list)--go to paragraph mode
      // XXX: use a stack for nestable elements like span, table and div
      boolean open_match = false, close_match = false;
      // open_match = preg_match(
      //   '/(?:<table|<h1|<h2|<h3|<h4|<h5|<h6|<pre|<tr|'
      //     . '<p|<ul|<ol|<dl|<li|<\\/tr|<\\/td|<\\/th)/iS',
      //   t
      // );
      // close_match = preg_match(
      //   '/(?:<\\/table|<\\/h1|<\\/h2|<\\/h3|<\\/h4|<\\/h5|<\\/h6|'
      //     . '<td|<th|<\\/?blockquote|<\\/?div|<hr|<\\/pre|<\\/p|<\\/mw:|'
      //     . self::MARKER_PREFIX
      //     . '-pre|<\\/li|<\\/ul|<\\/ol|<\\/dl|<\\/?center)/iS',
      //   t
      // );
      if (open_match || close_match) {
        para_stack = false;
        // @todo bug 5718: paragraph closed
        // bfr .= this.closeParagraph();
        if (pre_open_match && !pre_close_match) {
          this.in_pre = true;
        }
        // $bqOffset = 0;
        // while (preg_match('/<(\\/?)blockquote[\s>]/i', t,
        //   $bqMatch, PREG_OFFSET_CAPTURE, $bqOffset)
        // ) {
        //   in_blockquote = !$bqMatch[1][0]; // is this a close tag?
        //   $bqOffset = $bqMatch[0][1] + strlen($bqMatch[0][0]);
        // }
        in_block_elem = !close_match;
      }
      else if (!in_block_elem && !this.in_pre) {
        // FIX: a blank line yields an empty "t"; reading t[0] unguarded throws ArrayIndexOutOfBoundsException.
        // An empty line can never start with a space, so it must take the paragraph branch.
        if (   t.length > 0
            && Byte_ascii.Space == t[0]
            // && (last_section == 'pre' || trim(t) != '')
            && !in_blockquote
          ) {
          // pre
          // if (this.last_section !== 'pre') {
          para_stack = false;
          //   bfr .= this.closeParagraph() . '<pre>';
          //   this.last_section = 'pre';
          // }
          t = Bry_.Mid(t, 1);
        }
        else {
          // paragraph
          // if (trim(t) == '') {
          if (para_stack) {
            // bfr .= para_stack . '<br />';
            para_stack = false;
            // this.last_section = 'p';
          }
          else {
            // if (this.last_section !== 'p') {
            //   bfr .= this.closeParagraph();
            //   this.last_section = '';
            //   para_stack = '<p>';
            // }
            // else {
            //   para_stack = '</p><p>';
            // }
          }
          // }
          // else {
          if (para_stack) {
            // bfr .= para_stack;
            para_stack = false;
            // this.last_section = 'p';
          }
          // else if (this.last_section !== 'p') {
          //   bfr .= this.closeParagraph() . '<p>';
          //   this.last_section = 'p';
          // }
          // }
        }
      }
    }
    // somewhere above we forget to get out of pre block (bug 785)
    if (pre_close_match && this.in_pre) {
      this.in_pre = false;
    }
    if (para_stack == false) {
      bfr.Add(t);
      if (prefix_len == 0) {
        bfr.Add_byte_nl();
      }
    }
    // intentionally empty: only references locals that the unported code will need
    if (last_prefix_len == -1 || common_prefix_len == -1) {
    }
    return Bry_split_.Rv__ok;
  }
  // private static final int
  //   Para_stack_none = 0 // false
  // , Para_stack_bgn = 1 // <p>
  // , Para_stack_mid = 2 // </p><p>
  // ;
  // private static final byte
  //   Mode_none = 0 // ''
  // , Mode_para = 1 // p
  // , Mode_pre = 2 // pre
  // ;
}

View File

@@ -1,66 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.mws.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
/**
 * Immutable description of one preprocessor bracket rule: its opening / closing byte sequences,
 * the allowed repeat counts, and the node-name id to use for each count.
 */
class Xomw_prepro_rule {
  /** Name ids stored in {@link #names}; translate to bytes with {@link #Name(int)}. */
  public static final int Name__invalid = -1, Name__null = 0, Name__tmpl = 1, Name__targ = 2;
  private static final byte[] Name__tmpl_bry = Bry_.new_a7("template"), Name__targ_bry = Bry_.new_a7("tplarg");

  public final byte[] bgn;
  public final byte[] end;
  public final int min;
  public final int max;
  public final int[] names;

  public Xomw_prepro_rule(byte[] bgn, byte[] end, int min, int max, int[] names) {
    this.bgn = bgn;
    this.end = end;
    this.min = min;
    this.max = max;
    this.names = names;
  }

  /** Returns true if {@code idx} is below the length of {@link #names} and the slot is not {@link #Name__invalid}. */
  public boolean Names_exist(int idx) {
    if (idx >= names.length) {
      return false;
    }
    return names[idx] != Name__invalid;
  }

  /** Maps a name id to its byte form: "template" or "tplarg"; every other id (including invalid / null) gives null. */
  public static byte[] Name(int type) {
    if (type == Name__tmpl) {
      return Name__tmpl_bry;
    }
    if (type == Name__targ) {
      return Name__targ_bry;
    }
    return null;
  }
}
/**
 * Entry stored in the element tries: an xml-ish element name together with its kind (comment vs. other).
 */
class Xomw_prepro_elem {
  /** Values for {@link #type}. */
  public static final int Type__comment = 0, Type__other = 1;
  private static final byte[] Bry__tag_end = Bry_.new_a7("</");

  public final int type;
  public final byte[] name;
  /** "</" followed by the element name; i.e. the left-hand side of the matching end tag. */
  public final byte[] tag_end_lhs;

  public Xomw_prepro_elem(int type, byte[] name) {
    this.type = type;
    this.name = name;
    byte[] end_lhs = Bry_.Add(Bry__tag_end, name);
    this.tag_end_lhs = end_lhs;
  }
}
/**
 * Pairs a matched byte sequence with a one-byte type code.
 */
class Xomw_prepro_curchar_itm {
  /** The matched bytes. */
  public byte[] bry;
  /** Type code for the match; presumably one of the Byte_ascii constants the preprocessor switches on -- TODO confirm. */
  public byte type;

  public Xomw_prepro_curchar_itm(byte[] bry, byte type) {
    this.type = type;
    this.bry = bry;
  }
}

View File

@@ -1,170 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.mws.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
/**
 * Stack of open preprocessor pieces plus the output buffer ("accum") that is currently written to.
 * While the stack is empty, output goes to the root buffer; otherwise it goes to the buffer of the
 * current part of the top piece.
 */
class Xomw_prepro_stack {
  public List_adp stack = List_adp_.New();
  /** Piece on top of the stack; null when the stack is empty. */
  public Xomw_prepro_piece top;
  private final Bry_bfr root_accum = Bry_bfr_.New();
  private Bry_bfr accum;
  /** Single flags instance that is refilled and returned by every {@link #Get_flags()} call. */
  private final Xomw_prepro_flags flags = new Xomw_prepro_flags();

  public Xomw_prepro_stack() {
    accum = root_accum;
  }

  /**
   * Resets the stack to its freshly-constructed state.
   *
   * FIX: previously only the currently active buffer was cleared and "accum" was left pointing at it.
   * After a Clear() on a non-empty stack, Get_accum() returned the buffer of a dropped piece and the
   * root buffer still held output from the previous run. Now the root buffer is emptied and made current again.
   */
  public void Clear() {
    stack.Clear();
    accum.Clear();       // kept from the original: empty whichever buffer was active
    root_accum.Clear();
    accum = root_accum;
    top = null;
  }

  /** Number of pieces on the stack. */
  public int Count() {return stack.Len();}

  /** Buffer that output should currently be appended to. */
  public Bry_bfr Get_accum() {return accum;}

  /** Buffer used when no piece is open. */
  public Bry_bfr Get_root_accum() {return root_accum;}

  /** Current part of the top piece, or null when the stack is empty. */
  public Xomw_prepro_part Get_current_part() {
    return top == null ? null : top.Get_current_part();
  }

  /** Pushes {@code item}; it becomes the top piece and its current part's buffer becomes the active buffer. */
  public void Push(Xomw_prepro_piece item) {
    stack.Add(item);
    this.top = (Xomw_prepro_piece)stack.Get_at(stack.Len() - 1);
    accum = top.Get_accum();
  }

  /**
   * Removes and returns the top piece. The active buffer switches to the new top piece,
   * or back to the root buffer when the stack becomes empty.
   *
   * @throws an Err_ exception when the stack is already empty
   */
  public Xomw_prepro_piece Pop() {
    int len = stack.Len();
    if (len == 0) {
      throw Err_.new_wo_type("Xomw_prepro_stack: no elements remaining");
    }
    int last_idx = len - 1;
    Xomw_prepro_piece rv = (Xomw_prepro_piece)stack.Get_at(last_idx);
    stack.Del_at(last_idx);
    if (last_idx > 0) {
      this.top = (Xomw_prepro_piece)stack.Get_at(last_idx - 1);
      accum = top.Get_accum();
    }
    else {
      this.top = null;
      this.accum = root_accum;
    }
    return rv;
  }

  /** Starts a new part on the top piece (seeded with {@code bry}) and makes its buffer the active one. */
  public void Add_part(byte[] bry) {
    top.Add_part(bry);
    accum = top.Get_accum();
  }

  /**
   * Returns the search flags for the current state: all false when the stack is empty,
   * otherwise whatever the top piece reports. NOTE: the same flags object is reused across calls.
   */
  public Xomw_prepro_flags Get_flags() {
    if (stack.Len() == 0) {
      flags.Find_eq = false;
      flags.Find_pipe = false;
      flags.In_heading = false;
    }
    else {
      top.Set_flags(flags);
    }
    return flags;
  }
}
// Mutable holder for the three search flags that are derived from the state of the preprocessor stack.
class Xomw_prepro_flags {
// true if pipe characters should be noticed
public boolean Find_pipe;
// true if equals signs should be looked for
public boolean Find_eq;
// true if the current position is inside a possible heading
public boolean In_heading;
}
/**
 * One open construct on the preprocessor stack (brace / bracket run, or a heading), together with
 * the parts collected for it so far. A piece always has at least one part.
 */
class Xomw_prepro_piece {
  public final byte[] open;                // Opening character (\n for heading)
  public final byte[] close;               // Matching closing char;
  public int count;                        // Number of opening characters found (number of "=" for heading)
  public final boolean line_start;         // True if the open char appeared at the start of the input line; Not set for headings.
  public final int start_pos;
  public List_adp parts = List_adp_.New();
  public static final byte[] Brack_bgn_bry = Bry_.new_a7("[");

  public Xomw_prepro_piece(byte[] open, byte[] close, int count, int start_pos, boolean line_start) {
    this.open = open;
    this.close = close;
    this.count = count;
    this.start_pos = start_pos;
    this.line_start = line_start;
    parts.Add(new Xomw_prepro_part(Bry_.Empty));
  }

  /** Drops all parts and starts over with a single empty part. */
  public void Parts__renew() {
    parts.Clear();
    this.Add_part(Bry_.Empty);
  }

  /** Returns the most recently added part. */
  public Xomw_prepro_part Get_current_part() {
    return (Xomw_prepro_part)parts.Get_at(parts.Len() - 1);
  }

  /** Returns the buffer of the current part. */
  public Bry_bfr Get_accum() {
    return Get_current_part().bfr;
  }

  /** Appends a new part whose buffer is seeded with {@code bry}. */
  public void Add_part(byte[] bry) {
    parts.Add(new Xomw_prepro_part(bry));
  }

  /**
   * Fills {@code flags} from the state of this piece.
   * <ul>
   * <li>Find_pipe: the piece was opened by neither a newline (heading) nor "["</li>
   * <li>Find_eq: Find_pipe, and there is more than one part, and the current part has no "=" position yet</li>
   * <li>In_heading: the piece was opened by a newline</li>
   * </ul>
   */
  public void Set_flags(Xomw_prepro_flags flags) {
    int parts_len = parts.Len();
    boolean open_is_nl = Bry_.Eq(open, Byte_ascii.Nl_bry);
    boolean find_pipe = !open_is_nl && !Bry_.Eq(open, Brack_bgn_bry);
    flags.Find_pipe = find_pipe;
    // FIX: condition was inverted ("Eqpos != -1"). MediaWiki uses "!isset(eqpos)": keep looking for "="
    // only while the current part has none. Eqpos == -1 is the "not set" state of Xomw_prepro_part.
    flags.Find_eq = find_pipe && parts_len > 1 && ((Xomw_prepro_part)parts.Get_at(parts_len - 1)).Eqpos == -1;
    flags.In_heading = open_is_nl;
  }

  /**
   * Get the output String that would result if the close is not found.
   *
   * @param tmp_bfr       scratch buffer; used and cleared by this call for non-heading pieces
   * @param opening_count number of opening sequences to emit; -1 means "use {@link #count}"
   * @return for a heading, the text of the first part; otherwise the opening sequence repeated
   *         {@code opening_count} times followed by all parts joined with "|"
   */
  public byte[] Break_syntax(Bry_bfr tmp_bfr, int opening_count) {
    if (Bry_.Eq(open, Byte_ascii.Nl_bry)) {
      return ((Xomw_prepro_part)parts.Get_at(0)).bfr.To_bry();
    }
    if (opening_count == -1) {
      opening_count = count;
    }
    tmp_bfr.Add(Bry_.Repeat_bry(open, opening_count));
    // concat parts with "|"
    int len = parts.Len();
    for (int i = 0; i < len; i++) {
      if (i > 0) {
        tmp_bfr.Add_byte_pipe();
      }
      Xomw_prepro_part part = (Xomw_prepro_part)parts.Get_at(i);
      tmp_bfr.Add(part.bfr.To_bry());
    }
    return tmp_bfr.To_bry_and_clear();
  }
}
/**
 * One part of a preprocessor piece: an output buffer plus position markers that start out as -1 (not set).
 */
class Xomw_prepro_part {
  /** Output collected for this part. */
  public final Bry_bfr bfr = Bry_bfr_.New();
  /** Position marker for "="; -1 until set. */
  public int Eqpos = -1;
  /** Comment-related position markers; -1 until set. */
  public int comment_end = -1;
  public int visual_end = -1;

  /** Creates the part and seeds its buffer with {@code bry}. */
  public Xomw_prepro_part(byte[] bry) {
    this.bfr.Add(bry);
  }
}

View File

@@ -1,789 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.mws.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
import gplx.core.btries.*;
import gplx.langs.phps.utls.*;
public class Xomw_prepro_wkr { // THREAD.UNSAFE: caching for repeated calls
private final Bry_bfr tmp_bfr = Bry_bfr_.New();
private final List_adp comments_list = List_adp_.New();
private final Btrie_slim_mgr elements_trie__y = Btrie_slim_mgr.ci_a7(), elements_trie__n = Btrie_slim_mgr.ci_a7();
private final Hash_adp_bry xmlish_allow_missing_end_tag = Hash_adp_bry.cs().Add_many_str("includeonly", "noinclude", "onlyinclude");
private final Hash_adp_bry no_more_closing_tag = Hash_adp_bry.cs();
private final Xomw_prepro_stack stack = new Xomw_prepro_stack();
private final Btrie_rv trv = new Btrie_rv();
private Bry_bfr accum = Bry_bfr_.New();
// Rebuilds both element tries from the given element names.
// The "__y" trie is built with ignored_tags_y and additionally gets "noinclude";
// the "__n" trie is built with ignored_tags_n and additionally gets "includeonly".
// xmlish_elems_ary: element names to register; presumably the parser's strip list -- TODO confirm against callers
public void Init_by_wiki(String... xmlish_elems_ary) {
Elements_trie__init_by_wiki(elements_trie__y, ignored_tags_y, xmlish_elems_ary, "noinclude");
Elements_trie__init_by_wiki(elements_trie__n, ignored_tags_n, xmlish_elems_ary, "includeonly");
}
// Clears "trie" and refills it, in this order, with:
//   1. the comment hook "!--" (registered under the name "comment")
//   2. every name in strip_list_ary             (PORTED: $xmlishElements = parser->getStripList();)
//   3. xmlish_elem                              (PORTED: "$xmlishElements[] = 'noinclude';" or "$xmlishElements[] = 'includeonly';")
//   4. every entry of ignored_tags              (PORTED: $xmlishRegex = implode( '|', array_merge( $xmlishElements, $ignoredTags ) );)
private void Elements_trie__init_by_wiki(Btrie_slim_mgr trie, Ordered_hash ignored_tags, String[] strip_list_ary, String xmlish_elem) {
  trie.Clear();
  Elements_trie__add(trie, Bool_.Y, "!--", "comment");

  int strip_list_len = strip_list_ary.length;
  for (int i = 0; i < strip_list_len; i++) {
    String strip_name = strip_list_ary[i];
    Elements_trie__add(trie, Bool_.N, strip_name, strip_name);
  }

  Elements_trie__add(trie, Bool_.N, xmlish_elem, xmlish_elem);

  // ignored tags are stored as byte[]; convert each to a String before registering it
  for (int i = 0, len = ignored_tags.Count(); i < len; i++) {
    String tag_name = String_.new_u8((byte[])ignored_tags.Get_at(i));
    Elements_trie__add(trie, Bool_.N, tag_name, tag_name);
  }
}
// Registers "hook" in the trie, mapped to a new Xomw_prepro_elem carrying "name" and the comment / other type.
private static void Elements_trie__add(Btrie_slim_mgr trie, boolean type_is_comment, String hook, String name) {
  int elem_type = Xomw_prepro_elem.Type__other;
  if (type_is_comment) {
    elem_type = Xomw_prepro_elem.Type__comment;
  }
  Xomw_prepro_elem elem = new Xomw_prepro_elem(elem_type, Bry_.new_a7(name));
  trie.Add_obj(hook, elem);
}
public byte[] Preprocess_to_xml(byte[] src, boolean for_inclusion) {
// RELIC.PROC_VAR: forInclusion = $flags & Parser::PTD_FOR_INCLUSION;
// RELIC.INIT_BY_WIKI: $xmlishElements = parser->getStripList();
// RELIC.CLASS_VAR: $xmlishAllowMissingEndTag = [ 'includeonly', 'noinclude', 'onlyinclude' ];
boolean enable_only_include = false;
// PORTED: rewritten so that all add / del is done in INIT_BY_WIKI
Ordered_hash ignored_tags;
Hash_adp ignored_elements;
Btrie_slim_mgr elements_trie;
if (for_inclusion) {
ignored_tags = ignored_tags_y; // RELIC: $ignoredTags = [ 'includeonly', '/includeonly' ];
ignored_elements = ignored_elements__y; // RELIC: $ignoredElements = [ 'noinclude' ];
// RELIC.INIT_BY_WIKI: $xmlishElements[] = 'noinclude';
if ( Bry_.Has(src, Bry__only_include_bgn)
&& Bry_.Has(src, Bry__only_include_end)) {
enable_only_include = true;
}
elements_trie = elements_trie__y;
}
else {
ignored_tags = ignored_tags_n; // $ignoredTags = [ 'noinclude', '/noinclude', 'onlyinclude', '/onlyinclude' ];
ignored_elements = ignored_elements__n; // $ignoredElements = [ 'includeonly' ];
// RELIC.INIT_BY_WIKI: $xmlishElements[] = 'includeonly';
elements_trie = elements_trie__n;
}
// RELIC.INIT_BY_WIKI: $xmlishRegex = implode( '|', array_merge( $xmlishElements, $ignoredTags ) );
// RELIC.REGEX
// Use "A" modifier (anchored) instead of "^", because ^ doesn't work with an offset
// $elementsRegex = "~($xmlishRegex)(?:\s|\/>|>)|(!--)~iA";
stack.Clear();
// RELIC.REGEX:
// $searchBase = "[{<\n"; # }
// RELIC.BRY_FIND
// For fast reverse searches
// $revText = strrev( $text );
// $lengthText = strlen( $text );
// Input pointer, starts out pointing to a pseudo-newline before the start
int i = 0;
// Current accumulator
accum = stack.Get_accum();
accum.Add_str_a7("<root>");
// True to find equals signs in arguments
boolean find_equals = false;
// True to take notice of pipe characters
boolean find_pipe = false;
int heading_index = 1;
// True if $i is inside a possible heading
boolean in_heading = false;
// True if there are no more greater-than (>) signs right of $i
boolean no_more_gt = false;
// Map of tag name => true if there are no more closing tags of given type right of $i
no_more_closing_tag.Clear();
// True to ignore all input up to the next <onlyinclude>
boolean find_only_include = enable_only_include;
// Do a line-start run without outputting an LF character
boolean fake_line_start = true;
// XOWA: init
int src_len = src.length;
int found = -1;
byte[] cur_char = Bry_.Empty;
byte[] cur_closing = Bry_.Empty;
byte[] inner = null;
Xomw_prepro_rule rule = null;
while (true) {
if (find_only_include) {
// Ignore all input up to the next <onlyinclude>
int start_pos = Bry_find_.Find_fwd(src, Bry__only_include_bgn, i, src_len);
if (start_pos == Bry_find_.Not_found) {
// Ignored section runs to the end
accum.Add_str_a7("<ignore>").Add_bry_escape_html(src, i, src_len).Add_str_a7("</ignore>");
break;
}
int tag_end_pos = start_pos + Bry__only_include_bgn.length; // past-the-end
accum.Add_str_a7("<ignore>").Add_bry_escape_html(src, i, tag_end_pos).Add_str_a7("</ignore>");
i = tag_end_pos;
find_only_include = false;
}
if (fake_line_start) {
found = Found__line_bgn;
cur_char = Bry_.Empty;
}
else {
// Find next opening brace, closing brace or pipe
// RELIC.REGEX: $search = $searchBase;
if (stack.top == null) {
cur_closing = Bry_.Empty;
}
else {
cur_closing = stack.top.close;
// RELIC.REGEX: $search .= $currentClosing;
}
if (find_pipe) {
// RELIC.REGEX: $search .= '|';
}
if (find_equals) {
// First equals will be for the template
// RELIC.REGEX: $search .= '=';
}
// Output literal section, advance input counter
// PORTED: "$literalLength = strcspn(src, $search, i)"; NOTE: no trie b/c of frequent changes to $search
int literal_len = 0;
boolean loop_stop = false;
// loop chars until search_char is found
for (int j = i; j < src_len; j++) {
byte b = src[j];
switch (b) { // handle '$searchBase = "[{<\n";'
case Byte_ascii.Brack_bgn:
case Byte_ascii.Curly_bgn:
case Byte_ascii.Angle_bgn:
case Byte_ascii.Nl:
loop_stop = true;
break;
case Byte_ascii.Pipe: // handle "find_pipe"
if (find_pipe) loop_stop = true;
break;
case Byte_ascii.Eq: // handle "find_equals"
if (find_equals) loop_stop = true;
break;
default: // handle "cur_closing"; specified by piece.close and rule.close, so "\n", "}", "]" and "}-"
if (cur_closing != Bry_.Empty) {
byte cur_closing_0 = cur_closing[0];
if (b == cur_closing_0) {
if (cur_closing.length == 1) { // handle "\n", "}", "]"
loop_stop = true;
}
else {// handle "}-"
int nxt_idx = j + 1;
if (nxt_idx < src_len && src[nxt_idx] == Byte_ascii.Dash)
loop_stop = true;
}
}
}
break;
}
if (loop_stop)
break;
else
literal_len++;
}
if (literal_len > 0) {
accum.Add_bry_escape_html(src, i, i + literal_len);
i += literal_len;
}
if (i >= src_len) {
if (Bry_.Eq(cur_closing, Byte_ascii.Nl_bry)) {
// Do a past-the-end run to finish off the heading
cur_char = Bry_.Empty;
found = Found__line_end;
}
else {
// All done
break;
}
}
else {
// PORTED: "if ( $curChar == '|' ) {", etc..
Xomw_prepro_curchar_itm cur_char_itm = (Xomw_prepro_curchar_itm)cur_char_trie.Match_at(trv, src, i, src_len);
if (cur_char_itm != null) {
cur_char = cur_char_itm.bry;
switch (cur_char_itm.type) {
case Byte_ascii.Pipe: found = Found__pipe; break;
case Byte_ascii.Eq: found = Found__equals; break;
case Byte_ascii.Angle_bgn: found = Found__angle; break;
case Byte_ascii.Nl: found = in_heading ? Found__line_end : Found__line_bgn; break;
// PORTED: "elseif ( $curChar == $currentClosing )"
case Byte_ascii.Curly_end: found = Found__close; break;
case Byte_ascii.Brack_end: found = Found__close; break;
case Byte_ascii.At: found = Found__close; break; // NOTE: At is type for "}-"
// PORTED: "elseif ( isset( $this->rules[$curChar] ) )"
case Byte_ascii.Curly_bgn: {found = Found__open; rule = rule_curly; break;}
case Byte_ascii.Brack_bgn: {found = Found__open; rule = rule_brack; break;}
case Byte_ascii.Dash: {found = Found__open; rule = rule_langv; break;}
}
}
else {
i++;
continue;
}
}
}
if (found == Found__angle) {
// Handle </onlyinclude>
if ( enable_only_include
&& Bry_.Eq(src, i, i + Len__only_include_end, Bry__only_include_end)) {
find_only_include = true;
continue;
}
// Determine element name
// PORTED: $elementsRegex = "~($xmlishRegex)(?:\s|\/>|>)|(!--)~iA"; EX: "(pre|ref)(?:\s|\/>|>)|(!--)
Xomw_prepro_elem element = (Xomw_prepro_elem)elements_trie.Match_at(trv, src, i + 1, src_len);
if (element == null) {
// Element name missing or not listed
accum.Add(Bry__escaped_lt);
i++;
continue;
}
// Handle comments
if (element.type == Xomw_prepro_elem.Type__comment) {
// To avoid leaving blank lines, when a sequence of
// space-separated comments is both preceded and followed by
// a newline (ignoring spaces), then
// trim leading and trailing spaces and the trailing newline.
// Find the end
int end_pos = Bry_find_.Find_fwd(src, Bry__comment_end, i + 4, src_len);
if (end_pos == Bry_find_.Not_found) {
// Unclosed comment in input, runs to end
accum.Add_str_a7("<comment>").Add_bry_escape_html(src, i, src_len).Add_str_a7("</comment>");
i = src_len;
}
else {
// Search backwards for leading whitespace
int ws_bgn = i > 0 ? i - Php_str_.Strspn_bwd__space_or_tab(src, i, -1) : 0;
// Search forwards for trailing whitespace
// $wsEnd will be the position of the last space (or the '>' if there's none)
int ws_end = end_pos + 2 + Php_str_.Strspn_fwd__space_or_tab(src, end_pos + 3, -1, src_len);
// Keep looking forward as long as we're finding more
// comments.
comments_list.Clear();
comments_list.Add(new int[] {ws_bgn, ws_end});
while (ws_end + 5 < src_len && Bry_.Eq(src, ws_end + 1, ws_end + 5, Bry__comment_bgn)) {
int cur_char_pos = Bry_find_.Find_fwd(src, Bry__comment_end, ws_end + 4);
if (cur_char_pos == Bry_find_.Not_found) {
break;
}
cur_char_pos = cur_char_pos + 2 + Php_str_.Strspn_fwd__space_or_tab(src, cur_char_pos + 3, -1, src_len);
comments_list.Add(new int[] {ws_end + 1, cur_char_pos});
ws_end = cur_char_pos;
}
// Eat the line if possible
// TODO: This could theoretically be done if $wsStart == 0, i.e. for comments at
// the overall start. That's not how Sanitizer::removeHTMLcomments() did it, but
// it's a possible beneficial b/c break.
int bgn_pos = -1;
if ( ws_bgn > 0
&& Bry_.Eq(src, ws_bgn - 1, ws_bgn , Byte_ascii.Nl_bry)
&& Bry_.Eq(src, ws_end + 1, ws_end + 2, Byte_ascii.Nl_bry)
) {
// Remove leading whitespace from the end of the accumulator
// Sanity check first though
int ws_len = i - ws_bgn;
int accum_len = accum.Len();
if ( ws_len > 0
&& Php_str_.Strspn_fwd__space_or_tab(accum.Bfr(), accum_len - ws_len, -1, accum_len) == ws_len) {
accum.Del_by(ws_len);
}
// Dump all but the last comment to the accumulator
int comments_list_len = comments_list.Len();
for (int j = 0; j < comments_list_len; j++) {
int[] com = (int[])comments_list.Get_at(j);
bgn_pos = com[0];
end_pos = com[1] + 1;
if (j == comments_list_len - 1) {
break;
}
inner = Bry_.Mid(src, bgn_pos, end_pos);
accum.Add_str_a7("<comment>").Add_bry_escape_html(inner).Add_str_a7("</comment>");
}
// Do a line-start run next time to look for headings after the comment
fake_line_start = true;
}
else {
// No line to eat, just take the comment itself
bgn_pos = i;
end_pos += 2;
}
if (stack.top != null) {
Xomw_prepro_part part = stack.top.Get_current_part();
if (!(part.comment_end != -1 && part.comment_end == ws_bgn - 1)) {
part.visual_end = ws_bgn;
}
// Else comments abutting, no change in visual end
part.comment_end = end_pos;
}
i = end_pos + 1;
inner = Bry_.Mid(src, bgn_pos, end_pos + 1);
accum.Add_str_a7("<comment>").Add_bry_escape_html(inner).Add_str_a7("</comment>");
}
continue;
}
byte[] name = element.name;
// RELIC.BTRIE_CI: $lowerName = strtolower( $name );
int atr_bgn = i + name.length + 1;
// Find end of tag
int tag_end_pos = no_more_gt ? Bry_find_.Not_found : Bry_find_.Find_fwd(src, Byte_ascii.Angle_end, atr_bgn);
if (tag_end_pos == Bry_find_.Not_found) {
// Infinite backtrack
// Disable tag search to prevent worst-case O(N^2) performance
no_more_gt = true;
accum.Add(Bry__escaped_lt);
i++;
continue;
}
// Handle ignored tags
if (ignored_tags.Has(name)) {
accum.Add_str_a7("<ignore>").Add_bry_escape_html(src, i, tag_end_pos + 1).Add_str_a7("</ignore>");
i = tag_end_pos + 1;
continue;
}
int tag_bgn_pos = i;
int atr_end = -1;
byte[] close = null;
if (src[tag_end_pos - 1] == Byte_ascii.Slash) {
atr_end = tag_end_pos - 1;
inner = null;
i = tag_end_pos + 1;
close = Bry_.Empty;
}
else {
atr_end = tag_end_pos;
// Find closing tag
// PORTED: `preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",`
boolean elem_end_found = false;
int elem_end_lhs = -1, elem_end_rhs = -1;
int elem_end_cur = tag_end_pos + 1;
while (true) {
// search for "</"
elem_end_lhs = Bry_find_.Find_fwd(src, Bry__end_lhs, elem_end_cur, src_len);
if (elem_end_lhs == Bry_find_.Not_found) {
break;
}
// verify $name
elem_end_cur = elem_end_lhs + 2; // 2="</"
int elem_end_tmp = elem_end_cur + name.length;
if (!Bry_.Eq_ci_a7(name, src, elem_end_cur, elem_end_tmp)) {
continue;
}
// verify "\s*>"
elem_end_cur = elem_end_tmp;
elem_end_cur = Bry_find_.Find_fwd_while(src, elem_end_cur, src_len, Byte_ascii.Space);
if (elem_end_cur == src_len) { // just "\s", but no ">"
break;
}
if (src[elem_end_cur] == Byte_ascii.Gt) {
elem_end_rhs = elem_end_cur + 1;
elem_end_found = true;
break;
}
}
if ( !no_more_closing_tag.Has(name)
&& elem_end_found) {
inner = Bry_.Mid(src, tag_end_pos + 1, elem_end_lhs);
i = elem_end_rhs;
tmp_bfr.Add_str_a7("<close>").Add_bry_escape_html(src, elem_end_lhs, elem_end_rhs).Add_str_a7("</close>");
close = tmp_bfr.To_bry_and_clear();
}
else {
// No end tag
if (xmlish_allow_missing_end_tag.Has(name)) {
// Let it run out to the end of the src.
inner = Bry_.Mid(src, tag_end_pos + 1);
i = src_len;
close = Bry_.Empty;
}
else {
// Don't match the tag, treat opening tag as literal and resume parsing.
i = tag_end_pos + 1;
accum.Add_bry_escape_html(src, tag_bgn_pos, tag_end_pos + 1);
// Cache results, otherwise we have O(N^2) performance for input like <foo><foo><foo>...
no_more_closing_tag.Add_if_dupe_use_nth(name, name);
continue;
}
}
}
// <includeonly> and <noinclude> just become <ignore> tags
if (ignored_elements.Has(name)) {
accum.Add_str_a7("<ignore>").Add_bry_escape_html(src, tag_bgn_pos, i).Add_str_a7("</ignore>");
continue;
}
accum.Add_str_a7("<ext>");
// PORTED:
// if ( $attrEnd <= $attrStart ) {
// $attr = '';
// } else {
// $attr = substr( $text, $attrStart, $attrEnd - $attrStart );
// }
accum.Add_str_a7("<name>").Add(name).Add_str_a7("</name>");
// Note that the attr element contains the whitespace between name and attribute,
// this is necessary for precise reconstruction during pre-save transform.
accum.Add_str_a7("<attr>");
if (atr_end > atr_bgn)
accum.Add_bry_escape_html(src, atr_bgn, atr_end);
accum.Add_str_a7("</attr>");
if (inner != null) {
accum.Add_str_a7("<inner>").Add_bry_escape_html(inner).Add_str_a7("</inner>");
}
accum.Add(close).Add_str_a7("</ext>");
}
else if (found == Found__line_bgn) {
// Is this the start of a heading?
// Line break belongs before the heading element in any case
if (fake_line_start) {
fake_line_start = false;
} else {
accum.Add(cur_char);
i++;
}
int count = Php_str_.Strspn_fwd__byte(src, Byte_ascii.Eq, i, 6, src_len);
if (count == 1 && find_equals) { // EX: "{{a|\n=b=\n"
// DWIM: This looks kind of like a name/value separator.
// Let's let the equals handler have it and break the
// potential heading. This is heuristic, but AFAICT the
// methods for completely correct disambiguation are very
// complex.
}
else if (count > 0) {
Xomw_prepro_piece piece = new Xomw_prepro_piece(Byte_ascii.Nl_bry, Byte_ascii.Nl_bry, count, i, false);
piece.Add_part(Bry_.Repeat(Byte_ascii.Eq, count));
stack.Push(piece);
accum = stack.Get_accum();
Xomw_prepro_flags flags = stack.Get_flags();
find_pipe = flags.Find_pipe;
find_equals = flags.Find_eq;
in_heading = flags.In_heading;
i += count;
}
}
else if (found == Found__line_end) {
Xomw_prepro_piece piece = stack.top;
// A heading must be open, otherwise \n wouldn't have been in the search list
if (!Bry_.Eq(piece.open, Byte_ascii.Nl_bry)) throw Err_.new_wo_type("assertion:piece must start with \\n");
Xomw_prepro_part part = piece.Get_current_part();
// Search back through the input to see if it has a proper close.
// Do this using the reversed String since the other solutions
// (end anchor, etc.) are inefficient.
int ws_len = Php_str_.Strspn_bwd__space_or_tab(src, src_len - i, -1);
int search_bgn = i - ws_len;
if (part.comment_end != -1 && search_bgn -1 == part.comment_end) {
// Comment found at line end
// Search for equals signs before the comment
search_bgn = part.visual_end;
search_bgn = Bry_find_.Find_bwd__while_space_or_tab(src, search_bgn, 0);
search_bgn -= Php_str_.Strspn_bwd__space_or_tab(src, search_bgn, -1);
}
int count = piece.count;
int eq_len = Php_str_.Strspn_bwd__byte(src, Byte_ascii.Eq, search_bgn, -1);
byte[] element = Bry_.Empty;
if (eq_len > 0) {
if (search_bgn - eq_len == piece.start_pos) {
// This is just a single String of equals signs on its own line
// Replicate the doHeadings behavior /={count}(.+)={count}/
// First find out how many equals signs there really are (don't stop at 6)
count = eq_len;
if (count < 3) {
count = 0;
}
else {
count = (count - 1) / 2;
if (count > 6) count = 6;
}
}
else {
if (eq_len < count) count = eq_len; // PORTED: $count = min( $equalsLength, $count );
}
if (count > 0) {
// Normal match, output <h>
element = tmp_bfr.Add_str_a7("<h level=\"").Add_int_variable(count).Add_str_a7("\" i=\"").Add_int_variable(heading_index).Add_str_a7("\">").Add_bfr_and_preserve(accum).Add_str_a7("</h>").To_bry_and_clear();
heading_index++;
} else {
// Single equals sign on its own line, count=0
element = accum.To_bry();
}
}
else {
// No match, no <h>, just pass down the inner src
element = accum.To_bry();
}
// Unwind the stack
stack.Pop();
accum = stack.Get_accum();
Xomw_prepro_flags flags = stack.Get_flags();
find_pipe = flags.Find_pipe;
find_equals = flags.Find_eq;
in_heading = flags.In_heading;
// Append the result to the enclosing accumulator
accum.Add(element);
// Note that we do NOT increment the input pointer.
// This is because the closing linebreak could be the opening linebreak of
// another heading. Infinite loops are avoided because the next iteration MUST
// hit the heading open case above, which unconditionally increments the
// input pointer.
}
else if (found == Found__open) {
// count opening brace characters
int count = Php_str_.Strspn_fwd__byte(src, cur_char[0], i, -1, src_len); // NOTE: don't know how MediaWiki will handle "-{"
// we need to add to stack only if opening brace count is enough for one of the rules
if (count >= rule.min) {
// Add it to the stack
Xomw_prepro_piece piece = new Xomw_prepro_piece(cur_char, rule.end, count, -1, i > 0 && src[i - 1] == Byte_ascii.Nl);
stack.Push(piece);
accum = stack.Get_accum();
Xomw_prepro_flags flags = stack.Get_flags();
find_pipe = flags.Find_pipe;
find_equals = flags.Find_eq;
in_heading = flags.In_heading;
}
else {
// Add literal brace(s)
for (int j = 0; j < count; j++)
accum.Add_bry_escape_html(cur_char);
}
i += count;
}
else if (found == Found__close) {
Xomw_prepro_piece piece = stack.top;
// lets check if there are enough characters for closing brace
int max_count = piece.count;
int count = Php_str_.Strspn_fwd__byte(src, cur_char[0], i, max_count, src_len);
// check for maximum matching characters (if there are 5 closing characters, we will probably need only 3 - depending on the rules)
rule = Get_rule(piece.open);
int matching_count = -1;
if (count > rule.max) {
// The specified maximum exists in the callback array, unless the caller
// has made an error
matching_count = rule.max;
}
else {
// Count is less than the maximum
// Skip any gaps in the callback array to find the true largest match
// Need to use array_key_exists not isset because the callback can be null
matching_count = count;
while (matching_count > 0 && !rule.Names_exist(matching_count)) {
matching_count--;
}
}
if (matching_count <= 0) {
// No matching element found in callback array
// Output a literal closing brace and continue
for (int j = 0; j < count; j++)
accum.Add_bry_escape_html(cur_char);
i += count;
continue;
}
int name_type = rule.names[matching_count];
byte[] element = null;
if (name_type == Xomw_prepro_rule.Name__null) {
// No element, just literal text
tmp_bfr.Add(piece.Break_syntax(tmp_bfr, matching_count));
element = tmp_bfr.Add(Bry_.Repeat_bry(rule.end, matching_count)).To_bry_and_clear();
}
else {
// Create XML element
// Note: $parts is already XML, does not need to be encoded further
List_adp parts = piece.parts;
byte[] title = ((Xomw_prepro_part)parts.Get_at(0)).bfr.To_bry_and_clear();
parts.Del_at(0);
// The invocation is at the start of the line if lineStart is set in
// the stack, and all opening brackets are used up.
byte[] attr = null;
if (max_count == matching_count && piece.line_start) { // RELIC:!empty( $piece->lineStart )
attr = Bry_.new_a7(" lineStart=\"1\"");
}
else {
attr = Bry_.Empty;
}
byte[] name_bry = Xomw_prepro_rule.Name(name_type);
tmp_bfr.Add_str_a7("<").Add(name_bry).Add(attr).Add_str_a7(">");
tmp_bfr.Add_str_a7("<title>").Add(title).Add_str_a7("</title>");
int arg_idx = 1;
int parts_len = parts.Len();
for (int j = 0; j < parts_len; j++) {
Xomw_prepro_part part = (Xomw_prepro_part)parts.Get_at(j);
if (part.Eqpos != -1) {
Bry_bfr part_bfr = part.bfr;
byte[] part_bfr_bry = part_bfr.Bfr();
tmp_bfr.Add_str_a7("<part><name>").Add_mid(part_bfr_bry, 0, part.Eqpos);
tmp_bfr.Add_str_a7("</name>=<value>").Add_mid(part_bfr_bry, part.Eqpos + 1, part_bfr.Len());
tmp_bfr.Add_str_a7("</value></part>");
}
else {
tmp_bfr.Add_str_a7("<part><name index=\"").Add_int_variable(arg_idx).Add_str_a7("\" /><value>").Add(part.bfr.To_bry()).Add_str_a7("</value></part>");
arg_idx++;
}
}
element = tmp_bfr.Add_str_a7("</").Add(name_bry).Add_str_a7(">").To_bry_and_clear();
}
// Advance input pointer
i += matching_count;
// Unwind the stack
stack.Pop();
accum = stack.Get_accum();
// Re-add the old stack element if it still has unmatched opening characters remaining
if (matching_count < piece.count) {
piece.Parts__renew(); // PORTED: piece.parts = [ new PPDPart ];
piece.count -= matching_count;
// do we still qualify for any callback with remaining count?
int min = Get_rule(piece.open).min;
if (piece.count >= min) {
stack.Push(piece);
accum = stack.Get_accum();
}
else {
accum.Add(Bry_.Repeat_bry(piece.open, piece.count));
}
}
Xomw_prepro_flags flags = stack.Get_flags();
find_pipe = flags.Find_pipe;
find_equals = flags.Find_eq;
in_heading = flags.In_heading;
// Add XML element to the enclosing accumulator
accum.Add(element);
}
else if (found == Found__pipe) {
find_equals = true; // shortcut for getFlags()
stack.Add_part(Bry_.Empty);
accum = stack.Get_accum();
i++;
}
else if (found == Found__equals) {
find_equals = false; // shortcut for getFlags()
stack.Get_current_part().Eqpos = accum.Len();
accum.Add_byte(Byte_ascii.Eq);
i++;
}
}
// Output any remaining unclosed brackets
Bry_bfr root_accum = stack.Get_root_accum();
int stack_len = stack.stack.Len();
for (int j = 0; j < stack_len; j++) {
Xomw_prepro_piece piece = (Xomw_prepro_piece)stack.stack.Get_at(j);
root_accum.Add(piece.Break_syntax(tmp_bfr, -1));
}
root_accum.Add_str_a7("</root>");
return root_accum.To_bry_and_clear();
}
// Returns the static rule whose `bgn` bytes equal `bry`.
// Candidates are tried in the order curly, brack, langv; anything else is
// reported through Err_.new_unhandled.
private Xomw_prepro_rule Get_rule(byte[] bry) {
  if (Bry_.Eq(bry, rule_curly.bgn)) {
    return rule_curly;
  }
  if (Bry_.Eq(bry, rule_brack.bgn)) {
    return rule_brack;
  }
  if (Bry_.Eq(bry, rule_langv.bgn)) {
    return rule_langv;
  }
  throw Err_.new_unhandled(bry);
}
// Codes compared against `found` in the main scan loop to pick the handler for the token at the current position.
private static final int
Found__line_bgn = 0 // line start: handler checks whether a heading opens here
, Found__line_end = 1 // line end while a heading piece is on top of the stack: handler tries to close the heading
, Found__pipe = 2 // "|": handler starts a new part on the current stack piece
, Found__equals = 3 // "=": handler records the position as the current part's Eqpos
, Found__angle = 4 // NOTE(review): presumably "<" (comments / ext tags); its dispatch is outside this view -- confirm
, Found__close = 5 // closing delimiter: handler matches it against the piece on top of the stack
, Found__open = 6 // opening delimiter: handler counts the run and pushes a piece if the rule's min is met
;
// Delimiter rules resolved by Get_rule through their `bgn` bytes.
// As written here each rule is built from: opening bytes, closing bytes, two ints, and an int[] of Name__ codes.
// NOTE(review): the two ints are presumably `min` and `max` (both are read from a rule in the scan loop) -- confirm the order against Xomw_prepro_rule.
// The int[] is read as `names[matching_count]`, i.e. it is indexed by the number of matched delimiters.
private static final Xomw_prepro_rule
rule_curly = new Xomw_prepro_rule(Bry_.new_a7("{"), Bry_.new_a7("}") , 2, 3, new int[] {Xomw_prepro_rule.Name__invalid, Xomw_prepro_rule.Name__invalid, Xomw_prepro_rule.Name__tmpl, Xomw_prepro_rule.Name__targ})
, rule_brack = new Xomw_prepro_rule(Bry_.new_a7("["), Bry_.new_a7("]") , 2, 2, new int[] {Xomw_prepro_rule.Name__invalid, Xomw_prepro_rule.Name__invalid, Xomw_prepro_rule.Name__null})
, rule_langv = new Xomw_prepro_rule(Bry_.new_a7("-{"), Bry_.new_a7("}-"), 1, 1, new int[] {Xomw_prepro_rule.Name__invalid, Xomw_prepro_rule.Name__null})
;
// Pre-built ASCII byte sequences shared by the preprocessor so they are not re-created per call.
private static final byte[]
Bry__only_include_bgn = Bry_.new_a7("<onlyinclude>")
, Bry__only_include_end = Bry_.new_a7("</onlyinclude>")
, Bry__comment_bgn = Bry_.new_a7("<!--")
, Bry__comment_end = Bry_.new_a7("-->")
, Bry__escaped_lt = Bry_.new_a7("&lt;") // appended to the accumulator when a "<" has no later ">" and so cannot open a tag
, Bry__end_lhs = Bry_.new_a7("</") // searched for when looking for an element's closing tag
;
// Cached byte length of "</onlyinclude>".
private static final int Len__only_include_end = Bry__only_include_end.length;
// Trie of the delimiter tokens recognized by the scanner; built once by Cur_char_trie__new.
private static final Btrie_slim_mgr cur_char_trie = Cur_char_trie__new();
// Two alternative sets of tag names (opening and "/"-prefixed closing forms).
// NOTE(review): presumably one of them is selected as `ignored_tags` depending on for_inclusion -- confirm where `ignored_tags` is assigned.
private static final Ordered_hash
ignored_tags_y = Ordered_hash_.New_bry().Add_many_str("includeonly", "/includeonly")
, ignored_tags_n = Ordered_hash_.New_bry().Add_many_str("noinclude", "/noinclude", "onlyinclude", "/onlyinclude");
// Two alternative sets of element names.
// NOTE(review): presumably one of them is selected as `ignored_elements` depending on for_inclusion -- confirm where `ignored_elements` is assigned.
private static final Hash_adp_bry
ignored_elements__y = Hash_adp_bry.cs().Add_many_str("noinclude")
, ignored_elements__n = Hash_adp_bry.cs().Add_many_str("includeonly");
// Creates the trie (via Btrie_slim_mgr.ci_a7) holding every delimiter token the scanner looks for.
// Each token maps to an Xomw_prepro_curchar_itm built from the token bytes plus one byte:
// the token's own first byte for the regular tokens, and Byte_ascii.At for "}-".
private static Btrie_slim_mgr Cur_char_trie__new() {
  Btrie_slim_mgr trie = Btrie_slim_mgr.ci_a7();
  String[] tokens = new String[] {"|", "=", "<", "\n", "{", "[", "-{", "}", "]"};
  int tokens_len = tokens.length;
  for (int idx = 0; idx < tokens_len; idx++) {
    byte[] token = Bry_.new_a7(tokens[idx]);
    Xomw_prepro_curchar_itm itm = new Xomw_prepro_curchar_itm(token, token[0]);
    trie.Add_obj(token, itm);
  }
  // "}-" is registered on its own because it is paired with Byte_ascii.At rather than its first byte
  byte[] langv_close = Bry_.new_a7("}-");
  Xomw_prepro_curchar_itm langv_itm = new Xomw_prepro_curchar_itm(langv_close, Byte_ascii.At);
  trie.Add_obj(langv_close, langv_itm);
  return trie;
}
}

View File

@@ -1,232 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.mws.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
import org.junit.*;
// Tests for Xomw_prepro_wkr.Preprocess_to_xml: each case feeds wikitext to the fixture and compares the
// produced XML-like string (always wrapped in <root>...</root>) against a literal expectation.
// "COVERS" comments quote the source comment of the branch the case is meant to exercise.
public class Xomw_prepro_wkr__tst {
private final Xomw_prepro_wkr__fxt fxt = new Xomw_prepro_wkr__fxt();
@Test public void Text() {
fxt.Test__parse("abc", "<root>abc</root>");
}
@Test public void Brack() {
fxt.Test__parse("a[[b]]c", "<root>a[[b]]c</root>");
}
@Test public void Brack__one() { // COVERS: "Add literal brace(s)"
fxt.Test__parse("a[b]c", "<root>a[b]c</root>");
}
@Test public void Brack__max() { // COVERS: "The specified maximum exists in the callback array, unless the caller"
fxt.Test__parse("a[[[[[b]]]]]c", "<root>a[[[[[b]]]]]c</root>");
}
@Test public void Template() {
fxt.Test__parse("a{{b}}c", "<root>a<template><title>b</title></template>c</root>");
}
@Test public void Template__args__idx() {
fxt.Test__parse("a{{b|c|d}}e", "<root>a<template><title>b</title><part><name index=\"1\" /><value>c</value></part><part><name index=\"2\" /><value>d</value></part></template>e</root>");
}
@Test public void Template__args__key() {
fxt.Test__parse("a{{b|c=d}}e", "<root>a<template><title>b</title><part><name>c</name>=<value>d</value></part></template>e</root>");
}
@Test public void Template__line_start() { // COVERS: "The invocation is at the start of the line if lineStart is set in"
fxt.Test__parse(String_.Concat_lines_nl_skip_last
( "a"
, "{{b}}"
), String_.Concat_lines_nl_skip_last
( "<root>a"
, "<template lineStart=\"1\"><title>b</title></template></root>"
));
}
@Test public void Template__max() { // COVERS: "do we still qualify for any callback with remaining count?"
fxt.Test__parse("a{{{{{b}}}}}c", "<root>a<template><title><tplarg><title>b</title></tplarg></title></template>c</root>");
}
@Test public void Tplarg() {
fxt.Test__parse("a{{{b}}}c", "<root>a<tplarg><title>b</title></tplarg>c</root>");
}
@Test public void Comment() {
fxt.Test__parse("a<!--b-->c", "<root>a<comment>&lt;!--b--&gt;</comment>c</root>");
}
@Test public void Comment__dangling() {// COVERS: "Unclosed comment in input, runs to end"
fxt.Test__parse("a<!--b", "<root>a<comment>&lt;!--b</comment></root>");
}
@Test public void Comment__ws() { // COVERS: "Search backwards for leading whitespace"
fxt.Test__parse("a <!--b--> c", "<root>a <comment>&lt;!--b--&gt;</comment> c</root>"); // NOTE: space is outside comment
}
@Test public void Comment__many__ws() {// COVERS: "Dump all but the last comment to the accumulator"
fxt.Test__parse("a <!--1--> <!--2--> z", "<root>a <comment>&lt;!--1--&gt;</comment> <comment>&lt;!--2--&gt;</comment> z</root>"); // NOTE: space is outside comment;
}
@Test public void Comment__nl__ws() { // COVERS: "Eat the line if possible"
fxt.Test__parse(String_.Concat_lines_nl_skip_last
( "a"
, " <!--1--> "
, " <!--2--> "
, "z"
), String_.Concat_lines_nl_skip_last
( "<root>a"
, "<comment> &lt;!--1--&gt; " // NOTE: space is inside </comment> if flanked by nl;
, "</comment><comment> &lt;!--2--&gt; "
, "</comment>z</root>"
));
}
@Test public void Ext() { // COVERS.ALSO: "Note that the attr element contains the whitespace between name and attribute,"
fxt.Test__parse("a<pre id=\"1\">b</pre>c", "<root>a<ext><name>pre</name><attr> id=&quot;1&quot;</attr><inner>b</inner><close>&lt;/pre&gt;</close></ext>c</root>");
}
@Test public void Ext__inline() { // COVERS: "if ( $text[$tagEndPos - 1] == '/' ) {"
fxt.Test__parse("a<pre/>b" , "<root>a<ext><name>pre</name><attr></attr></ext>b</root>");
fxt.Test__parse("a<pre />b" , "<root>a<ext><name>pre</name><attr> </attr></ext>b</root>");
}
@Test public void Ext__end__pass__space() {// COVERS: "\s*" in `preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",`
fxt.Test__parse("a<pre>b</pre >c", "<root>a<ext><name>pre</name><attr></attr><inner>b</inner><close>&lt;/pre &gt;</close></ext>c</root>");
}
@Test public void Ext__end__pass__name() { // COVERS: "verify $name" for `preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",`; the non-matching "</pro>" is skipped and kept as inner text
fxt.Test__parse("a<pre>b</pro></pre>c", "<root>a<ext><name>pre</name><attr></attr><inner>b&lt;/pro&gt;</inner><close>&lt;/pre&gt;</close></ext>c</root>");
}
@Test public void Ext__end__fail__angle() {// COVERS: ">" in `preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",`; "</pre c" has no ">" so no closing tag is matched
fxt.Test__parse("a<pre>b</pre c", "<root>a&lt;pre&gt;b&lt;/pre c</root>");
}
@Test public void Ext__dangling() { // COVERS: "No end tag"; NOTE: the expected output is the escaped literal tag ("Don't match the tag, treat opening tag as literal and resume parsing."), not the "Let it run out to the end of the text." branch
fxt.Test__parse("a<pre>bc", "<root>a&lt;pre&gt;bc</root>");
}
@Test public void Ext__dangling__many() { // COVERS: "Cache results, otherwise we have O(N^2) performance for input like <foo><foo><foo>..."
fxt.Test__parse("a<pre><pre><pre>bc", "<root>a&lt;pre&gt;&lt;pre&gt;&lt;pre&gt;bc</root>");
}
@Test public void Ext__unclosed() { // COVERS: "Infinite backtrack"
fxt.Test__parse("a<pre bcd", "<root>a&lt;pre bcd</root>");
}
@Test public void Ext__noinclude() { // COVERS: "<includeonly> and <noinclude> just become <ignore> tags"
fxt.Init__for_inclusion_(Bool_.N);
fxt.Test__parse("a<includeonly>b<noinclude>c</noinclude>d</includeonly>e", "<root>a<ignore>&lt;includeonly&gt;b&lt;noinclude&gt;c&lt;/noinclude&gt;d&lt;/includeonly&gt;</ignore>e</root>");
}
@Test public void Heading() {
fxt.Test__parse(String_.Concat_lines_nl_skip_last
( "a"
, "== b1 =="
, "z"
), String_.Concat_lines_nl_skip_last
( "<root>a"
, "<h level=\"2\" i=\"1\">== b1 ==</h>"
, "z</root>"
));
}
@Test public void Heading__eos__no_nl() {
fxt.Test__parse(String_.Concat_lines_nl_skip_last
( "a"
, "== b1 =="
), String_.Concat_lines_nl_skip_last
( "<root>a"
, "<h level=\"2\" i=\"1\">== b1 ==</h></root>"
));
}
@Test public void Heading__bos__implied_nl() { // COVERS: "Is this the start of a heading?"
fxt.Test__parse(String_.Concat_lines_nl_skip_last
( "== b1 =="
, "z"
), String_.Concat_lines_nl_skip_last
( "<root><h level=\"2\" i=\"1\">== b1 ==</h>"
, "z</root>"
));
}
@Test public void Heading__dwim__y() { // COVERS: "DWIM: This looks kind of like a name/value separator."
fxt.Test__parse(String_.Concat_lines_nl_skip_last
( "a{{b|"
, "=c="
, "}}d"
), String_.Concat_lines_nl_skip_last
( "<root>a<template><title>b</title><part><name>"
, "</name>=<value>c="
, "</value></part></template>d</root>"
));
}
@Test public void Heading__dwim__n() { // COVERS: "DWIM: This looks kind of like a name/value separator."
fxt.Test__parse(String_.Concat_lines_nl_skip_last
( "a{{b|"
, "==c=="
, "}}d"
), String_.Concat_lines_nl_skip_last
( "<root>a<template><title>b</title><part><name index=\"1\" /><value>"
, "<h level=\"2\" i=\"1\">==c==</h>"
, "</value></part></template>d</root>"
));
}
@Test public void Heading__comment() { // COVERS: "Comment found at line end"
fxt.Test__parse(String_.Concat_lines_nl_skip_last
( "a"
, "==b== <!--c-->"
, ""
), String_.Concat_lines_nl_skip_last
( "<root>a"
, "<h level=\"2\" i=\"1\">==b== <comment>&lt;!--c--&gt;</comment></h>"
, "</root>"
));
}
@Test public void Heading__consecutive__5() { // COVERS: "This is just a single String of equals signs on its own line"
fxt.Test__parse(String_.Concat_lines_nl_skip_last
( "a"
, "====="
, ""
), String_.Concat_lines_nl_skip_last
( "<root>a"
, "<h level=\"2\" i=\"1\">=====</h>"
, "</root>"
));
}
@Test public void Heading__consecutive__1() { // COVERS: "Single equals sign on its own line, count=0"
fxt.Test__parse(String_.Concat_lines_nl_skip_last
( "a"
, "="
, ""
), String_.Concat_lines_nl_skip_last
( "<root>a"
, "="
, "</root>"
));
}
@Test public void Heading__unclosed() { // COVERS: "No match, no <h>, just pass down the inner src"
fxt.Test__parse(String_.Concat_lines_nl_skip_last
( "a"
, "===b"
, ""
), String_.Concat_lines_nl_skip_last
( "<root>a"
, "===b"
, "</root>"
));
}
@Test public void Inclusion__n() {
fxt.Init__for_inclusion_(Bool_.N);
fxt.Test__parse("a<onlyinclude>b</onlyinclude>c", "<root>a<ignore>&lt;onlyinclude&gt;</ignore>b<ignore>&lt;/onlyinclude&gt;</ignore>c</root>");
}
@Test public void Inclusion__y() {
fxt.Init__for_inclusion_(Bool_.Y);
fxt.Test__parse("a<onlyinclude>b</onlyinclude>c", "<root><ignore>a&lt;onlyinclude&gt;</ignore>b<ignore>&lt;/onlyinclude&gt;c</ignore></root>");
}
@Test public void Ignored__noinclude() { // COVERS: "Handle ignored tags"
fxt.Init__for_inclusion_(Bool_.N);
fxt.Test__parse("a<noinclude>b</noinclude>c", "<root>a<ignore>&lt;noinclude&gt;</ignore>b<ignore>&lt;/noinclude&gt;</ignore>c</root>");
}
}
// Test fixture around one Xomw_prepro_wkr instance initialized with "pre".
// Holds the for_inclusion flag passed to every Preprocess_to_xml call (false until changed).
class Xomw_prepro_wkr__fxt {
  private final Xomw_prepro_wkr wkr = new Xomw_prepro_wkr();
  private boolean for_inclusion = false;
  public Xomw_prepro_wkr__fxt() {
    wkr.Init_by_wiki("pre");
  }
  // Sets the for_inclusion flag used by subsequent Test__parse calls.
  public void Init__for_inclusion_(boolean v) {
    this.for_inclusion = v;
  }
  // Preprocesses src_str and compares the result, line by line, with expd; src_str is passed along as the message.
  public void Test__parse(String src_str, String expd) {
    byte[] actl_bry = wkr.Preprocess_to_xml(Bry_.new_u8(src_str), for_inclusion);
    String actl_str = String_.new_u8(actl_bry);
    Tfds.Eq_str_lines(expd, actl_str, src_str);
  }
}

View File

@@ -1,239 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.mws.quotes; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
import gplx.langs.phps.utls.*;
import gplx.xowa.parsers.htmls.*;
import gplx.core.primitives.*;
// Converts wikitext apostrophe runs ('' ''' ''''') into <i> / <b> HTML, one line at a time.
// Do_all_quotes is the entry point; Split is the per-line callback handed to Bry_split_.Split.
// Output for all lines is collected in `bfr`; `tmp` holds text seen after an opening ''''' until
// the next apostrophe run decides whether <b> or <i> is the outer tag.
public class Xomw_quote_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.UNSAFE: caching for repeated calls
  // Formatting states of the per-line state machine.
  private static final int
    State__empty = 0
  , State__b = 1
  , State__i = 2
  , State__bi = 3
  , State__ib = 4
  , State__both = 5
  ;
  private static final byte[] Wtxt__apos = Bry_.new_a7("''");
  private final Bry_bfr bfr = Bry_bfr_.New();
  private final Bry_bfr tmp = Bry_bfr_.New();
  private final Int_list apos_pos_ary = new Int_list(32);

  // Processes every "\n"-separated line of src and returns the converted bytes.
  public byte[] Do_all_quotes(byte[] src) {
    // PORTED.SPLIT: $lines = StringUtils::explode( "\n", $text );
    Bry_split_.Split(src, 0, src.length, Byte_ascii.Nl, Bool_.N, this);
    // each line was appended with a trailing nl; drop the last one
    // REF.MW: $outtext = substr( $outtext, 0, -1 );
    bfr.Del_by_1();
    apos_pos_ary.Clear();
    return bfr.To_bry_and_clear();
  }

  // Handles one line (src[itm_bgn..itm_end)): appends its converted form plus a nl to `bfr`.
  public int Split(byte[] src, int itm_bgn, int itm_end) {
    // PORTED.REGX: arr = preg_split("/(''+)/", text, -1, PREG_SPLIT_DELIM_CAPTURE);
    // even slots hold text, odd slots hold apostrophe runs
    byte[][] arr = Php_preg_.Split(apos_pos_ary, src, itm_bgn, itm_end, Wtxt__apos, Bool_.Y);
    if (arr == null) {
      // nothing to split: copy the line through unchanged
      bfr.Add_mid(src, itm_bgn, itm_end).Add_byte_nl();
      return Bry_split_.Rv__ok;
    }
    int arr_len = arr.length;

    // Pass 1: normalize run lengths to 2, 3 or 5 (surplus apostrophes become text in the
    // preceding slot) and count bold / italic mark-ups.
    int bold_count = 0;
    int italic_count = 0;
    for (int idx = 1; idx < arr_len; idx += 2) {
      int run_len = arr[idx].length;
      if (run_len == 4) {
        // four apostrophes: the first is text, the other three are bold
        // (bug 13227: ''''foo'''' turns into ' ''' foo ' ''')
        arr[idx - 1] = Bry_.Add(arr[idx - 1], Byte_ascii.Apos_bry);
        arr[idx] = Bry_.new_a7("'''");
        run_len = 3;
      }
      else if (run_len > 5) {
        // more than five: everything but the last five is text
        // (bug 13227: ''''''foo'''''' turns into ' ''''' foo ' ''''')
        arr[idx - 1] = Bry_.Add(arr[idx - 1], Bry_.Repeat(Byte_ascii.Apos, run_len - 5));
        arr[idx] = Bry_.new_a7("'''''");
        run_len = 5;
      }
      switch (run_len) {
        case 2:
          italic_count++;
          break;
        case 3:
          bold_count++;
          break;
        case 5:
          italic_count++;
          bold_count++;
          break;
        default:
          break;
      }
    }

    // Pass 2: with an odd number of both bold and italic mark-ups, one bold is probably an
    // apostrophe followed by italics; EX: The ''[[Main Page]]'''s talk page.
    // Preference: first bold after a single-letter word, else first after a multi-letter word,
    // else first after a space.
    if ((bold_count % 2 == 1) && (italic_count % 2 == 1)) {
      int idx_after_1char = -1;
      int idx_after_nchar = -1;
      int idx_after_space = -1;
      for (int idx = 1; idx < arr_len; idx += 2) {
        if (arr[idx].length != 3) continue;
        byte[] prv = arr[idx - 1];
        byte prv_last = Php_str_.Substr_byte(prv, -1);
        byte prv_penult = Php_str_.Substr_byte(prv, -2, 1);
        if (prv_last == Byte_ascii.Space) { // EX: "''prv '''"
          if (idx_after_space == -1) idx_after_space = idx;
        }
        else if (prv_penult == Byte_ascii.Space) { // EX: "''prv a'''"
          idx_after_1char = idx;
          break; // best candidate found; the other options no longer matter
        }
        else if (idx_after_nchar == -1) {
          idx_after_nchar = idx;
        }
      }
      // all three can be -1; EX: a line with only one ''''' run
      int demote_idx = idx_after_1char > -1 ? idx_after_1char
        : idx_after_nchar > -1 ? idx_after_nchar
        : idx_after_space;
      if (demote_idx > -1) {
        arr[demote_idx] = Wtxt__apos;
        arr[demote_idx - 1] = Bry_.Add(arr[demote_idx - 1], Byte_ascii.Apos);
      }
    }

    // Pass 3: emit HTML
    int state = State__empty;
    for (int idx = 0; idx < arr_len; idx++) {
      byte[] itm = arr[idx];
      if ((idx & 1) == 0) {
        // text slot: buffered while the outer tag of an opening ''''' is still undecided
        Bry_bfr target = state == State__both ? tmp : bfr;
        target.Add(itm);
        continue;
      }
      switch (itm.length) {
        case 2:
          state = On_italic(state);
          break;
        case 3:
          state = On_bold(state);
          break;
        case 5:
          state = On_bold_italic(state);
          break;
        default:
          break;
      }
    }
    Close_open_tags(state);
    bfr.Add_byte_nl();
    return Bry_split_.Rv__ok;
  }

  // Applies a '' run to `state`, writes the needed tags to `bfr`, and returns the new state.
  private int On_italic(int state) {
    switch (state) {
      case State__i:
        bfr.Add_str_a7("</i>");
        return State__empty;
      case State__bi:
        bfr.Add_str_a7("</i>");
        return State__b;
      case State__ib:
        bfr.Add_str_a7("</b></i><b>");
        return State__b;
      case State__both:
        bfr.Add_str_a7("<b><i>").Add_bfr_and_preserve(tmp).Add_str_a7("</i>");
        return State__b;
      default: // state can be 'b' or ''
        bfr.Add_str_a7("<i>");
        return state == State__b ? State__bi : State__i;
    }
  }

  // Applies a ''' run to `state`, writes the needed tags to `bfr`, and returns the new state.
  private int On_bold(int state) {
    switch (state) {
      case State__b:
        bfr.Add_str_a7("</b>");
        return State__empty;
      case State__bi:
        bfr.Add_str_a7("</i></b><i>");
        return State__i;
      case State__ib:
        bfr.Add_str_a7("</b>");
        return State__i;
      case State__both:
        bfr.Add_str_a7("<i><b>").Add_bfr_and_preserve(tmp).Add_str_a7("</b>");
        return State__i;
      default: // state can be 'i' or ''
        bfr.Add_str_a7("<b>");
        return state == State__i ? State__ib : State__b;
    }
  }

  // Applies a ''''' run to `state`, writes the needed tags to `bfr`, and returns the new state.
  private int On_bold_italic(int state) {
    switch (state) {
      case State__b:
        bfr.Add_str_a7("</b><i>");
        return State__i;
      case State__i:
        bfr.Add_str_a7("</i><b>");
        return State__b;
      case State__bi:
        bfr.Add_str_a7("</i></b>");
        return State__empty;
      case State__ib:
        bfr.Add_str_a7("</b></i>");
        return State__empty;
      case State__both:
        bfr.Add_str_a7("<i><b>").Add_bfr_and_preserve(tmp).Add_str_a7("</b></i>");
        return State__empty;
      default: // (state == '')
        // tag order is not known yet; start buffering text in tmp
        tmp.Clear();
        return State__both;
    }
  }

  // Closes whatever is still open at the end of the line; innermost tag is closed first.
  private void Close_open_tags(int state) {
    switch (state) {
      case State__b:
        bfr.Add_str_a7("</b>");
        break;
      case State__i:
        bfr.Add_str_a7("</i>");
        break;
      case State__ib:
        bfr.Add_str_a7("</b>");
        bfr.Add_str_a7("</i>");
        break;
      case State__bi:
        bfr.Add_str_a7("</i>");
        bfr.Add_str_a7("</b>");
        break;
      case State__both:
        // There might be lonely ''''', so make sure we have a buffer
        if (tmp.Len_gt_0()) {
          bfr.Add_str_a7("<b><i>").Add_bfr_and_clear(tmp).Add_str_a7("</i></b>");
        }
        break;
      default:
        break;
    }
  }
}

View File

@@ -1,43 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.mws.quotes; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
import org.junit.*;
// Tests for Xomw_quote_wkr.Do_all_quotes: each case converts one line of wikitext and compares it with a literal expectation.
// Apos__N cases use runs of N apostrophes; Mix__* cases exercise the odd-bold/odd-italic heuristic; Dangling__* cases leave a tag unclosed.
public class Xomw_quote_wkr__tst {
private final Xomw_quote_wkr__fxt fxt = new Xomw_quote_wkr__fxt();
@Test public void Apos__0() {fxt.Test__parse("abc" , "abc");}
@Test public void Apos__1() {fxt.Test__parse("a'b'c" , "a'b'c");}
@Test public void Apos__2() {fxt.Test__parse("a''b''c" , "a<i>b</i>c");}
@Test public void Apos__3() {fxt.Test__parse("a'''b'''c" , "a<b>b</b>c");}
@Test public void Apos__4() {fxt.Test__parse("a''''b''''c" , "a'<b>b'</b>c");} // COVERS: "If there are ever four apostrophes"
@Test public void Apos__5() {fxt.Test__parse("a'''''b'''''c" , "a<i><b>b</b></i>c");}
@Test public void Apos__7() {fxt.Test__parse("a'''''''b'''''''c" , "a''<i><b>b''</b></i>c");} // COVERS: "If there are more than 5 apostrophes in a row"
@Test public void Mix__single() {fxt.Test__parse("''a ''' ''b b''' ''cc'''" , "<i>a <b> </b></i><b>b b'<i> </i>cc</b>");} // COVERS: "If there is a single-letter word, use it!"
@Test public void Mix__multi() {fxt.Test__parse("''a ''' ''b ''' ''cc'''" , "<i>a <b> </b></i><b>b </b> <i>cc'</i>");} // COVERS: "If not, but there's a multi-letter word, use that one."
@Test public void Mix__space() {fxt.Test__parse("''a ''' ''b ''' ''c '''" , "<i>a '</i> <i>b <b> </b></i><b>c </b>");} // COVERS: "... otherwise use the first one that has neither."
@Test public void Dangling__b() {fxt.Test__parse("a'''b" , "a<b>b</b>");} // COVERS: "if (state == State__b || state == State__ib)"
@Test public void Dangling__i() {fxt.Test__parse("a''b" , "a<i>b</i>");} // COVERS: "if (state == State__i || state == State__bi || state == State__ib)"
@Test public void Dangling__lone(){fxt.Test__parse("a'''''b" , "a<b><i>b</i></b>");} // COVERS: "There might be lonely ''''', so make sure we have a buffer"
}
// Test fixture: runs Xomw_quote_wkr over a String and compares the result line-by-line.
class Xomw_quote_wkr__fxt {
	private final Xomw_quote_wkr wkr = new Xomw_quote_wkr();
	// src_str: wikitext to parse; expd: expected html. src_str is also passed to the assertion as its message.
	public void Test__parse(String src_str, String expd) {
		byte[] parsed = wkr.Do_all_quotes(Bry_.new_u8(src_str));
		String actl_str = String_.new_u8(parsed);
		Tfds.Eq_str_lines(expd, actl_str, src_str);
	}
}

View File

@@ -1,281 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.mws.tables; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
import gplx.langs.phps.utls.*;
import gplx.xowa.parsers.htmls.*;
import gplx.xowa.parsers.mws.utils.*; import gplx.xowa.parsers.uniqs.*;
// Converts wikitext table markup ("{|", "|-", "|", "!", "|+", "|}") to HTML <table> markup.
// The source is split on "\n" and each line is handled by Split(); the PHP statements quoted in the
// PORTED comments are the MediaWiki statements each section was ported from.
public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.UNSAFE: caching for repeated calls
	private final Bry_bfr bfr = Bry_bfr_.New(), tmp_bfr = Bry_bfr_.New();
	// parallel stacks: every "{|" pushes one entry on each list; every "|}" pops one entry from each list
	private final List_adp
	  td_history       = List_adp_.New()	// Is currently a td tag open?
	, last_tag_history = List_adp_.New()	// Save history of last tag activated (td, th or caption)
	, tr_history       = List_adp_.New()	// Is currently a tr tag open?
	, tr_attributes    = List_adp_.New()	// history of tr attributes
	, has_opened_tr    = List_adp_.New()	// Did this table open a <tr> element?
	;
	private int indent_level = 0; // indent level of the table
	private byte[] first_2 = new byte[2];	// first two bytes of the current trimmed line; 2nd byte is Null for 1-byte lines
	private Xomw_sanitizer_mgr sanitizer;
	private Xop_uniq_mgr uniq_mgr;
	// Parses all of src and returns the generated html; returns Bry_.Empty if the only output is one empty table.
	// ctx supplies the sanitizer and the uniq_mgr used on attribute text.
	public byte[] Do_table_stuff(Xomw_parser_ctx ctx, byte[] src) {
		this.sanitizer = ctx.Sanitizer();
		this.uniq_mgr = ctx.Uniq_mgr();
		indent_level = 0;
		Bry_split_.Split(src, 0, src.length, Byte_ascii.Nl, Bool_.N, this); // PORTED.SPLIT: $lines = StringUtils::explode("\n", $text);
		// Closing open td, tr && table
		while (td_history.Len() > 0) {
			if (Php_ary_.Pop_bool_or_n(td_history)) {
				bfr.Add_str_a7("</td>\n");
			}
			if (Php_ary_.Pop_bool_or_n(tr_history)) {
				bfr.Add_str_a7("</tr>\n");
			}
			if (!Php_ary_.Pop_bool_or_n(has_opened_tr)) {
				bfr.Add_str_a7("<tr><td></td></tr>\n");
			}
			// also drop the matching entries of the other two stacks; the lists are instance members,
			// so without this the entries of an unclosed table would remain for the next call
			Php_ary_.Pop_bry_or_null(last_tag_history);
			Php_ary_.Pop_bry_or_null(tr_attributes);
			bfr.Add_str_a7("</table>\n");
		}
		// Remove trailing line-ending (b/c)
		if (bfr.Get_at_last_or_nil_if_empty() == Byte_ascii.Nl) {
			bfr.Del_by_1();
		}
		// special case: don't return empty table
		if (	bfr.Len() == Len__tb__empty
			&&	Bry_.Eq(bfr.Bfr(), 0, Len__tb__empty, Html__tb__empty)) {
			bfr.Clear();
			return Bry_.Empty;
		}
		return bfr.To_bry_and_clear();
	}
	// Callback for Bry_split_.Split: handles one line (src[itm_bgn..itm_end)) and appends its html plus "\n" to bfr.
	public int Split(byte[] src, int itm_bgn, int itm_end) {
		byte[] out_line = Bry_.Mid(src, itm_bgn, itm_end);	// MW: "$outLine"
		byte[] line = Bry_.Trim(out_line);					// MW: "$line"
		int line_len = line.length;
		if (line_len == 0) { // empty line, go to next line
			bfr.Add(out_line).Add_byte_nl();
			return Bry_split_.Rv__ok;
		}
		byte first_char = line[0];
		first_2[0] = line[0];
		// always overwrite the 2nd byte; otherwise a 1-byte line such as "|" would be compared
		// using the 2nd byte of an earlier line and could be mistaken for "|-", "|}" or "|+"
		first_2[1] = line_len > 1 ? line[1] : Byte_ascii.Null;
		// PORTED: preg_match('/^(:*)\s*\{\|(.*)$/', $line, $matches)
		byte[] tblw_atrs = null;
		boolean tblw_bgn_found = false;
		// count leading colons on the current line (not on src; all offsets below are relative to line)
		int colons_end = Bry_find_.Find_fwd_while(line, 0, line_len, Byte_ascii.Colon);
		int tblw_bgn = Bry_find_.Find_fwd_while(line, colons_end, line_len, Byte_ascii.Space);
		int tblw_atrs_bgn = tblw_bgn + 2;
		if (	tblw_atrs_bgn <= line_len		// need 2 bytes for "{|"
			&&	Bry_.Eq(line, tblw_bgn, tblw_atrs_bgn, Wtxt__tb__bgn)) {
			tblw_bgn_found = true;
			tblw_atrs = (tblw_atrs_bgn == line_len) ? Bry_.Empty : Bry_.Mid(line, tblw_atrs_bgn, line_len);
		}
		if (tblw_bgn_found) {
			// First check if we are starting a new table
			indent_level = colons_end;
			tblw_atrs = uniq_mgr.Convert(tblw_atrs);
			// PORTED: out_line = str_repeat('<dl><dd>', $indent_level) . "<table{atrs}>";
			for (int j = 0; j < indent_level; j++)
				tmp_bfr.Add(Html__dl__bgn);
			tmp_bfr.Add_str_a7("<table");
			sanitizer.Fix_tag_attributes(tmp_bfr, Name__table, tblw_atrs);
			tmp_bfr.Add_byte(Byte_ascii.Angle_end);
			out_line = tmp_bfr.To_bry_and_clear();
			td_history.Add(false);
			last_tag_history.Add(Bry_.Empty);
			tr_history.Add(false);
			tr_attributes.Add(Bry_.Empty);
			has_opened_tr.Add(false);
		}
		else if (td_history.Len() == 0) {
			// Don't do any of the following
			bfr.Add(out_line).Add_byte_nl();
			return Bry_split_.Rv__ok;
		}
		else if (Bry_.Eq(first_2, Wtxt__tb__end)) {
			// We are ending a table
			line = tmp_bfr.Add_str_a7("</table>").Add_mid(line, 2, line.length).To_bry_and_clear();
			byte[] last_tag = Php_ary_.Pop_bry_or_null(last_tag_history);
			if (!Php_ary_.Pop_bool_or_n(has_opened_tr)) {
				line = tmp_bfr.Add_str_a7("<tr><td></td></tr>").Add(line).To_bry_and_clear();
			}
			if (Php_ary_.Pop_bool_or_n(tr_history)) {
				line = tmp_bfr.Add_str_a7("</tr>").Add(line).To_bry_and_clear();
			}
			if (Php_ary_.Pop_bool_or_n(td_history)) {
				line = tmp_bfr.Add_str_a7("</").Add(last_tag).Add_byte(Byte_ascii.Angle_end).Add(line).To_bry_and_clear();
			}
			Php_ary_.Pop_bry_or_null(tr_attributes);
			// PORTED:$outLine = $line . str_repeat( '</dd></dl>', $indent_level );
			tmp_bfr.Add(line);
			for (int j = 0; j < indent_level; j++)
				tmp_bfr.Add(Html__dl__end);
			out_line = tmp_bfr.To_bry_and_clear();
		}
		else if (Bry_.Eq(first_2, Wtxt__tr)) {
			// Now we have a table row
			// PORTED: $line = preg_replace('#^\|-+#', '', $line);
			// the regex removes "|" plus ALL following dashes, so skip every dash after "|-"; EX: "|---- id='1'"
			int dash_end = Bry_find_.Find_fwd_while(line, 2, line_len, Byte_ascii.Dash);
			line = Bry_.Mid(line, dash_end);
			// Whats after the tag is now only attributes
			byte[] atrs = uniq_mgr.Unstrip_both(line);
			sanitizer.Fix_tag_attributes(tmp_bfr, Name__tr, atrs);
			atrs = tmp_bfr.To_bry_and_clear();
			Php_ary_.Pop_bry_or_null(tr_attributes);
			tr_attributes.Add(atrs);
			line = Bry_.Empty;
			byte[] last_tag = Php_ary_.Pop_bry_or_null(last_tag_history);
			Php_ary_.Pop_bool_or_n(has_opened_tr);
			has_opened_tr.Add(true);
			if (Php_ary_.Pop_bool_or_n(tr_history)) {
				line = Html__tr__end;
			}
			if (Php_ary_.Pop_bool_or_n(td_history)) {
				line = tmp_bfr.Add_str_a7("</").Add(last_tag).Add_byte(Byte_ascii.Gt).Add(line).To_bry_and_clear();
			}
			out_line = line;
			tr_history.Add(false);
			td_history.Add(false);
			last_tag_history.Add(Bry_.Empty);
		}
		else if (	first_char == Byte_ascii.Pipe
				||	first_char == Byte_ascii.Bang
				||	Bry_.Eq(first_2, Wtxt__caption)
				) {
			// This might be cell elements, td, th or captions
			if (Bry_.Eq(first_2, Wtxt__caption)) {
				first_char = Byte_ascii.Plus;
				line = Bry_.Mid(line, 2);
			} else {
				line = Bry_.Mid(line, 1);
			}
			// Implies both are valid for table headings.
			if (first_char == Byte_ascii.Bang) {
				// NOTE: replaces in-place inside line; nothing is returned
				Xomw_string_utils.Replace_markup(line, 0, line.length, Wtxt__th2, Wtxt__td2);	// $line = StringUtils::replaceMarkup('!!', '||', $line);
			}
			// Split up multiple cells on the same line.
			// FIXME : This can result in improper nesting of tags processed
			// by earlier parser steps.
			byte[][] cells = Bry_split_.Split(line, Wtxt__td2);
			out_line = Bry_.Empty;
			byte[] previous = null;
			// Loop through each table cell
			int cells_len = cells.length;
			for (int j = 0; j < cells_len; j++) {
				byte[] cell = cells[j];
				previous = Bry_.Empty;
				if (first_char != Byte_ascii.Plus) {
					// td / th: open a <tr> (with any pending row attributes) if none is open
					byte[] tr_after = Php_ary_.Pop_bry_or_null(tr_attributes);
					if (!Php_ary_.Pop_bool_or_n(tr_history)) {
						previous = tmp_bfr.Add_str_a7("<tr").Add(tr_after).Add_str_a7(">\n").To_bry_and_clear();
					}
					tr_history.Add(true);
					tr_attributes.Add(Bry_.Empty);
					Php_ary_.Pop_bool_or_n(has_opened_tr);
					has_opened_tr.Add(true);
				}
				// close the previously opened cell, if any
				byte[] last_tag = Php_ary_.Pop_bry_or_null(last_tag_history);
				if (Php_ary_.Pop_bool_or_n(td_history)) {
					previous = tmp_bfr.Add_str_a7("</").Add(last_tag).Add_str_a7(">\n").Add(previous).To_bry_and_clear();
				}
				if (first_char == Byte_ascii.Pipe) {
					last_tag = Name__td;
				}
				else if (first_char == Byte_ascii.Bang) {
					last_tag = Name__th;
				}
				else if (first_char == Byte_ascii.Plus) {
					last_tag = Name__caption;
				}
				else {
					last_tag = Bry_.Empty;
				}
				last_tag_history.Add(last_tag);
				// A cell could contain both parameters and data
				byte[][] cell_data = Bry_split_.Split_w_max(cell, Byte_ascii.Pipe, 2);
				// Bug 553: Note that a '|' inside an invalid link should not
				// be mistaken as delimiting cell parameters
				byte[] cell_data_0 = cell_data[0];
				byte[] cell_data_1 = cell_data[1];
				if (Bry_find_.Find_fwd(cell_data_0, Wtxt__lnki__bgn) != Bry_find_.Not_found) {
					cell = tmp_bfr.Add(previous).Add_byte(Byte_ascii.Angle_bgn).Add(last_tag).Add_byte(Byte_ascii.Angle_end).Add(cell).To_bry_and_clear();
				}
				else if (cell_data_1 == null) {
					cell = tmp_bfr.Add(previous).Add_byte(Byte_ascii.Angle_bgn).Add(last_tag).Add_byte(Byte_ascii.Angle_end).Add(cell_data_0).To_bry_and_clear();
				}
				else {
					byte[] atrs = uniq_mgr.Unstrip_both(cell_data_0);
					tmp_bfr.Add(previous).Add_byte(Byte_ascii.Angle_bgn).Add(last_tag);
					sanitizer.Fix_tag_attributes(tmp_bfr, last_tag, atrs);
					tmp_bfr.Add_byte(Byte_ascii.Angle_end).Add(cell_data_1);
					cell = tmp_bfr.To_bry_and_clear();
				}
				out_line = Bry_.Add(out_line, cell);
				td_history.Add(true);
			}
		}
		bfr.Add(out_line).Add_byte_nl();
		return Bry_split_.Rv__ok;
	}
	private static final byte[]
	  Wtxt__tb__bgn   = Bry_.new_a7("{|")
	, Wtxt__tb__end   = Bry_.new_a7("|}")
	, Wtxt__tr        = Bry_.new_a7("|-")
	, Wtxt__caption   = Bry_.new_a7("|+")
	, Wtxt__th2       = Bry_.new_a7("!!")
	, Wtxt__td2       = Bry_.new_a7("||")
	, Wtxt__lnki__bgn = Bry_.new_a7("[[")
	, Name__table     = Bry_.new_a7("table")
	, Name__tr        = Bry_.new_a7("tr")
	, Name__td        = Bry_.new_a7("td")
	, Name__th        = Bry_.new_a7("th")
	, Name__caption   = Bry_.new_a7("caption")
	, Html__tr__end   = Bry_.new_a7("</tr>")
	, Html__dl__bgn   = Bry_.new_a7("<dl><dd>")
	, Html__dl__end   = Bry_.new_a7("</dd></dl>")
	, Html__tb__empty = Bry_.new_a7("<table>\n<tr><td></td></tr>\n</table>")
	;
	private static final int Len__tb__empty = Html__tb__empty.length;
}

View File

@@ -1,113 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.mws.tables; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
import org.junit.*;
// Tests for Xomw_table_wkr: each test parses a few lines of wikitext and compares the generated html.
public class Xomw_table_wkr__tst {
	private final Xomw_table_wkr__fxt fxt = new Xomw_table_wkr__fxt();
	// one row with one cell
	@Test public void Basic() {
		String wtxt = String_.Concat_lines_nl_skip_last("{|", "|-", "|a", "|}");
		String html = String_.Concat_lines_nl_skip_last("<table>", "", "<tr>", "<td>a", "</td></tr></table>");
		fxt.Test__parse(wtxt, html);
	}
	// attributes on the table line
	@Test public void Tb__atrs() {
		String wtxt = String_.Concat_lines_nl_skip_last("{|id='1'", "|-", "|a", "|}");
		String html = String_.Concat_lines_nl_skip_last("<table id=\"1\">", "", "<tr>", "<td>a", "</td></tr></table>");
		fxt.Test__parse(wtxt, html);
	}
	// attributes on a caption; no row is ever opened, so a placeholder row is expected
	@Test public void Tc__atrs() {
		String wtxt = String_.Concat_lines_nl_skip_last("{|", "|+id='1'|a", "|}");
		String html = String_.Concat_lines_nl_skip_last("<table>", "<caption id=\"1\">a", "</caption><tr><td></td></tr></table>");
		fxt.Test__parse(wtxt, html);
	}
	// "!!" separates header cells on one line
	@Test public void Th__double() {
		String wtxt = String_.Concat_lines_nl_skip_last("{|", "!a!!b", "|}");
		String html = String_.Concat_lines_nl_skip_last("<table>", "<tr>", "<th>a</th>", "<th>b", "</th></tr></table>");
		fxt.Test__parse(wtxt, html);
	}
	// COVERS: "empty line, go to next line"
	@Test public void Blank() {
		String wtxt = String_.Concat_lines_nl_skip_last(" ");
		String html = String_.Concat_lines_nl_skip_last(" ");
		fxt.Test__parse(wtxt, html);
	}
	// each leading ":" wraps the table in one <dl><dd>
	@Test public void Tb__indent() {
		String wtxt = String_.Concat_lines_nl_skip_last("::{|", "|-", "|a", "|}");
		String html = String_.Concat_lines_nl_skip_last("<dl><dd><dl><dd><table>", "", "<tr>", "<td>a", "</td></tr></table></dd></dl></dd></dl>");
		fxt.Test__parse(wtxt, html);
	}
	// table closed without any row: a placeholder "<tr><td></td></tr>" is expected
	@Test public void Tb__empty() {
		String wtxt = String_.Concat_lines_nl_skip_last("{|", "|}");
		String html = String_.Concat_lines_nl_skip_last("<table>", "<tr><td></td></tr></table>");
		fxt.Test__parse(wtxt, html);
	}
}
// Test fixture: runs Xomw_table_wkr with a fresh Xomw_parser_ctx and compares the output line-by-line.
class Xomw_table_wkr__fxt {
	private final Xomw_parser_ctx ctx = new Xomw_parser_ctx();
	private final Xomw_table_wkr wkr = new Xomw_table_wkr();
	// src_str: wikitext to parse; expd: expected html. src_str is also passed to the assertion as its message.
	public void Test__parse(String src_str, String expd) {
		byte[] html = wkr.Do_table_stuff(ctx, Bry_.new_u8(src_str));
		String actl_str = String_.new_u8(html);
		Tfds.Eq_str_lines(expd, actl_str, src_str);
	}
}

View File

@@ -1,41 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.mws.utils; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
import gplx.xowa.parsers.htmls.*;
// Rebuilds a tag's attribute text as a normalized list of key="val" pairs.
public class Xomw_sanitizer_mgr {
	private final Mwh_doc_wkr__atr_bldr atr_bldr = new Mwh_doc_wkr__atr_bldr();
	private final Mwh_atr_parser atr_parser = new Mwh_atr_parser();
	// Parses atrs and appends each attribute to bfr as: space, html-escaped key, '=', quoted value.
	// Nothing is appended when no attribute is found. tag_name is not used by the current body.
	public void Fix_tag_attributes(Bry_bfr bfr, byte[] tag_name, byte[] atrs) {
		atr_bldr.Atrs__clear();
		atr_parser.Parse(atr_bldr, -1, -1, atrs, 0, atrs.length);
		// PORTED: Sanitizer.php|safeEncodeTagAttributes
		//   $encAttribute = htmlspecialchars( $attribute );
		//   $encValue = Sanitizer::safeEncodeAttribute( $value );
		//   $attribs[] = "$encAttribute=\"$encValue\"";
		int atrs_len = atr_bldr.Atrs__len();
		int idx = 0;
		while (idx < atrs_len) {
			Mwh_atr_itm atr = atr_bldr.Atrs__get_at(idx);
			idx++;
			// leading space before every attribute; "return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';"
			bfr.Add_byte_space();
			bfr.Add_bry_escape_html(atr.Key_bry(), atr.Key_bgn(), atr.Key_end());
			bfr.Add_byte_eq().Add_byte_quote();
			byte[] val = atr.Val_as_bry();
			bfr.Add(val);							// TODO.XO:Sanitizer::encode; value is currently appended as-is
			bfr.Add_byte_quote();
		}
	}
}

View File

@@ -1,62 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.mws.utils; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
// String helpers ported from MediaWiki's StringUtils.
public class Xomw_string_utils {
	// Replaces, in-place, each occurrence of find with repl inside src[src_bgn..src_end),
	// except occurrences that come after a "<" that has not yet been followed by a ">".
	// find and repl must have the same length; otherwise an error is thrown.
	// REF:/includes/libs/StringUtils.php|replaceMarkup
	public static void Replace_markup(byte[] src, int src_bgn, int src_end, byte[] find, byte[] repl) {
		// PORTED: avoiding multiple regex calls / String creations; MW does the following:
		//   $placeholder = "\x00";
		//   Remove placeholder instances
		//     $text = str_replace( $placeholder, '', $text );
		//   Replace instances of the separator inside HTML-like tags with the placeholder
		//     $replacer = new DoubleReplacer( $search, $placeholder );
		//     $cleaned = StringUtils::delimiterReplaceCallback( '<', '>', $replacer->cb(), $text );
		//   Explode, then put the replaced separators back in
		//     $cleaned = str_replace( $search, $replace, $cleaned );
		//     $text = str_replace( $placeholder, $search, $cleaned );

		// same-length find / repl allows overwriting the bytes directly; EX: "!!" -> "||"
		int find_len = find.length;
		if (find_len != repl.length) throw Err_.new_wo_type("find and repl should be same length");
		byte find_bgn = find[0];
		boolean outside_tag = true;	// false between "<" and the next ">"
		for (int pos = src_bgn; pos < src_end; pos++) {
			byte cur = src[pos];
			// first byte is compared inline; remaining bytes of find are compared by Bry_.Match
			boolean find_matched = cur == find_bgn && Bry_.Match(src, pos + 1, pos + find_len, find, 1, find_len);
			if (find_matched && outside_tag) {
				Bry_.Set(src, pos, pos + find_len, repl);
				continue;
			}
			if (cur == Byte_ascii.Angle_bgn)
				outside_tag = false;
			else if (cur == Byte_ascii.Angle_end)
				outside_tag = true;
		}
	}
}

View File

@@ -1,47 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.mws.utils; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
import org.junit.*;
// Tests for Xomw_string_utils.Replace_markup; every test replaces "!!" with "||".
public class Xomw_string_utils__tst {
	private static final String Find = "!!", Repl = "||";
	private final Xomw_string_utils__fxt fxt = new Xomw_string_utils__fxt();
	@Test public void Basic()				{fxt.Test__replace_markup("a!!b", Find, Repl, "a||b");}
	// no occurrence of find
	@Test public void Missing()				{fxt.Test__replace_markup("abcd", Find, Repl, "abcd");}
	// find at end of source
	@Test public void Eos()					{fxt.Test__replace_markup("a!!", Find, Repl, "a||");}
	// find between "<" and ">" is left alone
	@Test public void Ignore()				{fxt.Test__replace_markup("a!!b<!!>!!c", Find, Repl, "a||b<!!>||c");}
	// extra unmatched "<"
	@Test public void Ignore__asym__lhs()	{fxt.Test__replace_markup("a!!b<!!<!!>!!c", Find, Repl, "a||b<!!<!!>||c");}
	// extra unmatched ">"
	// NOTE: should probably be "!!>!!>", but unmatched ">" are escaped to "&gt;"
	@Test public void Ignore__asym__rhs()	{fxt.Test__replace_markup("a!!b<!!>!!>!!c", Find, Repl, "a||b<!!>||>||c");}
}
// Test fixture for Xomw_string_utils.Replace_markup.
class Xomw_string_utils__fxt {
	// Runs Replace_markup over all of src_str and compares the modified bytes to expd.
	public void Test__replace_markup(String src_str, String find, String repl, String expd) {
		byte[] bry = Bry_.new_u8(src_str);
		byte[] find_bry = Bry_.new_a7(find);
		byte[] repl_bry = Bry_.new_a7(repl);
		Xomw_string_utils.Replace_markup(bry, 0, bry.length, find_bry, repl_bry);	// modifies bry in-place
		Tfds.Eq_str(expd, bry);
	}
}

View File

@@ -1,22 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.mws.wkrs; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
// Callback used by Xomw_hdr_wkr.Parse to report what it finds in the source.
// Implementations read positions from the wkr's getters (Src, Txt_bgn, Hdr_bgn, Hdr_num, ...).
public interface Xomw_hdr_cbk {
// called by Xomw_hdr_wkr once for each header, after the header's positions have been calculated
void On_hdr_seen(Xomw_parser_ctx pctx, Xomw_hdr_wkr wkr);
// called by Xomw_hdr_wkr once, when the end of the source is reached
void On_src_done(Xomw_parser_ctx pctx, Xomw_hdr_wkr wkr);
}

View File

@@ -1,48 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.mws.wkrs; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
// Header callback which writes html to a buffer: headers become <hN>...</hN>; all other text is copied.
public class Xomw_hdr_cbk__html implements Xomw_hdr_cbk {
	private final Bry_bfr bfr = Bry_bfr_.New();
	// buffer holding the generated html
	public Bry_bfr Bfr() {return bfr;}
	// Writes the text preceding the header, then the header itself.
	public void On_hdr_seen(Xomw_parser_ctx pctx, Xomw_hdr_wkr wkr) {
		byte[] src = wkr.Src();
		int hdr_bgn = wkr.Hdr_bgn();
		int txt_bgn = wkr.Txt_bgn();
		// pending text before header; EX: "abc\n==A==\n"; "\n==" seen -> add "abc"
		if (txt_bgn < hdr_bgn)
			bfr.Add_mid(src, txt_bgn, hdr_bgn);
		// "\n" before the header, except when header is at BOS
		if (hdr_bgn != Xomw_parser_ctx.Pos__bos)
			bfr.Add_byte_nl();
		// <h2>...</h2>
		int hdr_num = wkr.Hdr_num();
		Add_tag(Tag__lhs, hdr_num);
		bfr.Add_mid(wkr.Src(), wkr.Hdr_lhs_end(), wkr.Hdr_rhs_bgn());
		Add_tag(Tag__rhs, hdr_num);
	}
	// Writes whatever text remains between the last header and EOS.
	public void On_src_done(Xomw_parser_ctx pctx, Xomw_hdr_wkr wkr) {
		byte[] src = wkr.Src();
		int txt_bgn = wkr.Txt_bgn();
		int src_end = wkr.Src_end();
		if (txt_bgn == src_end) return;	// PERF: nothing to add if hdr ends at EOS
		bfr.Add_mid(src, txt_bgn, src_end);
	}
	// adds tag_bgn + hdr_num + ">"; EX: "<h" + 2 + ">"
	private void Add_tag(byte[] tag_bgn, int hdr_num) {
		bfr.Add(tag_bgn).Add_int_digits(1, hdr_num).Add(Byte_ascii.Angle_end_bry);
	}
	private static final byte[]
	  Tag__lhs = Bry_.new_a7("<h")
	, Tag__rhs = Bry_.new_a7("</h")
	;
}

View File

@@ -1,98 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.mws.wkrs; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
import gplx.core.btries.*; import gplx.xowa.langs.*;
// Scans wikitext for header lines ("\n==A==\n") and reports each one to a Xomw_hdr_cbk.
// The position getters below describe the most recently reported header and are meant to be read
// by the callback during On_hdr_seen / On_src_done.
public class Xomw_hdr_wkr {
	private Xomw_parser_ctx pctx;
	private Xomw_hdr_cbk cbk;
	public byte[] Src()			{return src;}			private byte[] src;
	public int Src_end()		{return src_end;}		private int src_end;
	public int Txt_bgn()		{return txt_bgn;}		private int txt_bgn;		// start of text not yet handed to the callback
	public int Hdr_bgn()		{return hdr_bgn;}		private int hdr_bgn;		// pos of "\n" before header (Parse's src_bgn if header is on the first scanned line)
	public int Hdr_end()		{return hdr_end;}		private int hdr_end;		// pos of "\n" after header, or src_end
	public int Hdr_num()		{return hdr_num;}		private int hdr_num;		// smaller of the lhs / rhs "=" counts
	public int Hdr_lhs_bgn()	{return hdr_lhs_bgn;}	private int hdr_lhs_bgn;
	public int Hdr_lhs_end()	{return hdr_lhs_end;}	private int hdr_lhs_end;
	public int Hdr_rhs_bgn()	{return hdr_rhs_bgn;}	private int hdr_rhs_bgn;
	public int Hdr_rhs_end()	{return hdr_rhs_end;}	private int hdr_rhs_end;
	// Scans src from src_bgn to src_end; calls cbk.On_hdr_seen for each header and cbk.On_src_done once at the end.
	// Pass Xomw_parser_ctx.Pos__bos as src_bgn to start at the beginning of the source.
	public void Parse(Xomw_parser_ctx pctx, byte[] src, int src_bgn, int src_end, Xomw_hdr_cbk cbk) {	// REF.MW: /includes/parser/Parser.php|doHeadings
		// init members
		this.pctx = pctx;
		this.src = src;
		this.src_end = src_end;
		this.cbk = cbk;
		// do loop
		int pos = src_bgn;
		this.txt_bgn = pos == Xomw_parser_ctx.Pos__bos ? 0 : pos;
		byte b = Byte_ascii.Nl;	// start position is treated as if it were a "\n"
		while (true) {
			int nxt = pos + 1;
			// check if (a) cur is \n; (b) nxt is '='
			if (	b == Byte_ascii.Nl
				&&	nxt < src_end
				&&	src[nxt] == Byte_ascii.Eq
				) {
				pos = Parse_hdr_nl(txt_bgn, pos, nxt + 1);
				this.txt_bgn = pos;
			}
			else
				++pos;
			// EOS; add all text after last "==\n"
			// NOTE: ">=" not "=="; if src_bgn is not BOS and src_bgn >= src_end, ++pos steps past src_end and src[pos] would be read out of range
			if (pos >= src_end) {
				cbk.On_src_done(pctx, this);
				break;
			}
			b = src[pos];
		}
	}
	// Calculates all header positions for a header whose line starts after nl_lhs, reports it to cbk,
	// and returns the position where scanning should continue (the "\n" after the header, or src_end).
	private int Parse_hdr_nl(int txt_bgn, int nl_lhs, int pos) {
		// calc lhs vars
		this.hdr_bgn = nl_lhs;
		this.hdr_lhs_bgn = nl_lhs == 0 ? 0 : nl_lhs + 1;	// set pos of 1st "="; note that "==" can be at BOS;
		this.hdr_lhs_end = Bry_find_.Find_fwd_while(src, pos, src_end, Byte_ascii.Eq);
		// calc rhs vars
		// NOTE(review): search starts at hdr_lhs_end + 1, so a "\n" located exactly at hdr_lhs_end is not considered; confirm this is intended
		int nl_rhs = Bry_find_.Find_fwd_or(src, Byte_ascii.Nl, hdr_lhs_end + 1, src_end, src_end);	// if no "\n", src_end is rest of text; EX: "\n==<text>EOS
		this.hdr_end = nl_rhs;
		this.hdr_rhs_end = Bry_find_.Find_bwd__skip_ws(src, nl_rhs, hdr_lhs_end);
		this.hdr_rhs_bgn = Bry_find_.Find_bwd__skip(src, hdr_rhs_end - 1, hdr_lhs_end, Byte_ascii.Eq);
		int hdr_lhs_len = hdr_lhs_end - hdr_lhs_bgn;
		int hdr_rhs_len = hdr_rhs_end - hdr_rhs_bgn;
		// handle rare situations like "\n====\n"
		if (hdr_rhs_len == 0) {
			// no separate rhs "=" run: split the lhs run in two; any odd "=" goes to the rhs
			int hdr_lhs_len_half = hdr_lhs_len / 2;
			hdr_rhs_len = hdr_lhs_len - hdr_lhs_len_half;
			hdr_lhs_len = hdr_lhs_len_half;
			this.hdr_lhs_end = hdr_lhs_bgn + hdr_lhs_len;
			this.hdr_rhs_bgn = hdr_lhs_end;
		}
		this.hdr_num = hdr_lhs_len < hdr_rhs_len ? hdr_lhs_len : hdr_rhs_len;
		cbk.On_hdr_seen(pctx, this);
		return nl_rhs;
	}
}
// for ( $i = 6; $i >= 1; --$i ) {
// $h = str_repeat( '=', $i );
// $text = preg_replace( "/^$h(.+)$h\\s*$/m", "<h$i>\\1</h$i>", $text );
// }

View File

@@ -1,40 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.mws.wkrs; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
import org.junit.*;
// Tests for Xomw_hdr_wkr, using the html callback to render headers.
public class Xomw_hdr_wkr_tst {
	private final Xomw_hdr_wkr_fxt fxt = new Xomw_hdr_wkr_fxt();
	@Test public void Basic() {
		// each entry is {wikitext, expected html}; entries run in order
		String[][] itms = new String[][]
		{ {"==A=="				, "<h2>A</h2>"}					// header only
		, {"abc\n==A==\ndef"	, "abc\n<h2>A</h2>\ndef"}		// text before and after header
		, {"abc"				, "abc"}						// no header
		, {"abc\ndef"			, "abc\ndef"}					// no header; multiple lines
		, {"abc\n=="			, "abc\n<h1></h1>"}				// "==" at end of source
		};
		for (int i = 0; i < itms.length; i++) {
			String[] itm = itms[i];
			fxt.Test__parse(itm[0], itm[1]);
		}
	}
}
// Test fixture: parses a String with Xomw_hdr_wkr + Xomw_hdr_cbk__html and compares the generated html.
class Xomw_hdr_wkr_fxt {
	private final Xomw_hdr_wkr wkr = new Xomw_hdr_wkr();
	private final Xomw_hdr_cbk__html cbk = new Xomw_hdr_cbk__html();
	private final Xomw_parser_ctx pctx = new Xomw_parser_ctx();
	// src_str: wikitext to parse; expd: expected html. src_str is also passed to the assertion as its message.
	public void Test__parse(String src_str, String expd) {
		byte[] src = Bry_.new_u8(src_str);
		wkr.Parse(pctx, src, -1, src.length, cbk);	// -1 is passed as the start position
		String actl = cbk.Bfr().To_str_and_clear();	// also resets the callback's buffer for the next call
		Tfds.Eq_str_lines(expd, actl, src_str);
	}
}

View File

@@ -22,6 +22,7 @@ public class Xop_uniq_mgr { // REF.MW:/parser/StripState.php
private final Bry_bfr key_bfr = Bry_bfr_.New_w_size(32);
private int idx = -1;
public void Clear() {idx = -1; general_trie.Clear();}
public byte[] Get(byte[] key) {return (byte[])general_trie.Match_exact(key, 0, key.length);}
public byte[] Add(byte[] val) { // "<b>" -> "\u007fUNIQ-item-1--QINU\u007f"
byte[] key = key_bfr
.Add(Bry__uniq__add__bgn)
@@ -30,10 +31,6 @@ public class Xop_uniq_mgr { // REF.MW:/parser/StripState.php
general_trie.Add_bry_bry(key, val);
return key;
}
public byte[] Get(byte[] key) {return (byte[])general_trie.Match_exact(key, 0, key.length);}
public byte[] Unstrip_both(byte[] src) {
return Convert(src);
}
public byte[] Convert(byte[] src) {
if (general_trie.Count() == 0) return src;