Mw_parse.Block: Add initial implementation

2024-10-27 20:34:16 +00:00 · 2017-01-11 23:45:11 -05:00 · 2017-01-11 23:45:11 -05:00 · b35a45657c
commit b35a45657c
parent 086cdea9fb
2 changed files with 252 additions and 0 deletions
--- a/400_xowa/src/gplx/langs/phps/Php_str_.java
+++ b/400_xowa/src/gplx/langs/phps/Php_str_.java
@ -17,6 +17,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 package gplx.langs.phps; import gplx.*; import gplx.langs.*;
 public class Php_str_ {
 	// Byte-array counterpart of PHP's substr($src, $bgn, $len): forwards to Bry_.Mid, passing bgn + len as the end position.
 	// NOTE(review): negative bgn / len (allowed by PHP substr) are not special-cased here -- confirm how Bry_.Mid treats them before relying on PHP-style offsets.
 	public static byte[] Substr(byte[] src, int bgn, int len) {
 		int end = bgn + len;
 		return Bry_.Mid(src, bgn, end);
 	}
 	public static int Strspn_fwd__byte(byte[] src, byte find, int bgn, int max, int src_len) {
 		if (max == -1) max = src_len;
 		int rv = 0;
--- a/400_xowa/src/gplx/xowa/parsers/mws/blocks/Xomw_block_wkr.java
+++ b/400_xowa/src/gplx/xowa/parsers/mws/blocks/Xomw_block_wkr.java
@ -0,0 +1,251 @@
 /*
 XOWA: the XOWA Offline Wiki Application
 Copyright (C) 2012 gnosygnu@gmail.com
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as
 published by the Free Software Foundation, either version 3 of the
 License, or (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU Affero General Public License for more details.
 You should have received a copy of the GNU Affero General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 package gplx.xowa.parsers.mws.blocks; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
 import gplx.langs.phps.*;
 /**
  * Work-in-progress port of MediaWiki's doBlockLevels: walks the text line by line and is meant to
  * handle the block-level elements p / pre and the lists opened by lines starting with * # : ;
  * <p>
  * Large parts of the PHP original are not ported yet; they are kept below as commented PHP so the
  * remaining work stays visible next to the Java that replaces it.
  * <p>
  * One instance keeps per-run state in fields and reuses a single buffer, so it must not be shared
  * between threads.
  */
 public class Xomw_block_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.UNSAFE: caching for repeated calls
 	private final    Bry_bfr bfr = Bry_bfr_.New();
 	private byte[] last_prefix, last_section;
 	private boolean line_start, dt_open, in_block_elem, para_stack, in_blockquote, in_pre = false;
 	private int prefix_len;
 	private int src_len;
 	/**
 	 * Runs the block-level pass over {@code src} and returns the generated bytes.
 	 *
 	 * @param src        text to process; split on "\n" and fed to {@link #Split} one line at a time
 	 * @param line_start when false, the first line is copied to the output untouched and processing
 	 *                   starts with the second line
 	 * @return the accumulated output; the internal buffer is cleared afterwards
 	 */
 	public byte[] Do_block_levels(byte[] src, boolean line_start) {
 		this.src_len = src.length;
 		this.line_start = line_start;
 		// Parsing through the text line by line.  The main thing
 		// happening here is handling of block-level elements p, pre,
 		// and making lists from lines starting with * # : etc.
 		// Reset ALL per-run state: this instance is reused across calls, so nothing may leak from a previous run.
 		this.last_prefix = Bry_.Empty;
 		this.last_section = Bry_.Empty;
 		bfr.Clear();
 		this.dt_open = this.in_block_elem = false;
 		this.prefix_len = 0;
 		this.para_stack = false;
 		this.in_blockquote = false;
 		this.in_pre = false;
 		// PORTED.SPLIT: $textLines = StringUtils::explode("\n", $text);
 		Bry_split_.Split(src, 0, src_len, Byte_ascii.Nl, Bool_.N, this);
 		// Close any list levels still open after the last line.
 		while (prefix_len > 0) {
 			// bfr .= this.closeList(prefix2[prefix_len - 1]);
 			prefix_len--;
 			if (prefix_len > 0) {
 				bfr.Add_byte_nl();
 			}
 		}
 		// Close the section (p / pre) that is still open, if any.
 		if (Bry_.Len_gt_0(last_section)) {
 			bfr.Add_str_a7("</").Add(last_section).Add_str_a7(">");
 			this.last_section = Bry_.Empty;
 		}
 		// NOTE(review): intentionally empty; presumably only keeps the not-yet-ported fields "read" -- confirm and remove once the port is complete.
 		if (dt_open || in_block_elem || para_stack || in_blockquote || in_pre) {
 		}
 		return bfr.To_bry_and_clear();
 	}
 	/**
 	 * Callback invoked by the splitter for each line of the text being processed.
 	 *
 	 * @param src     the full text passed to {@link #Do_block_levels}
 	 * @param itm_bgn start position of the current line within {@code src}
 	 * @param itm_end end position of the current line within {@code src}
 	 * @return always {@code Bry_split_.Rv__ok}
 	 */
 	public int Split(byte[] src, int itm_bgn, int itm_end) {
 		// Fix up line_start
 		if (!line_start) {
 			bfr.Add_mid(src, itm_bgn, itm_end);
 			line_start = true;
 			return Bry_split_.Rv__ok;
 		}
 		// * = ul
 		// # = ol
 		// ; = dt
 		// : = dd
 		int last_prefix_len = last_prefix.length;
 		boolean pre_close_match = false; //preg_match('/<\\/pre/i', $oLine);
 		boolean pre_open_match = false;  //preg_match('/<pre/i', $oLine);
 		byte[] prefix = null, prefix2 = null, t = null;
 		// If not in a <pre> element, scan for and figure out what prefixes are there.
 		if (!in_pre) {
 			// Multiple prefixes may abut each other for nested lists.
 			prefix_len = 0;// strspn($oLine, '*#:;');
 			prefix = Php_str_.Substr(src, itm_bgn, prefix_len);
 			// eh?
 			// ; and : are both from definition-lists, so they're equivalent
 			//  for the purposes of determining whether or not we need to open/close
 			//  elements.
 			prefix2 = Bry_.Replace(prefix, Byte_ascii.Semic, Byte_ascii.Colon);
 			t = Bry_.Mid(src, itm_bgn + prefix_len, itm_end);
 			// this.in_pre = (boolean)pre_open_match;
 		}
 		else {
 			// Don't interpret any other prefixes in preformatted text
 			prefix_len = 0;
 			prefix = prefix2 = Bry_.Empty;
 			t = Bry_.Mid(src, itm_bgn, itm_end);
 		}
 		// List generation
 		byte[] term = null, t2 = null;
 		int common_prefix_len = -1;
 		if (prefix_len > 0 && Bry_.Eq(last_prefix, prefix2)) {
 			// Same as the last item, so no need to deal with nesting or opening stuff
 			// bfr .= this.nextItem(substr(prefix, -1));
 			para_stack = false;
 			// prefix_len > 0 is already guaranteed by the enclosing condition
 			if (prefix[prefix_len - 1] == Byte_ascii.Semic) {
 				// The one nasty exception: definition lists work like this:
 				// ; title : definition text
 				// So we check for : in the remainder text to split up the
 				// title and definition, without b0rking links.
 				term = t2 = Bry_.Empty;
 				// if (this.findColonNoLinks(t, term, t2) !== false) {
 				t = t2;
 				bfr.Add(term); // . this.nextItem(':');
 				// }
 			}
 		}
 		else if (prefix_len > 0 || last_prefix_len > 0) {
 			// We need to open or close prefixes, or both.
 			// Either open or close a level...
 			// common_prefix_len = this.getCommon(prefix, last_prefix);
 			para_stack = false;
 			// Close all the prefixes which aren't shared.
 			while (common_prefix_len < last_prefix_len) {
 				// bfr .= this.closeList(last_prefix[last_prefix_len - 1]);
 				last_prefix_len--;
 			}
 			// Continue the current prefix if appropriate.
 			if (prefix_len <= common_prefix_len && common_prefix_len > 0) {
 				// bfr .= this.nextItem(prefix[common_prefix_len - 1]);
 			}
 			// Open prefixes where appropriate.
 			if (Bry_.Len_gt_0(last_prefix) && prefix_len > common_prefix_len) {
 				bfr.Add_byte_nl();
 			}
 			while (prefix_len > common_prefix_len) {
 				// $char = substr(prefix, common_prefix_len, 1);
 				// bfr .= this.openList($char);
 				//
 				// if (';' == $char) {
 				//	// @todo FIXME: This is dupe of code above
 				//	if (this.findColonNoLinks(t, term, t2) !== false) {
 				//		t = t2;
 				//		bfr .= term . this.nextItem(':');
 				//	}
 				// }
 				++common_prefix_len;
 			}
 			if (prefix_len == 0 && Bry_.Len_gt_0(last_prefix)) {
 				bfr.Add_byte_nl();
 			}
 			last_prefix = prefix2;
 		}
 		// If we have no prefixes, go to paragraph mode.
 		if (0 == prefix_len) {
 			// No prefix (not in list)--go to paragraph mode
 			// XXX: use a stack for nestable elements like span, table and div
 			boolean open_match = false, close_match = false;
 			// open_match = preg_match(
 			//	'/(?:<table|<h1|<h2|<h3|<h4|<h5|<h6|<pre|<tr|'
 			//		. '<p|<ul|<ol|<dl|<li|<\\/tr|<\\/td|<\\/th)/iS',
 			//	t
 			// );
 			// close_match = preg_match(
 			//	'/(?:<\\/table|<\\/h1|<\\/h2|<\\/h3|<\\/h4|<\\/h5|<\\/h6|'
 			//		. '<td|<th|<\\/?blockquote|<\\/?div|<hr|<\\/pre|<\\/p|<\\/mw:|'
 			//		. self::MARKER_PREFIX
 			//		. '-pre|<\\/li|<\\/ul|<\\/ol|<\\/dl|<\\/?center)/iS',
 			//	t
 			// );
 			if (open_match || close_match) {
 				para_stack = false;
 				// @todo bug 5718: paragraph closed
 				// bfr .= this.closeParagraph();
 				if (pre_open_match && !pre_close_match) {
 					this.in_pre = true;
 				}
 				// $bqOffset = 0;
 				// while (preg_match('/<(\\/?)blockquote[\s>]/i', t,
 				//	$bqMatch, PREG_OFFSET_CAPTURE, $bqOffset)
 				// ) {
 				//	in_blockquote = !$bqMatch[1][0]; // is this a close tag?
 				//	$bqOffset = $bqMatch[0][1] + strlen($bqMatch[0][0]);
 				// }
 				in_block_elem = !close_match;
 			}
 			else if (!in_block_elem && !this.in_pre) {
 				// Length guard: an empty line has no first byte to inspect; without it t[0] throws ArrayIndexOutOfBoundsException.
 				if (    t.length > 0
 					&& Byte_ascii.Space == t[0]
 					// && (last_section == 'pre' || trim(t) != '')
 					&& !in_blockquote
 				) {
 					// pre
 					// if (this.last_section !== 'pre') {
 					para_stack = false;
 					//	bfr .= this.closeParagraph() . '<pre>';
 					//	this.last_section = 'pre';
 					// }
 					t = Bry_.Mid(t, 1);	// drop the leading space
 				}
 				else {
 					// paragraph
 					// if (trim(t) == '') {
 					if (para_stack) {
 						// bfr .= para_stack . '<br />';
 						para_stack = false;
 						// this.last_section = 'p';
 					}
 					else {
 						// if (this.last_section !== 'p') {
 						//	bfr .= this.closeParagraph();
 						//	this.last_section = '';
 						//	para_stack = '<p>';
 						// }
 						// else {
 						//	para_stack = '</p><p>';
 						// }
 					}
 					// }
 					// else {
 					if (para_stack) {
 						// bfr .= para_stack;
 						para_stack = false;
 						// this.last_section = 'p';
 					}
 					// else if (this.last_section !== 'p') {
 					//	bfr .= this.closeParagraph() . '<p>';
 					//	this.last_section = 'p';
 					// }
 					// }
 				}
 			}
 		}
 		// somewhere above we forget to get out of pre block (bug 785)
 		if (pre_close_match && this.in_pre) {
 			this.in_pre = false;
 		}
 		if (para_stack == false) {
 			bfr.Add(t);
 			if (prefix_len == 0) {
 				bfr.Add_byte_nl();
 			}
 		}
 		// NOTE(review): intentionally empty; presumably a placeholder while the list handling above is unported -- confirm and remove later.
 		if (last_prefix_len == -1 || common_prefix_len == -1) {
 		}
 		return Bry_split_.Rv__ok;
 	}
 }