Mw_parse.Block: Add initial implementation

2025-06-13 12:54:14 +00:00 · 2017-01-11 23:45:11 -05:00 · 2017-01-11 23:45:11 -05:00 · b35a45657c
commit b35a45657c
parent 086cdea9fb
2 changed files with 252 additions and 0 deletions
--- a/400_xowa/src/gplx/langs/phps/Php_str_.java
+++ b/400_xowa/src/gplx/langs/phps/Php_str_.java
@ -17,6 +17,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 package gplx.langs.phps; import gplx.*; import gplx.langs.*;
 public class Php_str_ {
+	public static byte[] Substr(byte[] src, int bgn, int len) {return Bry_.Mid(src, bgn, bgn + len);}
 	public static int Strspn_fwd__byte(byte[] src, byte find, int bgn, int max, int src_len) {
 		if (max == -1) max = src_len;
 		int rv = 0;
--- a/400_xowa/src/gplx/xowa/parsers/mws/blocks/Xomw_block_wkr.java
+++ b/400_xowa/src/gplx/xowa/parsers/mws/blocks/Xomw_block_wkr.java
@ -0,0 +1,251 @@
+/*
+XOWA: the XOWA Offline Wiki Application
+Copyright (C) 2012 gnosygnu@gmail.com
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+package gplx.xowa.parsers.mws.blocks; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
+import gplx.langs.phps.*;
+/**
+ * Work-in-progress Java port of a PHP block-level parsing routine (presumably MediaWiki's doBlockLevels -- TODO confirm).
+ * The input is split on newlines and each line is handed to {@link #Split}, which appends its result to an internal buffer.
+ *
+ * Many steps of the PHP original are not ported yet: prefix detection, list open / close / next-item, paragraph tags
+ * and the regex matches. Their PHP source is kept as comments next to the stubbed Java so the port can be completed
+ * in place. Until then several branches below cannot be reached.
+ *
+ * All parsing state lives in instance fields, so one instance must not be used by two threads at once.
+ */
+public class Xomw_block_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.UNSAFE: caching for repeated calls
+	private final    Bry_bfr bfr = Bry_bfr_.New();	// output accumulator; cleared at the start of Do_block_levels and drained at its end
+	private byte[] last_prefix, last_section;	// last_prefix: list prefix of the previous line; last_section: name of the currently open section tag
+	private boolean line_start, dt_open, in_block_elem, para_stack, in_blockquote, in_pre = false;
+	private int prefix_len;	// length of the current line's list prefix; always 0 until strspn is ported
+	private int src_len;
+	/**
+	 * Runs the block-level pass over <code>src</code> and returns the accumulated output.
+	 *
+	 * @param src        text to process
+	 * @param line_start if false, the first line is copied to the output verbatim and processing starts with the second line
+	 * @return the contents of the internal buffer after all lines were processed; the buffer is cleared by this call
+	 */
+	public byte[] Do_block_levels(byte[] src, boolean line_start) {
+		this.src_len = src.length;
+		this.line_start = line_start;
+		// Parsing through the text line by line.  The main thing
+		// happening here is handling of block-level elements p, pre,
+		// and making lists from lines starting with * # : etc.
+		this.last_prefix = Bry_.Empty;
+		bfr.Clear();
+		this.dt_open = this.in_block_elem = false;
+		this.prefix_len = 0;
+		this.para_stack = false;
+		this.in_blockquote = false;
+		// NOTE(review): in_pre and last_section are not reset here, so they carry over from a previous call -- confirm this is intended
+
+		// PORTED.SPLIT: $textLines = StringUtils::explode("\n", $text);
+		// each line is delivered to Split(src, itm_bgn, itm_end) below
+		Bry_split_.Split(src, 0, src_len, Byte_ascii.Nl, Bool_.N, this);
+
+		// close any list levels still open after the last line; closeList is not ported yet, so only the separating newlines are written
+		while (prefix_len > 0) {
+			// bfr .= this.closeList(prefix2[prefix_len - 1]);
+			prefix_len--;
+			if (prefix_len > 0) {
+				bfr.Add_byte_nl();
+			}
+		}
+		// close the section tag that is still open, if any
+		if (Bry_.Len_gt_0(last_section)) {
+			bfr.Add_str_a7("</").Add(last_section).Add_str_a7(">");
+			this.last_section = Bry_.Empty;
+		}
+
+		// intentionally empty: only reads the state fields (presumably to avoid "unused" warnings while the port is incomplete)
+		if (dt_open || in_block_elem || para_stack || in_blockquote || in_pre) {
+		}
+		return bfr.To_bry_and_clear();
+	}
+	/**
+	 * Split callback: processes one line of <code>src</code> and appends the result to the internal buffer.
+	 * Corresponds to one iteration of the PHP <code>foreach ($textLines as $oLine)</code> loop.
+	 *
+	 * @param src     full text being split
+	 * @param itm_bgn index in <code>src</code> where the line begins
+	 * @param itm_end index in <code>src</code> where the line ends (used as the end argument of <code>Bry_.Mid</code>)
+	 * @return always <code>Bry_split_.Rv__ok</code>
+	 */
+	public int Split(byte[] src, int itm_bgn, int itm_end) {
+		// Fix up line_start
+		if (!line_start) {
+			// first line of a text that does not begin at a line start: copy as-is (no newline appended) and resume normal handling on the next line
+			bfr.Add_mid(src, itm_bgn, itm_end);
+			line_start = true;
+			return Bry_split_.Rv__ok;
+		}
+
+		// * = ul
+		// # = ol
+		// ; = dt
+		// : = dd
+		int last_prefix_len = last_prefix.length;
+		// both regex matches are not ported yet, so they are always false
+		boolean pre_close_match = false; //preg_match('/<\\/pre/i', $oLine);
+		boolean pre_open_match = false;  //preg_match('/<pre/i', $oLine);
+		byte[] prefix = null, prefix2 = null, t = null;
+		// If not in a <pre> element, scan for and figure out what prefixes are there.
+		if (!in_pre) {
+			// Multiple prefixes may abut each other for nested lists.
+			// strspn is not ported yet: prefix_len is always 0, so prefix is empty and t is the whole line
+			prefix_len = 0;// strspn($oLine, '*#:;');
+			prefix = Php_str_.Substr(src, itm_bgn, prefix_len);
+
+			// eh?
+			// ; and : are both from definition-lists, so they're equivalent
+			//  for the purposes of determining whether or not we need to open/close
+			//  elements.
+			prefix2 = Bry_.Replace(prefix, Byte_ascii.Semic, Byte_ascii.Colon);
+			t = Bry_.Mid(src, itm_bgn + prefix_len, itm_end);
+//				this.in_pre = (boolean)pre_open_match;
+		}
+		else {
+			// Don't interpret any other prefixes in preformatted text
+			prefix_len = 0;
+			prefix = prefix2 = Bry_.Empty;
+			t = Bry_.Mid(src, itm_bgn, itm_end);
+		}
+
+		// List generation
+		// NOTE(review): while prefix_len is hard-wired to 0 and last_prefix therefore stays empty, neither list branch below is entered
+		byte[] term = null, t2 = null;
+		int common_prefix_len = -1;	// stays -1 until getCommon is ported
+		if (prefix_len > 0 && Bry_.Eq(last_prefix, prefix2)) {
+			// Same as the last item, so no need to deal with nesting or opening stuff
+//				bfr .= this.nextItem(substr(prefix, -1));
+			para_stack = false;
+
+			if (prefix_len > 0 && prefix[prefix_len - 1] == Byte_ascii.Semic) {
+				// The one nasty exception: definition lists work like this:
+				// ; title : definition text
+				// So we check for : in the remainder text to split up the
+				// title and definition, without b0rking links.
+				term = t2 = Bry_.Empty;
+				// NOTE(review): with the findColonNoLinks check commented out, t is unconditionally replaced by the empty t2; restore the check when porting
+//					if (this.findColonNoLinks(t, term, t2) !== false) {
+					t = t2;
+					bfr.Add(term); // . this.nextItem(':');
+//					}
+			}
+		}
+		else if (prefix_len > 0 || last_prefix_len > 0) {
+			// We need to open or close prefixes, or both.
+
+			// Either open or close a level...
+//				common_prefix_len = this.getCommon(prefix, last_prefix);
+			para_stack = false;
+
+			// Close all the prefixes which aren't shared.
+			// NOTE(review): with common_prefix_len still -1 this loop counts last_prefix_len down to -1, one step further than a real common length (>= 0) would
+			while (common_prefix_len < last_prefix_len) {
+//					bfr .= this.closeList(last_prefix[last_prefix_len - 1]);
+				last_prefix_len--;
+			}
+//
+			// Continue the current prefix if appropriate.
+			if (prefix_len <= common_prefix_len && common_prefix_len > 0) {
+//					bfr .= this.nextItem(prefix[common_prefix_len - 1]);
+			}
+
+			// Open prefixes where appropriate.
+			if (Bry_.Len_gt_0(last_prefix) && prefix_len > common_prefix_len) {
+				bfr.Add_byte_nl();
+			}
+			while (prefix_len > common_prefix_len) {
+//					$char = substr(prefix, common_prefix_len, 1);
+//					bfr .= this.openList($char);
+//
+//					if (';' == $char) {
+//						// @todo FIXME: This is dupe of code above
+//						if (this.findColonNoLinks(t, term, t2) !== false) {
+//							t = t2;
+//							bfr .= term . this.nextItem(':');
+//						}
+//					}
+				++common_prefix_len;
+			}
+			if (prefix_len == 0 && Bry_.Len_gt_0(last_prefix)) {
+				bfr.Add_byte_nl();
+			}
+			last_prefix = prefix2;
+		}
+
+		// If we have no prefixes, go to paragraph mode.
+		if (0 == prefix_len) {
+			// No prefix (not in list)--go to paragraph mode
+			// XXX: use a stack for nestable elements like span, table and div
+			// neither regex is ported yet, so both flags are always false and the first branch below is not taken
+			boolean open_match = false, close_match = false;
+//				open_match = preg_match(
+//					'/(?:<table|<h1|<h2|<h3|<h4|<h5|<h6|<pre|<tr|'
+//						. '<p|<ul|<ol|<dl|<li|<\\/tr|<\\/td|<\\/th)/iS',
+//					t
+//				);
+//				close_match = preg_match(
+//					'/(?:<\\/table|<\\/h1|<\\/h2|<\\/h3|<\\/h4|<\\/h5|<\\/h6|'
+//						. '<td|<th|<\\/?blockquote|<\\/?div|<hr|<\\/pre|<\\/p|<\\/mw:|'
+//						. self::MARKER_PREFIX
+//						. '-pre|<\\/li|<\\/ul|<\\/ol|<\\/dl|<\\/?center)/iS',
+//					t
+//				);
+
+			if (open_match || close_match) {
+				para_stack = false;
+				// @todo bug 5718: paragraph closed
+//					bfr .= this.closeParagraph();
+				if (pre_open_match && !pre_close_match) {
+					this.in_pre = true;
+				}
+//					$bqOffset = 0;
+//					while (preg_match('/<(\\/?)blockquote[\s>]/i', t,
+//						$bqMatch, PREG_OFFSET_CAPTURE, $bqOffset)
+//					) {
+//						in_blockquote = !$bqMatch[1][0]; // is this a close tag?
+//						$bqOffset = $bqMatch[0][1] + strlen($bqMatch[0][0]);
+//					}
+				in_block_elem = !close_match;
+			}
+			else if (!in_block_elem && !this.in_pre) {
+				// length guard: an empty line gives an empty t, and t[0] would throw; PHP's substr($t, 0, 1) simply yields '' in that case
+				if (    t.length > 0 && Byte_ascii.Space == t[0]
+//						&& (last_section == 'pre' || trim(t) != '')
+					&& !in_blockquote
+				) {
+					// pre
+//						if (this.last_section !== 'pre') {
+						para_stack = false;
+//							bfr .= this.closeParagraph() . '<pre>';
+//							this.last_section = 'pre';
+//						}
+					// drop the leading space that marked the line as preformatted
+					t = Bry_.Mid(t, 1);
+				}
+				else {
+					// paragraph
+					// NOTE(review): the trim(t) == '' test is commented out, so the "blank line" and "non-blank line" halves below both run for every line
+//						if (trim(t) == '') {
+						if (para_stack) {
+//								bfr .= para_stack . '<br />';
+							para_stack = false;
+//								this.last_section = 'p';
+						}
+						else {
+//								if (this.last_section !== 'p') {
+//									bfr .= this.closeParagraph();
+//									this.last_section = '';
+//									para_stack = '<p>';
+//								}
+//								else {
+//									para_stack = '</p><p>';
+//								}
+						}
+//						}
+//						else {
+						if (para_stack) {
+//								bfr .= para_stack;
+							para_stack = false;
+//								this.last_section = 'p';
+						}
+//							else if (this.last_section !== 'p') {
+//								bfr .= this.closeParagraph() . '<p>';
+//								this.last_section = 'p';
+//							}
+//						}
+				}
+			}
+		}
+		// somewhere above we forget to get out of pre block (bug 785)
+		if (pre_close_match && this.in_pre) {
+			this.in_pre = false;
+		}
+		// emit the line text; lines outside a list are terminated with a newline
+		if (para_stack == false) {
+			bfr.Add(t);
+			if (prefix_len == 0) {
+				bfr.Add_byte_nl();
+			}
+		}
+
+		// intentionally empty: only reads the two locals (presumably to avoid "unused" warnings while the port is incomplete)
+		if (last_prefix_len == -1 || common_prefix_len == -1) {
+		}
+		return Bry_split_.Rv__ok;
+	}
+}