diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/XomwLinker.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/XomwLinker.java index 6a09f461e..a73ab8071 100644 --- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/XomwLinker.java +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/XomwLinker.java @@ -332,8 +332,7 @@ public class XomwLinker { // @since 1.20 // @return String HTML for an image, with links, wrappers, etc. // XO.MW:SYNC:1.29; DATE:2017-02-08 - public void makeImageLink(Bry_bfr bfr, Xomw_parser_ctx pctx, Xomw_parser parser, XomwTitle title, XomwFile file, Xomw_params_frame frameParams, Xomw_params_handler handlerParams, Object time, byte[] query, int widthOption) { - Xomw_parser_env env = parser.Env(); + public void makeImageLink(Bry_bfr bfr, Xomw_parser_env env, Xomw_parser_ctx pctx, XomwParserIface parser, XomwTitle title, XomwFile file, Xomw_params_frame frameParams, Xomw_params_handler handlerParams, Object time, byte[] query, int widthOption) { // XO.MW.HOOK:ImageBeforeProduceHTML if (file != null && !file.allowInlineDisplay()) { @@ -413,7 +412,7 @@ public class XomwLinker { // If a thumbnail width has not been provided, it is set // to the default user option as specified in Language*.php if (frameParams.align == Bry_.Empty) { - frameParams.align = parser.Env().Lang__align_end; + frameParams.align = env.Lang__align_end; } bfr.Add(prefix); makeThumbLink2(bfr, env, pctx, title, file, frameParams, handlerParams, time, query); @@ -482,7 +481,7 @@ public class XomwLinker { // @param Parser|null $parser // @return array // XO.MW:SYNC:1.29; DATE:2017-02-08 - private static void getImageLinkMTOParams(Xomw_params_mto rv, Xomw_params_frame frameParams, byte[] query, Xomw_parser parser) { + private static void getImageLinkMTOParams(Xomw_params_mto rv, Xomw_params_frame frameParams, byte[] query, XomwParserIface parser) { if (Php_utl_.isset(frameParams.link_url) && frameParams.link_url != Bry_.Empty) { rv.custom_url_link 
= frameParams.link_url; if (Php_utl_.isset(frameParams.link_target)) { diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/XomwLinkHolderArray.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/XomwLinkHolderArray.java index 742b0d0c0..e133b96f4 100644 --- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/XomwLinkHolderArray.java +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/XomwLinkHolderArray.java @@ -31,13 +31,13 @@ public class XomwLinkHolderArray { /** * @var Parser */ - private final Xomw_parser parent; + private final XomwParserIface parent; // protected $tempIdOffset; /** * @param Parser $parent */ - public XomwLinkHolderArray(Xomw_parser parent) { + public XomwLinkHolderArray(XomwParserIface parent) { this.parent = parent; } @@ -261,18 +261,22 @@ public class XomwLinkHolderArray { * * @param String $text */ - public void replace(Xomw_parser_bfr pbfr) { - this.replaceInternal(pbfr); + public boolean replace(Xomw_parser_bfr pbfr) { + return this.replaceInternal(pbfr); // $this->replaceInterwiki( $text ); } + public byte[] replace(Xomw_parser_bfr pbfr, byte[] text) { + boolean rv = this.replace(pbfr.Init(text)); + return rv ? 
pbfr.Trg().To_bry_and_clear() : pbfr.Src().To_bry_and_clear(); + } /** * Replace @gplx.Internal protected links * @param String $text */ - private void replaceInternal(Xomw_parser_bfr pbfr) { + private boolean replaceInternal(Xomw_parser_bfr pbfr) { if (internals.Len() == 0) { - return; + return false; } // SKIP:Replace_internals does db lookup to identify redlinks; @@ -430,6 +434,7 @@ public class XomwLinkHolderArray { // $replacer->cb(), // $text // ); + return true; } // /** diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/XomwParser.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/XomwParser.java index 36022114c..ddb270c53 100644 --- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/XomwParser.java +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/XomwParser.java @@ -14,14 +14,25 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt */ package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; +import gplx.core.btries.*; +import gplx.xowa.mediawiki.includes.htmls.*; import gplx.xowa.mediawiki.includes.linkers.*; +import gplx.xowa.mediawiki.includes.parsers.tables.*; +import gplx.xowa.mediawiki.includes.parsers.hrs.*; +import gplx.xowa.mediawiki.includes.parsers.doubleunders.*; +import gplx.xowa.mediawiki.includes.parsers.headings.*; +import gplx.xowa.mediawiki.includes.parsers.lnkis.*; +import gplx.xowa.mediawiki.includes.parsers.quotes.*; +import gplx.xowa.mediawiki.includes.parsers.lnkes.*; +import gplx.xowa.mediawiki.includes.parsers.magiclinks.*; +import gplx.xowa.mediawiki.includes.parsers.nbsps.*; /** * PHP Parser - Processes wiki markup (which uses a more user-friendly * syntax, such as "[[link]]" for making links), and provides a one-way * transformation of that wiki markup it into 
(X)HTML output / markup * (which in turn the browser understands, and can display). * -* There are seven main entry points into the Parser class: +* There are seven main entry points into the Parser cls: * * - Parser::parse() * produces HTML output @@ -54,7 +65,7 @@ import gplx.xowa.mediawiki.includes.linkers.*; * * @ingroup Parser */ -public class XomwParser { +public class XomwParser implements XomwParserIface { // /** // * Update this version number when the ParserOutput format // * changes in an incompatible way, so the parser cache @@ -162,17 +173,17 @@ public class XomwParser { // */ // public $mOutput; // public $mAutonumber; -// -// /** -// * @var StripState -// */ -// public $mStripState; -// + + /** + * @var StripState + */ + public XomwStripState mStripState = new XomwStripState(); + // public $mIncludeCount; -// /** -// * @var LinkHolderArray -// */ -// public $mLinkHolders; + /** + * @var LinkHolderArray + */ + public XomwLinkHolderArray mLinkHolders; private int mLinkID; // public $mIncludeSizes, $mPPNodeCount, $mGeneratedPPNodeCount, $mHighestExpansionDepth; @@ -188,12 +199,12 @@ public class XomwParser { // // # Temporary // # These are variables reset at least once per parse regardless of $clearState -// -// /** -// * @var ParserOptions -// */ -// public $mOptions; -// + + /** + * @var ParserOptions + */ + public XomwParserOptions mOptions = new XomwParserOptions(); + // /** // * @var Title // */ @@ -243,7 +254,27 @@ public class XomwParser { */ private Xomw_link_renderer mLinkRenderer; + // XOWA + private final Bry_bfr tmp_bfr = Bry_bfr_.New(); + private final Xomw_parser_env env = new Xomw_parser_env(); private final XomwSanitizer sanitizer = new XomwSanitizer(); + private final Xomw_table_wkr tableWkr; + private final Xomw_hr_wkr hrWkr = new Xomw_hr_wkr(); + private final Xomw_doubleunder_wkr doubleunderWkr = new Xomw_doubleunder_wkr(); + private final Xomw_heading_wkr headingWkr = new Xomw_heading_wkr(); + private final Xomw_heading_cbk__html 
headingWkrCbk = new Xomw_heading_cbk__html(); + private final Xomw_lnki_wkr lnkiWkr; + private final Xomw_quote_wkr quoteWkr; + private final Xomw_lnke_wkr lnkeWkr; + private final Xomw_magiclinks_wkr magiclinksWkr; + private final Xomw_nbsp_wkr nbspWkr = new Xomw_nbsp_wkr(); + private final Xomw_block_level_pass blockWkr = new Xomw_block_level_pass(); + private final Xomw_doubleunder_data doubleunder_data = new Xomw_doubleunder_data(); + private static Xomw_regex_space regex_space; + private static Xomw_regex_boundary regex_boundary; + private static Xomw_regex_url regex_url; + private final Btrie_rv trv = new Btrie_rv(); +// private int marker_index = 0; // /** // * @param array $conf @@ -270,7 +301,45 @@ public class XomwParser { // } // wfDebug(__CLASS__ . ": using preprocessor: {this.mPreprocessorClass}\n"); // } -// + private final Btrie_slim_mgr protocols_trie; + public Xomw_parser_env Env() {return env;} + public Xomw_lnki_wkr Lnki_wkr() {return lnkiWkr;} + public XomwLinker Linker() {return linker;} private final XomwLinker linker; + public byte[] Get_external_link_rel; + private static byte[] Atr__rel; + public XomwParser() { /* XO: builds the link renderer, linker and per-syntax workers; the static regex helpers are created once and shared by all parsers */ + synchronized (XomwParser.class) { /* always take the lock: the shared statics are non-volatile, so an unlocked null-check could observe regex_space set while regex_boundary / regex_url are still null */ + if (regex_space == null) { + regex_space = new Xomw_regex_space(); + regex_boundary = new Xomw_regex_boundary(regex_space); + regex_url = new Xomw_regex_url(regex_space); + Atr__rel = Bry_.new_a7("rel"); + } + } + Get_external_link_rel = Bry_.new_a7("nofollow"); /* instance field: must be assigned for every parser, not only the first one constructed */ + + this.mLinkRenderer = new Xomw_link_renderer(sanitizer); + this.linker = new XomwLinker(mLinkRenderer); + this.protocols_trie = Xomw_parser.Protocols__dflt(); + this.mLinkHolders = new XomwLinkHolderArray(this); + + this.tableWkr = new Xomw_table_wkr(tmp_bfr, sanitizer, mStripState); + this.quoteWkr = new Xomw_quote_wkr(tmp_bfr); + this.lnkiWkr = new Xomw_lnki_wkr(this, mLinkHolders, mLinkRenderer, protocols_trie, linker, quoteWkr, tmp_bfr, mStripState); + this.lnkeWkr = new Xomw_lnke_wkr(this, tmp_bfr, linker, sanitizer); + 
this.magiclinksWkr = new Xomw_magiclinks_wkr(this, sanitizer, linker, regex_boundary, regex_url); + } + public void Init_by_wiki(Xowe_wiki wiki) { + linker.Init_by_wiki(env, wiki.Lang().Lnki_trail_mgr().Trie()); + lnkeWkr.Init_by_wiki(protocols_trie, regex_url, regex_space); + lnkiWkr.Init_by_wiki(env, wiki); + doubleunderWkr.Init_by_wiki(doubleunder_data, wiki.Lang()); + magiclinksWkr.Init_by_wiki(); + } + public void Init_by_page(XomwTitle ttl) { +// pctx.Init_by_page(ttl); + } + // /** // * Reduce memory usage to reduce the impact of circular references // */ @@ -321,13 +390,13 @@ public class XomwParser { // // Hooks::run('ParserFirstCallInit', [ &$this ]); // } -// -// /** -// * Clear Parser state -// * -// * @private -// */ -// public function clearState() { + + /** + * Clear Parser state + * + * @private + */ + public void clearState() { // if (this.mFirstCall) { // this.firstCallInit(); // } @@ -335,7 +404,7 @@ public class XomwParser { // this.mOptions->registerWatcher([ this.mOutput, 'recordOption' ]); // this.mAutonumber = 0; // this.mIncludeCount = []; -// this.mLinkHolders = new LinkHolderArray($this); + this.mLinkHolders = new XomwLinkHolderArray(this); // this.mLinkID = 0; // this.mRevisionObject = this.mRevisionTimestamp = // this.mRevisionId = this.mRevisionUser = this.mRevisionSize = null; @@ -343,9 +412,9 @@ public class XomwParser { // this.mUser = null; // this.mLangLinkLanguages = []; // this.currentRevisionCache = null; -// -// this.mStripState = new StripState; -// + + this.mStripState = new XomwStripState(); + // # Clear these on every parse, T6549 // this.mTplRedirCache = this.mTplDomCache = []; // @@ -371,29 +440,34 @@ public class XomwParser { // this.mProfiler = new SectionProfiler(); // // Hooks::run('ParserClearState', [ &$this ]); -// } -// -// /** -// * Convert wikitext to HTML -// * Do not call this function recursively. 
-// * -// * @param String $text Text we want to parse -// * @param Title $title -// * @param ParserOptions $options -// * @param boolean $linestart -// * @param boolean $clearState -// * @param int $revid Number to pass in {{REVISIONID}} -// * @return ParserOutput A ParserOutput -// */ + } + + /** + * Convert wikitext to HTML + * Do not call this function recursively. + * + * @param String $text Text we want to parse + * @param Title $title + * @param ParserOptions $options + * @param boolean $linestart + * @param boolean $clearState + * @param int $revid Number to pass in {{REVISIONID}} + * @return ParserOutput A ParserOutput + */ // public function parse( // $text, Title $title, ParserOptions $options, // $linestart = true, $clearState = true, $revid = null // ) { -// /** -// * First pass--just handle sections, pass the rest off -// * to internalParse() which does all the real work. -// */ -// + public void parse(Xomw_parser_bfr pbfr, Xomw_parser_ctx pctx,byte[] text, XomwTitle title, XomwParserOptions options) {this.parse(pbfr, pctx, text, title, options, true, true, -1);} + public void parse(Xomw_parser_bfr pbfr, Xomw_parser_ctx pctx, + byte[] text, XomwTitle title, XomwParserOptions options, + boolean linestart, boolean clearState, int revid + ) { + /** + * First pass--just handle sections, pass the rest off + * to internalParse() which does all the real work. + */ + // global $wgShowHostnames; // // if ($clearState) { @@ -427,11 +501,11 @@ public class XomwParser { // Hooks::run('ParserBeforeStrip', [ &$this, &$text, &this.mStripState ]); // # No more strip! 
// Hooks::run('ParserAfterStrip', [ &$this, &$text, &this.mStripState ]); -// $text = this.internalParse($text); + this.internalParse(pbfr, pctx, text); // Hooks::run('ParserAfterParse', [ &$this, &$text, &this.mStripState ]); -// -// $text = this.internalParseHalfParsed($text, true, $linestart); -// + + this.internalParseHalfParsed(pbfr, pctx, true, linestart); + // /** // * A converted title will be provided in the output Object if title and // * content conversion are enabled, the article text does not contain @@ -579,8 +653,8 @@ public class XomwParser { // this.currentRevisionCache = null; // // return this.mOutput; -// } -// + } + // /** // * Half-parse wikitext to half-parsed HTML. This recursive parser entry point // * can be called from an extension tag hook. @@ -805,16 +879,17 @@ public class XomwParser { // public function getOutput() { // return this.mOutput; // } -// -// /** -// * Get the ParserOptions Object -// * -// * @return ParserOptions -// */ -// public function getOptions() { -// return this.mOptions; -// } -// + + /** + * Get the ParserOptions Object + * + * @return ParserOptions + */ + public XomwParserOptions getOptions() { + return this.mOptions; + } + + // /** // * Accessor/mutator for the ParserOptions Object // * @@ -1034,481 +1109,186 @@ public class XomwParser { // return $marker; // } // -// /** -// * parse the wiki syntax used to render tables -// * -// * @private -// * @param String $text -// * @return String -// */ -// public function doTableStuff($text) { -// -// $lines = StringUtils::explode("\n", $text); -// $out = ''; -// $td_history = []; # Is currently a td tag open? -// $last_tag_history = []; # Save history of last lag activated (td, th or caption) -// $tr_history = []; # Is currently a tr tag open? -// $tr_attributes = []; # history of tr attributes -// $has_opened_tr = []; # Did this table open a element? 
-// $indent_level = 0; # indent level of the table -// -// foreach ($lines as $outLine) { -// $line = trim($outLine); -// -// if ($line === '') { # empty line, go to next line -// $out .= $outLine . "\n"; -// continue; -// } -// -// $first_character = $line[0]; -// $first_two = substr($line, 0, 2); -// $matches = []; -// -// if (preg_match('/^(:*)\s*\{\|(.*)$/', $line, $matches)) { -// # First check if we are starting a new table -// $indent_level = strlen($matches[1]); -// -// $attributes = this.mStripState->unstripBoth($matches[2]); -// $attributes = Sanitizer::fixTagAttributes($attributes, 'table'); -// -// $outLine = str_repeat('
', $indent_level) . ""; -// array_push($td_history, false); -// array_push($last_tag_history, ''); -// array_push($tr_history, false); -// array_push($tr_attributes, ''); -// array_push($has_opened_tr, false); -// } elseif (count($td_history) == 0) { -// # Don't do any of the following -// $out .= $outLine . "\n"; -// continue; -// } elseif ($first_two === '|}') { -// # We are ending a table -// $line = '' . substr($line, 2); -// $last_tag = array_pop($last_tag_history); -// -// if (!array_pop($has_opened_tr)) { -// $line = "{$line}"; -// } -// -// if (array_pop($tr_history)) { -// $line = "{$line}"; -// } -// -// if (array_pop($td_history)) { -// $line = "{$line}"; -// } -// array_pop($tr_attributes); -// $outLine = $line . str_repeat('
', $indent_level); -// } elseif ($first_two === '|-') { -// # Now we have a table row -// $line = preg_replace('#^\|-+#', '', $line); -// -// # Whats after the tag is now only attributes -// $attributes = this.mStripState->unstripBoth($line); -// $attributes = Sanitizer::fixTagAttributes($attributes, 'tr'); -// array_pop($tr_attributes); -// array_push($tr_attributes, $attributes); -// -// $line = ''; -// $last_tag = array_pop($last_tag_history); -// array_pop($has_opened_tr); -// array_push($has_opened_tr, true); -// -// if (array_pop($tr_history)) { -// $line = ''; -// } -// -// if (array_pop($td_history)) { -// $line = "{$line}"; -// } -// -// $outLine = $line; -// array_push($tr_history, false); -// array_push($td_history, false); -// array_push($last_tag_history, ''); -// } elseif ($first_character === '|' -// || $first_character === '!' -// || $first_two === '|+' -// ) { -// # This might be cell elements, td, th or captions -// if ($first_two === '|+') { -// $first_character = '+'; -// $line = substr($line, 2); -// } else { -// $line = substr($line, 1); -// } -// -// // Implies both are valid for table headings. -// if ($first_character === '!') { -// $line = StringUtils::replaceMarkup('!!', '||', $line); -// } -// -// # Split up multiple cells on the same line. -// # FIXME : This can result in improper nesting of tags processed -// # by earlier parser steps. 
-// $cells = explode('||', $line); -// -// $outLine = ''; -// -// # Loop through each table cell -// foreach ($cells as $cell) { -// $previous = ''; -// if ($first_character !== '+') { -// $tr_after = array_pop($tr_attributes); -// if (!array_pop($tr_history)) { -// $previous = "\n"; -// } -// array_push($tr_history, true); -// array_push($tr_attributes, ''); -// array_pop($has_opened_tr); -// array_push($has_opened_tr, true); -// } -// -// $last_tag = array_pop($last_tag_history); -// -// if (array_pop($td_history)) { -// $previous = "\n{$previous}"; -// } -// -// if ($first_character === '|') { -// $last_tag = 'td'; -// } elseif ($first_character === '!') { -// $last_tag = 'th'; -// } elseif ($first_character === '+') { -// $last_tag = 'caption'; -// } else { -// $last_tag = ''; -// } -// -// array_push($last_tag_history, $last_tag); -// -// # A cell could contain both parameters and data -// $cell_data = explode('|', $cell, 2); -// -// # T2553: Note that a '|' inside an invalid link should not -// # be mistaken as delimiting cell parameters -// # Bug T153140: Neither should language converter markup. -// if (preg_match('/\[\[|-\{/', $cell_data[0]) === 1) { -// $cell = "{$previous}<{$last_tag}>{$cell}"; -// } elseif (count($cell_data) == 1) { -// $cell = "{$previous}<{$last_tag}>{$cell_data[0]}"; -// } else { -// $attributes = this.mStripState->unstripBoth($cell_data[0]); -// $attributes = Sanitizer::fixTagAttributes($attributes, $last_tag); -// $cell = "{$previous}<{$last_tag}{$attributes}>{$cell_data[1]}"; -// } -// -// $outLine .= $cell; -// array_push($td_history, true); -// } -// } -// $out .= $outLine . 
"\n"; -// } -// -// # Closing open td, tr && table -// while (count($td_history) > 0) { -// if (array_pop($td_history)) { -// $out .= "\n"; -// } -// if (array_pop($tr_history)) { -// $out .= "\n"; -// } -// if (!array_pop($has_opened_tr)) { -// $out .= "\n"; -// } -// -// $out .= "\n"; -// } -// -// # Remove trailing line-ending (b/c) -// if (substr($out, -1) === "\n") { -// $out = substr($out, 0, -1); -// } -// -// # special case: don't return empty table -// if ($out === "\n\n
") { -// $out = ''; -// } -// -// return $out; -// } -// -// /** -// * Helper function for parse() that transforms wiki markup into half-parsed -// * HTML. Only called for $mOutputType == self::OT_HTML. -// * -// * @private -// * -// * @param String $text The text to parse -// * @param boolean $isMain Whether this is being called from the main parse() function -// * @param PPFrame|boolean $frame A pre-processor frame -// * -// * @return String -// */ -// public function internalParse($text, $isMain = true, $frame = false) { -// -// $origText = $text; -// -// # Hook to suspend the parser in this state -// if (!Hooks::run('ParserBeforeInternalParse', [ &$this, &$text, &this.mStripState ])) { -// return $text; -// } -// -// # if $frame is provided, then use $frame for replacing any variables + /** + * parse the wiki syntax used to render tables + * + * @private + * @param String $text + * @return String + */ + // XO.MOVED to Xomw_table_Wkr + // public function doTableStuff($text) {} + + /** + * Helper function for parse() that transforms wiki markup into half-parsed + * HTML. Only called for $mOutputType == self::OT_HTML. 
+ * + * @private + * + * @param String $text The text to parse + * @param boolean $isMain Whether this is being called from the main parse() function + * @param PPFrame|boolean $frame A pre-processor frame + * + * @return String + */ + // isMain=true + public void internalParse(Xomw_parser_bfr pbfr, Xomw_parser_ctx pctx, byte[] text) {internalParse(pbfr, pctx, text, true, false);} + public void internalParse(Xomw_parser_bfr pbfr, Xomw_parser_ctx pctx, byte[] text, boolean isMain, boolean frame) { + pbfr.Init(text); +// $origText = text; + + // MW.HOOK:ParserBeforeInternalParse + + // if $frame is provided, then use $frame for replacing any variables // if ($frame) { -// # use frame depth to infer how include/noinclude tags should be handled -// # depth=0 means this is the top-level document; otherwise it's an included document + // use frame depth to infer how include/noinclude tags should be handled + // depth=0 means this is the top-level document; otherwise it's an included document +// boolean for_inclusion = false; // if (!$frame->depth) { // $flag = 0; // } else { // $flag = Parser::PTD_FOR_INCLUSION; // } -// $dom = this.preprocessToDom($text, $flag); -// $text = $frame->expand($dom); +// text = prepro_wkr.Preprocess_to_xml(text, for_inclusion); + // text = $frame->expand($dom); // } else { -// # if $frame is not provided, then use old-style replaceVariables -// $text = this.replaceVariables($text); +// // if $frame is not provided, then use old-style replaceVariables +// text = $this->replaceVariables(text); // } -// -// Hooks::run('InternalParseBeforeSanitize', [ &$this, &$text, &this.mStripState ]); -// $text = Sanitizer::removeHTMLtags( -// $text, + + // MW.HOOK:InternalParseBeforeSanitize +// text = Sanitizer::removeHTMLtags( +// text, // [ &$this, 'attributeStripCallback' ], // false, -// array_keys(this.mTransparentTagHooks), +// array_keys($this->mTransparentTagHooks), // [], // [ &$this, 'addTrackingCategory' ] // ); -// 
Hooks::run('InternalParseBeforeLinks', [ &$this, &$text, &this.mStripState ]); -// -// # Tables need to come after variable replacement for things to work -// # properly; putting them before other transformations should keep -// # exciting things like link expansions from showing up in surprising -// # places. -// $text = this.doTableStuff($text); -// -// $text = preg_replace('/(^|\n)-----*/', '\\1
', $text); -// -// $text = this.doDoubleUnderscore($text); -// -// $text = this.doHeadings($text); -// $text = this.replaceInternalLinks($text); -// $text = this.doAllQuotes($text); -// $text = this.replaceExternalLinks($text); -// -// # replaceInternalLinks may sometimes leave behind -// # absolute URLs, which have to be masked to hide them from replaceExternalLinks -// $text = str_replace(self::MARKER_PREFIX . 'NOPARSE', '', $text); -// -// $text = this.doMagicLinks($text); -// $text = this.formatHeadings($text, $origText, $isMain); -// -// return $text; -// } -// -// /** -// * Helper function for parse() that transforms half-parsed HTML into fully -// * parsed HTML. -// * -// * @param String $text -// * @param boolean $isMain -// * @param boolean $linestart -// * @return String -// */ -// private function internalParseHalfParsed($text, $isMain = true, $linestart = true) { -// $text = this.mStripState->unstripGeneral($text); -// -// if ($isMain) { -// Hooks::run('ParserAfterUnstrip', [ &$this, &$text ]); -// } -// -// # Clean up special characters, only run once, next-to-last before doBlockLevels -// $fixtags = [ -// # French spaces, last one Guillemet-left -// # only if there is something before the space -// '/(.) (?=\\?|:|;|!|%|\\302\\273)/' => '\\1 ', -// # french spaces, Guillemet-right -// '/(\\302\\253) /' => '\\1 ', -// '/ (!\s*important)/' => ' \\1', # Beware of CSS magic word !important, T13874. 
-// ]; -// $text = preg_replace(array_keys($fixtags), array_values($fixtags), $text); -// -// $text = this.doBlockLevels($text, $linestart); -// -// this.replaceLinkHolders($text); -// -// /** -// * The input doesn't get language converted if -// * a) It's disabled -// * b) Content isn't converted -// * c) It's a conversion table -// * d) it is an interface message (which is in the user language) -// */ -// if (!(this.mOptions->getDisableContentConversion() -// || isset(this.mDoubleUnderscores['nocontentconvert'])) + // MW.HOOK:InternalParseBeforeLinks + + // Tables need to come after variable replacement for things to work + // properly; putting them before other transformations should keep + // exciting things like link expansions from showing up in surprising + // places. + tableWkr.doTableStuff(pctx, pbfr); + + // $text = preg_replace('/(^|\n)-----*/', '\\1
', $text); + hrWkr.replaceHrs(pctx, pbfr); + + doubleunderWkr.doDoubleUnderscore(pctx, pbfr); + + headingWkr.doHeadings(pctx, pbfr, headingWkrCbk); + lnkiWkr.replaceInternalLinks(pbfr, env, pctx); + quoteWkr.doAllQuotes(pctx, pbfr); + lnkeWkr.replaceExternalLinks(pctx, pbfr); + + // replaceInternalLinks may sometimes leave behind + // absolute URLs, which have to be masked to hide them from replaceExternalLinks + Xomw_parser_bfr_.Replace(pbfr, Bry__marker__noparse, Bry_.Empty); // $text = str_replace(self::MARKER_PREFIX . 'NOPARSE', '', $text); + + magiclinksWkr.doMagicLinks(pctx, pbfr); +// $text = $this->formatHeadings($text, $origText, $isMain); + } + + /** + * Helper function for parse() that transforms half-parsed HTML into fully + * parsed HTML. + * + * @param String $text + * @param boolean $isMain + * @param boolean $linestart + * @return String + */ + public void internalParseHalfParsed(Xomw_parser_bfr pbfr, Xomw_parser_ctx pctx, boolean isMain, boolean lineStart) { + this.mStripState.unstripGeneral(pbfr); + + // MW.HOOK:ParserAfterUnstrip + + // Clean up special characters, only run once, next-to-last before doBlockLevels + // $fixtags = [ + // # French spaces, last one Guillemet-left + // # only if there is something before the space + // '/(.) (?=\\?|:|;|!|%|\\302\\273)/' => '\\1 ', + // # french spaces, Guillemet-right + // '/(\\302\\253) /' => '\\1 ', + // '/ (!\s*important)/' => ' \\1', # Beware of CSS magic word !important, T13874. 
+ // ]; + // $text = preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text ); + nbspWkr.doNbsp(pctx, pbfr); + + blockWkr.doBlockLevels(pctx, pbfr, lineStart); + + lnkiWkr.replaceLinkHolders(pbfr); + + // The input doesn't get language converted if + // a) It's disabled + // b) Content isn't converted + // c) It's a conversion table + // d) it is an interface message (which is in the user language) +// if ( !( $this->mOptions->getDisableContentConversion() +// || isset( $this->mDoubleUnderscores['nocontentconvert'] ) ) // ) { -// if (!this.mOptions->getInterfaceMessage()) { -// # The position of the convert() call should not be changed. it -// # assumes that the links are all replaced and the only thing left -// # is the mark. -// $text = this.getConverterLanguage()->convert($text); +// if ( !$this->mOptions->getInterfaceMessage() ) { +// // The position of the convert() call should not be changed. it +// // assumes that the links are all replaced and the only thing left +// // is the mark. 
+// $text = $this->getConverterLanguage()->convert( $text ); // } // } -// -// $text = this.mStripState->unstripNoWiki($text); -// -// if ($isMain) { -// Hooks::run('ParserBeforeTidy', [ &$this, &$text ]); -// } -// -// $text = this.replaceTransparentTags($text); -// $text = this.mStripState->unstripGeneral($text); -// -// $text = Sanitizer::normalizeCharReferences($text); -// -// if (MWTidy::isEnabled()) { -// if (this.mOptions->getTidy()) { -// $text = MWTidy::tidy($text); + + mStripState.unstripNoWiki(pbfr); + + // MW.HOOK:ParserBeforeTidy + +// $text = $this->replaceTransparentTags( $text ); + mStripState.unstripGeneral(pbfr); + + sanitizer.Normalize_char_references(pbfr); + +// if ( MWTidy::isEnabled() ) { +// if ( $this->mOptions->getTidy() ) { +// $text = MWTidy::tidy( $text ); // } -// } else { -// # attempt to sanitize at least some nesting problems -// # (T4702 and quite a few others) +// } +// else { +// // attempt to sanitize at least some nesting problems +// // (T4702 and quite a few others) // $tidyregs = [ -// # ''Something [http://www.cool.com cool''] --> -// # Somethingcool> +// // ''Something [http://www.cool.com cool''] --> +// // Somethingcool> // '/(<([bi])>)(<([bi])>)?([^<]*)(<\/?a[^<]*>)([^<]*)(<\/\\4>)?(<\/\\2>)/' => // '\\1\\3\\5\\8\\9\\6\\1\\3\\7\\8\\9', -// # fix up an anchor inside another anchor, only -// # at least for a single single nested link (T5695) +// // fix up an anchor inside another anchor, only +// // at least for a single single nested link (T5695) // '/(]+>)([^<]*)(]+>[^<]*)<\/a>(.*)<\/a>/' => // '\\1\\2\\3\\1\\4', -// # fix div inside inline elements- doBlockLevels won't wrap a line which -// # contains a div, so fix it up here; replace -// # div with escaped text +// // fix div inside inline elements- doBlockLevels won't wrap a line which +// // contains a div, so fix it up here; replace +// // div with escaped text // '/(<([aib]) [^>]+>)([^<]*)(]*)>)(.*)(<\/div>)([^<]*)(<\/\\2>)/' => // '\\1\\3<div\\5>\\6</div>\\8\\9', 
-// # remove empty italic or bold tag pairs, some -// # introduced by rules above +// // remove empty italic or bold tag pairs, some +// // introduced by rules above // '/<([bi])><\/\\1>/' => '', // ]; -// + // $text = preg_replace( -// array_keys($tidyregs), -// array_values($tidyregs), -// $text); +// array_keys( $tidyregs ), +// array_values( $tidyregs ), +// $text ); // } -// -// if ($isMain) { -// Hooks::run('ParserAfterTidy', [ &$this, &$text ]); -// } -// -// return $text; -// } -// -// /** -// * Replace special strings like "ISBN xxx" and "RFC xxx" with -// * magic external links. -// * -// * DML -// * @private -// * -// * @param String $text -// * -// * @return String -// */ -// public function doMagicLinks($text) { -// $prots = wfUrlProtocolsWithoutProtRel(); -// $urlChar = self::EXT_LINK_URL_CLASS; -// $addr = self::EXT_LINK_ADDR; -// $space = self::SPACE_NOT_NL; # non-newline space -// $spdash = "(?:-|$space)"; # a dash or a non-newline space -// $spaces = "$space++"; # possessive match of 1 or more spaces -// $text = preg_replace_callback( -// '!(?: # Start cases -// (].*?) | # m[1]: Skip link text -// (<.*?>) | # m[2]: Skip stuff inside -// # HTML elements' . " -// (\b(?i:$prots)($addr$urlChar*)) | # m[3]: Free external links -// # m[4]: Post-protocol path -// \b(?:RFC|PMID) $spaces # m[5]: RFC or PMID, capture number -// ([0-9]+)\b | -// \bISBN $spaces ( # m[6]: ISBN, capture number -// (?: 97[89] $spdash?)? # optional 13-digit ISBN prefix -// (?: [0-9] $spdash?){9} # 9 digits with opt. 
delimiters -// [0-9Xx] # check digit -// )\b -// )!xu", [ &$this, 'magicLinkCallback' ], $text); -// return $text; -// } -// -// /** -// * @throws MWException -// * @param array $m -// * @return HTML|String -// */ -// public function magicLinkCallback($m) { -// if (isset($m[1]) && $m[1] !== '') { -// # Skip anchor -// return $m[0]; -// } elseif (isset($m[2]) && $m[2] !== '') { -// # Skip HTML element -// return $m[0]; -// } elseif (isset($m[3]) && $m[3] !== '') { -// # Free external link -// return this.makeFreeExternalLink($m[0], strlen($m[4])); -// } elseif (isset($m[5]) && $m[5] !== '') { -// # RFC or PMID -// if (substr($m[0], 0, 3) === 'RFC') { -// if (!this.mOptions->getMagicRFCLinks()) { -// return $m[0]; -// } -// $keyword = 'RFC'; -// $urlmsg = 'rfcurl'; -// $cssClass = 'mw-magiclink-rfc'; -// $trackingCat = 'magiclink-tracking-rfc'; -// $id = $m[5]; -// } elseif (substr($m[0], 0, 4) === 'PMID') { -// if (!this.mOptions->getMagicPMIDLinks()) { -// return $m[0]; -// } -// $keyword = 'PMID'; -// $urlmsg = 'pubmedurl'; -// $cssClass = 'mw-magiclink-pmid'; -// $trackingCat = 'magiclink-tracking-pmid'; -// $id = $m[5]; -// } else { -// throw new MWException(__METHOD__ . ': unrecognised match type "' . -// substr($m[0], 0, 20) . 
'"'); -// } -// $url = wfMessage($urlmsg, $id)->inContentLanguage()->text(); -// this.addTrackingCategory($trackingCat); -// return Linker::makeExternalLink($url, "{$keyword} {$id}", true, $cssClass, [], this.mTitle); -// } elseif (isset($m[6]) && $m[6] !== '' -// && this.mOptions->getMagicISBNLinks() -// ) { -// # ISBN -// $isbn = $m[6]; -// $space = self::SPACE_NOT_NL; # non-newline space -// $isbn = preg_replace("/$space/", ' ', $isbn); -// $num = strtr($isbn, [ -// '-' => '', -// ' ' => '', -// 'x' => 'X', -// ]); -// this.addTrackingCategory('magiclink-tracking-isbn'); -// return this.getLinkRenderer()->makeKnownLink( -// SpecialPage::getTitleFor('Booksources', $num), -// "ISBN $isbn", -// [ -// 'class' => '@gplx.Internal protected mw-magiclink-isbn', -// 'title' => false // suppress title attribute -// ] -// ); -// } else { -// return $m[0]; -// } -// } -// + +// // MW.HOOK:ParserAfterTidy + } + + + // XO.MW:MOVED + // public function doMagicLinks($text) {} + + // XO.MW:MOVED + // public function magicLinkCallback($m) {} + // /** // * Make a free external link, given a user-supplied URL // * @@ -1585,224 +1365,15 @@ public class XomwParser { // } // return $text . $trail; // } -// -// /** -// * Parse headers and return html -// * -// * @private -// * -// * @param String $text -// * -// * @return String -// */ -// public function doHeadings($text) { -// for ($i = 6; $i >= 1; --$i) { -// $h = str_repeat('=', $i); -// $text = preg_replace("/^$h(.+)$h\\s*$/m", "\\1", $text); -// } -// return $text; -// } -// -// /** -// * Replace single quotes with HTML markup -// * @private -// * -// * @param String $text -// * -// * @return String The altered text -// */ -// public function doAllQuotes($text) { -// $outtext = ''; -// $lines = StringUtils::explode("\n", $text); -// foreach ($lines as $line) { -// $outtext .= this.doQuotes($line) . 
"\n"; -// } -// $outtext = substr($outtext, 0, -1); -// return $outtext; -// } -// -// /** -// * Helper function for doAllQuotes() -// * -// * @param String $text -// * -// * @return String -// */ -// public function doQuotes($text) { -// $arr = preg_split("/(''+)/", $text, -1, PREG_SPLIT_DELIM_CAPTURE); -// $countarr = count($arr); -// if ($countarr == 1) { -// return $text; -// } -// -// // First, do some preliminary work. This may shift some apostrophes from -// // being mark-up to being text. It also counts the number of occurrences -// // of bold and italics mark-ups. -// $numbold = 0; -// $numitalics = 0; -// for ($i = 1; $i < $countarr; $i += 2) { -// $thislen = strlen($arr[$i]); -// // If there are ever four apostrophes, assume the first is supposed to -// // be text, and the remaining three constitute mark-up for bold text. -// // (T15227: ''''foo'''' turns into ' ''' foo ' ''') -// if ($thislen == 4) { -// $arr[$i - 1] .= "'"; -// $arr[$i] = "'''"; -// $thislen = 3; -// } elseif ($thislen > 5) { -// // If there are more than 5 apostrophes in a row, assume they're all -// // text except for the last 5. -// // (T15227: ''''''foo'''''' turns into ' ''''' foo ' ''''') -// $arr[$i - 1] .= str_repeat("'", $thislen - 5); -// $arr[$i] = "'''''"; -// $thislen = 5; -// } -// // Count the number of occurrences of bold and italics mark-ups. -// if ($thislen == 2) { -// $numitalics++; -// } elseif ($thislen == 3) { -// $numbold++; -// } elseif ($thislen == 5) { -// $numitalics++; -// $numbold++; -// } -// } -// -// // If there is an odd number of both bold and italics, it is likely -// // that one of the bold ones was meant to be an apostrophe followed -// // by italics. Which one we cannot know for certain, but it is more -// // likely to be one that has a single-letter word before it. 
-// if (($numbold % 2 == 1) && ($numitalics % 2 == 1)) { -// $firstsingleletterword = -1; -// $firstmultiletterword = -1; -// $firstspace = -1; -// for ($i = 1; $i < $countarr; $i += 2) { -// if (strlen($arr[$i]) == 3) { -// $x1 = substr($arr[$i - 1], -1); -// $x2 = substr($arr[$i - 1], -2, 1); -// if ($x1 === ' ') { -// if ($firstspace == -1) { -// $firstspace = $i; -// } -// } elseif ($x2 === ' ') { -// $firstsingleletterword = $i; -// // if $firstsingleletterword is set, we don't -// // look at the other options, so we can bail early. -// break; -// } else { -// if ($firstmultiletterword == -1) { -// $firstmultiletterword = $i; -// } -// } -// } -// } -// -// // If there is a single-letter word, use it! -// if ($firstsingleletterword > -1) { -// $arr[$firstsingleletterword] = "''"; -// $arr[$firstsingleletterword - 1] .= "'"; -// } elseif ($firstmultiletterword > -1) { -// // If not, but there's a multi-letter word, use that one. -// $arr[$firstmultiletterword] = "''"; -// $arr[$firstmultiletterword - 1] .= "'"; -// } elseif ($firstspace > -1) { -// // ... otherwise use the first one that has neither. -// // (notice that it is possible for all three to be -1 if, for example, -// // there is only one pentuple-apostrophe in the line) -// $arr[$firstspace] = "''"; -// $arr[$firstspace - 1] .= "'"; -// } -// } -// -// // Now let's actually convert our apostrophic mush to HTML! -// $output = ''; -// $buffer = ''; -// $state = ''; -// $i = 0; -// foreach ($arr as $r) { -// if (($i % 2) == 0) { -// if ($state === 'both') { -// $buffer .= $r; -// } else { -// $output .= $r; -// } -// } else { -// $thislen = strlen($r); -// if ($thislen == 2) { -// if ($state === 'i') { -// $output .= ''; -// $state = ''; -// } elseif ($state === 'bi') { -// $output .= ''; -// $state = 'b'; -// } elseif ($state === 'ib') { -// $output .= ''; -// $state = 'b'; -// } elseif ($state === 'both') { -// $output .= '' . $buffer . 
''; -// $state = 'b'; -// } else { // $state can be 'b' or '' -// $output .= ''; -// $state .= 'i'; -// } -// } elseif ($thislen == 3) { -// if ($state === 'b') { -// $output .= ''; -// $state = ''; -// } elseif ($state === 'bi') { -// $output .= ''; -// $state = 'i'; -// } elseif ($state === 'ib') { -// $output .= ''; -// $state = 'i'; -// } elseif ($state === 'both') { -// $output .= '' . $buffer . ''; -// $state = 'i'; -// } else { // $state can be 'i' or '' -// $output .= ''; -// $state .= 'b'; -// } -// } elseif ($thislen == 5) { -// if ($state === 'b') { -// $output .= ''; -// $state = 'i'; -// } elseif ($state === 'i') { -// $output .= ''; -// $state = 'b'; -// } elseif ($state === 'bi') { -// $output .= ''; -// $state = ''; -// } elseif ($state === 'ib') { -// $output .= ''; -// $state = ''; -// } elseif ($state === 'both') { -// $output .= '' . $buffer . ''; -// $state = ''; -// } else { // ($state == '') -// $buffer = ''; -// $state = 'both'; -// } -// } -// } -// $i++; -// } -// // Now close all remaining tags. Notice that the order is important. -// if ($state === 'b' || $state === 'ib') { -// $output .= ''; -// } -// if ($state === 'i' || $state === 'bi' || $state === 'ib') { -// $output .= ''; -// } -// if ($state === 'bi') { -// $output .= ''; -// } -// // There might be lonely ''''', so make sure we have a buffer -// if ($state === 'both' && $buffer) { -// $output .= '' . $buffer . ''; -// } -// return $output; -// } -// + + // XO.MW:MOVED + // public function doHeadings($text) {} + + // XO.MW:MOVED + // public function doAllQuotes($text) {} + + // XO.MW:MOVED + // public function doQuotes($text) {} // /** // * Replace external links (REL) // * @@ -1905,38 +1476,27 @@ public class XomwParser { // } // return null; // } -// -// /** -// * Get an associative array of additional HTML attributes appropriate for a -// * particular external link. 
This currently may include rel => nofollow -// * (depending on configuration, namespace, and the URL's domain) and/or a -// * target attribute (depending on configuration). -// * -// * @param String $url URL to extract the domain from for rel => -// * nofollow if appropriate -// * @return array Associative array of HTML attributes -// */ -// public function getExternalLinkAttribs($url) { -// $attribs = []; -// $rel = self::getExternalLinkRel($url, this.mTitle); -// -// $target = this.mOptions->getExternalLinkTarget(); -// if ($target) { -// $attribs['target'] = $target; -// if (!in_array($target, [ '_self', '_parent', '_top' ])) { -// // T133507. New windows can navigate parent cross-origin. -// // Including noreferrer due to lacking browser -// // support of noopener. Eventually noreferrer should be removed. -// if ($rel !== '') { -// $rel .= ' '; -// } -// $rel .= 'noreferrer noopener'; -// } -// } -// $attribs['rel'] = $rel; -// return $attribs; -// } -// + + /** + * Get an associative array of additional HTML attributes appropriate for a + * particular external link. This currently may include rel => nofollow + * (depending on configuration, namespace, and the URL's domain) and/or a + * target attribute (depending on configuration). + * + * @param String $url URL to extract the domain from for rel => + * nofollow if appropriate + * @return array Associative array of HTML attributes + */ + public Xomw_atr_mgr getExternalLinkAttribs(Xomw_atr_mgr atrs) { + atrs.Clear(); + byte[] rel = Get_external_link_rel; + + // XO.MW.UNSUPPORTED: XO will assume target is blank; MW will set target of "_blank", "_self", etc. 
depending on global opt + // $target = $this->mOptions->getExternalLinkTarget(); + atrs.Add(Atr__rel, rel); + return atrs; + } + // /** // * Replace unusual escape codes in a URL with their equivalent characters // * @@ -2054,329 +1614,13 @@ public class XomwParser { // } // return $text; // } -// -// /** -// * Process [[ ]] wikilinks -// * -// * @param String $s -// * -// * @return String Processed text -// * -// * @private -// */ -// public function replaceInternalLinks($s) { -// this.mLinkHolders->merge(this.replaceInternalLinks2($s)); -// return $s; -// } -// -// /** -// * Process [[ ]] wikilinks (RIL) -// * @param String $s -// * @throws MWException -// * @return LinkHolderArray -// * -// * @private -// */ -// public function replaceInternalLinks2(&$s) { -// global $wgExtraInterlanguageLinkPrefixes; -// -// static $tc = false, $e1, $e1_img; -// # the % is needed to support urlencoded titles as well -// if (!$tc) { -// $tc = Title::legalChars() . '#%'; -// # Match a link having the form [[namespace:link|alternate]]trail -// $e1 = "/^([{$tc}]+)(?:\\|(.+?))?]](.*)\$/sD"; -// # Match cases where there is no "]]", which might still be images -// $e1_img = "/^([{$tc}]+)\\|(.*)\$/sD"; -// } -// -// $holders = new LinkHolderArray($this); -// -// # split the entire text String on occurrences of [[ -// $a = StringUtils::explode('[[', ' ' . $s); -// # get the first element (all text up to first [[), and remove the space we added -// $s = $a->current(); -// $a->next(); -// $line = $a->current(); # Workaround for broken ArrayIterator::next() that returns "void" -// $s = substr($s, 1); -// -// $useLinkPrefixExtension = this.getTargetLanguage()->linkPrefixExtension(); -// $e2 = null; -// if ($useLinkPrefixExtension) { -// # Match the end of a line for a word that's not followed by whitespace, -// # e.g. 
in the case of 'The Arab al[[Razi]]', 'al' will be matched -// global $wgContLang; -// $charset = $wgContLang->linkPrefixCharset(); -// $e2 = "/^((?>.*[^$charset]|))(.+)$/sDu"; -// } -// -// if (is_null(this.mTitle)) { -// throw new MWException(__METHOD__ . ": \this.mTitle is null\n"); -// } -// $nottalk = !this.mTitle->isTalkPage(); -// -// if ($useLinkPrefixExtension) { -// $m = []; -// if (preg_match($e2, $s, $m)) { -// $first_prefix = $m[2]; -// } else { -// $first_prefix = false; -// } -// } else { -// $prefix = ''; -// } -// -// $useSubpages = this.areSubpagesAllowed(); -// -// // @codingStandardsIgnoreStart Squiz.WhiteSpace.SemicolonSpacing.Incorrect -// # Loop for each link -// for (; $line !== false && $line !== null; $a->next(), $line = $a->current()) { -// // @codingStandardsIgnoreEnd -// -// # Check for excessive memory usage -// if ($holders->isBig()) { -// # Too big -// # Do the existence check, replace the link holders and clear the array -// $holders->replace($s); -// $holders->clear(); -// } -// -// if ($useLinkPrefixExtension) { -// if (preg_match($e2, $s, $m)) { -// $prefix = $m[2]; -// $s = $m[1]; -// } else { -// $prefix = ''; -// } -// # first link -// if ($first_prefix) { -// $prefix = $first_prefix; -// $first_prefix = false; -// } -// } -// -// $might_be_img = false; -// -// if (preg_match($e1, $line, $m)) { # page with normal text or alt -// $text = $m[2]; -// # If we get a ] at the beginning of $m[3] that means we have a link that's something like: -// # [[Image:Foo.jpg|[http://example.com desc]]] <- having three ] in a row fucks up, -// # the real problem is with the $e1 regex -// # See T1500. -// # Still some problems for cases where the ] is meant to be outside punctuation, -// # and no image is in sight. See T4095. 
-// if ($text !== '' -// && substr($m[3], 0, 1) === ']' -// && strpos($text, '[') !== false -// ) { -// $text .= ']'; # so that replaceExternalLinks($text) works later -// $m[3] = substr($m[3], 1); -// } -// # fix up urlencoded title texts -// if (strpos($m[1], '%') !== false) { -// # Should anchors '#' also be rejected? -// $m[1] = str_replace([ '<', '>' ], [ '<', '>' ], rawurldecode($m[1])); -// } -// $trail = $m[3]; -// } elseif (preg_match($e1_img, $line, $m)) { -// # Invalid, but might be an image with a link in its caption -// $might_be_img = true; -// $text = $m[2]; -// if (strpos($m[1], '%') !== false) { -// $m[1] = str_replace([ '<', '>' ], [ '<', '>' ], rawurldecode($m[1])); -// } -// $trail = ""; -// } else { # Invalid form; output directly -// $s .= $prefix . '[[' . $line; -// continue; -// } -// -// $origLink = ltrim($m[1], ' '); -// -// # Don't allow @gplx.Internal protected links to pages containing -// # PROTO: where PROTO is a valid URL protocol; these -// # should be external links. -// if (preg_match('/^(?i:' . this.mUrlProtocols . ')/', $origLink)) { -// $s .= $prefix . '[[' . $line; -// continue; -// } -// -// # Make subpage if necessary -// if ($useSubpages) { -// $link = this.maybeDoSubpageLink($origLink, $text); -// } else { -// $link = $origLink; -// } -// -// $noforce = (substr($origLink, 0, 1) !== ':'); -// if (!$noforce) { -// # Strip off leading ':' -// $link = substr($link, 1); -// } -// -// $unstrip = this.mStripState->unstripNoWiki($link); -// $nt = is_string($unstrip) ? Title::newFromText($unstrip) : null; -// if ($nt === null) { -// $s .= $prefix . '[[' . 
$line; -// continue; -// } -// -// $ns = $nt->getNamespace(); -// $iw = $nt->getInterwiki(); -// -// if ($might_be_img) { # if this is actually an invalid link -// if ($ns == NS_FILE && $noforce) { # but might be an image -// $found = false; -// while (true) { -// # look at the next 'line' to see if we can close it there -// $a->next(); -// $next_line = $a->current(); -// if ($next_line === false || $next_line === null) { -// break; -// } -// $m = explode(']]', $next_line, 3); -// if (count($m) == 3) { -// # the first ]] closes the inner link, the second the image -// $found = true; -// $text .= "[[{$m[0]}]]{$m[1]}"; -// $trail = $m[2]; -// break; -// } elseif (count($m) == 2) { -// # if there's exactly one ]] that's fine, we'll keep looking -// $text .= "[[{$m[0]}]]{$m[1]}"; -// } else { -// # if $next_line is invalid too, we need look no further -// $text .= '[[' . $next_line; -// break; -// } -// } -// if (!$found) { -// # we couldn't find the end of this imageLink, so output it raw -// # but don't ignore what might be perfectly normal links in the text we've examined -// $holders->merge(this.replaceInternalLinks2($text)); -// $s .= "{$prefix}[[$link|$text"; -// # note: no $trail, because without an end, there *is* no trail -// continue; -// } -// } else { # it's not an image, so output it raw -// $s .= "{$prefix}[[$link|$text"; -// # note: no $trail, because without an end, there *is* no trail -// continue; -// } -// } -// -// $wasblank = ($text == ''); -// if ($wasblank) { -// $text = $link; -// } else { -// # T6598 madness. 
Handle the quotes only if they come from the alternate part -// # [[Lista d''e paise d''o munno]] -> Lista d''e paise d''o munno -// # [[Criticism of Harry Potter|Criticism of ''Harry Potter'']] -// # -> Criticism of Harry Potter -// $text = this.doQuotes($text); -// } -// -// # Link not escaped by : , create the various objects -// if ($noforce && !$nt->wasLocalInterwiki()) { -// # Interwikis -// if ( -// $iw && this.mOptions->getInterwikiMagic() && $nottalk && ( -// Language::fetchLanguageName($iw, null, 'mw') || -// in_array($iw, $wgExtraInterlanguageLinkPrefixes) -// ) -// ) { -// # T26502: filter duplicates -// if (!isset(this.mLangLinkLanguages[$iw])) { -// this.mLangLinkLanguages[$iw] = true; -// this.mOutput->addLanguageLink($nt->getFullText()); -// } -// -// $s = rtrim($s . $prefix); -// $s .= trim($trail, "\n") == '' ? '': $prefix . $trail; -// continue; -// } -// -// if ($ns == NS_FILE) { -// if (!wfIsBadImage($nt->getDBkey(), this.mTitle)) { -// if ($wasblank) { -// # if no parameters were passed, $text -// # becomes something like "File:Foo.png", -// # which we don't want to pass on to the -// # image generator -// $text = ''; -// } else { -// # recursively parse links inside the image caption -// # actually, this will parse them in any other parameters, too, -// # but it might be hard to fix that, and it doesn't matter ATM -// $text = this.replaceExternalLinks($text); -// $holders->merge(this.replaceInternalLinks2($text)); -// } -// # cloak any absolute URLs inside the image markup, so replaceExternalLinks() won't touch them -// $s .= $prefix . this.armorLinks( -// this.makeImage($nt, $text, $holders)) . $trail; -// continue; -// } -// } elseif ($ns == NS_CATEGORY) { -// $s = rtrim($s . 
"\n"); # T2087 -// -// if ($wasblank) { -// $sortkey = this.getDefaultSort(); -// } else { -// $sortkey = $text; -// } -// $sortkey = Sanitizer::decodeCharReferences($sortkey); -// $sortkey = str_replace("\n", '', $sortkey); -// $sortkey = this.getConverterLanguage()->convertCategoryKey($sortkey); -// this.mOutput->addCategory($nt->getDBkey(), $sortkey); -// -// /** -// * Strip the whitespace Category links produce, see T2087 -// */ -// $s .= trim($prefix . $trail, "\n") == '' ? '' : $prefix . $trail; -// -// continue; -// } -// } -// -// # Self-link checking. For some languages, variants of the title are checked in -// # LinkHolderArray::doVariants() to allow batching the existence checks necessary -// # for linking to a different variant. -// if ($ns != NS_SPECIAL && $nt->equals(this.mTitle) && !$nt->hasFragment()) { -// $s .= $prefix . Linker::makeSelfLinkObj($nt, $text, '', $trail); -// continue; -// } -// -// # NS_MEDIA is a pseudo-namespace for linking directly to a file -// # @todo FIXME: Should do batch file existence checks, see comment below -// if ($ns == NS_MEDIA) { -// # Give extensions a chance to select the file revision for us -// $options = []; -// $descQuery = false; -// Hooks::run('BeforeParserFetchFileAndTitle', -// [ $this, $nt, &$options, &$descQuery ]); -// # Fetch and register the file (file title may be different via hooks) -// list($file, $nt) = this.fetchFileAndTitle($nt, $options); -// # Cloak with NOPARSE to avoid replacement in replaceExternalLinks -// $s .= $prefix . this.armorLinks( -// Linker::makeMediaLinkFile($nt, $file, $text)) . 
$trail; -// continue; -// } -// -// # Some titles, such as valid special pages or files in foreign repos, should -// # be shown as bluelinks even though they're not included in the page table -// # @todo FIXME: isAlwaysKnown() can be expensive for file links; we should really do -// # batch file existence checks for NS_FILE and NS_MEDIA -// if ($iw == '' && $nt->isAlwaysKnown()) { -// this.mOutput->addLink($nt); -// $s .= this.makeKnownLinkHolder($nt, $text, $trail, $prefix); -// } else { -// # Links will be added to the output link list after checking -// $s .= $holders->makeHolder($nt, $text, [], $trail, $prefix); -// } -// } -// return $holders; -// } -// + + // XO.MW:MOVED + // public function replaceInternalLinks($s) {} + + // XO.MW:MOVED +// public function replaceInternalLinks2(&$s) {} + // /** // * Render a forced-blue link inline; protect against double expansion of // * URLs if we're in a mode that prepends full URL prefixes to @gplx.Internal protected links. @@ -2404,21 +1648,69 @@ public class XomwParser { // return this.armorLinks($link) . $trail; // } // -// /** -// * Insert a NOPARSE hacky thing into any inline links in a chunk that's -// * going to go through further parsing steps before inline URL expansion. -// * -// * Not needed quite as much as it used to be since free links are a bit -// * more sensible these days. But bracketed links are still an issue. -// * -// * @param String $text More-or-less HTML -// * @return String Less-or-more HTML with NOPARSE bits -// */ -// public function armorLinks($text) { -// return preg_replace('/\b((?i)' . this.mUrlProtocols . ')/', -// self::MARKER_PREFIX . "NOPARSE$1", $text); -// } -// + /** + * Insert a NOPARSE hacky thing into any inline links in a chunk that's + * going to go through further parsing steps before inline URL expansion. + * + * Not needed quite as much as it used to be since free links are a bit + * more sensible these days. But bracketed links are still an issue. 
+ * + * @param String $text More-or-less HTML + * @return String Less-or-more HTML with NOPARSE bits + */ + public byte[] armorLinks(Bry_bfr trg, byte[] src, int src_bgn, int src_end) { + // XO.MW.PORTED + // return preg_replace('/\b((?i)' . this.mUrlProtocols . ')/', + // self::MARKER_PREFIX . "NOPARSE$1", $text); + int cur = src_bgn; + int prv = cur; + boolean dirty = false; + boolean called_by_bry = trg == null; + while (true) { + // exit if EOS + if (cur == src_end) { + // if dirty, add rest of String + if (dirty) + trg.Add_mid(src, prv, src_end); + break; + } + + // check if cur matches protocol + Object protocol_obj = protocols_trie.Match_at(trv, src, cur, src_end); + // no match; continue + if (protocol_obj == null) { + cur++; + } + // match; add to bfr + else { + dirty = true; + byte[] protocol_bry = (byte[])protocol_obj; + if (called_by_bry) trg = Bry_bfr_.New(); + trg.Add_bry_many(XomwStripState.Bry__marker__bgn, Bry__noparse, protocol_bry); + cur += protocol_bry.length; + prv = cur; + } + } + if (called_by_bry) { + if (dirty) + return trg.To_bry_and_clear(); + else { + if (src_bgn == 0 && src_end == src.length) + return src; + else + return Bry_.Mid(src, src_bgn, src_end); + } + } + else { + if (dirty) + return null; + else { + trg.Add_mid(src, src_bgn, src_end); + return null; + } + } + } + // /** // * Return true if subpage links should be expanded on this page. 
// * @return boolean @@ -3925,64 +3217,10 @@ public class XomwParser { // this.mExpensiveFunctionCount++; // return this.mExpensiveFunctionCount <= this.mOptions->getExpensiveParserFunctionLimit(); // } -// -// /** -// * Strip double-underscore items like __NOGALLERY__ and __NOTOC__ -// * Fills this.mDoubleUnderscores, returns the modified text -// * -// * @param String $text -// * -// * @return String -// */ -// public function doDoubleUnderscore($text) { -// -// # The position of __TOC__ needs to be recorded -// $mw = MagicWord::get('toc'); -// if ($mw->match($text)) { -// this.mShowToc = true; -// this.mForceTocPosition = true; -// -// # Set a placeholder. At the end we'll fill it in with the TOC. -// $text = $mw->replace('', $text, 1); -// -// # Only keep the first one. -// $text = $mw->replace('', $text); -// } -// -// # Now match and remove the rest of them -// $mwa = MagicWord::getDoubleUnderscoreArray(); -// this.mDoubleUnderscores = $mwa->matchAndRemove($text); -// -// if (isset(this.mDoubleUnderscores['nogallery'])) { -// this.mOutput->mNoGallery = true; -// } -// if (isset(this.mDoubleUnderscores['notoc']) && !this.mForceTocPosition) { -// this.mShowToc = false; -// } -// if (isset(this.mDoubleUnderscores['hiddencat']) -// && this.mTitle->getNamespace() == NS_CATEGORY -// ) { -// this.addTrackingCategory('hidden-category-category'); -// } -// # (T10068) Allow control over whether robots index a page. 
-// # __INDEX__ always overrides __NOINDEX__, see T16899 -// if (isset(this.mDoubleUnderscores['noindex']) && this.mTitle->canUseNoindex()) { -// this.mOutput->setIndexPolicy('noindex'); -// this.addTrackingCategory('noindex-category'); -// } -// if (isset(this.mDoubleUnderscores['index']) && this.mTitle->canUseNoindex()) { -// this.mOutput->setIndexPolicy('index'); -// this.addTrackingCategory('index-category'); -// } -// -// # Cache all double underscores in the database -// foreach (this.mDoubleUnderscores as $key => $val) { -// this.mOutput->setProperty($key, ''); -// } -// -// return $text; -// } -// + + // XO.MW:MOVED + // public void doDoubleUnderscore($text) {} + // /** // * @see ParserOutput::addTrackingCategory() // * @param String $msg Message key @@ -4877,18 +4115,25 @@ public class XomwParser { // // return $old; // } -// -// /** -// * Replace "" link placeholders with actual links, in the buffer -// * Placeholders created in Linker::link() -// * -// * @param String $text -// * @param int $options -// */ -// public function replaceLinkHolders(&$text, $options = 0) { -// this.mLinkHolders->replace($text); -// } -// + + /** + * Replace "" link placeholders with actual links, in the buffer + * Placeholders created in Linker::link() + * + * @param String $text + * @param int $options + */ + public void replaceLinkHolders(Xomw_parser_bfr pbfr) { + // this.mLinkHolders.replace(text); + this.mLinkHolders.replace(pbfr); + } + private final Xomw_parser_bfr tmp_pbfr = new Xomw_parser_bfr(); + public byte[] replaceLinkHolders(byte[] text) { + // this.mLinkHolders.replace(text); + this.mLinkHolders.replace(tmp_pbfr.Init(text)); + return tmp_pbfr.Trg().To_bry_and_clear(); + } + // /** // * Replace "" link placeholders with plain text of links // * (not HTML-formatted). 
@@ -5339,22 +4584,32 @@ public class XomwParser { // // return $ret; // } -// -// /** -// * @param String $caption -// * @param LinkHolderArray|boolean $holders -// * @return mixed|String -// */ + + /** + * @param String $caption + * @param LinkHolderArray|boolean $holders + * @return mixed|String + */ + public byte[] stripAltText(byte[] caption, XomwLinkHolderArray holders) { + // Strip bad stuff out of the title (tooltip). We can't just use + // replaceLinkHoldersText() here, because if this function is called + // from replaceInternalLinks2(), mLinkHolders won't be up-to-date. + byte[] tooltip; + if (holders != null) { + tooltip = holders.replace(tmp_pbfr, caption); + } else { + tooltip = this.replaceLinkHolders(caption); + } + + // make sure there are no placeholders in thumbnail attributes + // that are later expanded to html- so expand them now and + // remove the tags + tooltip = this.mStripState.unstripBoth(tooltip); +// tooltip = Sanitizer::stripAllTags( tooltip ); + + return tooltip; + } // protected function stripAltText($caption, $holders) { -// # Strip bad stuff out of the title (tooltip). We can't just use -// # replaceLinkHoldersText() here, because if this function is called -// # from replaceInternalLinks2(), mLinkHolders won't be up-to-date. 
-// if ($holders) { -// $tooltip = $holders->replaceText($caption); -// } else { -// $tooltip = this.replaceLinkHoldersText($caption); -// } -// // # make sure there are no placeholders in thumbnail attributes // # that are later expanded to html- so expand them now and // # remove the tags @@ -6079,4 +5334,7 @@ public class XomwParser { // OutputPage::setupOOUI(); // this.mOutput->setEnableOOUI(true); // } + private static final byte[] // Bry__strip_state_item = Bry_.new_a7("-item-") + Bry__noparse = Bry_.new_a7("NOPARSE"); + private static final byte[] Bry__marker__noparse = Bry_.Add(XomwStripState.Bry__marker__bgn, Bry__noparse); } diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/XomwParserIface.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/XomwParserIface.java new file mode 100644 index 000000000..48c4a6293 --- /dev/null +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/XomwParserIface.java @@ -0,0 +1,27 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012-2017 gnosygnu@gmail.com + +XOWA is licensed under the terms of the General Public License (GPL) Version 3, +or alternatively under the terms of the Apache License Version 2.0. + +You may use XOWA according to either of these licenses as is most appropriate +for your project on a case-by-case basis. 
+ +The terms of each license can be found in the source code repository: + +GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt +Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt +*/ +package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; +import gplx.xowa.mediawiki.includes.htmls.*; +import gplx.xowa.mediawiki.includes.linkers.*; +public interface XomwParserIface { + int nextLinkID(); + XomwParserOptions getOptions(); + Xomw_link_renderer getLinkRenderer(); + + byte[] armorLinks(Bry_bfr trg, byte[] src, int src_bgn, int src_end); + Xomw_atr_mgr getExternalLinkAttribs(Xomw_atr_mgr atrs); + byte[] stripAltText(byte[] caption, XomwLinkHolderArray holders); +} diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/Xomw_parser_options.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/XomwParserOptions.java similarity index 96% rename from gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/Xomw_parser_options.java rename to gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/XomwParserOptions.java index d32c5fa05..f0c827d2c 100644 --- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/Xomw_parser_options.java +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/XomwParserOptions.java @@ -14,8 +14,8 @@ GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt */ package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; -public class Xomw_parser_options { - public Xomw_parser_options() { +public class XomwParserOptions { + public XomwParserOptions() { this.mThumbSize = 220; } // /** diff --git 
a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/Xomw_parser__tst.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/XomwParserTest.java similarity index 77% rename from gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/Xomw_parser__tst.java rename to gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/XomwParserTest.java index 2262ad10d..7cd87f739 100644 --- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/Xomw_parser__tst.java +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/XomwParserTest.java @@ -15,8 +15,8 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt */ package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import org.junit.*; -public class Xomw_parser__tst { - private final Xomw_parser__fxt fxt = new Xomw_parser__fxt(); +public class XomwParserTest { + private final XomwParserFxt fxt = new XomwParserFxt(); @Test public void Basic() { fxt.Test__parse(String_.Concat_lines_nl_skip_last ( "== heading_1 ==" @@ -57,19 +57,21 @@ public class Xomw_parser__tst { )); } } -class Xomw_parser__fxt { - private final Xomw_parser mgr = new Xomw_parser(); +class XomwParserFxt { + private final XomwParser parser = new XomwParser(); + private final Xomw_parser_ctx pctx = new Xomw_parser_ctx(); private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr(); - public Xomw_parser__fxt() { + public XomwParserFxt() { Xoae_app app = Xoa_app_fxt.Make__app__edit(); Xowe_wiki wiki = Xoa_app_fxt.Make__wiki__edit(app); - mgr.Init_by_wiki(wiki); - mgr.Init_by_page(XomwTitle.newFromText(Bry_.new_a7("Page_1"))); + parser.Init_by_wiki(wiki); + parser.Init_by_page(XomwTitle.newFromText(Bry_.new_a7("Page_1"))); + pctx.Init_by_page(XomwTitle.newFromText(Bry_.new_a7("Page_1"))); } public void Test__parse(String src_str, String expd) { byte[] src_bry = Bry_.new_u8(src_str); - 
mgr.Internal_parse(pbfr, src_bry); - mgr.Internal_parse_half_parsed(pbfr, true, true); + parser.internalParse(pbfr, pctx, src_bry); + parser.internalParseHalfParsed(pbfr, pctx, true, true); Tfds.Eq_str_lines(expd, pbfr.Rslt().To_str_and_clear(), src_str); } } diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/XomwStripState.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/XomwStripState.java new file mode 100644 index 000000000..7ceecb349 --- /dev/null +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/XomwStripState.java @@ -0,0 +1,346 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012-2017 gnosygnu@gmail.com + +XOWA is licensed under the terms of the General Public License (GPL) Version 3, +or alternatively under the terms of the Apache License Version 2.0. + +You may use XOWA according to either of these licenses as is most appropriate +for your project on a case-by-case basis. + +The terms of each license can be found in the source code repository: + +GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt +Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt +*/ +package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; +import gplx.core.btries.*; +public class XomwStripState { +// protected $prefix; +// protected $data; +// protected $regex; +// +// protected $tempType, $tempMergePrefix; +// protected $circularRefGuard; +// protected $recursionLevel = 0; +// +// static final UNSTRIP_RECURSION_LIMIT = 20; + + private final Btrie_slim_mgr trie = Btrie_slim_mgr.cs(); + private final Btrie_rv trv = new Btrie_rv(); + private final Bry_bfr tmp_1 = Bry_bfr_.New(); + private final Bry_bfr tmp_2 = Bry_bfr_.New(); + private boolean tmp_2_used = false; + private int generalLen, nowikiLen; + +// /** +// * @param String|null $prefix +// * @since 1.26 The 
prefix argument should be omitted, as the strip marker +// * prefix String is now a constant. +// */ +// public function __construct($prefix = null) { +// if ($prefix !== null) { +// wfDeprecated(__METHOD__ . ' with called with $prefix argument' . +// ' (call with no arguments instead)', '1.26'); +// } +// this.data = [ +// 'nowiki' => [], +// 'general' => [] +// ]; +// this.regex = '/' . Parser::MARKER_PREFIX . "([^\x7f<>&'\"]+)" . Parser::MARKER_SUFFIX . '/'; +// this.circularRefGuard = []; +// } +// public void Clear() { +// trie.Clear(); +// generalLen = nowikiLen = 0; +// tmp_2_used = false; +// } + + /** + * Add a nowiki strip item + * @param String $marker + * @param String $value + */ + public void addNoWiki(byte[] marker, byte[] val) { + this.addItem(TYPE_NOWIKI, marker, val); + } + + /** + * @param String $marker + * @param String $value + */ + public void addGeneral(byte[] marker, byte[] val) { + this.addItem(TYPE_GENERAL, marker, val); + } + + /** + * @throws MWException + * @param String $type + * @param String $marker + * @param String $value + */ + public void addItem(byte type, byte[] marker, byte[] val) { + // if (!preg_match(this.regex, $marker, $m)) { + // throw new MWException("Invalid marker: $marker"); + // } + + // XO.MW:ported + // this.data[$type][$m[1]] = $value; + trie.Add_obj(marker, new XomwStripItem(type, marker, val)); + if (type == TYPE_GENERAL) + generalLen++; + else + nowikiLen++; + } + + /** + * @param String $text + * @return mixed + */ + public byte[] unstripGeneral(byte[] text) { + return this.unstripType(TYPE_GENERAL, text); + } + + /** + * @param String $text + * @return mixed + */ + public byte[] unstripNoWiki(byte[] text) { + return this.unstripType(TYPE_NOWIKI, text); + } + + /** + * @param String $text + * @return mixed + */ + public byte[] unstripBoth(byte[] text) { + // $text = this.unstripType('general', $text); + // $text = this.unstripType('nowiki', $text); + return this.unstripType(TYPE_BOTH, text); + } + + public 
byte[] unstripType(byte tid, byte[] text) { + boolean dirty = unstripType(tid, tmp_1, text, 0, text.length); + return dirty ? tmp_1.To_bry_and_clear() : text; + } + + // XOWA + public void unstripGeneral(Xomw_parser_bfr pbfr) {unstripType(TYPE_GENERAL, pbfr);} + public void unstripNoWiki(Xomw_parser_bfr pbfr) {unstripType(TYPE_NOWIKI , pbfr);} + public void unstripBoth(Xomw_parser_bfr pbfr) {unstripType(TYPE_BOTH , pbfr);} + private boolean unstripType(byte tid, Xomw_parser_bfr pbfr) { + // XO.PBFR + Bry_bfr src_bfr = pbfr.Src(); + byte[] src = src_bfr.Bfr(); + boolean dirty = unstripType(tid, pbfr.Trg(), src, 0, src_bfr.Len()); + if (dirty) + pbfr.Switch(); + return dirty; + } + + /** + * @param String $type + * @param String $text + * @return mixed + */ + private boolean unstripType(byte tid, Bry_bfr trg, byte[] src, int src_bgn, int src_end) { + // // Shortcut + // if (!count(this.data[$type])) { + // return $text; + // } + // exit early if no items for type + if ((tid & TYPE_GENERAL) == TYPE_GENERAL) { + if (generalLen == 0) + return false; + } + else if ((tid & TYPE_NOWIKI) == TYPE_NOWIKI) { + if (nowikiLen == 0) + return false; + } + + // XO.MW:PORTED + // $oldType = this.tempType; + // this.tempType = $type; + // $text = preg_replace_callback(this.regex, [ $this, 'unstripCallback' ], $text); + // this.tempType = $oldType; + // return $text; + int cur = src_bgn; + int prv = cur; + boolean dirty = false; + // loop over each src char + while (true) { + // EOS: exit + if (cur == src_end) { + if (dirty) // add remainder if dirty + trg.Add_mid(src, prv, src_end); + break; + } + + // check if current pos matches strip state + Object o = trie.Match_at(trv, src, cur, src_end); + if (o != null) { // match + XomwStripItem item = (XomwStripItem)o; + byte item_tid = item.Type(); + if ((tid & item_tid) == item_tid) { // check if types match + // get bfr for recursion + Bry_bfr nested_bfr = null; + boolean tmp_2_release = false; + if (tmp_2_used) { + nested_bfr = 
Bry_bfr_.New(); + } + else { + nested_bfr = tmp_2; + tmp_2_used = true; + tmp_2_release = true; + } + + // recurse + byte[] item_val = item.Val(); + if (unstripType(tid, nested_bfr, item_val, 0, item_val.length)) + item_val = nested_bfr.To_bry_and_clear(); + if (tmp_2_release) + tmp_2_used = false; + + // add to trg + trg.Add_mid(src, prv, cur); + trg.Add(item_val); + + // update vars + dirty = true; + cur += item.Key().length; + prv = cur; + continue; + } + } + cur++; + } + return dirty; + } + + // /** + // * @param array $m + // * @return array + // */ + // protected function unstripCallback($m) { + // $marker = $m[1]; + // if (isset(this.data[this.tempType][$marker])) { + // if (isset(this.circularRefGuard[$marker])) { + // return '' + // . wfMessage('parser-unstrip-loop-warning')->inContentLanguage()->text() + // . ''; + // } + // if (this.recursionLevel >= self::UNSTRIP_RECURSION_LIMIT) { + // return '' . + // wfMessage('parser-unstrip-recursion-limit') + // ->numParams(self::UNSTRIP_RECURSION_LIMIT)->inContentLanguage()->text() . + // ''; + // } + // this.circularRefGuard[$marker] = true; + // this.recursionLevel++; + // $value = this.data[this.tempType][$marker]; + // if ($value instanceof Closure) { + // $value = $value(); + // } + // $ret = this.unstripType(this.tempType, $value); + // this.recursionLevel--; + // unset(this.circularRefGuard[$marker]); + // return $ret; + // } else { + // return $m[0]; + // } + // } + +// /** +// * Get a StripState Object which is sufficient to unstrip the given text. +// * It will contain the minimum subset of strip items necessary. 
+// * +// * @param String $text +// * +// * @return StripState +// */ +// public function getSubState($text) { +// $subState = new StripState(); +// $pos = 0; +// while (true) { +// $startPos = strpos($text, Parser::MARKER_PREFIX, $pos); +// $endPos = strpos($text, Parser::MARKER_SUFFIX, $pos); +// if ($startPos === false || $endPos === false) { +// break; +// } +// +// $endPos += strlen(Parser::MARKER_SUFFIX); +// $marker = substr($text, $startPos, $endPos - $startPos); +// if (!preg_match(this.regex, $marker, $m)) { +// continue; +// } +// +// $key = $m[1]; +// if (isset(this.data['nowiki'][$key])) { +// $subState->data['nowiki'][$key] = this.data['nowiki'][$key]; +// } elseif (isset(this.data['general'][$key])) { +// $subState->data['general'][$key] = this.data['general'][$key]; +// } +// $pos = $endPos; +// } +// return $subState; +// } +// +// /** +// * Merge another StripState Object into this one. The strip marker keys +// * will not be preserved. The strings in the $texts array will have their +// * strip markers rewritten, the resulting array of strings will be returned. +// * +// * @param StripState $otherState +// * @param array $texts +// * @return array +// */ +// public function merge($otherState, $texts) { +// $mergePrefix = wfRandomString(16); +// +// foreach ($otherState->data as $type => $items) { +// foreach ($items as $key => $value) { +// this.data[$type]["$mergePrefix-$key"] = $value; +// } +// } +// +// this.tempMergePrefix = $mergePrefix; +// $texts = preg_replace_callback($otherState->regex, [ $this, 'mergeCallback' ], $texts); +// this.tempMergePrefix = null; +// return $texts; +// } +// +// /** +// * @param array $m +// * @return String +// */ +// protected function mergeCallback($m) { +// $key = $m[1]; +// return Parser::MARKER_PREFIX . this.tempMergePrefix . '-' . $key . Parser::MARKER_SUFFIX; +// } +// +// /** +// * Remove any strip markers found in the given text. 
+// * +// * @param String $text Input String +// * @return String +// */ +// public function killMarkers($text) { +// return preg_replace(this.regex, '', $text); +// } + public static final String Str__marker_bgn = "\u007f'\"`UNIQ-"; + public static final byte[] + Bry__marker__bgn = Bry_.new_a7(Str__marker_bgn) + , Bry__marker__end = Bry_.new_a7("-QINU`\"'\u007f") + ; + public static final byte TYPE_GENERAL = 1, TYPE_NOWIKI = 2, TYPE_BOTH = 3; +} +class XomwStripItem { + public XomwStripItem(byte tid, byte[] key, byte[] val) { + this.tid = tid; + this.key = key; + this.val = val; + } + public byte Type() {return tid;} private final byte tid; + public byte[] Key() {return key;} private final byte[] key; + public byte[] Val() {return val;} private final byte[] val; +} diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/Xomw_strip_state__tst.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/XomwStripStateTest.java similarity index 50% rename from gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/Xomw_strip_state__tst.java rename to gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/XomwStripStateTest.java index 783a3514c..81bd57ccc 100644 --- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/Xomw_strip_state__tst.java +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/XomwStripStateTest.java @@ -15,28 +15,28 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt */ package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import org.junit.*; import gplx.core.tests.*; -public class Xomw_strip_state__tst { - private final Xomw_strip_state__fxt fxt = new Xomw_strip_state__fxt(); +public class XomwStripStateTest { + private final XomwStripStateFxt fxt = new XomwStripStateFxt(); @Test public void Basic() { - fxt.Init__add (Xomw_strip_state.Tid__general, 
"\u007f'\"`UNIQ-key-1-QINU`\"'\u007f", "val-1"); - fxt.Test__nostrip(Xomw_strip_state.Tid__nowiki , "a \u007f'\"`UNIQ-key-1-QINU`\"'\u007f b"); - fxt.Test__unstrip(Xomw_strip_state.Tid__general, "a \u007f'\"`UNIQ-key-1-QINU`\"'\u007f b", "a val-1 b"); - fxt.Test__unstrip(Xomw_strip_state.Tid__both , "a \u007f'\"`UNIQ-key-1-QINU`\"'\u007f b", "a val-1 b"); + fxt.Init__add (XomwStripState.TYPE_GENERAL, "\u007f'\"`UNIQ-key-1-QINU`\"'\u007f", "val-1"); + fxt.Test__nostrip(XomwStripState.TYPE_NOWIKI , "a \u007f'\"`UNIQ-key-1-QINU`\"'\u007f b"); + fxt.Test__unstrip(XomwStripState.TYPE_GENERAL, "a \u007f'\"`UNIQ-key-1-QINU`\"'\u007f b", "a val-1 b"); + fxt.Test__unstrip(XomwStripState.TYPE_BOTH , "a \u007f'\"`UNIQ-key-1-QINU`\"'\u007f b", "a val-1 b"); } @Test public void Recurse() { - fxt.Init__add (Xomw_strip_state.Tid__general, "\u007f'\"`UNIQ-key-1-QINU`\"'\u007f", "val-1"); - fxt.Init__add (Xomw_strip_state.Tid__general, "\u007f'\"`UNIQ-key-2-QINU`\"'\u007f", "\u007f'\"`UNIQ-key-1-QINU`\"'\u007f"); - fxt.Test__unstrip(Xomw_strip_state.Tid__general, "a \u007f'\"`UNIQ-key-2-QINU`\"'\u007f b", "a val-1 b"); + fxt.Init__add (XomwStripState.TYPE_GENERAL, "\u007f'\"`UNIQ-key-1-QINU`\"'\u007f", "val-1"); + fxt.Init__add (XomwStripState.TYPE_GENERAL, "\u007f'\"`UNIQ-key-2-QINU`\"'\u007f", "\u007f'\"`UNIQ-key-1-QINU`\"'\u007f"); + fxt.Test__unstrip(XomwStripState.TYPE_GENERAL, "a \u007f'\"`UNIQ-key-2-QINU`\"'\u007f b", "a val-1 b"); } } -class Xomw_strip_state__fxt { - private final Xomw_strip_state strip_state = new Xomw_strip_state(); +class XomwStripStateFxt { + private final XomwStripState stripState = new XomwStripState(); public void Init__add(byte tid, String marker, String val) { - strip_state.Add_item(tid, Bry_.new_u8(marker), Bry_.new_u8(val)); + stripState.addItem(tid, Bry_.new_u8(marker), Bry_.new_u8(val)); } public void Test__nostrip(byte tid, String src) {Test__unstrip(tid, src, src);} public void Test__unstrip(byte tid, String src, String expd) { - byte[] actl 
= strip_state.Unstrip(tid, Bry_.new_u8(src)); + byte[] actl = stripState.unstripType(tid, Bry_.new_u8(src)); Gftest.Eq__str(expd, String_.new_u8(actl)); } } diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/Xomw_block_level_pass.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/Xomw_block_level_pass.java index 9596b3519..0b2071ca7 100644 --- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/Xomw_block_level_pass.java +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/Xomw_block_level_pass.java @@ -24,7 +24,7 @@ public class Xomw_block_level_pass { private int last_section; private byte[] find_colon_no_links__before, find_colon_no_links__after; - public void Do_block_levels(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr, boolean line_start) { + public void doBlockLevels(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr, boolean line_start) { // XO.PBFR Bry_bfr src_bfr = pbfr.Src(); byte[] src = src_bfr.Bfr(); @@ -41,7 +41,7 @@ public class Xomw_block_level_pass { ("formatHeadings($text, $origText, $isMain); } public void Internal_parse_half_parsed(Xomw_parser_bfr pbfr, boolean is_main, boolean line_start) { - strip_state.Unstrip_general(pbfr); + strip_state.unstripGeneral(pbfr); // MW.HOOK:ParserAfterUnstrip // Clean up special characters, only run once, next-to-last before doBlockLevels - nbsp_wkr.Do_nbsp(pctx, pbfr); + nbsp_wkr.doNbsp(pctx, pbfr); - block_wkr.Do_block_levels(pctx, pbfr, line_start); + block_wkr.doBlockLevels(pctx, pbfr, line_start); lnki_wkr.replaceLinkHolders(pbfr); @@ -192,12 +193,12 @@ public class Xomw_parser { // } // } - strip_state.Unstrip_nowiki(pbfr); + strip_state.unstripNoWiki(pbfr); // MW.HOOK:ParserBeforeTidy // $text = $this->replaceTransparentTags( $text ); - strip_state.Unstrip_general(pbfr); + strip_state.unstripGeneral(pbfr); sanitizer.Normalize_char_references(pbfr); @@ -236,7 +237,11 @@ public class Xomw_parser { // MW.HOOK:ParserAfterTidy } - public byte[] 
Armor_links(Bry_bfr trg, byte[] src, int src_bgn, int src_end) { + public byte[] stripAltText(byte[] caption, XomwLinkHolderArray holders) { + return caption; + } + + public byte[] armorLinks(Bry_bfr trg, byte[] src, int src_bgn, int src_end) { // PORTED:preg_replace( '/\b((?i)' . $this->mUrlProtocols . ')/', self::MARKER_PREFIX . "NOPARSE$1", $text ) int cur = src_bgn; int prv = cur; @@ -262,7 +267,7 @@ public class Xomw_parser { dirty = true; byte[] protocol_bry = (byte[])protocol_obj; if (called_by_bry) trg = Bry_bfr_.New(); - trg.Add_bry_many(Xomw_strip_state.Bry__marker__bgn, Bry__noparse, protocol_bry); + trg.Add_bry_many(XomwStripState.Bry__marker__bgn, Bry__noparse, protocol_bry); cur += protocol_bry.length; prv = cur; } @@ -287,15 +292,15 @@ public class Xomw_parser { } } public byte[] Insert_strip_item(byte[] text) { - tmp.Add_bry_many(Xomw_strip_state.Bry__marker__bgn, Bry__strip_state_item); + tmp.Add_bry_many(XomwStripState.Bry__marker__bgn, Bry__strip_state_item); tmp.Add_int_variable(marker_index); - tmp.Add(Xomw_strip_state.Bry__marker__end); + tmp.Add(XomwStripState.Bry__marker__end); byte[] marker = tmp.To_bry_and_clear(); marker_index++; - strip_state.Add_general(marker, text); + strip_state.addGeneral(marker, text); return marker; } - public Xomw_atr_mgr Get_external_link_attribs(Xomw_atr_mgr atrs) { + public Xomw_atr_mgr getExternalLinkAttribs(Xomw_atr_mgr atrs) { atrs.Clear(); byte[] rel = Get_external_link_rel; @@ -309,7 +314,7 @@ public class Xomw_parser { public byte[] Get_external_link_rel; private static byte[] Atr__rel; private static final byte[] Bry__strip_state_item = Bry_.new_a7("-item-"), Bry__noparse = Bry_.new_a7("NOPARSE"); - private static final byte[] Bry__marker__noparse = Bry_.Add(Xomw_strip_state.Bry__marker__bgn, Bry__noparse); + private static final byte[] Bry__marker__noparse = Bry_.Add(XomwStripState.Bry__marker__bgn, Bry__noparse); public static Btrie_slim_mgr Protocols__dflt() { Btrie_slim_mgr rv = 
Btrie_slim_mgr.ci_a7(); Gfo_protocol_itm[] ary = Gfo_protocol_itm.Ary(); diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/Xomw_strip_state.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/Xomw_strip_state.java deleted file mode 100644 index 0f7bd0266..000000000 --- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/Xomw_strip_state.java +++ /dev/null @@ -1,137 +0,0 @@ -/* -XOWA: the XOWA Offline Wiki Application -Copyright (C) 2012-2017 gnosygnu@gmail.com - -XOWA is licensed under the terms of the General Public License (GPL) Version 3, -or alternatively under the terms of the Apache License Version 2.0. - -You may use XOWA according to either of these licenses as is most appropriate -for your project on a case-by-case basis. - -The terms of each license can be found in the source code repository: - -GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt -Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt -*/ -package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; -import gplx.core.btries.*; -public class Xomw_strip_state { // REF.MW:/parser/StripState.php - private final Btrie_slim_mgr trie = Btrie_slim_mgr.cs(); - private final Btrie_rv trv = new Btrie_rv(); - private final Bry_bfr tmp_1 = Bry_bfr_.New(); - private final Bry_bfr tmp_2 = Bry_bfr_.New(); - private boolean tmp_2_used = false; - private int general_len, nowiki_len; - public void Clear() { - trie.Clear(); - general_len = nowiki_len = 0; - tmp_2_used = false; - } - public void Add_general(byte[] marker, byte[] val) {Add_item(Tid__general, marker, val);} - public void Add_nowiki (byte[] marker, byte[] val) {Add_item(Tid__nowiki, marker, val);} - public void Add_item(byte tid, byte[] marker, byte[] val) { - trie.Add_obj(marker, new Xomw_strip_item(tid, marker, val)); - if (tid == Tid__general) - 
general_len++; - else - nowiki_len++; - } - public byte[] Unstrip_general(byte[] text) {return Unstrip(Tid__general, text);} - public byte[] Unstrip_nowiki (byte[] text) {return Unstrip(Tid__nowiki , text);} - public byte[] Unstrip_both (byte[] text) {return Unstrip(Tid__both , text);} - public byte[] Unstrip(byte tid, byte[] text) { - boolean dirty = Unstrip(tid, tmp_1, text, 0, text.length); - return dirty ? tmp_1.To_bry_and_clear() : text; - } - public void Unstrip_general(Xomw_parser_bfr pbfr) {Unstrip(Tid__general, pbfr);} - public void Unstrip_nowiki (Xomw_parser_bfr pbfr) {Unstrip(Tid__nowiki , pbfr);} - public void Unstrip_both (Xomw_parser_bfr pbfr) {Unstrip(Tid__both , pbfr);} - private boolean Unstrip(byte tid, Xomw_parser_bfr pbfr) { - // XO.PBFR - Bry_bfr src_bfr = pbfr.Src(); - byte[] src = src_bfr.Bfr(); - boolean dirty = Unstrip(tid, pbfr.Trg(), src, 0, src_bfr.Len()); - if (dirty) - pbfr.Switch(); - return dirty; - } - private boolean Unstrip(byte tid, Bry_bfr trg, byte[] src, int src_bgn, int src_end) { - // exit early if no items for type - if ((tid & Tid__general) == Tid__general) { - if (general_len == 0) - return false; - } - else if ((tid & Tid__nowiki) == Tid__nowiki) { - if (nowiki_len == 0) - return false; - } - - int cur = src_bgn; - int prv = cur; - boolean dirty = false; - // loop over each src char - while (true) { - // EOS: exit - if (cur == src_end) { - if (dirty) // add remainder if dirty - trg.Add_mid(src, prv, src_end); - break; - } - - // check if current pos matches strip state - Object o = trie.Match_at(trv, src, cur, src_end); - if (o != null) { // match - Xomw_strip_item item = (Xomw_strip_item)o; - byte item_tid = item.Tid(); - if ((tid & item_tid) == item_tid) { // check if types match - // get bfr for recursion - Bry_bfr nested_bfr = null; - boolean tmp_2_release = false; - if (tmp_2_used) { - nested_bfr = Bry_bfr_.New(); - } - else { - nested_bfr = tmp_2; - tmp_2_used = true; - tmp_2_release = true; - } - - // recurse - 
byte[] item_val = item.Val(); - if (Unstrip(tid, nested_bfr, item_val, 0, item_val.length)) - item_val = nested_bfr.To_bry_and_clear(); - if (tmp_2_release) - tmp_2_used = false; - - // add to trg - trg.Add_mid(src, prv, cur); - trg.Add(item_val); - - // update vars - dirty = true; - cur += item.Key().length; - prv = cur; - continue; - } - } - cur++; - } - return dirty; - } - public static final String Str__marker_bgn = "\u007f'\"`UNIQ-"; - public static final byte[] - Bry__marker__bgn = Bry_.new_a7(Str__marker_bgn) - , Bry__marker__end = Bry_.new_a7("-QINU`\"'\u007f") - ; - public static final byte Tid__general = 1, Tid__nowiki = 2, Tid__both = 3; -} -class Xomw_strip_item { - public Xomw_strip_item(byte tid, byte[] key, byte[] val) { - this.tid = tid; - this.key = key; - this.val = val; - } - public byte Tid() {return tid;} private final byte tid; - public byte[] Key() {return key;} private final byte[] key; - public byte[] Val() {return val;} private final byte[] val; -} diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/doubleunders/Xomw_doubleunder_wkr.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/doubleunders/Xomw_doubleunder_wkr.java index d5e404a1b..83d8b4643 100644 --- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/doubleunders/Xomw_doubleunder_wkr.java +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/doubleunders/Xomw_doubleunder_wkr.java @@ -37,7 +37,7 @@ public class Xomw_doubleunder_wkr { , Xol_kwd_grp_.Id_nocontentconvert ); } - public void Do_double_underscore(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) { + public void doDoubleUnderscore(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) { // XO.PBFR Bry_bfr src_bfr = pbfr.Src(); byte[] src = src_bfr.Bfr(); @@ -123,6 +123,62 @@ public class Xomw_doubleunder_wkr { if (dirty) pbfr.Switch(); } +// /** +// * Strip double-underscore items like __NOGALLERY__ and __NOTOC__ +// * Fills this.mDoubleUnderscores, returns the modified text 
+// * +// * @param String $text +// * +// * @return String +// */ +// public function doDoubleUnderscore($text) { +// +// # The position of __TOC__ needs to be recorded +// $mw = MagicWord::get('toc'); +// if ($mw->match($text)) { +// this.mShowToc = true; +// this.mForceTocPosition = true; +// +// # Set a placeholder. At the end we'll fill it in with the TOC. +// $text = $mw->replace('', $text, 1); +// +// # Only keep the first one. +// $text = $mw->replace('', $text); +// } +// +// # Now match and remove the rest of them +// $mwa = MagicWord::getDoubleUnderscoreArray(); +// this.mDoubleUnderscores = $mwa->matchAndRemove($text); +// +// if (isset(this.mDoubleUnderscores['nogallery'])) { +// this.mOutput->mNoGallery = true; +// } +// if (isset(this.mDoubleUnderscores['notoc']) && !this.mForceTocPosition) { +// this.mShowToc = false; +// } +// if (isset(this.mDoubleUnderscores['hiddencat']) +// && this.mTitle->getNamespace() == NS_CATEGORY +// ) { +// this.addTrackingCategory('hidden-category-category'); +// } +// # (T10068) Allow control over whether robots index a page. +// # __INDEX__ always overrides __NOINDEX__, see T16899 +// if (isset(this.mDoubleUnderscores['noindex']) && this.mTitle->canUseNoindex()) { +// this.mOutput->setIndexPolicy('noindex'); +// this.addTrackingCategory('noindex-category'); +// } +// if (isset(this.mDoubleUnderscores['index']) && this.mTitle->canUseNoindex()) { +// this.mOutput->setIndexPolicy('index'); +// this.addTrackingCategory('index-category'); +// } +// +// # Cache all double underscores in the database +// foreach (this.mDoubleUnderscores as $key => $val) { +// this.mOutput->setProperty($key, ''); +// } +// +// return $text; +// } private static void Reg(Btrie_slim_mgr trie, Xol_kwd_mgr mgr, int... 
ids) { for (int id : ids) { Xol_kwd_grp grp = mgr.Get_or_new(id); diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/doubleunders/Xomw_doubleunder_wkr__tst.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/doubleunders/Xomw_doubleunder_wkr__tst.java index 33d2f5c57..24a35bd3b 100644 --- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/doubleunders/Xomw_doubleunder_wkr__tst.java +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/doubleunders/Xomw_doubleunder_wkr__tst.java @@ -36,7 +36,7 @@ class Xomw_doubleunder_wkr__fxt { } public Xomw_doubleunder_wkr__fxt Test__parse(String src_str, String expd) { byte[] src_bry = Bry_.new_u8(src_str); - wkr.Do_double_underscore(pctx, pbfr.Init(src_bry)); + wkr.doDoubleUnderscore(pctx, pbfr.Init(src_bry)); Gftest.Eq__str(expd, pbfr.Rslt().To_str_and_clear(), src_str); return this; } diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/headings/Xomw_heading_wkr.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/headings/Xomw_heading_wkr.java index a0f13122f..7722cf619 100644 --- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/headings/Xomw_heading_wkr.java +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/headings/Xomw_heading_wkr.java @@ -28,7 +28,7 @@ public class Xomw_heading_wkr { public int Hdr_lhs_end() {return hdr_lhs_end;} private int hdr_lhs_end; public int Hdr_rhs_bgn() {return hdr_rhs_bgn;} private int hdr_rhs_bgn; public int Hdr_rhs_end() {return hdr_rhs_end;} private int hdr_rhs_end; - public void Do_headings(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr, Xomw_heading_cbk__html cbk) { + public void doHeadings(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr, Xomw_heading_cbk__html cbk) { Bry_bfr src_bfr = pbfr.Src(); byte[] src_bry = src_bfr.Bfr(); int src_end = src_bfr.Len(); @@ -103,4 +103,20 @@ public class Xomw_heading_wkr { cbk.On_hdr_seen(pctx, this); return nl_rhs; } +// /** +// * 
Parse headers and return html +// * +// * @private +// * +// * @param String $text +// * +// * @return String +// */ +// public function doHeadings($text) { +// for ($i = 6; $i >= 1; --$i) { +// $h = str_repeat('=', $i); +// $text = preg_replace("/^$h(.+)$h\\s*$/m", "\\1", $text); +// } +// return $text; +// } } diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/hrs/Xomw_hr_wkr.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/hrs/Xomw_hr_wkr.java index 083e63182..1cf302df3 100644 --- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/hrs/Xomw_hr_wkr.java +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/hrs/Xomw_hr_wkr.java @@ -17,7 +17,7 @@ package gplx.xowa.mediawiki.includes.parsers.hrs; import gplx.*; import gplx.xow import gplx.xowa.mediawiki.includes.utls.*; public class Xomw_hr_wkr {// THREAD.UNSAFE: caching for repeated calls private Bry_bfr bfr; - public void Replace_hrs(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) { // REF.MW: text = preg_replace('/(^|\n)-----*/', '\\1
', text); + public void replaceHrs(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) { // REF.MW: text = preg_replace('/(^|\n)-----*/', '\\1
', text); // XO.PBFR Bry_bfr src_bfr = pbfr.Src(); byte[] src = src_bfr.Bfr(); diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/hrs/Xomw_hr_wkr__tst.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/hrs/Xomw_hr_wkr__tst.java index 3df117729..643672be8 100644 --- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/hrs/Xomw_hr_wkr__tst.java +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/hrs/Xomw_hr_wkr__tst.java @@ -28,7 +28,7 @@ class Xomw_hr_wkr__fxt { private final Xomw_hr_wkr wkr = new Xomw_hr_wkr(); public void Test__parse(String src_str, String expd) { byte[] src_bry = Bry_.new_u8(src_str); - wkr.Replace_hrs(new Xomw_parser_ctx(), pbfr.Init(src_bry)); + wkr.replaceHrs(new Xomw_parser_ctx(), pbfr.Init(src_bry)); Tfds.Eq_str_lines(expd, pbfr.Rslt().To_str_and_clear(), src_str); } } diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/lnkes/Xomw_lnke_wkr.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/lnkes/Xomw_lnke_wkr.java index 6d859a617..cb8bd0cd8 100644 --- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/lnkes/Xomw_lnke_wkr.java +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/lnkes/Xomw_lnke_wkr.java @@ -25,17 +25,17 @@ public class Xomw_lnke_wkr {// THREAD.UNSAFE: caching for repeated calls private final Bry_bfr tmp; private Btrie_slim_mgr protocol_trie; private final Btrie_rv trv = new Btrie_rv(); private int autonumber; - private final Xomw_parser parser; + private final XomwParserIface parser; private final XomwLinker linker; private final XomwSanitizer sanitizer; private final Xomw_atr_mgr attribs = new Xomw_atr_mgr(); private Xomw_regex_url regex_url; private Xomw_regex_space regex_space; - public Xomw_lnke_wkr(Xomw_parser parser) { + public Xomw_lnke_wkr(XomwParserIface parser, Bry_bfr tmp, XomwLinker linker, XomwSanitizer sanitizer) { this.parser = parser; - this.tmp = parser.Tmp(); - this.linker = 
parser.Linker(); - this.sanitizer = parser.Sanitizer(); + this.tmp = tmp; + this.linker = linker; + this.sanitizer = sanitizer; if (angle_entities_trie == null) { synchronized (Type_adp_.ClassOf_obj(this)) { @@ -63,7 +63,7 @@ public class Xomw_lnke_wkr {// THREAD.UNSAFE: caching for repeated calls this.regex_space = regex_space; } // XO.MW:SYNC:1.29; DATE:2017-02-01 - public void Replace_external_links(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) { + public void replaceExternalLinks(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) { // XO.PBFR Bry_bfr src_bfr = pbfr.Src(); byte[] src = src_bfr.Bfr(); @@ -211,7 +211,7 @@ public class Xomw_lnke_wkr {// THREAD.UNSAFE: caching for repeated calls // This means that users can paste URLs directly into the text // Funny characters like � aren't valid in URLs anyway // This was changed in August 2004 - linker.makeExternalLink(bfr, url, Bry_.Mid(src, text_bgn, text_end), Bool_.N, link_type, parser.Get_external_link_attribs(attribs), Bry_.Empty); + linker.makeExternalLink(bfr, url, Bry_.Mid(src, text_bgn, text_end), Bool_.N, link_type, parser.getExternalLinkAttribs(attribs), Bry_.Empty); // XO.MW.UNSUPPORTED.HOOK: registers link for processing by other extensions? // Register link in the output Object. 
diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/lnkes/Xomw_lnke_wkr__tst.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/lnkes/Xomw_lnke_wkr__tst.java index 47d27128a..58965af45 100644 --- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/lnkes/Xomw_lnke_wkr__tst.java +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/lnkes/Xomw_lnke_wkr__tst.java @@ -53,16 +53,18 @@ public class Xomw_lnke_wkr__tst { } } class Xomw_lnke_wkr__fxt { - private final Xomw_lnke_wkr wkr = new Xomw_lnke_wkr(new Xomw_parser()); + private final Xomw_lnke_wkr wkr; private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr(); private boolean apos = true; public Xomw_lnke_wkr__fxt() { + Xomw_parser parser = new Xomw_parser(); + this.wkr = new Xomw_lnke_wkr(parser, parser.Tmp(), parser.Linker(), parser.Sanitizer()); Xomw_regex_space regex_space = new Xomw_regex_space(); wkr.Init_by_wiki(Xomw_parser.Protocols__dflt(), new Xomw_regex_url(regex_space), regex_space); } public void Test__parse(String src_str, String expd) { byte[] src_bry = Bry_.new_u8(src_str); - wkr.Replace_external_links(new Xomw_parser_ctx(), pbfr.Init(src_bry)); + wkr.replaceExternalLinks(new Xomw_parser_ctx(), pbfr.Init(src_bry)); if (apos) expd = gplx.langs.htmls.Gfh_utl.Replace_apos(expd); Tfds.Eq_str_lines(expd, pbfr.Rslt().To_str_and_clear(), src_str); } diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/lnkis/Xomw_lnki_wkr.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/lnkis/Xomw_lnki_wkr.java index a7aeb127e..d02ce1463 100644 --- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/lnkis/Xomw_lnki_wkr.java +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/lnkis/Xomw_lnki_wkr.java @@ -30,8 +30,8 @@ import gplx.xowa.parsers.uniqs.*; * P6: [[Media:]] * P4: handle "]]]"; "If we get a ] at the beginning of $m[3]" * P4: handle "[[http://a.org]]" - * P3: $langObj->formatNum( 
++$this->mAutonumber ); - * P2: $this->getConverterLanguage()->markNoConversion( $text ); + * P3: $langObj->formatNum( ++this.mAutonumber ); + * P2: this.getConverterLanguage()->markNoConversion( $text ); * P1: link_prefix; EX: b[[A]]; [not enabled on enwiki] */ public class Xomw_lnki_wkr {// THREAD.UNSAFE: caching for repeated calls @@ -40,29 +40,31 @@ public class Xomw_lnki_wkr {// THREAD.UNSAFE: caching for repeated calls private final Xomw_link_renderer link_renderer; // private final Btrie_slim_mgr protocols_trie; private final Xomw_quote_wkr quote_wkr; - private final Xomw_strip_state strip_state; + private final XomwStripState strip_state; private Xomw_parser_env env; private Xow_wiki wiki; private XomwTitle mPageTitle; // private final XomwLinker_NormalizeSubpageLink normalize_subpage_link = new XomwLinker_NormalizeSubpageLink(); private final Bry_bfr tmp; - private final Xomw_parser parser; + private final XomwParserIface parser; private final Xomw_atr_mgr extra_atrs = new Xomw_atr_mgr(); private final Xomw_qry_mgr query = new Xomw_qry_mgr(); private final Btrie_rv trv = new Btrie_rv(); private final List_adp tmp_list = List_adp_.New(); private final Hash_adp mImageParams = Hash_adp_bry.cs(); private final Hash_adp mImageParamsMagicArray = Hash_adp_bry.cs(); - public Xomw_lnki_wkr(Xomw_parser parser, XomwLinkHolderArray holders, Xomw_link_renderer link_renderer, Btrie_slim_mgr protocols_trie) { + public Xomw_lnki_wkr(XomwParserIface parser, XomwLinkHolderArray holders, Xomw_link_renderer link_renderer, Btrie_slim_mgr protocols_trie + , XomwLinker linker, Xomw_quote_wkr quote_wkr, Bry_bfr tmp, XomwStripState strip_state + ) { this.parser = parser; this.holders = holders; this.link_renderer = link_renderer; // this.protocols_trie = protocols_trie; - this.linker = parser.Linker(); - this.quote_wkr = parser.Quote_wkr(); - this.tmp = parser.Tmp(); - this.strip_state = parser.Strip_state(); + this.linker = linker; + this.quote_wkr = quote_wkr; + this.tmp = tmp; 
+ this.strip_state = strip_state; } public void Init_by_wiki(Xomw_parser_env env, Xow_wiki wiki) { this.env = env; @@ -77,7 +79,7 @@ public class Xomw_lnki_wkr {// THREAD.UNSAFE: caching for repeated calls public void Clear_state() { holders.clear(); } - public void Replace_internal_links(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) { + public void replaceInternalLinks(Xomw_parser_bfr pbfr, Xomw_parser_env env, Xomw_parser_ctx pctx) { // XO.PBFR Bry_bfr src_bfr = pbfr.Src(); byte[] src = src_bfr.Bfr(); @@ -88,10 +90,10 @@ public class Xomw_lnki_wkr {// THREAD.UNSAFE: caching for repeated calls this.mPageTitle = pctx.Page_title(); - Replace_internal_links(pctx, bfr, src, src_bgn, src_end); + replaceInternalLinks(env, pctx, bfr, src, src_bgn, src_end); } // XO.MW:SYNC:1.29; DATE:2017-02-02 - public void Replace_internal_links(Xomw_parser_ctx pctx, Bry_bfr bfr, byte[] src, int src_bgn, int src_end) { + public void replaceInternalLinks(Xomw_parser_env env, Xomw_parser_ctx pctx, Bry_bfr bfr, byte[] src, int src_bgn, int src_end) { // XO.MW: regex for tc move to header; e1 and e1_img moved to code // the % is needed to support urlencoded titles as well @@ -109,7 +111,7 @@ public class Xomw_lnki_wkr {// THREAD.UNSAFE: caching for repeated calls // XO.MW.IGNORE: handles strange split logic of adding space to String; "$s = substr($s, 1);" // TODO.XO:link_prefix; EX: b[[A]] - // $useLinkPrefixExtension = $this->getTargetLanguage()->linkPrefixExtension(); + // $useLinkPrefixExtension = this.getTargetLanguage()->linkPrefixExtension(); // $e2 = null; // if ($useLinkPrefixExtension) { // // Match the end of a line for a word that's not followed by whitespace, @@ -119,9 +121,9 @@ public class Xomw_lnki_wkr {// THREAD.UNSAFE: caching for repeated calls // $e2 = "/^((?>.*[^$charset]|))(.+)$/sDu"; // } - // IGNORE: throw new MWException(__METHOD__ . ": \$this->mTitle is null\n"); + // IGNORE: throw new MWException(__METHOD__ . 
": \this.mTitle is null\n"); - // $nottalk = !$this->mTitle->isTalkPage(); + // $nottalk = !this.mTitle->isTalkPage(); // TODO.XO:link_prefix byte[] prefix = Bry_.Empty; @@ -240,7 +242,7 @@ public class Xomw_lnki_wkr {// THREAD.UNSAFE: caching for repeated calls // Don't allow @gplx.Internal protected links to pages containing // PROTO: where PROTO is a valid URL protocol; these // should be external links. - // if (preg_match('/^(?i:' . $this->mUrlProtocols . ')/', $origLink)) { + // if (preg_match('/^(?i:' . this.mUrlProtocols . ')/', $origLink)) { // $s .= $prefix . '[[' . $line; // continue; // } @@ -267,7 +269,7 @@ public class Xomw_lnki_wkr {// THREAD.UNSAFE: caching for repeated calls // link = orig_link; // } - byte[] unstrip = strip_state.Unstrip_nowiki(link); + byte[] unstrip = strip_state.unstripNoWiki(link); if (!Bry_.Eq(unstrip, link)) nt = XomwTitle.newFromText(unstrip); if (nt == null) { @@ -309,7 +311,7 @@ public class Xomw_lnki_wkr {// THREAD.UNSAFE: caching for repeated calls // we couldn't find the end of this imageLink, so output it raw // but don't ignore what might be perfectly normal links in the text we've examined Bry_bfr nested = wiki.Utl__bfr_mkr().Get_b128(); - this.Replace_internal_links(pctx, nested, text, 0, text.length); + this.replaceInternalLinks(env, pctx, nested, text, 0, text.length); nested.Mkr_rls(); bfr.Add(prefix).Add(Bry__wtxt__lnki__bgn).Add(link).Add_byte_pipe().Add(text); // s .= "{prefix}[[link|text"; // note: no trail, because without an end, there *is* no trail @@ -376,7 +378,7 @@ public class Xomw_lnki_wkr {// THREAD.UNSAFE: caching for repeated calls // cloak any absolute URLs inside the image markup, so replaceExternalLinks() won't touch them bfr.Add(prefix); // Armor_links(Make_image(bfr, nt, text, holders)) - this.makeImage(pctx, bfr, nt, text, holders); + this.makeImage(env, pctx, bfr, nt, text, holders); bfr.Add(trail); continue; } @@ -442,7 +444,7 @@ public class Xomw_lnki_wkr {// THREAD.UNSAFE: caching for 
repeated calls } } } - public void makeImage(Xomw_parser_ctx pctx, Bry_bfr bfr, XomwTitle title, byte[] options_at_link, XomwLinkHolderArray holders) { + public void makeImage(Xomw_parser_env env, Xomw_parser_ctx pctx, Bry_bfr bfr, XomwTitle title, byte[] options_at_link, XomwLinkHolderArray holders) { // Check if the options text is of the form "options|alt text" // Options are: // * thumbnail make a thumbnail with enlarge-icon and caption, alignment depends on lang @@ -478,7 +480,7 @@ public class Xomw_lnki_wkr {// THREAD.UNSAFE: caching for repeated calls // XO.MW.HOOK:BeforeParserFetchFileAndTitle // Fetch and register the file (file title may be different via hooks) -// list($file, $title) = $this->fetchFileAndTitle($title, $options); +// list($file, $title) = this.fetchFileAndTitle($title, $options); XomwFile file = fetchFileAndTitle(title, null); // Get parameter map @@ -489,7 +491,7 @@ public class Xomw_lnki_wkr {// THREAD.UNSAFE: caching for repeated calls Xomw_param_map paramMap = tmp_img_params.paramMap; XomwMagicWordArray mwArray = tmp_img_params.mwArray; - // XO.MW.UNSUPPORTED.TrackingCategory: if (!$file) $this->addTrackingCategory('broken-file-category'); + // XO.MW.UNSUPPORTED.TrackingCategory: if (!$file) this.addTrackingCategory('broken-file-category'); // Process the input parameters byte[] caption = Bry_.Empty; @@ -549,12 +551,12 @@ public class Xomw_lnki_wkr {// THREAD.UNSAFE: caching for repeated calls // manualthumb? downstream behavior seems odd with // missing manual thumbs. 
validated = true; - // $value = $this->stripAltText($value, $holders); + val = parser.stripAltText(val, holders); break; case Xomw_param_itm.Name__link: // $chars = self::EXT_LINK_URL_CLASS; // $addr = self::EXT_LINK_ADDR; -// $prots = $this->mUrlProtocols; +// $prots = this.mUrlProtocols; // if ($value === '') { // $paramName = 'no-link'; // $value = true; @@ -563,9 +565,9 @@ public class Xomw_lnki_wkr {// THREAD.UNSAFE: caching for repeated calls // else if (preg_match("/^((?i)$prots)/", $value)) { // if (preg_match("/^((?i)$prots)$addr$chars*$/u", $value, $m)) { // $paramName = 'link-url'; -// $this->mOutput->addExternalLink($value); -// if ($this->mOptions->getExternalLinkTarget()) { -// $params[$type]['link-target'] = $this->mOptions->getExternalLinkTarget(); +// this.mOutput->addExternalLink($value); +// if (this.mOptions->getExternalLinkTarget()) { +// $params[$type]['link-target'] = this.mOptions->getExternalLinkTarget(); // } validated = true; // } @@ -574,7 +576,7 @@ public class Xomw_lnki_wkr {// THREAD.UNSAFE: caching for repeated calls // if ($linkTitle) { // $paramName = 'link-title'; // $value = $linkTitle; -// $this->mOutput->addLink($linkTitle); +// this.mOutput->addLink($linkTitle); validated = true; // } // } @@ -649,7 +651,7 @@ public class Xomw_lnki_wkr {// THREAD.UNSAFE: caching for repeated calls if (frameParams.alt == null) { // No alt text, use the "caption" for the alt text if (caption != Bry_.Empty) { -// frameParams.alt = $this->stripAltText(caption, $holders); + frameParams.alt = parser.stripAltText(caption, holders); } else { // No caption, fall back to using the filename for the @@ -658,7 +660,7 @@ public class Xomw_lnki_wkr {// THREAD.UNSAFE: caching for repeated calls } } // Use the "caption" for the tooltip text -// frameParams.title = $this->stripAltText(caption, $holders); + frameParams.title = parser.stripAltText(caption, holders); } // MW.HOOK:ParserMakeImageParams @@ -666,33 +668,13 @@ public class Xomw_lnki_wkr {// 
THREAD.UNSAFE: caching for repeated calls // Linker does the rest // byte[] time = options.time; Object time = null; - linker.makeImageLink(bfr, pctx, parser, title, file, frameParams, handlerParams, time, desc_query, parser.Options().getThumbSize()); + linker.makeImageLink(bfr, env, pctx, parser, title, file, frameParams, handlerParams, time, desc_query, parser.getOptions().getThumbSize()); // Give the handler a chance to modify the parser Object // if (handler != null) { // $handler->parserTransformHook($this, $file); // } } -// private byte[] stripAltText(byte[] caption, XomwLinkHolderArray holders) { -// // Strip bad stuff out of the title (tooltip). We can't just use -// // replaceLinkHoldersText() here, because if this function is called -// // from replaceInternalLinks2(), mLinkHolders won't be up-to-date. -// byte[] tooltip; -// if (holders != null) { -// tooltip = holders.replace(caption); -// } else { -// tooltip = this.replace_link_holders(caption); -// } -// -// // make sure there are no placeholders in thumbnail attributes -// // that are later expanded to html- so expand them now and -// // remove the tags -//// $tooltip = $this->mStripState->unstripBoth( $tooltip ); -//// $tooltip = Sanitizer::stripAllTags( $tooltip ); -//// -//// return $tooltip; -// return null; -// } private static Xomw_param_list[] internalParamNames; private static Xomw_param_map internalParamMap; @@ -783,11 +765,11 @@ public class Xomw_lnki_wkr {// THREAD.UNSAFE: caching for repeated calls //$time = $file ? $file->getTimestamp() : false; //$sha1 = $file ? $file->getSha1() : false; //# Register the file as a dependency... 
- //$this->mOutput->addImage( $title->getDBkey(), $time, $sha1 ); + //this.mOutput->addImage( $title->getDBkey(), $time, $sha1 ); //if ( $file && !$title->equals( $file->getTitle() ) ) { // # Update fetched file title // $title = $file->getTitle(); - // $this->mOutput->addImage( $title->getDBkey(), $time, $sha1 ); + // this.mOutput->addImage( $title->getDBkey(), $time, $sha1 ); //} return file; } @@ -835,7 +817,7 @@ public class Xomw_lnki_wkr {// THREAD.UNSAFE: caching for repeated calls link_renderer.Make_known_link(bfr, nt, text, extra_atrs, query); byte[] link = bfr.To_bry_and_clear(); - parser.Armor_links(bfr, link, 0, link.length); + parser.armorLinks(bfr, link, 0, link.length); bfr.Add(trail); } @@ -856,4 +838,326 @@ public class Xomw_lnki_wkr {// THREAD.UNSAFE: caching for repeated calls // title-char -> ([{$tc}]+) // pipe -> \\| // other chars... -> (.*) +// +// /** +// * Process [[ ]] wikilinks +// * +// * @param String $s +// * +// * @return String Processed text +// * +// * @private +// */ +// public function replaceInternalLinks($s) { +// this.mLinkHolders->merge(this.replaceInternalLinks2($s)); +// return $s; +// } +// +// /** +// * Process [[ ]] wikilinks (RIL) +// * @param String $s +// * @throws MWException +// * @return LinkHolderArray +// * +// * @private +// */ +// public function replaceInternalLinks2(&$s) { +// global $wgExtraInterlanguageLinkPrefixes; +// +// static $tc = false, $e1, $e1_img; +// # the % is needed to support urlencoded titles as well +// if (!$tc) { +// $tc = Title::legalChars() . '#%'; +// # Match a link having the form [[namespace:link|alternate]]trail +// $e1 = "/^([{$tc}]+)(?:\\|(.+?))?]](.*)\$/sD"; +// # Match cases where there is no "]]", which might still be images +// $e1_img = "/^([{$tc}]+)\\|(.*)\$/sD"; +// } +// +// $holders = new LinkHolderArray($this); +// +// # split the entire text String on occurrences of [[ +// $a = StringUtils::explode('[[', ' ' . 
$s); +// # get the first element (all text up to first [[), and remove the space we added +// $s = $a->current(); +// $a->next(); +// $line = $a->current(); # Workaround for broken ArrayIterator::next() that returns "void" +// $s = substr($s, 1); +// +// $useLinkPrefixExtension = this.getTargetLanguage()->linkPrefixExtension(); +// $e2 = null; +// if ($useLinkPrefixExtension) { +// # Match the end of a line for a word that's not followed by whitespace, +// # e.g. in the case of 'The Arab al[[Razi]]', 'al' will be matched +// global $wgContLang; +// $charset = $wgContLang->linkPrefixCharset(); +// $e2 = "/^((?>.*[^$charset]|))(.+)$/sDu"; +// } +// +// if (is_null(this.mTitle)) { +// throw new MWException(__METHOD__ . ": \this.mTitle is null\n"); +// } +// $nottalk = !this.mTitle->isTalkPage(); +// +// if ($useLinkPrefixExtension) { +// $m = []; +// if (preg_match($e2, $s, $m)) { +// $first_prefix = $m[2]; +// } else { +// $first_prefix = false; +// } +// } else { +// $prefix = ''; +// } +// +// $useSubpages = this.areSubpagesAllowed(); +// +// // @codingStandardsIgnoreStart Squiz.WhiteSpace.SemicolonSpacing.Incorrect +// # Loop for each link +// for (; $line !== false && $line !== null; $a->next(), $line = $a->current()) { +// // @codingStandardsIgnoreEnd +// +// # Check for excessive memory usage +// if ($holders->isBig()) { +// # Too big +// # Do the existence check, replace the link holders and clear the array +// $holders->replace($s); +// $holders->clear(); +// } +// +// if ($useLinkPrefixExtension) { +// if (preg_match($e2, $s, $m)) { +// $prefix = $m[2]; +// $s = $m[1]; +// } else { +// $prefix = ''; +// } +// # first link +// if ($first_prefix) { +// $prefix = $first_prefix; +// $first_prefix = false; +// } +// } +// +// $might_be_img = false; +// +// if (preg_match($e1, $line, $m)) { # page with normal text or alt +// $text = $m[2]; +// # If we get a ] at the beginning of $m[3] that means we have a link that's something like: +// # 
[[Image:Foo.jpg|[http://example.com desc]]] <- having three ] in a row fucks up, +// # the real problem is with the $e1 regex +// # See T1500. +// # Still some problems for cases where the ] is meant to be outside punctuation, +// # and no image is in sight. See T4095. +// if ($text !== '' +// && substr($m[3], 0, 1) === ']' +// && strpos($text, '[') !== false +// ) { +// $text .= ']'; # so that replaceExternalLinks($text) works later +// $m[3] = substr($m[3], 1); +// } +// # fix up urlencoded title texts +// if (strpos($m[1], '%') !== false) { +// # Should anchors '#' also be rejected? +// $m[1] = str_replace([ '<', '>' ], [ '<', '>' ], rawurldecode($m[1])); +// } +// $trail = $m[3]; +// } elseif (preg_match($e1_img, $line, $m)) { +// # Invalid, but might be an image with a link in its caption +// $might_be_img = true; +// $text = $m[2]; +// if (strpos($m[1], '%') !== false) { +// $m[1] = str_replace([ '<', '>' ], [ '<', '>' ], rawurldecode($m[1])); +// } +// $trail = ""; +// } else { # Invalid form; output directly +// $s .= $prefix . '[[' . $line; +// continue; +// } +// +// $origLink = ltrim($m[1], ' '); +// +// # Don't allow @gplx.Internal protected links to pages containing +// # PROTO: where PROTO is a valid URL protocol; these +// # should be external links. +// if (preg_match('/^(?i:' . this.mUrlProtocols . ')/', $origLink)) { +// $s .= $prefix . '[[' . $line; +// continue; +// } +// +// # Make subpage if necessary +// if ($useSubpages) { +// $link = this.maybeDoSubpageLink($origLink, $text); +// } else { +// $link = $origLink; +// } +// +// $noforce = (substr($origLink, 0, 1) !== ':'); +// if (!$noforce) { +// # Strip off leading ':' +// $link = substr($link, 1); +// } +// +// $unstrip = this.mStripState->unstripNoWiki($link); +// $nt = is_string($unstrip) ? Title::newFromText($unstrip) : null; +// if ($nt === null) { +// $s .= $prefix . '[[' . 
$line; +// continue; +// } +// +// $ns = $nt->getNamespace(); +// $iw = $nt->getInterwiki(); +// +// if ($might_be_img) { # if this is actually an invalid link +// if ($ns == NS_FILE && $noforce) { # but might be an image +// $found = false; +// while (true) { +// # look at the next 'line' to see if we can close it there +// $a->next(); +// $next_line = $a->current(); +// if ($next_line === false || $next_line === null) { +// break; +// } +// $m = explode(']]', $next_line, 3); +// if (count($m) == 3) { +// # the first ]] closes the inner link, the second the image +// $found = true; +// $text .= "[[{$m[0]}]]{$m[1]}"; +// $trail = $m[2]; +// break; +// } elseif (count($m) == 2) { +// # if there's exactly one ]] that's fine, we'll keep looking +// $text .= "[[{$m[0]}]]{$m[1]}"; +// } else { +// # if $next_line is invalid too, we need look no further +// $text .= '[[' . $next_line; +// break; +// } +// } +// if (!$found) { +// # we couldn't find the end of this imageLink, so output it raw +// # but don't ignore what might be perfectly normal links in the text we've examined +// $holders->merge(this.replaceInternalLinks2($text)); +// $s .= "{$prefix}[[$link|$text"; +// # note: no $trail, because without an end, there *is* no trail +// continue; +// } +// } else { # it's not an image, so output it raw +// $s .= "{$prefix}[[$link|$text"; +// # note: no $trail, because without an end, there *is* no trail +// continue; +// } +// } +// +// $wasblank = ($text == ''); +// if ($wasblank) { +// $text = $link; +// } else { +// # T6598 madness. 
Handle the quotes only if they come from the alternate part +// # [[Lista d''e paise d''o munno]] -> Lista d''e paise d''o munno +// # [[Criticism of Harry Potter|Criticism of ''Harry Potter'']] +// # -> Criticism of Harry Potter +// $text = this.doQuotes($text); +// } +// +// # Link not escaped by : , create the various objects +// if ($noforce && !$nt->wasLocalInterwiki()) { +// # Interwikis +// if ( +// $iw && this.mOptions->getInterwikiMagic() && $nottalk && ( +// Language::fetchLanguageName($iw, null, 'mw') || +// in_array($iw, $wgExtraInterlanguageLinkPrefixes) +// ) +// ) { +// # T26502: filter duplicates +// if (!isset(this.mLangLinkLanguages[$iw])) { +// this.mLangLinkLanguages[$iw] = true; +// this.mOutput->addLanguageLink($nt->getFullText()); +// } +// +// $s = rtrim($s . $prefix); +// $s .= trim($trail, "\n") == '' ? '': $prefix . $trail; +// continue; +// } +// +// if ($ns == NS_FILE) { +// if (!wfIsBadImage($nt->getDBkey(), this.mTitle)) { +// if ($wasblank) { +// # if no parameters were passed, $text +// # becomes something like "File:Foo.png", +// # which we don't want to pass on to the +// # image generator +// $text = ''; +// } else { +// # recursively parse links inside the image caption +// # actually, this will parse them in any other parameters, too, +// # but it might be hard to fix that, and it doesn't matter ATM +// $text = this.replaceExternalLinks($text); +// $holders->merge(this.replaceInternalLinks2($text)); +// } +// # cloak any absolute URLs inside the image markup, so replaceExternalLinks() won't touch them +// $s .= $prefix . this.armorLinks( +// this.makeImage($nt, $text, $holders)) . $trail; +// continue; +// } +// } elseif ($ns == NS_CATEGORY) { +// $s = rtrim($s . 
"\n"); # T2087 +// +// if ($wasblank) { +// $sortkey = this.getDefaultSort(); +// } else { +// $sortkey = $text; +// } +// $sortkey = Sanitizer::decodeCharReferences($sortkey); +// $sortkey = str_replace("\n", '', $sortkey); +// $sortkey = this.getConverterLanguage()->convertCategoryKey($sortkey); +// this.mOutput->addCategory($nt->getDBkey(), $sortkey); +// +// /** +// * Strip the whitespace Category links produce, see T2087 +// */ +// $s .= trim($prefix . $trail, "\n") == '' ? '' : $prefix . $trail; +// +// continue; +// } +// } +// +// # Self-link checking. For some languages, variants of the title are checked in +// # LinkHolderArray::doVariants() to allow batching the existence checks necessary +// # for linking to a different variant. +// if ($ns != NS_SPECIAL && $nt->equals(this.mTitle) && !$nt->hasFragment()) { +// $s .= $prefix . Linker::makeSelfLinkObj($nt, $text, '', $trail); +// continue; +// } +// +// # NS_MEDIA is a pseudo-namespace for linking directly to a file +// # @todo FIXME: Should do batch file existence checks, see comment below +// if ($ns == NS_MEDIA) { +// # Give extensions a chance to select the file revision for us +// $options = []; +// $descQuery = false; +// Hooks::run('BeforeParserFetchFileAndTitle', +// [ $this, $nt, &$options, &$descQuery ]); +// # Fetch and register the file (file title may be different via hooks) +// list($file, $nt) = this.fetchFileAndTitle($nt, $options); +// # Cloak with NOPARSE to avoid replacement in replaceExternalLinks +// $s .= $prefix . this.armorLinks( +// Linker::makeMediaLinkFile($nt, $file, $text)) . 
$trail; +// continue; +// } +// +// # Some titles, such as valid special pages or files in foreign repos, should +// # be shown as bluelinks even though they're not included in the page table +// # @todo FIXME: isAlwaysKnown() can be expensive for file links; we should really do +// # batch file existence checks for NS_FILE and NS_MEDIA +// if ($iw == '' && $nt->isAlwaysKnown()) { +// this.mOutput->addLink($nt); +// $s .= this.makeKnownLinkHolder($nt, $text, $trail, $prefix); +// } else { +// # Links will be added to the output link list after checking +// $s .= $holders->makeHolder($nt, $text, [], $trail, $prefix); +// } +// } +// return $holders; +// } } diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/lnkis/Xomw_lnki_wkr__file__tst.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/lnkis/Xomw_lnki_wkr__file__tst.java index 93aefed86..23f663c2a 100644 --- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/lnkis/Xomw_lnki_wkr__file__tst.java +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/lnkis/Xomw_lnki_wkr__file__tst.java @@ -23,11 +23,19 @@ public class Xomw_lnki_wkr__file__tst { fxt.Clear(); fxt.Init__file("File:A.png", 300, 200); } - @Test public void Plain() { + @Test public void Orig() { + // basic fxt.Test__to_html("[[File:A.png]]", "A.png"); + + // caption + fxt.Test__to_html("[[File:A.png|abc]]", "abc"); } @Test public void Thumb() { + // basic fxt.Test__to_html("[[File:A.png|thumb]]", "
A.png
"); + + // caption + fxt.Test__to_html("[[File:A.png|thumb|abc]]", "
abc
"); } @Test public void Size() { fxt.Test__to_html("[[File:A.png|123x456px]]", "A.png"); @@ -66,17 +74,19 @@ class Xomw_lnki_wkr__fxt { private final Xomw_lnki_wkr wkr; private final Xomw_parser_ctx pctx; private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr(); + private final Xomw_parser_env env; private final XomwFileFinderMock file_finder; private final XomwFileRepo repo = new XomwFileRepo(Bry_.new_a7("/orig"), Bry_.new_a7("/thumb")); private boolean apos = true; public Xomw_lnki_wkr__fxt() { Xoae_app app = Xoa_app_fxt.Make__app__edit(); Xowe_wiki wiki = Xoa_app_fxt.Make__wiki__edit(app); - Xomw_parser parser = new Xomw_parser(); + XomwParser parser = new XomwParser(); wkr = parser.Lnki_wkr(); // env file_finder = new XomwFileFinderMock(parser.Env()); + env = parser.Env(); parser.Env().File_finder_(file_finder); parser.Env().Magic_word_mgr().Add(Bry_.new_u8("img_thumbnail"), Bool_.Y, Bry_.Ary("thumb")); parser.Env().Magic_word_mgr().Add(Bry_.new_u8("img_width"), Bool_.Y, Bry_.Ary("$1px")); @@ -95,7 +105,7 @@ class Xomw_lnki_wkr__fxt { } public void Test__parse(String src_str, String expd) { byte[] src_bry = Bry_.new_u8(src_str); - wkr.Replace_internal_links(pctx, pbfr.Init(src_bry)); + wkr.replaceInternalLinks(pbfr.Init(src_bry), env, pctx); if (apos) expd = gplx.langs.htmls.Gfh_utl.Replace_apos(expd); Gftest.Eq__ary__lines(expd, pbfr.Rslt().To_str_and_clear(), src_str); } @@ -109,7 +119,7 @@ class Xomw_lnki_wkr__fxt { } private String Exec__to_html(String src_str) { byte[] src_bry = Bry_.new_u8(src_str); - wkr.Replace_internal_links(pctx, pbfr.Init(src_bry)); + wkr.replaceInternalLinks(pbfr.Init(src_bry), env, pctx); wkr.replaceLinkHolders(pbfr); return pbfr.Rslt().To_str_and_clear(); } diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/magiclinks/Xomw_magiclinks_wkr.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/magiclinks/Xomw_magiclinks_wkr.java index a314b4270..402fff325 100644 --- 
a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/magiclinks/Xomw_magiclinks_wkr.java +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/magiclinks/Xomw_magiclinks_wkr.java @@ -24,7 +24,7 @@ public class Xomw_magiclinks_wkr { private static byte[] Tag__anch__rhs; private boolean[] url_separators; private static Xomw_regex_link_interrupt regex_link_interrupt; - private final Xomw_parser parser; + private final XomwParserIface parser; private final Xomw_regex_boundary regex_boundary; private final Xomw_regex_url regex_url; private final XomwSanitizer sanitizer; @@ -33,7 +33,7 @@ public class Xomw_magiclinks_wkr { private byte[] page_title; private static final byte Regex__anch = 1, Regex__elem = 2, Regex__free = 3; - public Xomw_magiclinks_wkr(Xomw_parser parser, XomwSanitizer sanitizer, XomwLinker linker, Xomw_regex_boundary regex_boundary, Xomw_regex_url regex_url) { + public Xomw_magiclinks_wkr(XomwParserIface parser, XomwSanitizer sanitizer, XomwLinker linker, Xomw_regex_boundary regex_boundary, Xomw_regex_url regex_url) { this.parser = parser; this.sanitizer = sanitizer; this.linker = linker; @@ -66,7 +66,7 @@ public class Xomw_magiclinks_wkr { // Replace special strings like "ISBN xxx" and "RFC xxx" with // magic external links. - public void Do_magic_links(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) { + public void doMagicLinks(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) { // XO.PBFR Bry_bfr src_bfr = pbfr.Src(); byte[] src = src_bfr.Bfr(); @@ -262,7 +262,7 @@ public class Xomw_magiclinks_wkr { linker.makeExternalLink(bfr, url , url // $this->getConverterLanguage()->markNoConversion($url, true), , true, Bry_.new_a7("free") - , parser.Get_external_link_attribs(atrs) + , parser.getExternalLinkAttribs(atrs) , page_title); // XO.MW.UNSUPPORTED.HOOK: registers link for processing by other extensions? 
@@ -390,4 +390,107 @@ class Xomw_regex_link_interrupt { } return Bry_find_.Not_found; } +// /** +// * Replace special strings like "ISBN xxx" and "RFC xxx" with +// * magic external links. +// * +// * DML +// * @private +// * +// * @param String $text +// * +// * @return String +// */ +// public function doMagicLinks($text) { +// $prots = wfUrlProtocolsWithoutProtRel(); +// $urlChar = self::EXT_LINK_URL_CLASS; +// $addr = self::EXT_LINK_ADDR; +// $space = self::SPACE_NOT_NL; # non-newline space +// $spdash = "(?:-|$space)"; # a dash or a non-newline space +// $spaces = "$space++"; # possessive match of 1 or more spaces +// $text = preg_replace_callback( +// '!(?: # Start cases +// (].*?) | # m[1]: Skip link text +// (<.*?>) | # m[2]: Skip stuff inside +// # HTML elements' . " +// (\b(?i:$prots)($addr$urlChar*)) | # m[3]: Free external links +// # m[4]: Post-protocol path +// \b(?:RFC|PMID) $spaces # m[5]: RFC or PMID, capture number +// ([0-9]+)\b | +// \bISBN $spaces ( # m[6]: ISBN, capture number +// (?: 97[89] $spdash?)? # optional 13-digit ISBN prefix +// (?: [0-9] $spdash?){9} # 9 digits with opt. 
delimiters +// [0-9Xx] # check digit +// )\b +// )!xu", [ &$this, 'magicLinkCallback' ], $text); +// return $text; +// } +// +// /** +// * @throws MWException +// * @param array $m +// * @return HTML|String +// */ +// public function magicLinkCallback($m) { +// if (isset($m[1]) && $m[1] !== '') { +// # Skip anchor +// return $m[0]; +// } elseif (isset($m[2]) && $m[2] !== '') { +// # Skip HTML element +// return $m[0]; +// } elseif (isset($m[3]) && $m[3] !== '') { +// # Free external link +// return this.makeFreeExternalLink($m[0], strlen($m[4])); +// } elseif (isset($m[5]) && $m[5] !== '') { +// # RFC or PMID +// if (substr($m[0], 0, 3) === 'RFC') { +// if (!this.mOptions->getMagicRFCLinks()) { +// return $m[0]; +// } +// $keyword = 'RFC'; +// $urlmsg = 'rfcurl'; +// $cssClass = 'mw-magiclink-rfc'; +// $trackingCat = 'magiclink-tracking-rfc'; +// $id = $m[5]; +// } elseif (substr($m[0], 0, 4) === 'PMID') { +// if (!this.mOptions->getMagicPMIDLinks()) { +// return $m[0]; +// } +// $keyword = 'PMID'; +// $urlmsg = 'pubmedurl'; +// $cssClass = 'mw-magiclink-pmid'; +// $trackingCat = 'magiclink-tracking-pmid'; +// $id = $m[5]; +// } else { +// throw new MWException(__METHOD__ . ': unrecognised match type "' . +// substr($m[0], 0, 20) . 
'"'); +// } +// $url = wfMessage($urlmsg, $id)->inContentLanguage()->text(); +// this.addTrackingCategory($trackingCat); +// return Linker::makeExternalLink($url, "{$keyword} {$id}", true, $cssClass, [], this.mTitle); +// } elseif (isset($m[6]) && $m[6] !== '' +// && this.mOptions->getMagicISBNLinks() +// ) { +// # ISBN +// $isbn = $m[6]; +// $space = self::SPACE_NOT_NL; # non-newline space +// $isbn = preg_replace("/$space/", ' ', $isbn); +// $num = strtr($isbn, [ +// '-' => '', +// ' ' => '', +// 'x' => 'X', +// ]); +// this.addTrackingCategory('magiclink-tracking-isbn'); +// return this.getLinkRenderer()->makeKnownLink( +// SpecialPage::getTitleFor('Booksources', $num), +// "ISBN $isbn", +// [ +// 'class' => '@gplx.Internal protected mw-magiclink-isbn', +// 'title' => false // suppress title attribute +// ] +// ); +// } else { +// return $m[0]; +// } +// } } diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/magiclinks/Xomw_magiclinks_wkr__tst.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/magiclinks/Xomw_magiclinks_wkr__tst.java index 4453faefe..bad507edf 100644 --- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/magiclinks/Xomw_magiclinks_wkr__tst.java +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/magiclinks/Xomw_magiclinks_wkr__tst.java @@ -79,7 +79,7 @@ class Xomw_magiclinks_wkr__fxt { public void Test__parse(boolean apos, String src_str, String expd) { byte[] src_bry = Bry_.new_u8(src_str); pbfr.Init(src_bry); - wkr.Do_magic_links(pctx, pbfr); + wkr.doMagicLinks(pctx, pbfr); if (apos) expd = gplx.langs.htmls.Gfh_utl.Replace_apos(expd); Tfds.Eq_str_lines(expd, pbfr.Rslt().To_str_and_clear(), src_str); } diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/nbsps/Xomw_nbsp_wkr.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/nbsps/Xomw_nbsp_wkr.java index a94e72c93..58d7bf66d 100644 --- 
a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/nbsps/Xomw_nbsp_wkr.java +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/nbsps/Xomw_nbsp_wkr.java @@ -17,7 +17,7 @@ package gplx.xowa.mediawiki.includes.parsers.nbsps; import gplx.*; import gplx.x import gplx.core.btries.*; public class Xomw_nbsp_wkr { private final Btrie_rv trv = new Btrie_rv(); - public void Do_nbsp(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) { + public void doNbsp(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) { // PORTED: // Clean up special characters, only run once, next-to-last before doBlockLevels // $fixtags = [ diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/nbsps/Xomw_nbsp_wkr__tst.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/nbsps/Xomw_nbsp_wkr__tst.java index f6b3fa60b..fc4449e2a 100644 --- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/nbsps/Xomw_nbsp_wkr__tst.java +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/nbsps/Xomw_nbsp_wkr__tst.java @@ -31,7 +31,7 @@ class Xomw_nbsp_wkr__fxt { public void Test__parse(String src_str, String expd) { byte[] src_bry = Bry_.new_u8(src_str); pbfr.Init(src_bry); - wkr.Do_nbsp(pctx, pbfr); + wkr.doNbsp(pctx, pbfr); if (apos) expd = gplx.langs.htmls.Gfh_utl.Replace_apos(expd); Tfds.Eq_str_lines(expd, pbfr.Rslt().To_str_and_clear(), src_str); } diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/quotes/Xomw_quote_wkr.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/quotes/Xomw_quote_wkr.java index c98f61ffe..92d35d89e 100644 --- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/quotes/Xomw_quote_wkr.java +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/quotes/Xomw_quote_wkr.java @@ -20,10 +20,10 @@ import gplx.core.primitives.*; public class Xomw_quote_wkr {// THREAD.UNSAFE: caching for repeated calls private Bry_bfr tmp; private final Int_list apos_pos_ary = new 
Int_list(32); - public Xomw_quote_wkr(Xomw_parser mgr) { - this.tmp = mgr.Tmp(); + public Xomw_quote_wkr(Bry_bfr tmp) { + this.tmp = tmp; } - public void Do_all_quotes(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) { + public void doAllQuotes(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) { Bry_bfr src_bfr = pbfr.Src(); byte[] src = src_bfr.Bfr(); int src_bgn = 0; @@ -262,4 +262,203 @@ public class Xomw_quote_wkr {// THREAD.UNSAFE: caching for repeated calls , State__both = 5 ; private static final byte[] Wtxt__apos = Bry_.new_a7("''"); +// /** +// * Replace single quotes with HTML markup +// * @private +// * +// * @param String $text +// * +// * @return String The altered text +// */ +// public function doAllQuotes($text) { +// $outtext = ''; +// $lines = StringUtils::explode("\n", $text); +// foreach ($lines as $line) { +// $outtext .= this.doQuotes($line) . "\n"; +// } +// $outtext = substr($outtext, 0, -1); +// return $outtext; +// } +// +// /** +// * Helper function for doAllQuotes() +// * +// * @param String $text +// * +// * @return String +// */ +// public function doQuotes($text) { +// $arr = preg_split("/(''+)/", $text, -1, PREG_SPLIT_DELIM_CAPTURE); +// $countarr = count($arr); +// if ($countarr == 1) { +// return $text; +// } +// +// // First, do some preliminary work. This may shift some apostrophes from +// // being mark-up to being text. It also counts the number of occurrences +// // of bold and italics mark-ups. +// $numbold = 0; +// $numitalics = 0; +// for ($i = 1; $i < $countarr; $i += 2) { +// $thislen = strlen($arr[$i]); +// // If there are ever four apostrophes, assume the first is supposed to +// // be text, and the remaining three constitute mark-up for bold text. +// // (T15227: ''''foo'''' turns into ' ''' foo ' ''') +// if ($thislen == 4) { +// $arr[$i - 1] .= "'"; +// $arr[$i] = "'''"; +// $thislen = 3; +// } elseif ($thislen > 5) { +// // If there are more than 5 apostrophes in a row, assume they're all +// // text except for the last 5. 
+// // (T15227: ''''''foo'''''' turns into ' ''''' foo ' ''''') +// $arr[$i - 1] .= str_repeat("'", $thislen - 5); +// $arr[$i] = "'''''"; +// $thislen = 5; +// } +// // Count the number of occurrences of bold and italics mark-ups. +// if ($thislen == 2) { +// $numitalics++; +// } elseif ($thislen == 3) { +// $numbold++; +// } elseif ($thislen == 5) { +// $numitalics++; +// $numbold++; +// } +// } +// +// // If there is an odd number of both bold and italics, it is likely +// // that one of the bold ones was meant to be an apostrophe followed +// // by italics. Which one we cannot know for certain, but it is more +// // likely to be one that has a single-letter word before it. +// if (($numbold % 2 == 1) && ($numitalics % 2 == 1)) { +// $firstsingleletterword = -1; +// $firstmultiletterword = -1; +// $firstspace = -1; +// for ($i = 1; $i < $countarr; $i += 2) { +// if (strlen($arr[$i]) == 3) { +// $x1 = substr($arr[$i - 1], -1); +// $x2 = substr($arr[$i - 1], -2, 1); +// if ($x1 === ' ') { +// if ($firstspace == -1) { +// $firstspace = $i; +// } +// } elseif ($x2 === ' ') { +// $firstsingleletterword = $i; +// // if $firstsingleletterword is set, we don't +// // look at the other options, so we can bail early. +// break; +// } else { +// if ($firstmultiletterword == -1) { +// $firstmultiletterword = $i; +// } +// } +// } +// } +// +// // If there is a single-letter word, use it! +// if ($firstsingleletterword > -1) { +// $arr[$firstsingleletterword] = "''"; +// $arr[$firstsingleletterword - 1] .= "'"; +// } elseif ($firstmultiletterword > -1) { +// // If not, but there's a multi-letter word, use that one. +// $arr[$firstmultiletterword] = "''"; +// $arr[$firstmultiletterword - 1] .= "'"; +// } elseif ($firstspace > -1) { +// // ... otherwise use the first one that has neither. 
+// // (notice that it is possible for all three to be -1 if, for example, +// // there is only one pentuple-apostrophe in the line) +// $arr[$firstspace] = "''"; +// $arr[$firstspace - 1] .= "'"; +// } +// } +// +// // Now let's actually convert our apostrophic mush to HTML! +// $output = ''; +// $buffer = ''; +// $state = ''; +// $i = 0; +// foreach ($arr as $r) { +// if (($i % 2) == 0) { +// if ($state === 'both') { +// $buffer .= $r; +// } else { +// $output .= $r; +// } +// } else { +// $thislen = strlen($r); +// if ($thislen == 2) { +// if ($state === 'i') { +// $output .= ''; +// $state = ''; +// } elseif ($state === 'bi') { +// $output .= ''; +// $state = 'b'; +// } elseif ($state === 'ib') { +// $output .= ''; +// $state = 'b'; +// } elseif ($state === 'both') { +// $output .= '' . $buffer . ''; +// $state = 'b'; +// } else { // $state can be 'b' or '' +// $output .= ''; +// $state .= 'i'; +// } +// } elseif ($thislen == 3) { +// if ($state === 'b') { +// $output .= ''; +// $state = ''; +// } elseif ($state === 'bi') { +// $output .= ''; +// $state = 'i'; +// } elseif ($state === 'ib') { +// $output .= ''; +// $state = 'i'; +// } elseif ($state === 'both') { +// $output .= '' . $buffer . ''; +// $state = 'i'; +// } else { // $state can be 'i' or '' +// $output .= ''; +// $state .= 'b'; +// } +// } elseif ($thislen == 5) { +// if ($state === 'b') { +// $output .= ''; +// $state = 'i'; +// } elseif ($state === 'i') { +// $output .= ''; +// $state = 'b'; +// } elseif ($state === 'bi') { +// $output .= ''; +// $state = ''; +// } elseif ($state === 'ib') { +// $output .= ''; +// $state = ''; +// } elseif ($state === 'both') { +// $output .= '' . $buffer . ''; +// $state = ''; +// } else { // ($state == '') +// $buffer = ''; +// $state = 'both'; +// } +// } +// } +// $i++; +// } +// // Now close all remaining tags. Notice that the order is important. 
+// if ($state === 'b' || $state === 'ib') { +// $output .= ''; +// } +// if ($state === 'i' || $state === 'bi' || $state === 'ib') { +// $output .= ''; +// } +// if ($state === 'bi') { +// $output .= ''; +// } +// // There might be lonely ''''', so make sure we have a buffer +// if ($state === 'both' && $buffer) { +// $output .= '' . $buffer . ''; +// } +// return $output; +// } } diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/quotes/Xomw_quote_wkr__tst.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/quotes/Xomw_quote_wkr__tst.java index d88a17918..a36843e13 100644 --- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/quotes/Xomw_quote_wkr__tst.java +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/quotes/Xomw_quote_wkr__tst.java @@ -33,11 +33,11 @@ public class Xomw_quote_wkr__tst { @Test public void Nl__text() {fxt.Test__parse("a\nb''c''d\n\ne" , "a\nbcd\n\ne");} } class Xomw_quote_wkr__fxt { - private final Xomw_quote_wkr wkr = new Xomw_quote_wkr(new Xomw_parser()); + private final Xomw_quote_wkr wkr = new Xomw_quote_wkr(Bry_bfr_.New()); private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr(); public void Test__parse(String src_str, String expd) { byte[] src_bry = Bry_.new_u8(src_str); - wkr.Do_all_quotes(new Xomw_parser_ctx(), pbfr.Init(src_bry)); + wkr.doAllQuotes(new Xomw_parser_ctx(), pbfr.Init(src_bry)); Tfds.Eq_str_lines(expd, pbfr.Rslt().To_str_and_clear(), src_str); } } diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/tables/Xomw_table_wkr.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/tables/Xomw_table_wkr.java index 7fef6d17f..468aa35cf 100644 --- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/tables/Xomw_table_wkr.java +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/tables/Xomw_table_wkr.java @@ -20,7 +20,7 @@ import gplx.xowa.mediawiki.includes.libs.*; import gplx.xowa.parsers.uniqs.*; 
public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.UNSAFE: caching for repeated calls private final Bry_bfr tmp; private Bry_bfr bfr; - private final XomwSanitizer sanitizer; private final Xomw_strip_state strip_state; + private final XomwSanitizer sanitizer; private final XomwStripState strip_state; private final List_adp td_history = List_adp_.New() // Is currently a td tag open? , last_tag_history = List_adp_.New() // Save history of last lag activated (td, th or caption) @@ -30,12 +30,12 @@ public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U ; private int indent_level = 0; // indent level of the table private byte[] first_2 = new byte[2]; - public Xomw_table_wkr(Xomw_parser parser) { - this.tmp = parser.Tmp(); - this.sanitizer = parser.Sanitizer(); - this.strip_state = parser.Strip_state(); + public Xomw_table_wkr(Bry_bfr tmp, XomwSanitizer sanitizer, XomwStripState stripState) { + this.tmp = tmp; + this.sanitizer = sanitizer; + this.strip_state = stripState; } - public void Do_table_stuff(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) { + public void doTableStuff(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) { Bry_bfr src_bfr = pbfr.Src(); byte[] src = src_bfr.Bfr(); int src_bgn = 0; @@ -101,7 +101,7 @@ public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U // First check if we are starting a new table indent_level = colons_end; - tblw_atrs = strip_state.Unstrip_both(tblw_atrs); + tblw_atrs = strip_state.unstripBoth(tblw_atrs); // PORTED: out_line = str_repeat('
', $indent_level) . ""; for (int j = 0; j < indent_level; j++) @@ -149,7 +149,7 @@ public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U line = Bry_.Mid(line, 2); // PORTED: $line = preg_replace('#^\|-+#', '', $line); // Whats after the tag is now only attributes - byte[] atrs = strip_state.Unstrip_both(line); + byte[] atrs = strip_state.unstripBoth(line); sanitizer.Fix_tag_attributes(tmp, Name__tr, atrs); atrs = tmp.To_bry_and_clear(); @@ -251,7 +251,7 @@ public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U cell = tmp.Add(previous).Add_byte(Byte_ascii.Angle_bgn).Add(last_tag).Add_byte(Byte_ascii.Angle_end).Add(cell_data_0).To_bry_and_clear(); } else { - byte[] atrs = strip_state.Unstrip_both(cell_data_0); + byte[] atrs = strip_state.unstripBoth(cell_data_0); tmp.Add(previous).Add_byte(Byte_ascii.Angle_bgn).Add(last_tag); sanitizer.Fix_tag_attributes(tmp, last_tag, atrs); tmp.Add_byte(Byte_ascii.Angle_end).Add(cell_data_1); @@ -265,6 +265,197 @@ public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U bfr.Add(out_line).Add_byte_nl(); return Bry_split_.Rv__ok; } +// public function doTableStuff($text) { +// +// $lines = StringUtils::explode("\n", $text); +// $out = ''; +// $td_history = []; # Is currently a td tag open? +// $last_tag_history = []; # Save history of last lag activated (td, th or caption) +// $tr_history = []; # Is currently a tr tag open? +// $tr_attributes = []; # history of tr attributes +// $has_opened_tr = []; # Did this table open a element? +// $indent_level = 0; # indent level of the table +// +// foreach ($lines as $outLine) { +// $line = trim($outLine); +// +// if ($line === '') { # empty line, go to next line +// $out .= $outLine . 
"\n"; +// continue; +// } +// +// $first_character = $line[0]; +// $first_two = substr($line, 0, 2); +// $matches = []; +// +// if (preg_match('/^(:*)\s*\{\|(.*)$/', $line, $matches)) { +// # First check if we are starting a new table +// $indent_level = strlen($matches[1]); +// +// $attributes = this.mStripState->unstripBoth($matches[2]); +// $attributes = Sanitizer::fixTagAttributes($attributes, 'table'); +// +// $outLine = str_repeat('
', $indent_level) . ""; +// array_push($td_history, false); +// array_push($last_tag_history, ''); +// array_push($tr_history, false); +// array_push($tr_attributes, ''); +// array_push($has_opened_tr, false); +// } elseif (count($td_history) == 0) { +// # Don't do any of the following +// $out .= $outLine . "\n"; +// continue; +// } elseif ($first_two === '|}') { +// # We are ending a table +// $line = '' . substr($line, 2); +// $last_tag = array_pop($last_tag_history); +// +// if (!array_pop($has_opened_tr)) { +// $line = "{$line}"; +// } +// +// if (array_pop($tr_history)) { +// $line = "{$line}"; +// } +// +// if (array_pop($td_history)) { +// $line = "{$line}"; +// } +// array_pop($tr_attributes); +// $outLine = $line . str_repeat('
', $indent_level); +// } elseif ($first_two === '|-') { +// # Now we have a table row +// $line = preg_replace('#^\|-+#', '', $line); +// +// # Whats after the tag is now only attributes +// $attributes = this.mStripState->unstripBoth($line); +// $attributes = Sanitizer::fixTagAttributes($attributes, 'tr'); +// array_pop($tr_attributes); +// array_push($tr_attributes, $attributes); +// +// $line = ''; +// $last_tag = array_pop($last_tag_history); +// array_pop($has_opened_tr); +// array_push($has_opened_tr, true); +// +// if (array_pop($tr_history)) { +// $line = ''; +// } +// +// if (array_pop($td_history)) { +// $line = "{$line}"; +// } +// +// $outLine = $line; +// array_push($tr_history, false); +// array_push($td_history, false); +// array_push($last_tag_history, ''); +// } elseif ($first_character === '|' +// || $first_character === '!' +// || $first_two === '|+' +// ) { +// # This might be cell elements, td, th or captions +// if ($first_two === '|+') { +// $first_character = '+'; +// $line = substr($line, 2); +// } else { +// $line = substr($line, 1); +// } +// +// // Implies both are valid for table headings. +// if ($first_character === '!') { +// $line = StringUtils::replaceMarkup('!!', '||', $line); +// } +// +// # Split up multiple cells on the same line. +// # FIXME : This can result in improper nesting of tags processed +// # by earlier parser steps. 
+// $cells = explode('||', $line); +// +// $outLine = ''; +// +// # Loop through each table cell +// foreach ($cells as $cell) { +// $previous = ''; +// if ($first_character !== '+') { +// $tr_after = array_pop($tr_attributes); +// if (!array_pop($tr_history)) { +// $previous = "\n"; +// } +// array_push($tr_history, true); +// array_push($tr_attributes, ''); +// array_pop($has_opened_tr); +// array_push($has_opened_tr, true); +// } +// +// $last_tag = array_pop($last_tag_history); +// +// if (array_pop($td_history)) { +// $previous = "\n{$previous}"; +// } +// +// if ($first_character === '|') { +// $last_tag = 'td'; +// } elseif ($first_character === '!') { +// $last_tag = 'th'; +// } elseif ($first_character === '+') { +// $last_tag = 'caption'; +// } else { +// $last_tag = ''; +// } +// +// array_push($last_tag_history, $last_tag); +// +// # A cell could contain both parameters and data +// $cell_data = explode('|', $cell, 2); +// +// # T2553: Note that a '|' inside an invalid link should not +// # be mistaken as delimiting cell parameters +// # Bug T153140: Neither should language converter markup. +// if (preg_match('/\[\[|-\{/', $cell_data[0]) === 1) { +// $cell = "{$previous}<{$last_tag}>{$cell}"; +// } elseif (count($cell_data) == 1) { +// $cell = "{$previous}<{$last_tag}>{$cell_data[0]}"; +// } else { +// $attributes = this.mStripState->unstripBoth($cell_data[0]); +// $attributes = Sanitizer::fixTagAttributes($attributes, $last_tag); +// $cell = "{$previous}<{$last_tag}{$attributes}>{$cell_data[1]}"; +// } +// +// $outLine .= $cell; +// array_push($td_history, true); +// } +// } +// $out .= $outLine . 
"\n"; +// } +// +// # Closing open td, tr && table +// while (count($td_history) > 0) { +// if (array_pop($td_history)) { +// $out .= "\n"; +// } +// if (array_pop($tr_history)) { +// $out .= "\n"; +// } +// if (!array_pop($has_opened_tr)) { +// $out .= "\n"; +// } +// +// $out .= "\n"; +// } +// +// # Remove trailing line-ending (b/c) +// if (substr($out, -1) === "\n") { +// $out = substr($out, 0, -1); +// } +// +// # special case: don't return empty table +// if ($out === "\n\n
") { +// $out = ''; +// } +// +// return $out; +// } private static final byte[] Wtxt__tb__bgn = Bry_.new_a7("{|") , Wtxt__tb__end = Bry_.new_a7("|}") diff --git a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/tables/Xomw_table_wkr__tst.java b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/tables/Xomw_table_wkr__tst.java index ef18ad511..d40bc6e44 100644 --- a/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/tables/Xomw_table_wkr__tst.java +++ b/gplx.xowa.mediawiki/src/gplx/xowa/mediawiki/includes/parsers/tables/Xomw_table_wkr__tst.java @@ -117,11 +117,16 @@ public class Xomw_table_wkr__tst { class Xomw_table_wkr__fxt { private final Xomw_parser_bfr parser_bfr = new Xomw_parser_bfr(); private final Xomw_parser_ctx pctx = new Xomw_parser_ctx(); - private final Xomw_table_wkr wkr = new Xomw_table_wkr(new Xomw_parser()); + private final Xomw_table_wkr wkr; + public Xomw_table_wkr__fxt() { + Xomw_parser parser = new Xomw_parser(); + this.wkr = new Xomw_table_wkr(parser.Tmp(), parser.Sanitizer(), parser.Strip_state()); + } + public void Test__parse(String src_str, String expd) { byte[] src_bry = Bry_.new_u8(src_str); parser_bfr.Init(src_bry); - wkr.Do_table_stuff(pctx, parser_bfr); + wkr.doTableStuff(pctx, parser_bfr); Tfds.Eq_str_lines(expd, parser_bfr.Rslt().To_str_and_clear(), src_str); } }