/* XOWA: the XOWA Offline Wiki Application Copyright (C) 2012-2017 gnosygnu@gmail.com XOWA is licensed under the terms of the General Public License (GPL) Version 3, or alternatively under the terms of the Apache License Version 2.0. You may use XOWA according to either of these licenses as is most appropriate for your project on a case-by-case basis. The terms of each license can be found in the source code repository: GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt */ package gplx.xowa.mediawiki.includes; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.core.brys.*; import gplx.core.btries.*; import gplx.core.encoders.*; import gplx.core.primitives.*; import gplx.langs.htmls.entitys.*; import gplx.xowa.parsers.htmls.*; import gplx.langs.htmls.*; import gplx.xowa.mediawiki.includes.htmls.*; import gplx.xowa.mediawiki.includes.parsers.*; import gplx.xowa.mediawiki.includes.utls.*; public class XomwSanitizer { private final Mwh_doc_wkr__atr_bldr atr_bldr = new Mwh_doc_wkr__atr_bldr(); private final Mwh_atr_parser atr_parser = new Mwh_atr_parser(); private final Xomw_regex_escape_invalid regex_clean_url = new Xomw_regex_escape_invalid(); private final Xomw_regex_find_domain regex_find_domain = new Xomw_regex_find_domain(); private final Xomw_regex_ipv6_brack regex_ipv6_brack = new Xomw_regex_ipv6_brack(); private final Bry_tmp tmp_host = new Bry_tmp(); private final Bry_bfr tmp_bfr = Bry_bfr_.New(); private final Bry_bfr tmp_bfr_2 = Bry_bfr_.New(); private final Btrie_rv trv = new Btrie_rv(); private final Xomw_regex_url_char_cbk__normalize normalize_cbk = new Xomw_regex_url_char_cbk__normalize(); private final Xomw_regex_url_char_cbk__decode decode_cbk = new Xomw_regex_url_char_cbk__decode(); private static Xomw_regex_url_char regex_url_char; private static Btrie_slim_mgr invalid_idn_trie; public XomwSanitizer() { if 
(regex_url_char == null) { synchronized (Type_adp_.ClassOf_obj(this)) { regex_url_char = new Xomw_regex_url_char(); // Characters that will be ignored in IDNs. // https://tools.ietf.org/html/rfc3454#section-3.1 // $strip = "/ // \\s| // general whitespace // \xc2\xad| // 00ad SOFT HYPHEN // \xe1\xa0\x86| // 1806 MONGOLIAN TODO SOFT HYPHEN // \xe2\x80\x8b| // 200b ZERO WIDTH SPACE // \xe2\x81\xa0| // 2060 WORD JOINER // \xef\xbb\xbf| // feff ZERO WIDTH NO-BREAK SPACE // \xcd\x8f| // 034f COMBINING GRAPHEME JOINER // \xe1\xa0\x8b| // 180b MONGOLIAN FREE VARIATION SELECTOR ONE // \xe1\xa0\x8c| // 180c MONGOLIAN FREE VARIATION SELECTOR TWO // \xe1\xa0\x8d| // 180d MONGOLIAN FREE VARIATION SELECTOR THREE // \xe2\x80\x8c| // 200c ZERO WIDTH NON-JOINER // \xe2\x80\x8d| // 200d ZERO WIDTH JOINER // [\xef\xb8\x80-\xef\xb8\x8f] // fe00-fe0f VARIATION SELECTOR-1-16 // /xuD"; // XO.MW.REGEX:http://php.net/manual/en/reference.pcre.pattern.modifiers.php // /x : ignore embedded ws // /u : enabled pcre utf8 // /D : $ matches EOS, not NL invalid_idn_trie = Btrie_slim_mgr.cs() .Add_many_bry(new Xomw_regex_parser().Add_ary ("\\s" , "\\xc2\\xad" // 00ad SOFT HYPHEN , "\\xe1\\xa0\\x86" // 1806 MONGOLIAN TODO SOFT HYPHEN , "\\xe2\\x80\\x8b" // 200b ZERO WIDTH SPACE , "\\xe2\\x81\\xa0" // 2060 WORD JOINER , "\\xef\\xbb\\xbf" // feff ZERO WIDTH NO-BREAK SPACE , "\\xcd\\x8f" // 034f COMBINING GRAPHEME JOINER , "\\xe1\\xa0\\x8b" // 180b MONGOLIAN FREE VARIATION SELECTOR ONE , "\\xe1\\xa0\\x8c" // 180c MONGOLIAN FREE VARIATION SELECTOR TWO , "\\xe1\\xa0\\x8d" // 180d MONGOLIAN FREE VARIATION SELECTOR THREE , "\\xe2\\x80\\x8c" // 200c ZERO WIDTH NON-JOINER , "\\xe2\\x80\\x8d" // 200d ZERO WIDTH JOINER ) .Add_rng ("\\xef\\xb8\\x80", "\\xef\\xb8\\x8f" // fe00-fe0f VARIATION SELECTOR-1-16 ) .Rslt()); // assert static structs if (html_entities == null) { synchronized (Type_adp_.ClassOf_obj(this)) { html_entities = Html_entities_new(); } } } } } /** * Regular expression to match various types of 
character references in * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences */ // XO.MW.MOVED:Xomw_regex_url_char // static final CHAR_REFS_REGEX = // '/&([A-Za-z0-9\x80-\xff]+); // |&\#([0-9]+); // |&\#[xX]([0-9A-Fa-f]+); // |(&)/x'; // /** // * Acceptable tag name charset from HTML5 parsing spec // * https://www.w3.org/TR/html5/syntax.html#tag-open-state // */ // static final ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!'; // // /** // * Blacklist for evil uris like javascript: // * WARNING: DO NOT use this in any place that actually requires blacklisting // * for security reasons. There are NUMEROUS[1] ways to bypass blacklisting, the // * only way to be secure from javascript: uri based xss vectors is to whitelist // * things that you know are safe and deny everything else. // * [1]: http://ha.ckers.org/xss.html // */ // static final EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i'; // static final XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/"; /** * List of all named character entities defined in HTML 4.01 * https://www.w3.org/TR/html4/sgml/entities.html * As well as &apos; which is only defined starting in XHTML1. */ // XO.MW.MOVED:Html_entities_new // private static $htmlEntities = [] /** * Character entity aliases accepted by MediaWiki */ // XO.MW.MOVED:Html_entities_new // private static $htmlEntityAliases = [] // /** // * Lazy-initialised attributes regex, see getAttribsRegex() // */ // private static $attribsRegex; // // /** // * Regular expression to match HTML/XML attribute pairs within a tag. // * Allows some... latitude. 
Based on, // * https://www.w3.org/TR/html5/syntax.html#before-attribute-value-state // * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes // * @return String // */ // static function getAttribsRegex() { // if (self::$attribsRegex === null) { // $attribFirst = '[:A-Z_a-z0-9]'; // $attrib = '[:A-Z_a-z-.0-9]'; // $space = '[\x09\x0a\x0c\x0d\x20]'; // self::$attribsRegex = // "/(?:^|$space)({$attribFirst}{$attrib}*) // ($space*=$space* // (?: // # The attribute value: quoted or alone // \"([^\"]*)(?:\"|\$) // | '([^']*)(?:'|\$) // | (((?!$space|>).)*) // ) // )?(?=$space|\$)/sx"; // } // return self::$attribsRegex; // } // // /** // * Return the various lists of recognized tags // * @param array $extratags For any extra tags to include // * @param array $removetags For any tags (default or extra) to exclude // * @return array // */ // public static function getRecognizedTagData($extratags = [], $removetags = []) { // global $wgAllowImageTag; // // static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags, // $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised; // // // Base our staticInitialised variable off of the global config state so that if the globals // // are changed (like in the screwed up test system) we will re-initialise the settings. // $globalContext = $wgAllowImageTag; // if (!$staticInitialised || $staticInitialised != $globalContext) { // $htmlpairsStatic = [ # Tags that must be closed // 'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1', // 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's', // 'strike', 'strong', 'tt', 'var', 'div', 'center', // 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', // 'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn', // 'kbd', 'samp', 'data', 'time', 'mark' // ]; // $htmlsingle = [ // 'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link' // ]; // // # Elements that cannot have close tags. 
This is (not coincidentally) // # also the list of tags for which the HTML 5 parsing algorithm // # requires you to "acknowledge the token's self-closing flag", i.e. // # a self-closing tag like <br/>
is not an HTML 5 parse error only // # for this list. // $htmlsingleonly = [ // 'br', 'wbr', 'hr', 'meta', 'link' // ]; // // $htmlnest = [ # Tags that can be nested--?? // 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', // 'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span', // 'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo' // ]; // $tabletags = [ # Can only appear inside table, we will close them // 'td', 'th', 'tr', // ]; // $htmllist = [ # Tags used by list // 'ul', 'ol', // ]; // $listtags = [ # Tags that can appear in a list // 'li', // ]; // // if ($wgAllowImageTag) { // $htmlsingle[] = 'img'; // $htmlsingleonly[] = 'img'; // } // // $htmlsingleallowed = array_unique(array_merge($htmlsingle, $tabletags)); // $htmlelementsStatic = array_unique(array_merge($htmlsingle, $htmlpairsStatic, $htmlnest)); // // # Convert them all to hashtables for faster lookup // $vars = [ 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags', // 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' ]; // foreach ($vars as $var) { // $$var = array_flip($$var); // } // $staticInitialised = $globalContext; // } // // # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays // $extratags = array_flip($extratags); // $removetags = array_flip($removetags); // $htmlpairs = array_merge($extratags, $htmlpairsStatic); // $htmlelements = array_diff_key(array_merge($extratags, $htmlelementsStatic), $removetags); // // return [ // 'htmlpairs' => $htmlpairs, // 'htmlsingle' => $htmlsingle, // 'htmlsingleonly' => $htmlsingleonly, // 'htmlnest' => $htmlnest, // 'tabletags' => $tabletags, // 'htmllist' => $htmllist, // 'listtags' => $listtags, // 'htmlsingleallowed' => $htmlsingleallowed, // 'htmlelements' => $htmlelements, // ]; // } // // /** // * Cleans up HTML, removes dangerous tags and attributes, and // * removes HTML comments // * @param String $text // * @param callable $processCallback 
Callback to do any variable or parameter // * replacements in HTML attribute values // * @param array|boolean $args Arguments for the processing callback // * @param array $extratags For any extra tags to include // * @param array $removetags For any tags (default or extra) to exclude // * @param callable $warnCallback (Deprecated) Callback allowing the // * addition of a tracking category when bad input is encountered. // * DO NOT ADD NEW PARAMETERS AFTER $warnCallback, since it will be // * removed shortly. // * @return String // */ // public static function removeHTMLtags($text, $processCallback = null, // $args = [], $extratags = [], $removetags = [], $warnCallback = null // ) { // extract(self::getRecognizedTagData($extratags, $removetags)); // // # Remove HTML comments // $text = Sanitizer::removeHTMLcomments($text); // $bits = explode('<', $text); // $text = str_replace('>', '&gt;', array_shift($bits)); // if (!MWTidy::isEnabled()) { // $tagstack = $tablestack = []; // foreach ($bits as $x) { // $regs = []; // # $slash: Does the current element start with a '/'? // # $t: Current element name // # $params: String between element name and > // # $brace: Ending '>' or '/>' // # $rest: Everything until the next element of $bits // if (preg_match(self::ELEMENT_BITS_REGEX, $x, $regs)) { // list(/* $qbar */, $slash, $t, $params, $brace, $rest) = $regs; // } else { // $slash = $t = $params = $brace = $rest = null; // } // // $badtag = false; // $t = strtolower($t); // if (isset($htmlelements[$t])) { // # Check our stack // if ($slash && isset($htmlsingleonly[$t])) { // $badtag = true; // } elseif ($slash) { // # Closing a tag... is it the one we just opened? 
// MediaWiki\suppressWarnings(); // $ot = array_pop($tagstack); // MediaWiki\restoreWarnings(); // // if ($ot != $t) { // if (isset($htmlsingleallowed[$ot])) { // # Pop all elements with an optional close tag // # and see if we find a match below them // $optstack = []; // array_push($optstack, $ot); // MediaWiki\suppressWarnings(); // $ot = array_pop($tagstack); // MediaWiki\restoreWarnings(); // while ($ot != $t && isset($htmlsingleallowed[$ot])) { // array_push($optstack, $ot); // MediaWiki\suppressWarnings(); // $ot = array_pop($tagstack); // MediaWiki\restoreWarnings(); // } // if ($t != $ot) { // # No match. Push the optional elements back again // $badtag = true; // MediaWiki\suppressWarnings(); // $ot = array_pop($optstack); // MediaWiki\restoreWarnings(); // while ($ot) { // array_push($tagstack, $ot); // MediaWiki\suppressWarnings(); // $ot = array_pop($optstack); // MediaWiki\restoreWarnings(); // } // } // } else { // MediaWiki\suppressWarnings(); // array_push($tagstack, $ot); // MediaWiki\restoreWarnings(); // // #
<li> can be nested in