1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2026-03-02 03:49:30 +00:00

Xomw: Move Mw_parse classes into separate project

This commit is contained in:
gnosygnu
2017-02-08 17:38:39 -05:00
parent fdf6c49a05
commit 9a19be675e
117 changed files with 394 additions and 260 deletions

View File

@@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry kind="src" path="src"/>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<classpathentry combineaccessrules="false" kind="src" path="/100_core"/>
<classpathentry combineaccessrules="false" kind="src" path="/140_dbs"/>
<classpathentry combineaccessrules="false" kind="src" path="/150_gfui"/>
<classpathentry combineaccessrules="false" kind="src" path="/400_xowa"/>
<classpathentry kind="lib" path="C:/000/200_dev/110_java/lib/junit.jar"/>
<classpathentry kind="output" path="bin"/>
</classpath>

View File

@@ -0,0 +1,33 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*;
/**
 * A single magic word: its name, its case flag, and one
 * {@link Xomw_MagicWordSynonym} per raw synonym text.
 */
public class Xomw_MagicWord {
    public boolean case_match;
    public byte[] name;
    public Xomw_MagicWordSynonym[] synonyms;

    /**
     * @param name          name of the magic word; also passed to every synonym
     * @param case_match    case flag; stored here and passed to every synonym
     * @param synonyms_ary  raw synonym texts; one synonym object is created per entry, in order
     */
    public Xomw_MagicWord(byte[] name, boolean case_match, byte[][] synonyms_ary) {
        this.name = name;
        this.case_match = case_match;

        Xomw_MagicWordSynonym[] built = new Xomw_MagicWordSynonym[synonyms_ary.length];
        int idx = 0;
        for (byte[] synonym_text : synonyms_ary) {
            built[idx++] = new Xomw_MagicWordSynonym(name, case_match, synonym_text);
        }
        this.synonyms = built;
    }
}

View File

@@ -0,0 +1,376 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*;
import gplx.core.btries.*; import gplx.core.primitives.*;
public class Xomw_MagicWordArray {
// forward trie: holds synonyms with no "$1" or a trailing "$1"; created lazily by the constructor and may stay null
private Btrie_slim_mgr fwd_trie;
// backward trie: holds synonyms with a leading "$1"; created lazily by the constructor and may stay null
private Btrie_bwd_mgr bwd_trie;
// reusable trie-match result; matchVariableStartToEnd reads the match position from it
private final Btrie_rv trv = new Btrie_rv();
// private final Xomw_MagicWordMgr magic_word_mgr;
// names of the magic words in this group, exactly as passed to the constructor
public final byte[][] names;
// /** @var array */
// private hash;
// private baseRegex;
// private regex;
/**
 * Registers every synonym of the named magic words in a lookup trie.
 * Synonyms with no "$1" or a trailing "$1" go into the forward trie; synonyms with a
 * leading "$1" go into the backward trie. Synonyms with "$1" in the middle, or with
 * several "$1", are not supported: a warning is logged and they are skipped.
 * Names that the manager does not know are ignored.
 *
 * @param magic_word_mgr  registry used to resolve each name to its magic word
 * @param names           names of the magic words forming this group
 */
public Xomw_MagicWordArray(Xomw_MagicWordMgr magic_word_mgr, byte[][] names) {
    // this.magic_word_mgr = magic_word_mgr;
    this.names = names;

    // ASSUME: all magic words in a group have the same case sensitivity
    // (each trie takes its case mode from the first word that needs it)
    for (byte[] name : names) {
        Xomw_MagicWord word = magic_word_mgr.Get(name);
        if (word == null) continue;

        for (Xomw_MagicWordSynonym synonym : word.synonyms) {
            byte tid = synonym.arg1_tid;
            if (tid == Xomw_MagicWordSynonym.Arg1__nil || tid == Xomw_MagicWordSynonym.Arg1__end) {
                if (fwd_trie == null) {
                    fwd_trie = word.case_match ? Btrie_slim_mgr.cs() : Btrie_slim_mgr.ci_u8();
                }
                fwd_trie.Add_obj(synonym.text_wo_arg1, synonym);
            }
            else if (tid == Xomw_MagicWordSynonym.Arg1__bgn) {
                if (bwd_trie == null) {
                    bwd_trie = Btrie_bwd_mgr.c__(word.case_match);
                }
                bwd_trie.Add(synonym.text_wo_arg1, synonym);
            }
            else if (tid == Xomw_MagicWordSynonym.Arg1__mid || tid == Xomw_MagicWordSynonym.Arg1__mix) {
                // ignore if mid / mix
                Gfo_usr_dlg_.Instance.Warn_many("", "", "MagicWordArray: unsupported arg_1_tid: tid=~{0}", synonym.arg1_tid);
            }
        }
    }
}
// /**
// * Add a magic word by name
// *
// * @param String name
// */
// public function add(name) {
// this->names[] = name;
// this->hash = this->baseRegex = this->regex = null;
// }
//
// /**
// * Add a number of magic words by name
// *
// * @param array names
// */
// public function addArray(names) {
// this->names = array_merge(this->names, array_values(names));
// this->hash = this->baseRegex = this->regex = null;
// }
//
// /**
// * Get a 2-d hashtable for this array
// * @return array
// */
// public function getHash() {
// if (is_null(this->hash)) {
// global wgContLang;
// this->hash = [ 0 => [], 1 => [] ];
// foreach (this->names as name) {
// magic = MagicWord::get(name);
// case = intval(magic->isCaseSensitive());
// foreach (magic->getSynonyms() as syn) {
// if (!case) {
// syn = wgContLang->lc(syn);
// }
// this->hash[case][syn] = name;
// }
// }
// }
// return this->hash;
// }
//
// /**
// * Get the super regex
// * @return array
// */
// public function getBaseRegex() {
// if (is_null(this->baseRegex)) {
// this->baseRegex = [ 0 => '', 1 => '' ];
// foreach (this->names as name) {
// magic = MagicWord::get(name);
// case = intval(magic->isCaseSensitive());
// foreach (magic->getSynonyms() as i => syn) {
// // Group name must start with a non-digit in PCRE 8.34+
// it = strtr(i, '0123456789', 'abcdefghij');
// group = "(?P<{it}_{name}>" . preg_quote(syn, '/') . ')';
// if (this->baseRegex[case] === '') {
// this->baseRegex[case] = group;
// } else {
// this->baseRegex[case] .= '|' . group;
// }
// }
// }
// }
// return this->baseRegex;
// }
//
// /**
// * Get an unanchored regex that does not match parameters
// * @return array
// */
// public function getRegex() {
// if (is_null(this->regex)) {
// super = this->getBaseRegex();
// this->regex = [ '', '' ];
// if (this->baseRegex[0] !== '') {
// this->regex[0] = "/{super[0]}/iuS";
// }
// if (this->baseRegex[1] !== '') {
// this->regex[1] = "/{super[1]}/S";
// }
// }
// return this->regex;
// }
//
// /**
// * Get a regex for matching variables with parameters
// *
// * @return String
// */
// public function getVariableRegex() {
// return str_replace("\\1", "(.*?)", this->getRegex());
// }
//
// /**
// * Get a regex anchored to the start of the String that does not match parameters
// *
// * @return array
// */
// public function getRegexStart() {
// super = this->getBaseRegex();
// newRegex = [ '', '' ];
// if (super[0] !== '') {
// newRegex[0] = "/^(?:{super[0]})/iuS";
// }
// if (super[1] !== '') {
// newRegex[1] = "/^(?:{super[1]})/S";
// }
// return newRegex;
// }
//
// /**
// * Get an anchored regex for matching variables with parameters
// *
// * @return array
// */
// public function getVariableStartToEndRegex() {
// super = this->getBaseRegex();
// newRegex = [ '', '' ];
// if (super[0] !== '') {
// newRegex[0] = str_replace("\\1", "(.*?)", "/^(?:{super[0]})/iuS");
// }
// if (super[1] !== '') {
// newRegex[1] = str_replace("\\1", "(.*?)", "/^(?:{super[1]})/S");
// }
// return newRegex;
// }
//
// /**
// * @since 1.20
// * @return array
// */
// public function getNames() {
// return this->names;
// }
//
// /**
// * Parse a match array from preg_match
// * Returns array(magic word ID, parameter value)
// * If there is no parameter value, that element will be false.
// *
// * @param array m
// *
// * @throws MWException
// * @return array
// */
// public function parseMatch(m) {
// reset(m);
// while (list(key, value) = each(m)) {
// if (key === 0 || value === '') {
// continue;
// }
// parts = explode('_', key, 2);
// if (count(parts) != 2) {
// // This shouldn't happen
// // continue;
// throw new MWException(__METHOD__ . ': bad parameter name');
// }
// list(/* synIndex */, magicName) = parts;
// paramValue = next(m);
// return [ magicName, paramValue ];
// }
// // This shouldn't happen either
// throw new MWException(__METHOD__ . ': parameter not found');
// }
/**
 * Match some text, with parameter capture.
 * On return, rv[0] holds the magic word name and rv[1] holds the captured parameter
 * (an empty array when the synonym matched but captured nothing).
 * Both elements are null if there was no match.
 *
 * The forward trie is tried first (EX: "thumb", "thumb=$1"), then the backward trie
 * (EX: "$1px"). If both produce a match, the backward match wins.
 *
 * @param rv   2-element output array: [0] = magic word name, [1] = parameter value
 * @param src  text to match
 */
public void matchVariableStartToEnd(byte[][] rv, byte[] src) {
    int src_end = src.length;
    if (src_end == 0) {
        rv[0] = rv[1] = null;
        return;
    }

    byte[] name = null;
    int val_bgn = -1, val_end = -1;

    // check fwd; EX: "thumb=$1"
    if (fwd_trie != null) {
        Object o = fwd_trie.Match_at(trv, src, 0, src_end);
        if (o != null) {
            Xomw_MagicWordSynonym syn = ((Xomw_MagicWordSynonym)o);
            // if "nil", then must be full match; EX: "thumbx" does not match "thumb"
            // A rejected "nil" candidate is only discarded; the bwd trie below is still
            // consulted so that EX: "thumbpx" can match "$1px"
            boolean partial_nil = syn.arg1_tid == Xomw_MagicWordSynonym.Arg1__nil
                && syn.text_wo_arg1.length != src_end;
            if (!partial_nil) {
                name = syn.magic_name;
                val_bgn = trv.Pos();
                val_end = src_end;
            }
        }
    }

    // check bwd; EX: "$1px"
    if (bwd_trie != null) {
        Object o = bwd_trie.Match_at(trv, src, src_end - 1, -1);
        if (o != null) {
            Xomw_MagicWordSynonym syn = ((Xomw_MagicWordSynonym)o);
            name = syn.magic_name;
            val_bgn = 0;
            val_end = src_end - syn.text_wo_arg1.length;
        }
    }

    // no match in either trie: report null for both elements, as documented
    // (without this guard, val_end - val_bgn == 0 would yield an empty parameter)
    if (name == null) {
        rv[0] = rv[1] = null;
        return;
    }

    rv[0] = name;
    rv[1] = val_end - val_bgn == 0 ? Bry_.Empty : Bry_.Mid(src, val_bgn, val_end);
}
// /**
// * Match some text, without parameter capture
// * Returns the magic word name, or false if there was no capture
// *
// * @param String text
// *
// * @return String|boolean False on failure
// */
// public function matchStartToEnd(text) {
// hash = this->getHash();
// if (isset(hash[1][text])) {
// return hash[1][text];
// }
// global wgContLang;
// lc = wgContLang->lc(text);
// if (isset(hash[0][lc])) {
// return hash[0][lc];
// }
// return false;
// }
//
// /**
// * Returns an associative array, ID => param value, for all items that match
// * Removes the matched items from the input String (passed by reference)
// *
// * @param String text
// *
// * @return array
// */
// public function matchAndRemove(&text) {
// found = [];
// regexes = this->getRegex();
// foreach (regexes as regex) {
// if (regex === '') {
// continue;
// }
// matches = [];
// res = preg_match_all(regex, text, matches, PREG_SET_ORDER);
// if (res === false) {
// LoggerFactory::getInstance('parser')->warning('preg_match_all returned false', [
// 'code' => preg_last_error(),
// 'regex' => regex,
// 'text' => text,
// ]);
// } elseif (res) {
// foreach (matches as m) {
// list(name, param) = this->parseMatch(m);
// found[name] = param;
// }
// }
// res = preg_replace(regex, '', text);
// if (res === null) {
// LoggerFactory::getInstance('parser')->warning('preg_replace returned null', [
// 'code' => preg_last_error(),
// 'regex' => regex,
// 'text' => text,
// ]);
// }
// text = res;
// }
// return found;
// }
//
// /**
// * Return the ID of the magic word at the start of text, and remove
// * the prefix from text.
// * Return false if no match found and text is not modified.
// * Does not match parameters.
// *
// * @param String text
// *
// * @return int|boolean False on failure
// */
// public function matchStartAndRemove(&text) {
// regexes = this->getRegexStart();
// foreach (regexes as regex) {
// if (regex === '') {
// continue;
// }
// if (preg_match(regex, text, m)) {
// list(id,) = this->parseMatch(m);
// if (strlen(m[0]) >= strlen(text)) {
// text = '';
// } else {
// text = substr(text, strlen(m[0]));
// }
// return id;
// }
// }
// return false;
// }
}

View File

@@ -0,0 +1,64 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*;
import org.junit.*; import gplx.core.tests.*;
/** Unit tests for Xomw_MagicWordArray.matchVariableStartToEnd. */
public class Xomw_MagicWordArray__tst {
    private final Xomw_MagicWordArray__fxt fxt = new Xomw_MagicWordArray__fxt();

    /** Synonym without "$1": only an exact match succeeds. */
    @Test
    public void Nil() {
        fxt.Init__word(Bool_.Y, "img_nil", "nil");
        fxt.Init__ary("img_nil");

        fxt.Test__matchVariableStartToEnd("nil", "img_nil", "");
        fxt.Test__matchVariableStartToEnd("nila", null, null);
    }

    /** Synonym starting with literal text, followed by "$1". */
    @Test
    public void Bgn() {
        fxt.Init__word(Bool_.Y, "img_bgn", "bgn$1");
        fxt.Init__ary("img_bgn");

        fxt.Test__matchVariableStartToEnd("bgna", "img_bgn", "a");
        fxt.Test__matchVariableStartToEnd("bgn", "img_bgn", "");
    }

    /** Synonym starting with "$1", followed by literal text. */
    @Test
    public void End() {
        fxt.Init__word(Bool_.Y, "img_end", "$1end");
        fxt.Init__ary("img_end");

        fxt.Test__matchVariableStartToEnd("aend", "img_end", "a");
        fxt.Test__matchVariableStartToEnd("end", "img_end", "");
    }

    /** Two words in one group, one matched forward and one matched backward. */
    @Test
    public void Smoke() {
        fxt.Init__word(Bool_.Y, "img_upright", "upright", "upright=$1", "upright $1");
        fxt.Init__word(Bool_.Y, "img_width", "$1px");
        fxt.Init__ary("img_upright", "img_width");

        fxt.Test__matchVariableStartToEnd("upright=123", "img_upright", "123");
        fxt.Test__matchVariableStartToEnd("123px", "img_width", "123");
    }
}
/** Test fixture: registers magic words, builds the array under test, and checks match results. */
class Xomw_MagicWordArray__fxt {
    private final Xomw_MagicWordMgr magic_word_mgr = new Xomw_MagicWordMgr();
    private Xomw_MagicWordArray magic_word_ary;

    /** Registers one magic word with its synonyms in the fixture's manager. */
    public void Init__word(boolean cs, String word, String... synonyms) {
        byte[] word_bry = Bry_.new_u8(word);
        byte[][] synonyms_bry = Bry_.Ary(synonyms);
        magic_word_mgr.Add(word_bry, cs, synonyms_bry);
    }

    /** Builds the array under test from previously registered word names. */
    public void Init__ary(String... words) {
        byte[][] names = Bry_.Ary(words);
        magic_word_ary = new Xomw_MagicWordArray(magic_word_mgr, names);
    }

    /** Matches src and compares the resulting name / value pair against the expected strings. */
    public void Test__matchVariableStartToEnd(String src, String expd_name, String expd_val) {
        byte[][] actl = new byte[2][];
        byte[] src_bry = Bry_.new_u8(src);
        magic_word_ary.matchVariableStartToEnd(actl, src_bry);

        Gftest.Eq__str(expd_name, actl[0], expd_name);
        Gftest.Eq__str(expd_val , actl[1], expd_val);
    }
}

View File

@@ -0,0 +1,28 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*;
/** Registry of magic words, keyed by name (byte-array key, case-sensitive hash). */
public class Xomw_MagicWordMgr {
    private final Hash_adp_bry hash = Hash_adp_bry.cs();

    /**
     * Creates a magic word from its name, case flag and synonym texts, and stores it under its name.
     */
    public void Add(byte[] name, boolean cs, byte[]... synonyms) {
        hash.Add(name, new Xomw_MagicWord(name, cs, synonyms));
    }

    /** Looks up a magic word by name; the result is whatever the hash returns for that key. */
    public Xomw_MagicWord Get(byte[] name) {
        Object found = hash.Get_by(name);
        return (Xomw_MagicWord)found;
    }
}

View File

@@ -0,0 +1,91 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*;
/**
 * One synonym of a magic word, classified by where the "$1" placeholder occurs in its text.
 */
public class Xomw_MagicWordSynonym {
    /** Name of the magic word this synonym belongs to. */
    public final byte[] magic_name;
    /** Case flag passed in by the owning magic word. */
    public final boolean case_match;
    /** Synonym text exactly as supplied; EX: "thumb=$1" */
    public final byte[] text;
    /** Synonym text with a leading or trailing "$1" removed; equal to text for all other kinds. */
    public final byte[] text_wo_arg1;
    /** One of the Arg1__* constants. */
    public final byte arg1_tid;

    /**
     * @param magic_name  name of the owning magic word
     * @param case_match  case flag of the owning magic word
     * @param text        raw synonym text, possibly containing "$1"
     */
    public Xomw_MagicWordSynonym(byte[] magic_name, boolean case_match, byte[] text) {
        this.magic_name = magic_name;
        this.case_match = case_match;
        this.text = text;
        this.arg1_tid = Get_arg1_tid(text);
        switch (arg1_tid) {
            case Arg1__bgn:
                // drop the leading "$1"
                text_wo_arg1 = Bry_.Mid(text, 2);
                break;
            case Arg1__end:
                // drop the trailing "$1"
                text_wo_arg1 = Bry_.Mid(text, 0, text.length - 2);
                break;
            default:
                text_wo_arg1 = text;
                break;
        }
    }

    /**
     * Scans src for "$1" occurrences and classifies the synonym.
     * A single "$1" yields bgn / end / mid depending on its position; no "$1" yields nil;
     * more than one "$1" yields mix (the one exception: "$1" directly followed by another
     * "$1" at the very end, which is also mix).
     */
    private static byte Get_arg1_tid(byte[] src) {
        int len = src.length;
        byte rv = Arg1__nil;
        int cur = 0;
        while (cur < len) {
            // "$1" matched?
            boolean is_arg1 = src[cur] == Byte_ascii.Dollar
                && cur + 1 < len
                && src[cur + 1] == Byte_ascii.Num_1;
            if (!is_arg1) {
                cur += 1;
                continue;
            }

            if (cur == 0) {
                rv = Arg1__bgn;
            }
            else if (cur == len - 2) {
                rv = rv == Arg1__nil ? Arg1__end : Arg1__mix;
            }
            else {
                // any earlier "$1" (including a leading one) combined with this interior one is "mix";
                // previously a leading "$1" followed by an interior "$1" stayed "bgn"; EX: "$1a$1b"
                rv = rv == Arg1__nil ? Arg1__mid : Arg1__mix;
            }
            cur += 2;
        }
        return rv;
    }

    public static final byte
      Arg1__nil = 0 // EX: "thumb"
    , Arg1__bgn = 1 // EX: "$1px"
    , Arg1__end = 2 // EX: "thumb=$1"
    , Arg1__mid = 3 // EX: "a$1b"
    , Arg1__mix = 4 // EX: "a$1b$1c"
    ;
}

View File

@@ -0,0 +1,22 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*;
/** Placeholder message object; both accessors currently return null. */
public class Xomw_Message {
    /** @return always null in this stub */
    public byte[] text() {
        return null;
    }

    /** @return always null in this stub */
    public byte[] escaped() {
        return null;
    }
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,27 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*;
/** Mutable holder for a link / text pair; Init overwrites both fields and returns the same instance. */
public class Xomw_linker__normalize_subpage_link {
    public byte[] link;
    public byte[] text;

    /**
     * Stores both values on this instance.
     *
     * @return this, to allow chaining
     */
    public Xomw_linker__normalize_subpage_link Init(byte[] link, byte[] text) {
        this.link = link;
        this.text = text;
        return this;
    }
}

View File

@@ -0,0 +1,43 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*;
import org.junit.*; import gplx.core.tests.*;
/**
 * Unit tests for Xomw_linker.normalizeSubpageLink.
 * Argument order of each check: page title, link, text, expected link, expected text.
 */
public class Xomw_linker__normalize_subpage_link__tst {
    private final Xomw_linker__normalize_subpage_link__fxt fxt = new Xomw_linker__normalize_subpage_link__fxt();

    @Test
    public void None() {
        fxt.Test__normalize_subpage_link("A/B/C", "Z", "", "Z", "");
    }

    @Test
    public void Hash() {
        fxt.Test__normalize_subpage_link("A/B/C", "/Y#Z", "", "A/B/C/Y#Z", "/Y#Z");
    }

    @Test
    public void Slash__basic() {
        fxt.Test__normalize_subpage_link("A/B/C", "/Z", "", "A/B/C/Z", "/Z");
    }

    @Test
    public void Slash__slash() {
        fxt.Test__normalize_subpage_link("A/B/C", "/Z/", "", "A/B/C/Z", "Z");
    }

    @Test
    public void Dot2__empty() {
        fxt.Test__normalize_subpage_link("A/B/C", "../", "", "A/B", "");
    }

    @Test
    public void Dot2__many() {
        fxt.Test__normalize_subpage_link("A/B/C", "../../Z", "z1", "A/Z", "z1");
    }

    @Test
    public void Dot2__trailing() {
        fxt.Test__normalize_subpage_link("A/B/C", "../../Z/", "", "A/Z", "Z");
    }
}
/** Test fixture: owns a linker, a test wiki for title parsing, and a reusable result holder. */
class Xomw_linker__normalize_subpage_link__fxt {
    private final Xomw_linker mgr = new Xomw_linker(new gplx.xowa.mediawiki.includes.linkers.Xomw_link_renderer(new Xomw_sanitizer()));
    private final Xowe_wiki wiki;
    private final Xomw_linker__normalize_subpage_link normalize_subpage_link = new Xomw_linker__normalize_subpage_link();

    public Xomw_linker__normalize_subpage_link__fxt() {
        Xoae_app app = Xoa_app_fxt.Make__app__edit();
        this.wiki = Xoa_app_fxt.Make__wiki__edit(app);
    }

    /** Normalizes link / text relative to the given page title and compares both results. */
    public void Test__normalize_subpage_link(String page_title_str, String link, String text, String expd_link, String expd_text) {
        Xomw_linker__normalize_subpage_link rslt = normalize_subpage_link;
        mgr.normalizeSubpageLink
            ( rslt
            , wiki.Ttl_parse(Bry_.new_u8(page_title_str))
            , Bry_.new_u8(link)
            , Bry_.new_u8(text)
            );

        String actl_link = String_.new_u8(rslt.link);
        Gftest.Eq__str(expd_link, actl_link);
        String actl_text = String_.new_u8(rslt.text);
        Gftest.Eq__str(expd_text, actl_text);
    }
}

View File

@@ -0,0 +1,39 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*;
import org.junit.*; import gplx.core.tests.*; import gplx.core.btries.*; import gplx.xowa.mediawiki.includes.parsers.*;
/**
 * Unit tests for Xomw_linker.splitTrail.
 * Argument order of each check: trail, expected inside, expected remaining trail.
 */
public class Xomw_linker__split_trail__tst {
    private final Xomw_linker__split_trail__fxt fxt = new Xomw_linker__split_trail__fxt();

    @Test
    public void Basic() {
        fxt.Test__split_trail("abc def", "abc", " def");
    }

    @Test
    public void None() {
        fxt.Test__split_trail(" abc", null, " abc");
    }
}
/** Test fixture: a linker initialized with a trie containing the single letters "a" through "f". */
class Xomw_linker__split_trail__fxt {
    private final Xomw_linker linker = new Xomw_linker(new gplx.xowa.mediawiki.includes.linkers.Xomw_link_renderer(new Xomw_sanitizer()));
    private final Btrie_slim_mgr trie = Btrie_slim_mgr.cs();

    public Xomw_linker__split_trail__fxt() {
        String[] letters = new String[] {"a", "b", "c", "d", "e", "f"};
        for (int i = 0; i < letters.length; i++) {
            String letter = letters[i];
            trie.Add_str_str(letter, letter);
        }
        linker.Init_by_wiki(new Xomw_parser_env(), trie);
    }

    /** Splits the trail and compares element 0 (inside) and element 1 (trail) of the result. */
    public void Test__split_trail(String trail_str, String expd_inside, String expd_trail) {
        byte[] trail_bry = Bry_.new_u8(trail_str);
        byte[][] actl = linker.splitTrail(trail_bry);

        String actl_inside = String_.new_u8(actl[0]);
        Gftest.Eq__str(expd_inside, actl_inside);
        String actl_trail = String_.new_u8(actl[1]);
        Gftest.Eq__str(expd_trail , actl_trail);
    }
}

View File

@@ -0,0 +1,22 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*;
/** Lookup of messages by String key; nothing in this class adds entries to the hash. */
public class Xomw_message_mgr {
    private final Hash_adp hash = Hash_adp_.New();

    /** @return the message stored under key, cast from whatever the hash returns for it */
    public Xomw_Message Get_by_str(String key) {
        Object found = hash.Get_by(key);
        return (Xomw_Message)found;
    }
}

View File

@@ -0,0 +1,921 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*;
import gplx.core.brys.*; import gplx.core.btries.*; import gplx.core.encoders.*; import gplx.core.primitives.*; import gplx.langs.htmls.entitys.*;
import gplx.xowa.parsers.htmls.*;
import gplx.langs.htmls.*; import gplx.xowa.mediawiki.includes.htmls.*; import gplx.xowa.mediawiki.includes.parsers.*; import gplx.xowa.mediawiki.includes.utls.*;
public class Xomw_sanitizer {
// attribute collector and parser used by Fix_tag_attributes
private final Mwh_doc_wkr__atr_bldr atr_bldr = new Mwh_doc_wkr__atr_bldr();
private final Mwh_atr_parser atr_parser = new Mwh_atr_parser();
// matchers used by Clean_url: character escaping, protocol / host / rest splitting, bracketed IPv6 hosts
private final Xomw_regex_escape_invalid regex_clean_url = new Xomw_regex_escape_invalid();
private final Xomw_regex_find_domain regex_find_domain = new Xomw_regex_find_domain();
private final Xomw_regex_ipv6_brack regex_ipv6_brack = new Xomw_regex_ipv6_brack();
// per-instance scratch objects; tmp_bfr is filled and then emptied via To_bry_and_clear by Merge_attributes and Clean_url
private final Bry_tmp tmp_host = new Bry_tmp();
private final Bry_bfr tmp_bfr = Bry_bfr_.New();
private final Btrie_rv trv = new Btrie_rv();
// callbacks bound to this instance; assigned in the constructor
private final Xomw_regex_url_char_cbk__normalize normalize_cbk;
private final Xomw_regex_url_char_cbk__decode decode_cbk;
// static structs shared by all instances; assigned by the constructor when regex_url_char is still null
private static Xomw_regex_url_char regex_url_char;
private static Btrie_slim_mgr invalid_idn_trie;
/**
 * Creates the per-instance callbacks and, on first use, the static structs shared by all
 * instances (regex_url_char, invalid_idn_trie, html_entities).
 *
 * regex_url_char doubles as the "already initialized" guard, so it is re-checked inside the
 * lock and assigned LAST: a thread that sees it non-null must not find the other statics
 * still unassigned.
 * NOTE(review): the static fields are not declared volatile, so this ordering narrows the
 * race but is not a full guarantee under the Java memory model; consider making the guard
 * field volatile.
 */
public Xomw_sanitizer() {
    this.normalize_cbk = new Xomw_regex_url_char_cbk__normalize(this);
    this.decode_cbk = new Xomw_regex_url_char_cbk__decode(this);

    if (regex_url_char == null) {
        synchronized (Type_adp_.ClassOf_obj(this)) {
            // re-check: another thread may have finished initialization while this one waited for the lock
            if (regex_url_char == null) {
                Xomw_regex_url_char new_regex_url_char = new Xomw_regex_url_char();

                // Characters that will be ignored in IDNs.
                // https://tools.ietf.org/html/rfc3454#section-3.1
                // $strip = "/
                //	 \\s|          // general whitespace
                //	 \xc2\xad|     // 00ad SOFT HYPHEN
                //	 \xe1\xa0\x86| // 1806 MONGOLIAN TODO SOFT HYPHEN
                //	 \xe2\x80\x8b| // 200b ZERO WIDTH SPACE
                //	 \xe2\x81\xa0| // 2060 WORD JOINER
                //	 \xef\xbb\xbf| // feff ZERO WIDTH NO-BREAK SPACE
                //	 \xcd\x8f|     // 034f COMBINING GRAPHEME JOINER
                //	 \xe1\xa0\x8b| // 180b MONGOLIAN FREE VARIATION SELECTOR ONE
                //	 \xe1\xa0\x8c| // 180c MONGOLIAN FREE VARIATION SELECTOR TWO
                //	 \xe1\xa0\x8d| // 180d MONGOLIAN FREE VARIATION SELECTOR THREE
                //	 \xe2\x80\x8c| // 200c ZERO WIDTH NON-JOINER
                //	 \xe2\x80\x8d| // 200d ZERO WIDTH JOINER
                //	 [\xef\xb8\x80-\xef\xb8\x8f] // fe00-fe0f VARIATION SELECTOR-1-16
                //	 /xuD";
                // XO.MW.REGEX:http://php.net/manual/en/reference.pcre.pattern.modifiers.php
                //   /x : ignore embedded ws
                //   /u : enabled pcre utf8
                //   /D : $ matches EOS, not NL
                invalid_idn_trie = Btrie_slim_mgr.cs()
                    .Add_many_bry(new Xomw_regex_parser().Add_ary
                    ( "\\s"
                    , "\\xc2\\xad"       // 00ad SOFT HYPHEN
                    , "\\xe1\\xa0\\x86"  // 1806 MONGOLIAN TODO SOFT HYPHEN
                    , "\\xe2\\x80\\x8b"  // 200b ZERO WIDTH SPACE
                    , "\\xe2\\x81\\xa0"  // 2060 WORD JOINER
                    , "\\xef\\xbb\\xbf"  // feff ZERO WIDTH NO-BREAK SPACE
                    , "\\xcd\\x8f"       // 034f COMBINING GRAPHEME JOINER
                    , "\\xe1\\xa0\\x8b"  // 180b MONGOLIAN FREE VARIATION SELECTOR ONE
                    , "\\xe1\\xa0\\x8c"  // 180c MONGOLIAN FREE VARIATION SELECTOR TWO
                    , "\\xe1\\xa0\\x8d"  // 180d MONGOLIAN FREE VARIATION SELECTOR THREE
                    , "\\xe2\\x80\\x8c"  // 200c ZERO WIDTH NON-JOINER
                    , "\\xe2\\x80\\x8d"  // 200d ZERO WIDTH JOINER
                    )
                    .Add_rng
                    ( "\\xef\\xb8\\x80", "\\xef\\xb8\\x8f" // fe00-fe0f VARIATION SELECTOR-1-16
                    )
                    .Rslt());

                // assert static structs
                // (the lock is already held here, so no second synchronized block is needed)
                if (html_entities == null) {
                    html_entities = Html_entities_new();
                }

                // publish the guard only after every other static struct has been assigned
                regex_url_char = new_regex_url_char;
            }
        }
    }
}
// Merges the attributes of trg into src, modifying src in place.
// An attribute in trg replaces / is added to src, except for "class": when src already has
// a "class" attribute, the two values are joined with a single space instead.
// XO.MW: XO does src += trg; MW does rv = src + trg;
public void Merge_attributes(Xomw_atr_mgr src, Xomw_atr_mgr trg) {
    int trg_len = trg.Len();
    for (int idx = 0; idx < trg_len; idx++) {
        Xomw_atr_itm trg_atr = trg.Get_at(idx);

        // only a "class" attribute that also exists in src needs merging
        byte[] cls_key = Gfh_atr_.Bry__class;
        Xomw_atr_itm src_cls = Bry_.Eq(trg_atr.Key_bry(), cls_key)
            ? src.Get_by_or_null(cls_key)
            : null;

        if (src_cls == null) {
            src.Add_or_set(trg_atr);
            continue;
        }

        // NOTE: creating a new byte[] is unavoidable here b/c both class values must be combined
        Merge_atrs_combine(tmp_bfr, src_cls.Val(), Byte_ascii.Space);
        tmp_bfr.Add_byte_space();
        Merge_atrs_combine(tmp_bfr, trg_atr.Val(), Byte_ascii.Space);
        src_cls.Val_(tmp_bfr.To_bry_and_clear());
    }
}
// Appends src to trg while normalizing runs of sep:
// a run at the start or end of src is dropped; any other run is reduced to one sep byte.
private void Merge_atrs_combine(Bry_bfr trg, byte[] src, byte sep) {
    int len = src.length;
    int pos = 0;
    while (pos < len) {
        byte cur = src[pos];
        if (cur != sep) {
            trg.Add_byte(cur);
            pos++;
            continue;
        }

        // gobble the whole run of separators; EX: "a   b"
        int run_bgn = pos;
        int run_end = Bry_find_.Find_fwd_while(src, pos, len, sep);
        pos = run_end;

        // ignore run at BOS; EX: "  a"
        if (run_bgn == 0) continue;
        // ignore run at EOS; EX: "a  "
        if (run_end == len) break;

        // interior run: emit exactly one separator
        trg.Add_byte(cur);
    }
}
// Cleans a url: decodes character references, escapes characters matched by regex_clean_url,
// and, when the url splits into protocol / host / rest, strips IDN-ignored characters from
// the host and un-escapes bracketed IPv6 hosts. Returns the url unchanged after the first
// two steps when no protocol / host split is found.
public byte[] Clean_url(byte[] url) {
    // Normalize any HTML entities in input. They will be
    // re-escaped by makeExternalLink().
    url = Decode_char_references(null, Bool_.Y, url, 0, url.length);

    // Escape any control characters introduced by the above step
    // XO.MW.REGEX: $url = preg_replace_callback('/[\][<>"\\x00-\\x20\\x7F\|]/', [ __CLASS__, 'cleanUrlCallback' ], $url);
    // '[]<>"' | '00 -> 32' | 127
    boolean escaped = regex_clean_url.Escape(tmp_bfr, url, 0, url.length);
    if (escaped) {
        url = tmp_bfr.To_bry_and_clear();
    }

    // XO.MW.REGEX: if (preg_match('!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches))
    boolean has_domain = regex_find_domain.Match(url, 0, url.length);
    if (!has_domain) {
        return url;
    }

    // Characters that will be ignored in IDNs.
    // https://tools.ietf.org/html/rfc3454#section-3.1
    // Strip them before further processing so blacklists and such work.
    Php_preg_.Replace(tmp_host.Init(url, regex_find_domain.host_bgn, regex_find_domain.host_end), tmp_bfr, invalid_idn_trie, trv, Bry_.Empty);

    // IPv6 host names are bracketed with []. Url-decode these.
    // if (substr_compare("//%5B", $host, 0, 5) === 0 &&
    //	 preg_match('!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $matches)
    // XO.MW.REGEX:
    //   !^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!
    //   "//%5B" + ("hex-dec" | [:.]) + "%5D" + numbers
    //   EX: [ABCD]:80:12
    boolean is_ipv6 = regex_ipv6_brack.Match(tmp_host.src, tmp_host.src_bgn, tmp_host.src_end);
    if (is_ipv6) {
        tmp_bfr.Add_str_a7("//[").Add_mid(tmp_host.src, regex_ipv6_brack.host_bgn, regex_ipv6_brack.host_end)
            .Add_byte(Byte_ascii.Brack_end).Add_mid(tmp_host.src, regex_ipv6_brack.segs_bgn, regex_ipv6_brack.segs_end);
        tmp_host.Set_by_bfr(tmp_bfr);
    }

    // @todo FIXME: Validate hostnames here

    // reassemble: protocol + (cleaned) host + rest
    tmp_bfr.Add_mid(url, regex_find_domain.prot_bgn, regex_find_domain.prot_end);
    tmp_host.Add_to_bfr(tmp_bfr);
    tmp_bfr.Add_mid(url, regex_find_domain.rest_bgn, regex_find_domain.rest_end);
    return tmp_bfr.To_bry_and_clear();
}
// Parses the raw attribute text of a tag and writes each attribute to bfr as ' key="val"'.
// bfr      : receives the output; nothing is written when no attributes are parsed
// tag_name : not read by this method
// atrs     : raw attribute source; parsed in full (0 to atrs.length)
// The key is HTML-escaped; the value is appended as returned by Val_as_bry (see TODO below).
public void Fix_tag_attributes(Bry_bfr bfr, byte[] tag_name, byte[] atrs) {
  atr_bldr.Atrs__clear();
  atr_parser.Parse(atr_bldr, -1, -1, atrs, 0, atrs.length);

  // PORTED: Sanitizer.php|safeEncodeTagAttributes
  //   $encAttribute = htmlspecialchars($attribute);
  //   $encValue = Sanitizer::safeEncodeAttribute($value);
  //   $attribs[] = "$encAttribute=\"$encValue\"";
  //   return count($attribs) ? ' ' . implode(' ', $attribs) : '';
  int atr_count = atr_bldr.Atrs__len();
  int atr_idx = 0;
  while (atr_idx < atr_count) {
    Mwh_atr_itm atr = atr_bldr.Atrs__get_at(atr_idx++);
    // leading space doubles as the separator between attributes
    bfr.Add_byte_space();
    bfr.Add_bry_escape_html(atr.Key_bry(), atr.Key_bgn(), atr.Key_end());
    bfr.Add_byte_eq().Add_byte_quote();
    bfr.Add(atr.Val_as_bry());	// TODO.XO:Sanitizer::encode
    bfr.Add_byte_quote();
  }
}
// Normalizes character references of the text held in pbfr: reads pbfr.Src(), writes the
// result to pbfr.Trg() and switches the two buffers.
//
// FIX: lone_bfr is now Bool_.Y. Replace_by_cbk only copies unchanged text into the supplied
// buffer when lone_bfr is true; with Bool_.N (and the return value ignored, as it is here)
// text without any "&" left the target buffer empty even though the buffers had already been
// switched, which drops the text.
public void Normalize_char_references(Xomw_parser_bfr pbfr) {
  // XO.PBFR
  Bry_bfr src_bfr = pbfr.Src();
  byte[] src = src_bfr.Bfr();
  int src_bgn = 0;
  int src_end = src_bfr.Len();
  Bry_bfr bfr = pbfr.Trg();
  pbfr.Switch();

  // Bool_.Y: bfr must receive the full text whether or not anything was replaced
  Normalize_char_references(bfr, Bool_.Y, src, src_bgn, src_end);
}
// Normalizes character references in src[src_bgn..src_end) by running regex_url_char.Replace_by_cbk
// with normalize_cbk. bfr, lone_bfr and the return value follow the Replace_by_cbk contract
// (see Xomw_regex_url_char.Replace_by_cbk): bfr may be null, in which case the result is returned as byte[].
public byte[] Normalize_char_references(Bry_bfr bfr, boolean lone_bfr, byte[] src, int src_bgn, int src_end) {
return regex_url_char.Replace_by_cbk(bfr, lone_bfr, src, src_bgn, src_end, normalize_cbk);
}
// Decodes character references in src[src_bgn..src_end) by running regex_url_char.Replace_by_cbk
// with decode_cbk. bfr, lone_bfr and the return value follow the Replace_by_cbk contract
// (see Xomw_regex_url_char.Replace_by_cbk): bfr may be null, in which case the result is returned as byte[].
public byte[] Decode_char_references(Bry_bfr bfr, boolean lone_bfr, byte[] src, int src_bgn, int src_end) {
return regex_url_char.Replace_by_cbk(bfr, lone_bfr, src, src_bgn, src_end, decode_cbk);
}
// Returns true if codepoint may be emitted as a character reference.
// Accepted: U+0009, U+000A, U+0020-U+007E, U+00A0-U+D7FF, U+E000-U+FFFD, U+10000-U+10FFFF.
//   U+000C is valid in HTML5 but not allowed in XML.
//   U+000D is valid in XML but not allowed in HTML5.
//   U+007F - U+009F are disallowed in HTML5 (control characters).
public boolean Validate_codepoint(int codepoint) {
  // tab and line-feed are the only accepted values below U+0020
  if (codepoint == 0x09 || codepoint == 0x0a) return true;
  if (codepoint < 0x20) return false;      // other C0 controls and negative values
  if (codepoint <= 0x7e) return true;      // printable ASCII
  if (codepoint < 0xa0) return false;      // DEL and C1 controls
  if (codepoint <= 0xd7ff) return true;    // up to the start of the surrogate range
  if (codepoint < 0xe000) return false;    // surrogates
  if (codepoint <= 0xfffd) return true;    // rest of the BMP; U+FFFE and U+FFFF are rejected
  return codepoint >= 0x10000 && codepoint <= 0x10ffff;	// supplementary planes
}
// Encode an attribute value for HTML output.
// bfr  : receives the encoded value
// text : raw attribute value
// XO.MW:SYNC:1.29; DATE:2017-02-03
//
// MW: $encValue = htmlspecialchars($text, ENT_QUOTES);
//     $encValue = strtr($encValue, ["\n" => '&#10;', "\r" => '&#13;', "\t" => '&#9;']);
// FIX: the strtr step was missing, so newline / carriage-return / tab were written raw.
public static void Encode_attribute(Bry_bfr bfr, byte[] text) {
  // Whitespace is normalized during attribute decoding,
  // so if we've been passed non-spaces we must encode them
  // ahead of time or they won't be preserved.
  int len = text.length;
  int run_bgn = 0;	// start of the bytes that still have to be xml-escaped
  for (int i = 0; i < len; i++) {
    String ws_ref;
    switch (text[i]) {
      case '\n':	ws_ref = "&#10;"; break;
      case '\r':	ws_ref = "&#13;"; break;
      case '\t':	ws_ref = "&#9;"; break;
      default:		continue;	// not special whitespace; stays in the current run
    }
    // flush the run before the whitespace byte, then emit the numeric reference for it
    if (i > run_bgn)
      bfr.Add_bry_escape_xml(text, run_bgn, i);
    bfr.Add_str_a7(ws_ref);
    run_bgn = i + 1;
  }
  // remainder; for text without \n \r \t this is the same single call as before: (text, 0, text.length)
  bfr.Add_bry_escape_xml(text, run_bgn, len);
}
// Entity table keyed by entity name (without "&" and ";"); values are Xomw_html_ent.
// Read through Get_by_bry by the normalize / decode callbacks further down in this file.
// NOTE(review): presumably assigned from Html_entities_new(); the assignment is not visible in this part of the file -- confirm
public static Hash_adp_bry html_entities;
// Builds the entity table. Three kinds of entries are registered (see Xomw_html_ent.Type__*):
// - Type__alias : MediaWiki-specific names that map to "&rlm;" (code 8207)
// - Type__char  : lt, gt, amp, quot; their html is the named form ("&lt;" etc.)
// - Type__entity: every other name; html is the decimal reference "&#<code>;" built by Html_entities_set
// Entries are added with Add_if_dupe_use_1st, so the Type__char entries registered first win over
// the same names in the alphabetical list below.
private static Hash_adp_bry Html_entities_new() {
// scratch buffer used to build the "&#<code>;" strings
Bry_bfr tmp = Bry_bfr_.New();
Hash_adp_bry rv = Hash_adp_bry.cs();
// aliases: both names resolve to "&rlm;"
Html_entities_set(rv, Xomw_html_ent.Type__alias, 8207, "רלמ", "&rlm;");
Html_entities_set(rv, Xomw_html_ent.Type__alias, 8207, "رلم", "&rlm;");
// core entities: registered before the list below so that they keep their named html form
Html_entities_set(rv, Xomw_html_ent.Type__char, 60, "lt", "&lt;");
Html_entities_set(rv, Xomw_html_ent.Type__char, 62, "gt", "&gt;");
Html_entities_set(rv, Xomw_html_ent.Type__char, 38, "amp", "&amp;");
Html_entities_set(rv, Xomw_html_ent.Type__char, 34, "quot", "&quot;");
// List of all named character entities defined in HTML 4.01
// https://www.w3.org/TR/html4/sgml/entities.html
// As well as &apos; which is only defined starting in XHTML1.
Html_entities_set(rv, tmp, "Aacute" , 193);
Html_entities_set(rv, tmp, "aacute" , 225);
Html_entities_set(rv, tmp, "Acirc" , 194);
Html_entities_set(rv, tmp, "acirc" , 226);
Html_entities_set(rv, tmp, "acute" , 180);
Html_entities_set(rv, tmp, "AElig" , 198);
Html_entities_set(rv, tmp, "aelig" , 230);
Html_entities_set(rv, tmp, "Agrave" , 192);
Html_entities_set(rv, tmp, "agrave" , 224);
Html_entities_set(rv, tmp, "alefsym" , 8501);
Html_entities_set(rv, tmp, "Alpha" , 913);
Html_entities_set(rv, tmp, "alpha" , 945);
Html_entities_set(rv, tmp, "amp" , 38); // XO: identical to Type__char entry; note that Type__char should be evaluated first
Html_entities_set(rv, tmp, "and" , 8743);
Html_entities_set(rv, tmp, "ang" , 8736);
Html_entities_set(rv, tmp, "apos" , 39); // New in XHTML & HTML 5; avoid in output for compatibility with IE.
Html_entities_set(rv, tmp, "Aring" , 197);
Html_entities_set(rv, tmp, "aring" , 229);
Html_entities_set(rv, tmp, "asymp" , 8776);
Html_entities_set(rv, tmp, "Atilde" , 195);
Html_entities_set(rv, tmp, "atilde" , 227);
Html_entities_set(rv, tmp, "Auml" , 196);
Html_entities_set(rv, tmp, "auml" , 228);
Html_entities_set(rv, tmp, "bdquo" , 8222);
Html_entities_set(rv, tmp, "Beta" , 914);
Html_entities_set(rv, tmp, "beta" , 946);
Html_entities_set(rv, tmp, "brvbar" , 166);
Html_entities_set(rv, tmp, "bull" , 8226);
Html_entities_set(rv, tmp, "cap" , 8745);
Html_entities_set(rv, tmp, "Ccedil" , 199);
Html_entities_set(rv, tmp, "ccedil" , 231);
Html_entities_set(rv, tmp, "cedil" , 184);
Html_entities_set(rv, tmp, "cent" , 162);
Html_entities_set(rv, tmp, "Chi" , 935);
Html_entities_set(rv, tmp, "chi" , 967);
Html_entities_set(rv, tmp, "circ" , 710);
Html_entities_set(rv, tmp, "clubs" , 9827);
Html_entities_set(rv, tmp, "cong" , 8773);
Html_entities_set(rv, tmp, "copy" , 169);
Html_entities_set(rv, tmp, "crarr" , 8629);
Html_entities_set(rv, tmp, "cup" , 8746);
Html_entities_set(rv, tmp, "curren" , 164);
Html_entities_set(rv, tmp, "dagger" , 8224);
Html_entities_set(rv, tmp, "Dagger" , 8225);
Html_entities_set(rv, tmp, "darr" , 8595);
Html_entities_set(rv, tmp, "dArr" , 8659);
Html_entities_set(rv, tmp, "deg" , 176);
Html_entities_set(rv, tmp, "Delta" , 916);
Html_entities_set(rv, tmp, "delta" , 948);
Html_entities_set(rv, tmp, "diams" , 9830);
Html_entities_set(rv, tmp, "divide" , 247);
Html_entities_set(rv, tmp, "Eacute" , 201);
Html_entities_set(rv, tmp, "eacute" , 233);
Html_entities_set(rv, tmp, "Ecirc" , 202);
Html_entities_set(rv, tmp, "ecirc" , 234);
Html_entities_set(rv, tmp, "Egrave" , 200);
Html_entities_set(rv, tmp, "egrave" , 232);
Html_entities_set(rv, tmp, "empty" , 8709);
Html_entities_set(rv, tmp, "emsp" , 8195);
Html_entities_set(rv, tmp, "ensp" , 8194);
Html_entities_set(rv, tmp, "Epsilon" , 917);
Html_entities_set(rv, tmp, "epsilon" , 949);
Html_entities_set(rv, tmp, "equiv" , 8801);
Html_entities_set(rv, tmp, "Eta" , 919);
Html_entities_set(rv, tmp, "eta" , 951);
Html_entities_set(rv, tmp, "ETH" , 208);
Html_entities_set(rv, tmp, "eth" , 240);
Html_entities_set(rv, tmp, "Euml" , 203);
Html_entities_set(rv, tmp, "euml" , 235);
Html_entities_set(rv, tmp, "euro" , 8364);
Html_entities_set(rv, tmp, "exist" , 8707);
Html_entities_set(rv, tmp, "fnof" , 402);
Html_entities_set(rv, tmp, "forall" , 8704);
Html_entities_set(rv, tmp, "frac12" , 189);
Html_entities_set(rv, tmp, "frac14" , 188);
Html_entities_set(rv, tmp, "frac34" , 190);
Html_entities_set(rv, tmp, "frasl" , 8260);
Html_entities_set(rv, tmp, "Gamma" , 915);
Html_entities_set(rv, tmp, "gamma" , 947);
Html_entities_set(rv, tmp, "ge" , 8805);
Html_entities_set(rv, tmp, "gt" , 62);
Html_entities_set(rv, tmp, "harr" , 8596);
Html_entities_set(rv, tmp, "hArr" , 8660);
Html_entities_set(rv, tmp, "hearts" , 9829);
Html_entities_set(rv, tmp, "hellip" , 8230);
Html_entities_set(rv, tmp, "Iacute" , 205);
Html_entities_set(rv, tmp, "iacute" , 237);
Html_entities_set(rv, tmp, "Icirc" , 206);
Html_entities_set(rv, tmp, "icirc" , 238);
Html_entities_set(rv, tmp, "iexcl" , 161);
Html_entities_set(rv, tmp, "Igrave" , 204);
Html_entities_set(rv, tmp, "igrave" , 236);
Html_entities_set(rv, tmp, "image" , 8465);
Html_entities_set(rv, tmp, "infin" , 8734);
Html_entities_set(rv, tmp, "int" , 8747);
Html_entities_set(rv, tmp, "Iota" , 921);
Html_entities_set(rv, tmp, "iota" , 953);
Html_entities_set(rv, tmp, "iquest" , 191);
Html_entities_set(rv, tmp, "isin" , 8712);
Html_entities_set(rv, tmp, "Iuml" , 207);
Html_entities_set(rv, tmp, "iuml" , 239);
Html_entities_set(rv, tmp, "Kappa" , 922);
Html_entities_set(rv, tmp, "kappa" , 954);
Html_entities_set(rv, tmp, "Lambda" , 923);
Html_entities_set(rv, tmp, "lambda" , 955);
Html_entities_set(rv, tmp, "lang" , 9001);
Html_entities_set(rv, tmp, "laquo" , 171);
Html_entities_set(rv, tmp, "larr" , 8592);
Html_entities_set(rv, tmp, "lArr" , 8656);
Html_entities_set(rv, tmp, "lceil" , 8968);
Html_entities_set(rv, tmp, "ldquo" , 8220);
Html_entities_set(rv, tmp, "le" , 8804);
Html_entities_set(rv, tmp, "lfloor" , 8970);
Html_entities_set(rv, tmp, "lowast" , 8727);
Html_entities_set(rv, tmp, "loz" , 9674);
Html_entities_set(rv, tmp, "lrm" , 8206);
Html_entities_set(rv, tmp, "lsaquo" , 8249);
Html_entities_set(rv, tmp, "lsquo" , 8216);
Html_entities_set(rv, tmp, "lt" , 60);
Html_entities_set(rv, tmp, "macr" , 175);
Html_entities_set(rv, tmp, "mdash" , 8212);
Html_entities_set(rv, tmp, "micro" , 181);
Html_entities_set(rv, tmp, "middot" , 183);
Html_entities_set(rv, tmp, "minus" , 8722);
Html_entities_set(rv, tmp, "Mu" , 924);
Html_entities_set(rv, tmp, "mu" , 956);
Html_entities_set(rv, tmp, "nabla" , 8711);
Html_entities_set(rv, tmp, "nbsp" , 160);
Html_entities_set(rv, tmp, "ndash" , 8211);
Html_entities_set(rv, tmp, "ne" , 8800);
Html_entities_set(rv, tmp, "ni" , 8715);
Html_entities_set(rv, tmp, "not" , 172);
Html_entities_set(rv, tmp, "notin" , 8713);
Html_entities_set(rv, tmp, "nsub" , 8836);
Html_entities_set(rv, tmp, "Ntilde" , 209);
Html_entities_set(rv, tmp, "ntilde" , 241);
Html_entities_set(rv, tmp, "Nu" , 925);
Html_entities_set(rv, tmp, "nu" , 957);
Html_entities_set(rv, tmp, "Oacute" , 211);
Html_entities_set(rv, tmp, "oacute" , 243);
Html_entities_set(rv, tmp, "Ocirc" , 212);
Html_entities_set(rv, tmp, "ocirc" , 244);
Html_entities_set(rv, tmp, "OElig" , 338);
Html_entities_set(rv, tmp, "oelig" , 339);
Html_entities_set(rv, tmp, "Ograve" , 210);
Html_entities_set(rv, tmp, "ograve" , 242);
Html_entities_set(rv, tmp, "oline" , 8254);
Html_entities_set(rv, tmp, "Omega" , 937);
Html_entities_set(rv, tmp, "omega" , 969);
Html_entities_set(rv, tmp, "Omicron" , 927);
Html_entities_set(rv, tmp, "omicron" , 959);
Html_entities_set(rv, tmp, "oplus" , 8853);
Html_entities_set(rv, tmp, "or" , 8744);
Html_entities_set(rv, tmp, "ordf" , 170);
Html_entities_set(rv, tmp, "ordm" , 186);
Html_entities_set(rv, tmp, "Oslash" , 216);
Html_entities_set(rv, tmp, "oslash" , 248);
Html_entities_set(rv, tmp, "Otilde" , 213);
Html_entities_set(rv, tmp, "otilde" , 245);
Html_entities_set(rv, tmp, "otimes" , 8855);
Html_entities_set(rv, tmp, "Ouml" , 214);
Html_entities_set(rv, tmp, "ouml" , 246);
Html_entities_set(rv, tmp, "para" , 182);
Html_entities_set(rv, tmp, "part" , 8706);
Html_entities_set(rv, tmp, "permil" , 8240);
Html_entities_set(rv, tmp, "perp" , 8869);
Html_entities_set(rv, tmp, "Phi" , 934);
Html_entities_set(rv, tmp, "phi" , 966);
Html_entities_set(rv, tmp, "Pi" , 928);
Html_entities_set(rv, tmp, "pi" , 960);
Html_entities_set(rv, tmp, "piv" , 982);
Html_entities_set(rv, tmp, "plusmn" , 177);
Html_entities_set(rv, tmp, "pound" , 163);
Html_entities_set(rv, tmp, "prime" , 8242);
Html_entities_set(rv, tmp, "Prime" , 8243);
Html_entities_set(rv, tmp, "prod" , 8719);
Html_entities_set(rv, tmp, "prop" , 8733);
Html_entities_set(rv, tmp, "Psi" , 936);
Html_entities_set(rv, tmp, "psi" , 968);
Html_entities_set(rv, tmp, "quot" , 34);
Html_entities_set(rv, tmp, "radic" , 8730);
Html_entities_set(rv, tmp, "rang" , 9002);
Html_entities_set(rv, tmp, "raquo" , 187);
Html_entities_set(rv, tmp, "rarr" , 8594);
Html_entities_set(rv, tmp, "rArr" , 8658);
Html_entities_set(rv, tmp, "rceil" , 8969);
Html_entities_set(rv, tmp, "rdquo" , 8221);
Html_entities_set(rv, tmp, "real" , 8476);
Html_entities_set(rv, tmp, "reg" , 174);
Html_entities_set(rv, tmp, "rfloor" , 8971);
Html_entities_set(rv, tmp, "Rho" , 929);
Html_entities_set(rv, tmp, "rho" , 961);
Html_entities_set(rv, tmp, "rlm" , 8207);
Html_entities_set(rv, tmp, "rsaquo" , 8250);
Html_entities_set(rv, tmp, "rsquo" , 8217);
Html_entities_set(rv, tmp, "sbquo" , 8218);
Html_entities_set(rv, tmp, "Scaron" , 352);
Html_entities_set(rv, tmp, "scaron" , 353);
Html_entities_set(rv, tmp, "sdot" , 8901);
Html_entities_set(rv, tmp, "sect" , 167);
Html_entities_set(rv, tmp, "shy" , 173);
Html_entities_set(rv, tmp, "Sigma" , 931);
Html_entities_set(rv, tmp, "sigma" , 963);
Html_entities_set(rv, tmp, "sigmaf" , 962);
Html_entities_set(rv, tmp, "sim" , 8764);
Html_entities_set(rv, tmp, "spades" , 9824);
Html_entities_set(rv, tmp, "sub" , 8834);
Html_entities_set(rv, tmp, "sube" , 8838);
Html_entities_set(rv, tmp, "sum" , 8721);
Html_entities_set(rv, tmp, "sup" , 8835);
Html_entities_set(rv, tmp, "sup1" , 185);
Html_entities_set(rv, tmp, "sup2" , 178);
Html_entities_set(rv, tmp, "sup3" , 179);
Html_entities_set(rv, tmp, "supe" , 8839);
Html_entities_set(rv, tmp, "szlig" , 223);
Html_entities_set(rv, tmp, "Tau" , 932);
Html_entities_set(rv, tmp, "tau" , 964);
Html_entities_set(rv, tmp, "there4" , 8756);
Html_entities_set(rv, tmp, "Theta" , 920);
Html_entities_set(rv, tmp, "theta" , 952);
Html_entities_set(rv, tmp, "thetasym" , 977);
Html_entities_set(rv, tmp, "thinsp" , 8201);
Html_entities_set(rv, tmp, "THORN" , 222);
Html_entities_set(rv, tmp, "thorn" , 254);
Html_entities_set(rv, tmp, "tilde" , 732);
Html_entities_set(rv, tmp, "times" , 215);
Html_entities_set(rv, tmp, "trade" , 8482);
Html_entities_set(rv, tmp, "Uacute" , 218);
Html_entities_set(rv, tmp, "uacute" , 250);
Html_entities_set(rv, tmp, "uarr" , 8593);
Html_entities_set(rv, tmp, "uArr" , 8657);
Html_entities_set(rv, tmp, "Ucirc" , 219);
Html_entities_set(rv, tmp, "ucirc" , 251);
Html_entities_set(rv, tmp, "Ugrave" , 217);
Html_entities_set(rv, tmp, "ugrave" , 249);
Html_entities_set(rv, tmp, "uml" , 168);
Html_entities_set(rv, tmp, "upsih" , 978);
Html_entities_set(rv, tmp, "Upsilon" , 933);
Html_entities_set(rv, tmp, "upsilon" , 965);
Html_entities_set(rv, tmp, "Uuml" , 220);
Html_entities_set(rv, tmp, "uuml" , 252);
Html_entities_set(rv, tmp, "weierp" , 8472);
Html_entities_set(rv, tmp, "Xi" , 926);
Html_entities_set(rv, tmp, "xi" , 958);
Html_entities_set(rv, tmp, "Yacute" , 221);
Html_entities_set(rv, tmp, "yacute" , 253);
Html_entities_set(rv, tmp, "yen" , 165);
Html_entities_set(rv, tmp, "Yuml" , 376);
Html_entities_set(rv, tmp, "yuml" , 255);
Html_entities_set(rv, tmp, "Zeta" , 918);
Html_entities_set(rv, tmp, "zeta" , 950);
Html_entities_set(rv, tmp, "zwj" , 8205);
Html_entities_set(rv, tmp, "zwnj" , 8204);
return rv;
}
// Registers a Type__entity entry whose html is the decimal reference "&#<code>;".
// tmp is only used as a scratch buffer to build that string and is cleared again by To_bry_and_clear.
private static void Html_entities_set(Hash_adp_bry rv, Bry_bfr tmp, String name_str, int code) {
byte[] html_bry = tmp.Add_str_a7("&#").Add_int_variable(code).Add_byte_semic().To_bry_and_clear();
Html_entities_set(rv, Xomw_html_ent.Type__entity, code, name_str, html_bry);
}
// Convenience overload: converts html_str with Bry_.new_u8 and delegates to the byte[] overload.
private static void Html_entities_set(Hash_adp_bry rv, byte type, int code, String name_str, String html_str) {Html_entities_set(rv, type, code, name_str, Bry_.new_u8(html_str));}
// Adds one Xomw_html_ent to rv, keyed by the bytes of name_str (converted with Bry_.new_u8).
// A name that is already present keeps its first entry.
private static void Html_entities_set(Hash_adp_bry rv, byte type, int code, String name_str, byte[] html_bry) {
byte[] name_bry = Bry_.new_u8(name_str);
rv.Add_if_dupe_use_1st(name_bry, new Xomw_html_ent(type, code, name_bry, html_bry)); // Add_dupe needed b/c "lt" and co. are added early; ignore subsequent call
}
}
// Immutable record for one entry of the HTML entity table.
class Xomw_html_ent {
  // kinds of entries; see "type"
  public static final byte Type__null = 0, Type__alias = 1, Type__char = 2, Type__entity = 3;

  public final byte type;	// one of the Type__* constants
  public final int code;	// numeric code of the entity; EX: 60 for "lt"
  public final byte[] name;	// entity name without "&" and ";"; EX: "lt"
  public final byte[] html;	// text stored for the entity; EX: "&lt;" or "&#945;"

  public Xomw_html_ent(byte type, int code, byte[] name, byte[] html) {
    this.html = html;
    this.name = name;
    this.code = code;
    this.type = type;
  }
}
// Hand-written matcher that splits a URL into protocol, host and rest.
// After Match returns true the six fields hold [bgn, end) offsets into src;
// after Match returns false the fields are not meaningful.
class Xomw_regex_find_domain {
  public int prot_bgn;	// protocol, including the trailing ":"; EX: "https:"
  public int prot_end;
  public int host_bgn;	// host, including the leading "//"; EX: "//a.org"
  public int host_end;
  public int rest_bgn;	// everything after the host; may be empty; EX: "/bcd"
  public int rest_end;
  // Returns true if src[src_bgn..src_end) is "protocol:" + "//host" + rest with a non-empty protocol and host.
  // NOTE: unlike the MW regex below, the host is mandatory here; urls without "//" return false.
  public boolean Match(byte[] src, int src_bgn, int src_end) {
    // Validate hostname portion
    // XO.MW.REGEX: if (preg_match('!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches)) {
    //   ([^:]+:)(//[^/]+)?(.*)
    //   "protocol" + "host" + "rest"
    //   "protocol" -> ([^:]+:)   EX: "https:" anything not-colon up to colon
    //   "host"     -> (//[^/]+)? EX: "//abc/" anything not-slash up to slash
    //   "rest"     -> (.*)       EX: rest"
    //   /i : case-insensitive
    //   /D : $ matches EOS, not NL

    // find prot; EX: "https:"
    prot_bgn = src_bgn;
    prot_end = Bry_find_.Move_fwd(src, Byte_ascii.Colon, prot_bgn, src_end);
    // exit if ":" not found
    if (prot_end == Bry_find_.Not_found) return false;
    // exit if nothing precedes ":"; "[^:]+" needs at least one byte (prot_end is positioned after the colon)
    if (prot_end - prot_bgn < 2) return false;

    // find host: EX: "//a.org"
    host_bgn = prot_end;
    int double_slash_end = host_bgn + 2;
    // exit if eos
    if (double_slash_end >= src_end) return false;
    // exit if not "//"
    if (	src[host_bgn    ] != Byte_ascii.Slash
      ||	src[host_bgn + 1] != Byte_ascii.Slash
      ) return false;
    host_end = Bry_find_.Find_fwd(src, Byte_ascii.Slash, double_slash_end, src_end);
    // no "/" after the host: the host runs to the end and rest is empty; EX: "https://a.org"
    if (host_end == Bry_find_.Not_found)
      host_end = src_end;
    // exit if only "//"
    if (host_end - host_bgn == 2) return false;

    // set rest
    rest_bgn = host_end;
    rest_end = src_end;
    return true;
  }
}
// Hand-written equivalent of the MW character class below: url-encodes every matching byte.
class Xomw_regex_escape_invalid {
  // [\][<>"\\x00-\\x20\\x7F\|]
  // Scans src[src_bgn..src_end). Returns false (and leaves bfr untouched) when no byte matches;
  // otherwise writes the full escaped text to bfr and returns true.
  public boolean Escape(Bry_bfr bfr, byte[] src, int src_bgn, int src_end) {
    boolean changed = false;
    int pending_bgn = src_bgn;	// start of the bytes not yet copied to bfr
    for (int pos = src_bgn; pos != src_end; pos++) {
      if (!Needs_escape(src[pos])) continue;
      // copy the clean bytes before the match, then the encoded byte
      bfr.Add_mid(src, pending_bgn, pos);
      gplx.langs.htmls.encoders.Gfo_url_encoder_.Php_urlencode.Encode(bfr, src, pos, pos + 1);
      changed = true;
      pending_bgn = pos + 1;
    }
    // eos: copy the tail, but only if bfr already holds the earlier part
    if (changed)
      bfr.Add_mid(src, pending_bgn, src_end);
    return changed;
  }
  // true for 0 - 32 and for: [ ] < > " | DEL
  private static boolean Needs_escape(byte b) {
    if (b >= 0 && b <= 32) return true;
    switch (b) {
      case Byte_ascii.Brack_bgn:
      case Byte_ascii.Brack_end:
      case Byte_ascii.Angle_bgn:
      case Byte_ascii.Angle_end:
      case Byte_ascii.Quote:
      case Byte_ascii.Pipe:
      case Byte_ascii.Delete:
        return true;
      default:
        return false;
    }
  }
}
// Hand-written matcher for a host of the form "//%5B" + address + "%5D" + optional ":digits" segments.
// After Match returns true:
//   [host_bgn, host_end) is the address between "%5B" and "%5D"
//   [segs_bgn, segs_end) is everything after "%5D" (may be empty)
// The fields are assigned while scanning, so they hold partial values after a failed Match.
class Xomw_regex_ipv6_brack {
public int host_bgn;
public int host_end;
public int segs_bgn;
public int segs_end;
private final byte[]
Bry__host_bgn = Bry_.new_a7("//%5B")
, Bry__host_end = Bry_.new_a7("%5D")
;
// Returns true if src[src_bgn..src_end) matches in full; see class comment for the fields that are set.
public boolean Match(byte[] src, int src_bgn, int src_end) {
// preg_match('!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $matches)
// XO.MW.REGEX:
// !^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!
// "//%5B" + ("hex-dec" | [:.]) + "%5D" + numbers
// EX: [ABCD]:80:12
// NOTE: the loop at the end accepts any number of ":digits" segments, whereas "(:\d+)?" in the regex above allows at most one
host_bgn = src_bgn + Bry__host_bgn.length;
// exit if no match for "//%5B"
if (!Bry_.Match(src, src_bgn, host_bgn, Bry__host_bgn)) return false;
// skip all [0-9A-Fa-f:.]
host_end = host_bgn;
while (true) {
// exit if eos
if (host_end == src_end) return false;
boolean done = false;
byte b = src[host_end];
switch (b) {
case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4:
case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9:
case Byte_ascii.Ltr_A: case Byte_ascii.Ltr_B: case Byte_ascii.Ltr_C: case Byte_ascii.Ltr_D: case Byte_ascii.Ltr_E: case Byte_ascii.Ltr_F:
case Byte_ascii.Ltr_a: case Byte_ascii.Ltr_b: case Byte_ascii.Ltr_c: case Byte_ascii.Ltr_d: case Byte_ascii.Ltr_e: case Byte_ascii.Ltr_f:
case Byte_ascii.Colon:
case Byte_ascii.Dot:
host_end++;
break;
case Byte_ascii.Percent:
// matches "%5D"
// segs_bgn is set before the check, so it is also changed when the check fails
segs_bgn = host_end + Bry__host_end.length;
if ( Bry_.Match(src, host_end, segs_bgn, Bry__host_end)
&& host_end - host_bgn > 0) // host can't be 0-len; EX: "//%5B%5D"
done = true;
// exit if no match
else {
return false;
}
break;
// exit if no match
default: {
return false;
}
}
if (done) break;
}
// skip all (:\d+)
segs_end = segs_bgn;
while (true) {
// stop if eos
// this is the only place where Match returns true
if (segs_end == src_end) return true;
// check if ":"
if (src[segs_end] == Byte_ascii.Colon) {
int num_bgn = segs_end + 1;
int num_end = Bry_find_.Find_fwd_while_num(src, num_bgn, src_end);
// exit if no nums found; EX:"[ABC]:80:"
if (num_end == num_bgn) {
return false;
}
segs_end = num_end;
}
// exit if seg doesn't start with ":"
else {
return false;
}
}
}
}
// Callbacks invoked by Xomw_regex_url_char.Replace_by_cbk for each "&" found in the source.
// Each callback writes its replacement to bfr. "name" is the text between the prefix
// ("&", "&#" or "&#x") and the closing ";".
interface Xomw_regex_url_char_cbk {
// named entity; EX: "&alpha;" -> name = "alpha". Replace_by_cbk ignores the return value.
boolean When_ent(Bry_bfr bfr, byte[] name);
// decimal reference; EX: "&#09;" -> name = "09". Returning false makes Replace_by_cbk call When_amp and resume after the "&".
boolean When_dec(Bry_bfr bfr, byte[] name);
// hexadecimal reference; EX: "&#xFF;" -> name = "FF". Returning false makes Replace_by_cbk call When_amp and resume after the "&".
boolean When_hex(Bry_bfr bfr, byte[] name);
// "&" that does not start a reference; Replace_by_cbk ignores the return value
boolean When_amp(Bry_bfr bfr);
}
// Callbacks for Normalize_char_references: rewrites each reference into its normalized form.
class Xomw_regex_url_char_cbk__normalize implements Xomw_regex_url_char_cbk {
  private final Xomw_sanitizer sanitizer;
  public Xomw_regex_url_char_cbk__normalize(Xomw_sanitizer sanitizer) {
    this.sanitizer = sanitizer;
  }
  // XO.MW:normalizeEntity
  // If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
  // return the equivalent numeric entity reference (except for the core &lt;
  // &gt; &amp; &quot;). If the entity is a MediaWiki-specific alias, returns
  // the HTML equivalent. Otherwise, returns HTML-escaped text of
  // pseudo-entity source (eg &amp;foo;)
  // Returns true for a known entity, false for an unknown one (the escaped text is written either way).
  public boolean When_ent(Bry_bfr bfr, byte[] name) {
    Xomw_html_ent known = (Xomw_html_ent)Xomw_sanitizer.html_entities.Get_by_bry(name);
    if (known != null) {
      bfr.Add(known.html);
      return true;
    }
    // unknown name: escape the ampersand and keep the rest
    bfr.Add_str_a7("&amp;").Add(name).Add_byte_semic();
    return false;
  }
  // XO.MW:decCharReference
  // Writes "&#<decimal>;" for a valid codepoint; writes nothing and returns false otherwise.
  public boolean When_dec(Bry_bfr bfr, byte[] name) {
    int codepoint = Bry_.To_int_or(name, -1);
    if (!sanitizer.Validate_codepoint(codepoint))
      return false;
    bfr.Add_str_a7("&#").Add_int_variable(codepoint).Add_byte_semic();
    return true;
  }
  // XO.MW:hexCharReference
  // Writes "&#x<hex>;" for a valid codepoint; writes nothing and returns false otherwise.
  public boolean When_hex(Bry_bfr bfr, byte[] name) {
    int codepoint = Hex_utl_.Parse_or(name, -1);
    if (!sanitizer.Validate_codepoint(codepoint))
      return false;
    bfr.Add_str_a7("&#x");
    Hex_utl_.Write_bfr(bfr, Bool_.Y, codepoint);	// sprintf('&#x%x;', $point)
    bfr.Add_byte_semic();
    return true;
  }
  // lone ampersand: transform "&" to "&amp;"
  public boolean When_amp(Bry_bfr bfr) {
    bfr.Add(Gfh_entity_.Amp_bry);
    return true;
  }
}
// Callbacks for Decode_char_references: replaces each reference with the character it stands for.
// Every callback returns true, so Replace_by_cbk never falls back to When_amp after calling one.
class Xomw_regex_url_char_cbk__decode implements Xomw_regex_url_char_cbk {
  private final Xomw_sanitizer sanitizer;
  public Xomw_regex_url_char_cbk__decode(Xomw_sanitizer sanitizer) {
    this.sanitizer = sanitizer;
  }
  // XO.MW:decodeEntity
  // If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
  // return the UTF-8 encoding of that character. Otherwise, returns
  // pseudo-entity source (eg "&foo;")
  public boolean When_ent(Bry_bfr bfr, byte[] name) {
    Object o = Xomw_sanitizer.html_entities.Get_by_bry(name);
    if (o == null) {
      bfr.Add_byte(Byte_ascii.Amp).Add(name).Add_byte_semic();
    }
    else {
      Xomw_html_ent entity = (Xomw_html_ent)o;
      bfr.Add(gplx.core.intls.Utf16_.Encode_int_to_bry(entity.code));
    }
    return true;
  }
  // decimal reference; name holds the digits
  public boolean When_dec(Bry_bfr bfr, byte[] name) {
    // To_int_or(.., -1), as in the normalize callback: digits that cannot be parsed become -1,
    // which fails validation and is written as the replacement character
    return Decode_char(bfr, Bry_.To_int_or(name, -1));
  }
  // hexadecimal reference; name holds the hex digits
  public boolean When_hex(Bry_bfr bfr, byte[] name) {
    return Decode_char(bfr, gplx.core.encoders.Hex_utl_.Parse_or(name, 0, name.length, -1));
  }
  // lone ampersand: kept as is
  public boolean When_amp(Bry_bfr bfr) {
    bfr.Add_byte(Byte_ascii.Amp);
    return true;
  }
  // XO.MW:decodeChar
  // Return UTF-8 String for a codepoint if that is a valid
  // character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
  private boolean Decode_char(Bry_bfr bfr, int point) {
    if (sanitizer.Validate_codepoint(point)) {
      bfr.Add(gplx.core.intls.Utf16_.Encode_int_to_bry(point));
    }
    else {
      bfr.Add(Utf8_replacement_char);
    }
    return true;
  }
  // U+FFFD encoded as UTF-8 is the three bytes EF BF BD.
  // FIX: was (255, 253), i.e. the two bytes FF FD, which is not valid UTF-8.
  private static final byte[] Utf8_replacement_char = Bry_.New_by_ints(239, 191, 189);
}
// Hand-written scanner for the character-reference regex used by
// Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences.
class Xomw_regex_url_char {
  // Regular expression to match various types of character references in
  // Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
  //	static final CHAR_REFS_REGEX =
  //		'/&([A-Za-z0-9\x80-\xff]+);
  //		|&\#([0-9]+);
  //		|&\#[xX]([0-9A-Fa-f]+);
  //		|(&)/x';

  // Lookup tables indexed by unsigned byte value: true if the byte may appear in the name.
  // Built once at class initialization (the earlier lazy init checked only one of the three
  // non-volatile fields outside the lock, so another thread could observe a partial set).
  private static final boolean[] Normalize__dec = Bool_ary_bldr.New_u8()
    .Set_rng(Byte_ascii.Num_0, Byte_ascii.Num_9)
    .To_ary();
  // FIX: hex digits are [0-9A-Fa-f]; the table used to accept all of A-Z / a-z
  private static final boolean[] Normalize__hex = Bool_ary_bldr.New_u8()
    .Set_rng(Byte_ascii.Num_0, Byte_ascii.Num_9)
    .Set_rng(Byte_ascii.Ltr_A, Byte_ascii.Ltr_F)
    .Set_rng(Byte_ascii.Ltr_a, Byte_ascii.Ltr_f)
    .To_ary();
  private static final boolean[] Normalize__ent = Bool_ary_bldr.New_u8()
    .Set_rng(Byte_ascii.Num_0, Byte_ascii.Num_9)
    .Set_rng(Byte_ascii.Ltr_A, Byte_ascii.Ltr_Z)
    .Set_rng(Byte_ascii.Ltr_a, Byte_ascii.Ltr_z)
    .Set_rng(128, 255)
    .To_ary();

  public Xomw_regex_url_char() {}

  // Scans src[src_bgn..src_end) for "&" and lets cbk write a replacement for each one.
  // bfr      : output buffer; may be null, in which case a buffer is created on demand
  // lone_bfr : only used when bfr is supplied and nothing was replaced: if true, the unchanged text is copied to bfr
  // Returns
  //   bfr == null : the resulting bytes (src itself, or a copy of the range, when nothing was replaced)
  //   bfr != null : Bry_.Empty when something was written to bfr; null when no "&" was found
  public byte[] Replace_by_cbk(Bry_bfr bfr, boolean lone_bfr, byte[] src, int src_bgn, int src_end, Xomw_regex_url_char_cbk cbk) {
    // XO.BRY_BFR
    boolean dirty = false;
    int cur = src_bgn;
    boolean called_by_bry = bfr == null;

    while (true) {
      // search for "&"
      // FIX: bounded by src_end; src may be a larger backing array with unrelated bytes after src_end
      int find_bgn = Bry_find_.Find_fwd(src, Byte_ascii.Amp, cur, src_end);
      if (find_bgn == Bry_find_.Not_found) {	// "&" not found; exit
        if (dirty)
          bfr.Add_mid(src, cur, src_end);
        break;
      }
      int ent_bgn = find_bgn + 1;	// +1 to skip &

      // get regex; (a) dec (&#09;); (b) hex (&#xFF;); (c) entity (&alpha;);
      boolean[] regex = null;
      // check for #;
      if (ent_bgn < src_end && src[ent_bgn] == Byte_ascii.Hash) {
        ent_bgn++;
        if (ent_bgn < src_end) {
          byte nxt = src[ent_bgn];
          // check for x
          if (nxt == Byte_ascii.Ltr_X || nxt == Byte_ascii.Ltr_x) {
            ent_bgn++;
            regex = Normalize__hex;
          }
        }
        if (regex == null)
          regex = Normalize__dec;
      }
      else {
        regex = Normalize__ent;
      }

      // keep looping until invalid regex
      int ent_end = ent_bgn;
      int b = Byte_ascii.Null;
      for (int i = ent_bgn; i < src_end; i++) {
        b = src[i] & 0xFF;	// PATCH.JAVA:need to convert to unsigned byte
        if (regex[b])
          ent_end++;
        else
          break;
      }

      // mark dirty; can optimize later by checking if "&lt;" already exists
      dirty = true;
      if (bfr == null) bfr = Bry_bfr_.New();
      bfr.Add_mid(src, cur, find_bgn);	// add everything before &

      // invalid <- regex ended, but not at semic
      // FIX: also invalid when the name is empty (EX: "&#;"); every pattern in the regex requires "+"
      if (b != Byte_ascii.Semic || ent_end == ent_bgn) {
        cbk.When_amp(bfr);
        cur = find_bgn + 1;	// position after "&"
        continue;
      }

      // do normalization
      byte[] name = Bry_.Mid(src, ent_bgn, ent_end);
      boolean ret = false;
      if (regex == Normalize__ent) {
        // return value is ignored: When_ent has already written the text for an unknown name
        cbk.When_ent(bfr, name);
        ret = true;
      }
      else if (regex == Normalize__dec) {
        ret = cbk.When_dec(bfr, name);
      }
      else if (regex == Normalize__hex) {
        ret = cbk.When_hex(bfr, name);
      }
      if (!ret) {
        cbk.When_amp(bfr);
        cur = find_bgn + 1;	// position after "&"
        continue;
      }

      cur = ent_end + 1;	// +1 to position after ";"
    }

    // XO.BRY_BFR
    if (dirty) {
      if (called_by_bry)
        return bfr.To_bry_and_clear();
      else
        return Bry_.Empty;
    }
    else {
      if (called_by_bry) {
        if (src_bgn == 0 && src_end == src.length)
          return src;
        else
          return Bry_.Mid(src, src_bgn, src_end);
      }
      else {
        if (lone_bfr)
          bfr.Add_mid(src, src_bgn, src_end);
        return null;
      }
    }
  }
}

View File

@@ -0,0 +1,168 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*;
import org.junit.*; import gplx.core.tests.*; import gplx.core.btries.*; import gplx.xowa.mediawiki.includes.htmls.*;
// Tests for Xomw_sanitizer and its regex helper classes.
// Invisible characters in literals (DEL, right-to-left mark, soft hyphen) are written as \\u escapes
// so that the expected values can be read and survive tools that drop such characters.
public class Xomw_sanitizer__tst {
  private final Xomw_sanitizer__fxt fxt = new Xomw_sanitizer__fxt();
  @Test  public void Normalize__text()              {fxt.Test__normalize_char_references("abc"              , "abc");}
  // NOTE: U+0008 is not a valid codepoint, so only the "&" is escaped
  @Test  public void Normalize__dec()               {fxt.Test__normalize_char_references("&#08;"            , "&amp;#08;");}
  // NOTE: U+0009 is a valid codepoint; the leading zero is dropped
  @Test  public void Normalize__dec__invalid()      {fxt.Test__normalize_char_references("&#09;"            , "&#9;");}
  @Test  public void Normalize__hex()               {fxt.Test__normalize_char_references("&#xFF;"           , "&#xff;");}
  @Test  public void Normalize__entity()            {fxt.Test__normalize_char_references("&alpha;"          , "&#945;");}
  @Test  public void Normalize__entity__lt()        {fxt.Test__normalize_char_references("&lt;"             , "&lt;");}
  @Test  public void Normalize__entity__alias()     {fxt.Test__normalize_char_references("&רלמ;"            , "&rlm;");}
  @Test  public void Normalize__amp()               {fxt.Test__normalize_char_references("a&b"              , "a&amp;b");}
  @Test  public void Normalize__invalid()           {fxt.Test__normalize_char_references("&(invalid);"      , "&amp;(invalid);");}
  @Test  public void Normalize__many() {
    fxt.Test__normalize_char_references
    ( "a &#09; b &alpha; c &#xFF; d &(invalid); e"
    , "a &#9; b &#945; c &#xff; d &amp;(invalid); e"
    );
  }
  @Test  public void Regex__domain() {
    Xomw_regex_find_domain regex_domain = new Xomw_regex_find_domain();
    // normal
    fxt.Test__regex_domain_y(regex_domain, "https://a.org/bcd", "https:", "//a.org", "/bcd");
    // trailing slash
    fxt.Test__regex_domain_y(regex_domain, "https://a.org/", "https:", "//a.org", "/");
    // domain only
    fxt.Test__regex_domain_y(regex_domain, "https://a.org", "https:", "//a.org", "");
    // colon not found
    fxt.Test__regex_domain_n(regex_domain, "https//a.org/bcd");
    // host_bgn.eos
    fxt.Test__regex_domain_n(regex_domain, "https:");
    // host_bgn.//
    fxt.Test__regex_domain_n(regex_domain, "https:a//");
    // host_bgn.///
    fxt.Test__regex_domain_n(regex_domain, "https:///a.org/b");
  }
  @Test  public void Regex__clean_url() {
    Xomw_regex_escape_invalid regex = new Xomw_regex_escape_invalid();
    // noop
    fxt.Test__regex_escape_invalid(regex, "https://a.org/bcd", Bool_.N, "");
    // symbols; \u007F is DEL, which produces the trailing "%7F"
    fxt.Test__regex_escape_invalid(regex, "[]<>\"|\u007F", Bool_.Y, "%5B%5D%3C%3E%22%7C%7F");
    // range: 00 - 32
    fxt.Test__regex_escape_invalid(regex, "\t\n ", Bool_.Y, "%09%0A+");
  }
  @Test  public void Regex__ipv6_brack() {
    Xomw_regex_ipv6_brack regex = new Xomw_regex_ipv6_brack();
    // basic
    fxt.Test__regex_ipv6_brack(regex, Bool_.Y, "//%5B0a.1b:12%5D:123");
    // port: none
    fxt.Test__regex_ipv6_brack(regex, Bool_.Y, "//%5Ba%5D");
    // port: multiple
    fxt.Test__regex_ipv6_brack(regex, Bool_.Y, "//%5Ba%5D:1:2:3");
    // "//%5B" missing
    fxt.Test__regex_ipv6_brack(regex, Bool_.N, "abc");
    // ipv6: invalid
    fxt.Test__regex_ipv6_brack(regex, Bool_.N, "//%5Ba!%5D:1");
    // ipv6: 0-len
    fxt.Test__regex_ipv6_brack(regex, Bool_.N, "//%5B%5D:1");
    // port: invalid
    fxt.Test__regex_ipv6_brack(regex, Bool_.N, "//%5Ba%5D:a");
    // port: 0-len
    fxt.Test__regex_ipv6_brack(regex, Bool_.N, "//%5Ba%5D:");
  }
  @Test  public void Decode() {
    // dec
    fxt.Test__decode_char_references("&#33;"                   , "!");
    // hex
    fxt.Test__decode_char_references("&#x23;"                  , "#");
    // entity
    fxt.Test__decode_char_references("&alpha;"                 , "α");
    // entity:lt
    fxt.Test__decode_char_references("&lt;"                    , "<");
    // entity:rlm; the alias has code 8207, i.e. U+200F RIGHT-TO-LEFT MARK
    fxt.Test__decode_char_references("&רלמ;"                   , "\u200F");
    // entity:invalid
    fxt.Test__decode_char_references("&invalid;"               , "&invalid;");
    // amp
    fxt.Test__decode_char_references("a&b"                     , "a&b");
  }
  @Test  public void Clean_url() {
    // entity
    fxt.Test__clean_url("http://a.org/b&amp;c"                 , "http://a.org/b&c");
    // entity: escape
    fxt.Test__clean_url("http://a.org/b&quot;c"                , "http://a.org/b%22c");
    // domain=n; make sure &quot; is changed, but not soft-hyphen (\u00AD)
    fxt.Test__clean_url("a&quot;\u00ADz"                       , "a%22\u00ADz");
    // host: invalid idn; the character is removed from the host only, not from the path
    fxt.Test__clean_url("http://a᠆b.org/c᠆d"                   , "http://ab.org/c᠆d");
    // ipv6_brack
    fxt.Test__clean_url("http://[0a.1b:12]:123/cd"             , "http://[0a.1b:12]:123/cd");
  }
  @Test  public void Merge_atrs() {
    Xomw_atr_mgr src_atrs = new Xomw_atr_mgr();
    Xomw_atr_mgr trg_atrs = new Xomw_atr_mgr();
    Xomw_atr_mgr expd_atrs = new Xomw_atr_mgr();
    String cls = "class";
    // basic: k1 + k2
    fxt.Test__merge_attributes(src_atrs.Clear().Add_many("k1", "v1"), trg_atrs.Clear().Add_many("k2", "v2"), expd_atrs.Clear().Add_many("k1", "v1", "k2", "v2"));
    // overwrite: k1 + k1
    fxt.Test__merge_attributes(src_atrs.Clear().Add_many("k1", "v1"), trg_atrs.Clear().Add_many("k1", "v1a"), expd_atrs.Clear().Add_many("k1", "v1a"));
    // cls: many
    fxt.Test__merge_attributes(src_atrs.Clear().Add_many(cls, "v1 v2"), trg_atrs.Clear().Add_many(cls, "v3 v4"), expd_atrs.Clear().Add_many(cls, "v1 v2 v3 v4"));
    // cls: src.empty
    fxt.Test__merge_attributes(src_atrs.Clear(), trg_atrs.Clear().Add_many(cls, "v1"), expd_atrs.Clear().Add_many(cls, "v1"));
    // cls: ws
    fxt.Test__merge_attributes(src_atrs.Clear().Add_many(cls, " v1 v2 "), trg_atrs.Clear().Add_many(cls, " v3 v4 "), expd_atrs.Clear().Add_many(cls, "v1 v2 v3 v4"));
  }
}
// Test fixture for Xomw_sanitizer and its helper regex classes.
// Each Test__* method converts the String input to bytes, runs one operation and compares against the expected value.
class Xomw_sanitizer__fxt {
	private final Xomw_sanitizer sanitizer = new Xomw_sanitizer();
	// scratch buffer shared by all helpers; every helper drains it before returning
	private final Bry_bfr tmp = Bry_bfr_.New();

	// Runs Normalize_char_references over all of src_str and compares the buffer contents to expd.
	public void Test__normalize_char_references(String src_str, String expd) {
		byte[] src = Bry_.new_u8(src_str);
		sanitizer.Normalize_char_references(tmp, Bool_.Y, src, 0, src.length);
		String actl = tmp.To_str_and_clear();
		Gftest.Eq__str(expd, actl);
	}

	// Expects regex_domain to match src_str, then checks the protocol / host / rest spans it recorded.
	public void Test__regex_domain_y(Xomw_regex_find_domain regex_domain, String src_str, String expd_prot, String expd_host, String expd_rest) {
		byte[] src = Bry_.new_u8(src_str);
		// Match must run first: the *_bgn / *_end fields read below are taken from the matcher afterwards
		boolean matched = regex_domain.Match(src, 0, src.length);
		Gftest.Eq__bool(true, matched, src_str);
		Gftest.Eq__str(expd_prot, Bry_.Mid(src, regex_domain.prot_bgn, regex_domain.prot_end));
		Gftest.Eq__str(expd_host, Bry_.Mid(src, regex_domain.host_bgn, regex_domain.host_end));
		Gftest.Eq__str(expd_rest, Bry_.Mid(src, regex_domain.rest_bgn, regex_domain.rest_end));
	}

	// Expects regex_domain NOT to match src_str.
	public void Test__regex_domain_n(Xomw_regex_find_domain regex_domain, String src_str) {
		byte[] src = Bry_.new_u8(src_str);
		boolean matched = regex_domain.Match(src, 0, src.length);
		Gftest.Eq__bool(false, matched, src_str);
	}

	// Runs regex.Escape into the scratch buffer; checks both its boolean result and the bytes written.
	public void Test__regex_escape_invalid(Xomw_regex_escape_invalid regex, String src_str, boolean expd_rslt, String expd_str) {
		byte[] src = Bry_.new_u8(src_str);
		boolean actl_rslt = regex.Escape(tmp, src, 0, src.length);
		Gftest.Eq__bool(expd_rslt, actl_rslt);
		Gftest.Eq__str(expd_str, tmp.To_bry_and_clear());
	}

	// Checks whether regex matches src_str. NOTE: expd_rslt precedes src_str in this signature.
	public void Test__regex_ipv6_brack(Xomw_regex_ipv6_brack regex, boolean expd_rslt, String src_str) {
		byte[] src = Bry_.new_u8(src_str);
		boolean actl_rslt = regex.Match(src, 0, src.length);
		Gftest.Eq__bool(expd_rslt, actl_rslt);
	}

	// Runs Decode_char_references over all of src_str and compares the buffer contents to expd.
	public void Test__decode_char_references(String src_str, String expd) {
		byte[] src = Bry_.new_u8(src_str);
		sanitizer.Decode_char_references(tmp, Bool_.Y, src, 0, src.length);
		String actl = tmp.To_str_and_clear();
		Gftest.Eq__str(expd, actl);
	}

	// Runs Clean_url on src_str and compares its result to expd.
	public void Test__clean_url(String src_str, String expd) {
		Gftest.Eq__str(expd, sanitizer.Clean_url(Bry_.new_u8(src_str)));
	}

	// Merges trg into src, then compares src's "key=val" dump against expd's, line by line.
	public void Test__merge_attributes(Xomw_atr_mgr src, Xomw_atr_mgr trg, Xomw_atr_mgr expd) {
		sanitizer.Merge_attributes(src, trg);
		// expd is rendered before src; both renderings reuse (and drain) the scratch buffer
		String expd_str = expd.To_str(tmp);
		String actl_str = src.To_str(tmp);
		Gftest.Eq__ary__lines(expd_str, actl_str, "merge_atrs");
	}
}

View File

@@ -0,0 +1,85 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*;
// Port of MediaWiki's Xml helper: writes XML elements into a Bry_bfr.
// Attribute lists are flat List_adp instances laid out as [name0, value0, name1, value1, ...] with byte[] items.
public class Xomw_xml {
	// Format an XML element with given attributes and, optionally, text content.
	// Element and attribute names are assumed to be ready for literal inclusion.
	// Strings are assumed to not contain XML-illegal characters; special
	// characters (<, >, &) are escaped but illegals are not touched.
	//
	// @param bfr             buffer that receives the markup
	// @param element         element name
	// @param attribs         flat name / value list; null or empty writes no attributes
	// @param contents        null writes only the opening tag; 0-length writes " />" when allow_short_tag is set;
	//                        otherwise writes the escaped contents followed by the closing tag
	// @param allow_short_tag whether 0-length contents may collapse to a self-closing tag
	// ARGS: contents defaults to ""
	// XO.MW:SYNC:1.29; DATE:2017-02-03
	public static void Element(Bry_bfr bfr, byte[] element, List_adp attribs, byte[] contents, boolean allow_short_tag) {
		bfr.Add_byte(Byte_ascii.Angle_bgn).Add(element);
		// XO.MW: if ( !is_null( $attribs ) ); Expand_attributes is a no-op for null or empty lists
		Expand_attributes(bfr, attribs);
		if (contents == null) {
			bfr.Add_byte(Byte_ascii.Angle_end);
		}
		else if (allow_short_tag && contents.length == 0) {
			// XO.MW: $contents === ''; compare by length, not by reference, so any empty array qualifies
			bfr.Add_str_a7(" />");
		}
		else {
			bfr.Add_byte(Byte_ascii.Angle_end);
			// NOTE(review): MW escapes with htmlspecialchars( $contents, ENT_NOQUOTES ); confirm Add_bry_escape_html leaves quotes alone
			bfr.Add_bry_escape_html(contents);
			Close_element(bfr, element);
		}
	}
	// Given an array of ('attributename' => 'value'), it generates the code
	// to set the XML attributes : attributename="value".
	// The values are passed to Sanitizer::encodeAttribute.
	// Writes nothing if no attributes given (null or empty list).
	//
	// @param bfr     buffer that receives ` name="value"` for every pair
	// @param attribs flat name / value list; expected to hold an even number of byte[] items
	// XO.MW:SYNC:1.29; DATE:2017-02-03
	public static void Expand_attributes(Bry_bfr bfr, List_adp attribs) {
		if (attribs == null) return; // XO.MW: null attribs are allowed by all callers
		int attribs_len = attribs.Len();
		for (int i = 0; i < attribs_len; i += 2) {
			// XO.MW: $out .= " {$name}=\"" . Sanitizer::encodeAttribute( $val ) . '"';
			bfr.Add_byte_space();
			bfr.Add((byte[])attribs.Get_at(i));
			bfr.Add_byte_eq().Add_byte_quote();
			Xomw_sanitizer.Encode_attribute(bfr, (byte[])attribs.Get_at(i + 1));
			bfr.Add_byte_quote();
		}
	}
	// This opens an XML element: "<", element, attributes, ">"
	// XO.MW:SYNC:1.29; DATE:2017-02-03
	public static void Open_element(Bry_bfr bfr, byte[] element, List_adp attribs) {
		bfr.Add_byte(Byte_ascii.Angle_bgn).Add(element);
		Expand_attributes(bfr, attribs);
		bfr.Add_byte(Byte_ascii.Angle_end);
	}
	// Shortcut to close an XML element: "</", element, ">"
	// XO.MW:SYNC:1.29; DATE:2017-02-03
	public static void Close_element(Bry_bfr bfr, byte[] element) {
		bfr.Add_byte(Byte_ascii.Angle_bgn).Add_byte(Byte_ascii.Slash).Add(element).Add_byte(Byte_ascii.Angle_end);
	}
	// Same as Xml::element(), but does not escape contents. Handy when the
	// content you have is already valid xml.
	// Unlike Element, the closing tag is always written; there is no short-tag form.
	// XO.MW:SYNC:1.29; DATE:2017-02-03
	public static void Tags(Bry_bfr bfr, byte[] element, List_adp attribs, byte[] contents) {
		Open_element(bfr, element, attribs);
		bfr.Add(contents);
		Close_element(bfr, element);
	}
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,21 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.filerepo.file; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.filerepo.*;
// Lookup strategy that resolves a page title to a file object.
public interface Xomw_file_finder {
	// Returns the file registered for ttl.
	// The implementations in this package (mock, noop) can return null, so callers should expect it.
	public Xomw_File Find_file(Xoa_ttl ttl);
}

View File

@@ -0,0 +1,32 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.filerepo.file; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.filerepo.*;
import gplx.xowa.mediawiki.includes.parsers.*;
// In-memory Xomw_file_finder: files are registered by title through Add and handed back by Find_file.
public class Xomw_file_finder__mock implements Xomw_file_finder {
	private final Xomw_parser_env env;
	// registered files, keyed by the title String given to Add
	private final Hash_adp hash = Hash_adp_.New();

	public Xomw_file_finder__mock(Xomw_parser_env env) {
		this.env = env;
	}

	// Forgets every registered file.
	public void Clear() {
		hash.Clear();
	}

	// Looks up the file stored under ttl.Page_db_as_str().
	public Xomw_File Find_file(Xoa_ttl ttl) {
		Object found = hash.Get_by(ttl.Page_db_as_str());
		return (Xomw_File)found;
	}

	// Builds a Xomw_LocalFile from the arguments and stores it under title.
	// Storage goes through Add_if_dupe_use_nth; judging by its name, re-adding a title keeps the latest file -- TODO confirm
	public void Add(String title, Xomw_FileRepo repo, int w, int h, byte[] mime) {
		byte[] title_bry = Bry_.new_u8(title);
		hash.Add_if_dupe_use_nth(title, new Xomw_LocalFile(env, title_bry, repo, w, h, mime));
	}
}

View File

@@ -0,0 +1,21 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.filerepo.file; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.filerepo.*;
// Xomw_file_finder that never finds anything.
public class Xomw_file_finder__noop implements Xomw_file_finder {
	// Always returns null, whatever the title.
	public Xomw_File Find_file(Xoa_ttl ttl) {
		return null;
	}
}

View File

@@ -0,0 +1,29 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
// One attribute: an int key, a byte[] key and a byte[] value. Only the value can be changed after construction.
public class Xomw_atr_itm {
	private int key_int;
	private byte[] key_bry;
	private byte[] val;

	// @param key_int integer key; code in this package passes -1 for attributes created from a byte[] key
	// @param key     attribute name
	// @param val     attribute value; may be null
	public Xomw_atr_itm(int key_int, byte[] key, byte[] val) {
		this.key_int = key_int;
		this.key_bry = key;
		this.val = val;
	}

	public int Key_int() {
		return key_int;
	}

	public byte[] Key_bry() {
		return key_bry;
	}

	public byte[] Val() {
		return val;
	}

	// Replaces the value; the keys are left untouched.
	public void Val_(byte[] v) {
		this.val = v;
	}
}

View File

@@ -0,0 +1,72 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
// Collection of Xomw_atr_itm attributes, addressable by position (Get_at) and by byte[] key.
public class Xomw_atr_mgr {
	// backing store; items are registered under their Key_bry()
	private final Ordered_hash hash = Ordered_hash_.New_bry();

	public int Len() {
		return hash.Len();
	}

	public Xomw_atr_itm Get_at(int i) {
		return (Xomw_atr_itm)hash.Get_at(i);
	}

	public Xomw_atr_itm Get_by_or_null(byte[] k) {
		return (Xomw_atr_itm)hash.Get_by(k);
	}

	// Returns the value stored under k, or null when no attribute has that key.
	public byte[] Get_val_or_null(byte[] k) {
		Xomw_atr_itm found = (Xomw_atr_itm)hash.Get_by(k);
		if (found == null) return null;
		return found.Val();
	}

	// Returns the attribute stored under k; when absent, registers a new one (key_int = -1, null value) and returns it.
	public Xomw_atr_itm Get_by_or_make(byte[] k) {
		Xomw_atr_itm found = (Xomw_atr_itm)hash.Get_by(k);
		if (found != null) return found;
		Xomw_atr_itm made = new Xomw_atr_itm(-1, k, null);
		Add(made);
		return made;
	}

	// Empties the collection; returns this for chaining.
	public Xomw_atr_mgr Clear() {
		hash.Clear();
		return this;
	}

	public void Del(byte[] key) {
		hash.Del(key);
	}

	// Registers itm under its own key.
	public void Add(Xomw_atr_itm itm) {
		hash.Add(itm.Key_bry(), itm);
	}

	// Registers a new attribute (key_int = -1) for key / val; returns this for chaining.
	public Xomw_atr_mgr Add(byte[] key, byte[] val) {
		Xomw_atr_itm itm = new Xomw_atr_itm(-1, key, val);
		this.Add(itm);
		return this;
	}

	// Registers src when its key is new; otherwise copies src's value onto the item already stored.
	public void Add_or_set(Xomw_atr_itm src) {
		Xomw_atr_itm existing = (Xomw_atr_itm)hash.Get_by(src.Key_bry());
		if (existing != null) {
			existing.Val_(src.Val());
			return;
		}
		this.Add(src);
	}

	// Stores val under key, creating the attribute first if it does not exist yet.
	public void Set(byte[] key, byte[] val) {
		Get_by_or_make(key).Val_(val);
	}

	// TEST: adds attributes from alternating key, value Strings; kvs must hold an even number of items.
	public Xomw_atr_mgr Add_many(String... kvs) {
		int idx = 0;
		while (idx < kvs.length) {
			byte[] key = Bry_.new_u8(kvs[idx]);
			byte[] val = Bry_.new_u8(kvs[idx + 1]);
			Add(key, val);
			idx += 2;
		}
		return this;
	}

	// TEST: renders every attribute as "key=val" plus a newline, in positional order, using tmp as scratch.
	public String To_str(Bry_bfr tmp) {
		for (int idx = 0, count = this.Len(); idx < count; idx++) {
			Xomw_atr_itm cur = this.Get_at(idx);
			tmp.Add(cur.Key_bry()).Add_byte_eq();
			tmp.Add(cur.Val()).Add_byte_nl();
		}
		return tmp.To_str_and_clear();
	}
}

View File

@@ -0,0 +1,26 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
// Holder for an HTML element name.
public class Xomw_html_elem {
	// EX: "a", "div", "img"
	private final byte[] name;

	public Xomw_html_elem(byte[] name) {
		this.name = name;
	}

	public byte[] Name() {
		return name;
	}

	// disabled list of void elements, kept as found
	// NOTE(review): "super" is not an HTML element; the HTML5 void-element list has "base" in that position -- fix before re-enabling
	// private static final Hash_adp_bry void_elements = Hash_adp_bry.cs().Add_many_str("area", "super", "br", "col", "embed", "hr", "img", "input", "keygen", "link", "meta", "param", "source", "track", "wbr");
}

View File

@@ -0,0 +1,267 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import gplx.core.btries.*;
import gplx.xowa.mediawiki.includes.utls.*;
// Port of parts of MediaWiki's Html class: writes raw HTML elements and their attributes into a Bry_bfr.
// tmp and trv are per-instance scratch objects reused by every call.
public class Xomw_html_utl {
	private final Bry_bfr tmp = Bry_bfr_.New();
	private final Btrie_rv trv = new Btrie_rv();
	// Writes "<element attribs>contents</element>"; contents are added as-is (not escaped).
	// For void elements (see void_elements) writes "<element attribs/>" and ignores contents.
	// NOTE: attribs may be modified; see Open_element__lcased
	public void Raw_element(Bry_bfr bfr, byte[] element, Xomw_atr_mgr attribs, byte[] contents) {
		// NOTE(review): the result of Lcase__all is discarded; if it returns a new array instead of lowercasing in place, element is still mixed-case below -- confirm
		Bry_.Lcase__all(element); // XO:lcase element
		Open_element__lcased(bfr, element, attribs);
		if (void_elements.Has(element)) {
			// drop the ">" that Open_element__lcased just wrote and self-close instead
			bfr.Del_by_1().Add(Bry__elem__lhs__inl);
		}
		else {
			bfr.Add(contents);
			Close_element__lcased(bfr, element);
		}
	}
	// Writes "<element attribs>". element is expected to be lower-case already.
	// Side effects on attribs: an unrecognized "type" is deleted for <input>; a missing "type" is set to "submit" for <button>.
	private void Open_element__lcased(Bry_bfr bfr, byte[] element, Xomw_atr_mgr attribs) {
		// This is not required in HTML5, but let's do it anyway, for
		// consistency and better compression.
		// $element = strtolower($element); // XO:handled by callers
		// Remove invalid input types
		if (Bry_.Eq(element, Tag__input)) {
			// PORTED.HEADER:valid_input_types
			byte[] type_atr_val = attribs.Get_val_or_null(Atr__type);
			if (type_atr_val != null && !valid_input_types.Has(type_atr_val)) {
				attribs.Del(Atr__type);
			}
		}
		// According to standard the default type for <button> elements is "submit".
		// Depending on compatibility mode IE might use "button", instead.
		// We enforce the standard "submit".
		if (Bry_.Eq(element, Tag__button) && attribs.Get_val_or_null(Atr__type) == null) {
			attribs.Set(Atr__type, Val__type__submit);
		}
		bfr.Add_byte(Byte_ascii.Angle_bgn).Add(element);
		Expand_attributes(bfr, attribs); // TODO.XO:self::dropDefaults($element, $attribs)
		bfr.Add_byte(Byte_ascii.Angle_end);
	}
	// Writes ` key="val"` for every attribute in atrs, in order.
	// - attributes with a null value are skipped
	// - keys are lower-cased
	// - boolean attributes (see bool_attribs) are written as ` key=""`, whatever their value
	// - all other values are entity-encoded through atr_val_encodings
	public void Expand_attributes(Bry_bfr bfr, Xomw_atr_mgr atrs) {
		int len = atrs.Len();
		for (int i = 0; i < len; i++) {
			Xomw_atr_itm atr = atrs.Get_at(i);
			byte[] key = atr.Key_bry();
			byte[] val = atr.Val();
			// Support intuitive [ 'checked' => true/false ] form
			if (val == null) { // TESTME
				continue;
			}
			// For boolean attributes, support [ 'foo' ] instead of
			// requiring [ 'foo' => 'meaningless' ].
			// XO.MW: if ( is_int( $key ) && in_array( strtolower( $value ), self::$boolAttribs ) )
			if (atr.Key_int() != -1 && bool_attribs.Has(val)) {
				key = val;
			}
			// Not technically required in HTML5 but we'd like consistency
			// and better compression anyway.
			key = Bry_.Xcase__build__all(tmp, Bool_.N, key);
			// PORTED.HEADER:$spaceSeparatedListAttributes
			// Specific features for attributes that allow a list of space-separated values
			if (space_separated_list_attributes.Has(key)) {
				// TODO.XO: not ported yet; values of space-separated attributes are currently written unnormalized
				// Apply some normalization and remove duplicates
				// Convert into correct array. Array can contain space-separated
				// values. Implode/explode to get those into the main array as well.
				// if (is_array($value)) {
				// If input wasn't an array, we can skip this step
				// $newValue = [];
				// foreach ($value as $k => $v) {
				// if (is_string($v)) {
				// String values should be normal `array('foo')`
				// Just append them
				// if (!isset($value[$v])) {
				// As a special case don't set 'foo' if a
				// separate 'foo' => true/false exists in the array
				// keys should be authoritative
				// $newValue[] = $v;
				// }
				// }
				// elseif ($v) {
				// If the value is truthy but not a String this is likely
				// an [ 'foo' => true ], falsy values don't add strings
				// $newValue[] = $k;
				// }
				// }
				// $value = implode(' ', $newValue);
				// }
				// $value = explode(' ', $value);
				// Normalize spacing by fixing up cases where people used
				// more than 1 space and/or a trailing/leading space
				// $value = array_diff($value, [ '', ' ' ]);
				// Remove duplicates and create the String
				// $value = implode(' ', array_unique($value));
			}
			// DELETE
			// elseif (is_array($value)) {
			// throw new MWException("HTML attribute $key can not contain a list of values");
			// }
			// XO.MW: if ( in_array( $key, self::$boolAttribs ) )
			// The test must look at the KEY: testing the value would turn EX: class="hidden" into class=""
			if (bool_attribs.Has(key)) {
				bfr.Add_byte_space().Add(key).Add(Bry__atr__val__empty); // $ret .= " $key=\"\"";
			}
			else {
				// PORTED.HEADER:atr_val_encodings
				val = Php_str_.Strtr(val, atr_val_encodings, tmp, trv);
				bfr.Add_byte_space().Add(key).Add(Bry__atr__val__quote).Add(val).Add_byte_quote();
			}
		}
	}
	// Writes "</element>". element is expected to be lower-case already.
	private void Close_element__lcased(Bry_bfr bfr, byte[] element) {
		bfr.Add(Bry__elem__rhs__bgn).Add(element).Add_byte(Byte_ascii.Angle_end); // EX: "</", element, ">";
	}
	private static final byte[]
	  Bry__elem__lhs__inl    = Bry_.new_a7("/>")
	, Bry__elem__rhs__bgn    = Bry_.new_a7("</")
	, Bry__atr__val__quote   = Bry_.new_a7("=\"")
	, Bry__atr__val__empty   = Bry_.new_a7("=\"\"")
	, Tag__input             = Bry_.new_a7("input")
	, Tag__button            = Bry_.new_a7("button")
	, Atr__type              = Bry_.new_a7("type")
	, Val__type__submit      = Bry_.new_a7("submit")
	;
	// List of void elements from HTML5, section 8.1.2 as of 2016-09-19
	private static final Hash_adp_bry void_elements = Hash_adp_bry.cs().Add_many_str
	(
		"area",
		"base", // XO: was "super", which is not an HTML element
		"br",
		"col",
		"embed",
		"hr",
		"img",
		"input",
		"keygen",
		"link",
		"meta",
		"param",
		"source",
		"track",
		"wbr"
	);
	// Boolean attributes, which may have the value omitted entirely. Manually
	// collected from the HTML5 spec as of 2011-08-12.
	private static final Hash_adp_bry bool_attribs = Hash_adp_bry.ci_a7().Add_many_str(
		"async",
		"autofocus",
		"autoplay",
		"checked",
		"controls",
		"default",
		"defer",
		"disabled",
		"formnovalidate",
		"hidden",
		"ismap",
		// "itemscope", //XO:duplicate; added below
		"loop",
		"multiple",
		"muted",
		"novalidate",
		"open",
		"pubdate",
		"readonly", // XO: was "final ", which is not an HTML attribute
		"required",
		"reversed",
		"scoped",
		"seamless",
		"selected",
		"truespeed",
		"typemustmatch",
		// HTML5 Microdata
		"itemscope"
	);
	private static final Btrie_slim_mgr atr_val_encodings = Btrie_slim_mgr.cs()
		// Apparently we need to entity-encode \n, \r, \t, although the
		// spec doesn't mention that. Since we're doing strtr() anyway,
		// we may as well not call htmlspecialchars().
		// @todo FIXME: Verify that we actually need to
		// escape \n\r\t here, and explain why, exactly.
		// We could call Sanitizer::encodeAttribute() for this, but we
		// don't because we're stubborn and like our marginal savings on
		// byte size from not having to encode unnecessary quotes.
		// The only difference between this transform and the one by
		// Sanitizer::encodeAttribute() is ' is not encoded.
		.Add_str_str("&" , "&amp;")
		.Add_str_str("\"" , "&quot;")
		.Add_str_str(">" , "&gt;")
		// '<' allegedly allowed per spec
		// but breaks some tools if not escaped.
		.Add_str_str("<" , "&lt;")
		.Add_str_str("\n" , "&#10;")
		.Add_str_str("\r" , "&#13;")
		.Add_str_str("\t" , "&#9;");
	// https://www.w3.org/TR/html401/index/attributes.html ("space-separated")
	// https://www.w3.org/TR/html5/index.html#attributes-1 ("space-separated")
	private static final Hash_adp_bry space_separated_list_attributes = Hash_adp_bry.ci_a7().Add_many_str(
		"class", // html4, html5
		"accesskey", // as of html5, multiple space-separated values allowed
		// html4-spec doesn't document rel= as space-separated
		// but has been used like that and is now documented as such
		// in the html5-spec.
		"rel"
	);
	// values of type= that are kept on <input>; any other value is removed by Open_element__lcased
	private static final Hash_adp_bry valid_input_types = Hash_adp_bry.ci_a7().Add_many_str(
		// Remove invalid input types
		"hidden",
		"text",
		"password",
		"checkbox",
		"radio",
		"file",
		"submit",
		"image",
		"reset",
		"button",
		// HTML input types
		"datetime",
		"datetime-local",
		"date",
		"month",
		"time",
		"week",
		"number",
		"range",
		"email",
		"url",
		"search",
		"tel",
		"color"
	);
}

View File

@@ -0,0 +1,39 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import org.junit.*; import gplx.core.tests.*;
// Tests for Xomw_html_utl.Expand_attributes.
public class Xomw_html_utl__expand_attributes__tst {
	private final Xomw_html_utl__expand_attributes__fxt fxt = new Xomw_html_utl__expand_attributes__fxt();

	// one plain key / value pair is rendered as ` key="val"` with a leading space
	@Test public void Basic() {
		fxt.Test__expand_attributes(" a=\"b\"", "a", "b");
	}
}
// Fixture: builds an attribute list from Strings, expands it and compares the generated markup.
class Xomw_html_utl__expand_attributes__fxt {
	private final Xomw_html_utl utl = new Xomw_html_utl();
	private final Bry_bfr bfr = Bry_bfr_.New();

	// @param expd expected markup
	// @param kvs  alternating key, value Strings; every pair becomes an attribute with key_int = -1
	public void Test__expand_attributes(String expd, String... kvs) {
		Xomw_atr_mgr atrs = new Xomw_atr_mgr();
		for (int idx = 0; idx < kvs.length; idx += 2) {
			byte[] key = Bry_.new_a7(kvs[idx]);
			byte[] val = Bry_.new_a7(kvs[idx + 1]);
			atrs.Add(new Xomw_atr_itm(-1, key, val));
		}
		utl.Expand_attributes(bfr, atrs);
		String actl = bfr.To_str_and_clear();
		Gftest.Eq__str(expd, actl);
	}
}

View File

@@ -0,0 +1,24 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
// Plain bag of public option fields; it carries no behavior of its own.
// NOTE(review): the meaning of each option is defined by the code that reads it, which is not visible from this class
public class Xomw_opt_mgr {
	// flags: all start out as false
	public boolean known;
	public boolean broken;
	public boolean no_classes;
	// starts out as null
	public byte[] time;
}

View File

@@ -0,0 +1,27 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.htmls; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
// Plain holder for two public query fields, with a chainable reset.
public class Xomw_qry_mgr {
	public byte[] action;
	// NOTE: a new instance starts at 0 (the Java default); only Clear() sets -1
	public int redlink;

	// Resets action to null and redlink to -1; returns this for chaining.
	public Xomw_qry_mgr Clear() {
		this.action = null;
		this.redlink = -1;
		return this;
	}
}

View File

@@ -0,0 +1,125 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import gplx.core.btries.*;
// Port of selected helpers from MediaWiki's StringUtils.
public class Xomw_string_utils {
	// token ids stored in delimiter_explode_trie
	private static final byte Delimiter_explode__sep = 0, Delimiter_explode__bgn = 1, Delimiter_explode__end = 2;
	// XO.MW: hard-coding (a) nested=true; (b) bgn="-{" end="}-" sep="|"
	private static final Btrie_slim_mgr delimiter_explode_trie = Btrie_slim_mgr.cs()
		.Add_str_byte("|" , Delimiter_explode__sep)
		.Add_str_byte("-{", Delimiter_explode__bgn)
		.Add_str_byte("}-", Delimiter_explode__end)
		;
	// Explode a String, but ignore any instances of the separator inside
	// the given start and end delimiters, which may nest.
	// The delimiters are literal strings, not regular expressions.
	// MW takes bgn_delim / end_delim / separator / nested as arguments; this port fixes them to "-{" / "}-" / "|" / true.
	// @param tmp scratch list that collects the pieces; it is drained before returning
	// @param trv receives the match position from the trie
	// @param src subject to explode
	// @return the pieces of src, in order; always at least one item
	// XO.MW:SYNC:1.29; DATE:2017-02-03
	public static byte[][] Delimiter_explode(List_adp tmp, Btrie_rv trv, byte[] src) {
		final int src_len = src.length;
		int nesting = 0;  // how many "-{" are currently open
		int itm_bgn = 0;  // start of the piece being collected
		int pos = 0;
		while (pos != src_len) {
			Object found = delimiter_explode_trie.Match_at(trv, src, pos, src_len);
			if (found == null) { // ordinary byte: move on
				pos++;
				continue;
			}
			byte tid = ((gplx.core.primitives.Byte_obj_val)found).Val();
			if (tid == Delimiter_explode__sep) {
				// a separator only splits outside of any "-{ ... }-" pair
				if (nesting == 0) {
					tmp.Add(Bry_.Mid(src, itm_bgn, pos));
					itm_bgn = pos + 1;
				}
			}
			else if (tid == Delimiter_explode__bgn) {
				nesting++;
			}
			else if (tid == Delimiter_explode__end) {
				nesting--;
			}
			// resume after the matched token
			pos = trv.Pos();
		}
		// end of src: whatever follows the last split is the final piece (possibly empty)
		tmp.Add(Bry_.Mid(src, itm_bgn, src_len));
		return (byte[][])tmp.To_ary_and_clear(byte[].class);
	}
	// More or less "markup-safe" str_replace()
	// Ignores any instances of the separator inside `<...>`
	// Works in place on src between src_bgn and src_end, so find and repl must be the same length; EX: "!!" -> "||"
	// REF:/includes/libs/StringUtils.php|replaceMarkup
	public static void Replace_markup(byte[] src, int src_bgn, int src_end, byte[] find, byte[] repl) {
		// PORTED: avoiding multiple regex calls / String creations
		// $placeholder = "\x00";
		// Remove placeholder instances
		// $text = str_replace( $placeholder, '', $text );
		// Replace instances of the separator inside HTML-like tags with the placeholder
		// $replacer = new DoubleReplacer( $search, $placeholder );
		// $cleaned = StringUtils::delimiterReplaceCallback( '<', '>', $replacer->cb(), $text );
		// Explode, then put the replaced separators back in
		// $cleaned = str_replace( $search, $replace, $cleaned );
		// $text = str_replace( $placeholder, $search, $cleaned );
		if (find.length != repl.length) throw Err_.new_wo_type("find and repl should be same length");
		final int find_len = find.length;
		final byte find_first = find[0];
		// false between a "<" and the next ">"; replacements only happen while true
		boolean outside_tag = true;
		for (int pos = src_bgn; pos < src_end; pos++) {
			byte cur = src[pos];
			// the first byte is compared directly; Bry_.Match covers the remaining bytes of find
			boolean found = cur == find_first && Bry_.Match(src, pos + 1, pos + find_len, find, 1, find_len);
			if (found && outside_tag) {
				Bry_.Set(src, pos, pos + find_len, repl);
			}
			else if (cur == Byte_ascii.Angle_bgn) {
				outside_tag = false;
			}
			else if (cur == Byte_ascii.Angle_end) {
				outside_tag = true;
			}
		}
	}
}

View File

@@ -0,0 +1,60 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import org.junit.*; import gplx.core.tests.*;
// Tests for Xomw_string_utils.
public class Xomw_string_utils__tst {
	private final Xomw_string_utils__fxt fxt = new Xomw_string_utils__fxt();

	// args: source, then the expected pieces
	@Test public void Delimiter_explode() {
		// plain separators
		fxt.Test__delimiter_explode("a|b|c" , "a", "b", "c");
		// leading / doubled / trailing separators produce empty pieces
		fxt.Test__delimiter_explode("|a||c|" , "", "a", "", "c", "");
		// a separator inside one "-{ }-" pair does not split
		fxt.Test__delimiter_explode("a|-{b|c}-|d" , "a", "-{b|c}-", "d");
		// same, with pairs nested inside each other
		fxt.Test__delimiter_explode("a|-{b-{c|d}-e}-|f" , "a", "-{b-{c|d}-e}-", "f");
	}

	// args: source, find, repl, expected
	@Test public void Replace_markup() {
		// one occurrence
		fxt.Test__replace_markup("a!!b" , "!!", "||", "a||b");
		// no occurrence: source is unchanged
		fxt.Test__replace_markup("abcd" , "!!", "||", "abcd");
		// occurrence at the very end of the source
		fxt.Test__replace_markup("a!!" , "!!", "||", "a||");
		// occurrences inside "<...>" are left alone
		fxt.Test__replace_markup("a!!b<!!>!!c" , "!!", "||", "a||b<!!>||c");
		// asymmetric: extra "<"
		fxt.Test__replace_markup("a!!b<!!<!!>!!c" , "!!", "||", "a||b<!!<!!>||c");
		// asymmetric: extra ">"
		fxt.Test__replace_markup("a!!b<!!>!!>!!c" , "!!", "||", "a||b<!!>||>||c"); // NOTE: should probably be "!!>!!>", but unmatched ">" are escaped to "&gt;"
	}
}
// Test fixture: converts String arguments to byte arrays, calls Xomw_string_utils, and compares results
class Xomw_string_utils__fxt {
    // Explodes src_str and checks the resulting pieces against expd
    public void Test__delimiter_explode(String src_str, String... expd) {
        List_adp list = List_adp_.New();
        gplx.core.btries.Btrie_rv btrie_rv = new gplx.core.btries.Btrie_rv();
        byte[] src_bry = Bry_.new_u8(src_str);
        byte[][] actl = Xomw_string_utils.Delimiter_explode(list, btrie_rv, src_bry);
        Gftest.Eq__ary(expd, actl, "src=~{0}", src_str);
    }
    // Runs Replace_markup over the whole of src_str (the array is modified in place) and checks it against expd
    public void Test__replace_markup(String src_str, String find, String repl, String expd) {
        byte[] actl = Bry_.new_u8(src_str);
        byte[] find_bry = Bry_.new_a7(find);
        byte[] repl_bry = Bry_.new_a7(repl);
        Xomw_string_utils.Replace_markup(actl, 0, actl.length, find_bry, repl_bry);
        Gftest.Eq__str(expd, actl);
    }
}

View File

@@ -0,0 +1,213 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.linkers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import gplx.langs.htmls.*;
import gplx.xowa.mediawiki.includes.htmls.*;
/* TODO.XO
* P7: $html = HtmlArmor::getHtml($text);
* P3: Get_link_url [alternate urls? EX: mw/wiki/index.php/title?]
* P2: titleFormatter->getPrefixedText [depends on redlinks]
* P1: Get_link_classes [depends on redlinks]
*/
public class Xomw_link_renderer {
private boolean expand_urls = false;
private final Xomw_html_utl html_utl = new Xomw_html_utl();
private final Xomw_atr_mgr attribs = new Xomw_atr_mgr();
private final List_adp tmp_merge_deleted = List_adp_.New();
private final Xomw_sanitizer sanitizer;
// Stores the sanitizer; within this class it is only used by Merge_attribs (sanitizer.Merge_attributes)
public Xomw_link_renderer(Xomw_sanitizer sanitizer) {
this.sanitizer = sanitizer;
}
// XO.MW:SYNC:1.29; DATE:2017-01-31
// Writes a link for target to bfr: targets reporting Is_known() go through Make_known_link; all others through Make_broken_link.
// NOTE: the classes argument is not read by this method
public void Make_link(Bry_bfr bfr, Xoa_ttl target, byte[] text, byte[] classes, Xomw_atr_mgr extra_atrs, Xomw_qry_mgr query) {
    if (!target.Is_known()) {
        this.Make_broken_link(bfr, target, text, extra_atrs, query);
        return;
    }
    this.Make_known_link(bfr, target, text, extra_atrs, query);
}
// If you have already looked up the proper CSS classes using LinkRenderer::getLinkClasses()
// or some other method, use this to avoid looking it up again.
//
// Builds the attribute list in this order: href, class (only if non-empty), title (only if non-empty),
// merges extra_atrs, and writes the <a> element to bfr.
//   text    : link caption; when null, Get_link_text(target) is used
//   classes : value for the class attribute; null or zero-length means "no class attribute"
// XO.MW:SYNC:1.29; DATE:2017-01-31
public void Make_preloaded_link(Bry_bfr bfr, Xoa_ttl target, byte[] text, byte[] classes, Xomw_atr_mgr extra_atrs, Xomw_qry_mgr query) {
    // XO.MW.HOOK: $this->runBeginHook --> 'HtmlPageLinkRendererBegin', 'LinkBegin'

    target = Normalize_target(target);
    byte[] url = Get_link_url(target, query);
    attribs.Clear();
    attribs.Add(Gfh_atr_.Bry__href, url); // XO.MW: add url 1st; MW does attribs["url", url] + attribs + extra_attribs

    // XO.MW:do not bother adding if empty; null is treated like empty instead of throwing
    if (classes != null && classes.length > 0) {
        attribs.Add(Gfh_atr_.Bry__class, classes);
    }

    // MW: "if ($prefixedText !== '')"; test by content, not by identity with Bry_.Empty,
    // so that any zero-length array is treated as "no title"
    byte[] prefixed_text = target.Get_prefixed_text();
    if (prefixed_text != null && prefixed_text.length > 0) {
        attribs.Add(Gfh_atr_.Bry__title, prefixed_text);
    }

    Merge_attribs(attribs, extra_atrs);

    if (text == null) {
        text = this.Get_link_text(target);
    }

    Build_a_element(bfr, target, text, attribs, true);
}
// XO.MW:SYNC:1.29; DATE:2017-01-31
// Computes the CSS classes for a known target and delegates to Make_preloaded_link.
// The class string is "extiw" for external targets, followed by the result of Get_link_classes (if non-empty),
// joined by a single space. MW builds an array and implodes it, so there must be no leading space
// when only the Get_link_classes value is present.
public void Make_known_link(Bry_bfr bfr, Xoa_ttl target, byte[] text, Xomw_atr_mgr extra_atrs, Xomw_qry_mgr query) {
    byte[] classes = target.Is_external() ? Bry__classes__extiw : Bry_.Empty;

    byte[] colour = Get_link_classes(target);
    // test by content rather than identity with Bry_.Empty
    if (colour != null && colour.length > 0) {
        classes = classes.length == 0
            ? colour                                              // only one class: no separator
            : Bry_.Add(classes, Byte_ascii.Space_bry, colour);    // "extiw" + " " + colour
    }

    Make_preloaded_link(bfr, target, text, classes, extra_atrs, query);
}
// XO.MW:SYNC:1.29; DATE:2017-01-31
// Writes an <a class="new"> element for a target that is not known.
// NOTE: query.action / query.redlink are assigned on the passed-in query object when action is null
// and the target is not in the Special namespace
public void Make_broken_link(Bry_bfr bfr, Xoa_ttl target, byte[] text, Xomw_atr_mgr extra_atrs, Xomw_qry_mgr query) {
    // XO.MW.HOOK: Run legacy hook

    // We don't want to include fragments for broken links, because they
    // generally make no sense.
    Xoa_ttl ttl = target.Has_fragment() ? target.Create_fragment_target() : target;
    ttl = Normalize_target(ttl);

    if (query.action == null) {
        if (ttl.Ns().Id() != gplx.xowa.wikis.nss.Xow_ns_.Tid__special) {
            query.action = Bry_.new_a7("edit");
            query.redlink = 1;
        }
    }

    byte[] href = Get_link_url(ttl, query);
    attribs.Clear();
    attribs.Add(Gfh_atr_.Bry__href, href);  // $attribs = ['href' => $url,] + $this->mergeAttribs($attribs, $extraAttribs);
    attribs.Add(Gfh_atr_.Bry__class, Bry_.new_a7("new"));
    Merge_attribs(attribs, extra_atrs);

    // $prefixedText = $this->titleFormatter->getPrefixedText($target);
    // if ($prefixedText !== '') {
    //   // This ends up in parser cache!
    //   $attribs['title'] = wfMessage('red-link-title', $prefixedText)
    //     ->inContentLanguage()
    //     ->text();
    // }

    byte[] caption = text == null ? Get_link_text(ttl) : text;
    Build_a_element(bfr, ttl, caption, attribs, false);
}
// XO.MW:SYNC:1.29; DATE:2017-01-31
// Writes an "a" element with the given attribs to bfr, using text as-is for the element body.
// target and is_known are currently not read (the MW hooks that would use them are not ported).
private void Build_a_element(Bry_bfr bfr, Xoa_ttl target, byte[] text, Xomw_atr_mgr attribs, boolean is_known) {
    // XO.MW.HOOK:HtmlPageLinkRendererEnd
    // $html = HtmlArmor::getHtml($text);   // XO: not ported; text is passed through unchanged
    // XO.MW.HOOK:LinkEnd
    html_utl.Raw_element(bfr, Gfh_tag_.Bry__a, attribs, text);
}
// XO.MW:SYNC:1.29; DATE:2017-01-31
// Returns the default caption for a link to target.
// If the target is just a fragment, with no title, we return the fragment
// text. Otherwise, we return the title text itself.
private byte[] Get_link_text(Xoa_ttl target) {
    byte[] prefixed_text = target.Get_prefixed_text();
    // MW: "$prefixedText === ''"; test by content so that any zero-length array counts as "no title",
    // not only the Bry_.Empty instance. A null prefixed_text is returned unchanged, as before.
    boolean title_is_empty = prefixed_text != null && prefixed_text.length == 0;
    if (title_is_empty && target.Has_fragment()) {
        return target.Get_fragment();
    }
    return prefixed_text;
}
// Returns target.Get_link_url(query, false, expand_urls); MW's forceArticlePath handling is not ported
private byte[] Get_link_url(Xoa_ttl target, Xomw_qry_mgr query) {
    // TODO: Use a LinkTargetResolver service instead of Title

    // MW, not ported:
    // if ($this->forceArticlePath) {
    //   $realQuery = $query;
    //   $query = [];
    // }
    // else {
    //   $realQuery = [];
    // }
    // ... $url = $title->getLinkURL($query, false, $this->expandUrls); ...
    // if ($this->forceArticlePath && $realQuery) {
    //   $url = wfAppendQuery($url, $realQuery);
    // }
    return target.Get_link_url(query, false, expand_urls);
}
// XO.MW:SYNC:1.29; DATE:2017-01-31
// Returns the result of Xomw_linker.normaliseSpecialPage for target; no other processing is done here
private Xoa_ttl Normalize_target(Xoa_ttl target) {
return Xomw_linker.normaliseSpecialPage(target);
}
// XO.MW:SYNC:1.29; DATE:2017-02-01
// Merges src and trg via sanitizer.Merge_attributes, then removes every attribute of trg whose value is null.
// NOTE(review): callers pass (attribs, extra_atrs) and afterwards render attribs (= src), while the null-scan
// below runs over trg; confirm which manager sanitizer.Merge_attributes writes the merged result into.
private void Merge_attribs(Xomw_atr_mgr src, Xomw_atr_mgr trg) {
    // XO.MW: ignore; src is always non-null and empty; if trg exists, it will be merged below
    // if (!$attribs) {return $defaults;}

    // Merge the custom attribs with the default ones, and iterate
    // over that, deleting all "false" attributes.
    sanitizer.Merge_attributes(src, trg);

    // XO.MW:MW removes "false" values; XO removes "null" values
    // pass 1: collect the null-valued attributes; deleting while iterating by index would shift later items
    int trg_len = trg.Len();
    for (int i = 0; i < trg_len; i++) {
        Xomw_atr_itm trg_atr = trg.Get_at(i);
        // A false value suppresses the attribute
        if (trg_atr.Val() == null) {
            tmp_merge_deleted.Add(trg_atr);
        }
    }

    // pass 2: delete exactly the collected attributes.
    // The items must be read from tmp_merge_deleted; reading trg.Get_at(i) here would delete
    // the first N attributes of trg regardless of their value.
    int deleted_len = tmp_merge_deleted.Len();
    if (deleted_len == 0) return;
    for (int i = 0; i < deleted_len; i++) {
        Xomw_atr_itm atr = (Xomw_atr_itm)tmp_merge_deleted.Get_at(i);
        trg.Del(atr.Key_bry());
    }
    tmp_merge_deleted.Clear();  // the list is a reused field; leave it empty for the next call
}
// Currently always returns Bry_.Empty; target is not read.
// The commented-out lines below are the MW logic (link cache lookup, 'mw-redirect', 'stub') that has not been ported.
public byte[] Get_link_classes(Xoa_ttl target) {
// Make sure the target is in the cache
// $id = $this->linkCache->addLinkObj($target);
// if ($id == 0) {
// // Doesn't exist
// return '';
// }
// if ($this->linkCache->getGoodLinkFieldObj($target, 'redirect')) {
// Page is a redirect
// return 'mw-redirect';
// }
// elseif ($this->stubThreshold > 0 && MWNamespace::isContent($target->getNamespace())
// && $this->linkCache->getGoodLinkFieldObj($target, 'length') < $this->stubThreshold
// ) {
// Page is a stub
// return 'stub';
// }
return Bry_.Empty;
}
private static final byte[] Bry__classes__extiw = Bry_.new_a7("extiw");
}

View File

@@ -0,0 +1,35 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.linkers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
// import org.junit.*;
// public class Xomw_link_renderer__tst {
// private final Xomw_link_renderer__fxt fxt = new Xomw_link_renderer__fxt();
/*
Make_broken_link
target.Has_fragment()
*/
// }
// class Xomw_link_renderer__fxt {
// private final Xomw_link_renderer wkr = new Xomw_link_renderer(new Xomw_parser());
// public void Test__parse(String src_str, String expd) {
// byte[] src_bry = Bry_.new_u8(src_str);
// wkr.Replace_external_links(new Xomw_parser_ctx(), pbfr.Init(src_bry));
// if (apos) expd = gplx.langs.htmls.Gfh_utl.Replace_apos(expd);
// Tfds.Eq_str_lines(expd, pbfr.Rslt().To_str_and_clear(), src_str);
// }
// }

View File

@@ -0,0 +1,304 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.media; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import gplx.xowa.mediawiki.includes.filerepo.file.*; import gplx.xowa.mediawiki.includes.parsers.lnkis.*;
import gplx.xowa.mediawiki.includes.utls.*;
/* XO.TODO:
* validateThumbParams
*/
// MEMORY:only one instance per wiki
public abstract class Xomw_ImageHandler extends Xomw_MediaHandler { private final Xomw_param_map paramMap = new Xomw_param_map();
// Passes key to the Xomw_MediaHandler constructor and registers the single param-map entry
// (Mw__img_width -> Name_bry__width, of type Type__handler); mirrors MW's "[ 'img_width' => 'width' ]"
public Xomw_ImageHandler(byte[] key) {super(key);
paramMap.Add(Xomw_param_itm.Mw__img_width, Xomw_param_map.Type__handler, Xomw_param_itm.Name_bry__width);
}
/**
 * Reports whether the file can be rendered: both file.getWidth() and file.getHeight()
 * must be "true" according to Php_utl_.istrue.
 *
 * @param file file to check
 * @return true only if width and height both pass Php_utl_.istrue
 */
@Override public boolean canRender(Xomw_File file) {
    if (!Php_utl_.istrue(file.getWidth())) {
        return false;
    }
    return Php_utl_.istrue(file.getHeight());
}
// Returns the shared paramMap instance that was filled in by the constructor (not a copy)
@Override public Xomw_param_map getParamMap() {
// XO.MW: defined above: "return [ 'img_width' => 'width' ];"
return paramMap;
}
// Accepts only the width and height parameters, and only when val_int is positive; val_bry is not read
@Override public boolean validateParam(int name_uid, byte[] val_bry, int val_int) {
    boolean is_dimension
        =  name_uid == Xomw_param_itm.Name__width
        || name_uid == Xomw_param_itm.Name__height;
    return is_dimension && val_int > 0;
}
// Builds the width string used in thumb names: the width in decimal followed by Xomw_lnki_wkr.Bry__px.
// physicalWidth is preferred; width is the fallback; throws if neither is set.
@Override public byte[] makeParamString(Xomw_params_handler handlerParams) {
    boolean has_physical_width = Php_utl_.isset(handlerParams.physicalWidth);
    if (!has_physical_width && !Php_utl_.isset(handlerParams.width)) {
        throw Err_.new_wo_type("No width specified to makeParamString");
    }
    int px = has_physical_width ? handlerParams.physicalWidth : handlerParams.width;

    // Removed for ProofreadPage
    // width = intval(width);
    return Bry_.Add(Int_.To_bry(px), Xomw_lnki_wkr.Bry__px);
}
// public Xomw_param_map parseParamString(byte[] src) {
// int len = src.length;
// // XO.MW.REGEX: if (preg_match('/^(\d+)px/', str, m)) {
// if ( len > 0 // at least one char
// && Byte_ascii.Is_num(src[0])) // 1st char is numeric
// {
// pos = Bry_find_.Find_fwd_while_num(src, 1, len); // skip numeric
// if (Bry_.Match(src, pos, len, Xomw_lnki_wkr.Bry__px)) { // matches "px"
// Xomw_params_handler rv = new Xomw_params_handler();
// rv.width = Bry_.To_int_or(src, 0, pos, Php_utl_.Null_int);
// return rv;
// }
// }
// return null;
// }
// function getScriptParams(paramsVar) {
// return [ 'width' => paramsVar['width'] ];
// }
/**
 * Normalises the thumbnail dimensions on handlerParams in place.
 *
 * On success width, height, physicalWidth and physicalHeight are all set; page is set to 1 when it was unset.
 *
 * @param image          file being scaled; supplies the mime type and the source width / height for the page
 * @param handlerParams  requested parameters; width must be set, height is optional (unset or -1 means "derive from width")
 * @return false if width is unset or validateThumbParams rejects the dimensions; true otherwise
 */
@Override public boolean normaliseParams(Xomw_File image, Xomw_params_handler handlerParams) {
    byte[] mimeType = image.getMimeType();

    if (!Php_utl_.isset(handlerParams.width)) {
        return false;
    }

    if (!Php_utl_.isset(handlerParams.page)) {
        handlerParams.page = 1;
    }
    else {
        // XO.TODO: MW page clamping not ported
        // handlerParams.page = intval(handlerParams.page);
        // if (handlerParams.page > image.pageCount()) {
        //   handlerParams.page = image.pageCount();
        // }
        //
        // if (handlerParams.page < 1) {
        //   handlerParams.page = 1;
        // }
    }

    int srcWidth = image.getWidth(handlerParams.page);
    int srcHeight = image.getHeight(handlerParams.page);

    if (Php_utl_.isset(handlerParams.height) && handlerParams.height != -1) {
        // Height & width were both set
        // Compare aspect ratios by cross-multiplication; widen to long because the requested sizes
        // come from wikitext and int * int can overflow and flip the comparison
        long widthBySrcHeight = (long)handlerParams.width * srcHeight;
        long heightBySrcWidth = (long)handlerParams.height * srcWidth;
        if (widthBySrcHeight > heightBySrcWidth) {
            // Height is the relative smaller dimension, so scale width accordingly
            handlerParams.width = fitBoxWidth(srcWidth, srcHeight, handlerParams.height);

            if (handlerParams.width == 0) {
                // Very small image, so we need to rely on client side scaling :(
                handlerParams.width = 1;
            }

            handlerParams.physicalWidth = handlerParams.width;
        } else {
            // Height was crap, unset it so that it will be calculated later
            handlerParams.height = Php_utl_.Null_int;
        }
    }

    if (!Php_utl_.isset(handlerParams.physicalWidth)) {
        // Passed all validations, so set the physicalWidth
        handlerParams.physicalWidth = handlerParams.width;
    }

    // Because thumbs are only referred to by width, the height always needs
    // to be scaled by the width to keep the thumbnail sizes consistent,
    // even if it was set inside the if block above
    handlerParams.physicalHeight = Xomw_File.scaleHeight(srcWidth, srcHeight, handlerParams.physicalWidth);

    // Set the height if it was not validated in the if block higher up
    if (!Php_utl_.isset(handlerParams.height) || handlerParams.height == -1) {
        handlerParams.height = handlerParams.physicalHeight;
    }

    return this.validateThumbParams(handlerParams, srcWidth, srcHeight, mimeType);
}
/**
 * Validate thumbnail parameters and fill in the correct height
 *
 * Reads handlerParams.physicalWidth and writes handlerParams.physicalHeight.
 *
 * @param handlerParams  physicalWidth is the destination width (input); physicalHeight receives the scaled height (output, at least 1)
 * @param srcWidth       Width of the source image
 * @param srcHeight      Height of the source image
 * @param mimeType       Unused
 * @return boolean False to indicate that an error should be returned to the user.
 */
// XO.MW.NOTE: MW passes physicalWidth and physicalHeight by ref, but only changes the height; XO passes handlerParams directly
private boolean validateThumbParams(Xomw_params_handler handlerParams, int srcWidth, int srcHeight, byte[] mimeType) {
    int width = handlerParams.physicalWidth;
    // width = intval(width);

    // Sanity check width
    if (width <= 0) {
        // MW interpolates the value ("$width"); pass it as a format arg so the log shows the number
        Gfo_usr_dlg_.Instance.Warn_many("", "", "validateThumbParams: Invalid destination width: ~{0}", width);
        return false;
    }
    if (srcWidth <= 0) {
        Gfo_usr_dlg_.Instance.Warn_many("", "", "validateThumbParams: Invalid source width: ~{0}", srcWidth);
        return false;
    }

    int height = Xomw_File.scaleHeight(srcWidth, srcHeight, width);
    if (height == 0) {
        // Force height to be at least 1 pixel
        height = 1;
    }

    // MW's by-ref $height argument is $params['physicalHeight']; write the result back to the same field
    // so that a caller-specified handlerParams.height is not overwritten
    handlerParams.physicalHeight = height;
    return true;
}
// /**
// * @param File image
// * @param String script
// * @param array paramsVar
// * @return boolean|MediaTransformOutput
// */
// function getScriptedTransform(image, script, paramsVar) {
// if (!this.normaliseParams(image, paramsVar)) {
// return false;
// }
// url = wfAppendQuery(script, this.getScriptParams(paramsVar));
//
// if (image.mustRender() || paramsVar['width'] < image.getWidth()) {
// return new ThumbnailImage(image, url, false, paramsVar);
// }
// }
//
// function getImageSize(image, path) {
// MediaWiki\suppressWarnings();
// gis = getimagesize(path);
// MediaWiki\restoreWarnings();
//
// return gis;
// }
//
// /**
// * Function that returns the number of pixels to be thumbnailed.
// * Intended for animated GIFs to multiply by the number of frames.
// *
// * If the file doesn't support a notion of "area" return 0.
// *
// * @param File image
// * @return int
// */
// function getImageArea(image) {
// return image.getWidth() * image.getHeight();
// }
//
// /**
// * @param File file
// * @return String
// */
// function getShortDesc(file) {
// global wgLang;
// nbytes = htmlspecialchars(wgLang.formatSize(file.getSize()));
// widthheight = wfMessage('widthheight')
// .numParams(file.getWidth(), file.getHeight()).escaped();
//
// return "widthheight (nbytes)";
// }
//
// /**
// * @param File file
// * @return String
// */
// function getLongDesc(file) {
// global wgLang;
// pages = file.pageCount();
// size = htmlspecialchars(wgLang.formatSize(file.getSize()));
// if (pages === false || pages <= 1) {
// msg = wfMessage('file-info-size').numParams(file.getWidth(),
// file.getHeight()).paramsVar(size,
// '<span class="mime-type">' . file.getMimeType() . '</span>').parse();
// } else {
// msg = wfMessage('file-info-size-pages').numParams(file.getWidth(),
// file.getHeight()).paramsVar(size,
// '<span class="mime-type">' . file.getMimeType() . '</span>').numParams(pages).parse();
// }
//
// return msg;
// }
//
// /**
// * @param File file
// * @return String
// */
// function getDimensionsString(file) {
// pages = file.pageCount();
// if (pages > 1) {
// return wfMessage('widthheightpage')
// .numParams(file.getWidth(), file.getHeight(), pages).text();
// } else {
// return wfMessage('widthheight')
// .numParams(file.getWidth(), file.getHeight()).text();
// }
// }
//
// public function sanitizeParamsForBucketing(paramsVar) {
// paramsVar = parent::sanitizeParamsForBucketing(paramsVar);
//
// // We unset the height parameters in order to let normaliseParams recalculate them
// // Otherwise there might be a height discrepancy
// if (isset(paramsVar['height'])) {
// unset(paramsVar['height']);
// }
//
// if (isset(paramsVar['physicalHeight'])) {
// unset(paramsVar['physicalHeight']);
// }
//
// return paramsVar;
// }
}

View File

@@ -0,0 +1,63 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.media; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import org.junit.*; import gplx.core.tests.*;
import gplx.xowa.mediawiki.includes.utls.*;
import gplx.xowa.mediawiki.includes.parsers.*; import gplx.xowa.mediawiki.includes.parsers.lnkis.*;
import gplx.xowa.mediawiki.includes.filerepo.*; import gplx.xowa.mediawiki.includes.filerepo.file.*;
public class Xomw_ImageHandler__tst {
    private final Xomw_ImageHandler__fxt fxt = new Xomw_ImageHandler__fxt();

    // every test runs against a 400x200 file named "A.png"
    @Before public void init() {
        fxt.Init__file("A.png", 400, 200);
    }

    @Test public void normaliseParams() {
        // widthOnly; "Because thumbs are only referred to by width, the height always needs"
        Xomw_params_handler prms = fxt.Make__handlerParams(200);
        Xomw_params_handler expd = fxt.Make__handlerParams(200, 100, 200, 100);
        fxt.Test__normaliseParams(prms, expd);
    }
}
// Test fixture: owns a Xomw_TransformationalImageHandler plus the file it is exercised against
class Xomw_ImageHandler__fxt {
    private final Xomw_ImageHandler handler;
    private final Xomw_FileRepo repo = new Xomw_FileRepo(Bry_.new_a7("/orig"), Bry_.new_a7("/thumb"));
    private final Xomw_parser_env env = new Xomw_parser_env();
    private Xomw_File file;

    public Xomw_ImageHandler__fxt() {
        this.handler = new Xomw_TransformationalImageHandler(Bry_.new_a7("test_handler"));
    }

    // width only; height, physicalWidth and physicalHeight are left at Php_utl_.Null_int
    public Xomw_params_handler Make__handlerParams(int w) {
        int unset = Php_utl_.Null_int;
        return Make__handlerParams(w, unset, unset, unset);
    }
    public Xomw_params_handler Make__handlerParams(int w, int h, int phys_w, int phys_h) {
        Xomw_params_handler prms = new Xomw_params_handler();
        prms.width = w;
        prms.height = h;
        prms.physicalWidth = phys_w;
        prms.physicalHeight = phys_h;
        return prms;
    }

    // replaces the file under test with a png Xomw_LocalFile of the given title and dimensions
    public void Init__file(String title, int w, int h) {
        byte[] title_bry = Bry_.new_u8(title);
        this.file = new Xomw_LocalFile(env, title_bry, repo, w, h, Xomw_MediaHandlerFactory.Mime__image__png);
    }

    // normalises prms in place against the current file, then compares its four dimension fields with expd
    // NOTE: the boolean returned by normaliseParams is not checked
    public void Test__normaliseParams(Xomw_params_handler prms, Xomw_params_handler expd) {
        handler.normaliseParams(file, prms);

        Gftest.Eq__int(expd.width         , prms.width);
        Gftest.Eq__int(expd.height        , prms.height);
        Gftest.Eq__int(expd.physicalWidth , prms.physicalWidth);
        Gftest.Eq__int(expd.physicalHeight, prms.physicalHeight);
    }
}

View File

@@ -0,0 +1,868 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.media; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import gplx.xowa.mediawiki.includes.filerepo.file.*;
import gplx.xowa.mediawiki.includes.parsers.lnkis.*;
public abstract class Xomw_MediaHandler {
// Key: returns the byte[] passed to the constructor (the same array instance, not a copy)
public byte[] Key() {return key;} private byte[] key;
// Stores key for later retrieval through Key()
public Xomw_MediaHandler(byte[] key) {
this.key = key;
}
private static final int TRANSFORM_LATER = 1;
// static final METADATA_GOOD = true;
// static final METADATA_BAD = false;
// static final METADATA_COMPATIBLE = 2; // for old but backwards compatible.
// /**
// * Max length of error logged by logErrorForExternalProcess()
// */
// static final MAX_ERR_LOG_SIZE = 65535;
//
// /**
// * Get a MediaHandler for a given MIME type from the instance cache
// *
// * @param String $type
// * @return MediaHandler|boolean
// */
// static function getHandler($type) {
// return MediaWikiServices::getInstance()
// ->getMediaHandlerFactory()->getHandler($type);
// }
/**
* Get an associative array mapping magic word IDs to parameter names.
* Will be used by the parser to identify parameters.
*
* XO: the PHP associative array is represented by a Xomw_param_map
* @return the handler's parameter map
*/
public abstract Xomw_param_map getParamMap();
/**
* Validate a thumbnail parameter at parse time.
* Return true to accept the parameter, and false to reject it.
* If you return false, the parser will do something quiet and forgiving.
*
* XO: MW's ($name, $value) pair is split into three arguments
* @param name_uid int id of the parameter name (MW: String $name)
* @param val_bry parameter value as bytes (MW: mixed $value) -- presumably the raw text; confirm against callers
* @param val_int parameter value as an int (MW: mixed $value) -- presumably val_bry parsed to int; confirm against callers
* @return true to accept the parameter; false to reject it
*/
public abstract boolean validateParam(int name_uid, byte[] val_bry, int val_int);
/**
* Merge a parameter array into a String appropriate for inclusion in filenames
*
* XO: MW's parameter array is represented by a Xomw_params_handler; the String is returned as byte[]
* @param handlerParams parameters that have been through normaliseParams.
* @return the parameter String as bytes
*/
public abstract byte[] makeParamString(Xomw_params_handler handlerParams);
// /**
// * Parse a param String made with makeParamString back into an array
// *
// * @param String $str The parameter String without file name (e.g. 122px)
// * @return array|boolean Array of parameters or false on failure.
// */
// abstract public function parseParamString($str);
/**
* Changes the parameter array as necessary, ready for transformation.
* Should be idempotent.
* Returns false if the parameters are unacceptable and the transform should fail
*
* XO: MW's parameter array is represented by a Xomw_params_handler, which is modified in place
* @param image the file to be transformed
* @param handlerParams the parameters to normalise
* @return false if the parameters are unacceptable; true otherwise
*/
public abstract boolean normaliseParams(Xomw_File image, Xomw_params_handler handlerParams);
// /**
// * Get an image size array like that returned by getimagesize(), or false if it
// * can't be determined.
// *
// * This function is used for determining the width, height and bitdepth directly
// * from an image. The results are stored in the database in the img_width,
// * img_height, img_bits fields.
// *
// * @note If this is a multipage file, return the width and height of the
// * first page.
// *
// * @param File|FSFile $image The image Object, or false if there isn't one.
// * Warning, FSFile::getPropsFromPath might pass an FSFile instead of File (!)
// * @param String $path The filename
// * @return array|boolean Follow the format of PHP getimagesize() @gplx.Internal protected function.
// * See https://secure.php.net/getimagesize. MediaWiki will only ever use the
// * first two array keys (the width and height), and the 'bits' associative
// * key. All other array keys are ignored. Returning a 'bits' key is optional
// * as not all formats have a notion of "bitdepth". Returns false on failure.
// */
// abstract function getImageSize($image, $path);
//
// /**
// * Get handler-specific metadata which will be saved in the img_metadata field.
// *
// * @param File|FSFile $image The image Object, or false if there isn't one.
// * Warning, FSFile::getPropsFromPath might pass an FSFile instead of File (!)
// * @param String $path The filename
// * @return String A String of metadata in php serialized form (Run through serialize())
// */
// function getMetadata($image, $path) {
// return '';
// }
//
// /**
// * Get metadata version.
// *
// * This is not used for validating metadata, this is used for the api when returning
// * metadata, since api content formats should stay the same over time, and so things
// * using ForeignApiRepo can keep backwards compatibility
// *
// * All core media handlers share a common version number, and extensions can
// * use the GetMetadataVersion hook to append to the array (they should append a unique
// * String so not to get confusing). If there was a media handler named 'foo' with metadata
// * version 3 it might add to the end of the array the element 'foo=3'. if the core metadata
// * version is 2, the end version String would look like '2;foo=3'.
// *
// * @return String Version String
// */
// static function getMetadataVersion() {
// $version = [ '2' ]; // core metadata version
// Hooks::run('GetMetadataVersion', [ &$version ]);
//
// return implode(';', $version);
// }
//
// /**
// * Convert metadata version.
// *
// * By default just returns $metadata, but can be used to allow
// * media handlers to convert between metadata versions.
// *
// * @param String|array $metadata Metadata array (serialized if String)
// * @param int $version Target version
// * @return array Serialized metadata in specified version, or $metadata on fail.
// */
// function convertMetadataVersion($metadata, $version = 1) {
// if (!is_array($metadata)) {
//
// // unserialize to keep return parameter consistent.
// MediaWiki\suppressWarnings();
// $ret = unserialize($metadata);
// MediaWiki\restoreWarnings();
//
// return $ret;
// }
//
// return $metadata;
// }
//
// /**
// * Get a String describing the type of metadata, for display purposes.
// *
// * @note This method is currently unused.
// * @param File $image
// * @return String
// */
// function getMetadataType($image) {
// return false;
// }
//
// /**
// * Check if the metadata String is valid for this handler.
// * If it returns MediaHandler::METADATA_BAD (or false), Image
// * will reload the metadata from the file and update the database.
// * MediaHandler::METADATA_GOOD for if the metadata is a-ok,
// * MediaHandler::METADATA_COMPATIBLE if metadata is old but backwards
// * compatible (which may or may not trigger a metadata reload).
// *
// * @note Returning self::METADATA_BAD will trigger a metadata reload from
// * file on page view. Always returning this from a broken file, or suddenly
// * triggering as bad metadata for a large number of files can cause
// * performance problems.
// * @param File $image
// * @param String $metadata The metadata in serialized form
// * @return boolean
// */
// function isMetadataValid($image, $metadata) {
// return self::METADATA_GOOD;
// }
//
// /**
// * Get an array of standard (FormatMetadata type) metadata values.
// *
// * The returned data is largely the same as that from getMetadata(),
// * but formatted in a standard, stable, handler-independent way.
// * The idea being that some values like ImageDescription or Artist
// * are universal and should be retrievable in a handler generic way.
// *
// * The specific properties are the type of properties that can be
// * handled by the FormatMetadata class. These values are exposed to the
// * user via the filemetadata parser function.
// *
// * Details of the response format of this function can be found at
// * https://www.mediawiki.org/wiki/Manual:File_metadata_handling
// * tl/dr: the response is an associative array of
// * properties keyed by name, but the value can be complex. You probably
// * want to call one of the FormatMetadata::flatten* functions on the
// * property values before using them, or call
// * FormatMetadata::getFormattedData() on the full response array, which
// * transforms all values into prettified, human-readable text.
// *
// * Subclasses overriding this function must return a value which is a
// * valid API response fragment (all associative array keys are valid
// * XML tagnames).
// *
// * Note, if the file simply has no metadata, but the handler supports
// * this interface, it should return an empty array, not false.
// *
// * @param File $file
// * @return array|boolean False if interface not supported
// * @since 1.23
// */
// public function getCommonMetaArray(File $file) {
// return false;
// }
//
// /**
// * Get a MediaTransformOutput Object representing an alternate of the transformed
// * output which will call an intermediary thumbnail assist script.
// *
// * Used when the repository has a thumbnailScriptUrl option configured.
// *
// * Return false to fall back to the regular getTransform().
// * @param File $image
// * @param String $script
// * @param array $paramsVar
// * @return boolean|ThumbnailImage
// */
// function getScriptedTransform($image, $script, $paramsVar) {
// return false;
// }
/**
* Get a MediaTransformOutput Object representing the transformed output. Does not
* actually do the transform.
*
* @param File $image The image Object
* @param String $dstPath Filesystem destination path
* @param String $dstUrl Destination URL to use in output HTML
* @param array $paramsVar Arbitrary set of parameters validated by $this->validateParam()
* @return MediaTransformOutput
*/
public Xomw_MediaTransformOutput getTransform(Xomw_File image, byte[] dstPath, byte[] dstUrl, Xomw_params_handler handlerParams) {
	// Forward to the flag-taking doTransform overload, always passing TRANSFORM_LATER
	// (the flag that asks the handler to describe the output without doing the transform).
	final int flags = TRANSFORM_LATER;
	return doTransform(image, dstPath, dstUrl, handlerParams, flags);
}
/**
* Get a MediaTransformOutput Object representing the transformed output. Does the
* transform unless $flags contains self::TRANSFORM_LATER.
*
* @param File $image The image Object
* @param String $dstPath Filesystem destination path
* @param String $dstUrl Destination URL to use in output HTML
* @param array $paramsVar Arbitrary set of parameters validated by $this->validateParam()
* Note: These parameters have *not* gone through $this->normaliseParams()
* @param int $flags A bitfield, may contain self::TRANSFORM_LATER
* @return MediaTransformOutput
*/
public Xomw_MediaTransformOutput doTransform(Xomw_File image, byte[] dstPath, byte[] dstUrl, Xomw_params_handler handlerParams) {
	// Convenience overload: same as the 5-argument form with an empty flags bitfield (0).
	int noFlags = 0;
	return doTransform(image, dstPath, dstUrl, handlerParams, noFlags);
}
// Handler-specific transform; flags is a bitfield which may contain TRANSFORM_LATER.
public abstract Xomw_MediaTransformOutput doTransform(Xomw_File image, byte[] dstPath, byte[] dstUrl, Xomw_params_handler handlerParams, int flags);
// /**
// * Get the thumbnail extension and MIME type for a given source MIME type
// *
// * @param String $ext Extension of original file
// * @param String $mime MIME type of original file
// * @param array $paramsVar Handler specific rendering parameters
// * @return array Thumbnail extension and MIME type
// */
// function getThumbType($ext, $mime, $paramsVar = null) {
// $magic = MimeMagic::singleton();
// if (!$ext || $magic->isMatchingExtension($ext, $mime) === false) {
// // The extension is not valid for this MIME type and we do
// // recognize the MIME type
// $extensions = $magic->getExtensionsForType($mime);
// if ($extensions) {
// return [ strtok($extensions, ' '), $mime ];
// }
// }
//
// // The extension is correct (true) or the MIME type is unknown to
// // MediaWiki (null)
// return [ $ext, $mime ];
// }
//
// /**
// * Get useful response headers for GET/HEAD requests for a file with the given metadata
// *
// * @param mixed $metadata Result of the getMetadata() function of this handler for a file
// * @return array
// */
// public function getStreamHeaders($metadata) {
// return [];
// }
/**
* True if the handled types can be transformed
*
* @param File $file
* @return boolean
*/
@gplx.Virtual public boolean canRender(Xomw_File file) {
	// Base implementation does not inspect the file: every handled type is reported as transformable.
	// NOTE(review): @gplx.Virtual presumably marks this as an override point for subclasses; confirm.
	final boolean renderable = true;
	return renderable;
}
/**
* True if handled types cannot be displayed directly in a browser
* but can be rendered
*
* @param File $file
* @return boolean
*/
public boolean mustRender(Xomw_File file) {
	// Base implementation does not inspect the file: nothing is reported as
	// "cannot be displayed directly in a browser", so rendering is never forced.
	final boolean renderRequired = false;
	return renderRequired;
}
// /**
// * True if the type has multi-page capabilities
// *
// * @param File $file
// * @return boolean
// */
// public function isMultiPage($file) {
// return false;
// }
//
// /**
// * Page count for a multi-page document, false if unsupported or unknown
// *
// * @param File $file
// * @return boolean
// */
// function pageCount(File $file) {
// return false;
// }
//
// /**
// * The material is vectorized and thus scaling is lossless
// *
// * @param File $file
// * @return boolean
// */
// function isVectorized($file) {
// return false;
// }
//
// /**
// * The material is an image, and is animated.
// * In particular, video material need not return true.
// * @note Before 1.20, this was a method of ImageHandler only
// *
// * @param File $file
// * @return boolean
// */
// function isAnimatedImage($file) {
// return false;
// }
//
// /**
// * If the material is animated, we can animate the thumbnail
// * @since 1.20
// *
// * @param File $file
// * @return boolean If material is not animated, handler may return any value.
// */
// function canAnimateThumbnail($file) {
// return true;
// }
//
// /**
// * False if the handler is disabled for all files
// * @return boolean
// */
// function isEnabled() {
// return true;
// }
//
// /**
// * Get an associative array of page dimensions
// * Currently "width" and "height" are understood, but this might be
// * expanded in the future.
// * Returns false if unknown.
// *
// * It is expected that handlers for paged media (e.g. DjVuHandler)
// * will override this method so that it gives the correct results
// * for each specific page of the file, using the $page argument.
// *
// * @note For non-paged media, use getImageSize.
// *
// * @param File $image
// * @param int $page What page to get dimensions of
// * @return array|boolean
// */
// function getPageDimensions(File $image, $page) {
// $gis = $this->getImageSize($image, $image->getLocalRefPath());
// if ($gis) {
// return [
// 'width' => $gis[0],
// 'height' => $gis[1]
// ];
// } else {
// return false;
// }
// }
//
// /**
// * Generic getter for text layer.
// * Currently overloaded by PDF and DjVu handlers
// * @param File $image
// * @param int $page Page number to get information for
// * @return boolean|String Page text or false when no text found or if
// * unsupported.
// */
// function getPageText(File $image, $page) {
// return false;
// }
//
// /**
// * Get the text of the entire document.
// * @param File $file
// * @return boolean|String The text of the document or false if unsupported.
// */
// public function getEntireText(File $file) {
// $numPages = $file->pageCount();
// if (!$numPages) {
// // Not a multipage document
// return $this->getPageText($file, 1);
// }
// $document = '';
// for ($i = 1; $i <= $numPages; $i++) {
// $curPage = $this->getPageText($file, $i);
// if (is_string($curPage)) {
// $document .= $curPage . "\n";
// }
// }
// if ($document !== '') {
// return $document;
// }
// return false;
// }
//
// /**
// * Get an array structure that looks like this:
// *
// * [
// * 'visible' => [
// * 'Human-readable name' => 'Human readable value',
// * ...
// * ],
// * 'collapsed' => [
// * 'Human-readable name' => 'Human readable value',
// * ...
// * ]
// * ]
// * The UI will format this into a table where the visible fields are always
// * visible, and the collapsed fields are optionally visible.
// *
// * The function should return false if there is no metadata to display.
// */
//
// /**
// * @todo FIXME: This interface is not very flexible. The media handler
// * should generate HTML instead. It can do all the formatting according
// * to some standard. That makes it possible to do things like visual
// * indication of grouped and chained streams in ogg container files.
// * @param File $image
// * @param boolean|IContextSource $context Context to use (optional)
// * @return array|boolean
// */
// function formatMetadata($image, $context = false) {
// return false;
// }
//
// /** sorts the visible/invisible field.
// * Split off from ImageHandler::formatMetadata, as used by more than
// * one type of handler.
// *
// * This is used by the media handlers that use the FormatMetadata class
// *
// * @param array $metadataArray Metadata array
// * @param boolean|IContextSource $context Context to use (optional)
// * @return array Array for use displaying metadata.
// */
// function formatMetadataHelper($metadataArray, $context = false) {
// $result = [
// 'visible' => [],
// 'collapsed' => []
// ];
//
// $formatted = FormatMetadata::getFormattedData($metadataArray, $context);
// // Sort fields into visible and collapsed
// $visibleFields = $this->visibleMetadataFields();
// foreach ($formatted as $name => $value) {
// $tag = strtolower($name);
// self::addMeta($result,
// in_array($tag, $visibleFields) ? 'visible' : 'collapsed',
// 'exif',
// $tag,
// $value
// );
// }
//
// return $result;
// }
//
// /**
// * Get a list of metadata items which should be displayed when
// * the metadata table is collapsed.
// *
// * @return array Array of strings
// */
// protected function visibleMetadataFields() {
// return FormatMetadata::getVisibleFields();
// }
//
// /**
// * This is used to generate an array element for each metadata value
// * That array is then used to generate the table of metadata values
// * on the image page
// *
// * @param array &$array An array containing elements for each type of visibility
// * and each of those elements being an array of metadata items. This function adds
// * a value to that array.
// * @param String $visibility ('visible' or 'collapsed') if this value is hidden
// * by default.
// * @param String $type Type of metadata tag (currently always 'exif')
// * @param String $id The name of the metadata tag (like 'artist' for example).
// * its name in the table displayed is the message "$type-$id" (Ex exif-artist).
// * @param String $value Thingy goes into a wikitext table; it used to be escaped but
// * that was incompatible with previous practise of customized display
// * with wikitext formatting via messages such as 'exif-model-value'.
// * So the escaping is taken back out, but generally this seems a confusing
// * interface.
// * @param boolean|String $param Value to pass to the message for the name of the field
// * as $1. Currently this parameter doesn't seem to ever be used.
// *
// * Note, everything here is passed through the parser later on (!)
// */
// protected static function addMeta(&$array, $visibility, $type, $id, $value, $param = false) {
// $msg = wfMessage("$type-$id", $param);
// if ($msg->exists()) {
// $name = $msg->text();
// } else {
// // This is for future compatibility when using instant commons.
// // So as to not display as ugly a name if a new metadata
// // property is defined that we don't know about
// // (not a major issue since such a property would be collapsed
// // by default).
// wfDebug(__METHOD__ . ' Unknown metadata name: ' . $id . "\n");
// $name = wfEscapeWikiText($id);
// }
// $array[$visibility][] = [
// 'id' => "$type-$id",
// 'name' => $name,
// 'value' => $value
// ];
// }
//
// /**
// * Short description. Shown on Special:Search results.
// *
// * @param File $file
// * @return String
// */
// function getShortDesc($file) {
// return self::getGeneralShortDesc($file);
// }
//
// /**
// * Long description. Shown under image on image description page surrounded by ().
// *
// * @param File $file
// * @return String
// */
// function getLongDesc($file) {
// return self::getGeneralLongDesc($file);
// }
//
// /**
// * Used instead of getShortDesc if there is no handler registered for file.
// *
// * @param File $file
// * @return String
// */
// static function getGeneralShortDesc($file) {
// global $wgLang;
//
// return htmlspecialchars($wgLang->formatSize($file->getSize()));
// }
//
// /**
// * Used instead of getLongDesc if there is no handler registered for file.
// *
// * @param File $file
// * @return String
// */
// static function getGeneralLongDesc($file) {
// return wfMessage('file-info')->sizeParams($file->getSize())
// ->paramsVar('<span class="mime-type">' . $file->getMimeType() . '</span>')->parse();
// }
/**
* Calculate the largest thumbnail width for a given original file size
* such that the thumbnail's height is at most $maxHeight.
* @param int $boxWidth Width of the thumbnail box.
* @param int $boxHeight Height of the thumbnail box.
* @param int $maxHeight Maximum height expected for the thumbnail.
* @return int
*/
public static int fitBoxWidth(int boxWidth, int boxHeight, int maxHeight) {
	// XO.MW: PHP "/" is floating-point division. Force double arithmetic here; with plain
	// int operands the quotient is truncated before Ceil_as_int ever sees the fraction
	// (e.g. 3 * 5 / 2 gave 7 instead of 7.5) and the int product could overflow.
	double idealWidth = (double)boxWidth * maxHeight / boxHeight;
	int roundedUp = Math_.Ceil_as_int(idealWidth);
	// Rounding the width up can push the scaled height above maxHeight; in that case round down.
	// The height check must also be done in double, else the comparison is made on a truncated value.
	// NOTE(review): a boxWidth / boxHeight of 0 is not guarded (same as the PHP original);
	// callers are expected to pass positive dimensions. Confirm against callers.
	if (Math_.Round((double)roundedUp * boxHeight / boxWidth, 0) > maxHeight) {
		return Math_.Floor_as_int(idealWidth);
	} else {
		return roundedUp;
	}
}
// /**
// * Shown in file history box on image description page.
// *
// * @param File $file
// * @return String Dimensions
// */
// function getDimensionsString($file) {
// return '';
// }
//
// /**
// * Modify the parser Object post-transform.
// *
// * This is often used to do $parser->addOutputHook(),
// * in order to add some javascript to render a viewer.
// * See TimedMediaHandler or OggHandler for an example.
// *
// * @param Parser $parser
// * @param File $file
// */
// function parserTransformHook($parser, $file) {
// }
//
// /**
// * File validation hook called on upload.
// *
// * If the file at the given local path is not valid, or its MIME type does not
// * match the handler class, a Status Object should be returned containing
// * relevant errors.
// *
// * @param String $fileName The local path to the file.
// * @return Status
// */
// function verifyUpload($fileName) {
// return Status::newGood();
// }
//
// /**
// * Check for zero-sized thumbnails. These can be generated when
// * no disk space is available or some other error occurs
// *
// * @param String $dstPath The location of the suspect file
// * @param int $retval Return value of some shell process, file will be deleted if this is non-zero
// * @return boolean True if removed, false otherwise
// */
// function removeBadFile($dstPath, $retval = 0) {
// if (file_exists($dstPath)) {
// $thumbstat = stat($dstPath);
// if ($thumbstat['size'] == 0 || $retval != 0) {
// $result = unlink($dstPath);
//
// if ($result) {
// wfDebugLog('thumbnail',
// sprintf('Removing bad %d-byte thumbnail "%s". unlink() succeeded',
// $thumbstat['size'], $dstPath));
// } else {
// wfDebugLog('thumbnail',
// sprintf('Removing bad %d-byte thumbnail "%s". unlink() failed',
// $thumbstat['size'], $dstPath));
// }
//
// return true;
// }
// }
//
// return false;
// }
//
// /**
// * Remove files from the purge list.
// *
// * This is used by some video handlers to prevent ?action=purge
// * from removing a transcoded video, which is expensive to
// * regenerate.
// *
// * @see LocalFile::purgeThumbnails
// *
// * @param array $files
// * @param array $options Purge options. Currently will always be
// * an array with a single key 'forThumbRefresh' set to true.
// */
// public function filterThumbnailPurgeList(&$files, $options) {
// // Do nothing
// }
//
// /**
// * True if the handler can rotate the media
// * @since 1.24 non-static. From 1.21-1.23 was static
// * @return boolean
// */
// public function canRotate() {
// return false;
// }
//
// /**
// * On supporting image formats, try to read out the low-level orientation
// * of the file and return the angle that the file needs to be rotated to
// * be viewed.
// *
// * This information is only useful when manipulating the original file;
// * the width and height we normally work with is logical, and will match
// * any produced output views.
// *
// * For files we don't know, we return 0.
// *
// * @param File $file
// * @return int 0, 90, 180 or 270
// */
// public function getRotation($file) {
// return 0;
// }
//
// /**
// * Log an error that occurred in an external process
// *
// * Moved from BitmapHandler to MediaHandler with MediaWiki 1.23
// *
// * @since 1.23
// * @param int $retval
// * @param String $err Error reported by command. Anything longer than
// * MediaHandler::MAX_ERR_LOG_SIZE is stripped off.
// * @param String $cmd
// */
// protected function logErrorForExternalProcess($retval, $err, $cmd) {
// # Keep error output limited (bug 57985)
// $errMessage = trim(substr($err, 0, self::MAX_ERR_LOG_SIZE));
//
// wfDebugLog('thumbnail',
// sprintf('thumbnail failed on %s: error %d "%s" from "%s"',
// wfHostname(), $retval, $errMessage, $cmd));
// }
//
// /**
// * Get list of languages file can be viewed in.
// *
// * @param File $file
// * @return String[] Array of language codes, or empty array if unsupported.
// * @since 1.23
// */
// public function getAvailableLanguages(File $file) {
// return [];
// }
//
// /**
// * On file types that support renderings in multiple languages,
// * which language is used by default if unspecified.
// *
// * If getAvailableLanguages returns a non-empty array, this must return
// * a valid language code. Otherwise can return null if files of this
// * type do not support alternative language renderings.
// *
// * @param File $file
// * @return String|null Language code or null if multi-language not supported for filetype.
// * @since 1.23
// */
// public function getDefaultRenderLanguage(File $file) {
// return null;
// }
//
// /**
// * If it's an audio file, return the length of the file. Otherwise 0.
// *
// * File::getLength() existed for a long time, but was calling a method
// * that only existed in some subclasses of this class (The TMH ones).
// *
// * @param File $file
// * @return float length in seconds
// * @since 1.23
// */
// public function getLength($file) {
// return 0.0;
// }
//
// /**
// * True if creating thumbnails from the file is large or otherwise resource-intensive.
// * @param File $file
// * @return boolean
// */
// public function isExpensiveToThumbnail($file) {
// return false;
// }
//
// /**
// * Returns whether or not this handler supports the chained generation of thumbnails according
// * to buckets
// * @return boolean
// * @since 1.24
// */
// public function supportsBucketing() {
// return false;
// }
//
// /**
// * Returns a normalised paramsVar array for which parameters have been cleaned up for bucketing
// * purposes
// * @param array $paramsVar
// * @return array
// */
// public function sanitizeParamsForBucketing($paramsVar) {
// return $paramsVar;
// }
//
// /**
// * Gets configuration for the file warning message. Return value of
// * the following structure:
// * [
// * // Required, module with messages loaded for the client
// * 'module' => 'example.filewarning.messages',
// * // Required, array of names of messages
// * 'messages' => [
// * // Required, main warning message
// * 'main' => 'example-filewarning-main',
// * // Optional, header for warning dialog
// * 'header' => 'example-filewarning-header',
// * // Optional, footer for warning dialog
// * 'footer' => 'example-filewarning-footer',
// * // Optional, text for more-information link (see below)
// * 'info' => 'example-filewarning-info',
// * ],
// * // Optional, link for more information
// * 'link' => 'http://example.com',
// * ]
// *
// * Returns null if no warning is necessary.
// * @param File $file
// * @return array|null
// */
// public function getWarningConfig($file) {
// return null;
// }
}

View File

@@ -0,0 +1,63 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.media; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
// XO.MW:MW has registry and instance cache; XO only has instance
// XO.MW:SYNC:1.29; DATE:2017-02-05
public class Xomw_MediaHandlerFactory {
	// MIME type (byte[] key) -> handler instance; filled once by the constructor
	private final Hash_adp_bry handlers = Hash_adp_bry.cs();
	// XO.MW:SYNC:1.29; DATE:2017-02-05
	public Xomw_MediaHandlerFactory() {
		// Default, MediaWiki core media handlers
		// NOTE: only image/png is registered so far; the other MW mappings are kept as comments for reference
		// 'image/jpeg' => JpegHandler::class,
		handlers.Add(Mime__image__png, new Xomw_TransformationalImageHandler(Mime__image__png)); // PngHandler
		// 'image/gif' => GIFHandler::class,
		// 'image/tiff' => TiffHandler::class,
		// 'image/webp' => WebPHandler::class,
		// 'image/x-ms-bmp' => BmpHandler::class,
		// 'image/x-bmp' => BmpHandler::class,
		// 'image/x-xcf' => XCFHandler::class,
		// 'image/svg+xml' => SvgHandler::class, // official
		// 'image/svg' => SvgHandler::class, // compat
		// 'image/vnd.djvu' => DjVuHandler::class, // official
		// 'image/x.djvu' => DjVuHandler::class, // compat
		// 'image/x-djvu' => DjVuHandler::class, // compat
	}
	// Look up the handler registered for the given MIME type.
	// NOTE(review): for an unregistered type the result is whatever handlers.Get_by yields (presumably null); confirm
	// XO.MW:SYNC:1.29; DATE:2017-02-05
	public Xomw_MediaHandler getHandler(byte[] type) {
		return (Xomw_MediaHandler)handlers.Get_by(type);
	}
	// MIME type keys. Declared final so that the shared references cannot be reassigned by other code.
	public static final byte[]
	  Mime__image__jpeg = Bry_.new_a7("image/jpeg")
	, Mime__image__png = Bry_.new_a7("image/png")
	, Mime__image__gif = Bry_.new_a7("image/gif")
	, Mime__image__tiff = Bry_.new_a7("image/tiff")
	, Mime__image__webp = Bry_.new_a7("image/webp")
	, Mime__image__x_ms_bmp = Bry_.new_a7("image/x-ms-bmp")
	, Mime__image__x_bmp = Bry_.new_a7("image/x-bmp")
	, Mime__image__x_xcf = Bry_.new_a7("image/x-xcf")
	, Mime__image__svg_xml = Bry_.new_a7("image/svg+xml")
	, Mime__image__svg = Bry_.new_a7("image/svg")
	, Mime__image__vnd_djvu = Bry_.new_a7("image/vnd.djvu")
	, Mime__image__x_djvu_dot = Bry_.new_a7("image/x.djvu")
	, Mime__image__x_djvu_dash = Bry_.new_a7("image/x-djvu")
	;
}

View File

@@ -0,0 +1,281 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.media; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import gplx.langs.htmls.*;
import gplx.xowa.mediawiki.includes.utls.*;
import gplx.xowa.mediawiki.includes.parsers.lnkis.*;
import gplx.xowa.mediawiki.includes.filerepo.file.*;
public abstract class Xomw_MediaTransformOutput {
	/**
	 * Capture the file, the thumb URL and the size of the output box.
	 * Note: path is not stored at present; the matching $path member is still commented out below.
	 */
	public Xomw_MediaTransformOutput(Xomw_File file, byte[] url, byte[] path, int width, int height) {
		this.width = width;
		this.height = height;
		this.url = url;
		this.file = file;
	}
	// /** @var array Associative array mapping optional supplementary image files
	// * from pixel density (eg 1.5 or 2) to additional URLs.
	// */
	// public $responsiveUrls = [];
	/** File supplied to the constructor; its title is used for the description-link href */
	private final Xomw_File file;
	/** Width of the output box; exposed through getWidth() */
	protected final int width;
	/** Height of the output box; exposed through getHeight() */
	protected final int height;
	/** URL path to the thumb */
	protected final byte[] url;
	// /** @var boolean|String */
	// protected $page;
	//
	// /** @var boolean|String Filesystem path to the thumb */
	// protected $path;
	//
	// /** @var boolean|String Language code, false if not set */
	// protected $lang;
	//
	// /** @var boolean|String Permanent storage path */
	// protected $storagePath = false;
	/**
	 * @return int The width given at construction (width of the output box)
	 */
	public int getWidth() {return width;}
	/**
	 * @return int The height given at construction (height of the output box)
	 */
	public int getHeight() {return height;}
	// /**
	// * @return File
	// */
	// public function getFile() {
	// return $this->file;
	// }
	//
	// /**
	// * Get the final extension of the thumbnail.
	// * Returns false for scripted transformations.
	// * @return String|boolean
	// */
	// public function getExtension() {
	// return $this->path ? FileBackend::extensionFromPath( $this->path ) : false;
	// }
	//
	// /**
	// * @return String|boolean The thumbnail URL
	// */
	// public function getUrl() {
	// return $this->url;
	// }
	//
	// /**
	// * @return String|boolean The permanent thumbnail storage path
	// */
	// public function getStoragePath() {
	// return $this->storagePath;
	// }
	//
	// /**
	// * @param String $storagePath The permanent storage path
	// * @return void
	// */
	// public function setStoragePath( $storagePath ) {
	// $this->storagePath = $storagePath;
	// if ( $this->path === false ) {
	// $this->path = $storagePath;
	// }
	// }
	/**
	 * Produce the HTML for this transform output.
	 * XO.MW: the PHP version returns a String; this signature returns nothing and
	 * takes the buffers bfr and tmp instead.
	 * NOTE(review): bfr is presumably the destination and tmp a scratch buffer; confirm in subclasses.
	 *
	 * @param options Options for the output. Boolean options
	 * should be indicated with a value of true for true, and false or
	 * absent for false.
	 *
	 * alt Alternate text or caption
	 * desc-link Boolean, show a description link
	 * file-link Boolean, show a file download link
	 * custom-url-link Custom URL to link to
	 * custom-title-link Custom Title Object to link to
	 * valign vertical-align property, if the output is an inline element
	 * img-class Class applied to the "<img>" tag, if there is such a tag
	 *
	 * For images, desc-link and file-link are implemented as a click-through. For
	 * sounds and videos, they may be displayed in other ways.
	 */
	public abstract void toHtml(Bry_bfr bfr, Bry_bfr tmp, Xomw_params_mto options);
	// /**
	// * This will be overridden to return true in error classes
	// * @return boolean
	// */
	// public function isError() {
	// return false;
	// }
	//
	// /**
	// * Check if an output thumbnail file actually exists.
	// *
	// * This will return false if there was an error, the
	// * thumbnail is to be handled client-side only, or if
	// * transformation was deferred via TRANSFORM_LATER.
	// * This file may exist as a new file in /tmp, a file
	// * in permanent storage, or even refer to the original.
	// *
	// * @return boolean
	// */
	// public function hasFile() {
	// // If TRANSFORM_LATER, $this->path will be false.
	// // Note: a null path means "use the source file".
	// return ( !$this->isError() && ( $this->path || $this->path === null ) );
	// }
	//
	// /**
	// * Check if the output thumbnail is the same as the source.
	// * This can occur if the requested width was bigger than the source.
	// *
	// * @return boolean
	// */
	// public function fileIsSource() {
	// return ( !$this->isError() && $this->path === null );
	// }
	//
	// /**
	// * Get the path of a file system copy of the thumbnail.
	// * Callers should never write to this path.
	// *
	// * @return String|boolean Returns false if there isn't one
	// */
	// public function getLocalCopyPath() {
	// if ( $this->isError() ) {
	// return false;
	// } elseif ( $this->path === null ) {
	// return $this->file->getLocalRefPath(); // assume thumb was not scaled
	// } elseif ( FileBackend::isStoragePath( $this->path ) ) {
	// $be = $this->file->getRepo()->getBackend();
	// // The temp file will be process cached by FileBackend
	// $fsFile = $be->getLocalReference( [ 'src' => $this->path ] );
	//
	// return $fsFile ? $fsFile->getPath() : false;
	// } else {
	// return $this->path; // may return false
	// }
	// }
	//
	// /**
	// * Stream the file if there were no errors
	// *
	// * @param array $headers Additional HTTP headers to send on success
	// * @return Status
	// * @since 1.27
	// */
	// public function streamFileWithStatus( $headers = [] ) {
	// if ( !$this->path ) {
	// return Status::newFatal( 'backend-fail-stream', '<no path>' );
	// } elseif ( FileBackend::isStoragePath( $this->path ) ) {
	// $be = $this->file->getRepo()->getBackend();
	// return $be->streamFile( [ 'src' => $this->path, 'headers' => $headers ] );
	// } else { // FS-file
	// $success = StreamFile::stream( $this->getLocalCopyPath(), $headers );
	// return $success ? Status::newGood() : Status::newFatal( 'backend-fail-stream', $this->path );
	// }
	// }
	//
	// /**
	// * Stream the file if there were no errors
	// *
	// * @deprecated since 1.26, use streamFileWithStatus
	// * @param array $headers Additional HTTP headers to send on success
	// * @return boolean Success
	// */
	// public function streamFile( $headers = [] ) {
	// $this->streamFileWithStatus( $headers )->isOK();
	// }
	//
	// /**
	// * Wrap some XHTML text in an anchor tag with the given attributes
	// *
	// * @param array $linkAttribs
	// * @param String $contents
	// * @return String
	// */
	// protected function linkWrap( $linkAttribs, $contents ) {
	// if ( $linkAttribs ) {
	// return Xml::tags( 'a', $linkAttribs, $contents );
	// } else {
	// return $contents;
	// }
	// }
	/**
	 * Fill attribs with the attributes of a description link: href, class and,
	 * when one is given, title.
	 * XO.MW: the PHP version returns a new array; here the caller's list is cleared and refilled.
	 *
	 * @param attribs List which is cleared and then receives the attribute key / value pairs
	 * @param title Value for the title attribute; no title attribute is added when null
	 * @param prms Query parameters to add; not read yet (the query handling below is still commented out)
	 */
	public void getDescLinkAttribs(List_adp attribs, byte[] title, List_adp prms) {
		// if ( is_array( prms ) ) {
		// $query = prms;
		// } else {
		// $query = [];
		// }
		// if ( $this->page && $this->page !== 1 ) {
		// $query['page'] = $this->page;
		// }
		// if ( $this->lang ) {
		// $query['lang'] = $this->lang;
		// }
		//
		// if ( is_string( prms ) && prms !== '' ) {
		// $query = prms . '&' . wfArrayToCgi( $query );
		// }
		attribs.Clear();
		// 'href' => $this->file->getTitle()->getLocalURL( $query ),
		attribs.Add_many(Gfh_atr_.Bry__href, this.file.getTitle());
		attribs.Add_many(Gfh_atr_.Bry__class, Bry__class__image);
		// title is optional: nothing more to add without one
		if (title == null) return;
		attribs.Add_many(Gfh_atr_.Bry__title, title);
	}
	// Write contents to bfr, enclosed in an anchor tag built from link_attribs;
	// when link_attribs is null, contents is appended as-is
	// XO.MW:SYNC:1.29; DATE:2017-02-03
	protected void Link_wrap(Bry_bfr bfr, List_adp link_attribs, byte[] contents) {
		if (link_attribs == null) {
			bfr.Add(contents);
			return;
		}
		Xomw_xml.Tags(bfr, Gfh_tag_.Bry__a, link_attribs, contents);
	}
	// value of the class attribute added by getDescLinkAttribs
	private static final byte[] Bry__class__image = Bry_.new_a7("image");
}

View File

@@ -0,0 +1,214 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.media; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import gplx.langs.htmls.*;
import gplx.xowa.mediawiki.includes.utls.*;
import gplx.xowa.mediawiki.includes.parsers.lnkis.*;
import gplx.xowa.mediawiki.includes.filerepo.file.*;
// Media transform output for images
public class Xomw_ThumbnailImage extends Xomw_MediaTransformOutput { private final List_adp attribs = List_adp_.New(), link_attribs = List_adp_.New();
public Xomw_ThumbnailImage(Xomw_File file, byte[] url, byte[] path, int w, int h) {super(file, url, path, w, h);
}
/**
* Get a thumbnail Object from a file and parameters.
* If path is set to null, the output file is treated as a source copy.
* If path is set to false, no output file will be created.
* parameters should include, as a minimum, (file) 'width' and 'height'.
* It may also include a 'page' parameter for multipage files.
*
* @param File file
* @param String url URL path to the thumb
* @param String|boolean path Filesystem path to the thumb
* @param array parameters Associative array of parameters
*/
public Xomw_ThumbnailImage(Xomw_File file, byte[] url, byte[] path, Xomw_params_handler parameters) {
	// Only width and height are taken from parameters; they are forwarded to the base constructor.
	super(file, url, path, parameters.width, parameters.height);

	// The remainder of the MediaWiki constructor is not ported and is kept below for reference.
	// defaults = [
	// 'page' => false,
	// 'lang' => false
	// ];
	//
	// if (is_array(parameters)) {
	// actualParams = parameters + defaults;
	// } else {
	// // Using old format, should convert. Later a warning could be added here.
	// numArgs = func_num_args();
	// actualParams = [
	// 'width' => path,
	// 'height' => parameters,
	// 'page' => (numArgs > 5) ? func_get_arg(5) : false
	// ] + defaults;
	// path = (numArgs > 4) ? func_get_arg(4) : false;
	// }
	// this->file = file;
	// this->url = url;
	// this->path = path;
	// These should be integers when they get here.
	// If not, there's a bug somewhere. But let's at
	// least produce valid HTML code regardless.
	// this->width = round(actualParams['width']);
	// this->height = round(actualParams['height']);
	// this->page = actualParams['page'];
	// this->lang = actualParams['lang'];
}
/**
* Return HTML <img ... /> tag for the thumbnail, will include
* width and height attributes and a blank alt text (as required).
*
* @param array options Associative array of options. Boolean options
* should be indicated with a value of true for true, and false or
* absent for false.
*
* alt HTML alt attribute
* title HTML title attribute
* desc-link Boolean, show a description link
* file-link Boolean, show a file download link
* valign vertical-align property, if the output is an inline element
* img-class Class applied to the \<img\> tag, if there is such a tag
* desc-query String, description link query prms
* override-width Override width attribute. Should generally not set
* override-height Override height attribute. Should generally not set
* no-dimensions Boolean, skip width and height attributes (useful if
* set in CSS)
* custom-url-link Custom URL to link to
* custom-title-link Custom Title Object to link to
* custom-target-link Value of the target attribute, for custom-url-link
* parser-extlink-* Attributes added by parser for external links:
* parser-extlink-rel: add rel="nofollow"
* parser-extlink-target: link target, but overridden by custom-target-link
*
* For images, desc-link and file-link are implemented as a click-through. For
* sounds and videos, they may be displayed in other ways.
*
* @throws MWException
* @return String
*/
// Return HTML <img ... /> tag for the thumbnail, will include
// width and height attributes and a blank alt text (as required).
//
// @param array options Associative array of options. Boolean options
// should be indicated with a value of true for true, and false or
// absent for false.
//
// alt HTML alt attribute
// title HTML title attribute
// desc-link Boolean, show a description link
// file-link Boolean, show a file download link
// valign vertical-align property, if the output is an inline element
// img-class Class applied to the \<img\> tag, if there is such a tag
// desc-query String, description link query prms
// override-width Override width attribute. Should generally not set
// override-height Override height attribute. Should generally not set
// no-dimensions Boolean, skip width and height attributes (useful if
// set in CSS)
// custom-url-link Custom URL to link to
// custom-title-link Custom Title Object to link to
// custom-target-link Value of the target attribute, for custom-url-link
// parser-extlink-* Attributes added by parser for external links:
// parser-extlink-rel: add rel="nofollow"
// parser-extlink-target: link target, but overridden by custom-target-link
//
// For images, desc-link and file-link are implemented as a click-through. For
// sounds and videos, they may be displayed in other ways.
// XO.MW:SYNC:1.29; DATE:2017-02-03
@Override public void toHtml(Bry_bfr bfr, Bry_bfr tmp, Xomw_params_mto options) {
	// Builds the <img> element into tmp, then passes it to Link_wrap together with bfr and
	// the link attributes selected below (null when the image is not linked).
	byte[] alt = options.alt;
	// byte[] query = options.desc_query;
	attribs.Clear();
	attribs.Add_many(Gfh_atr_.Bry__alt, alt);
	attribs.Add_many(Gfh_atr_.Bry__src, url);

	// link_attribs is a reused field; clear it up front so that branches which do not
	// populate it (the unported ones below) never emit attributes left over from a previous call
	link_attribs.Clear();
	boolean link_attribs_is_null = false;
	if (!Php_utl_.empty(options.custom_url_link)) {
		link_attribs.Add_many(Gfh_atr_.Bry__href, options.custom_url_link);
		if (!Php_utl_.empty(options.title)) {
			link_attribs.Add_many(Gfh_atr_.Bry__title, options.title);
		}
		// target: custom-target-link wins over parser-extlink-target; each is only added when it has a value
		if (!Php_utl_.empty(options.custom_target_link)) {
			link_attribs.Add_many(Gfh_atr_.Bry__target, options.custom_target_link);
		}
		else if (!Php_utl_.empty(options.parser_extlink_target)) {
			link_attribs.Add_many(Gfh_atr_.Bry__target, options.parser_extlink_target);
		}
		// rel: only added when the parser supplied one
		if (!Php_utl_.empty(options.parser_extlink_rel)) {
			link_attribs.Add_many(Gfh_atr_.Bry__rel, options.parser_extlink_rel);
		}
	}
	else if (!Php_utl_.empty(options.custom_title_link)) {
		// TODO: not ported; link_attribs stays empty in this branch
		// byte[] title = options.custom_title_link;
		// link_attribs.Clear();
		// link_attribs.Add_many(Gfh_atr_.Bry__href, title.Get_link_url());
		// byte[] options_title = options.title;
		// link_attribs.Add_many(Gfh_atr_.Bry__title, Php_utl_.empty(options_title) ? title.Get_full_text() : options_title);
	}
	else if (!Php_utl_.empty(options.desc_link)) {
		// link_attribs = this.getDescLinkAttribs(
		// empty(options['title']) ? null : options['title'],
		// $query
		// );
		this.getDescLinkAttribs(link_attribs,
			Php_utl_.empty(options.title) ? null : options.title,
			null);
	}
	else if (!Php_utl_.empty(options.file_link)) {
		// TODO: not ported; link_attribs stays empty in this branch
		// link_attribs.Clear();
		// link_attribs.Add_many(Gfh_atr_.Bry__href, file.Get_url());
	}
	else {
		// no link at all: the title goes on the <img> itself
		link_attribs_is_null = true;
		if (!Php_utl_.empty(options.title)) {
			attribs.Add_many(Gfh_atr_.Bry__title, options.title);
		}
	}

	// Dimensions. MediaWiki stores attributes in a keyed array, so override-width / override-height
	// replace the default value in place. attribs is a list, so pick the value before adding
	// to avoid emitting the same attribute twice.
	boolean has_override_width = Php_utl_.isset(options.override_width);
	boolean has_override_height = Php_utl_.isset(options.override_height);
	boolean show_dimensions = Php_utl_.empty(options.no_dimensions);	// no-dimensions means "skip width and height"
	if (show_dimensions) {
		if (has_override_width) {
			attribs.Add_many(Gfh_atr_.Bry__width, options.override_width);
		}
		else {
			attribs.Add_many(Gfh_atr_.Bry__width, Int_.To_bry(width));
		}
		if (has_override_height) {
			attribs.Add_many(Gfh_atr_.Bry__height, options.override_height);
		}
		else {
			attribs.Add_many(Gfh_atr_.Bry__height, Int_.To_bry(height));
		}
	}
	if (!Php_utl_.empty(options.valign)) {
		attribs.Add_many(Gfh_atr_.Bry__style, Bry_.Add(Bry__vertical_align, options.valign));
	}
	if (!Php_utl_.empty(options.img_cls)) {
		attribs.Add_many(Gfh_atr_.Bry__class, options.img_cls);
	}
	if (!show_dimensions) {
		// overrides still apply when the default dimensions are skipped
		if (has_override_height) {
			attribs.Add_many(Gfh_atr_.Bry__height, options.override_height);
		}
		if (has_override_width) {
			attribs.Add_many(Gfh_atr_.Bry__width, options.override_width);
		}
	}
	// Additional densities for responsive images, if specified.
	// If any of these urls is the same as src url, it'll be excluded.
	// $responsiveUrls = array_diff(this.responsiveUrls, [ this.url ]);
	// if (!Php_utl_.empty($responsiveUrls)) {
	// $attribs['srcset'] = Html::srcSet($responsiveUrls);
	// }
	// XO.MW.HOOK:ThumbnailBeforeProduceHTML
	Xomw_xml.Element(tmp, Gfh_tag_.Bry__img, attribs, Bry_.Empty, Bool_.Y);
	Link_wrap(bfr, link_attribs_is_null ? null : link_attribs, tmp.To_bry_and_clear());
}
private static final byte[] Bry__vertical_align = Bry_.new_a7("vertical-align: ");
}

View File

@@ -0,0 +1,611 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.media; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import gplx.xowa.mediawiki.includes.filerepo.file.*;
import gplx.xowa.mediawiki.includes.parsers.lnkis.*;
public class Xomw_TransformationalImageHandler extends Xomw_ImageHandler { public Xomw_TransformationalImageHandler(byte[] key) {super(key);
}
/**
* @param File image
* @param array paramsVar Transform parameters. Entries with the keys 'width'
* and 'height' are the respective screen width and height, while the keys
* 'physicalWidth' and 'physicalHeight' indicate the thumbnail dimensions.
* @return boolean
*/
@Override public boolean normaliseParams(Xomw_File image, Xomw_params_handler prms) {
	// The base handler validates first; a false result is passed straight back to the caller.
	boolean base_ok = super.normaliseParams(image, prms);
	if (!base_ok) {
		return false;
	}

	// Source dimensions for the requested page, before any rotation
	int sourceWidth = image.getWidth(prms.page);
	int sourceHeight = image.getHeight(prms.page);

	// Requested physical width is below the source width: leave the parameters as they are
	boolean reaches_source = prms.physicalWidth >= sourceWidth;
	if (!reaches_source) {
		return true;
	}

	// Never produce a thumbnail larger than the source: clamp both physical dimensions
	prms.physicalWidth = sourceWidth;
	prms.physicalHeight = sourceHeight;

	// MediaWiki skips its scaling-limit checks when no rendering is required.
	// No such checks follow in this port, so both outcomes return true.
	if (!image.mustRender()) {
		return true;
	}
	return true;
}
// /**
// * Extracts the width/height if the image will be scaled before rotating
// *
// * This will match the physical size/aspect ratio of the original image
// * prior to application of the rotation -- so for a portrait image that's
// * stored as raw landscape with 90-degress rotation, the resulting size
// * will be wider than it is tall.
// *
// * @param array paramsVar Parameters as returned by normaliseParams
// * @param int rotation The rotation angle that will be applied
// * @return array (width, height) array
// */
// public function extractPreRotationDimensions(paramsVar, rotation) {
// if (rotation == 90 || rotation == 270) {
// // We'll resize before rotation, so swap the dimensions again
// width = paramsVar['physicalHeight'];
// height = paramsVar['physicalWidth'];
// } else {
// width = paramsVar['physicalWidth'];
// height = paramsVar['physicalHeight'];
// }
//
// return [ width, height ];
// }
//
/**
* Create a thumbnail.
*
* This sets up various parameters, and then calls a helper method
* based on this.getScalerType in order to scale the image.
*
* @param File image
* @param String dstPath
* @param String dstUrl
* @param array paramsVar
* @param int flags
* @return MediaTransformError|ThumbnailImage|TransformParameterError
*/
@Override public Xomw_MediaTransformOutput doTransform(Xomw_File image, byte[] dstPath, byte[] dstUrl, Xomw_params_handler prms, int flags) {
	// NOTE: only part of the MediaWiki method is ported. The live code collects the scaler
	// parameters and returns either a client-scaled thumbnail (source URL) or a thumbnail
	// pointing at dstUrl. flags is not read by the live code. Unported PHP is kept as comments.

	// if (!this.normaliseParams(image, paramsVar)) {
	// return new TransformParameterError(paramsVar);
	// }
	//
	// // Create a parameter array to pass to the scaler
	Xomw_params_scalar scalerParams = new Xomw_params_scalar();

	// // The size to which the image will be resized
	scalerParams.physicalWidth = prms.physicalWidth;
	scalerParams.physicalHeight = prms.physicalHeight;
	// 'physicalDimensions' => "{paramsVar['physicalWidth']}x{paramsVar['physicalHeight']}",

	// The size of the image on the page
	scalerParams.clientWidth = prms.width;
	scalerParams.clientHeight = prms.height;

	// Comment as will be added to the Exif of the thumbnail
	// 'comment' => isset(paramsVar['descriptionUrl'])
	// ? "File source: {paramsVar['descriptionUrl']}"
	// : '',

	// Properties of the original image
	scalerParams.srcWidth = image.getWidth();
	scalerParams.srcHeight = image.getHeight();
	scalerParams.mimeType = image.getMimeType();
	scalerParams.dstPath = dstPath;
	scalerParams.dstUrl = dstUrl;
	// 'interlace' => isset(paramsVar['interlace']) ? paramsVar['interlace'] : false,
	// if (isset(paramsVar['quality']) && paramsVar['quality'] === 'low') {
	// scalerParams['quality'] = 30;
	// }
	// For subclasses that might be paged.
	// if (image.isMultipage() && isset(paramsVar['page'])) {
	// scalerParams['page'] = intval(paramsVar['page']);
	// }
	// Determine scaler type
	// scaler = this.getScalerType(dstPath);
	//
	// if (is_array(scaler)) {
	// scalerName = get_class(scaler[0]);
	// } else {
	// scalerName = scaler;
	// }
	//
	// wfDebug(__METHOD__ . ": creating {scalerParams['physicalDimensions']} " .
	// "thumbnail at dstPath using scaler scalerName\n");

	// The image can be used as-is when it does not have to be rendered and the
	// physical size equals the source size.
	boolean use_unscaled
		=  !image.mustRender()
		&& scalerParams.physicalWidth == scalerParams.srcWidth
		&& scalerParams.physicalHeight == scalerParams.srcHeight
		// && !isset(scalerParams['quality'])
		;
	if (use_unscaled) {
		// normaliseParams (or the user) wants us to return the unscaled image
		// wfDebug(__METHOD__ . ": returning unscaled image\n");
		return this.getClientScalingThumbnailImage(image, scalerParams);
	}
	// if (scaler == 'client') {
	// // Client-side image scaling, use the source URL
	// // Using the destination URL in a TRANSFORM_LATER request would be incorrect
	// return this.getClientScalingThumbnailImage(image, scalerParams);
	// }
	//
	// if (image.isTransformedLocally() && !this.isImageAreaOkForThumbnaling(image, paramsVar)) {
	// global wgMaxImageArea;
	// return new TransformTooBigImageAreaError(paramsVar, wgMaxImageArea);
	// }
	//
	// if (flags & self::TRANSFORM_LATER) {
	// wfDebug(__METHOD__ . ": Transforming later per flags.\n");
	// newParams = [
	// 'width' => scalerParams['clientWidth'],
	// 'height' => scalerParams['clientHeight']
	// ];
	// if (isset(paramsVar['quality'])) {
	// newParams['quality'] = paramsVar['quality'];
	// }
	// if (isset(paramsVar['page']) && paramsVar['page']) {
	// newParams['page'] = paramsVar['page'];
	// }
	// return new Xomw_ThumbnailImage(image, dstUrl, null, newParams);

	// Every remaining case returns a thumbnail at dstUrl with no path, reusing prms.
	return new Xomw_ThumbnailImage(image, dstUrl, null, prms);
	// }
	//
	// // Try to make a target path for the thumbnail
	// if (!wfMkdirParents(dirname(dstPath), null, __METHOD__)) {
	// wfDebug(__METHOD__ . ": Unable to create thumbnail destination " .
	// "directory, falling back to client scaling\n");
	//
	// return this.getClientScalingThumbnailImage(image, scalerParams);
	// }
	//
	// // Transform functions and binaries need a FS source file
	// thumbnailSource = this.getThumbnailSource(image, paramsVar);
	//
	// // If the source isn't the original, disable EXIF rotation because it's already been applied
	// if (scalerParams['srcWidth'] != thumbnailSource['width']
	// || scalerParams['srcHeight'] != thumbnailSource['height']) {
	// scalerParams['disableRotation'] = true;
	// }
	//
	// scalerParams['srcPath'] = thumbnailSource['path'];
	// scalerParams['srcWidth'] = thumbnailSource['width'];
	// scalerParams['srcHeight'] = thumbnailSource['height'];
	//
	// if (scalerParams['srcPath'] === false) { // Failed to get local copy
	// wfDebugLog('thumbnail',
	// sprintf('Thumbnail failed on %s: could not get local copy of "%s"',
	// wfHostname(), image.getName()));
	//
	// return new MediaTransformError('thumbnail_error',
	// scalerParams['clientWidth'], scalerParams['clientHeight'],
	// wfMessage('filemissing')
	// );
	// }
	//
	// // Try a hook. Called "Bitmap" for historical reasons.
	// /** @var mto MediaTransformOutput */
	// mto = null;
	// Hooks::run('BitmapHandlerTransform', [ this, image, &scalerParams, &mto ]);
	// if (!is_null(mto)) {
	// wfDebug(__METHOD__ . ": Hook to BitmapHandlerTransform created an mto\n");
	// scaler = 'hookaborted';
	// }
	//
	// // scaler will return a MediaTransformError on failure, or false on success.
	// // If the scaler is succesful, it will have created a thumbnail at the destination
	// // path.
	// if (is_array(scaler) && is_callable(scaler)) {
	// // Allow subclasses to specify their own rendering methods.
	// err = call_user_func(scaler, image, scalerParams);
	// } else {
	// switch (scaler) {
	// case 'hookaborted':
	// // Handled by the hook above
	// err = mto.isError() ? mto : false;
	// break;
	// case 'im':
	// err = this.transformImageMagick(image, scalerParams);
	// break;
	// case 'custom':
	// err = this.transformCustom(image, scalerParams);
	// break;
	// case 'imext':
	// err = this.transformImageMagickExt(image, scalerParams);
	// break;
	// case 'gd':
	// default:
	// err = this.transformGd(image, scalerParams);
	// break;
	// }
	// }
	//
	// // Remove the file if a zero-byte thumbnail was created, or if there was an error
	// removed = this.removeBadFile(dstPath, (boolean)err);
	// if (err) {
	// // transform returned MediaTransforError
	// return err;
	// } elseif (removed) {
	// // Thumbnail was zero-byte and had to be removed
	// return new MediaTransformError('thumbnail_error',
	// scalerParams['clientWidth'], scalerParams['clientHeight'],
	// wfMessage('unknown-error')
	// );
	// } elseif (mto) {
	// return mto;
	// } else {
	// newParams = [
	// 'width' => scalerParams['clientWidth'],
	// 'height' => scalerParams['clientHeight']
	// ];
	// if (isset(paramsVar['quality'])) {
	// newParams['quality'] = paramsVar['quality'];
	// }
	// if (isset(paramsVar['page']) && paramsVar['page']) {
	// newParams['page'] = paramsVar['page'];
	// }
	// return new ThumbnailImage(image, dstUrl, dstPath, newParams);
	// }
	// return null;
}
// /**
// * Get the source file for the transform
// *
// * @param File file
// * @param array paramsVar
// * @return array Array with keys width, height and path.
// */
// protected function getThumbnailSource(file, paramsVar) {
// return file.getThumbnailSource(paramsVar);
// }
//
// /**
// * Returns what sort of scaler type should be used.
// *
// * Values can be one of client, im, custom, gd, imext, or an array
// * of Object, method-name to call that specific method.
// *
// * If specifying a custom scaler command with [ Obj, method ],
// * the method in question should take 2 parameters, a File Object,
// * and a scalerParams array with various options (See doTransform
// * for what is in scalerParams). On error it should return a
// * MediaTransformError Object. On success it should return false,
// * and simply make sure the thumbnail file is located at
// * scalerParams['dstPath'].
// *
// * If there is a problem with the output path, it returns "client"
// * to do client side scaling.
// *
// * @param String dstPath
// * @param boolean checkDstPath Check that dstPath is valid
// * @return String|Callable One of client, im, custom, gd, imext, or a Callable array.
// */
// abstract protected function getScalerType(dstPath, checkDstPath = true);
/**
* Get a ThumbnailImage that represents an image that will be scaled
* client side
*
* @param File image File associated with this thumbnail
* @param array scalerParams Array with scaler paramsVar
* @return ThumbnailImage
*
* @todo FIXME: No rotation support
*/
private Xomw_ThumbnailImage getClientScalingThumbnailImage(Xomw_File image, Xomw_params_scalar scalerParams) {
	// Only the on-page (client) dimensions are carried over from the scaler parameters.
	Xomw_params_handler clientPrms = new Xomw_params_handler();
	clientPrms.height = scalerParams.clientHeight;
	clientPrms.width = scalerParams.clientWidth;

	// The thumbnail uses the source image's own URL and is given no path.
	byte[] sourceUrl = image.getUrl();
	return new Xomw_ThumbnailImage(image, sourceUrl, null, clientPrms);
}
// /**
// * Transform an image using ImageMagick
// *
// * This is a stub method. The real method is in BitmapHander.
// *
// * @param File image File associated with this thumbnail
// * @param array paramsVar Array with scaler paramsVar
// *
// * @return MediaTransformError Error Object if error occurred, false (=no error) otherwise
// */
// protected function transformImageMagick(image, paramsVar) {
// return this.getMediaTransformError(paramsVar, "Unimplemented");
// }
//
// /**
// * Transform an image using the Imagick PHP extension
// *
// * This is a stub method. The real method is in BitmapHander.
// *
// * @param File image File associated with this thumbnail
// * @param array paramsVar Array with scaler paramsVar
// *
// * @return MediaTransformError Error Object if error occurred, false (=no error) otherwise
// */
// protected function transformImageMagickExt(image, paramsVar) {
// return this.getMediaTransformError(paramsVar, "Unimplemented");
// }
//
// /**
// * Transform an image using a custom command
// *
// * This is a stub method. The real method is in BitmapHander.
// *
// * @param File image File associated with this thumbnail
// * @param array paramsVar Array with scaler paramsVar
// *
// * @return MediaTransformError Error Object if error occurred, false (=no error) otherwise
// */
// protected function transformCustom(image, paramsVar) {
// return this.getMediaTransformError(paramsVar, "Unimplemented");
// }
//
// /**
// * Get a MediaTransformError with error 'thumbnail_error'
// *
// * @param array paramsVar Parameter array as passed to the transform* functions
// * @param String errMsg Error message
// * @return MediaTransformError
// */
// public function getMediaTransformError(paramsVar, errMsg) {
// return new MediaTransformError('thumbnail_error', paramsVar['clientWidth'],
// paramsVar['clientHeight'], errMsg);
// }
//
// /**
// * Transform an image using the built in GD library
// *
// * This is a stub method. The real method is in BitmapHander.
// *
// * @param File image File associated with this thumbnail
// * @param array paramsVar Array with scaler paramsVar
// *
// * @return MediaTransformError Error Object if error occurred, false (=no error) otherwise
// */
// protected function transformGd(image, paramsVar) {
// return this.getMediaTransformError(paramsVar, "Unimplemented");
// }
//
// /**
// * Escape a String for ImageMagick's property input (e.g. -set -comment)
// * See InterpretImageProperties() in magick/property.c
// * @param String s
// * @return String
// */
// function escapeMagickProperty(s) {
// // Double the backslashes
// s = str_replace('\\', '\\\\', s);
// // Double the percents
// s = str_replace('%', '%%', s);
// // Escape initial - or @
// if (strlen(s) > 0 && (s[0] === '-' || s[0] === '@')) {
// s = '\\' . s;
// }
//
// return s;
// }
//
// /**
// * Escape a String for ImageMagick's input filenames. See ExpandFilenames()
// * and GetPathComponent() in magick/utility.c.
// *
// * This won't work with an initial ~ or @, so input files should be prefixed
// * with the directory name.
// *
// * Glob character unescaping is broken in ImageMagick before 6.6.1-5, but
// * it's broken in a way that doesn't involve trying to convert every file
// * in a directory, so we're better off escaping and waiting for the bugfix
// * to filter down to users.
// *
// * @param String path The file path
// * @param boolean|String scene The scene specification, or false if there is none
// * @throws MWException
// * @return String
// */
// function escapeMagickInput(path, scene = false) {
// // Die on initial metacharacters (caller should prepend path)
// firstChar = substr(path, 0, 1);
// if (firstChar === '~' || firstChar === '@') {
// throw new MWException(__METHOD__ . ': cannot escape this path name');
// }
//
// // Escape glob chars
// path = preg_replace('/[*?\[\]{}]/', '\\\\\0', path);
//
// return this.escapeMagickPath(path, scene);
// }
//
// /**
// * Escape a String for ImageMagick's output filename. See
// * InterpretImageFilename() in magick/image.c.
// * @param String path The file path
// * @param boolean|String scene The scene specification, or false if there is none
// * @return String
// */
// function escapeMagickOutput(path, scene = false) {
// path = str_replace('%', '%%', path);
//
// return this.escapeMagickPath(path, scene);
// }
//
// /**
// * Armour a String against ImageMagick's GetPathComponent(). This is a
// * helper function for escapeMagickInput() and escapeMagickOutput().
// *
// * @param String path The file path
// * @param boolean|String scene The scene specification, or false if there is none
// * @throws MWException
// * @return String
// */
// protected function escapeMagickPath(path, scene = false) {
// // Die on format specifiers (other than drive letters). The regex is
// // meant to match all the formats you get from "convert -list format"
// if (preg_match('/^([a-zA-Z0-9-]+):/', path, m)) {
// if (wfIsWindows() && is_dir(m[0])) {
// // OK, it's a drive letter
// // ImageMagick has a similar exception, see IsMagickConflict()
// } else {
// throw new MWException(__METHOD__ . ': unexpected colon character in path name');
// }
// }
//
// // If there are square brackets, add a do-nothing scene specification
// // to force a literal interpretation
// if (scene === false) {
// if (strpos(path, '[') !== false) {
// path .= '[0--1]';
// }
// } else {
// path .= "[scene]";
// }
//
// return path;
// }
//
// /**
// * Retrieve the version of the installed ImageMagick
// * You can use PHPs version_compare() to use this value
// * Value is cached for one hour.
// * @return String|boolean Representing the IM version; false on error
// */
// protected function getMagickVersion() {
// cache = MediaWikiServices::getInstance().getLocalServerObjectCache();
// method = __METHOD__;
// return cache.getWithSetCallback(
// 'imagemagick-version',
// cache::TTL_HOUR,
// function () use (method) {
// global wgImageMagickConvertCommand;
//
// cmd = wfEscapeShellArg(wgImageMagickConvertCommand) . ' -version';
// wfDebug(method . ": Running convert -version\n");
// retval = '';
// return = wfShellExec(cmd, retval);
// x = preg_match(
// '/Version: ImageMagick ([0-9]*\.[0-9]*\.[0-9]*)/', return, matches
// );
// if (x != 1) {
// wfDebug(method . ": ImageMagick version check failed\n");
// return false;
// }
//
// return matches[1];
// }
// );
// }
//
// /**
// * Returns whether the current scaler supports rotation.
// *
// * @since 1.24 No longer static
// * @return boolean
// */
// public function canRotate() {
// return false;
// }
//
// /**
// * Should we automatically rotate an image based on exif
// *
// * @since 1.24 No longer static
// * @see wgEnableAutoRotation
// * @return boolean Whether auto rotation is enabled
// */
// public function autoRotateEnabled() {
// return false;
// }
//
// /**
// * Rotate a thumbnail.
// *
// * This is a stub. See BitmapHandler::rotate.
// *
// * @param File file
// * @param array paramsVar Rotate parameters.
// * 'rotation' clockwise rotation in degrees, allowed are multiples of 90
// * @since 1.24 Is non-static. From 1.21 it was static
// * @return boolean|MediaTransformError
// */
// public function rotate(file, paramsVar) {
// return new MediaTransformError('thumbnail_error', 0, 0,
// get_class(this) . ' rotation not implemented');
// }
//
// /**
// * Returns whether the file needs to be rendered. Returns true if the
// * file requires rotation and we are able to rotate it.
// *
// * @param File file
// * @return boolean
// */
// public function mustRender(file) {
// return this.canRotate() && this.getRotation(file) != 0;
// }
//
// /**
// * Check if the file is smaller than the maximum image area for thumbnailing.
// *
// * Runs the 'BitmapHandlerCheckImageArea' hook.
// *
// * @param File file
// * @param array paramsVar
// * @return boolean
// * @since 1.25
// */
// public function isImageAreaOkForThumbnaling(file, &paramsVar) {
// global wgMaxImageArea;
//
// // For historical reasons, hook starts with BitmapHandler
// checkImageAreaHookResult = null;
// Hooks::run(
// 'BitmapHandlerCheckImageArea',
// [ file, &paramsVar, &checkImageAreaHookResult ]
// );
//
// if (!is_null(checkImageAreaHookResult)) {
// // was set by hook, so return that value
// return (boolean)checkImageAreaHookResult;
// }
//
// srcWidth = file.getWidth(paramsVar['page']);
// srcHeight = file.getHeight(paramsVar['page']);
//
// if (srcWidth * srcHeight > wgMaxImageArea
// && !(file.getMimeType() == 'image/jpeg'
// && this.getScalerType(false, false) == 'im')
// ) {
// // Only ImageMagick can efficiently downsize jpg images without loading
// // the entire file in memory
// return false;
// }
// return true;
// }
}

View File

@@ -0,0 +1,584 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import gplx.core.btries.*;
import gplx.langs.htmls.*;
import gplx.xowa.mediawiki.includes.utls.*;
public class Xomw_block_level_pass {
private final Bry_bfr tmp = Bry_bfr_.New();
private final Btrie_rv trv = new Btrie_rv();
private boolean in_pre, dt_open;
private int last_section;
private byte[] find_colon_no_links__before, find_colon_no_links__after;
public void Do_block_levels(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr, boolean line_start) {
// XO.PBFR
Bry_bfr src_bfr = pbfr.Src();
byte[] src = src_bfr.Bfr();
int src_bgn = 0;
int src_end = src_bfr.Len();
Bry_bfr bfr = pbfr.Trg();
pbfr.Switch();
// XO.STATIC
if (block_chars_ary == null) {
synchronized (Type_adp_.ClassOf_obj(this)) {
block_chars_ary = Block_chars_ary__new();
open_match_trie = Btrie_slim_mgr.ci_a7().Add_many_str
("<table", "<h1", "<h2", "<h3", "<h4", "<h5", "<h6", "<pre", "<tr", "<p", "<ul", "<ol", "<dl", "<li", "</tr", "</td", "</th");
close_match_trie = Btrie_slim_mgr.ci_a7().Add_many_str
( "</table", "</h1", "</h2", "</h3", "</h4", "</h5", "</h6", "<td", "<th", "<blockquote", "</blockquote", "<div", "</div", "<hr"
, "</pre", "</p", "</mw:", Xomw_strip_state.Str__marker_bgn + "-pre", "</li", "</ul", "</ol", "</dl", "<center", "</center");
blockquote_trie = Btrie_slim_mgr.ci_a7().Add_many_str("<blockquote", "</blockquote");
pre_trie = Btrie_slim_mgr.ci_a7().Add_str_int("<pre", Pre__bgn).Add_str_int("</pre", Pre__end);
}
}
// Parsing through the text line by line. The main thing
// happening here is handling of block-level elements p, pre,
// and making lists from lines starting with * # : etc.
byte[] last_prefix = Bry_.Empty;
bfr.Clear();
this.dt_open = false;
boolean in_block_elem = false;
int prefix_len = 0;
byte para_stack = Para_stack__none;
boolean in_blockquote = false;
this.in_pre = false;
this.last_section = Last_section__none;
byte[] prefix2 = null;
// PORTED.SPLIT: $textLines = StringUtils::explode("\n", $text);
int line_bgn = src_bgn;
while (line_bgn < src_end) {
int line_end = Bry_find_.Find_fwd(src, Byte_ascii.Nl, line_bgn);
if (line_end == Bry_find_.Not_found)
line_end = src_end;
// Fix up line_start
if (!line_start) {
bfr.Add_mid(src, line_bgn, line_end);
line_start = true;
continue;
}
// * = ul
// # = ol
// ; = dt
// : = dd
int last_prefix_len = last_prefix.length;
// PORTED: pre_close_match = preg_match('/<\\/pre/i', $oLine); pre_open_match = preg_match('/<pre/i', $oLine);
int pre_cur = line_bgn;
boolean pre_close_match = false;
boolean pre_open_match = false;
while (true) {
if (pre_cur >= line_end)
break;
Object o = pre_trie.Match_at(trv, src, pre_cur, line_end);
if (o == null)
pre_cur++;
else {
int pre_tid = Int_.cast(o);
if (pre_tid == Pre__bgn)
pre_open_match = true;
else if (pre_tid == Pre__end)
pre_close_match = true;
pre_cur = trv.Pos();
}
}
byte[] prefix = null, t = null;
// If not in a <pre> element, scan for and figure out what prefixes are there.
if (!in_pre) {
// Multiple prefixes may abut each other for nested lists.
prefix_len = Php_str_.Strspn_fwd__ary(src, block_chars_ary, line_bgn, line_end, line_end); // strspn($oLine, '*#:;');
prefix = Php_str_.Substr(src, line_bgn, prefix_len);
// eh?
// ; and : are both from definition-lists, so they're equivalent
// for the purposes of determining whether or not we need to open/close
// elements.
// substr( $inputLine, $prefixLength );
prefix2 = Bry_.Replace(prefix, Byte_ascii.Semic, Byte_ascii.Colon);
t = Bry_.Mid(src, line_bgn + prefix_len, line_end);
in_pre = pre_open_match;
}
else {
// Don't interpret any other prefixes in preformatted text
prefix_len = 0;
prefix = prefix2 = Bry_.Empty;
t = Bry_.Mid(src, line_bgn, line_end);
}
// List generation
byte[] term = null, t2 = null;
int common_prefix_len = -1;
if (prefix_len > 0 && Bry_.Eq(last_prefix, prefix2)) {
// Same as the last item, so no need to deal with nesting or opening stuff
bfr.Add(Next_item(Php_str_.Substr_byte(prefix, -1)));
para_stack = Para_stack__none;
if (prefix_len > 0 && prefix[prefix_len - 1] == Byte_ascii.Semic) {
// The one nasty exception: definition lists work like this:
// ; title : definition text
// So we check for : in the remainder text to split up the
// title and definition, without b0rking links.
term = t2 = Bry_.Empty;
if (Find_colon_no_links(t, term, t2) != Bry_find_.Not_found) {
term = find_colon_no_links__before;
t2 = find_colon_no_links__after;
t = t2;
bfr.Add(term).Add(Next_item(Byte_ascii.Colon));
}
}
}
else if (prefix_len > 0 || last_prefix_len > 0) {
// We need to open or close prefixes, or both.
// Either open or close a level...
common_prefix_len = Get_common(prefix, last_prefix);
para_stack = Para_stack__none;
// Close all the prefixes which aren't shared.
while (common_prefix_len < last_prefix_len) {
bfr.Add(Close_list(last_prefix[last_prefix_len - 1]));
last_prefix_len--;
}
// Continue the current prefix if appropriate.
if (prefix_len <= common_prefix_len && common_prefix_len > 0) {
bfr.Add(Next_item(prefix[common_prefix_len - 1]));
}
// Open prefixes where appropriate.
if (Bry_.Len_gt_0(last_prefix) && prefix_len > common_prefix_len) {
bfr.Add_byte_nl();
}
while (prefix_len > common_prefix_len) {
byte c = Php_str_.Substr_byte(prefix, common_prefix_len, 1);
bfr.Add(Open_list(c));
if (c == Byte_ascii.Semic) {
// @todo FIXME: This is dupe of code above
if (Find_colon_no_links(t, term, t2) != Bry_find_.Not_found) {
term = find_colon_no_links__before;
t2 = find_colon_no_links__after;
t = t2;
bfr.Add(term).Add(Next_item(Byte_ascii.Colon));
}
}
++common_prefix_len;
}
if (prefix_len == 0 && Bry_.Len_gt_0(last_prefix)) {
bfr.Add_byte_nl();
}
last_prefix = prefix2;
}
// If we have no prefixes, go to paragraph mode.
if (0 == prefix_len) {
// No prefix (not in list)--go to paragraph mode
// XXX: use a stack for nestable elements like span, table and div
int t_len = t.length;
boolean open_match = Php_preg_.Match(open_match_trie, trv, t, 0, t_len) != null;
boolean close_match = Php_preg_.Match(close_match_trie, trv, t, 0, t_len) != null;
if (open_match || close_match) {
para_stack = Para_stack__none;
// @todo bug 5718: paragraph closed
bfr.Add(Close_paragraph());
if (pre_open_match && !pre_close_match) {
in_pre = true;
}
int bq_offset = 0;
// PORTED:preg_match('/<(\\/?)blockquote[\s>]/i', t, $bqMatch, PREG_OFFSET_CAPTURE, $bq_offset)
while (true) {
Object o = Php_preg_.Match(blockquote_trie, trv, t, bq_offset, t_len);
if (o == null) { // no more blockquotes found; exit
break;
}
else {
byte[] bq_bry = (byte[])o;
in_blockquote = bq_bry[1] != Byte_ascii.Slash; // is this a close tag?
bq_offset = trv.Pos();
}
}
in_block_elem = !close_match;
}
else if (!in_block_elem && !in_pre) {
if ( Php_str_.Substr_byte(t, 0) == Byte_ascii.Space
&& (last_section == Last_section__pre || Bry_.Trim(t) != Bry_.Empty)
&& !in_blockquote
) {
// pre
if (last_section != Last_section__pre) {
para_stack = Para_stack__none;
bfr.Add(Close_paragraph()).Add(Gfh_tag_.Pre_lhs);
last_section = Last_section__pre;
}
t = Bry_.Mid(t, 1);
}
else {
// paragraph
if (Bry_.Trim(t) == Bry_.Empty) {
if (para_stack != Para_stack__none) {
Para_stack_bfr(bfr, para_stack);
bfr.Add_str_a7("<br />");
para_stack = Para_stack__none;
last_section = Last_section__para;
}
else {
if (last_section != Last_section__para) {
bfr.Add(Close_paragraph());
last_section = Last_section__none;
para_stack = Para_stack__bgn;
}
else {
para_stack = Para_stack__mid;
}
}
}
else {
if (para_stack != Para_stack__none) {
Para_stack_bfr(bfr, para_stack);
para_stack = Para_stack__none;
last_section = Last_section__para;
}
else if (last_section != Last_section__para) {
bfr.Add(Close_paragraph()).Add(Gfh_tag_.P_lhs);
this.last_section = Last_section__para;
}
}
}
}
}
// somewhere above we forget to get out of pre block (bug 785)
if (pre_close_match && in_pre) {
in_pre = false;
}
if (para_stack == Para_stack__none) {
bfr.Add(t);
if (prefix_len == 0) {
bfr.Add_byte_nl();
}
}
line_bgn = line_end + 1;
}
while (prefix_len > 0) {
bfr.Add(Close_list(prefix2[prefix_len - 1]));
prefix_len--;
if (prefix_len > 0) {
bfr.Add_byte_nl();
}
}
if (last_section != Last_section__none) {
bfr.Add(last_section == Last_section__para ? Gfh_tag_.P_rhs : Gfh_tag_.Pre_rhs);
last_section = Last_section__none;
}
}
// If a pre or p is open, return the corresponding close tag and update
// the state. If no tag is open, return an empty String.
public byte[] Close_paragraph() {
// build the close tag (if a section is open) before the state is reset below
byte[] close_tag = Bry_.Empty;
boolean section_open = last_section != Last_section__none;
if (section_open) {
close_tag = tmp
.Add(last_section != Last_section__para ? Gfh_tag_.Pre_rhs : Gfh_tag_.P_rhs)
.Add_byte_nl()
.To_bry_and_clear();
}
// closing always clears both the section marker and the "inside pre" flag
last_section = Last_section__none;
in_pre = false;
return close_tag;
}
// getCommon() returns the length of the longest common substring
// of both arguments, starting at the beginning of both.
private int Get_common(byte[] st1, byte[] st2) {
// only the overlapping span can match, so the shorter array bounds the scan
int limit = Math.min(st1.length, st2.length);
int matched = 0;
while (matched < limit && st1[matched] == st2[matched]) {
matched++;
}
return matched;
}
// Open the list item element identified by the prefix character.
// Returns the close tag of any open paragraph followed by the opening list markup for the prefix char:
// '*' -> <ul><li>, '#' -> <ol><li>, ':' -> <dl><dd>, ';' -> <dl><dt> (also sets dt_open).
// Any other char returns "<!-- ERR 1 -->" only; the paragraph close text is discarded in that case.
private byte[] Open_list(byte c) {
// a list can not start inside an open <p> / <pre>; Close_paragraph also resets the section state
byte[] result = Close_paragraph();
if (c == Byte_ascii.Star)
result = tmp.Add(result).Add_str_a7("<ul><li>").To_bry_and_clear();
else if (c == Byte_ascii.Hash)
result = tmp.Add(result).Add_str_a7("<ol><li>").To_bry_and_clear();
else if (c == Byte_ascii.Colon) // FIX: was a 2nd test for Byte_ascii.Hash, which made this branch unreachable
result = tmp.Add(result).Add_str_a7("<dl><dd>").To_bry_and_clear();
else if (c == Byte_ascii.Semic) {
result = tmp.Add(result).Add_str_a7("<dl><dt>").To_bry_and_clear();
// remember that a <dt> is pending so Next_item / Close_list emit </dt> instead of </dd>
dt_open = true;
}
else
result = tmp.Add_str_a7("<!-- ERR 1 -->").To_bry_and_clear();
return result;
}
// Close the current list item and open the next one.
private byte[] Next_item(byte c) {
// bulleted / numbered lists: plain </li> ... <li> transition
boolean is_li = c == Byte_ascii.Star || c == Byte_ascii.Hash;
if (is_li)
return tmp.Add_str_a7("</li>\n<li>").To_bry_and_clear();
// anything that is neither a list-item nor a definition-list char is an error
boolean is_term = c == Byte_ascii.Semic;
if (!is_term && c != Byte_ascii.Colon)
return tmp.Add_str_a7("<!-- ERR 2 -->").To_bry_and_clear();
// definition lists: close whichever child is open (a pending <dt> wins over <dd>) ...
tmp.Add_str_a7(dt_open ? "</dt>\n" : "</dd>\n");
// ... then open the next child; ';' opens a term, ':' opens a definition
dt_open = is_term;
return tmp.Add_str_a7(is_term ? "<dt>" : "<dd>").To_bry_and_clear();
}
// Close the current list item identified by the prefix character.
private byte[] Close_list(byte c) {
if (c == Byte_ascii.Star)
return Bry_.new_a7("</li></ul>");
if (c == Byte_ascii.Hash)
return Bry_.new_a7("</li></ol>");
if (c != Byte_ascii.Colon)
return Bry_.new_a7("<!-- ERR 3 -->");
// definition list: the child tag to close depends on whether a <dt> is still pending;
// either way no <dt> is pending afterwards
boolean closes_term = dt_open;
dt_open = false;
return Bry_.new_a7(closes_term ? "</dt></dl>" : "</dd></dl>");
}
// Split up a String on ':', ignoring any occurrences inside tags
// to prevent illegal overlapping.
// Returns Bry_find_.Not_found when no usable ':' exists; otherwise returns a position and stores the
// text before / after the ':' in find_colon_no_links__before / find_colon_no_links__after.
// NOTE: the "before" / "after" parameters mirror PHP's by-reference args but are not read or written here;
// callers must pick the results up from the two fields.
// FIX: tag ends are now detected with Byte_ascii.Angle_end ('>'); the prior code tested Angle_bgn ('<')
// in every tag-closing position, so a ':' following any tag (EX: "<b>a</b>:c") was never found.
private int Find_colon_no_links(byte[] str, byte[] before, byte[] after) {
int len = str.length;
int colon_pos = Php_str_.Strpos(str, Byte_ascii.Colon, 0, len);
if (colon_pos == Bry_find_.Not_found) {
// Nothing to find!
return Bry_find_.Not_found;
}
int lt_pos = Php_str_.Strpos(str, Byte_ascii.Angle_bgn, 0, len);
if (lt_pos == Bry_find_.Not_found || lt_pos > colon_pos) {
// Easy; no tag nesting to worry about
find_colon_no_links__before = Php_str_.Substr(str, 0, colon_pos);
find_colon_no_links__after = Php_str_.Substr(str, colon_pos + 1);
return colon_pos;
}
// Ugly state machine to walk through avoiding tags.
// "level" counts currently-open tags; a ':' only splits the line when level is 0
int state = COLON_STATE_TEXT;
int level = 0;
for (int i = 0; i < len; i++) {
byte c = str[i];
switch (state) {
case COLON_STATE_TEXT:
switch (c) {
case Byte_ascii.Angle_bgn:
// Could be either a <start> tag or an </end> tag
state = COLON_STATE_TAGSTART;
break;
case Byte_ascii.Colon:
if (level == 0) {
// We found it!
find_colon_no_links__before = Php_str_.Substr(str, 0, i);
find_colon_no_links__after = Php_str_.Substr(str, i + 1);
return i;
}
// Embedded in a tag; don't break it.
break;
default:
// Skip ahead looking for something interesting
colon_pos = Php_str_.Strpos(str, Byte_ascii.Colon, i, len);
if (colon_pos == Bry_find_.Not_found) {
// Nothing else interesting
return Bry_find_.Not_found;
}
lt_pos = Php_str_.Strpos(str, Byte_ascii.Angle_bgn, i, len);
if (level == 0) {
if (lt_pos == Bry_find_.Not_found || colon_pos < lt_pos) {
// We found it!
find_colon_no_links__before = Php_str_.Substr(str, 0, colon_pos);
find_colon_no_links__after = Php_str_.Substr(str, colon_pos + 1);
// NOTE(review): returns the scan index, not colon_pos; the callers visible in this file only compare against Not_found
return i;
}
}
if (lt_pos == Bry_find_.Not_found) {
// Nothing else interesting to find; abort!
// We're nested, but there's no close tags left. Abort!
i = len; // break 2; forcing i past the end terminates the for loop after the switches exit
break;
}
// Skip ahead to next tag start
i = lt_pos;
state = COLON_STATE_TAGSTART;
break;
}
break;
case COLON_STATE_TAG:
// In a <tag>
switch (c) {
case Byte_ascii.Angle_end:
level++;
state = COLON_STATE_TEXT;
break;
case Byte_ascii.Slash:
// Slash may be followed by >?
state = COLON_STATE_TAGSLASH;
break;
default:
// ignore
break;
}
break;
case COLON_STATE_TAGSTART:
switch (c) {
case Byte_ascii.Slash:
state = COLON_STATE_CLOSETAG;
break;
case Byte_ascii.Bang:
state = COLON_STATE_COMMENT;
break;
case Byte_ascii.Angle_end:
// Illegal early close? This shouldn't happen D:
state = COLON_STATE_TEXT;
break;
default:
state = COLON_STATE_TAG;
break;
}
break;
case COLON_STATE_CLOSETAG:
// In a </tag>
if (c == Byte_ascii.Angle_end) {
level--;
if (level < 0) {
Gfo_usr_dlg_.Instance.Warn_many("", "", "Invalid input; too many close tags");
return Bry_find_.Not_found;
}
state = COLON_STATE_TEXT;
}
break;
case COLON_STATE_TAGSLASH:
if (c == Byte_ascii.Angle_end) {
// Yes, a self-closed tag <blah/>
state = COLON_STATE_TEXT;
}
else {
// Probably we're jumping the gun, and this is an attribute
state = COLON_STATE_TAG;
}
break;
case COLON_STATE_COMMENT:
if (c == Byte_ascii.Dash) {
state = COLON_STATE_COMMENTDASH;
}
break;
case COLON_STATE_COMMENTDASH:
if (c == Byte_ascii.Dash) {
state = COLON_STATE_COMMENTDASHDASH;
}
else {
state = COLON_STATE_COMMENT;
}
break;
case COLON_STATE_COMMENTDASHDASH:
if (c == Byte_ascii.Angle_end) {
state = COLON_STATE_TEXT;
}
else {
state = COLON_STATE_COMMENT;
}
break;
default:
throw Err_.new_wo_type("State machine error");
}
}
if (level > 0) {
Gfo_usr_dlg_.Instance.Warn_many("", "", "Invalid input; not enough close tags (level ~{0}, state ~{1})", level, state);
return Bry_find_.Not_found;
}
return Bry_find_.Not_found;
}
// Scanner states used by Find_colon_no_links while walking a line byte-by-byte.
private static final int
COLON_STATE_TEXT = 0 // plain text, outside of any tag
, COLON_STATE_TAG = 1 // inside a tag, after its first char
, COLON_STATE_TAGSTART = 2 // just after the '<' that starts a tag
, COLON_STATE_CLOSETAG = 3 // inside a </tag>
, COLON_STATE_TAGSLASH = 4 // saw '/' inside a tag; may be the end of a self-closed tag
, COLON_STATE_COMMENT = 5 // inside a comment
, COLON_STATE_COMMENTDASH = 6 // inside a comment, after one '-'
, COLON_STATE_COMMENTDASHDASH = 7 // inside a comment, after "--"
;
// Which block element is currently open; Close_paragraph picks the close tag from this value.
private static final byte
Last_section__none = 0 // '' (nothing open)
, Last_section__para = 1 // p
, Last_section__pre = 2 // pre
;
// Pending paragraph markup that has not been written yet; Para_stack_bfr writes the markup for bgn / mid.
private static final byte
Para_stack__none = 0 // false (nothing pending)
, Para_stack__bgn = 1 // <p>
, Para_stack__mid = 2 // </p><p>
;
// NOTE(review): Pre__bgn / Pre__end and pre_trie are not referenced in the visible part of this file;
// presumably they identify open vs close "pre" matches -- confirm against Do_block_levels.
private static final int Pre__bgn = 0, Pre__end = 1;
private static Btrie_slim_mgr pre_trie;
// Passed to Php_str_.Strspn_fwd__ary to measure the run of list-prefix chars ('*', '#', ':', ';') at the start of a line.
private static boolean[] block_chars_ary;
private static boolean[] Block_chars_ary__new() {
// 256-slot lookup table indexed by byte value; only the four list-prefix chars are flagged
byte[] prefix_chars = new byte[] {Byte_ascii.Star, Byte_ascii.Hash, Byte_ascii.Colon, Byte_ascii.Semic};
boolean[] flags = new boolean[256];
for (byte prefix_char : prefix_chars) {
flags[prefix_char] = true;
}
return flags;
}
private static Btrie_slim_mgr open_match_trie, close_match_trie, blockquote_trie;
private static void Para_stack_bfr(Bry_bfr bfr, int id) {
// writes the deferred paragraph markup for the given Para_stack__* id; "none" is not a valid id here
if (id == Para_stack__bgn) {
bfr.Add_str_a7("<p>");
}
else if (id == Para_stack__mid) {
bfr.Add_str_a7("</p><p>");
}
else {
throw Err_.new_unhandled_default(id);
}
}
}

View File

@@ -0,0 +1,42 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import org.junit.*; import gplx.core.tests.*;
import gplx.xowa.mediawiki.includes.linkers.*;
public class Xomw_block_level_pass__tst {
private final Xomw_block_level_pass__fxt fxt = new Xomw_block_level_pass__fxt();
// a single unprefixed line is wrapped in <p> ... </p>, with the close tag on its own line
@Test public void Basic() {
String wikitext = String_.Concat_lines_nl_skip_last("a");
String expected_html = String_.Concat_lines_nl_skip_last("<p>a", "</p>");
fxt.Test__do_block_levels(wikitext, expected_html);
}
}
// Test fixture: runs Do_block_levels over a source String and compares the resulting buffer to the expected String.
class Xomw_block_level_pass__fxt {
private final Xomw_block_level_pass block_level_pass = new Xomw_block_level_pass();
private final Xomw_parser_ctx pctx = new Xomw_parser_ctx();
private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr();
private boolean apos = true;
public void Test__do_block_levels(String src, String expd) {
// when apos is set, the expected text is passed through Gfh_utl.Replace_apos before comparing
String expd_html = apos ? gplx.langs.htmls.Gfh_utl.Replace_apos(expd) : expd;
byte[] src_bry = Bry_.new_u8(src);
block_level_pass.Do_block_levels(pctx, pbfr.Init(src_bry), true);
String actl_html = pbfr.Rslt().To_str_and_clear();
Gftest.Eq__str(expd_html, actl_html);
}
}

View File

@@ -0,0 +1,251 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import gplx.langs.htmls.*;
import gplx.xowa.mediawiki.includes.*;
import gplx.xowa.mediawiki.includes.htmls.*;
import gplx.xowa.mediawiki.includes.linkers.*;
// Registry of internal-link placeholders.
// Make_holder writes a "<!--LINK $key-->" comment into the text and remembers the link under that key;
// Replace later swaps each placeholder for the rendered link.
public class Xomw_link_holders {
private final Xomw_link_renderer link_renderer;
private final Bry_bfr tmp;
private int link_id = 0; // MOVED:Parser.php
private final Xomw_link_holder_list internals = new Xomw_link_holder_list();
private final Xomw_atr_mgr extra_atrs = new Xomw_atr_mgr();
private final Xomw_qry_mgr query = new Xomw_qry_mgr();
public Xomw_link_holders(Xomw_link_renderer link_renderer, Bry_bfr tmp) {
this.link_renderer = link_renderer;
this.tmp = tmp;
}
// Forgets all registered links and restarts key numbering at 0.
public void Clear() {
internals.Clear();
link_id = 0;
}
// Registers a link and appends its placeholder (followed by trail) to bfr.
// When nt is null, an "<!-- ERROR -->" comment plus the raw prefix / text / trail is appended instead.
public void Make_holder(Bry_bfr bfr, Xoa_ttl nt, byte[] text, byte[][] query, byte[] trail, byte[] prefix) {
if (nt == null) {
// Fail gracefully
bfr.Add_str_a7("<!-- ERROR -->").Add(prefix).Add(text).Add(trail);
}
else {
// Separate the link trail from the rest of the link
// list( $inside, $trail ) = Linker::splitTrail( $trail );
byte[] inside = Bry_.Empty;
Xomw_link_holder_item entry = new Xomw_link_holder_item(nt, tmp.Add_bry_many(prefix, text, inside).To_bry_and_clear(), query);
boolean is_external = false; // $nt->isExternal()
if (is_external) {
// Use a globally unique ID to keep the objects mergable
// $key = $this->parent->nextLinkID();
// $this->interwikis[$key] = $entry;
// $retVal = "<!--IWLINK $key-->{$trail}";
}
else {
int key = link_id++;
internals.Add(key, entry);
bfr.Add(Bry__link__bgn).Add_int_variable(key).Add(Gfh_tag_.Comm_end).Add(trail); // "<!--LINK $ns:$key-->{$trail}";
}
}
}
// Test helper: registers a link under the next key without writing a placeholder anywhere.
public void Test__add(Xoa_ttl ttl, byte[] capt) {
int key = link_id++;
Xomw_link_holder_item item = new Xomw_link_holder_item(ttl, capt, Bry_.Ary_empty);
internals.Add(key, item);
}
// Replaces all internal-link placeholders in pbfr; interwiki placeholders are not handled yet.
public void Replace(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) {
this.Replace_internal(pbfr);
// $this->replaceInterwiki( $text );
}
// Copies pbfr's source to its target, swapping every "<!--LINK $key-->" for the rendered link of the registered item.
// Returns without touching pbfr when no links were registered.
private void Replace_internal(Xomw_parser_bfr pbfr) {
if (internals.Len() == 0)
return;
// $colours = [];
// $linkCache = LinkCache::singleton();
// $output = $this->parent->getOutput();
// $linkRenderer = $this->parent->getLinkRenderer();
// $linkcolour_ids = [];
// SKIP:Replace_internals does db lookup to identify redlinks;
// Construct search and replace arrays
Bry_bfr src_bfr = pbfr.Src();
byte[] src = src_bfr.Bfr();
int src_bgn = 0;
int src_end = src_bfr.Len();
Bry_bfr bfr = pbfr.Trg();
pbfr.Switch();
// cur is where the next search starts; prv is the start of source text not yet copied to bfr
int cur = src_bgn;
int prv = 0;
while (true) {
int link_bgn = Bry_find_.Find_fwd(src, Bry__link__bgn, cur, src_end);
if (link_bgn == Bry_find_.Not_found) {
bfr.Add_mid(src, prv, src_end);
break;
}
int key_bgn = link_bgn + Bry__link__bgn.length;
int key_end = Bry_find_.Find_fwd_while_num(src, key_bgn, src_end);
int link_key = Bry_.To_int_or(src, key_bgn, key_end, -1);
// FIX: keys are handed out sequentially (link_id++), so anything outside 0 .. Len - 1 was never registered;
// looking such a key up would index outside the backing array (EX: "<!--LINK -->" yields -1)
Xomw_link_holder_item item = (link_key < 0 || link_key >= internals.Len()) ? null : internals.Get_by(link_key);
if (item == null) {
// unknown placeholder: leave it in the text as-is (prv is unchanged, so it is copied later)
// and resume searching just after the "<!--LINK " prefix
cur = key_bgn;
continue;
}
// $pdbk = $entry['pdbk'];
// $title = $entry['title'];
// $query = isset( $entry['query'] ) ? $entry['query'] : [];
// $key = "$ns:$index";
// $searchkey = "<!--LINK $key-->";
// $displayText = $entry['text'];
// if ( isset( $entry['selflink'] ) ) {
// $replacePairs[$searchkey] = Linker::makeSelfLinkObj( $title, $displayText, $query );
// continue;
// }
// if ( $displayText === '' ) {
// $displayText = null;
// } else {
// $displayText = new HtmlArmor( $displayText );
// }
// if ( !isset( $colours[$pdbk] ) ) {
// $colours[$pdbk] = 'new';
// }
// $attribs = [];
// if ( $colours[$pdbk] == 'new' ) {
// $linkCache->addBadLinkObj( $title );
// $output->addLink( $title, 0 );
// $link = $linkRenderer->makeBrokenLink(
// $title, $displayText, $attribs, $query
// );
// } else {
// $link = $linkRenderer->makePreloadedLink(
// $title, $displayText, $colours[$pdbk], $attribs, $query
// );
// }
bfr.Add_mid(src, prv, link_bgn);
link_renderer.Make_preloaded_link(bfr, item.Title(), item.Text(), Bry_.Empty, extra_atrs, query.Clear());
// NOTE(review): assumes the key is immediately followed by "-->"; this is true for placeholders written by Make_holder
cur = key_end + Gfh_tag_.Comm_end_len;
prv = cur;
}
}
// private void Replace_internal__db() {
// // Generate query
// $lb = new LinkBatch();
// $lb->setCaller( __METHOD__ );
//
// foreach ( $this->internals as $ns => $entries ) {
// foreach ( $entries as $entry ) {
// /** @var Title $title */
// $title = $entry['title'];
// $pdbk = $entry['pdbk'];
//
// # Skip invalid entries.
// # Result will be ugly, but prevents crash.
// if ( is_null( $title ) ) {
// continue;
// }
//
// # Check if it's a static known link, e.g. interwiki
// if ( $title->isAlwaysKnown() ) {
// $colours[$pdbk] = '';
// } elseif ( $ns == NS_SPECIAL ) {
// $colours[$pdbk] = 'new';
// } else {
// $id = $linkCache->getGoodLinkID( $pdbk );
// if ( $id != 0 ) {
// $colours[$pdbk] = $linkRenderer->getLinkClasses( $title );
// $output->addLink( $title, $id );
// $linkcolour_ids[$id] = $pdbk;
// } elseif ( $linkCache->isBadLink( $pdbk ) ) {
// $colours[$pdbk] = 'new';
// } else {
// # Not in the link cache, add it to the query
// $lb->addObj( $title );
// }
// }
// }
// }
// if ( !$lb->isEmpty() ) {
// $fields = array_merge(
// LinkCache::getSelectFields(),
// [ 'page_namespace', 'page_title' ]
// );
//
// $res = $dbr->select(
// 'page',
// $fields,
// $lb->constructSet( 'page', $dbr ),
// __METHOD__
// );
//
// # Fetch data and form into an associative array
// # non-existent = broken
// foreach ( $res as $s ) {
// $title = Title::makeTitle( $s->page_namespace, $s->page_title );
// $pdbk = $title->getPrefixedDBkey();
// $linkCache->addGoodLinkObjFromRow( $title, $s );
// $output->addLink( $title, $s->page_id );
// $colours[$pdbk] = $linkRenderer->getLinkClasses( $title );
// // add id to the extension todolist
// $linkcolour_ids[$s->page_id] = $pdbk;
// }
// unset( $res );
// }
// if ( count( $linkcolour_ids ) ) {
// // pass an array of page_ids to an extension
// Hooks::run( 'GetLinkColours', [ $linkcolour_ids, &$colours ] );
// }
//
// # Do a second query for different language variants of links and categories
// if ( $wgContLang->hasVariants() ) {
// $this->doVariants( $colours );
// }
// }
private static final byte[] Bry__link__bgn = Bry_.new_a7("<!--LINK ");
}
// Growable array of link-holder items, indexed directly by the integer key assigned to each link.
class Xomw_link_holder_list {
private static final int Ary_max__dflt = 128;
// ary_len counts Add calls since the last Clear; ary_max always equals ary.length
private int ary_len = 0, ary_max = Ary_max__dflt;
private Xomw_link_holder_item[] ary = new Xomw_link_holder_item[Ary_max__dflt];
public int Len() {return ary_len;}
// Resets the count; a grown array is swapped for a default-sized one.
public void Clear() {
ary_len = 0;
if (ary_max > Ary_max__dflt) {
ary = new Xomw_link_holder_item[Ary_max__dflt];
// FIX: keep ary_max in sync with the new array; otherwise a later Add with key >= 128 skips the resize and overruns the array
ary_max = Ary_max__dflt;
}
}
// Stores item at index key, growing the array first if key does not fit.
public void Add(int key, Xomw_link_holder_item item) {
if (key >= ary_max) {
// double until key fits; a single doubling is not enough when key >= 2 * ary_max
int new_max = ary_max * 2;
while (key >= new_max)
new_max *= 2;
ary = (Xomw_link_holder_item[])Array_.Resize(ary, new_max);
ary_max = new_max;
}
ary[key] = item;
ary_len++;
}
// Returns the item stored at key, or null if that slot was never filled; key must be within the current array.
public Xomw_link_holder_item Get_by(int key) {return ary[key];}
}
// Immutable record for one held link: its title, display text and query args.
class Xomw_link_holder_item {
private final Xoa_ttl title;
private final byte[] text;
private final byte[][] query;
public Xomw_link_holder_item(Xoa_ttl title, byte[] text, byte[][] query) {
this.title = title;
this.text = text;
this.query = query;
}
public Xoa_ttl Title() {
return title;
}
public byte[] Text() {
return text;
}
// derived from the title on each call; not stored
public byte[] Pdbk() {
return title.Get_prefixed_db_key();
}
public byte[][] Query() {
return query;
}
}

View File

@@ -0,0 +1,45 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import org.junit.*; import gplx.core.tests.*;
import gplx.xowa.mediawiki.includes.linkers.*;
public class Xomw_link_holders__tst {
private final Xomw_link_holders__fxt fxt = new Xomw_link_holders__fxt();
// the first registered link gets key 0, so "<!--LINK 0-->" must be replaced by its anchor
@Test public void Replace__basic() {
fxt.Init__add("A", "a");
String src = "a <!--LINK 0--> b";
String expd = "a <a href='/wiki/A' title='A'>a</a> b";
fxt.Test__replace(src, expd);
}
}
// Test fixture: registers links against an edit-mode test wiki and checks placeholder replacement.
class Xomw_link_holders__fxt {
private final Xomw_link_holders holders = new Xomw_link_holders(new Xomw_link_renderer(new Xomw_sanitizer()), Bry_bfr_.New());
private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr();
private final Xowe_wiki wiki;
private boolean apos = true;
public Xomw_link_holders__fxt() {
this.wiki = Xoa_app_fxt.Make__wiki__edit(Xoa_app_fxt.Make__app__edit());
}
public void Init__add(String ttl, String capt) {
byte[] ttl_bry = Bry_.new_u8(ttl);
byte[] capt_bry = Bry_.new_u8(capt);
holders.Test__add(wiki.Ttl_parse(ttl_bry), capt_bry);
}
public void Test__replace(String src, String expd) {
// when apos is set, the expected text is passed through Gfh_utl.Replace_apos before comparing
String expd_html = apos ? gplx.langs.htmls.Gfh_utl.Replace_apos(expd) : expd;
holders.Replace(new Xomw_parser_ctx(), pbfr.Init(Bry_.new_u8(src)));
String actl_html = pbfr.Rslt().To_str_and_clear();
Gftest.Eq__str(expd_html, actl_html);
}
}

View File

@@ -0,0 +1,27 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
// Output-type ids for the parser.
public class Xomw_output_type {
public static final byte Tid__html = 1; // like parse()
public static final byte Tid__wiki = 2; // like preSaveTransform()
public static final byte Tid__preprocess = 3; // like preprocess()
public static final byte Tid__msg = 3; // NOTE: deliberately the same value as Tid__preprocess
public static final byte Tid__plain = 4; // like extractSections() - portions of the original are returned unchanged.
}

View File

@@ -0,0 +1,299 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import gplx.core.btries.*; import gplx.core.net.*;
import gplx.xowa.mediawiki.includes.parsers.prepros.*; import gplx.xowa.mediawiki.includes.parsers.headings.*;
import gplx.xowa.mediawiki.includes.parsers.quotes.*; import gplx.xowa.mediawiki.includes.parsers.tables.*; import gplx.xowa.mediawiki.includes.parsers.hrs.*; import gplx.xowa.mediawiki.includes.parsers.nbsps.*;
import gplx.xowa.mediawiki.includes.parsers.lnkes.*; import gplx.xowa.mediawiki.includes.parsers.lnkis.*; import gplx.xowa.mediawiki.includes.parsers.magiclinks.*; import gplx.xowa.mediawiki.includes.parsers.doubleunders.*;
import gplx.xowa.mediawiki.includes.utls.*; import gplx.xowa.mediawiki.includes.linkers.*;
import gplx.xowa.mediawiki.includes.htmls.*;
public class Xomw_parser {
private final Xomw_parser_ctx pctx = new Xomw_parser_ctx();
private final Xomw_table_wkr table_wkr;
private final Xomw_hr_wkr hr_wkr = new Xomw_hr_wkr();
private final Xomw_lnke_wkr lnke_wkr;
private final Xomw_nbsp_wkr nbsp_wkr = new Xomw_nbsp_wkr();
private final Xomw_block_level_pass block_wkr = new Xomw_block_level_pass();
private final Xomw_heading_wkr heading_wkr = new Xomw_heading_wkr();
private final Xomw_magiclinks_wkr magiclinks_wkr;
private final Xomw_doubleunder_wkr doubleunder_wkr = new Xomw_doubleunder_wkr();
private final Xomw_link_renderer link_renderer;
private final Xomw_link_holders holders;
private final Xomw_heading_cbk__html heading_wkr_cbk;
private final Btrie_slim_mgr protocols_trie;
private final Xomw_doubleunder_data doubleunder_data = new Xomw_doubleunder_data();
private static Xomw_regex_space regex_space;
private static Xomw_regex_boundary regex_boundary;
private static Xomw_regex_url regex_url;
private final Btrie_rv trv = new Btrie_rv();
private int marker_index = 0;
// private final Xomw_prepro_wkr prepro_wkr = new Xomw_prepro_wkr();
public Xomw_parser_env Env() {return env;} private final Xomw_parser_env env = new Xomw_parser_env();
public Xomw_parser_options Options() {return options;} private final Xomw_parser_options options = new Xomw_parser_options();
public Xomw_strip_state Strip_state() {return strip_state;} private final Xomw_strip_state strip_state = new Xomw_strip_state();
public Xomw_sanitizer Sanitizer() {return sanitizer;} private final Xomw_sanitizer sanitizer = new Xomw_sanitizer();
public Xomw_linker Linker() {return linker;} private final Xomw_linker linker;
public Bry_bfr Tmp() {return tmp;} private final Bry_bfr tmp = Bry_bfr_.New();
public Xomw_quote_wkr Quote_wkr() {return quote_wkr;} private final Xomw_quote_wkr quote_wkr;
public Xomw_lnki_wkr Lnki_wkr() {return lnki_wkr;} private final Xomw_lnki_wkr lnki_wkr;
public boolean Output_type__wiki() {return output_type__wiki;} private final boolean output_type__wiki = false;
// Builds the parser and all of its workers.
// The static regex helpers (and the static Atr__rel / Get_external_link_rel byte arrays) are shared by
// all instances and are created by whichever constructor call runs first.
public Xomw_parser() {
// FIX: the null-check now happens while holding the lock.
// Previously it was done outside the lock and not repeated inside; since regex_space is assigned first,
// a 2nd thread could see it as non-null, skip the block, and go on to use a still-null regex_boundary / regex_url
synchronized (Type_adp_.ClassOf_obj(this)) {
if (regex_space == null) {
regex_space = new Xomw_regex_space();
regex_boundary = new Xomw_regex_boundary(regex_space);
regex_url = new Xomw_regex_url(regex_space);
Atr__rel = Bry_.new_a7("rel");
Get_external_link_rel = Bry_.new_a7("nofollow");
}
}
// NOTE: order matters below; later workers receive objects created by earlier lines (and several receive "this")
this.link_renderer = new Xomw_link_renderer(sanitizer);
this.linker = new Xomw_linker(link_renderer);
this.protocols_trie = Xomw_parser.Protocols__dflt();
this.holders = new Xomw_link_holders(link_renderer, tmp);
this.table_wkr = new Xomw_table_wkr(this);
this.quote_wkr = new Xomw_quote_wkr(this);
this.lnke_wkr = new Xomw_lnke_wkr(this);
this.lnki_wkr = new Xomw_lnki_wkr(this, holders, link_renderer, protocols_trie);
this.heading_wkr_cbk = new Xomw_heading_cbk__html();
this.magiclinks_wkr = new Xomw_magiclinks_wkr(this, sanitizer, linker, regex_boundary, regex_url);
}
// Initializes the wiki-dependent workers by calling each one's Init_by_wiki.
// The linker receives the link-trail trie of the wiki's language; the doubleunder worker receives the wiki's language.
public void Init_by_wiki(Xowe_wiki wiki) {
linker.Init_by_wiki(env, wiki.Lang().Lnki_trail_mgr().Trie());
lnke_wkr.Init_by_wiki(protocols_trie, regex_url, regex_space);
lnki_wkr.Init_by_wiki(env, wiki);
doubleunder_wkr.Init_by_wiki(doubleunder_data, wiki.Lang());
magiclinks_wkr.Init_by_wiki();
}
// Passes the page title on to the parser context (pctx), which is handed to every worker pass.
public void Init_by_page(Xoa_ttl ttl) {
pctx.Init_by_page(ttl);
}
// First parsing stage: loads text into pbfr, then runs the worker passes over pbfr in a fixed order
// (tables, hrs, double-underscores, headings, internal links, quotes, external links, magic links).
// The commented-out PHP marks upstream steps that are not ported here (preprocessing, sanitizing, hooks, formatHeadings).
public void Internal_parse(Xomw_parser_bfr pbfr, byte[] text) {
pbfr.Init(text);
// $origText = text;
// MW.HOOK:ParserBeforeInternalParse
// if ($frame) {
// use frame depth to infer how include/noinclude tags should be handled
// depth=0 means this is the top-level document; otherwise it's an included document
// boolean for_inclusion = false;
// if (!$frame->depth) {
// $flag = 0;
// } else {
// $flag = Parser::PTD_FOR_INCLUSION;
// }
// text = prepro_wkr.Preprocess_to_xml(text, for_inclusion);
// text = $frame->expand($dom);
// } else {
// // if $frame is not provided, then use old-style replaceVariables
// text = $this->replaceVariables(text);
// }
// MW.HOOK:InternalParseBeforeSanitize
// text = Sanitizer::removeHTMLtags(
// text,
// [ &$this, 'attributeStripCallback' ],
// false,
// array_keys($this->mTransparentTagHooks),
// [],
// [ &$this, 'addTrackingCategory' ]
// );
// MW.HOOK:InternalParseBeforeLinks
// Tables need to come after variable replacement for things to work
// properly; putting them before other transformations should keep
// exciting things like link expansions from showing up in surprising
// places.
table_wkr.Do_table_stuff(pctx, pbfr);
hr_wkr.Replace_hrs(pctx, pbfr);
doubleunder_wkr.Do_double_underscore(pctx, pbfr); // DONE: DATE:2017-01-27
heading_wkr.Do_headings(pctx, pbfr, heading_wkr_cbk);
lnki_wkr.Replace_internal_links(pctx, pbfr);
quote_wkr.Do_all_quotes(pctx, pbfr);
lnke_wkr.Replace_external_links(pctx, pbfr);
// replaceInternalLinks may sometimes leave behind
// absolute URLs, which have to be masked to hide them from replaceExternalLinks
// (the masking marker is no longer needed once external links are done, so it is removed here)
Xomw_parser_bfr_.Replace(pbfr, Bry__marker__noparse, Bry_.Empty);
magiclinks_wkr.Do_magic_links(pctx, pbfr);
// $text = $this->formatHeadings($text, $origText, $isMain);
}
// Second parsing stage, run over the text already held in pbfr: unstrip, nbsp clean-up, block levels,
// link-holder replacement, unstrip again, and char-reference normalization.
// line_start is forwarded to Do_block_levels; is_main is not read by this method.
// The commented-out PHP marks upstream steps that are not ported here (language conversion, transparent tags, tidy).
public void Internal_parse_half_parsed(Xomw_parser_bfr pbfr, boolean is_main, boolean line_start) {
strip_state.Unstrip_general(pbfr);
// MW.HOOK:ParserAfterUnstrip
// Clean up special characters, only run once, next-to-last before doBlockLevels
nbsp_wkr.Do_nbsp(pctx, pbfr);
block_wkr.Do_block_levels(pctx, pbfr, line_start);
lnki_wkr.Replace_link_holders(pctx, pbfr);
// The input doesn't get language converted if
// a) It's disabled
// b) Content isn't converted
// c) It's a conversion table
// d) it is an interface message (which is in the user language)
// if ( !( $this->mOptions->getDisableContentConversion()
// || isset( $this->mDoubleUnderscores['nocontentconvert'] ) )
// ) {
// if ( !$this->mOptions->getInterfaceMessage() ) {
// // The position of the convert() call should not be changed. it
// // assumes that the links are all replaced and the only thing left
// // is the <nowiki> mark.
// $text = $this->getConverterLanguage()->convert( $text );
// }
// }
strip_state.Unstrip_nowiki(pbfr);
// MW.HOOK:ParserBeforeTidy
// $text = $this->replaceTransparentTags( $text );
strip_state.Unstrip_general(pbfr);
sanitizer.Normalize_char_references(pbfr);
// if ( MWTidy::isEnabled() ) {
// if ( $this->mOptions->getTidy() ) {
// $text = MWTidy::tidy( $text );
// }
// }
// else {
// attempt to sanitize at least some nesting problems
// (T4702 and quite a few others)
// $tidyregs = [
// // ''Something [http://www.cool.com cool''] -->
// // <i>Something</i><a href="http://www.cool.com"..><i>cool></i></a>
// '/(<([bi])>)(<([bi])>)?([^<]*)(<\/?a[^<]*>)([^<]*)(<\/\\4>)?(<\/\\2>)/' =>
// '\\1\\3\\5\\8\\9\\6\\1\\3\\7\\8\\9',
// // fix up an anchor inside another anchor, only
// // at least for a single single nested link (T5695)
// '/(<a[^>]+>)([^<]*)(<a[^>]+>[^<]*)<\/a>(.*)<\/a>/' =>
// '\\1\\2</a>\\3</a>\\1\\4</a>',
// // fix div inside inline elements- doBlockLevels won't wrap a line which
// // contains a div, so fix it up here; replace
// // div with escaped text
// '/(<([aib]) [^>]+>)([^<]*)(<div([^>]*)>)(.*)(<\/div>)([^<]*)(<\/\\2>)/' =>
// '\\1\\3&lt;div\\5&gt;\\6&lt;/div&gt;\\8\\9',
// // remove empty italic or bold tag pairs, some
// // introduced by rules above
// '/<([bi])><\/\\1>/' => '',
// ];
// $text = preg_replace(
// array_keys( $tidyregs ),
// array_values( $tidyregs ),
// $text );
// }
// MW.HOOK:ParserAfterTidy
}
// Prefixes every url protocol found in src[src_bgn..src_end) with the strip-marker prefix + "NOPARSE".
// PORTED:preg_replace( '/\b((?i)' . $this->mUrlProtocols . ')/', self::MARKER_PREFIX . "NOPARSE$1", $text )
//
// trg:     buffer to append to; if null, a buffer is created on the first match and the result is returned as byte[]
// src:     source bytes
// src_bgn: start of range (inclusive)
// src_end: end of range (exclusive)
// returns: trg == null -> armored bytes; or src itself (whole-array range) / a copy of the range when nothing matched
//          trg != null -> always null; the armored (or unchanged) range is appended to trg
//
// NOTE: the "\b" word-boundary of the PHP regex is not checked here; any position matching protocols_trie is armored.
// NOTE: the bytes emitted for the protocol are the value stored in protocols_trie, not the bytes of src.
public byte[] Armor_links(Bry_bfr trg, byte[] src, int src_bgn, int src_end) {
	int cur = src_bgn;
	int prv = cur;		// start of text not yet copied to trg
	boolean dirty = false;
	boolean called_by_bry = trg == null;
	while (true) {
		// exit if EOS
		if (cur >= src_end) {
			// if dirty, add rest of String
			if (dirty)
				trg.Add_mid(src, prv, src_end);
			break;
		}

		// check if cur matches protocol
		Object protocol_obj = protocols_trie.Match_at(trv, src, cur, src_end);

		// no match; continue
		if (protocol_obj == null) {
			cur++;
			continue;
		}

		// match; add to bfr
		dirty = true;
		byte[] protocol_bry = (byte[])protocol_obj;

		// FIX: create the buffer only once; creating it on every match discarded all earlier output
		if (trg == null) trg = Bry_bfr_.New();

		// FIX: copy the unmatched text between the previous match and this one; it was previously dropped
		trg.Add_mid(src, prv, cur);

		trg.Add_bry_many(Xomw_strip_state.Bry__marker__bgn, Bry__noparse, protocol_bry);
		cur += protocol_bry.length;
		prv = cur;
	}

	if (called_by_bry) {
		if (dirty)
			return trg.To_bry_and_clear();
		// nothing armored: avoid a copy when the range is the whole array
		if (src_bgn == 0 && src_end == src.length)
			return src;
		return Bry_.Mid(src, src_bgn, src_end);
	}

	// caller supplied trg: when nothing matched, the range still has to be appended as-is
	if (!dirty)
		trg.Add_mid(src, src_bgn, src_end);
	return null;
}
// Registers text with strip_state under a freshly generated marker and returns that marker.
// The marker is built as: Xomw_strip_state.Bry__marker__bgn + "-item-" + marker_index + Xomw_strip_state.Bry__marker__end
// marker_index is advanced by one per call.
public byte[] Insert_strip_item(byte[] text) {
	// assemble marker in the shared tmp bfr
	tmp.Add_bry_many(Xomw_strip_state.Bry__marker__bgn, Bry__strip_state_item);
	tmp.Add_int_variable(marker_index++);	// use current index, then advance it for the next item
	tmp.Add(Xomw_strip_state.Bry__marker__end);
	byte[] rv = tmp.To_bry_and_clear();

	// map marker -> text
	strip_state.Add_general(rv, text);
	return rv;
}
// Resets atrs and fills it with the attributes for an external link; currently only "rel".
// Returns the same atrs instance that was passed in.
public Xomw_atr_mgr Get_external_link_attribs(Xomw_atr_mgr atrs) {
	atrs.Clear();

	// XO.MW.UNSUPPORTED: XO will assume target is blank; MW will set target of "_blank", "_self", etc. depending on global opt
	// $target = $this->mOptions->getExternalLinkTarget();
	atrs.Add(Atr__rel, Get_external_link_rel);
	return atrs;
}
// XO.MW.UNSUPPORTED: XO will always assume "nofollow"; MW will return "nofollow" if (a) ns is in ns-exception list or (b) domain is in domain-exception list;
// if ($wgNoFollowLinks && !in_array($ns, $wgNoFollowNsExceptions) && !wfMatchesDomainList($url, $wgNoFollowDomainExceptions)
// value added under Atr__rel by Get_external_link_attribs; no assignment is visible in this part of the file
public byte[] Get_external_link_rel;
// attribute key used by Get_external_link_attribs; not final and not assigned here -- presumably initialized elsewhere in the class (TODO confirm)
private static byte[] Atr__rel;
// marker fragments: "-item-" is used by Insert_strip_item; "NOPARSE" is used by Armor_links
private static final byte[] Bry__strip_state_item = Bry_.new_a7("-item-"), Bry__noparse = Bry_.new_a7("NOPARSE");
// precomputed Xomw_strip_state.Bry__marker__bgn + "NOPARSE"
private static final byte[] Bry__marker__noparse = Bry_.Add(Xomw_strip_state.Bry__marker__bgn, Bry__noparse);
// Builds a new trie (created via Btrie_slim_mgr.ci_a7) holding every protocol from Gfo_protocol_itm.Ary() plus the protocol-relative "//".
// Each entry maps the protocol bytes to themselves.
public static Btrie_slim_mgr Protocols__dflt() {
	Btrie_slim_mgr trie = Btrie_slim_mgr.ci_a7();

	// register known protocols
	Gfo_protocol_itm[] itms = Gfo_protocol_itm.Ary();
	int itms_len = itms.length;
	for (int i = 0; i < itms_len; i++) {
		byte[] protocol = itms[i].Text_bry(); // EX: "https://"
		trie.Add_obj(protocol, protocol);
	}

	// register protocol-relative
	byte[] relative = Bry_.new_a7("//");
	trie.Add_obj(relative, relative); // REF.MW: "$this->mUrlProtocols = wfUrlProtocols();"; "wfUrlProtocols( $includeProtocolRelative = true )"
	return trie;
}
}

View File

@@ -0,0 +1,77 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import org.junit.*;
public class Xomw_parser__tst {
	private final Xomw_parser__fxt fxt = new Xomw_parser__fxt();
	// End-to-end check: one wikitext sample covering headings, paragraphs, hr, table, italics, TOC, external / internal / free links and nbsp handling
	@Test public void Basic() {
		String wikitext = String_.Concat_lines_nl_skip_last
		( "== heading_1 =="
		, "para_1"
		, "== heading_2 =="
		, "para_2"
		, "-----"
		, "{|"
		, "|-"
		, "|a"
		, "|}"
		, "''italics''"
		, "__TOC__"
		, "[https://a.org b]"
		, "[[A|abc]]"
		, "https://c.org"
		, "a »b« &#160;!important c"
		);
		String expd_html = String_.Concat_lines_nl_skip_last
		( "<h2> heading_1 </h2>"
		, "<p>para_1"
		, "</p>"
		, "<h2> heading_2 </h2>"
		, "<p>para_2"
		, "</p>"
		, "<hr />"
		, "<table>"
		, ""
		, "<tr>"
		, "<td>a"
		, "</td></tr></table>"
		, "<p><i>italics</i>"
		, "<!--MWTOC-->"
		, "<a rel=\"nofollow\" class=\"external text\" href=\"https://a.org\">b</a>"
		, "<a href=\"/wiki/A\" title=\"A\">abc</a>"
		, "<a rel=\"nofollow\" class=\"external free\" href=\"https://c.org\">https://c.org</a>"
		, "a&#160;»b«&#160; !important c"
		, "</p>"
		);
		fxt.Test__parse(wikitext, expd_html);
	}
}
// Test fixture: owns one parser bound to a test wiki and the page "Page_1", plus the parser buffer it writes into.
class Xomw_parser__fxt {
	private final Xomw_parser mgr = new Xomw_parser();
	private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr();
	public Xomw_parser__fxt() {
		// build test app + wiki, then bind the parser to both the wiki and a page title
		Xowe_wiki wiki = Xoa_app_fxt.Make__wiki__edit(Xoa_app_fxt.Make__app__edit());
		mgr.Init_by_wiki(wiki);
		mgr.Init_by_page(wiki.Ttl_parse(Bry_.new_a7("Page_1")));
	}
	// Runs both parse stages over src_str and compares the result line-by-line with expd; src_str is passed as the 3rd arg of Tfds.Eq_str_lines
	public void Test__parse(String src_str, String expd) {
		mgr.Internal_parse(pbfr, Bry_.new_u8(src_str));
		mgr.Internal_parse_half_parsed(pbfr, true, true);
		String actl = pbfr.Rslt().To_str_and_clear();
		Tfds.Eq_str_lines(expd, actl, src_str);
	}
}

View File

@@ -0,0 +1,48 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
public class Xomw_parser_bfr { // manages 2 bfrs to eliminate multiple calls to new memory allocations ("return bfr.To_bry_and_clear()")
	private final Bry_bfr bfr_1 = Bry_bfr_.New(), bfr_2 = Bry_bfr_.New();
	// src is the bfr read by the current step; trg is the bfr written to; Switch() exchanges them
	private Bry_bfr src = bfr_1, trg = bfr_2;
	public Xomw_parser_bfr() {}
	public Bry_bfr Src() {
		return src;
	}
	public Bry_bfr Trg() {
		return trg;
	}
	// result is whatever is currently in src
	public Bry_bfr Rslt() {
		return src;
	}
	// Loads text into src and empties trg; returns this for chaining
	public Xomw_parser_bfr Init(byte[] text) {
		// resize each bfr once by guessing that html_len = text_len * 2
		int capacity = text.length * 2;
		src.Resize(capacity);
		trg.Resize(capacity);

		// clear and add
		src.Clear();
		trg.Clear();
		src.Add(text);
		return this;
	}
	// Exchanges src and trg, then clears the new trg (the former src)
	public void Switch() {
		Bry_bfr old_src = src;
		this.src = trg;
		this.trg = old_src;
		old_src.Clear();
	}
}

View File

@@ -0,0 +1,69 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
// Static helpers that operate on a Xomw_parser_bfr
public class Xomw_parser_bfr_ {
	// Replaces every occurrence of find with repl in pbfr.Src().
	// If at least one replacement was made, the new text is written to pbfr.Trg() and the bfrs are switched;
	// otherwise pbfr is left untouched.
	public static void Replace(Xomw_parser_bfr pbfr, byte[] find, byte[] repl) {
		// XO.PBFR
		Bry_bfr src_bfr = pbfr.Src();
		byte[] src = src_bfr.Bfr();		// backing array; only [0, src_bfr.Len()) is valid text
		int src_bgn = 0;
		int src_end = src_bfr.Len();
		Bry_bfr bfr = pbfr.Trg();
		if (Replace(bfr, Bool_.N, src, src_bgn, src_end, find, repl) != null)
			pbfr.Switch();
	}
	// Core replace over src[src_bgn..src_end).
	// bfr:      output bfr; if null, one is created on the first match and the result is returned as byte[]
	// lone_bfr: only used when bfr is non-null and nothing matched; if true, the unchanged range is still appended to bfr
	// returns:  bfr == null -> replaced bytes; or src itself (whole-array range) / a copy of the range when nothing matched
	//           bfr != null -> Bry_.Empty if something was replaced (output is in bfr); null if nothing was replaced
	private static byte[] Replace(Bry_bfr bfr, boolean lone_bfr, byte[] src, int src_bgn, int src_end, byte[] find, byte[] repl) {
		boolean dirty = false;
		int cur = src_bgn;
		boolean called_by_bry = bfr == null;
		int find_len = find.length;

		// FIX: an empty find never advances cur and would loop forever; treat it as "no match"
		while (find_len > 0) {
			int find_bgn = Bry_find_.Find_fwd(src, find, cur);

			// FIX: src may be a backing array longer than src_end; ignore matches that extend past src_end
			if (find_bgn == Bry_find_.Not_found || find_bgn + find_len > src_end)
				break;

			// FIX: create the bfr only once; creating it on every match discarded earlier output
			if (bfr == null) bfr = Bry_bfr_.New();

			// add text before match, then the replacement (FIX: repl was never added)
			bfr.Add_mid(src, cur, find_bgn);
			bfr.Add(repl);

			// FIX: resume after the match; "cur += find.length" ignored where the match was found
			cur = find_bgn + find_len;
			dirty = true;
		}

		if (dirty) {
			// add text after last match
			bfr.Add_mid(src, cur, src_end);
			return called_by_bry ? bfr.To_bry_and_clear() : Bry_.Empty;
		}

		// nothing replaced
		if (called_by_bry) {
			// avoid a copy when the range is the whole array
			if (src_bgn == 0 && src_end == src.length)
				return src;
			return Bry_.Mid(src, src_bgn, src_end);
		}
		if (lone_bfr)
			bfr.Add_mid(src, src_bgn, src_end);
		return null;
	}
}

View File

@@ -0,0 +1,32 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import gplx.xowa.mediawiki.includes.parsers.lnkis.*;
// Per-parse context: holds the current page title plus reusable scratch objects for the lnki / linker code
public class Xomw_parser_ctx {
	public static final int Pos__bos = -1;

	private Xoa_ttl page_title;

	// scratch objects; allocated once per ctx
	public Xomw_image_params Lnki_wkr__make_image__img_params = new Xomw_image_params();
	public byte[][] Lnki_wkr__make_image__match_magic_word = new byte[2][];
	public int[] Lnki_wkr__make_image__img_size = new int[2];
	public Xomw_params_mto Linker__makeImageLink__prms = new Xomw_params_mto();

	// title set by the last Init_by_page call; null before that
	public Xoa_ttl Page_title() {
		return page_title;
	}
	public void Init_by_page(Xoa_ttl page_title) {
		this.page_title = page_title;
	}
}

View File

@@ -0,0 +1,34 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import gplx.xowa.mediawiki.includes.filerepo.file.*; import gplx.xowa.mediawiki.includes.media.*;
// Environment settings and shared services handed to the parser
public class Xomw_parser_env {
	// settings; public and mutable
	public byte[] Lang__align_end = Bry_.new_a7("right");
	public int User__default__thumbsize = 220;
	public int Global__wgSVGMaxSize = 5120;
	public double Global__wgThumbUpright = .75d;
	public int[] Global__wgThumbLimits = new int[] {120, 150, 180, 200, 250, 300};

	// services
	private final Xomw_MagicWordMgr magic_word_mgr = new Xomw_MagicWordMgr();
	private final Xomw_message_mgr message_mgr = new Xomw_message_mgr();
	private Xomw_file_finder file_finder = new Xomw_file_finder__noop();	// replaceable via File_finder_
	private final Xomw_MediaHandlerFactory mediaHandlerFactory = new Xomw_MediaHandlerFactory();

	public Xomw_MagicWordMgr Magic_word_mgr() {
		return magic_word_mgr;
	}
	public Xomw_message_mgr Message_mgr() {
		return message_mgr;
	}
	public Xomw_file_finder File_finder() {
		return file_finder;
	}
	public Xomw_MediaHandlerFactory MediaHandlerFactory() {
		return mediaHandlerFactory;
	}
	// Swaps in a different file finder; returns this for chaining
	public Xomw_parser_env File_finder_(Xomw_file_finder v) {
		file_finder = v;
		return this;
	}
}

View File

@@ -0,0 +1,933 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
public class Xomw_parser_options {
public Xomw_parser_options() {
this.mThumbSize = 220;
}
// /**
// * Interlanguage links are removed and returned in an array
// */
// private $mInterwikiMagic;
//
// /**
// * Allow external images inline?
// */
// private $mAllowExternalImages;
//
// /**
// * If not, any exception?
// */
// private $mAllowExternalImagesFrom;
//
// /**
// * If not or it doesn't match, should we check an on-wiki whitelist?
// */
// private $mEnableImageWhitelist;
//
// /**
// * Date format index
// */
// private $mDateFormat = null;
//
// /**
// * Create "edit section" links?
// */
// private $mEditSection = true;
//
// /**
// * Allow inclusion of special pages?
// */
// private $mAllowSpecialInclusion;
//
// /**
// * Use tidy to cleanup output HTML?
// */
// private $mTidy = false;
//
// /**
// * Which lang to call for PLURAL and GRAMMAR
// */
// private $mInterfaceMessage = false;
//
// /**
// * Overrides $mInterfaceMessage with arbitrary language
// */
// private $mTargetLanguage = null;
//
// /**
// * Maximum size of template expansions, in bytes
// */
// private $mMaxIncludeSize;
//
// /**
// * Maximum number of nodes touched by PPFrame::expand()
// */
// private $mMaxPPNodeCount;
//
// /**
// * Maximum number of nodes generated by Preprocessor::preprocessToObj()
// */
// private $mMaxGeneratedPPNodeCount;
//
// /**
// * Maximum recursion depth in PPFrame::expand()
// */
// private $mMaxPPExpandDepth;
//
// /**
// * Maximum recursion depth for templates within templates
// */
// private $mMaxTemplateDepth;
//
// /**
// * Maximum number of calls per parse to expensive parser functions
// */
// private $mExpensiveParserFunctionLimit;
//
// /**
// * Remove HTML comments. ONLY APPLIES TO PREPROCESS OPERATIONS
// */
// private $mRemoveComments = true;
//
// /**
// * @var callable Callback for current revision fetching; first argument to call_user_func().
// */
// private $mCurrentRevisionCallback =
// [ 'Parser', 'statelessFetchRevision' ];
//
// /**
// * @var callable Callback for template fetching; first argument to call_user_func().
// */
// private $mTemplateCallback =
// [ 'Parser', 'statelessFetchTemplate' ];
//
// /**
// * @var callable|null Callback to generate a guess for {{REVISIONID}}
// */
// private $mSpeculativeRevIdCallback;
//
// /**
// * Enable limit report in an HTML comment on output
// */
// private $mEnableLimitReport = false;
//
// /**
// * Timestamp used for {{CURRENTDAY}} etc.
// */
// private $mTimestamp;
//
// /**
// * Target attribute for external links
// */
// private $mExternalLinkTarget;
//
// /**
// * Clean up signature texts?
// * @see Parser::cleanSig
// */
// private $mCleanSignatures;
//
// /**
// * Transform wiki markup when saving the page?
// */
// private $mPreSaveTransform = true;
//
// /**
// * Whether content conversion should be disabled
// */
// private $mDisableContentConversion;
//
// /**
// * Whether title conversion should be disabled
// */
// private $mDisableTitleConversion;
//
// /**
// * Automatically number headings?
// */
// private $mNumberHeadings;
/**
* Thumb size preferred by the user.
*/
private int mThumbSize;
// /**
// * Maximum article size of an article to be marked as "stub"
// */
// private $mStubThreshold;
//
// /**
// * Language Object of the User language.
// */
// private $mUserLang;
//
// /**
// * @var User
// * Stored user Object
// */
// private $mUser;
//
// /**
// * Parsing the page for a "preview" operation?
// */
// private $mIsPreview = false;
//
// /**
// * Parsing the page for a "preview" operation on a single section?
// */
// private $mIsSectionPreview = false;
//
// /**
// * Parsing the printable version of the page?
// */
// private $mIsPrintable = false;
//
// /**
// * Extra key that should be present in the caching key.
// */
// private $mExtraKey = '';
//
// /**
// * Are magic ISBN links enabled?
// */
// private $mMagicISBNLinks = true;
//
// /**
// * Are magic PMID links enabled?
// */
// private $mMagicPMIDLinks = true;
//
// /**
// * Are magic RFC links enabled?
// */
// private $mMagicRFCLinks = true;
//
// /**
// * Function to be called when an option is accessed.
// */
// private $onAccessCallback = null;
//
// /**
// * If the page being parsed is a redirect, this should hold the redirect
// * target.
// * @var Title|null
// */
// private $redirectTarget = null;
//
// public function getInterwikiMagic() {
// return this.mInterwikiMagic;
// }
//
// public function getAllowExternalImages() {
// return this.mAllowExternalImages;
// }
//
// public function getAllowExternalImagesFrom() {
// return this.mAllowExternalImagesFrom;
// }
//
// public function getEnableImageWhitelist() {
// return this.mEnableImageWhitelist;
// }
//
// public function getEditSection() {
// return this.mEditSection;
// }
//
// public function getNumberHeadings() {
// this.optionUsed( 'numberheadings' );
//
// return this.mNumberHeadings;
// }
//
// public function getAllowSpecialInclusion() {
// return this.mAllowSpecialInclusion;
// }
//
// public function getTidy() {
// return this.mTidy;
// }
//
// public function getInterfaceMessage() {
// return this.mInterfaceMessage;
// }
//
// public function getTargetLanguage() {
// return this.mTargetLanguage;
// }
//
// public function getMaxIncludeSize() {
// return this.mMaxIncludeSize;
// }
//
// public function getMaxPPNodeCount() {
// return this.mMaxPPNodeCount;
// }
//
// public function getMaxGeneratedPPNodeCount() {
// return this.mMaxGeneratedPPNodeCount;
// }
//
// public function getMaxPPExpandDepth() {
// return this.mMaxPPExpandDepth;
// }
//
// public function getMaxTemplateDepth() {
// return this.mMaxTemplateDepth;
// }
//
// /* @since 1.20 */
// public function getExpensiveParserFunctionLimit() {
// return this.mExpensiveParserFunctionLimit;
// }
//
// public function getRemoveComments() {
// return this.mRemoveComments;
// }
//
// /* @since 1.24 */
// public function getCurrentRevisionCallback() {
// return this.mCurrentRevisionCallback;
// }
//
// public function getTemplateCallback() {
// return this.mTemplateCallback;
// }
//
// /** @since 1.28 */
// public function getSpeculativeRevIdCallback() {
// return this.mSpeculativeRevIdCallback;
// }
//
// public function getEnableLimitReport() {
// return this.mEnableLimitReport;
// }
//
// public function getCleanSignatures() {
// return this.mCleanSignatures;
// }
//
// public function getExternalLinkTarget() {
// return this.mExternalLinkTarget;
// }
//
// public function getDisableContentConversion() {
// return this.mDisableContentConversion;
// }
//
// public function getDisableTitleConversion() {
// return this.mDisableTitleConversion;
// }
public int getThumbSize() {
// this.optionUsed( 'thumbsize' );
return this.mThumbSize;
}
// public function getStubThreshold() {
// this.optionUsed( 'stubthreshold' );
//
// return this.mStubThreshold;
// }
//
// public function getIsPreview() {
// return this.mIsPreview;
// }
//
// public function getIsSectionPreview() {
// return this.mIsSectionPreview;
// }
//
// public function getIsPrintable() {
// this.optionUsed( 'printable' );
//
// return this.mIsPrintable;
// }
//
// public function getUser() {
// return this.mUser;
// }
//
// public function getPreSaveTransform() {
// return this.mPreSaveTransform;
// }
//
// public function getDateFormat() {
// this.optionUsed( 'dateformat' );
// if ( !isset( this.mDateFormat ) ) {
// this.mDateFormat = this.mUser->getDatePreference();
// }
// return this.mDateFormat;
// }
//
// public function getTimestamp() {
// if ( !isset( this.mTimestamp ) ) {
// this.mTimestamp = wfTimestampNow();
// }
// return this.mTimestamp;
// }
//
// /**
// * Get the user language used by the parser for this page and split the parser cache.
// *
// * @warning: Calling this causes the parser cache to be fragmented by user language!
// * To avoid cache fragmentation, output should not depend on the user language.
// * Use Parser::getFunctionLang() or Parser::getTargetLanguage() instead!
// *
// * @note This function will trigger a cache fragmentation by recording the
// * 'userlang' option, see optionUsed(). This is done to avoid cache pollution
// * when the page is rendered based on the language of the user.
// *
// * @note When saving, this will return the default language instead of the user's.
// * {{int: }} uses this which used to produce inconsistent link tables (bug 14404).
// *
// * @return Language
// * @since 1.19
// */
// public function getUserLangObj() {
// this.optionUsed( 'userlang' );
// return this.mUserLang;
// }
//
// /**
// * Same as getUserLangObj() but returns a String instead.
// *
// * @warning: Calling this causes the parser cache to be fragmented by user language!
// * To avoid cache fragmentation, output should not depend on the user language.
// * Use Parser::getFunctionLang() or Parser::getTargetLanguage() instead!
// *
// * @see getUserLangObj()
// *
// * @return String Language code
// * @since 1.17
// */
// public function getUserLang() {
// return this.getUserLangObj()->getCode();
// }
//
// /**
// * @since 1.28
// * @return boolean
// */
// public function getMagicISBNLinks() {
// return this.mMagicISBNLinks;
// }
//
// /**
// * @since 1.28
// * @return boolean
// */
// public function getMagicPMIDLinks() {
// return this.mMagicPMIDLinks;
// }
// /**
// * @since 1.28
// * @return boolean
// */
// public function getMagicRFCLinks() {
// return this.mMagicRFCLinks;
// }
// public function setInterwikiMagic( $x ) {
// return wfSetVar( this.mInterwikiMagic, $x );
// }
//
// public function setAllowExternalImages( $x ) {
// return wfSetVar( this.mAllowExternalImages, $x );
// }
//
// public function setAllowExternalImagesFrom( $x ) {
// return wfSetVar( this.mAllowExternalImagesFrom, $x );
// }
//
// public function setEnableImageWhitelist( $x ) {
// return wfSetVar( this.mEnableImageWhitelist, $x );
// }
//
// public function setDateFormat( $x ) {
// return wfSetVar( this.mDateFormat, $x );
// }
//
// public function setEditSection( $x ) {
// return wfSetVar( this.mEditSection, $x );
// }
//
// public function setNumberHeadings( $x ) {
// return wfSetVar( this.mNumberHeadings, $x );
// }
//
// public function setAllowSpecialInclusion( $x ) {
// return wfSetVar( this.mAllowSpecialInclusion, $x );
// }
//
// public function setTidy( $x ) {
// return wfSetVar( this.mTidy, $x );
// }
//
// public function setInterfaceMessage( $x ) {
// return wfSetVar( this.mInterfaceMessage, $x );
// }
//
// public function setTargetLanguage( $x ) {
// return wfSetVar( this.mTargetLanguage, $x, true );
// }
//
// public function setMaxIncludeSize( $x ) {
// return wfSetVar( this.mMaxIncludeSize, $x );
// }
//
// public function setMaxPPNodeCount( $x ) {
// return wfSetVar( this.mMaxPPNodeCount, $x );
// }
//
// public function setMaxGeneratedPPNodeCount( $x ) {
// return wfSetVar( this.mMaxGeneratedPPNodeCount, $x );
// }
//
// public function setMaxTemplateDepth( $x ) {
// return wfSetVar( this.mMaxTemplateDepth, $x );
// }
//
// /* @since 1.20 */
// public function setExpensiveParserFunctionLimit( $x ) {
// return wfSetVar( this.mExpensiveParserFunctionLimit, $x );
// }
//
// public function setRemoveComments( $x ) {
// return wfSetVar( this.mRemoveComments, $x );
// }
//
// /* @since 1.24 */
// public function setCurrentRevisionCallback( $x ) {
// return wfSetVar( this.mCurrentRevisionCallback, $x );
// }
//
// /** @since 1.28 */
// public function setSpeculativeRevIdCallback( $x ) {
// return wfSetVar( this.mSpeculativeRevIdCallback, $x );
// }
//
// public function setTemplateCallback( $x ) {
// return wfSetVar( this.mTemplateCallback, $x );
// }
//
// public function enableLimitReport( $x = true ) {
// return wfSetVar( this.mEnableLimitReport, $x );
// }
//
// public function setTimestamp( $x ) {
// return wfSetVar( this.mTimestamp, $x );
// }
//
// public function setCleanSignatures( $x ) {
// return wfSetVar( this.mCleanSignatures, $x );
// }
//
// public function setExternalLinkTarget( $x ) {
// return wfSetVar( this.mExternalLinkTarget, $x );
// }
//
// public function disableContentConversion( $x = true ) {
// return wfSetVar( this.mDisableContentConversion, $x );
// }
//
// public function disableTitleConversion( $x = true ) {
// return wfSetVar( this.mDisableTitleConversion, $x );
// }
//
// public function setUserLang( $x ) {
// if ( is_string( $x ) ) {
// $x = Language::factory( $x );
// }
//
// return wfSetVar( this.mUserLang, $x );
// }
//
// public function setThumbSize( $x ) {
// return wfSetVar( this.mThumbSize, $x );
// }
//
// public function setStubThreshold( $x ) {
// return wfSetVar( this.mStubThreshold, $x );
// }
//
// public function setPreSaveTransform( $x ) {
// return wfSetVar( this.mPreSaveTransform, $x );
// }
//
// public function setIsPreview( $x ) {
// return wfSetVar( this.mIsPreview, $x );
// }
//
// public function setIsSectionPreview( $x ) {
// return wfSetVar( this.mIsSectionPreview, $x );
// }
//
// public function setIsPrintable( $x ) {
// return wfSetVar( this.mIsPrintable, $x );
// }
//
// /**
// * Set the redirect target.
// *
// * Note that setting or changing this does not *make* the page a redirect
// * or change its target, it merely records the information for reference
// * during the parse.
// *
// * @since 1.24
// * @param Title|null $title
// */
// function setRedirectTarget( $title ) {
// this.redirectTarget = $title;
// }
//
// /**
// * Get the previously-set redirect target.
// *
// * @since 1.24
// * @return Title|null
// */
// function getRedirectTarget() {
// return this.redirectTarget;
// }
//
// /**
// * Extra key that should be present in the parser cache key.
// * @param String $key
// */
// public function addExtraKey( $key ) {
// this.mExtraKey .= '!' . $key;
// }
//
// /**
// * Constructor
// * @param User $user
// * @param Language $lang
// */
// public function __construct( $user = null, $lang = null ) {
// if ( $user === null ) {
// global $wgUser;
// if ( $wgUser === null ) {
// $user = new User;
// } else {
// $user = $wgUser;
// }
// }
// if ( $lang === null ) {
// global $wgLang;
// if ( !StubObject::isRealObject( $wgLang ) ) {
// $wgLang->_unstub();
// }
// $lang = $wgLang;
// }
// this.initialiseFromUser( $user, $lang );
// }
//
// /**
// * Get a ParserOptions Object for an anonymous user
// * @since 1.27
// * @return ParserOptions
// */
// public static function newFromAnon() {
// global $wgContLang;
// return new ParserOptions( new User, $wgContLang );
// }
//
// /**
// * Get a ParserOptions Object from a given user.
// * Language will be taken from $wgLang.
// *
// * @param User $user
// * @return ParserOptions
// */
// public static function newFromUser( $user ) {
// return new ParserOptions( $user );
// }
//
// /**
// * Get a ParserOptions Object from a given user and language
// *
// * @param User $user
// * @param Language $lang
// * @return ParserOptions
// */
// public static function newFromUserAndLang( User $user, Language $lang ) {
// return new ParserOptions( $user, $lang );
// }
//
// /**
// * Get a ParserOptions Object from a IContextSource Object
// *
// * @param IContextSource $context
// * @return ParserOptions
// */
// public static function newFromContext( IContextSource $context ) {
// return new ParserOptions( $context->getUser(), $context->getLanguage() );
// }
//
// /**
// * Get user options
// *
// * @param User $user
// * @param Language $lang
// */
// private function initialiseFromUser( $user, $lang ) {
// global $wgInterwikiMagic, $wgAllowExternalImages,
// $wgAllowExternalImagesFrom, $wgEnableImageWhitelist, $wgAllowSpecialInclusion,
// $wgMaxArticleSize, $wgMaxPPNodeCount, $wgMaxTemplateDepth, $wgMaxPPExpandDepth,
// $wgCleanSignatures, $wgExternalLinkTarget, $wgExpensiveParserFunctionLimit,
// $wgMaxGeneratedPPNodeCount, $wgDisableLangConversion, $wgDisableTitleConversion,
// $wgEnableMagicLinks;
//
// // *UPDATE* ParserOptions::matches() if any of this changes as needed
// this.mInterwikiMagic = $wgInterwikiMagic;
// this.mAllowExternalImages = $wgAllowExternalImages;
// this.mAllowExternalImagesFrom = $wgAllowExternalImagesFrom;
// this.mEnableImageWhitelist = $wgEnableImageWhitelist;
// this.mAllowSpecialInclusion = $wgAllowSpecialInclusion;
// this.mMaxIncludeSize = $wgMaxArticleSize * 1024;
// this.mMaxPPNodeCount = $wgMaxPPNodeCount;
// this.mMaxGeneratedPPNodeCount = $wgMaxGeneratedPPNodeCount;
// this.mMaxPPExpandDepth = $wgMaxPPExpandDepth;
// this.mMaxTemplateDepth = $wgMaxTemplateDepth;
// this.mExpensiveParserFunctionLimit = $wgExpensiveParserFunctionLimit;
// this.mCleanSignatures = $wgCleanSignatures;
// this.mExternalLinkTarget = $wgExternalLinkTarget;
// this.mDisableContentConversion = $wgDisableLangConversion;
// this.mDisableTitleConversion = $wgDisableLangConversion || $wgDisableTitleConversion;
// this.mMagicISBNLinks = $wgEnableMagicLinks['ISBN'];
// this.mMagicPMIDLinks = $wgEnableMagicLinks['PMID'];
// this.mMagicRFCLinks = $wgEnableMagicLinks['RFC'];
//
// this.mUser = $user;
// this.mNumberHeadings = $user->getOption( 'numberheadings' );
// this.mThumbSize = $user->getOption( 'thumbsize' );
// this.mStubThreshold = $user->getStubThreshold();
// this.mUserLang = $lang;
// }
//
// /**
// * Check if these options match that of another options set
// *
// * This ignores report limit settings that only affect HTML comments
// *
// * @param ParserOptions $other
// * @return boolean
// * @since 1.25
// */
// public function matches( ParserOptions $other ) {
// $fields = array_keys( get_class_vars( __CLASS__ ) );
// $fields = array_diff( $fields, [
// 'mEnableLimitReport', // only effects HTML comments
// 'onAccessCallback', // only used for ParserOutput option tracking
// ] );
// foreach ( $fields as $field ) {
// if ( !is_object( this.$field ) && this.$field !== $other->$field ) {
// return false;
// }
// }
// // Check the Object and lazy-loaded options
// return (
// this.mUserLang->equals( $other->mUserLang ) &&
// this.getDateFormat() === $other->getDateFormat()
// );
// }
//
// /**
// * Registers a callback for tracking which ParserOptions which are used.
// * This is a private API with the parser.
// * @param callable $callback
// */
// public function registerWatcher( $callback ) {
// this.onAccessCallback = $callback;
// }
//
// /**
// * Called when an option is accessed.
// * Calls the watcher that was set using registerWatcher().
// * Typically, the watcher callback is ParserOutput::registerOption().
// * The information registered that way will be used by ParserCache::save().
// *
// * @param String $optionName Name of the option
// */
// public function optionUsed( $optionName ) {
// if ( this.onAccessCallback ) {
// call_user_func( this.onAccessCallback, $optionName );
// }
// }
//
// /**
// * Returns the full array of options that would have been used
// * in 1.16.
// * Used to get the old parser cache entries when available.
// * @return array
// */
// public static function legacyOptions() {
// return [
// 'stubthreshold',
// 'numberheadings',
// 'userlang',
// 'thumbsize',
// 'editsection',
// 'printable'
// ];
// }
//
// /**
// * Generate a hash String with the values set on these ParserOptions
// * for the keys given in the array.
// * This will be used as part of the hash key for the parser cache,
// * so users sharing the options with vary for the same page share
// * the same cached data safely.
// *
// * Extensions which require it should install 'PageRenderingHash' hook,
// * which will give them a chance to modify this key based on their own
// * settings.
// *
// * @since 1.17
// * @param array $forOptions
// * @param Title $title Used to get the content language of the page (since r97636)
// * @return String Page rendering hash
// */
// public function optionsHash( $forOptions, $title = null ) {
// global $wgRenderHashAppend;
//
// // FIXME: Once the cache key is reorganized this argument
// // can be dropped. It was used when the math extension was
// // part of core.
// $confstr = '*';
//
// // Space assigned for the stubthreshold but unused
// // since it disables the parser cache, its value will always
// // be 0 when this function is called by parsercache.
// if ( in_array( 'stubthreshold', $forOptions ) ) {
// $confstr .= '!' . this.mStubThreshold;
// } else {
// $confstr .= '!*';
// }
//
// if ( in_array( 'dateformat', $forOptions ) ) {
// $confstr .= '!' . this.getDateFormat();
// }
//
// if ( in_array( 'numberheadings', $forOptions ) ) {
// $confstr .= '!' . ( this.mNumberHeadings ? '1' : '' );
// } else {
// $confstr .= '!*';
// }
//
// if ( in_array( 'userlang', $forOptions ) ) {
// $confstr .= '!' . this.mUserLang->getCode();
// } else {
// $confstr .= '!*';
// }
//
// if ( in_array( 'thumbsize', $forOptions ) ) {
// $confstr .= '!' . this.mThumbSize;
// } else {
// $confstr .= '!*';
// }
//
// // add in language specific options, if any
// // @todo FIXME: This is just a way of retrieving the url/user preferred variant
// if ( !is_null( $title ) ) {
// $confstr .= $title->getPageLanguage()->getExtraHashOptions();
// } else {
// global $wgContLang;
// $confstr .= $wgContLang->getExtraHashOptions();
// }
//
// $confstr .= $wgRenderHashAppend;
//
// // @note: as of Feb 2015, core never sets the editsection flag, since it uses
// // <mw:editsection> tags to inject editsections on the fly. However, extensions
// // may be using it by calling ParserOption::optionUsed resp. ParserOutput::registerOption
// // directly. At least Wikibase does at this point in time.
// if ( !in_array( 'editsection', $forOptions ) ) {
// $confstr .= '!*';
// } elseif ( !this.mEditSection ) {
// $confstr .= '!edit=0';
// }
//
// if ( this.mIsPrintable && in_array( 'printable', $forOptions ) ) {
// $confstr .= '!printable=1';
// }
//
// if ( this.mExtraKey != '' ) {
// $confstr .= this.mExtraKey;
// }
//
// // Give a chance for extensions to modify the hash, if they have
// // extra options or other effects on the parser cache.
// Hooks::run( 'PageRenderingHash', [ &$confstr, this.getUser(), &$forOptions ] );
//
// // Make it a valid memcached key fragment
// $confstr = str_replace( ' ', '_', $confstr );
//
// return $confstr;
// }
//
// /**
// * Sets a hook to force that a page exists, and sets a current revision callback to return
// * a revision with custom content when the current revision of the page is requested.
// *
// * @since 1.25
// * @param Title $title
// * @param Content $content
// * @param User $user The user that the fake revision is attributed to
// * @return ScopedCallback to unset the hook
// */
// public function setupFakeRevision( $title, $content, $user ) {
// $oldCallback = this.setCurrentRevisionCallback(
// function (
// $titleToCheck, $parser = false ) use ( $title, $content, $user, &$oldCallback
// ) {
// if ( $titleToCheck->equals( $title ) ) {
// return new Revision( [
// 'page' => $title->getArticleID(),
// 'user_text' => $user->getName(),
// 'user' => $user->getId(),
// 'parent_id' => $title->getLatestRevID(),
// 'title' => $title,
// 'content' => $content
// ] );
// } else {
// return call_user_func( $oldCallback, $titleToCheck, $parser );
// }
// }
// );
//
// global $wgHooks;
// $wgHooks['TitleExists'][] =
// function ( $titleToCheck, &$exists ) use ( $title ) {
// if ( $titleToCheck->equals( $title ) ) {
// $exists = true;
// }
// };
// end( $wgHooks['TitleExists'] );
// $key = key( $wgHooks['TitleExists'] );
// LinkCache::singleton()->clearBadLink( $title->getPrefixedDBkey() );
// return new ScopedCallback( function () use ( $title, $key ) {
// global $wgHooks;
// unset( $wgHooks['TitleExists'][$key] );
// LinkCache::singleton()->clearLink( $title );
// } );
// }
}

View File

@@ -0,0 +1,45 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import gplx.core.btries.*;
public class Xomw_regex_ {
  // Scans forward from src_bgn for as long as the char at the current position matches the trie.
  // Returns the position of the first non-matching char, or the scan position once it is no longer less than src_end.
  // NOTE: the scan advances by the UTF-8 length of the char's first byte, not by the length of the trie match.
  public static int Find_fwd_while(Btrie_slim_mgr trie, Btrie_rv trv, byte[] src, int src_bgn, int src_end) {
    int pos = src_bgn;
    while (pos < src_end) {
      byte first_byte = src[pos];
      boolean matched = trie.Match_at_w_b0(trv, first_byte, src, pos, src_end) != null;
      if (!matched) return pos;
      pos += gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(first_byte);
    }
    return pos;
  }
  // Scans forward from src_bgn for as long as the char at the current position does NOT match the trie.
  // Returns the position of the first matching char, or the scan position once it is no longer less than src_end.
  public static int Find_fwd_until(Btrie_slim_mgr trie, Btrie_rv trv, byte[] src, int src_bgn, int src_end) {
    int pos = src_bgn;
    while (pos < src_end) {
      byte first_byte = src[pos];
      boolean matched = trie.Match_at_w_b0(trv, first_byte, src, pos, src_end) != null;
      if (matched) return pos;
      pos += gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(first_byte);
    }
    return pos;
  }
}

View File

@@ -0,0 +1,39 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import gplx.core.btries.*;
public class Xomw_regex_boundary {	// THREAD.SAFE: trv is only for consistent interface
  private final Btrie_slim_mgr trie = Btrie_slim_mgr.cs();
  private final Btrie_rv trv = new Btrie_rv();
  // Naive implementation of is_boundary: every char listed by space.Ws() and space.Zs() counts as a boundary.
  // NOTE(review): the original comment also mentioned underscore, but no underscore is registered here -- confirm intent.
  public Xomw_regex_boundary(Xomw_regex_space space) {
    Reg_all(space.Ws());
    Reg_all(space.Zs());
  }
  // Registers each byte sequence in the trie; the stored value is only a marker.
  private void Reg_all(byte[][] chars) {
    int len = chars.length;
    for (int i = 0; i < len; i++)
      trie.Add_bry_byte(chars[i], Byte_.Zero);
  }
  // Returns true if the char immediately before pos is a registered boundary char, or if pos is at the start of src.
  public boolean Is_boundary_prv(byte[] src, int pos) {
    if (pos == 0) return true;	// BOS is true
    // find the first byte of the (possibly multi-byte) char that ends just before pos
    int char_bgn = gplx.core.intls.Utf8_.Get_pos0_of_char_bwd(src, pos - 1);
    return trie.Match_at_w_b0(trv, src[char_bgn], src, char_bgn, pos) != null;
  }
}

View File

@@ -0,0 +1,101 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
// Builds a list of byte sequences from a tiny subset of regex character-class syntax.
// Supported escapes are "\s" (space) and "\xHH" (one raw byte given as two hex digits); everything else is copied verbatim.
public class Xomw_regex_parser {
  private Bry_bfr tmp;
  // Accumulated result of all Add_ary / Add_rng calls so far; null until the first call
  public byte[][] Rslt() {return rslt;} private byte[][] rslt;
  // Compiles each item and appends the compiled items to the result
  public Xomw_regex_parser Add_ary(String... ary) {return Set_or_add(Parse_ary(ary));}
  private byte[][] Parse_ary(String... ary) {
    if (tmp == null) tmp = Bry_bfr_.New();
    int ary_len = ary.length;
    byte[][] rv = new byte[ary_len][];
    for (int i = 0; i < ary_len; i++) {
      rv[i] = Compile_itm(tmp, Bry_.new_u8(ary[i]));
    }
    return rv;
  }
  // Appends one item per code point in the inclusive range bgn..end
  public Xomw_regex_parser Add_rng(String bgn, String end) {return Set_or_add(Parse_rng(bgn, end));}
  private byte[][] Parse_rng(String bgn, String end) {
    if (tmp == null) tmp = Bry_bfr_.New();
    byte[] bgn_bry = Compile_itm(tmp, Bry_.new_u8(bgn));
    int bgn_val = gplx.core.intls.Utf16_.Decode_to_int(bgn_bry, 0);
    byte[] end_bry = Compile_itm(tmp, Bry_.new_u8(end));
    int end_val = gplx.core.intls.Utf16_.Decode_to_int(end_bry, 0);
    int rv_len = end_val - bgn_val + 1;
    // fail with a descriptive error instead of a NegativeArraySizeException when the range is reversed
    if (rv_len < 0) throw Err_.new_wo_type("regex range failed: end is before bgn", "bgn", bgn, "end", end);
    byte[][] rv = new byte[rv_len][];
    for (int i = 0; i < rv_len; i++) {
      rv[i] = gplx.core.intls.Utf16_.Encode_int_to_bry(i + bgn_val);
    }
    return rv;
  }
  private Xomw_regex_parser Set_or_add(byte[][] val) {
    rslt = rslt == null ? val : Bry_.Ary_add(rslt, val);
    return this;
  }
  // Compiles one item. Returns src itself when it contains no escapes; otherwise returns a new array
  // in which every escape has been replaced by the byte it denotes.
  private static byte[] Compile_itm(Bry_bfr tmp, byte[] src) {
    int src_end = src.length;
    int cur = 0;
    int prv = cur;		// start of the literal span that has not been copied to tmp yet
    boolean dirty = false;
    while (cur < src_end) {
      byte b = src[cur];
      if (b != Byte_ascii.Backslash) {
        // literal byte: copied lazily as part of the prv..cur span
        // NOTE: non-ascii bytes are passed through verbatim; the original "b > 127" guard could never fire because Java bytes are signed
        cur++;
        continue;
      }
      // escape: look at the byte after the backslash
      int nxt = cur + 1;
      if (nxt >= src_end) throw Err_.new_wo_type("regex escape failed: no more chars left", "src", src, "pos", nxt);
      byte nxt_byte = src[nxt];
      switch (nxt_byte) {
        case Byte_ascii.Ltr_s:	// \s -> " "
          tmp.Add_mid(src, prv, cur);	// flush literal bytes preceding the escape
          tmp.Add_byte(Byte_ascii.Space);
          cur = nxt + 1;
          break;
        case Byte_ascii.Ltr_x:	// \xHH -> one raw byte; EX: "\xc2\xa0" -> new byte[] {194, 160}
          int hex_bgn = nxt + 1;
          int hex_end = hex_bgn + 2;
          if (hex_end > src_end) throw Err_.new_wo_type("utf8 escape failed: no more chars left", "src", src, "pos", hex_bgn);
          int hex_val = gplx.core.encoders.Hex_utl_.Parse_or(src, hex_bgn, hex_end, -1);
          // -1 is the "or" value passed above; without this check it would silently be stored as byte 0xFF
          if (hex_val == -1) throw Err_.new_wo_type("utf8 escape failed: invalid hex", "src", src, "pos", hex_bgn);
          tmp.Add_mid(src, prv, cur);	// flush literal bytes preceding the escape
          tmp.Add_byte((byte)hex_val);
          cur = hex_end;
          break;
        default:
          throw Err_.new_wo_type("regex escape failed: unknown char", "src", src, "pos", nxt);
      }
      dirty = true;
      prv = cur;
    }
    if (!dirty) return src;	// no escapes; reuse the input array
    tmp.Add_mid(src, prv, src_end);	// copy trailing literal bytes
    return tmp.To_bry_and_clear();
  }
}

View File

@@ -0,0 +1,42 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import org.junit.*; import gplx.core.tests.*;
// Tests for Xomw_regex_parser: escape handling in Add_ary and code-point expansion in Add_rng.
public class Xomw_regex_parser__tst {
  private final Xomw_regex_parser__fxt fxt = new Xomw_regex_parser__fxt();
  @Test public void Ary__space() {
    fxt.Test__parse_ary(String_.Ary("\\s"), String_.Ary(" "));
  }
  @Test public void Ary__utf8() {
    // bytes C2 A7 are U+00A7 (section sign); bytes E0 B9 90 are U+0E50 (thai digit zero)
    fxt.Test__parse_ary(String_.Ary("\\xc2\\xa7", "\\xe0\\xb9\\x90"), String_.Ary("§", "\u0e50"));
  }
  @Test public void Rng__ascii() {
    fxt.Test__parse_rng("a", "c", String_.Ary("a", "b", "c"));
  }
}
// Test fixture: feeds input to a single Xomw_regex_parser and compares its accumulated result.
class Xomw_regex_parser__fxt {
  private final Xomw_regex_parser parser = new Xomw_regex_parser();
  // Adds ary to the parser and checks the accumulated result against expd
  public void Test__parse_ary(String[] ary, String[] expd) {
    parser.Add_ary(ary);
    Gftest.Eq__ary(expd, String_.Ary(parser.Rslt()));
  }
  // Adds the range bgn..end to the parser and checks the accumulated result against expd
  public void Test__parse_rng(String bgn, String end, String[] expd) {
    parser.Add_rng(bgn, end);	// use the caller's range; it was previously hard-coded to "a", "c"
    Gftest.Eq__ary(expd, String_.Ary(parser.Rslt()));
  }
}

View File

@@ -0,0 +1,64 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import gplx.core.btries.*;
// Holds the byte sequences for ascii whitespace (Ws) and the Unicode Zs category, plus a trie over both.
public class Xomw_regex_space {
  private final Btrie_slim_mgr trie = Btrie_slim_mgr.cs();
  public Xomw_regex_space() {
    byte[] space = Bry_.New_by_ints(32);
    ws = new byte[][]
    { space
    , Bry_.New_by_ints(9)				// tab
    , Bry_.New_by_ints(10)				// line feed
    , Bry_.New_by_ints(13)				// carriage return
    };
    // Zs; REF:http://www.fileformat.info/info/unicode/category/Zs/list.htm
    zs = new byte[][]
    { space								// U+0020
    , Bry_.New_by_ints(194, 160)		// U+00A0
    , Bry_.New_by_ints(225, 154, 128)	// U+1680
    , Bry_.New_by_ints(226, 128, 128)	// U+2000; was missing from the list
    , Bry_.New_by_ints(226, 128, 129)	// U+2001
    , Bry_.New_by_ints(226, 128, 130)	// U+2002
    , Bry_.New_by_ints(226, 128, 131)	// U+2003
    , Bry_.New_by_ints(226, 128, 132)	// U+2004
    , Bry_.New_by_ints(226, 128, 133)	// U+2005
    , Bry_.New_by_ints(226, 128, 134)	// U+2006
    , Bry_.New_by_ints(226, 128, 135)	// U+2007
    , Bry_.New_by_ints(226, 128, 136)	// U+2008
    , Bry_.New_by_ints(226, 128, 137)	// U+2009
    , Bry_.New_by_ints(226, 128, 138)	// U+200A
    , Bry_.New_by_ints(226, 128, 175)	// U+202F
    , Bry_.New_by_ints(226, 129, 159)	// U+205F
    , Bry_.New_by_ints(227, 128, 128)	// U+3000
    };
    // register both sets in the trie; the stored value is only a marker
    byte[][] ary = ws;
    for (byte[] bry : ary) {
      trie.Add_bry_byte(bry, Byte_.Zero);
    }
    ary = zs;
    for (byte[] bry : ary) {
      trie.Add_bry_byte(bry, Byte_.Zero);
    }
  }
  public byte[][] Ws() {return ws;} private byte[][] ws;
  public byte[][] Zs() {return zs;} private byte[][] zs;
  // Skips forward over consecutive Ws / Zs chars; delegates to Xomw_regex_.Find_fwd_while
  public int Find_fwd_while(Btrie_rv trv, byte[] src, int src_bgn, int src_end) {
    return Xomw_regex_.Find_fwd_while(trie, trv, src, src_bgn, src_end);
  }
}

View File

@@ -0,0 +1,40 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import gplx.core.btries.*;
// Trie of the chars that are excluded by the negated url character class below.
public class Xomw_regex_url {
  private final Btrie_slim_mgr trie;
  public Xomw_regex_url(Xomw_regex_space regex_space) {
    // [^][<>"\\x00-\\x20\\x7F\|]
    // REGEX:[^][<>"\\x00-\\x20\\x7F\p{Zs}]; NOTE: val is just a marker
    Btrie_slim_mgr excluded = Btrie_slim_mgr.cs();
    excluded.Add_str_byte__many(Byte_.Zero, "[", "]", "<", ">", "\"");
    // x00-x20
    for (int code = 0; code <= 32; code++) {
      excluded.Add_bry_byte(new byte[] {(byte)code}, Byte_.Zero);
    }
    excluded.Add_bry_byte(Bry_.New_by_ints(127), Byte_.Zero);	// x7F
    // \p{Zs}
    for (byte[] zs : regex_space.Zs()) {
      excluded.Add_bry_byte(zs, Byte_.Zero);
    }
    this.trie = excluded;
  }
  // Advances while chars are NOT in the excluded set; i.e. stops at the first excluded char (delegates to Find_fwd_until)
  public int Find_fwd_while(Btrie_rv trv, byte[] src, int src_bgn, int src_end) {
    return Xomw_regex_.Find_fwd_until(trie, trv, src, src_bgn, src_end);
  }
}

View File

@@ -0,0 +1,139 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import gplx.core.btries.*;
// Maps strip markers to their replacement values and substitutes them back into text ("unstrip").
// Items are typed as general or nowiki; an unstrip call replaces only the items whose type is included in the requested tid.
public class Xomw_strip_state {	// REF.MW:/parser/StripState.php
  private final Btrie_slim_mgr trie = Btrie_slim_mgr.cs();
  private final Btrie_rv trv = new Btrie_rv();
  private final Bry_bfr tmp_1 = Bry_bfr_.New();
  private final Bry_bfr tmp_2 = Bry_bfr_.New();
  private boolean tmp_2_used = false;
  private int general_len, nowiki_len;
  public void Clear() {
    trie.Clear();
    general_len = nowiki_len = 0;
    tmp_2_used = false;
  }
  public void Add_general(byte[] marker, byte[] val) {Add_item(Tid__general, marker, val);}
  public void Add_nowiki (byte[] marker, byte[] val) {Add_item(Tid__nowiki, marker, val);}
  // Registers val under marker; any tid other than Tid__general is counted as nowiki
  public void Add_item(byte tid, byte[] marker, byte[] val) {
    trie.Add_obj(marker, new Xomw_strip_item(tid, marker, val));
    if (tid == Tid__general)
      general_len++;
    else
      nowiki_len++;
  }
  public byte[] Unstrip_general(byte[] text) {return Unstrip(Tid__general, text);}
  public byte[] Unstrip_nowiki (byte[] text) {return Unstrip(Tid__nowiki , text);}
  public byte[] Unstrip_both   (byte[] text) {return Unstrip(Tid__both   , text);}
  // Returns text with matching markers replaced, or text itself when nothing was replaced
  public byte[] Unstrip(byte tid, byte[] text) {
    boolean dirty = Unstrip(tid, tmp_1, text, 0, text.length);
    return dirty ? tmp_1.To_bry_and_clear() : text;
  }
  public void Unstrip_general(Xomw_parser_bfr pbfr) {Unstrip(Tid__general, pbfr);}
  public void Unstrip_nowiki (Xomw_parser_bfr pbfr) {Unstrip(Tid__nowiki , pbfr);}
  public void Unstrip_both   (Xomw_parser_bfr pbfr) {Unstrip(Tid__both   , pbfr);}
  private boolean Unstrip(byte tid, Xomw_parser_bfr pbfr) {
    // XO.PBFR
    Bry_bfr src_bfr = pbfr.Src();
    byte[] src = src_bfr.Bfr();
    boolean dirty = Unstrip(tid, pbfr.Trg(), src, 0, src_bfr.Len());
    if (dirty)
      pbfr.Switch();
    return dirty;
  }
  // Writes src[src_bgn..src_end) to trg with matching markers replaced (recursively).
  // Returns false and writes nothing to trg when no marker was replaced.
  private boolean Unstrip(byte tid, Bry_bfr trg, byte[] src, int src_bgn, int src_end) {
    // exit early only if there are no items for ANY requested type
    // (checking general alone made Tid__both skip nowiki items whenever no general items existed)
    boolean general_pending = (tid & Tid__general) == Tid__general && general_len > 0;
    boolean nowiki_pending  = (tid & Tid__nowiki)  == Tid__nowiki  && nowiki_len > 0;
    if (!general_pending && !nowiki_pending)
      return false;

    int cur = src_bgn;
    int prv = cur;
    boolean dirty = false;
    // loop over each src char
    while (true) {
      // EOS: exit
      if (cur == src_end) {
        if (dirty)	// add remainder if dirty
          trg.Add_mid(src, prv, src_end);
        break;
      }

      // check if current pos matches strip state
      Object o = trie.Match_at(trv, src, cur, src_end);
      if (o != null) {	// match
        Xomw_strip_item item = (Xomw_strip_item)o;
        byte item_tid = item.Tid();
        if ((tid & item_tid) == item_tid) {	// check if types match
          // get bfr for recursion; tmp_2 is reused for the first nesting level, deeper levels get a new bfr
          Bry_bfr nested_bfr = null;
          boolean tmp_2_release = false;
          if (tmp_2_used) {
            nested_bfr = Bry_bfr_.New();
          }
          else {
            nested_bfr = tmp_2;
            tmp_2_used = true;
            tmp_2_release = true;
          }

          // recurse: the item's value may itself contain markers
          byte[] item_val = item.Val();
          if (Unstrip(tid, nested_bfr, item_val, 0, item_val.length))
            item_val = nested_bfr.To_bry_and_clear();
          if (tmp_2_release)
            tmp_2_used = false;

          // add to trg
          trg.Add_mid(src, prv, cur);
          trg.Add(item_val);

          // update vars
          dirty = true;
          cur += item.Key().length;
          prv = cur;
          continue;
        }
      }
      cur++;
    }
    return dirty;
  }
  public static final String Str__marker_bgn = "\u007f'\"`UNIQ-";
  public static final byte[]
    Bry__marker__bgn = Bry_.new_a7(Str__marker_bgn)
  , Bry__marker__end = Bry_.new_a7("-QINU`\"'\u007f")
  ;
  public static final byte Tid__general = 1, Tid__nowiki = 2, Tid__both = 3;
}
// Immutable entry stored in the strip-state trie: the item's type, its marker (key) and its replacement value.
class Xomw_strip_item {
  private final byte tid;
  private final byte[] key;
  private final byte[] val;
  public Xomw_strip_item(byte tid, byte[] key, byte[] val) {
    this.tid = tid;
    this.key = key;
    this.val = val;
  }
  // type id of the item
  public byte Tid() {
    return tid;
  }
  // marker bytes that are looked up in the text
  public byte[] Key() {
    return key;
  }
  // bytes that replace the marker
  public byte[] Val() {
    return val;
  }
}

View File

@@ -0,0 +1,44 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import org.junit.*; import gplx.core.tests.*;
// Tests for Xomw_strip_state: type filtering and recursive replacement of markers.
public class Xomw_strip_state__tst {
  private static final String
    Marker__key_1 = "\u007f'\"`UNIQ-key-1-QINU`\"'\u007f"
  , Marker__key_2 = "\u007f'\"`UNIQ-key-2-QINU`\"'\u007f"
  ;
  private final Xomw_strip_state__fxt fxt = new Xomw_strip_state__fxt();
  // a general item is replaced for general and both, but not for nowiki
  @Test public void Basic() {
    String src = "a " + Marker__key_1 + " b";
    fxt.Init__add(Xomw_strip_state.Tid__general, Marker__key_1, "val-1");
    fxt.Test__nostrip(Xomw_strip_state.Tid__nowiki, src);
    fxt.Test__unstrip(Xomw_strip_state.Tid__general, src, "a val-1 b");
    fxt.Test__unstrip(Xomw_strip_state.Tid__both, src, "a val-1 b");
  }
  // key-2's value is key-1's marker, so unstripping key-2 must resolve through to "val-1"
  @Test public void Recurse() {
    fxt.Init__add(Xomw_strip_state.Tid__general, Marker__key_1, "val-1");
    fxt.Init__add(Xomw_strip_state.Tid__general, Marker__key_2, Marker__key_1);
    fxt.Test__unstrip(Xomw_strip_state.Tid__general, "a " + Marker__key_2 + " b", "a val-1 b");
  }
}
// Test fixture wrapping a single Xomw_strip_state; converts between Strings and utf8 byte arrays.
class Xomw_strip_state__fxt {
  private final Xomw_strip_state strip_state = new Xomw_strip_state();
  // Registers val under marker with the given type
  public void Init__add(byte tid, String marker, String val) {
    byte[] marker_bry = Bry_.new_u8(marker);
    byte[] val_bry = Bry_.new_u8(val);
    strip_state.Add_item(tid, marker_bry, val_bry);
  }
  // Asserts that unstripping src with tid leaves it unchanged
  public void Test__nostrip(byte tid, String src) {
    Test__unstrip(tid, src, src);
  }
  // Asserts that unstripping src with tid produces expd
  public void Test__unstrip(byte tid, String src, String expd) {
    byte[] src_bry = Bry_.new_u8(src);
    String actl = String_.new_u8(strip_state.Unstrip(tid, src_bry));
    Gftest.Eq__str(expd, actl);
  }
}

View File

@@ -0,0 +1,56 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.doubleunders; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
// Flags recorded while scanning a page for double-underscore keywords (EX: __TOC__).
public class Xomw_doubleunder_data {
  // XO.MW: MW stores these as mDoubleUnderscores in Parser
  public boolean toc;
  public boolean no_toc;
  public boolean force_toc;
  public boolean no_gallery;
  public boolean force_gallery;
  public boolean no_title_convert;
  public boolean no_content_convert;
  public boolean no_edit_section;
  public boolean new_section_link;
  public boolean static_redirect;
  public boolean hidden_cat;
  public boolean index;
  public boolean no_index;

  // XO.MW: MW stores these as member variables in Parser
  public boolean show_toc;
  public boolean force_toc_position;

  // Sets every flag back to false
  public void Reset() {
    // keyword flags
    toc = false;
    no_toc = false;
    force_toc = false;
    no_gallery = false;
    force_gallery = false;
    no_title_convert = false;
    no_content_convert = false;
    no_edit_section = false;
    new_section_link = false;
    static_redirect = false;
    hidden_cat = false;
    index = false;
    no_index = false;
    // derived flags
    show_toc = false;
    force_toc_position = false;
  }
}

View File

@@ -0,0 +1,148 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.doubleunders; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
import gplx.core.btries.*;
import gplx.xowa.langs.*; import gplx.xowa.langs.kwds.*;
// Finds double-underscore keywords (EX: __TOC__, __NOTOC__) in the source, records them in Xomw_doubleunder_data
// and removes them from the text; the first __TOC__ is replaced with a "<!--MWTOC-->" placeholder.
public class Xomw_doubleunder_wkr {
  private final Btrie_slim_mgr trie = Btrie_slim_mgr.ci_u8();
  private final Btrie_rv trv = new Btrie_rv();
  private Xomw_doubleunder_data data;
  // Stores the data object to fill and registers every keyword of the lang for the supported double-underscore ids
  public void Init_by_wiki(Xomw_doubleunder_data data, Xol_lang_itm lang) {
    this.data = data;
    Reg(trie, lang.Kwd_mgr()
    , Xol_kwd_grp_.Id_notoc
    , Xol_kwd_grp_.Id_nogallery
    , Xol_kwd_grp_.Id_forcetoc
    , Xol_kwd_grp_.Id_toc
    , Xol_kwd_grp_.Id_noeditsection
    , Xol_kwd_grp_.Id_newsectionlink
    , Xol_kwd_grp_.Id_hiddencat
    , Xol_kwd_grp_.Id_index
    , Xol_kwd_grp_.Id_noindex
    , Xol_kwd_grp_.Id_staticredirect
    , Xol_kwd_grp_.Id_notitleconvert
    , Xol_kwd_grp_.Id_nocontentconvert
    );
  }
  // Scans pbfr's source, resets and fills data, and writes the text without keywords to pbfr's target.
  // pbfr is only switched when at least one keyword was removed.
  public void Do_double_underscore(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) {
    // XO.PBFR
    Bry_bfr src_bfr = pbfr.Src();
    byte[] src = src_bfr.Bfr();
    int src_bgn = 0;
    int src_end = src_bfr.Len();
    Bry_bfr bfr = pbfr.Trg();

    // NOTE(review): Reset() leaves show_toc false until __TOC__ is seen; MW's Parser appears to default mShowToc to true -- confirm callers expect this
    data.Reset();
    // XO.MW: MW does TOC before others; XO does it at the same time

    // Now match and remove the rest of them
    // XO.MW.BGN: $this->mDoubleUnderscores = $mwa->matchAndRemove( $text );
    int cur = src_bgn;
    int prv = cur;
    boolean dirty = false;
    while (true) {
      // reached end; stop
      // ">=" not "==": cur advances by utf8 char length and can jump past src_end if the last char is truncated
      if (cur >= src_end) {
        if (dirty) {
          bfr.Add_mid(src, prv, src_end);
        }
        break;
      }

      // no match; keep searching
      byte b = src[cur];
      Object o = trie.Match_at_w_b0(trv, b, src, cur, src_end);
      if (o == null) {
        cur += gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(b);
        continue;
      }

      // if cs, ensure exact-match (trie is case-insensitive)
      int kwd_end = trv.Pos();
      Xomw_doubleunder_itm itm = (Xomw_doubleunder_itm)o;
      if (itm.case_match && !Bry_.Match(src, cur, kwd_end, itm.val)) {
        // wrong case: leave the text in place and resume after it
        cur = kwd_end;
        continue;
      }

      // match; replace __KWD__ with "" (or "<!--MWTOC-->" if __TOC__)
      dirty = true;
      bfr.Add_mid(src, prv, cur);
      switch (itm.tid) {
        case Xol_kwd_grp_.Id_toc:
          // The position of __TOC__ needs to be recorded
          // show_toc is false after Reset() and set to true below, so it is still false only for the first __TOC__
          boolean is_first_toc = !data.show_toc;
          data.toc = true;
          data.show_toc = true;
          data.force_toc_position = true;

          if (is_first_toc) {	// Set a placeholder. At the end we'll fill it in with the TOC.
            bfr.Add_str_a7("<!--MWTOC-->");
          }
          else {					// Only keep the first one. XO.MW:ignore by not adding anything to bfr
          }
          break;
        // XO.MW: MW adds boolean to hash_table; XO uses boolean props; note that "remove" is done by not adding to bfr
        case Xol_kwd_grp_.Id_notoc:            data.no_toc = true; break;
        case Xol_kwd_grp_.Id_nogallery:        data.no_gallery = true; break;
        case Xol_kwd_grp_.Id_forcetoc:         data.force_toc = true; break;
        case Xol_kwd_grp_.Id_noeditsection:    data.no_edit_section = true; break;
        case Xol_kwd_grp_.Id_newsectionlink:   data.new_section_link = true; break;
        case Xol_kwd_grp_.Id_hiddencat:        data.hidden_cat = true; break;
        case Xol_kwd_grp_.Id_index:            data.index = true; break;
        case Xol_kwd_grp_.Id_noindex:          data.no_index = true; break;
        case Xol_kwd_grp_.Id_staticredirect:   data.static_redirect = true; break;
        case Xol_kwd_grp_.Id_notitleconvert:   data.no_title_convert = true; break;
        case Xol_kwd_grp_.Id_nocontentconvert: data.no_content_convert = true; break;
        default:                               throw Err_.new_unhandled_default(itm.tid);
      }
      cur = kwd_end;
      prv = cur;
    }
    // XO.MW.END: $this->mDoubleUnderscores = $mwa->matchAndRemove( $text );

    // __NOTOC__ hides the toc unless an explicit __TOC__ fixed its position
    if (data.no_toc && !data.force_toc_position) {
      data.show_toc = false;
    }

    // XO.MW.EDIT: hidden_cat, index, noindex are used to add to tracking category

    if (dirty)
      pbfr.Switch();
  }
  // Adds every keyword of each listed group to the trie, tagged with the group's id and case-sensitivity
  private static void Reg(Btrie_slim_mgr trie, Xol_kwd_mgr mgr, int... ids) {
    for (int id : ids) {
      Xol_kwd_grp grp = mgr.Get_or_new(id);
      Xol_kwd_itm[] itms = grp.Itms();
      for (Xol_kwd_itm itm : itms) {
        byte[] val = itm.Val();
        trie.Add_obj(val, new Xomw_doubleunder_itm(id, grp.Case_match(), val));
      }
    }
  }
}
// Value stored in the double-underscore trie for one keyword.
class Xomw_doubleunder_itm {
  public Xomw_doubleunder_itm(int tid, boolean case_match, byte[] val) {
    this.val = val;
    this.case_match = case_match;
    this.tid = tid;
  }
  // keyword group id (one of the Xol_kwd_grp_.Id_* values passed at registration)
  public int tid;
  // true if the keyword must match the source text byte-for-byte
  public boolean case_match;
  // keyword text as registered in the trie
  public byte[] val;
}

View File

@@ -0,0 +1,52 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.doubleunders; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
import org.junit.*; import gplx.core.tests.*;
// Tests for Xomw_doubleunder_wkr.
// NOTE: only the keyword itself is removed, so the spaces on both sides of a removed keyword remain (EX: "a __X__ b" -> "a  b")
public class Xomw_doubleunder_wkr__tst {
  private final Xomw_doubleunder_wkr__fxt fxt = new Xomw_doubleunder_wkr__fxt();
  @Test public void No_match()    {fxt.Test__parse("a b c"                   , "a b c");}
  @Test public void Force_toc()   {fxt.Test__parse("a __FORCETOC__ b"        , "a  b").Test__prop_y(fxt.data.force_toc);}
  @Test public void Toc()         {fxt.Test__parse("a __TOC__ b __TOC__ c"   , "a <!--MWTOC--> b  c").Test__prop_y(fxt.data.toc, fxt.data.show_toc, fxt.data.force_toc_position);}	// only 1st __TOC__ gets placeholder
  @Test public void Notoc_only()  {fxt.Test__parse("a __NOTOC__ b"           , "a  b").Test__prop_y(fxt.data.no_toc).Test__prop_n(fxt.data.show_toc);}	// show_toc is false
  @Test public void Notoc_w_toc() {fxt.Test__parse("a __TOC__ b __NOTOC__ c" , "a <!--MWTOC--> b  c").Test__prop_y(fxt.data.toc, fxt.data.show_toc, fxt.data.force_toc_position);}	// show_toc is true
  @Test public void Case_match()  {fxt.Test__parse("a __index__ b"           , "a __index__ b");}	// wrong case is left in place
}
// Fixture for Xomw_doubleunder_wkr tests: owns the worker, its buffers and the
// data object the worker writes its flags to.
class Xomw_doubleunder_wkr__fxt {
  private final Xomw_parser_ctx pctx = new Xomw_parser_ctx();
  private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr();
  private final Xomw_doubleunder_wkr wkr = new Xomw_doubleunder_wkr();
  public Xomw_doubleunder_data data = new Xomw_doubleunder_data();
  public Xomw_doubleunder_wkr__fxt() {
    // the worker is initialized with the language of a test wiki
    Xoae_app test_app = Xoa_app_fxt.Make__app__edit();
    Xowe_wiki test_wiki = Xoa_app_fxt.Make__wiki__edit(test_app);
    wkr.Init_by_wiki(data, test_wiki.Lang());
  }
  // Runs the worker over src_str and compares the resulting text with expd.
  public Xomw_doubleunder_wkr__fxt Test__parse(String src_str, String expd) {
    byte[] raw = Bry_.new_u8(src_str);
    wkr.Do_double_underscore(pctx, pbfr.Init(raw));
    String actl = pbfr.Rslt().To_str_and_clear();
    Gftest.Eq__str(expd, actl, src_str);
    return this;
  }
  // Asserts that every passed flag is true.
  public Xomw_doubleunder_wkr__fxt Test__prop_y(boolean... ary) {
    return Test__prop(Bool_.Y, ary);
  }
  // Asserts that every passed flag is false.
  public Xomw_doubleunder_wkr__fxt Test__prop_n(boolean... ary) {
    return Test__prop(Bool_.N, ary);
  }
  private Xomw_doubleunder_wkr__fxt Test__prop(boolean expd, boolean... ary) {
    int len = ary.length;
    for (int i = 0; i < len; i++) {
      Gftest.Eq__bool(expd, ary[i]);
    }
    return this;
  }
}

View File

@@ -0,0 +1,22 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.headings; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
// Callback used by Xomw_heading_wkr while it scans text for "==heading==" lines.
// The worker passes itself so that implementations can read the current positions
// (Src, Txt_bgn, Hdr_bgn, Hdr_num, ...) through its accessors.
public interface Xomw_heading_cbk {
// called once per heading, after the worker has computed the heading's positions and level
void On_hdr_seen(Xomw_parser_ctx pctx, Xomw_heading_wkr wkr);
// called once when the worker reaches the end of the source; used to flush text after the last heading
void On_src_done(Xomw_parser_ctx pctx, Xomw_heading_wkr wkr);
}

View File

@@ -0,0 +1,52 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.headings; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
// Heading callback that writes HTML: plain text is copied through unchanged and
// each heading reported by the worker is emitted as "<hN>...</hN>".
public class Xomw_heading_cbk__html implements Xomw_heading_cbk {
  private Bry_bfr bfr;
  public Bry_bfr Bfr() {return bfr;}
  public Xomw_heading_cbk__html Bfr_(Bry_bfr bfr) {
    this.bfr = bfr;
    return this;
  }
  public void On_hdr_seen(Xomw_parser_ctx pctx, Xomw_heading_wkr wkr) {
    byte[] src = wkr.Src();
    int txt_bgn = wkr.Txt_bgn();
    int hdr_bgn = wkr.Hdr_bgn();

    // copy pending text that precedes the heading; EX: "abc\n==A==\n"; "\n==" seen -> add "abc"
    if (txt_bgn < hdr_bgn) {
      bfr.Add_mid(src, txt_bgn, hdr_bgn);
    }

    // a heading at BOS has no preceding "\n" to reproduce
    if (hdr_bgn != Xomw_parser_ctx.Pos__bos) {
      bfr.Add_byte_nl();
    }

    // <h2>...</h2>
    int hdr_num = wkr.Hdr_num();
    Add_tag(Tag__lhs, hdr_num);                                 // <h2>
    bfr.Add_mid(src, wkr.Hdr_lhs_end(), wkr.Hdr_rhs_bgn());     // heading text between the "=" runs
    Add_tag(Tag__rhs, hdr_num);                                 // </h2>
  }
  public void On_src_done(Xomw_parser_ctx pctx, Xomw_heading_wkr wkr) {
    byte[] src = wkr.Src();
    int txt_bgn = wkr.Txt_bgn();
    int src_end = wkr.Src_end();
    // PERF: nothing left to copy when the last heading ended exactly at EOS
    if (txt_bgn == src_end) return;
    bfr.Add_mid(src, txt_bgn, src_end);
  }
  // writes tag_bgn + single-digit heading level + ">"
  private void Add_tag(byte[] tag_bgn, int hdr_num) {
    bfr.Add(tag_bgn).Add_int_digits(1, hdr_num).Add(Byte_ascii.Angle_end_bry);
  }
  private static final byte[] Tag__lhs = Bry_.new_a7("<h");
  private static final byte[] Tag__rhs = Bry_.new_a7("</h");
}

View File

@@ -0,0 +1,108 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.headings; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
import gplx.core.btries.*; import gplx.xowa.langs.*;
// Scans text for "==heading==" lines and reports each one to a Xomw_heading_cbk.
// The positions computed for the current heading are exposed through the accessors
// below so that the callback can read them during On_hdr_seen / On_src_done.
public class Xomw_heading_wkr {
  private Xomw_parser_ctx pctx;
  private Xomw_heading_cbk cbk;
  public byte[] Src()         {return src;}         private byte[] src;       // text being parsed
  public int Src_end()        {return src_end;}     private int src_end;      // end of valid text in src
  public int Txt_bgn()        {return txt_bgn;}     private int txt_bgn;      // start of text not yet handed to the callback
  public int Hdr_bgn()        {return hdr_bgn;}     private int hdr_bgn;      // pos of the "\n" before the heading (or the start pos passed to Parse)
  public int Hdr_end()        {return hdr_end;}     private int hdr_end;      // pos of the "\n" after the heading, or src_end
  public int Hdr_num()        {return hdr_num;}     private int hdr_num;      // heading level: the shorter of the two "=" runs
  public int Hdr_lhs_bgn()    {return hdr_lhs_bgn;} private int hdr_lhs_bgn;  // start of the leading "=" run
  public int Hdr_lhs_end()    {return hdr_lhs_end;} private int hdr_lhs_end;  // end of the leading "=" run
  public int Hdr_rhs_bgn()    {return hdr_rhs_bgn;} private int hdr_rhs_bgn;  // start of the trailing "=" run
  public int Hdr_rhs_end()    {return hdr_rhs_end;} private int hdr_rhs_end;  // end of the trailing "=" run (trailing ws excluded)

  // Parses the source buffer of pbfr and lets cbk write the result to the target buffer.
  public void Do_headings(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr, Xomw_heading_cbk__html cbk) {
    Bry_bfr src_bfr = pbfr.Src();
    byte[] src_bry = src_bfr.Bfr();
    int src_end = src_bfr.Len();
    cbk.Bfr_(pbfr.Trg());
    pbfr.Switch();
    // NOTE(review): this passes 0 as the start pos whereas the unit test passes -1; confirm which one matches Xomw_parser_ctx.Pos__bos
    Parse(pctx, src_bry, 0, src_end, cbk);
  }

  // Walks src from src_bgn to src_end; calls cbk.On_hdr_seen for every "\n=" line start
  // and cbk.On_src_done exactly once at the end.
  public void Parse(Xomw_parser_ctx pctx, byte[] src, int src_bgn, int src_end, Xomw_heading_cbk cbk) { // REF.MW: /includes/parser/Parser.php|doHeadings
    // init members
    this.pctx = pctx;
    this.src = src;
    this.src_end = src_end;
    this.cbk = cbk;

    // PORTED:
    // for ( $i = 6; $i >= 1; --$i ) {
    //   $h = str_repeat( '=', $i );
    //   $text = preg_replace( "/^$h(.+)$h\\s*$/m", "<h$i>\\1</h$i>", $text );
    // }

    // do loop
    int pos = src_bgn;
    this.txt_bgn = pos == Xomw_parser_ctx.Pos__bos ? 0 : pos;
    byte b = Byte_ascii.Nl; // pretend a "\n" precedes the start pos so that a heading on the 1st line is recognized
    while (true) {
      int nxt = pos + 1;
      // check if (a) cur is \n; (b) nxt is '='
      if (   b == Byte_ascii.Nl
        && nxt < src_end
        && src[nxt] == Byte_ascii.Eq
        ) {
        pos = Parse_hdr_nl(txt_bgn, pos, nxt + 1);
        this.txt_bgn = pos;
      }
      else
        ++pos;

      // EOS; add all text after last "==\n"
      // use ">=" not "==": when src_bgn >= src_end (EX: empty text) the increment above steps past src_end
      // and an equality check would never fire, letting the loop read beyond the valid text
      if (pos >= src_end) {
        cbk.On_src_done(pctx, this);
        break;
      }
      b = src[pos];
    }
  }

  // Computes all positions of the heading that starts after nl_lhs, notifies the callback
  // and returns the pos at which scanning resumes (the "\n" ending the heading, or src_end).
  private int Parse_hdr_nl(int txt_bgn, int nl_lhs, int pos) {
    // calc lhs vars
    this.hdr_bgn = nl_lhs;
    this.hdr_lhs_bgn = nl_lhs == 0 ? 0 : nl_lhs + 1;  // set pos of 1st "="; note that "==" can be at BOS;
    this.hdr_lhs_end = Bry_find_.Find_fwd_while(src, pos, src_end, Byte_ascii.Eq);

    // calc rhs vars
    int nl_rhs = Bry_find_.Find_fwd_or(src, Byte_ascii.Nl, hdr_lhs_end + 1, src_end, src_end); // if no "\n", src_end is rest of text; EX: "\n==<text>EOS
    this.hdr_end = nl_rhs;
    this.hdr_rhs_end = Bry_find_.Find_bwd__skip_ws(src, nl_rhs, hdr_lhs_end);
    this.hdr_rhs_bgn = Bry_find_.Find_bwd__skip(src, hdr_rhs_end - 1, hdr_lhs_end, Byte_ascii.Eq);

    int hdr_lhs_len = hdr_lhs_end - hdr_lhs_bgn;
    int hdr_rhs_len = hdr_rhs_end - hdr_rhs_bgn;

    // handle rare situations like "\n====\n": the whole line is "=", so split the run in two halves
    if (hdr_rhs_len == 0) {
      int hdr_lhs_len_half = hdr_lhs_len / 2;
      hdr_rhs_len = hdr_lhs_len - hdr_lhs_len_half;
      hdr_lhs_len = hdr_lhs_len_half;
      this.hdr_lhs_end = hdr_lhs_bgn + hdr_lhs_len;
      this.hdr_rhs_bgn = hdr_lhs_end;
    }

    // heading level is limited by the shorter "=" run
    this.hdr_num = hdr_lhs_len < hdr_rhs_len ? hdr_lhs_len : hdr_rhs_len;

    cbk.On_hdr_seen(pctx, this);
    return nl_rhs;
  }
}

View File

@@ -0,0 +1,41 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.headings; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
import org.junit.*;
// Tests for Xomw_heading_wkr using the HTML callback.
public class Xomw_heading_wkr__tst {
  private final Xomw_heading_wkr__fxt fxt = new Xomw_heading_wkr__fxt();
  @Test public void Basic() {
    // each row: {wikitext, expected html}; rows are checked in order
    String[][] cases = new String[][]
    { {"==A=="              , "<h2>A</h2>"}
    , {"abc\n==A==\ndef"    , "abc\n<h2>A</h2>\ndef"}
    , {"abc"                , "abc"}
    , {"abc\ndef"           , "abc\ndef"}
    , {"abc\n=="            , "abc\n<h1></h1>"}
    };
    for (int i = 0; i < cases.length; i++) {
      fxt.Test__parse(cases[i][0], cases[i][1]);
    }
  }
}
// Fixture for Xomw_heading_wkr tests: parses a string with the HTML callback
// and compares the callback's buffer with the expected text.
class Xomw_heading_wkr__fxt {
  private final Xomw_heading_wkr wkr = new Xomw_heading_wkr();
  private final Xomw_heading_cbk__html cbk = new Xomw_heading_cbk__html().Bfr_(Bry_bfr_.New());
  private final Xomw_parser_ctx pctx = new Xomw_parser_ctx();
  public void Test__parse(String src_str, String expd) {
    byte[] raw = Bry_.new_u8(src_str);
    int raw_len = raw.length;
    // -1 is passed as the start pos of the scan
    wkr.Parse(pctx, raw, -1, raw_len, cbk);
    String actl = cbk.Bfr().To_str_and_clear();
    Tfds.Eq_str_lines(expd, actl, src_str);
  }
}

View File

@@ -0,0 +1,81 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.hrs; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
import gplx.xowa.mediawiki.includes.utls.*;
// Replaces wikitext horizontal rules ("-----" at the start of the text or of a line,
// with any number of extra "-") by "<hr />".
public class Xomw_hr_wkr {// THREAD.UNSAFE: caching for repeated calls
  private Bry_bfr bfr;
  // Reads the source buffer of pbfr; writes to the target buffer and switches the buffers
  // only when at least one rule was replaced.
  public void Replace_hrs(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) { // REF.MW: text = preg_replace('/(^|\n)-----*/', '\\1<hr />', text);
    // XO.PBFR
    Bry_bfr src_bfr = pbfr.Src();
    byte[] src = src_bfr.Bfr();
    int src_bgn = 0;
    int src_end = src_bfr.Len();
    this.bfr = pbfr.Trg();
    boolean dirty = false;

    // do separate check for "-----" at start of String;
    // the length guard keeps the comparison inside the valid text: src is the buffer's backing array
    // and its valid length is reported separately by Len(), so bytes at or after src_end must not be compared
    int cur = 0;
    if (   src_end >= Len__wtxt__hr__bos
      && Bry_.Eq(src, 0, Len__wtxt__hr__bos, Bry__wtxt__hr__bos)) {
      cur = Replace_hr(Bool_.N, src, src_bgn, src_end, 0, Len__wtxt__hr__bos);
      dirty = true;
    }

    // loop
    while (true) {
      // find next "\n-----"
      int find_bgn = Bry_find_.Find_fwd(src, Bry__wtxt__hr__mid, cur, src_end);

      // nothing found; exit
      if (find_bgn == Bry_find_.Not_found) {
        // copy the tail only if something was written; otherwise the source buffer stays the result
        if (dirty) {
          bfr.Add_mid(src, cur, src_end);
        }
        break;
      }

      // something found
      cur = Replace_hr(Bool_.Y, src, cur, src_end, find_bgn, Len__wtxt__hr__mid);
      dirty = true;
    }
    if (dirty)
      pbfr.Switch();
  }
  // Writes one "<hr />" and returns the pos after the rule.
  // mid: true when the rule was found as "\n-----"; the text before it and the "\n" are copied first
  // cur: start of text not yet copied; find_bgn: start of the match; tkn_len: length of the matched token
  private int Replace_hr(boolean mid, byte[] src, int cur, int src_end, int find_bgn, int tkn_len) {
    // something found; add to bfr
    if (mid) {
      bfr.Add_mid(src, cur, find_bgn);  // add everything before "\n-----"
      bfr.Add_byte_nl();
    }
    bfr.Add(Bry__html__hr);

    // set dirty / cur and continue
    cur = find_bgn + tkn_len;
    cur = Bry_find_.Find_fwd_while(src, cur, src_end, Byte_ascii.Dash); // gobble up trailing "-"; the "*" in "-----*" from the regex above
    return cur;
  }
  private static final byte[]
    Bry__wtxt__hr__mid = Bry_.new_a7("\n-----")
  , Bry__wtxt__hr__bos = Bry_.new_a7("-----")
  , Bry__html__hr      = Bry_.new_a7("<hr />")
  ;
  private static final int
    Len__wtxt__hr__mid = Bry__wtxt__hr__mid.length
  , Len__wtxt__hr__bos = Bry__wtxt__hr__bos.length
  ;
}

View File

@@ -0,0 +1,36 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.hrs; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
import org.junit.*;
// Tests for Xomw_hr_wkr: "-----" (5 or more dashes) at the start of the text or of a line becomes "<hr />".
public class Xomw_hr_wkr__tst {
  private final Xomw_hr_wkr__fxt fxt = new Xomw_hr_wkr__fxt();
  @Test public void Basic() {
    fxt.Test__parse("a\n-----b", "a\n<hr />b");
  }
  @Test public void Extend() {
    // extra dashes are swallowed by the same rule
    fxt.Test__parse("a\n------b", "a\n<hr />b");
  }
  @Test public void Not_found() {
    // four dashes are not a rule
    fxt.Test__parse("a\n----b", "a\n----b");
  }
  @Test public void Bos() {
    fxt.Test__parse("-----a", "<hr />a");
  }
  @Test public void Bos_and_mid() {
    fxt.Test__parse("-----a\n-----b", "<hr />a\n<hr />b");
  }
}
// Fixture for Xomw_hr_wkr tests: runs Replace_hrs over a string and compares the result buffer with the expected text.
class Xomw_hr_wkr__fxt {
  private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr();
  private final Xomw_hr_wkr wkr = new Xomw_hr_wkr();
  public void Test__parse(String src_str, String expd) {
    byte[] raw = Bry_.new_u8(src_str);
    Xomw_parser_ctx pctx = new Xomw_parser_ctx();
    wkr.Replace_hrs(pctx, pbfr.Init(raw));
    String actl = pbfr.Rslt().To_str_and_clear();
    Tfds.Eq_str_lines(expd, actl, src_str);
  }
}

View File

@@ -0,0 +1,233 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.lnkes; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
import gplx.core.btries.*; import gplx.core.primitives.*;
import gplx.xowa.mediawiki.includes.utls.*;
import gplx.xowa.mediawiki.includes.htmls.*;
/* TODO.XO
* P3: $langObj->formatNum( ++$this->mAutonumber );
* P2: $this->getConverterLanguage()->markNoConversion( $text );
*/
// Replaces bracketed external links ("[https://a.org text]") in the source buffer with the
// HTML produced by Xomw_linker.makeExternalLink. Links without text are numbered ("[1]", "[2]", ...).
public class Xomw_lnke_wkr {// THREAD.UNSAFE: caching for repeated calls
  private final Bry_bfr tmp;
  private Btrie_slim_mgr protocol_trie; private final Btrie_rv trv = new Btrie_rv();
  private int autonumber;
  private final Xomw_parser parser;
  private final Xomw_linker linker;
  private final Xomw_sanitizer sanitizer;
  private final Xomw_atr_mgr attribs = new Xomw_atr_mgr();
  private Xomw_regex_url regex_url;
  private Xomw_regex_space regex_space;
  public Xomw_lnke_wkr(Xomw_parser parser) {
    this.parser = parser;
    this.tmp = parser.Tmp();
    this.linker = parser.Linker();
    this.sanitizer = parser.Sanitizer();

    // one-time init of the shared static tables
    if (angle_entities_trie == null) {
      synchronized (Type_adp_.ClassOf_obj(this)) {
        if (angle_entities_trie == null) { // re-check inside the lock: another thread may have finished the init while this one waited
          Link_type__free = Bry_.new_a7("free");
          Link_type__text = Bry_.new_a7("text");
          Link_type__autonumber = Bry_.new_a7("autonumber");

          // REGEX:([^\]\\x00-\\x08\\x0a-\\x1F]*?); NOTE: val is key.length
          Btrie_slim_mgr invalid_chars = Btrie_slim_mgr.cs();
          New__trie_itm__by_len(invalid_chars, Byte_ascii.Brack_end);
          for (int i = 0; i <= 8; i++) { // x00-x08
            New__trie_itm__by_len(invalid_chars, i);
          }
          for (int i = 10; i <= 31; i++) { // x0a-x1F
            New__trie_itm__by_len(invalid_chars, i);
          }
          invalid_text_chars_trie = invalid_chars;

          // assign the guard field last so that a non-null value implies that all other statics are ready
          angle_entities_trie = Btrie_slim_mgr.cs().Add_many_str("&lt;", "&gt;");
        }
      }
    }
  }
  public void Init_by_wiki(Btrie_slim_mgr protocol_trie, Xomw_regex_url regex_url, Xomw_regex_space regex_space) {
    this.protocol_trie = protocol_trie;
    this.regex_url = regex_url;
    this.regex_space = regex_space;
  }
  // Scans the source buffer of pbfr for "[" + protocol + url + optional text + "]" and writes the
  // converted text to the target buffer. Text that does not form a valid link is copied unchanged.
  // XO.MW:SYNC:1.29; DATE:2017-02-01
  public void Replace_external_links(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) {
    // XO.PBFR
    Bry_bfr src_bfr = pbfr.Src();
    byte[] src = src_bfr.Bfr();
    int src_bgn = 0;
    int src_end = src_bfr.Len();
    Bry_bfr bfr = pbfr.Trg();
    pbfr.Switch();

    int cur = src_bgn;
    this.autonumber = 1;

    // find regex
    int prv = 0; // start of text not yet copied to bfr
    while (true) {
      // PORTED.BGN: $bits = preg_split( $this->mExtLinkBracketedRegex, $text, -1, PREG_SPLIT_DELIM_CAPTURE );
      // $this->mExtLinkBracketedRegex = '/\[(((?i)' . $this->mUrlProtocols . ')' .
      //   self::EXT_LINK_ADDR .
      //   self::EXT_LINK_URL_CLASS . '*)\p{Zs}*([^\]\\x00-\\x08\\x0a-\\x1F]*?)\]/Su';
      //
      // REGEX: "[" + "protocol" + "url-char"* + "space"* + "text"* + "]";
      //   protocol  -> ((?i)' . $this->mUrlProtocols . ')     -> "http://", "HTTps://"
      //   url-char* -> (EXT_LINK_ADDR . EXT_LINK_URL_CLASS*)   -> "255.255.255.255", "a.b.c"; NOTE: "http:///" is valid
      //   space*    -> \p{Zs}*
      //   text      -> ([^\]\\x00-\\x08\\x0a-\\x1F]*?)         -> "abcd"
      // NOTE: /S=extra analysis of pattern /u = unicode support; REF.MW:http://php.net/manual/en/reference.pcre.pattern.modifiers.php

      // Simplified expression to match an IPv4 or IPv6 address, or
      // at least one character of a host name (embeds EXT_LINK_URL_CLASS)
      // static final EXT_LINK_ADDR = '(?:[0-9.]+|\\[(?i:[0-9a-f:.]+)\\]|[^][<>"\\x00-\\x20\\x7F\p{Zs}])';
      //
      // REGEX: "IPv4" | "IPv6" | "url-char"
      //   IPv4     -> [0-9.]+                          -> "255."
      //   IPv6     -> \\[(?i:[0-9a-f:.]+)\\]           -> "2001:"
      //   url-char -> [^][<>"\\x00-\\x20\\x7F\p{Zs}]   -> "abcde"

      // Constants needed for external link processing
      // Everything except bracket, space, or control characters
      // \p{Zs} is unicode 'separator, space' category. It covers the space 0x20
      // as well as U+3000 is IDEOGRAPHIC SPACE for T21052
      // static final EXT_LINK_URL_CLASS = '[^][<>"\\x00-\\x20\\x7F\p{Zs}]';
      //
      // REGEX: NOT [ "symbols" | "control" | "whitespace" ]
      //   symbols    -> ^][<>"
      //   control    -> \\x00-\\x20\\x7F
      //   whitespace -> \p{Zs}

      // search for "["
      int lnke_bgn = Bry_find_.Find_fwd(src, Byte_ascii.Brack_bgn, cur, src_end);
      if (lnke_bgn == Bry_find_.Not_found) {
        bfr.Add_mid(src, cur, src_end);
        break; // no more "["; stop
      }

      // check for protocol; EX: "https://"
      cur = lnke_bgn + 1;
      int url_bgn = cur;
      Object protocol_bry = protocol_trie.Match_at(trv, src, cur, src_end);
      if (protocol_bry == null) {
        bfr.Add_mid(src, prv, cur);
        prv = cur;
        continue;// unknown protocol; ignore "["
      }
      cur += ((byte[])protocol_bry).length;

      // check for one-or-more url chars; [^][<>"\\x00-\\x20\\x7F\p{Zs}]
      int domain_bgn = cur;
      cur = regex_url.Find_fwd_while(trv, src, domain_bgn, src_end);
      if (cur - domain_bgn == 0) {
        bfr.Add_mid(src, prv, cur);
        prv = cur;
        continue; // no chars found; invalid; EX: "[https://"abcde"]"
      }
      int url_end = cur;

      // skip ws
      cur = regex_space.Find_fwd_while(trv, src, cur, src_end);

      // get text (if any)
      // bounded by src_end: an unterminated link at the end of the text (EX: "[https://a.org") must not read past the valid text
      int text_bgn = -1, text_end = -1;
      while (cur < src_end) {
        byte b = src[cur];
        Object invalid_text_char = invalid_text_chars_trie.Match_at_w_b0(trv, b, src, cur, src_end);
        if (invalid_text_char != null) break;
        if (text_bgn == -1) text_bgn = cur;
        cur += gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(b);
        text_end = cur;
      }
      // a truncated multi-byte char at the end of the text can push cur past src_end; clamp it
      if (cur > src_end) {
        cur = src_end;
        text_end = src_end;
      }

      // check for "]"; reaching src_end means that the link was never closed
      if (cur >= src_end || src[cur] != Byte_ascii.Brack_end) {
        bfr.Add_mid(src, prv, cur);
        prv = cur;
        continue;
      }
      cur++;
      // PORTED.END: $bits = preg_split( $this->mExtLinkBracketedRegex, $text, -1, PREG_SPLIT_DELIM_CAPTURE );

      // The characters '<' and '>' (which were escaped by
      // removeHTMLtags()) should not be included in
      // URLs, per RFC 2396.
      if (Php_preg_.Match(angle_entities_trie, trv, src, url_bgn, url_end) != null) {
        int angle_bgn = trv.Match_bgn;
        // if there was no text, the text is the url tail only; without this text_end would stay -1
        // NOTE: MW appends " " + text to the url tail; here the original bytes between the tail and the text are kept
        if (text_end == -1) text_end = url_end;
        text_bgn = angle_bgn;
        url_end = angle_bgn;
      }

      // If the link text is an image URL, replace it with an <img> tag
      // This happened by accident in the original parser, but some people used it extensively
      // XO.MW.UNSUPPORTED.NON-WMF: not supporting images from freefrom url; (EX: "http://a.org/image.png" -> "<img>"); haven't seen this used on WMF wikis
      // $img = $this->maybeMakeExternalImage( $text );
      // if ($img !== false) $text = $img;
      // 
      // XO.MW.SKIP: See "Have link text"
      //$dtrail = '';

      // Set linktype for CSS - if URL==text, link is essentially free
      boolean text_missing = text_bgn == -1;
      byte[] link_type = text_missing ? Link_type__free : Link_type__text;

      byte[] text;
      // No link text, e.g. [http://domain.tld/some.link]
      if (text_missing) {
        // Autonumber; EX: "[123]"
        tmp.Add_byte(Byte_ascii.Brack_bgn);
        tmp.Add_int_variable(autonumber++);  // TODO.XO:$langObj->formatNum( ++$this->mAutonumber );
        tmp.Add_byte(Byte_ascii.Brack_end);
        // use the generated number as the link text and leave the shared tmp buffer empty for its next user
        text = tmp.To_bry_and_clear();
        link_type = Link_type__autonumber;
      }
      else {
        // XO.MW.SKIP: skipped b/c MW splits $trail into $dtrail and $trail but does no extra logic with variables; just concatenates later; "$this->getExternalLinkAttribs( $url ) ) . $dtrail . $trail;"
        // Have link text, e.g. [http://domain.tld/some.link text]s
        // Check for trail
        // list( $dtrail, $trail ) = Linker::splitTrail( $trail );
        text = Bry_.Mid(src, text_bgn, text_end);
      }

      // TODO.XO:
      // $text = $this->getConverterLanguage()->markNoConversion( $text );

      byte[] url = Bry_.Mid(src, url_bgn, url_end);
      url = sanitizer.Clean_url(url);

      // copy everything before the "[" and mark the link as consumed
      bfr.Add_mid(src, prv, lnke_bgn);
      prv = cur;

      // Use the encoded URL
      // This means that users can paste URLs directly into the text
      // Funny characters like ö aren't valid in URLs anyway
      // This was changed in August 2004
      linker.makeExternalLink(bfr, url, text, Bool_.N, link_type, parser.Get_external_link_attribs(attribs), Bry_.Empty);

      // XO.MW.UNSUPPORTED.HOOK: registers link for processing by other extensions?
      // Register link in the output Object.
      // Replace unnecessary URL escape codes with the referenced character
      // This prevents spammers from hiding links from the filters
      // $pasteurized = self::normalizeLinkUrl( $url );
      // $this->mOutput->addExternalLink( $pasteurized );
    }
  }
  private static byte[] Link_type__free, Link_type__text, Link_type__autonumber;
  // guard for the one-time static init in the constructor; volatile so that a non-null read also sees the other statics
  private static volatile Btrie_slim_mgr angle_entities_trie;
  private static Btrie_slim_mgr invalid_text_chars_trie;
  // adds the bytes in ary as one trie key; the stored value is the key's length
  private static void New__trie_itm__by_len(Btrie_slim_mgr mgr, int... ary) {
    mgr.Add_obj(Bry_.New_by_ints(ary), new Int_obj_val(ary.length));
  }
}

View File

@@ -0,0 +1,71 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.lnkes; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
import org.junit.*;
// Tests for Xomw_lnke_wkr: bracketed external links and the inputs that must be left unchanged.
public class Xomw_lnke_wkr__tst {
  private final Xomw_lnke_wkr__fxt fxt = new Xomw_lnke_wkr__fxt();
  @Test public void Basic() {
    fxt.Test__parse("[https://a.org b]", "<a rel='nofollow' class='external text' href='https://a.org'>b</a>");
  }
  @Test public void Invaild__protocol() {
    fxt.Test__parse("[httpz:a.org]", "[httpz:a.org]");
  }
  @Test public void Invaild__protocol_slash() {
    fxt.Test__parse("[https:a.org]", "[https:a.org]");
  }
  @Test public void Invaild__urlchars__0() {
    fxt.Test__parse("[https://]", "[https://]");
  }
  @Test public void Invaild__urlchars__bad() {
    fxt.Test__parse("[https://\"]", "[https://\"]");
  }
  @Test public void Many() {
    String src = String_.Concat_lines_nl_apos_skip_last
    ( "a"
    , "[https://b.org c]"
    , "d"
    , "[https://e.org f]"
    , "g"
    );
    String expd = String_.Concat_lines_nl_apos_skip_last
    ( "a"
    , "<a rel='nofollow' class='external text' href='https://b.org'>c</a>"
    , "d"
    , "<a rel='nofollow' class='external text' href='https://e.org'>f</a>"
    , "g"
    );
    fxt.Test__parse(src, expd);
  }
  @Test public void Protocol_rel() {
    fxt.Test__parse("[//a.org b]", "<a rel='nofollow' class='external text' href='//a.org'>b</a>");
  }
  @Test public void Url_should_not_has_angle_entities() {
    // the url stops at the escaped angle bracket; the rest becomes part of the link text
    fxt.Test__parse("[https://a.org/b&lt;c z]", "<a rel='nofollow' class='external text' href='https://a.org/b'>&lt;c z</a>");
    fxt.Test__parse("[https://a.org/b&gt;c z]", "<a rel='nofollow' class='external text' href='https://a.org/b'>&gt;c z</a>");
  }
  @Test public void Link_trail() {// checks for noop via "Have link text"
    fxt.Test__parse("[https://a.org b]xyz", "<a rel='nofollow' class='external text' href='https://a.org'>b</a>xyz");
    fxt.Test__parse("[https://a.org b]x!z", "<a rel='nofollow' class='external text' href='https://a.org'>b</a>x!z");
  }
  @Test public void Clean_url() {
    // NOTE(review): the source literal contains an invisible character between "&quot;" and "b" (presumably a soft hyphen); keep it when editing
    fxt.Test__parse("[https://a&quot;­b c]", "<a rel='nofollow' class='external text' href='https://a%22b'>c</a>");
  }
}
// Fixture for Xomw_lnke_wkr tests: wires the worker with the default protocols and regexes,
// runs Replace_external_links over a string and compares the result buffer with the expected text.
class Xomw_lnke_wkr__fxt {
  private final Xomw_lnke_wkr wkr = new Xomw_lnke_wkr(new Xomw_parser());
  private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr();
  private boolean apos = true; // when true, expd is passed through Gfh_utl.Replace_apos before comparing
  public Xomw_lnke_wkr__fxt() {
    Xomw_regex_space space_regex = new Xomw_regex_space();
    Xomw_regex_url url_regex = new Xomw_regex_url(space_regex);
    wkr.Init_by_wiki(Xomw_parser.Protocols__dflt(), url_regex, space_regex);
  }
  public void Test__parse(String src_str, String expd) {
    byte[] raw = Bry_.new_u8(src_str);
    wkr.Replace_external_links(new Xomw_parser_ctx(), pbfr.Init(raw));
    if (apos) {
      expd = gplx.langs.htmls.Gfh_utl.Replace_apos(expd);
    }
    String actl = pbfr.Rslt().To_str_and_clear();
    Tfds.Eq_str_lines(expd, actl, src_str);
  }
}

View File

@@ -0,0 +1,22 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.lnkis; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
// Holder pairing a parameter map with a magic-word array; both fields start out null.
public class Xomw_image_params {
  public Xomw_param_map paramMap;
  public Xomw_MagicWordArray mwArray;
}

View File

@@ -0,0 +1,858 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.lnkis; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
import gplx.core.btries.*; import gplx.core.primitives.*;
import gplx.xowa.mediawiki.includes.utls.*;
import gplx.xowa.wikis.nss.*; import gplx.xowa.wikis.xwikis.*;
import gplx.xowa.mediawiki.includes.parsers.*; import gplx.xowa.mediawiki.includes.parsers.quotes.*;
import gplx.xowa.mediawiki.includes.htmls.*; import gplx.xowa.mediawiki.includes.linkers.*;
import gplx.xowa.mediawiki.includes.libs.*;
import gplx.xowa.mediawiki.includes.media.*; import gplx.xowa.mediawiki.includes.filerepo.file.*;
import gplx.xowa.parsers.uniqs.*;
/* TODO.XO
* P7: multi-line links; // look at the next 'line' to see if we can close it there
* P7: interwiki
* P7: [[File:]]
* P7: [[Category:]]
* P6: [[Media:]]
* P4: handle "]]]"; "If we get a ] at the beginning of $m[3]"
* P4: handle "[[http://a.org]]"
* P3: $langObj->formatNum( ++$this->mAutonumber );
* P2: $this->getConverterLanguage()->markNoConversion( $text );
* P1: link_prefix; EX: b[[A]]; [not enabled on enwiki]
*/
public class Xomw_lnki_wkr {// THREAD.UNSAFE: caching for repeated calls
private final Xomw_link_holders holders;
private final Xomw_linker linker;
private final Xomw_link_renderer link_renderer;
// private final Btrie_slim_mgr protocols_trie;
private final Xomw_quote_wkr quote_wkr;
private final Xomw_strip_state strip_state;
private Xomw_parser_env env;
private Xow_wiki wiki;
private Xoa_ttl page_title;
private final Xomw_linker__normalize_subpage_link normalize_subpage_link = new Xomw_linker__normalize_subpage_link();
private final Bry_bfr tmp;
private final Xomw_parser parser;
private final Xomw_atr_mgr extra_atrs = new Xomw_atr_mgr();
private final Xomw_qry_mgr query = new Xomw_qry_mgr();
private final Btrie_rv trv = new Btrie_rv();
private final List_adp tmp_list = List_adp_.New();
private final Hash_adp mImageParams = Hash_adp_bry.cs();
private final Hash_adp mImageParamsMagicArray = Hash_adp_bry.cs();
// Stores the collaborators passed in and caches the helpers owned by the parser.
// protocols_trie is accepted but currently unused (its field is commented out).
public Xomw_lnki_wkr(Xomw_parser parser, Xomw_link_holders holders, Xomw_link_renderer link_renderer, Btrie_slim_mgr protocols_trie) {
  // collaborators supplied by the caller
  this.parser = parser;
  this.holders = holders;
  this.link_renderer = link_renderer;
  // this.protocols_trie = protocols_trie;

  // helpers taken from the parser
  this.linker      = parser.Linker();
  this.quote_wkr   = parser.Quote_wkr();
  this.tmp         = parser.Tmp();
  this.strip_state = parser.Strip_state();
}
// Binds the worker to a wiki and, on first use, builds the table of chars allowed in a link title.
public void Init_by_wiki(Xomw_parser_env env, Xow_wiki wiki) {
  this.env = env;
  this.wiki = wiki;

  // table already built by an earlier call
  if (title_chars_for_lnki != null) return;

  // start from a copy of the valid title chars and additionally allow "#" and "%"
  boolean[] lnki_chars = (boolean[])Array_.Clone(Xomw_ttl_utl.Title_chars_valid());
  lnki_chars[Byte_ascii.Hash] = true;
  lnki_chars[Byte_ascii.Percent] = true; // the % is needed to support urlencoded titles as well
  title_chars_for_lnki = lnki_chars;
}
// Resets per-parse state by clearing the link-holder collection.
public void Clear_state() {
holders.Clear();
}
/**
 * Buffer-pair entry point: reads wikitext from the parser buffer's source side,
 * writes the result to its target side, and records the page title from pctx
 * (used for self-link detection and subpage links).
 */
public void Replace_internal_links(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) {
	// XO.PBFR
	// capture the source bytes and length before the buffers are switched
	Bry_bfr input = pbfr.Src();
	byte[] input_bry = input.Bfr();
	int input_len = input.Len();
	Bry_bfr output = pbfr.Trg();
	pbfr.Switch();

	this.page_title = pctx.Page_title();
	Replace_internal_links(pctx, output, input_bry, 0, input_len);
}
// XO.MW:SYNC:1.29; DATE:2017-02-02
/**
 * Scans src[src_bgn..src_end) for "[[...]]" links and writes the converted text to bfr.
 *
 * Text outside links is copied as-is. Each candidate link is split into a title and an
 * optional caption, then dispatched by namespace: File (makeImage), Category (link is
 * removed), self-link, or a regular link (rendered directly or turned into a link holder).
 * Malformed candidates are written out unchanged.
 *
 * Bookkeeping: "cur" is the scan position; "prv" is the start of the source text that has
 * not yet been copied to bfr. Both are equal at the top of every loop iteration.
 *
 * @param pctx    parser context; passed on to makeImage
 * @param bfr     output buffer
 * @param src     wikitext bytes
 * @param src_bgn first position to scan (inclusive)
 * @param src_end last position to scan (exclusive); may be less than src.length
 */
public void Replace_internal_links(Xomw_parser_ctx pctx, Bry_bfr bfr, byte[] src, int src_bgn, int src_end) {
	// XO.MW: regex for tc move to header; e1 and e1_img moved to code
	// the % is needed to support urlencoded titles as well
	// XO.MW.BGN: split the entire text String on occurrences of [[
	int cur = src_bgn;
	int prv = cur;
	while (true) {
		int lnki_bgn = Bry_find_.Find_fwd(src, Bry__wtxt__lnki__bgn, cur, src_end); // $a = StringUtils::explode('[[', ' ' . $s);
		if (lnki_bgn == Bry_find_.Not_found) { // no more "[["; stop loop
			bfr.Add_mid(src, prv, src_end);
			break;
		}
		cur = lnki_bgn + 2; // 2="[[".length
		// XO.MW.IGNORE: handles strange split logic of adding space to String; "$s = substr($s, 1);"
		// TODO.XO:link_prefix; EX: b[[A]]
		// $useLinkPrefixExtension = $this->getTargetLanguage()->linkPrefixExtension();
		// $e2 = null;
		// if ($useLinkPrefixExtension) {
		//   // Match the end of a line for a word that's not followed by whitespace,
		//   // e.g. in the case of 'The Arab al[[Razi]]', 'al' will be matched
		//   global $wgContLang;
		//   $charset = $wgContLang->linkPrefixCharset();
		//   $e2 = "/^((?>.*[^$charset]|))(.+)$/sDu";
		// }
		// IGNORE: throw new MWException(__METHOD__ . ": \$this->mTitle is null\n");
		// $nottalk = !$this->mTitle->isTalkPage();
		// TODO.XO:link_prefix
		byte[] prefix = Bry_.Empty;
		//if ($useLinkPrefixExtension) {
		//  $m = [];
		//  if (preg_match($e2, $s, $m)) {
		//    $first_prefix = $m[2];
		//  } else {
		//    $first_prefix = false;
		//  }
		//} else {
		//  $prefix = '';
		//}
		// TODO.XO:link_prefix; EX: b[[A]]
		//if ($useLinkPrefixExtension) {
		//  if (preg_match($e2, $s, $m)) {
		//    $prefix = $m[2];
		//    $s = $m[1];
		//  } else {
		//    $prefix = '';
		//  }
		//  // first link
		//  if ($first_prefix) {
		//    $prefix = $first_prefix;
		//    $first_prefix = false;
		//  }
		//}
		// PORTED.BGN: if (preg_match($e1, $line, $m)) && else if (preg_match($e1_img, $line, $m))
		// NOTE: both e1 and e1_img are effectively the same; e1_img allows nested "[["; EX: "[[A|b[[c]]d]]" will stop at "[[A|b"
		int ttl_bgn = cur;
		int ttl_end = Xomw_ttl_utl.Find_fwd_while_title(src, cur, src_end, title_chars_for_lnki);
		cur = ttl_end;
		int capt_bgn = -1, capt_end = -1;
		int nxt_lnki = -1;
		boolean might_be_img = false;
		// at least one valid title-char found; check for "|" or "]]" EX: "[[a"
		// XO: also require a byte after the title inside the scanned range; a title that runs to
		// src_end ("[[a" at end of input) is unterminated and must not read src[src_end]
		if (ttl_end > ttl_bgn && ttl_end < src_end) {
			byte nxt_byte = src[ttl_end];
			if (nxt_byte == Byte_ascii.Pipe) { // handles lnki with capt ([[A|a]])and lnki with file ([[File:A.png|b|c|d]])
				cur = ttl_end + 1;
				// find next "[["
				nxt_lnki = Bry_find_.Find_fwd(src, Bry__wtxt__lnki__bgn, cur, src_end);
				if (nxt_lnki == Bry_find_.Not_found)
					nxt_lnki = src_end;
				// find end "]]"
				capt_bgn = cur;
				capt_end = Bry_find_.Find_fwd(src, Bry__wtxt__lnki__end, cur, nxt_lnki);
				if (capt_end == Bry_find_.Not_found) {
					capt_end = nxt_lnki;
					cur = nxt_lnki;
					might_be_img = true;
				}
				else {
					cur = capt_end + Bry__wtxt__lnki__end.length;
				}
			}
			else if (ttl_end + 2 <= src_end && Bry_.Match(src, ttl_end, ttl_end + 2, Bry__wtxt__lnki__end)) { // handles simple lnki; EX: [[A]]
				cur = ttl_end + 2;
			}
			else {
				ttl_end = -1;
			}
		}
		else
			ttl_end = -1;
		if (ttl_end == -1) { // either (a) no valid title-chars ("[[<") or (b) title char, but has stray "]" ("[[a]b]]") or (c) title runs to end of input
			// Invalid form; output directly
			// XO: emit everything up to and including "[[", then resume scanning right after it.
			// (a second Add_mid(src, cur, ttl_bgn) used to follow; its range was inverted whenever title chars had been consumed)
			bfr.Add_mid(src, prv, ttl_bgn);
			prv = cur = ttl_bgn;
			continue;
		}
		// PORTED.END: if (preg_match($e1, $line, $m)) && else if (preg_match($e1_img, $line, $m))
		byte[] text = capt_bgn == -1 ? Bry_.Empty : Bry_.Mid(src, capt_bgn, capt_end);
		byte[] trail = Bry_.Empty;
		if (!might_be_img) {
			// TODO.XO:
			// If we get a ] at the beginning of $m[3] that means we have a link that's something like:
			// [[Image:Foo.jpg|[http://example.com desc]]] <- having three ] in a row fucks up,
			// the real problem is with the $e1 regex
			// See T1500.
			// Still some problems for cases where the ] is meant to be outside punctuation,
			// and no image is in sight. See T4095.
			// if ($text !== ''
			//   && substr($m[3], 0, 1) === ']'
			//   && strpos($text, '[') !== false
			// ) {
			//   $text .= ']'; // so that replaceExternalLinks($text) works later
			//   $m[3] = substr($m[3], 1);
			// }
			// fix up urlencoded title texts
			// if (strpos($m[1], '%') !== false) {
			//   // Should anchors '#' also be rejected?
			//   $m[1] = str_replace([ '<', '>' ], [ '&lt;', '&gt;' ], rawurldecode($m[1]));
			// }
			// $trail = $m[3];
		}
		else {
			// Invalid, but might be an image with a link in its caption
			// $text = $m[2];
			// if (strpos($m[1], '%') !== false) {
			//   $m[1] = str_replace([ '<', '>' ], [ '&lt;', '&gt;' ], rawurldecode($m[1]));
			// }
			// $trail = "";
		}
		byte[] orig_link = Bry_.Mid(src, ttl_bgn, ttl_end);
		// TODO.XO: handle "[[http://a.org]]"
		// Don't allow @gplx.Internal protected links to pages containing
		// PROTO: where PROTO is a valid URL protocol; these
		// should be external links.
		// if (preg_match('/^(?i:' . $this->mUrlProtocols . ')/', $origLink)) {
		//   $s .= $prefix . '[[' . $line;
		//   continue;
		// }
		byte[] link = orig_link;
		boolean no_force = orig_link[0] != Byte_ascii.Colon;
		if (!no_force) {
			// Strip off leading ':'
			link = Bry_.Mid(link, 1);
		}
		Xoa_ttl nt = wiki.Ttl_parse(link);
		// Make subpage if necessary
		// XO: Ttl_parse result is null-checked below, so it must not be dereferenced unguarded here
		if (nt != null && nt.Ns().Subpages_enabled()) {
			Maybe_do_subpage_link(normalize_subpage_link, orig_link, text);
			link = normalize_subpage_link.link;
			text = normalize_subpage_link.text;
			nt = wiki.Ttl_parse(link);
		}
		// IGNORE: handled in rewrite above
		// else {
		//   link = orig_link;
		// }
		byte[] unstrip = strip_state.Unstrip_nowiki(link);
		if (!Bry_.Eq(unstrip, link))
			nt = wiki.Ttl_parse(unstrip);
		if (nt == null) {
			bfr.Add_mid(src, prv, lnki_bgn + 2); // $s .= $prefix . '[[' . $line;
			prv = cur = lnki_bgn + 2;
			continue;
		}
		// XO: the candidate is now known to be consumed (src[lnki_bgn..cur)); flush the plain text
		// before it exactly once, so that every branch below -- including those that "continue" --
		// neither drops that text nor re-emits the raw link on a later iteration
		bfr.Add_mid(src, prv, lnki_bgn);
		prv = cur;
		Xow_ns ns = nt.Ns();
		Xow_xwiki_itm iw = nt.Wik_itm();
		if (might_be_img) { // if this is actually an invalid link
			if (ns.Id_is_file() && no_force) { // but might be an image
				boolean found = false;
				// while (true) {
				//   // look at the next 'line' to see if we can close it there
				//   a->next();
				//   next_line = a->current();
				//   if (next_line === false || next_line === null) {
				//     break;
				//   }
				//   m = explode(']]', next_line, 3);
				//   if (count(m) == 3) {
				//     // the first ]] closes the inner link, the second the image
				//     found = true;
				//     text .= "[[{m[0]}]]{m[1]}";
				//     trail = m[2];
				//     break;
				//   } else if (count(m) == 2) {
				//     // if there's exactly one ]] that's fine, we'll keep looking
				//     text .= "[[{m[0]}]]{m[1]}";
				//   } else {
				//     // if next_line is invalid too, we need look no further
				//     text .= '[[' . next_line;
				//     break;
				//   }
				// }
				if (!found) {
					// we couldn't find the end of this imageLink, so output it raw
					// but don't ignore what might be perfectly normal links in the text we've examined
					Bry_bfr nested = wiki.Utl__bfr_mkr().Get_b128();
					this.Replace_internal_links(pctx, nested, text, 0, text.length);
					// XO: keep the converted text (MW passes $text by reference); read it before releasing the buffer
					text = nested.To_bry_and_clear();
					nested.Mkr_rls();
					bfr.Add(prefix).Add(Bry__wtxt__lnki__bgn).Add(link).Add_byte_pipe().Add(text); // s .= "{prefix}[[link|text";
					// note: no trail, because without an end, there *is* no trail
					continue;
				}
			}
			else { // it's not an image, so output it raw
				bfr.Add(prefix).Add(Bry__wtxt__lnki__bgn).Add(link).Add_byte_pipe().Add(text); // s .= "{prefix}[[link|text";
				// note: no trail, because without an end, there *is* no trail
				continue;
			}
		}
		boolean was_blank = text.length == 0;
		if (was_blank) {
			text = link;
		}
		else {
			// T6598 madness. Handle the quotes only if they come from the alternate part
			// [[Lista d''e paise d''o munno]] -> <a href="...">Lista d''e paise d''o munno</a>
			// [[Criticism of Harry Potter|Criticism of ''Harry Potter'']]
			//    -> <a href="Criticism of Harry Potter">Criticism of <i>Harry Potter</i></a>
			text = quote_wkr.Do_quotes(tmp, text);
		}
		// Link not escaped by : , create the various objects
		// if (no_force && !nt->wasLocalInterwiki()) {
		// Interwikis
		// if (
		//   iw && this->mOptions->getInterwikiMagic() && nottalk && (
		//     Language::fetchLanguageName(iw, null, 'mw') ||
		//     in_array(iw, wgExtraInterlanguageLinkPrefixes)
		//   )
		// ) {
		//   T26502: filter duplicates
		//   if (!isset(this->mLangLinkLanguages[iw])) {
		//     this->mLangLinkLanguages[iw] = true;
		//     this->mOutput->addLanguageLink(nt->getFullText());
		//   }
		//
		//   s = rtrim(s . prefix);
		//   s .= trim(trail, "\n") == '' ? '': prefix . trail;
		//   continue;
		// }
		//
		if (ns.Id_is_file()) {
			// boolean is_good_image = !wfIsBadImage(nt->getDBkey(), this->mTitle)
			boolean is_good_image = true;
			if (is_good_image) {
				if (was_blank) {
					// if no parameters were passed, text
					// becomes something like "File:Foo.png",
					// which we don't want to pass on to the
					// image generator
					text = Bry_.Empty;
				}
				else {
					// recursively parse links inside the image caption
					// actually, this will parse them in any other parameters, too,
					// but it might be hard to fix that, and it doesn't matter ATM
					// text = this->replaceExternalLinks(text);
					// holders->merge(this->replaceInternalLinks2(text));
				}
				// cloak any absolute URLs inside the image markup, so replaceExternalLinks() won't touch them
				bfr.Add(prefix);
				// Armor_links(Make_image(bfr, nt, text, holders))
				this.makeImage(pctx, bfr, nt, text, holders);
				bfr.Add(trail);
				continue;
			}
		}
		else if (ns.Id_is_ctg()) {
			bfr.Trim_end_ws(); // s = rtrim(s . "\n"); // T2087
			if (was_blank) {
				// sortkey = this->getDefaultSort();
			}
			else {
				// sortkey = text;
			}
			// sortkey = Sanitizer::decodeCharReferences(sortkey);
			// sortkey = str_replace("\n", '', sortkey);
			// sortkey = this->getConverterLanguage()->convertCategoryKey(sortkey);
			// this->mOutput->addCategory(nt->getDBkey(), sortkey);
			//
			// Strip the whitespace Category links produce, see T2087
			// s .= trim(prefix . trail, "\n") == '' ? '' : prefix . trail;
			continue;
		}
		// }
		// Self-link checking. For some languages, variants of the title are checked in
		// LinkHolderArray::doVariants() to allow batching the existence checks necessary
		// for linking to a different variant.
		if (!ns.Id_is_special() && nt.Eq_full_db(page_title) && !nt.Has_fragment()) {
			bfr.Add(prefix);
			linker.makeSelfLinkObj(bfr, nt, text, Bry_.Empty, trail, Bry_.Empty);
			continue;
		}
		// NS_MEDIA is a pseudo-namespace for linking directly to a file
		// @todo FIXME: Should do batch file existence checks, see comment below
		if (ns.Id_is_media()) {
			// Give extensions a chance to select the file revision for us
			// options = [];
			// desc_query = false;
			// MW.HOOK:BeforeParserFetchFileAndTitle
			// Fetch and register the file (file title may be different via hooks)
			// list(file, nt) = this->fetchFileAndTitle(nt, options);
			// Cloak with NOPARSE to avoid replacement in replaceExternalLinks
			// s .= prefix . this->armorLinks(
			//   Linker::makeMediaLinkFile(nt, file, text)) . trail;
			// continue;
		}
		// Some titles, such as valid special pages or files in foreign repos, should
		// be shown as bluelinks even though they're not included in the page table
		// @todo FIXME: isAlwaysKnown() can be expensive for file links; we should really do
		// batch file existence checks for NS_FILE and NS_MEDIA
		if (iw == null && nt.Is_always_known()) {
			// this->mOutput->addLink(nt);
			Make_known_link_holder(bfr, nt, text, trail, prefix);
		}
		else {
			// Links will be added to the output link list after checking
			holders.Make_holder(bfr, nt, text, Bry_.Ary_empty, trail, prefix);
		}
	}
}
/**
 * Parses the pipe-separated options of an image link ("thumb|200px|caption") and
 * delegates rendering to linker.makeImageLink.
 *
 * Each option is matched against the image magic words. Options that match and validate
 * are stored in the parameter map; the last option that does not validate becomes the caption.
 *
 * @param pctx            parser context; supplies the reusable scratch objects used below
 * @param bfr             output buffer that receives the image HTML
 * @param title           title of the file
 * @param options_at_link raw option text after the first "|" of the link (may be empty)
 * @param holders         link holders; not used by the ported code yet (stripAltText is still TODO)
 */
public void makeImage(Xomw_parser_ctx pctx, Bry_bfr bfr, Xoa_ttl title, byte[] options_at_link, Xomw_link_holders holders) {
	// Check if the options text is of the form "options|alt text"
	// Options are:
	//  * thumbnail  make a thumbnail with enlarge-icon and caption, alignment depends on lang
	//  * left       no resizing, just left align. label is used for alt= only
	//  * right      same, but right aligned
	//  * none       same, but not aligned
	//  * ___px      scale to ___ pixels width, no aligning. e.g. use in taxobox
	//  * center     center the image
	//  * frame      Keep original image size, no magnify-button.
	//  * framed     Same as "frame"
	//  * frameless  like 'thumb' but without a frame. Keeps user preferences for width
	//  * upright    reduce width for upright images, rounded to full __0 px
	//  * border     draw a 1px border around the image
	//  * alt        Text for HTML alt attribute (defaults to empty)
	//  * class      Set a class for img node
	//  * link       Set the target of the image link. Can be external, interwiki, or local
	// vertical-align values (no % or length right now):
	//  * baseline
	//  * sub
	//  * super
	//  * top
	//  * text-top
	//  * middle
	//  * bottom
	//  * text-bottom
	// Protect LanguageConverter markup when splitting into parts
	byte[][] parts = Xomw_string_utils.Delimiter_explode(tmp_list, trv, options_at_link);
	// Give extensions a chance to select the file revision for us
	// $options = [];
	byte[] desc_query = null;
	// XO.MW.HOOK:BeforeParserFetchFileAndTitle
	// Fetch and register the file (file title may be different via hooks)
	// list($file, $title) = $this->fetchFileAndTitle($title, $options);
	Xomw_File file = fetchFileAndTitle(title, null);
	// Get parameter map
	Xomw_MediaHandler handler = file == null ? null : file.getHandler();
	Xomw_image_params tmp_img_params = pctx.Lnki_wkr__make_image__img_params;
	this.getImageParams(tmp_img_params, handler);
	Xomw_param_map paramMap = tmp_img_params.paramMap;
	Xomw_MagicWordArray mwArray = tmp_img_params.mwArray;
	// XO.MW.UNSUPPORTED.TrackingCategory: if (!$file) $this->addTrackingCategory('broken-file-category');
	// Process the input parameters
	byte[] caption = Bry_.Empty;
	// XO.MW: $params = [ 'frame' => [], 'handler' => [], 'horizAlign' => [], 'vertAlign' => [] ];
	Xomw_params_frame frameParams = paramMap.Frame.Clear();
	Xomw_params_handler handlerParams = paramMap.Handler.Clear();
	// Xomw_params_horizAlign horizAlignParams = paramMap.HorizAlign.Clear();
	// Xomw_params_vertAlign vertAlignParams = paramMap.VertAlign.Clear();
	boolean seen_format = false;
	int parts_len = parts.length;
	for (int i = 0; i < parts_len; i++) {
		byte[] part = parts[i];
		part = Bry_.Trim(part);
		byte[][] tmp_match_word = pctx.Lnki_wkr__make_image__match_magic_word;
		mwArray.matchVariableStartToEnd(tmp_match_word, part);
		byte[] magic_name = tmp_match_word[0];
		byte[] val = tmp_match_word[1];
		boolean validated = false;
		Xomw_param_itm param_item = paramMap.Get_by(magic_name);
		if (param_item != null) {
			int typeUid = param_item.type_uid;
			int paramNameUid = param_item.name_uid;
			// Special case; width and height come in one variable together
			if (typeUid == Xomw_param_map.Type__handler && paramNameUid == Xomw_param_itm.Name__width) {
				int[] tmp_img_size = pctx.Lnki_wkr__make_image__img_size;
				this.parseWidthParam(tmp_img_size, val);
				int parsedW = tmp_img_size[0];
				int parsedH = tmp_img_size[1];
				// XO: parseWidthParam leaves Php_utl_.Null_int in a slot it did not set (MW: key absent from the array);
				// such a slot must be skipped just like 0, not handed to the handler as a real dimension
				if (parsedW != 0 && parsedW != Php_utl_.Null_int) {
					if (handler.validateParam(Xomw_param_itm.Name__width, null, parsedW)) {
						paramMap.Set(typeUid, Xomw_param_itm.Name__width, null, parsedW);
						validated = true;
					}
				}
				if (parsedH != 0 && parsedH != Php_utl_.Null_int) {
					if (handler.validateParam(Xomw_param_itm.Name__height, null, parsedH)) {
						paramMap.Set(typeUid, Xomw_param_itm.Name__height, null, parsedH);
						validated = true;
					}
				}
				// else no validation -- T15436
			}
			else {
				if (typeUid == Xomw_param_map.Type__handler) {
					// Validate handler parameter
					// validated = $handler->validateParam($paramName, $value);
				}
				else {
					// Validate @gplx.Internal protected parameters
					switch (paramNameUid) {
						case Xomw_param_itm.Name__manual_thumb:
						case Xomw_param_itm.Name__alt:
						case Xomw_param_itm.Name__class:
							// @todo FIXME: Possibly check validity here for
							// manualthumb? downstream behavior seems odd with
							// missing manual thumbs.
							validated = true;
							// $value = $this->stripAltText($value, $holders);
							break;
						case Xomw_param_itm.Name__link:
							// $chars = self::EXT_LINK_URL_CLASS;
							// $addr = self::EXT_LINK_ADDR;
							// $prots = $this->mUrlProtocols;
							// if ($value === '') {
							//   $paramName = 'no-link';
							//   $value = true;
							validated = true;
							// }
							// else if (preg_match("/^((?i)$prots)/", $value)) {
							//   if (preg_match("/^((?i)$prots)$addr$chars*$/u", $value, $m)) {
							//     $paramName = 'link-url';
							//     $this->mOutput->addExternalLink($value);
							//     if ($this->mOptions->getExternalLinkTarget()) {
							//       $params[$type]['link-target'] = $this->mOptions->getExternalLinkTarget();
							//     }
							validated = true;
							//   }
							// } else {
							//   $linkTitle = Title::newFromText($value);
							//   if ($linkTitle) {
							//     $paramName = 'link-title';
							//     $value = $linkTitle;
							//     $this->mOutput->addLink($linkTitle);
							validated = true;
							//   }
							// }
							break;
						case Xomw_param_itm.Name__frameless:
						case Xomw_param_itm.Name__framed:
						case Xomw_param_itm.Name__thumbnail:
							// use first appearing option, discard others.
							validated = !seen_format;
							seen_format = true;
							break;
						default:
							// Most other things appear to be empty or numeric...
							validated = (val == null || Php_utl_.isnumeric(Bry_.Trim(val)));
							break;
					}
				}
				if (validated) {
					paramMap.Set(typeUid, paramNameUid, val, -1);
				}
			}
		}
		if (!validated) {
			caption = part;
		}
	}
	// Process alignment parameters
	// XO: local renamed from "tmp" so it no longer shadows the Bry_bfr field of the same name
	Xomw_param_itm align_itm = paramMap.Get_by(Xomw_param_map.Type__horizAlign);
	if (align_itm != null) {
		// frameParams.align = tmp.val;
	}
	align_itm = paramMap.Get_by(Xomw_param_map.Type__vertAlign);
	if (align_itm != null) {
		// frameParams.valign = tmp.val;
	}
	frameParams.caption = caption;
	// XO: test emptiness by length; "caption == Bry_.Empty" only recognizes that one shared instance
	boolean caption_is_empty = caption.length == 0;
	boolean image_is_framed
		=  frameParams.frame != null
		|| frameParams.framed != null
		|| frameParams.thumbnail != null
		|| frameParams.manualthumb != null
		;
	// Will the image be presented in a frame, with the caption below?
	// In the old days, [[Image:Foo|text...]] would set alt text.  Later it
	// came to also set the caption, ordinary text after the image -- which
	// makes no sense, because that just repeats the text multiple times in
	// screen readers.  It *also* came to set the title attribute.
	// Now that we have an alt attribute, we should not set the alt text to
	// equal the caption: that's worse than useless, it just repeats the
	// text.  This is the framed/thumbnail case.  If there's no caption, we
	// use the unnamed parameter for alt text as well, just for the time be-
	// ing, if the unnamed param is set and the alt param is not.
	// For the future, we need to figure out if we want to tweak this more,
	// e.g., introducing a title= parameter for the title; ignoring the un-
	// named parameter entirely for images without a caption; adding an ex-
	// plicit caption= parameter and preserving the old magic unnamed para-
	// meter for BC; ...
	if (image_is_framed) { // Framed image
		if (caption_is_empty && frameParams.alt == null) {
			// No caption or alt text, add the filename as the alt text so
			// that screen readers at least get some description of the image
			frameParams.alt = title.Get_text();
		}
		// Do not set $params['frame']['title'] because tooltips don't make sense
		// for framed images
	}
	else { // Inline image
		if (frameParams.alt == null) {
			// No alt text, use the "caption" for the alt text
			if (!caption_is_empty) {
				// frameParams.alt = $this->stripAltText(caption, $holders);
			}
			else {
				// No caption, fall back to using the filename for the
				// alt text
				frameParams.alt = title.Get_text();
			}
		}
		// Use the "caption" for the tooltip text
		// frameParams.title = $this->stripAltText(caption, $holders);
	}
	// MW.HOOK:ParserMakeImageParams
	// Linker does the rest
	// byte[] time = options.time;
	Object time = null;
	linker.makeImageLink(bfr, pctx, parser, title, file, frameParams, handlerParams, time, desc_query, parser.Options().getThumbSize());
	// Give the handler a chance to modify the parser Object
	// if (handler != null) {
	//   $handler->parserTransformHook($this, $file);
	// }
}
// protected function stripAltText( $caption, $holders ) {
// // Strip bad stuff out of the title (tooltip). We can't just use
// // replaceLinkHoldersText() here, because if this function is called
// // from replaceInternalLinks2(), mLinkHolders won't be up-to-date.
// if ( $holders ) {
// $tooltip = $holders->replaceText( $caption );
// } else {
// $tooltip = $this->replaceLinkHoldersText( $caption );
// }
//
// // make sure there are no placeholders in thumbnail attributes
// // that are later expanded to html- so expand them now and
// // remove the tags
// $tooltip = $this->mStripState->unstripBoth( $tooltip );
// $tooltip = Sanitizer::stripAllTags( $tooltip );
//
// return $tooltip;
// }
private static Xomw_param_list[] internalParamNames;
private static Xomw_param_map internalParamMap;
/**
 * Fills rv with the parameter map and magic-word array for the given media handler.
 * Both are cached per handler key (empty key when handler is null); the list of
 * built-in parameters is static and built on first use.
 */
private void getImageParams(Xomw_image_params rv, Xomw_MediaHandler handler) {
	byte[] handlerClass = handler == null ? Bry_.Empty : handler.Key();

	// cache hit: both objects were stored together on the first call for this key
	Xomw_param_map cachedMap = (Xomw_param_map)mImageParams.Get_by(handlerClass);
	rv.paramMap = cachedMap;
	if (cachedMap != null) {
		rv.mwArray = (Xomw_MagicWordArray)mImageParamsMagicArray.Get_by(handlerClass);
		return;
	}

	// NOTE: lazy-init; code below can be inefficent
	// Initialise static lists
	if (internalParamNames == null) {
		internalParamNames = new Xomw_param_list[]
		{ Xomw_param_list.New(Xomw_param_map.Type__horizAlign, "horizAlign", "left", "right", "center", "none")
		, Xomw_param_list.New(Xomw_param_map.Type__vertAlign , "vertAlign", "baseline", "sub", "super", "top", "text-top", "middle", "bottom", "text-bottom")
		, Xomw_param_list.New(Xomw_param_map.Type__frame     , "frame", "thumbnail", "manual_thumb", "framed", "frameless", "upright", "border", "link", "alt", "class")
		};
		internalParamMap = new Xomw_param_map();
		byte[] magicPrefix = Bry_.new_a7("img_");
		for (Xomw_param_list list : internalParamNames) {
			for (byte[] paramName : list.names) {
				// magic-word key is "img_" + name, with '-' replaced by '_'
				byte[] normalized = Bry_.Replace(paramName, Byte_ascii.Dash, Byte_ascii.Underline);
				internalParamMap.Add(Bry_.Add(magicPrefix, normalized), list.type_uid, paramName);
			}
		}
	}

	// Add handler params
	Xomw_param_map builtMap = internalParamMap.Clone();
	if (handler != null) {
		Xomw_param_map handlerMap = handler.getParamMap();
		for (int idx = 0, len = handlerMap.Len(); idx < len; idx++) {
			Xomw_param_itm handlerItm = (Xomw_param_itm)handlerMap.Get_at(idx);
			builtMap.Add(handlerItm.magic, handlerItm.type_uid, handlerItm.name);
		}
	}
	this.mImageParams.Add(handlerClass, builtMap);
	rv.paramMap = builtMap;

	Xomw_MagicWordArray builtArray = new Xomw_MagicWordArray(env.Magic_word_mgr(), builtMap.Keys());
	this.mImageParamsMagicArray.Add(handlerClass, builtArray);
	rv.mwArray = builtArray;
}
/**
 * Parses the width param of an image link, like "300px" or "200x300px".
 *
 * Results are written to img_size: [0] = width, [1] = height.
 * XO.MW.NOTE: for MW, "" -> null, null while "AxB" -> 0x0
 *  - null or empty src: both slots are Php_utl_.Null_int ("not set")
 *  - otherwise: each slot is the parsed number, or 0 when that part is missing or not numeric
 *
 * NOTE(review): unlike MW's anchored regex, bytes after the parsed numbers are ignored
 * rather than rejected; confirm whether that leniency is intended.
 *
 * @param img_size two-element output array
 * @param src      raw value; a trailing "px" is optional
 */
public void parseWidthParam(int[] img_size, byte[] src) {
	img_size[0] = img_size[1] = Php_utl_.Null_int;
	// XO: test by null / length; "src == Bry_.Empty" only recognizes that one shared instance
	// and lets a null value fall through to src.length
	if (src == null || src.length == 0) {
		return;
	}
	// (T15500) In both cases (width/height and width only),
	// permit trailing "px" for backward compatibility.
	int src_bgn = 0;
	int src_end = src.length;
	// XO: "px" is optional; if exists at end, ignore it
	if (Bry_.Has_at_end(src, Bry__px)) {
		src_end -= 2;
	}
	// XO.MW: if ( preg_match( '/^([0-9]*)x([0-9]*)\s*(?:px)?\s*$/', $value, $m ) ) {
	int w_bgn = 0;
	int w_end = Bry_find_.Find_fwd_while_num(src, src_bgn, src_end);
	int h_bgn = -1;
	int h_end = -1;
	// height is present only when an "x" directly follows the width digits
	if (w_end < src_end && src[w_end] == Byte_ascii.Ltr_x) {
		h_bgn = w_end + 1;
		h_end = Bry_find_.Find_fwd_while_num(src, h_bgn, src_end);
	}
	img_size[0] = Bry_.To_int_or(src, w_bgn, w_end, 0);
	img_size[1] = Bry_.To_int_or(src, h_bgn, h_end, 0);
}
public static final byte[] Bry__px = Bry_.new_a7("px");
/**
 * Fetch a file for the given title.
 * MW also registers a reference to the file and may replace the title; that part is
 * not ported yet (see the commented-out code below), so this currently only delegates
 * to fetchFileNoRegister.
 * If 'broken' is a key in $options then the file will appear as a broken thumbnail.
 * @param Title $title
 * @param array $options Array of options to RepoGroup::findFile
 * @return array ( File or false, Title of file )
 */
public Xomw_File fetchFileAndTitle(Xoa_ttl title, Hash_adp options) {
	//$time = $file ? $file->getTimestamp() : false;
	//$sha1 = $file ? $file->getSha1() : false;
	//# Register the file as a dependency...
	//$this->mOutput->addImage( $title->getDBkey(), $time, $sha1 );
	//if ( $file && !$title->equals( $file->getTitle() ) ) {
	//	# Update fetched file title
	//	$title = $file->getTitle();
	//	$this->mOutput->addImage( $title->getDBkey(), $time, $sha1 );
	//}
	return fetchFileNoRegister(title, options);
}
/**
 * Helper function for fetchFileAndTitle.
 *
 * Also useful if you need to fetch a file but not use it yet,
 * for example to get the file's handler.
 *
 * XO: only the lookup by title is ported; the options argument is not consulted yet.
 *
 * @param Title $title
 * @param array $options Array of options to RepoGroup::findFile
 * @return File|boolean
 */
private Xomw_File fetchFileNoRegister(Xoa_ttl title, Hash_adp options) {
	// if ( isset( $options['broken'] ) ) {
	//   file = false; // broken thumbnail forced by hook
	// } elseif ( isset( $options['sha1'] ) ) { // get by (sha1,timestamp)
	//   file = RepoGroup::singleton()->findFileFromKey( $options['sha1'], $options );
	// } else { // get by (name,timestamp)
	//   file = ...findFile( title, $options );
	// }
	return env.File_finder().Find_file(title); // $options
}
// Normalizes a possible subpage link relative to the current page title;
// the linker writes the resulting link and text into rv.
public void Maybe_do_subpage_link(Xomw_linker__normalize_subpage_link rv, byte[] target, byte[] text) {
linker.normalizeSubpageLink(rv, page_title, target, text);
}
// Second pass: delegates to the link holders to replace the placeholders
// created by Replace_internal_links (EX: "<!--LINK 0-->") in pbfr.
public void Replace_link_holders(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) {
holders.Replace(pctx, pbfr);
}
/**
 * Renders a link to a page that is known to exist directly into bfr (no link holder),
 * armors the rendered link, and appends the remaining trail.
 *
 * @param bfr    output buffer; content already in it is preserved untouched
 * @param nt     target title
 * @param text   link text; when null or empty, the escaped prefixed title is used
 * @param trail  text following the link; the part returned first by splitTrail is moved inside the link
 * @param prefix text preceding the link that belongs inside it
 */
public void Make_known_link_holder(Bry_bfr bfr, Xoa_ttl nt, byte[] text, byte[] trail, byte[] prefix) {
	byte[][] split_trail = linker.splitTrail(trail);
	byte[] inside = split_trail[0];
	trail = split_trail[1];
	// XO: test by null / length; "text == Bry_.Empty" only recognizes that one shared instance
	if (text == null || text.length == 0) {
		// NOTE(review): this value is escaped again by Add_bry_escape_html below; MW wraps it in HtmlArmor instead -- confirm
		text = Bry_.Escape_html(nt.Get_prefixed_text());
	}
	// PORTED:new HtmlArmor( "$prefix$text$inside" )
	tmp.Add_bry_escape_html(prefix);
	tmp.Add_bry_escape_html(text);
	tmp.Add_bry_escape_html(inside);
	text = tmp.To_bry_and_clear();
	// XO: set aside what is already in bfr so that only the rendered link is armored;
	// previously the whole buffer was drained and passed through Armor_links
	byte[] before = bfr.To_bry_and_clear();
	link_renderer.Make_known_link(bfr, nt, text, extra_atrs, query);
	byte[] link = bfr.To_bry_and_clear();
	bfr.Add(before);
	parser.Armor_links(bfr, link, 0, link.length);
	bfr.Add(trail);
}
private static boolean[] title_chars_for_lnki;
private static final byte[] Bry__wtxt__lnki__bgn = Bry_.new_a7("[["), Bry__wtxt__lnki__end = Bry_.new_a7("]]");
// $e1 = "/^([{$tc}]+)(?:\\|(.+?))?]](.*)\$/sD";
//
// REGEX: "title-char"(1+) + "pipe"(0-1) + "]]"(0-1) + "other chars up to next [["
// title-char -> ([{$tc}]+)
// pipe -> (?:\\|(.+?))?
// ]] -> ?]]
// other chars... -> (.*)
// $e1_img = "/^([{$tc}]+)\\|(.*)\$/sD";
//
// REGEX: "title-char"(1+) + "pipe"(0-1) + "other chars up to next [["
// title-char -> ([{$tc}]+)
// pipe -> \\|
// other chars... -> (.*)
}

View File

@@ -0,0 +1,122 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.lnkis; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
import org.junit.*; import gplx.core.tests.*;
import gplx.xowa.mediawiki.includes.filerepo.*; import gplx.xowa.mediawiki.includes.filerepo.file.*;
import gplx.xowa.mediawiki.includes.media.*;
// Tests for [[File:...]] links: HTML output of Xomw_lnki_wkr and direct parsing of the width param.
public class Xomw_lnki_wkr__file__tst {
private final Xomw_lnki_wkr__fxt fxt = new Xomw_lnki_wkr__fxt();
// every test starts with a mock file "A.png" of 300x200
@Before public void init() {
fxt.Clear();
fxt.Init__file("A.png", 300, 200);
}
// no options: links to the original file
@Test public void Plain() {
fxt.Test__to_html("[[File:A.png]]", "<a href='A.png' class='image'><img alt='A.png' src='/orig/7/70/A.png' /></a>");
}
// "thumb": framed thumbnail markup with a 220px image
@Test public void Thumb() {
fxt.Test__to_html("[[File:A.png|thumb]]", "<div class='thumb tright'><div class='thumbinner' style='width:222px;'><a href='A.png' class='image'><img alt='A.png' src='/thumb/7/70/A.png/220px-A.png' class='thumbimage' /></a>  <div class='thumbcaption'><div class='magnify'><a href='' class='internal'></a></div></div></div></div>");
}
// "WxHpx": the expected thumb url carries the requested width (123px)
@Test public void Size() {
fxt.Test__to_html("[[File:A.png|123x456px]]", "<a href='A.png' class='image'><img alt='A.png' src='/thumb/7/70/A.png/123px-A.png' /></a>");
}
@Test public void fitBoxWidth() {
// COMMENT:"Height is the relative smaller dimension, so scale width accordingly"
// consider file of 200,100 (2:1)
// EX_1: view is 120,40 (3:1)
// - dimensions are either (a) 120,80 or (b) 80,40
// - use (b) 80,40
// EX_2: view is 120,80 (1.5:1)
// - dimensions are either (a) 120,60 or (b) 160,80
// - use (a) 120,60
fxt.Init__file("A.png", 200, 100);
fxt.Test__to_html__has("[[File:A.png|120x40px]]", "/80px-A.png");
fxt.Test__to_html__has("[[File:A.png|120x80px]]", "/120px-A.png");
}
// a missing width or height is expected as 0
@Test public void Test__parseWidthParam() {
int[] img_size = new int[2];
// WxHpx
fxt.Test__parseWidthParam(img_size, "12x34px" , 12, 34);
// WxH
fxt.Test__parseWidthParam(img_size, "12x34" , 12, 34);
// Wpx
fxt.Test__parseWidthParam(img_size, "12px" , 12, 0);
// W
fxt.Test__parseWidthParam(img_size, "12" , 12, 0);
// 12x
fxt.Test__parseWidthParam(img_size, "12x" , 12, 0);
// x34
fxt.Test__parseWidthParam(img_size, "x34" , 0, 34);
}
}
// Test fixture for Xomw_lnki_wkr: builds a parser with a mock file finder and
// the two image magic words used by the tests, then exposes parse / to-html helpers.
class Xomw_lnki_wkr__fxt {
private final Xomw_lnki_wkr wkr;
private final Xomw_parser_ctx pctx;
private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr();
private final Xomw_file_finder__mock file_finder;
private final Xomw_FileRepo repo = new Xomw_FileRepo(Bry_.new_a7("/orig"), Bry_.new_a7("/thumb"));
// when true, expected strings are passed through Gfh_utl.Replace_apos before comparison
private boolean apos = true;
public Xomw_lnki_wkr__fxt() {
Xoae_app app = Xoa_app_fxt.Make__app__edit();
Xowe_wiki wiki = Xoa_app_fxt.Make__wiki__edit(app);
Xomw_parser parser = new Xomw_parser();
wkr = parser.Lnki_wkr();
// env
file_finder = new Xomw_file_finder__mock(parser.Env());
parser.Env().File_finder_(file_finder);
// register the magic words used by the tests: "thumb" and "$1px"
parser.Env().Magic_word_mgr().Add(Bry_.new_u8("img_thumbnail"), Bool_.Y, Bry_.Ary("thumb"));
parser.Env().Magic_word_mgr().Add(Bry_.new_u8("img_width"), Bool_.Y, Bry_.Ary("$1px"));
parser.Init_by_wiki(wiki);
// ctx
pctx = new Xomw_parser_ctx();
// current page is "Page_1"; the self-link test relies on this
pctx.Init_by_page(wiki.Ttl_parse(Bry_.new_a7("Page_1")));
}
public void Clear() {
wkr.Clear_state();
}
// registers a mock png file of the given size with the file finder
public void Init__file(String title, int w, int h) {
file_finder.Add(title, repo, w, h, Xomw_MediaHandlerFactory.Mime__image__png);
}
// runs only Replace_internal_links, so link holders stay as placeholders in the result
public void Test__parse(String src_str, String expd) {
byte[] src_bry = Bry_.new_u8(src_str);
wkr.Replace_internal_links(pctx, pbfr.Init(src_bry));
if (apos) expd = gplx.langs.htmls.Gfh_utl.Replace_apos(expd);
Gftest.Eq__ary__lines(expd, pbfr.Rslt().To_str_and_clear(), src_str);
}
// runs both passes and compares the full output
public void Test__to_html(String src_str, String expd) {
if (apos) expd = gplx.langs.htmls.Gfh_utl.Replace_apos(expd);
Gftest.Eq__ary__lines(expd, Exec__to_html(src_str), src_str);
}
// runs both passes and checks only that the output contains expd
public void Test__to_html__has(String src_str, String expd) {
if (apos) expd = gplx.langs.htmls.Gfh_utl.Replace_apos(expd);
Gftest.Eq__bool_y(String_.Has(Exec__to_html(src_str), expd));
}
// Replace_internal_links followed by Replace_link_holders; returns the result and clears the buffer
private String Exec__to_html(String src_str) {
byte[] src_bry = Bry_.new_u8(src_str);
wkr.Replace_internal_links(pctx, pbfr.Init(src_bry));
wkr.Replace_link_holders(pctx, pbfr);
return pbfr.Rslt().To_str_and_clear();
}
// img_size is caller-supplied scratch; [0] is checked as width and [1] as height
public void Test__parseWidthParam(int[] img_size, String src_str, int expd_w, int expd_h) {
wkr.parseWidthParam(img_size, Bry_.new_u8(src_str));
Gftest.Eq__int(expd_w, img_size[0], "w");
Gftest.Eq__int(expd_h, img_size[1], "h");
}
}

View File

@@ -0,0 +1,29 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.lnkis; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
import org.junit.*; import gplx.xowa.mediawiki.includes.filerepo.*; import gplx.xowa.mediawiki.includes.filerepo.file.*;
// Tests for text (non-file) internal links: parse-only output ("<!--LINK n-->" markers) and full html output.
public class Xomw_lnki_wkr__text__tst {
	private final Xomw_lnki_wkr__fxt fxt = new Xomw_lnki_wkr__fxt();

	// reset fixture before every test
	@Before public void init() {
		fxt.Clear();
	}

	// parse only: link without caption
	@Test public void Text() {
		fxt.Test__parse("a [[A]] z", "a <!--LINK 0--> z");
	}
	// parse only: link with caption
	@Test public void Capt() {
		fxt.Test__parse("a [[A|a]] z", "a <!--LINK 0--> z");
	}
	// parse only: source is expected back unchanged
	@Test public void Invalid__char() {
		fxt.Test__parse("a [[<A>]] z", "a [[<A>]] z");
	}

	// html: expected output is a "selflink" strong tag
	@Test public void Html__self() {
		fxt.Test__to_html("[[Page_1]]", "<strong class='selflink'>Page_1</strong>");
	}
	// html: link without caption
	@Test public void Html__text() {
		fxt.Test__to_html("[[A]]", "<a href='/wiki/A' title='A'>A</a>");
	}
	// html: link with caption
	@Test public void Html__capt() {
		fxt.Test__to_html("[[A|a]]", "<a href='/wiki/A' title='A'>a</a>");
	}
}

View File

@@ -0,0 +1,57 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.lnkis; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
// Immutable entry of a Xomw_param_map: a magic-word key, the uid of the param type it belongs to,
// and the param name, together with the name's int uid (one of the Name__* constants, or -1 if the name is unknown).
public class Xomw_param_itm {
	public final byte[] magic;
	public final int type_uid;
	public final byte[] name;
	public final int name_uid;
	public Xomw_param_itm(byte[] magic, int type_uid, byte[] name) {
		this.magic = magic;
		this.type_uid = type_uid;
		this.name = name;
		// resolve name to its Name__* uid; -1 when the name is not registered in name_uids
		this.name_uid = name_uids.Get_as_int_or(name, -1);
	}
	public static final int
	  Name__width = 0
	, Name__height = 1
	, Name__manual_thumb = 2
	, Name__alt = 3
	, Name__class = 4
	, Name__link = 5
	, Name__frameless = 6
	, Name__framed = 7
	, Name__thumbnail = 8
	;
	// name -> Name__* uid; every Name__* constant above has an entry
	// FIX: "height" was previously missing, so an item named "height" resolved to -1
	// even though Xomw_params_handler.Set handles Name__height
	private static final Hash_adp_bry name_uids = Hash_adp_bry.cs()
	.Add_str_int("width"        , Name__width)
	.Add_str_int("height"       , Name__height)
	.Add_str_int("manual_thumb" , Name__manual_thumb)
	.Add_str_int("alt"          , Name__alt)
	.Add_str_int("class"        , Name__class)
	.Add_str_int("link"         , Name__link)
	.Add_str_int("frameless"    , Name__frameless)
	.Add_str_int("framed"       , Name__framed)
	.Add_str_int("thumbnail"    , Name__thumbnail)
	;
	// magic-word key for the width param
	public static final byte[]
	  Mw__img_width = Bry_.new_a7("img_width")
	;
	// param name for the width param
	public static final byte[]
	  Name_bry__width = Bry_.new_a7("width")
	;
}

View File

@@ -0,0 +1,77 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.lnkis; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
// Ordered collection of Xomw_param_itm entries keyed by their magic bytes,
// plus one value holder per param type (Frame, Handler, HorizAlign, VertAlign).
public class Xomw_param_map {
	private final Ordered_hash hash = Ordered_hash_.New_bry();

	public final Xomw_params_frame Frame = new Xomw_params_frame();
	public final Xomw_params_handler Handler = new Xomw_params_handler();
	public final Xomw_params_horizAlign HorizAlign = new Xomw_params_horizAlign();
	public final Xomw_params_vertAlign VertAlign = new Xomw_params_vertAlign();

	// number of registered items
	public int Len() {
		return hash.Len();
	}
	// item at ordinal position i (insertion order of Add)
	public Xomw_param_itm Get_at(int i) {
		Object itm = hash.Get_at(i);
		return (Xomw_param_itm)itm;
	}
	// item registered under the given key bytes (items are keyed by "magic" in Add)
	public Xomw_param_itm Get_by(byte[] name) {
		Object itm = hash.Get_by(name);
		return (Xomw_param_itm)itm;
	}
	// NOTE: not implemented; always returns null
	public Xomw_param_itm Get_by(int name_type) {
		return null;
	}
	// Routes a parsed value to the holder for "type"; only Type__frame and Type__handler are routed; other types are ignored.
	public void Set(int type, int paramNameUid, byte[] paramBry, int paramInt) {
		if (type == Type__frame) {
			Frame.Set(paramNameUid, paramBry, paramInt);
		}
		else if (type == Type__handler) {
			Handler.Set(paramNameUid, paramBry, paramInt);
		}
	}
	// magic bytes of every item, in insertion order
	public byte[][] Keys() {
		byte[][] keys = new byte[hash.Len()][];
		for (int idx = 0; idx < keys.length; idx++) {
			Xomw_param_itm itm = (Xomw_param_itm)hash.Get_at(idx);
			keys[idx] = itm.magic;
		}
		return keys;
	}
	// registers a new item under its magic bytes
	public void Add(byte[] magic, int type_uid, byte[] name) {
		hash.Add(magic, new Xomw_param_itm(magic, type_uid, name));
	}
	// Returns a new map with the same items and with Frame / Handler values copied from this map.
	// HorizAlign / VertAlign are not copied.
	public Xomw_param_map Clone() {
		Xomw_param_map copy = new Xomw_param_map();
		int total = hash.Len();
		int idx = 0;
		while (idx < total) {
			Xomw_param_itm itm = (Xomw_param_itm)hash.Get_at(idx);
			copy.Add(itm.magic, itm.type_uid, itm.name);
			idx++;
		}
		// NOTE: despite the name, Copy_to copies FROM its argument INTO the receiver
		copy.Frame.Copy_to(this.Frame);
		copy.Handler.Copy_to(this.Handler);
		return copy;
	}

	public static final int Type__horizAlign = 0, Type__vertAlign = 1, Type__frame = 2, Type__handler = 3;
}
// Simple holder for a param type (uid + name bytes) and the names that belong to it.
class Xomw_param_list {
	public int type_uid;
	public byte[] type;
	public byte[][] names;

	// Factory: converts the String arguments to bytes (Bry_.new_u8 / Bry_.Ary) and fills a new instance.
	public static Xomw_param_list New(int type_uid, String type, String... names) {
		Xomw_param_list list = new Xomw_param_list();
		list.type_uid = type_uid;

		byte[] type_bry = Bry_.new_u8(type);
		list.type = type_bry;

		byte[][] names_ary = Bry_.Ary(names);
		list.names = names_ary;
		return list;
	}
}

View File

@@ -0,0 +1,85 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.lnkis; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
import gplx.xowa.mediawiki.includes.utls.*;
// Mutable value holder for params of type Xomw_param_map.Type__frame.
// Instances are reset with Clear() and duplicated with Copy_to().
public class Xomw_params_frame {
	public byte[] align = null;
	public byte[] valign = null;
	public byte[] caption = null;
	public byte[] frame = null;
	public byte[] framed = null;
	public byte[] frameless = null;
	public byte[] thumbnail = null;
	public byte[] manualthumb = null;
	public byte[] alt = null;
	public byte[] title = null;
	public byte[] cls = null;
	public byte[] img_cls = null;
	public byte[] link_title = null;
	public byte[] link_url = null;
	public byte[] link_target = null;
	public byte[] no_link = null;
	public byte[] border = null;
	public byte[] custom_url_link = null;
	public byte[] custom_target_link = null;
	public boolean desc_link = false;
	public byte[] desc_query = null;
	public double upright;

	// Stores a value by param-name uid (Xomw_param_itm.Name__*).
	// Only Name__thumbnail is handled; every other uid is silently ignored.
	public void Set(int uid, byte[] val_bry, int val_int) {
		switch (uid) {
			case Xomw_param_itm.Name__thumbnail:  thumbnail = val_bry; break;
		}
	}

	// Resets every field to its "null" value and returns this instance for chaining.
	public Xomw_params_frame Clear() {
		desc_link = false;
		upright = Php_utl_.Null_double;
		// FIX: "border" was previously omitted here, so a value from an earlier use survived Clear()
		// even though Copy_to treats it like every other field
		align = valign = caption = frame = framed = frameless
		= thumbnail = manualthumb = alt = title = cls = img_cls
		= link_title = link_url = link_target = no_link = border
		= custom_url_link = custom_target_link = desc_query
		= Php_utl_.Null_bry;
		return this;
	}

	// NOTE: despite the name, this copies every field FROM src INTO this instance.
	// Arrays are copied by reference, not cloned.
	public void Copy_to(Xomw_params_frame src) {
		this.desc_link = src.desc_link;
		this.upright = src.upright;
		this.align = src.align;
		this.valign = src.valign;
		this.caption = src.caption;
		this.frame = src.frame;
		this.framed = src.framed;
		this.frameless = src.frameless;
		this.thumbnail = src.thumbnail;
		this.manualthumb = src.manualthumb;
		this.alt = src.alt;
		this.title = src.title;
		this.cls = src.cls;
		this.img_cls = src.img_cls;
		this.link_title = src.link_title;
		this.link_url = src.link_url;
		this.link_target = src.link_target;
		this.no_link = src.no_link;
		this.border = src.border;
		this.custom_url_link = src.custom_url_link;
		this.custom_target_link = src.custom_target_link;
		this.desc_query = src.desc_query;
	}

	// Joins two class values with a single space; returns rhs alone when lhs is empty (per Bry_.Len_eq_0).
	public static byte[] Cls_add(byte[] lhs, byte[] rhs) {
		return Bry_.Len_eq_0(lhs) ? rhs : Bry_.Add(lhs, Byte_ascii.Space_bry, rhs);
	}
}

View File

@@ -0,0 +1,45 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.lnkis; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
import gplx.xowa.mediawiki.includes.utls.*;
// Mutable value holder for params of type Xomw_param_map.Type__handler (int dimensions and page).
public class Xomw_params_handler {
	public int width;
	public int height;
	public int page;
	public int physicalWidth;
	public int physicalHeight;

	// Resets every field to Php_utl_.Null_int and returns this instance for chaining.
	public Xomw_params_handler Clear() {
		int nil = Php_utl_.Null_int;
		physicalHeight = nil;
		physicalWidth = nil;
		page = nil;
		height = nil;
		width = nil;
		return this;
	}

	// NOTE: despite the name, this copies every field FROM src INTO this instance.
	public void Copy_to(Xomw_params_handler src) {
		width = src.width;
		height = src.height;
		page = src.page;
		physicalWidth = src.physicalWidth;
		physicalHeight = src.physicalHeight;
	}

	// Stores val_int by param-name uid (Xomw_param_itm.Name__*).
	// Only width and height are supported; any other uid throws Err_.new_unhandled_default.
	public void Set(int uid, byte[] val_bry, int val_int) {
		if (uid == Xomw_param_itm.Name__width) {
			width = val_int;
		}
		else if (uid == Xomw_param_itm.Name__height) {
			height = val_int;
		}
		else {
			throw Err_.new_unhandled_default(uid);
		}
	}
}

View File

@@ -0,0 +1,23 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.lnkis; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
// Holder for params of type Xomw_param_map.Type__horizAlign; it currently declares no fields.
public class Xomw_params_horizAlign {
// Nothing to reset yet; returns this instance so the call can be chained like the other Xomw_params_* classes.
public Xomw_params_horizAlign Clear() {
return this;
}
}

View File

@@ -0,0 +1,44 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.lnkis; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
// Mutable, reusable bag of optional values; call Clear() before each reuse.
// NOTE(review): "mto" presumably stands for MediaTransformOutput (params for its html output) -- confirm against callers
public class Xomw_params_mto {
	public boolean desc_link;
	public byte[] alt = null;
	public byte[] title = null;
	public byte[] img_cls = null;
	public byte[] file_link = null;
	public byte[] valign = null;
	public byte[] desc_query = null;
	public byte[] override_width = null;
	public byte[] override_height = null;
	public byte[] no_dimensions = null;
	public byte[] custom_url_link = null;
	public byte[] custom_title_link = null;
	public byte[] custom_target_link = null;
	public byte[] parser_extlink_rel = null;
	public byte[] parser_extlink_target = null;

	// Resets every field (desc_link to false; all byte[] fields to null) and returns this instance for chaining.
	public Xomw_params_mto Clear() {
		desc_link = false;
		// FIX: img_cls and custom_target_link were previously omitted,
		// so values from an earlier use leaked into the next one
		alt = title = img_cls = file_link = valign
		= desc_query = override_width = override_height = no_dimensions
		= custom_url_link = custom_title_link = custom_target_link
		= parser_extlink_rel = parser_extlink_target
		= null;
		return this;
	}
}

View File

@@ -0,0 +1,36 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.lnkis; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
import gplx.xowa.mediawiki.includes.utls.*;
// Plain value holder: six int dimensions (physical / client / src) plus several byte[] values.
// The int fields start at Php_utl_.Null_int; the byte[] fields start at null.
public class Xomw_params_scalar {
	public int physicalWidth;
	public int physicalHeight;
	public byte[] physicalDimensions;
	public int clientWidth;
	public int clientHeight;
	public byte[] comment;
	public int srcWidth;
	public int srcHeight;
	public byte[] mimeType;
	public byte[] dstPath;
	public byte[] dstUrl;
	public byte[] interlace;

	public Xomw_params_scalar() {
		// mark every dimension as "not set"
		int nil = Php_utl_.Null_int;
		srcHeight = nil;
		srcWidth = nil;
		clientHeight = nil;
		clientWidth = nil;
		physicalHeight = nil;
		physicalWidth = nil;
	}
}

View File

@@ -0,0 +1,23 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.lnkis; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
// Holder for params of type Xomw_param_map.Type__vertAlign; it currently declares no fields.
public class Xomw_params_vertAlign {
// Nothing to reset yet; returns this instance so the call can be chained like the other Xomw_params_* classes.
public Xomw_params_vertAlign Clear() {
return this;
}
}

View File

@@ -0,0 +1,395 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.magiclinks; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
import gplx.core.primitives.*; import gplx.core.btries.*; import gplx.core.net.*;
import gplx.xowa.mediawiki.includes.utls.*; import gplx.xowa.mediawiki.includes.htmls.*;
import gplx.langs.regxs.*;
// TODO.XO: this->getConverterLanguage()->markNoConversion($url, true),
// Converts free external urls in the source (EX: "https://a.org") into anchor tags,
// while skipping text inside "<a ...>...</a>" and inside any other "<...>" element.
// Init_by_wiki must be called before Do_magic_links so that the hook trie is populated.
public class Xomw_magiclinks_wkr {
	private final Btrie_slim_mgr regex_trie = Btrie_slim_mgr.ci_a7(); // NOTE: must be ci to handle protocols; EX: "https:" and "HTTPS:"
	private final Btrie_rv trv = new Btrie_rv();
	// FIX: these two were lazily assigned from the constructor inside a synchronized block, but the null-check
	// happened outside the lock and Tag__anch__rhs was published first, so another thread could skip the block
	// while regex_link_interrupt was still null. Class initialization makes both visible to every thread.
	private static final byte[] Tag__anch__rhs = Bry_.new_a7("</a>");
	private static final Xomw_regex_link_interrupt regex_link_interrupt = new Xomw_regex_link_interrupt();
	private boolean[] url_separators;

	private final Xomw_parser parser;
	private final Xomw_regex_boundary regex_boundary;
	private final Xomw_regex_url regex_url;
	private final Xomw_sanitizer sanitizer;
	private final Xomw_linker linker;
	private final Xomw_atr_mgr atrs = new Xomw_atr_mgr();

	private byte[] page_title;

	private static final byte Regex__anch = 1, Regex__elem = 2, Regex__free = 3;
	public Xomw_magiclinks_wkr(Xomw_parser parser, Xomw_sanitizer sanitizer, Xomw_linker linker, Xomw_regex_boundary regex_boundary, Xomw_regex_url regex_url) {
		this.parser = parser;
		this.sanitizer = sanitizer;
		this.linker = linker;
		this.regex_boundary = regex_boundary;
		this.regex_url = regex_url;

		// ',;\.:!?'
		url_separators = Bool_ary_bldr.New_u8()
			.Set_many(Byte_ascii.Comma,Byte_ascii.Semic, Byte_ascii.Dot, Byte_ascii.Colon, Byte_ascii.Bang, Byte_ascii.Question)
			.To_ary();
	}
	// Registers the hooks that start a candidate match: "<a", "<" and every protocol in Gfo_protocol_itm.Ary()
	public void Init_by_wiki() {
		regex_trie.Add_str_byte("<a", Regex__anch);
		regex_trie.Add_str_byte("<" , Regex__elem);

		Gfo_protocol_itm[] protocol_ary = Gfo_protocol_itm.Ary();
		int protocol_len = protocol_ary.length;
		for (int i = 0; i < protocol_len; i++) {
			Gfo_protocol_itm itm = protocol_ary[i];
			regex_trie.Add_bry_byte(itm.Text_bry(), Regex__free);
		}
	}

	// Replace special strings like "ISBN xxx" and "RFC xxx" with
	// magic external links.
	public void Do_magic_links(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) {
		// XO.PBFR
		Bry_bfr src_bfr = pbfr.Src();
		byte[] src = src_bfr.Bfr();
		int src_bgn = 0;
		int src_end = src_bfr.Len();
		Bry_bfr bfr = pbfr.Trg();

		int cur = src_bgn;
		int prv = cur;
		// NOTE: starts as true, so the unprocessed remainder is always copied to bfr and pbfr is always switched
		boolean dirty = true;

		// PORTED.REGEX: handle below
		// XO.MW.UNSUPPORTED.OBSOLETE: not handling RFC|PMID|ISBN b/c of upcoming obsolescence: https://www.mediawiki.org/wiki/Requests_for_comment/Future_of_magic_links
		//'!(?:                                     // Start cases
		//	(<a[ \t\r\n>].*?</a>) |                 // m[1]: Skip link text
		//	(<.*?>) |                               // m[2]: Skip stuff inside
		//	                                        //       HTML elements' . "
		//	(\b(?i:$prots)($addr$urlChar*)) |       // m[3]: Free external links
		//	                                        // m[4]: Post-protocol path
		//	\b(?:RFC|PMID) $spaces                  // m[5]: RFC or PMID, capture number
		//		([0-9]+)\b |
		//	\bISBN $spaces (                        // m[6]: ISBN, capture number
		//		(?: 97[89] $spdash?)?               //       optional 13-digit ISBN prefix
		//		(?: [0-9] $spdash?){9}              //       9 digits with opt. delimiters
		//		[0-9Xx]                             //       check digit
		//	)\b
		while (true) {
			// end of src; copy remainder and stop
			if (cur == src_end) {
				if (dirty)
					bfr.Add_mid(src, prv, src_end);
				break;
			}

			byte b = src[cur];
			Object o = regex_trie.Match_at_w_b0(trv, b, src, cur, src_end);
			// current byte doesn't look like magiclink; continue;
			if (o == null) {
				cur++;
				continue;
			}

			// looks like magiclink; do additional processing
			byte regex_tid = ((Byte_obj_val)o).Val();
			int hook_bgn = cur;
			int hook_end = trv.Pos();
			int match_end = Bry_find_.Not_found; // position after the full match, or Not_found if the hook does not pan out
			switch (regex_tid) {
				case Regex__anch: // (<a[ \t\r\n>].*?</a>) |                 // m[1]: Skip link text
					match_end = Find_anch_end(src, hook_end, src_end);
					// FIX: regex alternation; if m[1] does not match at "<a", m[2] is tried at the same position
					// EX: "<abbr title='https://a.org'>" and "<a href='https://a.org'>" without "</a>" must still be skipped as elements
					if (match_end == Bry_find_.Not_found)
						match_end = Find_elem_end(src, hook_end, src_end);
					break;
				case Regex__elem: // (<.*?>) |                               // m[2]: Skip stuff inside
					match_end = Find_elem_end(src, hook_end, src_end);
					break;
				case Regex__free:
					match_end = Find_free_end(src, hook_bgn, hook_end, src_end);
					break;
			}

			// regex is invalid; advance by 1 and continue;
			if (match_end == Bry_find_.Not_found) {
				cur++;
				continue;
			}

			// regex is valid; move to after match
			cur = match_end;

			// handle free; note that "<a " and "<" just need to be skipped, which moving cur already did
			if (regex_tid == Regex__free) {
				this.page_title = pctx.Page_title().Full_db();
				dirty = true;
				bfr.Add_mid(src, prv, hook_bgn);
				byte[] url = Bry_.Mid(src, hook_bgn, cur);
				int num_post_proto = cur - hook_end; // get length of url without proto; EX: "http://a.org" should be 5 ("a.org")
				this.Make_free_external_link(bfr, url, num_post_proto);
				prv = cur;
			}
		}
		if (dirty) {
			pbfr.Switch();
		}
	}
	// m[1]: given pos directly after "<a", returns the position after the next "</a>", or Not_found.
	private static int Find_anch_end(byte[] src, int pos, int src_end) {
		if (pos >= src_end) return Bry_find_.Not_found;

		// find "[ \t\r\n>]" after "<a"; i.e.: don't match "<ab" or "<ac", etc..
		switch (src[pos]) {
			case Byte_ascii.Space:
			case Byte_ascii.Tab:
			case Byte_ascii.Cr:
			case Byte_ascii.Nl:
			case Byte_ascii.Angle_end: // FIX: ">" is part of the character class, but was previously rejected; EX: "<a>b</a>"
				break;
			default:
				return Bry_find_.Not_found;
		}

		// find </a>
		int anch_end = Bry_find_.Find_fwd(src, Tag__anch__rhs, pos + 1, src_end);
		return anch_end == Bry_find_.Not_found
			? Bry_find_.Not_found
			: anch_end + Tag__anch__rhs.length;
	}
	// m[2]: given pos somewhere after "<", returns the position after the next ">", or Not_found.
	private static int Find_elem_end(byte[] src, int pos, int src_end) {
		int gt_pos = Bry_find_.Find_fwd(src, Byte_ascii.Angle_end, pos, src_end);
		return gt_pos == Bry_find_.Not_found
			? Bry_find_.Not_found
			: gt_pos + 1;
	}
	// m[3]: given a protocol at hook_bgn..hook_end, returns the position of the 1st invalid url-char, or Not_found.
	private int Find_free_end(byte[] src, int hook_bgn, int hook_end, int src_end) {
		// make sure that protocol starts at word bound; EX: "ahttp://a.org" should be invalid
		if (!regex_boundary.Is_boundary_prv(src, hook_bgn)) return Bry_find_.Not_found;

		// skip forward until invalid url char
		int url_end = regex_url.Find_fwd_while(trv, src, hook_end, src_end);

		// no url chars found -> invalid
		return url_end == hook_end ? Bry_find_.Not_found : url_end;
	}

	// Make a free external link, given a user-supplied URL
	// NOTE: uses page_title, which is only assigned by Do_magic_links
	public void Make_free_external_link(Bry_bfr bfr, byte[] url, int num_post_proto) {
		byte[] trail = Bry_.Empty;

		// The characters '<' and '>' (which were escaped by
		// removeHTMLtags()) should not be included in
		// URLs, per RFC 2396.
		// Make &nbsp; terminate a URL as well (bug T84937)
		int separator_bgn = regex_link_interrupt.Find(trv, url, 0, url.length);
		if (separator_bgn != Bry_find_.Not_found) {
			trail = Bry_.Mid(url, separator_bgn);
			url = Bry_.Mid(url, 0, separator_bgn);
		}

		// Move trailing punctuation to $trail
		int url_len = url.length;
		// If there is no left bracket, then consider right brackets fair game too
		// XO.MW: if (strpos($url, '(') === false) {$sep .= ')';}
		url_separators[Byte_ascii.Paren_end] = Bry_find_.Find_fwd(url, Byte_ascii.Paren_bgn, 0, url_len) == Bry_find_.Not_found;

		int num_sep_chars = Php_str_.Strspn_bwd__ary(url, url_separators, url_len, -1);

		// Don't break a trailing HTML entity by moving the ; into $trail
		// This is in hot code, so use substr_compare to avoid having to
		// create a new String Object for the comparison
		// XO.MW.NOTE: ignore semic if part of entity; EX: "http://a.org&apos;!."
		if (num_sep_chars > 0 && Php_str_.Substr_byte(url, -num_sep_chars) == Byte_ascii.Semic) {
			// more optimization: instead of running preg_match with a $
			// anchor, which can be slow, do the match on the reversed
			// String starting at the desired offset.
			// un-reversed regexp is: /&([a-z]+|#x[\da-f]+|#\d+)$/i
			// if (preg_match('/\G([a-z]+|[\da-f]+x#|\d+#)&/i', $urlRev, $m2, 0, num_sep_chars)) {
			if (Xomw_regex_html_entity.Match_bwd(url, url_len - num_sep_chars, 0)) {
				num_sep_chars--;
			}
		}

		if (num_sep_chars > 0) {
			trail = Bry_.Add(Php_str_.Substr(url, -num_sep_chars), trail);
			url = Php_str_.Substr(url, 0, -num_sep_chars);
		}

		// Verify that we still have a real URL after trail removal, and
		// not just lone protocol
		if (trail.length >= num_post_proto) {
			bfr.Add_bry_many(url, trail);
			return;
		}

		url = sanitizer.Clean_url(url);

		// XO.MW.UNSUPPORTED.NON-WMF: not supporting images from freefrom url; (EX: "http://a.org/image.png" -> "<img>"); haven't seen this used on WMF wikis
		// Is this an external image?
		byte[] text = null; // $this->maybeMakeExternalImage($url);
		if (text == null) {
			// Not an image, make a link
			linker.makeExternalLink(bfr, url
				, url  // $this->getConverterLanguage()->markNoConversion($url, true),
				, true, Bry_.new_a7("free")
				, parser.Get_external_link_attribs(atrs)
				, page_title);

			// XO.MW.UNSUPPORTED.HOOK: registers link for processing by other extensions?
			// Register it in the output Object...
			// Replace unnecessary URL escape codes with their equivalent characters
			// $pasteurized = self::normalizeLinkUrl($url);
			// $this->mOutput->addExternalLink($pasteurized);
		}
		bfr.Add(trail);
	}
}
class Xomw_regex_html_entity {
// if (preg_match('/\G([a-z]+|[\da-f]+x#|\d+#)&/i', $urlRev, $m2, 0, num_sep_chars)) {
// REGEX: (letters | hex + "#" | dec + "x#") + "&"
// \G means "stop if matching breaks"; so, using a reversed example, "http://&#amp;&#!lt;" will not match "&#amp;" b/c "&#!lt;" breaks match
// http://www.php.net/manual/en/regexp.reference.escape.php
// http://stackoverflow.com/questions/14897949/what-is-the-use-of-g-anchor-in-regex
public static boolean Match_bwd(byte[] src, int src_bgn, int src_end) {
int cur = src_bgn - 1;
int numbers = 0;
int letters = 0;
while (cur >= src_end) {
int b_bgn = gplx.core.intls.Utf8_.Get_pos0_of_char_bwd(src, cur);
switch (src[b_bgn]) {
case Byte_ascii.Ltr_A: case Byte_ascii.Ltr_B: case Byte_ascii.Ltr_C: case Byte_ascii.Ltr_D: case Byte_ascii.Ltr_E:
case Byte_ascii.Ltr_F: case Byte_ascii.Ltr_G: case Byte_ascii.Ltr_H: case Byte_ascii.Ltr_I: case Byte_ascii.Ltr_J:
case Byte_ascii.Ltr_K: case Byte_ascii.Ltr_L: case Byte_ascii.Ltr_M: case Byte_ascii.Ltr_N: case Byte_ascii.Ltr_O:
case Byte_ascii.Ltr_P: case Byte_ascii.Ltr_Q: case Byte_ascii.Ltr_R: case Byte_ascii.Ltr_S: case Byte_ascii.Ltr_T:
case Byte_ascii.Ltr_U: case Byte_ascii.Ltr_V: case Byte_ascii.Ltr_W: case Byte_ascii.Ltr_X: case Byte_ascii.Ltr_Y: case Byte_ascii.Ltr_Z:
case Byte_ascii.Ltr_a: case Byte_ascii.Ltr_b: case Byte_ascii.Ltr_c: case Byte_ascii.Ltr_d: case Byte_ascii.Ltr_e:
case Byte_ascii.Ltr_f: case Byte_ascii.Ltr_g: case Byte_ascii.Ltr_h: case Byte_ascii.Ltr_i: case Byte_ascii.Ltr_j:
case Byte_ascii.Ltr_k: case Byte_ascii.Ltr_l: case Byte_ascii.Ltr_m: case Byte_ascii.Ltr_n: case Byte_ascii.Ltr_o:
case Byte_ascii.Ltr_p: case Byte_ascii.Ltr_q: case Byte_ascii.Ltr_r: case Byte_ascii.Ltr_s: case Byte_ascii.Ltr_t:
case Byte_ascii.Ltr_u: case Byte_ascii.Ltr_v: case Byte_ascii.Ltr_w: case Byte_ascii.Ltr_x: case Byte_ascii.Ltr_y: case Byte_ascii.Ltr_z:
letters++;
break;
case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4:
case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9:
numbers++;
break;
case Byte_ascii.Hash:
// next must be &; EX: "&#" and "&#x"
int prv = cur - 1;
if (prv >= src_end && src[prv] == Byte_ascii.Amp) {
// if hex, num | ltr is fine
byte hex_byte = src[cur + 1];
if (hex_byte == Byte_ascii.Ltr_X || hex_byte == Byte_ascii.Ltr_x) {
return numbers > 0 || letters > 1; // 1 to ignore "x"
}
// if dec, no letters allowed
else {
return numbers > 0 && letters == 0;
}
}
return false;
case Byte_ascii.Amp:
// if entity, no numbers
return letters > 0 && numbers == 0;
default:
return false;
}
cur--;
}
return false;
}
}
// Finds the first entity that terminates a free url: "<", ">" or nbsp, written as a named, hex or decimal entity.
class Xomw_regex_link_interrupt {
	private static final byte Bgn__ent__lt = 0, Bgn__ent__gt = 1, Bgn__ent__nbsp = 2, Bgn__hex = 3, Bgn__dec = 4;
	private static final byte End__hex__lt = 0, End__hex__gt = 1, End__hex__nbsp = 2, End__dec__lt = 3, End__dec__gt = 4, End__dec__nbsp = 5;
	private final Btrie_slim_mgr bgn_trie = Btrie_slim_mgr.cs();
	private final Btrie_slim_mgr end_trie = Btrie_slim_mgr.ci_a7();
	public Xomw_regex_link_interrupt() {
		// MW.REGEX: &(lt|gt|nbsp|#x0*(3[CcEe]|[Aa]0)|#0*(60|62|160));
		// starts: complete named entities, or the prefix of a numeric entity
		bgn_trie.Add_str_byte("&lt;", Bgn__ent__lt);
		bgn_trie.Add_str_byte("&gt;", Bgn__ent__gt);
		bgn_trie.Add_str_byte("&nbsp;", Bgn__ent__nbsp);
		bgn_trie.Add_str_byte("&#x", Bgn__hex); // 3C | 3E | A0
		bgn_trie.Add_str_byte("&#", Bgn__dec);  // 60 | 62 | 160
		// ends: the digits (after any leading zeros) and closing ";" of a numeric entity
		end_trie.Add_str_byte("3c;", End__hex__lt);
		end_trie.Add_str_byte("3e;", End__hex__gt);
		end_trie.Add_str_byte("a0;", End__hex__nbsp);
		end_trie.Add_str_byte("60;", End__dec__lt);
		end_trie.Add_str_byte("62;", End__dec__gt);
		end_trie.Add_str_byte("160;", End__dec__nbsp);
	}
	// Returns the position of the first interrupting entity in src between src_bgn and src_end, or Bry_find_.Not_found.
	public int Find(Btrie_rv trv, byte[] src, int src_bgn, int src_end) {
		int pos = src_bgn;
		while (pos < src_end) {
			byte b = src[pos];
			Object bgn_obj = bgn_trie.Match_at_w_b0(trv, b, src, pos, src_end);
			if (bgn_obj != null) {
				byte bgn_tid = ((Byte_obj_val)bgn_obj).Val();
				boolean is_hex = bgn_tid == Bgn__hex;
				boolean is_dec = bgn_tid == Bgn__dec;
				if (is_hex || is_dec) {
					// skip leading zeros, then match rest of sequence from above; EX: "3c;", "60;" etc.
					int digits_bgn = Bry_find_.Find_fwd_while(src, trv.Pos(), src_end, Byte_ascii.Num_0);
					Object end_obj = end_trie.Match_at(trv, src, digits_bgn, src_end);
					if (end_obj != null) {
						// make sure that hex-dec matches; EX: "&#x60;" and "&#3c;" are invalid
						byte end_tid = ((Byte_obj_val)end_obj).Val();
						if (is_hex && Int_.Between(end_tid, End__hex__lt, End__hex__nbsp))
							return pos;
						if (is_dec && Int_.Between(end_tid, End__dec__lt, End__dec__nbsp))
							return pos;
					}
				}
				else if (bgn_tid == Bgn__ent__lt || bgn_tid == Bgn__ent__gt || bgn_tid == Bgn__ent__nbsp) {
					// named entity is already complete
					return pos;
				}
			}
			// no interrupt at pos; move to next char
			pos += gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(b);
		}
		return Bry_find_.Not_found;
	}
}

View File

@@ -0,0 +1,91 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.magiclinks; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
import org.junit.*;
/** JUnit cases for Xomw_magiclinks_wkr; each case feeds wikitext through the fixture and compares the generated html. */
public class Xomw_magiclinks_wkr__tst {
  private final Xomw_magiclinks_wkr__fxt fxt = new Xomw_magiclinks_wkr__fxt();

  /** A bare url surrounded by spaces is wrapped in an "external free" anchor. */
  @Test public void Basic() {
    fxt.Test__parse
    ( "a https://b.org z"
    , "a <a rel='nofollow' class='external free' href='https://b.org'>https://b.org</a> z"
    );
  }

  /** A url directly preceded by "_" is left as-is. */
  @Test public void Invalid() {
    fxt.Test__parse
    ( "a _https://b.org z"
    , "a _https://b.org z"
    );
  }

  /** A url inside the attribute of an existing anchor tag is left as-is. */
  @Test public void Tag__anch() {
    fxt.Test__parse
    ( "a <a title=\"https://b.org\">b</a> z"
    , "a <a title=\"https://b.org\">b</a> z"
    );
  }

  /** A url inside the attribute of some other tag is left as-is. */
  @Test public void Tag__misc() {
    fxt.Test__parse
    ( "a <div title=\"https://b.org\">b</div> z"
    , "a <div title=\"https://b.org\">b</div> z"
    );
  }

  /** An escaped "&lt;" (named, hex or decimal) ends the url. */
  @Test public void Interrupt() {
    // named entity
    fxt.Test__parse
    ( "a https://b.org&lt;z"
    , "a <a rel='nofollow' class='external free' href='https://b.org'>https://b.org</a>&lt;z"
    );
    // hexadecimal character reference
    fxt.Test__parse
    ( "a https://b.org&#x3c;z"
    , "a <a rel='nofollow' class='external free' href='https://b.org'>https://b.org</a>&#x3c;z"
    );
    // decimal character reference
    fxt.Test__parse
    ( "a https://b.org&#60;z"
    , "a <a rel='nofollow' class='external free' href='https://b.org'>https://b.org</a>&#60;z"
    );
    // num_post_proto rule: nothing usable after the protocol, so no link
    fxt.Test__parse
    ( "a https://&lt; z"
    , "a https://&lt; z"
    );
  }

  /** Implementation-specific case: a decimal-style reference holding a hex digit does not interrupt the url. */
  @Test public void Interrupt__hex_dec() {
    // "&#3c;" mixes the decimal prefix with a hex digit
    fxt.Test__parse
    ( "a https://b.org&#3c;z"
    , "a <a rel='nofollow' class='external free' href='https://b.org&amp;#3c;z'>https://b.org&amp;#3c;z</a>"
    );
  }

  /** Trailing separator characters are kept outside the generated link. */
  @Test public void Separator() {
    // plain run of separators: ,;.:!?
    fxt.Test__parse
    ( "a https://b.org,;.:!? z"
    , "a <a rel='nofollow' class='external free' href='https://b.org'>https://b.org</a>,;.:!? z"
    );
    // ")" stays outside the link
    fxt.Test__parse
    ( "a https://b.org).:!? z"
    , "a <a rel='nofollow' class='external free' href='https://b.org'>https://b.org</a>).:!? z"
    );
    // ")" stays inside the link because a "(" precedes it
    fxt.Test__parse
    ( "a https://b.org().:!? z"
    , "a <a rel='nofollow' class='external free' href='https://b.org()'>https://b.org()</a>.:!? z"
    );
    // ";" stays outside the link
    fxt.Test__parse
    ( "a https://b.org;.:!? z"
    , "a <a rel='nofollow' class='external free' href='https://b.org'>https://b.org</a>;.:!? z"
    );
    // ";" stays inside the link because it closes a named entity
    fxt.Test__parse
    ( "a https://b.org&abc;.:!? z"
    , "a <a rel='nofollow' class='external free' href='https://b.org&amp;abc;'>https://b.org&amp;abc;</a>.:!? z"
    );
    // ";" stays inside the link because it closes a hex reference; Clean_url turns "&#xB1;" into "±"
    fxt.Test__parse
    ( "a https://b.org&#xB1;.:!? z"
    , "a <a rel='nofollow' class='external free' href='https://b.org±'>https://b.org±</a>.:!? z"
    );
    // ";" stays inside the link because it closes a decimal reference; Clean_url turns "&#123;" into "{"
    fxt.Test__parse
    ( "a https://b.org&#123;.:!? z"
    , "a <a rel='nofollow' class='external free' href='https://b.org{'>https://b.org{</a>.:!? z"
    );
    // ";" stays outside the link: invalid named entity
    fxt.Test__parse
    ( "a https://b.org&a1b;.:!? z"
    , "a <a rel='nofollow' class='external free' href='https://b.org&amp;a1b'>https://b.org&amp;a1b</a>;.:!? z"
    );
    // ";" stays outside the link: invalid hex reference
    fxt.Test__parse
    ( "a https://b.org&#x;.:!? z"
    , "a <a rel='nofollow' class='external free' href='https://b.org&amp;#x'>https://b.org&amp;#x</a>;.:!? z"
    );
    // ";" stays outside the link: invalid decimal reference
    fxt.Test__parse
    ( "a https://b.org&#a;.:!? z"
    , "a <a rel='nofollow' class='external free' href='https://b.org&amp;#a'>https://b.org&amp;#a</a>;.:!? z"
    );
    // num_post_proto rule: only separators after the protocol, so no link
    fxt.Test__parse
    ( "a https://.:!? z"
    , "a https://.:!? z"
    );
  }

  /** Clean_url drops the special character from the host part but keeps it in the path. */
  @Test public void Clean_url() {
    // basic
    fxt.Test__parse
    ( "http://a᠆b.org/c᠆d"
    , "<a rel='nofollow' class='external free' href='http://ab.org/c᠆d'>http://ab.org/c᠆d</a>"
    );
  }
}
/** Test fixture: builds a Xomw_magiclinks_wkr against an edit-mode app / wiki and runs wikitext through it. */
class Xomw_magiclinks_wkr__fxt {
  private final Xomw_magiclinks_wkr wkr;
  private final Xomw_parser_ctx pctx = new Xomw_parser_ctx();
  private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr();

  /** Creates the app, wiki, page context and the worker under test. */
  public Xomw_magiclinks_wkr__fxt() {
    Xoae_app app = Xoa_app_fxt.Make__app__edit();
    Xowe_wiki wiki = Xoa_app_fxt.Make__wiki__edit(app);
    Xomw_regex_space regex_space = new Xomw_regex_space();
    byte[] page_ttl = Bry_.new_a7("Page_1");
    pctx.Init_by_page(wiki.Ttl_parse(page_ttl));
    Xomw_parser parser = new Xomw_parser();
    Xomw_regex_boundary regex_boundary = new Xomw_regex_boundary(regex_space);
    Xomw_regex_url regex_url = new Xomw_regex_url(regex_space);
    this.wkr = new Xomw_magiclinks_wkr(parser, parser.Sanitizer(), parser.Linker(), regex_boundary, regex_url);
    wkr.Init_by_wiki();
  }

  /** Runs src_str through Do_magic_links and compares against expd, passing expd through Replace_apos first. */
  public void Test__parse(String src_str, String expd) {
    Test__parse(Bool_.Y, src_str, expd);
  }

  /**
   * Runs src_str through Do_magic_links and compares the result against expd.
   * @param apos when true, expd is passed through Gfh_utl.Replace_apos before the comparison
   */
  public void Test__parse(boolean apos, String src_str, String expd) {
    pbfr.Init(Bry_.new_u8(src_str));
    wkr.Do_magic_links(pctx, pbfr);
    String expd_str = apos ? gplx.langs.htmls.Gfh_utl.Replace_apos(expd) : expd;
    String actl_str = pbfr.Rslt().To_str_and_clear();
    Tfds.Eq_str_lines(expd_str, actl_str, src_str);
  }
}

View File

@@ -0,0 +1,134 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.nbsps; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
import gplx.core.btries.*;
/**
 * Port of the "$fixtags" step of the MediaWiki parser: inserts "&#160;" around French punctuation
 * and undoes it in front of the CSS keyword "!important".
 */
public class Xomw_nbsp_wkr {
  private final Btrie_rv trv = new Btrie_rv();

  /**
   * Reads the text in pbfr.Src(), writes the converted text to pbfr.Trg() and switches the two buffers.
   * @param pctx parser context; not used by this worker
   * @param pbfr source / target buffer pair
   */
  public void Do_nbsp(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) {
    // PORTED:
    // Clean up special characters, only run once, next-to-last before doBlockLevels
    // $fixtags = [
    // // French spaces, last one Guillemet-left
    // // only if there is something before the space
    // '/(.) (?=\\?|:|;|!|%|\\302\\273)/' => '\\1&#160;',
    // // french spaces, Guillemet-right
    // '/(\\302\\253) /' => '\\1&#160;',
    // '/&#160;(!\s*important)/' => ' \\1', // Beware of CSS magic word !important, T13874.
    // ];
    // $text = preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text );

    // XO.PBFR
    Bry_bfr src_bfr = pbfr.Src();
    byte[] src = src_bfr.Bfr();
    int src_bgn = 0;
    int src_end = src_bfr.Len();
    Bry_bfr bfr = pbfr.Trg();
    Btrie_slim_mgr matcher = Trie__get();

    int cur = src_bgn;
    int prv = cur;
    // NOTE(review): dirty starts as true, so the text is always copied and the buffers are always switched,
    // even when nothing matched; left unchanged because the no-switch behavior of Xomw_parser_bfr is not visible here
    boolean dirty = true;
    // search forward for...
    // "\s" before ? : ; ! % 302,273; EX: "a :"
    // "\s" after 302,253
    // "&160;!\simportant"
    while (true) {
      if (cur == src_end) {
        if (dirty)
          bfr.Add_mid(src, prv, src_end);
        break;
      }

      Object o = matcher.Match_at(trv, src, cur, src_end);
      if (o == null) {
        cur++;
        continue;
      }
      Xomw_nbsp_itm itm = (Xomw_nbsp_itm)o;
      byte itm_tid = itm.Tid();

      // '/(.) (?=...)/': the space must be preceded by a character, and "." does not match "\n"
      if (itm_tid == Tid__space_lhs) {
        if (cur == src_bgn || src[cur - 1] == (byte)'\n') {
          cur++;
          continue;
        }
      }

      // '/&#160;(!\s*important)/' => ' \\1'
      // NOTE(review): the php regexes run one after the other, so " !important" is first changed to "&#160;!important"
      // and then changed back; here " !" followed by "important" still gets an nbsp -- confirm whether that matters
      if (itm_tid == Tid__important) {
        int space_bgn = cur + itm.Key().length;
        int space_end = Bry_find_.Find_fwd_while(src, space_bgn, src_end, Byte_ascii.Space);
        int important_end = space_end + Bry__important.length;
        // do not compare past src_end: the backing array can be longer than the text
        if (important_end > src_end || !Bry_.Match(src, space_end, important_end, Bry__important)) {
          cur++; // must advance; otherwise the same position is matched again forever
          continue;
        }
      }

      dirty = true;
      bfr.Add_mid(src, prv, cur);
      switch (itm_tid) {
        case Tid__space_lhs:
          bfr.Add_bry_many(Bry__nbsp, itm.Val());
          break;
        case Tid__space_rhs:
          bfr.Add_bry_many(itm.Val(), Bry__nbsp);
          break;
        case Tid__important:
          bfr.Add(Bry__important__repl);
          break;
      }
      cur += itm.Key().length;
      prv = cur;
    }
    if (dirty)
      pbfr.Switch();
  }

  private static final byte Tid__space_lhs = 0, Tid__space_rhs = 1, Tid__important = 2;
  private static volatile Btrie_slim_mgr trie;

  /** Returns the shared trie, building it on first use; the field is only assigned after the trie is fully populated. */
  private static Btrie_slim_mgr Trie__get() {
    Btrie_slim_mgr rv = trie;
    if (rv == null) {
      synchronized (Xomw_nbsp_wkr.class) {
        rv = trie;
        if (rv == null) {
          rv = Btrie_slim_mgr.cs();
          Trie__add(rv, Tid__space_lhs, " ?");
          Trie__add(rv, Tid__space_lhs, " :");
          Trie__add(rv, Tid__space_lhs, " ;");
          Trie__add(rv, Tid__space_lhs, " !");
          Trie__add(rv, Tid__space_lhs, " %"); // listed in the php regex
          Trie__add(rv, Tid__space_lhs, " »");
          Trie__add(rv, Tid__space_rhs, "« ");
          Trie__add(rv, Tid__important, "&#160;!");
          trie = rv;
        }
      }
    }
    return rv;
  }

  /**
   * Registers one key in the trie. The stored value is the key without the space:
   * lhs drops the first byte, rhs drops the last byte, important keeps the whole key.
   */
  private static void Trie__add(Btrie_slim_mgr trie, byte tid, String key_str) {
    byte[] key_bry = Bry_.new_u8(key_str);
    byte[] val_bry = null;
    switch (tid) {
      case Tid__space_lhs:
        val_bry = Bry_.Mid(key_bry, 1);
        break;
      case Tid__space_rhs:
        val_bry = Bry_.Mid(key_bry, 0, key_bry.length - 1);
        break;
      case Tid__important:
        val_bry = key_bry;
        break;
    }
    Xomw_nbsp_itm itm = new Xomw_nbsp_itm(tid, key_bry, val_bry);
    trie.Add_obj(key_bry, itm);
  }

  private static final byte[] Bry__nbsp = Bry_.new_a7("&#160;"), Bry__important = Bry_.new_a7("important"), Bry__important__repl = Bry_.new_a7(" !");
}
/** Immutable trie entry: a type id, the matched key bytes and the associated value bytes. */
class Xomw_nbsp_itm {
  private final byte tid;
  private final byte[] key;
  private final byte[] val;

  public Xomw_nbsp_itm(byte tid, byte[] key, byte[] val) {
    this.tid = tid;
    this.key = key;
    this.val = val;
  }

  /** Type id given to the constructor. */
  public byte Tid() {
    return tid;
  }

  /** Key bytes given to the constructor (same array, not a copy). */
  public byte[] Key() {
    return key;
  }

  /** Value bytes given to the constructor (same array, not a copy). */
  public byte[] Val() {
    return val;
  }
}

View File

@@ -0,0 +1,40 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.nbsps; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
import org.junit.*;
/** JUnit cases for Xomw_nbsp_wkr. */
public class Xomw_nbsp_wkr__tst {
  private final Xomw_nbsp_wkr__fxt fxt = new Xomw_nbsp_wkr__fxt();

  /** Text without any of the searched sequences comes back unchanged. */
  @Test public void Noop() {
    fxt.Test__parse("abc", "abc");
  }

  /** The space in front of ":" becomes "&#160;". */
  @Test public void Space_lhs__colon() {
    fxt.Test__parse("a :b c", "a&#160;:b c");
  }

  /** The space in front of "»" becomes "&#160;". */
  @Test public void Space_lhs__laquo() {
    fxt.Test__parse("a »b c", "a&#160;»b c");
  }

  /** The space after "«" becomes "&#160;". */
  @Test public void Space_rhs() {
    fxt.Test__parse("a« b c", "a«&#160;b c");
  }

  /** "&#160;" in front of "! important" is turned back into a space. */
  @Test public void Important() {
    fxt.Test__parse("a &#160;! important b", "a ! important b");
  }
}
/** Test fixture: runs a string through Xomw_nbsp_wkr.Do_nbsp and compares the output. */
class Xomw_nbsp_wkr__fxt {
  private final Xomw_nbsp_wkr wkr = new Xomw_nbsp_wkr();
  private final Xomw_parser_ctx pctx = new Xomw_parser_ctx();
  private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr();
  private boolean apos = true;

  /** Parses src_str and compares against expd; expd goes through Replace_apos first while apos is true. */
  public void Test__parse(String src_str, String expd) {
    pbfr.Init(Bry_.new_u8(src_str));
    wkr.Do_nbsp(pctx, pbfr);
    String expd_str = apos ? gplx.langs.htmls.Gfh_utl.Replace_apos(expd) : expd;
    String actl_str = pbfr.Rslt().To_str_and_clear();
    Tfds.Eq_str_lines(expd_str, actl_str, src_str);
  }
}

View File

@@ -0,0 +1,23 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
/** Placeholder frame: expansion is not implemented yet. */
public class Xomw_frame_itm {
  /**
   * Stub; the argument is ignored.
   * @param ttl bytes to expand
   * @return always null
   */
  public byte[] Expand(byte[] ttl)
  {
    return null;
  }
}

View File

@@ -0,0 +1,564 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
// public class Xomw_frame_wkr { // THREAD.UNSAFE: caching for repeated calls
// private final Xomw_parser parser;
// public Xomw_frame_wkr(Xomw_parser parser) {
// this.parser = parser;
// }
// \\ Replace magic variables, templates, and template arguments
// \\ with the appropriate text. Templates are substituted recursively,
// \\ taking care to avoid infinite loops.
// \\
// \\ Note that the substitution depends on value of $mOutputType:
// \\ self::OT_WIKI: only {{subst:}} templates
// \\ self::OT_PREPROCESS: templates but not extension tags
// \\ self::OT_HTML: all templates and extension tags
// \\
// \\ @param String $text The text to transform
// \\ @param boolean|PPFrame $frame Object describing the arguments passed to the
// \\ template. Arguments may also be provided as an associative array, as
// \\ was the usual case before MW1.12. Providing arguments this way may be
// \\ useful for extensions wishing to perform variable replacement
// \\ explicitly.
// \\ @param boolean $argsOnly Only do argument (triple-brace) expansion, not
// \\ double-brace expansion.
// \\ @return String
// public function replaceVariables($text, $frame = false, $argsOnly = false) {
// // Is there any text? Also, Prevent too big inclusions!
// $textSize = strlen($text);
// if ($textSize < 1 || $textSize > $this->mOptions->getMaxIncludeSize()) {
// return $text;
// }
//
// if ($frame == false) {
// $frame = $this->getPreprocessor()->newFrame();
// } elseif (!($frame instanceof PPFrame)) {
// wfDebug(__METHOD__ . " called using plain parameters instead of "
// . "a PPFrame instance. Creating custom frame.\n");
// $frame = $this->getPreprocessor()->newCustomFrame($frame);
// }
//
// $dom = $this->preprocessToDom($text);
// $flags = $argsOnly ? PPFrame::NO_TEMPLATES : 0;
// $text = $frame->expand($dom, $flags);
//
// return $text;
// }
//
// \\ Clean up argument array - refactored in 1.9 so parserfunctions can use it, too.
// public static function createAssocArgs($args) {
// $assocArgs = [];
// $index = 1;
// foreach ($args as $arg) {
// $eqpos = strpos($arg, '=');
// if ($eqpos == false) {
// $assocArgs[$index++] = $arg;
// } else {
// $name = trim(substr($arg, 0, $eqpos));
// $value = trim(substr($arg, $eqpos + 1));
// if ($value == false) {
// $value = '';
// }
// if ($name != false) {
// $assocArgs[$name] = $value;
// }
// }
// }
//
// return $assocArgs;
// }
// \\ Return the text of a template, after recursively
// \\ replacing any variables or templates within the template.
// \\
// \\ @param array $piece The parts of the template
// \\ $piece['title']: the title, i.e. the part before the |
// \\ $piece['parts']: the parameter array
// \\ $piece['lineStart']: whether the brace was at the start of a line
// \\ @param PPFrame $frame The current frame, contains template arguments
// \\ @throws Exception
// \\ @return String The text of the template
// public void Brace_substitution(Xomw_prepro_node__template piece, Xomw_frame_itm frame) {
// // Flags
//
// // $text has been filled
// boolean found = false;
// // wiki markup in $text should be escaped
// boolean nowiki = false;
// // $text is HTML, armour it against wikitext transformation
// boolean is_html = false;
// // Force interwiki transclusion to be done in raw mode not rendered
// boolean force_raw_interwiki = false;
// // $text is a DOM node needing expansion in a child frame
// boolean is_child_obj = false;
// // $text is a DOM node needing expansion in the current frame
// boolean is_local_obj = false;
//
// // Title Object, where $text came from
// byte[] title = null;
//
// // $part1 is the bit before the first |, and must contain only title characters.
// // Various prefixes will be stripped from it later.
// byte[] title_with_spaces = frame.Expand(piece.Title());
// byte[] part1 = Bry_.Trim(title_with_spaces);
// byte[] title_text = null;
//
// // Original title text preserved for various purposes
// byte[] originalTitle = part1;
//
// // $args is a list of argument nodes, starting from index 0, not including $part1
// // @todo FIXME: If piece['parts'] is null then the call to getLength()
// // below won't work b/c this $args isn't an Object
// Xomw_prepro_node__part[] args = (null == piece.Parts()) ? null : piece.Parts();
//
// byte[] profile_section = null; // profile templates
//
// Tfds.Write(nowiki, is_html, force_raw_interwiki, is_child_obj, is_local_obj, title, title_text, profile_section);
// // SUBST
// if (!found) {
// String subst_match = null; // $this->mSubstWords->matchStartAndRemove($part1);
// boolean literal = false;
//
// // Possibilities for substMatch: "subst", "safesubst" or FALSE
// // Decide whether to expand template or keep wikitext as-is.
// if (parser.Output_type__wiki()) {
// if (subst_match == null) {
// literal = true; // literal when in PST with no prefix
// }
// else {
// literal = false; // expand when in PST with subst: or safesubst:
// }
// }
// else {
// if (subst_match == "subst") {
// literal = true; // literal when not in PST with plain subst:
// }
// else {
// literal = false; // expand when not in PST with safesubst: or no prefix
// }
// }
// if (literal) {
//// $text = $frame->virtualBracketedImplode('{{', '|', '}}', title_with_spaces, $args);
// is_local_obj = true;
// found = true;
// }
// }
//
// // Variables
// if (!found && args.length == 0) {
//// $id = $this->mVariables->matchStartToEnd($part1);
//// if ($id != false) {
//// $text = $this->getVariableValue($id, $frame);
//// if (MagicWord::getCacheTTL($id) > -1) {
//// $this->mOutput->updateCacheExpiry(MagicWord::getCacheTTL($id));
//// }
// found = true;
//// }
// }
//
// // MSG, MSGNW and RAW
// if (!found) {
// // Check for MSGNW:
//// $mwMsgnw = MagicWord::get('msgnw');
//// if ($mwMsgnw->matchStartAndRemove($part1)) {
// nowiki = true;
//// }
//// else {
// // Remove obsolete MSG:
//// $mwMsg = MagicWord::get('msg');
//// $mwMsg->matchStartAndRemove($part1);
//// }
//
// // Check for RAW:
//// $mwRaw = MagicWord::get('raw');
//// if ($mwRaw->matchStartAndRemove($part1)) {
//// force_raw_interwiki = true;
//// }
// }
// Parser functions
// if (!found) {
// $colonPos = strpos($part1, ':');
// if ($colonPos != false) {
// $func = substr($part1, 0, $colonPos);
// $funcArgs = [ trim(substr($part1, $colonPos + 1)) ];
// $argsLength = $args->getLength();
// for ($i = 0; $i < $argsLength; $i++) {
// $funcArgs[] = $args->item($i);
// }
// try {
// $result = $this->callParserFunction($frame, $func, $funcArgs);
// } catch (Exception $ex) {
// throw $ex;
// }
// The interface for parser functions allows for extracting
// flags into the local scope. Extract any forwarded flags
// here.
// extract($result);
// }
// }
// Finish mangling title and then check for loops.
// Set title to a Title Object and $title_text to the PDBK
// if (!found) {
// $ns = NS_TEMPLATE;
// Split the title into page and subpage
// $subpage = '';
// $relative = $this->maybeDoSubpageLink($part1, $subpage);
// if ($part1 != $relative) {
// $part1 = $relative;
// $ns = $this->mTitle->getNamespace();
// }
// title = Title::newFromText($part1, $ns);
// if (title) {
// $title_text = title->getPrefixedText();
// // Check for language variants if the template is not found
// if ($this->getConverterLanguage()->hasVariants() && title->getArticleID() == 0) {
// $this->getConverterLanguage()->findVariantLink($part1, title, true);
// }
// // Do recursion depth check
// $limit = $this->mOptions->getMaxTemplateDepth();
// if ($frame->depth >= $limit) {
// found = true;
// $text = '<span class="error">'
// . wfMessage('parser-template-recursion-depth-warning')
// ->numParams($limit)->inContentLanguage()->text()
// . '</span>';
// }
// }
// }
// Load from database
// if (!found && title) {
// $profile_section = $this->mProfiler->scopedProfileIn(title->getPrefixedDBkey());
// if (!title->isExternal()) {
// if (title->isSpecialPage()
// && $this->mOptions->getAllowSpecialInclusion()
// && $this->ot['html']
// ) {
// $specialPage = SpecialPageFactory::getPage(title->getDBkey());
// // Pass the template arguments as URL parameters.
// // "uselang" will have no effect since the Language Object
// // is forced to the one defined in ParserOptions.
// $pageArgs = [];
// $argsLength = $args->getLength();
// for ($i = 0; $i < $argsLength; $i++) {
// $bits = $args->item($i)->splitArg();
// if (strval($bits['index']) == '') {
// $name = trim($frame->expand($bits['name'], PPFrame::STRIP_COMMENTS));
// $value = trim($frame->expand($bits['value']));
// $pageArgs[$name] = $value;
// }
// }
//
// // Create a new context to execute the special page
// $context = new RequestContext;
// $context->setTitle(title);
// $context->setRequest(new FauxRequest($pageArgs));
// if ($specialPage && $specialPage->maxIncludeCacheTime() == 0) {
// $context->setUser($this->getUser());
// } else {
// // If this page is cached, then we better not be per user.
// $context->setUser(User::newFromName('127.0.0.1', false));
// }
// $context->setLanguage($this->mOptions->getUserLangObj());
// $ret = SpecialPageFactory::capturePath(
// title, $context, $this->getLinkRenderer());
// if ($ret) {
// $text = $context->getOutput()->getHTML();
// $this->mOutput->addOutputPageMetadata($context->getOutput());
// found = true;
// is_html = true;
// if ($specialPage && $specialPage->maxIncludeCacheTime() != false) {
// $this->mOutput->updateRuntimeAdaptiveExpiry(
// $specialPage->maxIncludeCacheTime()
// );
// }
// }
// } elseif (MWNamespace::isNonincludable(title->getNamespace())) {
// found = false; // access denied
// wfDebug(__METHOD__ . ": template inclusion denied for " .
// title->getPrefixedDBkey() . "\n");
// } else {
// list($text, title) = $this->getTemplateDom(title);
// if ($text != false) {
// found = true;
// is_child_obj = true;
// }
// }
//
// // If the title is valid but undisplayable, make a link to it
// if (!found && ($this->ot['html'] || $this->ot['pre'])) {
// $text = "[[:$title_text]]";
// found = true;
// }
// } elseif (title->isTrans()) {
// // Interwiki transclusion
// if ($this->ot['html'] && !force_raw_interwiki) {
// $text = $this->interwikiTransclude(title, 'render');
// is_html = true;
// } else {
// $text = $this->interwikiTransclude(title, 'raw');
// // Preprocess it like a template
// $text = $this->preprocessToDom($text, self::PTD_FOR_INCLUSION);
// is_child_obj = true;
// }
// found = true;
// }
//
// // Do infinite loop check
// // This has to be done after redirect resolution to avoid infinite loops via redirects
// if (!$frame->loopCheck(title)) {
// found = true;
// $text = '<span class="error">'
// . wfMessage('parser-template-loop-warning', $title_text)->inContentLanguage()->text()
// . '</span>';
// wfDebug(__METHOD__ . ": template loop broken at '$title_text'\n");
// }
// }
// If we haven't found text to substitute by now, we're done
// Recover the source wikitext and return it
// if (!found) {
// $text = $frame->virtualBracketedImplode('{{', '|', '}}', title_with_spaces, $args);
// if ($profile_section) {
// $this->mProfiler->scopedProfileOut($profile_section);
// }
// return [ 'Object' => $text ];
// }
// Expand DOM-style return values in a child frame
// if (is_child_obj) {
// // Clean up argument array
// $newFrame = $frame->newChild($args, title);
//
// if (nowiki) {
// $text = $newFrame->expand($text, PPFrame::RECOVER_ORIG);
// } elseif ($title_text != false && $newFrame->isEmpty()) {
// // Expansion is eligible for the empty-frame cache
// $text = $newFrame->cachedExpand($title_text, $text);
// } else {
// // Uncached expansion
// $text = $newFrame->expand($text);
// }
// }
// if (is_local_obj && nowiki) {
// $text = $frame->expand($text, PPFrame::RECOVER_ORIG);
// is_local_obj = false;
// }
// if ($profile_section) {
// $this->mProfiler->scopedProfileOut($profile_section);
// }
// Replace raw HTML by a placeholder
// if (is_html) {
// $text = $this->insertStripItem($text);
// } elseif (nowiki && ($this->ot['html'] || $this->ot['pre'])) {
// // Escape nowiki-style return values
// $text = wfEscapeWikiText($text);
// } elseif (is_string($text)
// && !$piece['lineStart']
// && preg_match('/^(?:{\\||:|;|#|\*)/', $text)
// ) {
// // T2529: if the template begins with a table or block-level
// // element, it should be treated as beginning a new line.
// // This behavior is somewhat controversial.
// $text = "\n" . $text;
// }
// if (is_string($text) && !$this->incrementIncludeSize('post-expand', strlen($text))) {
// // Error, oversize inclusion
// if ($title_text != false) {
// // Make a working, properly escaped link if possible (T25588)
// $text = "[[:$title_text]]";
// } else {
// // This will probably not be a working link, but at least it may
// // provide some hint of where the problem is
// preg_replace('/^:/', '', $originalTitle);
// $text = "[[:$originalTitle]]";
// }
// $text .= $this->insertStripItem('<!-- WARNING: template omitted, '
// . 'post-expand include size too large -->');
// $this->limitationWarn('post-expand-template-inclusion');
// }
//
// if (is_local_obj) {
// $ret = [ 'Object' => $text ];
// } else {
// $ret = [ 'text' => $text ];
// }
// return $ret;
// }
// \\ Triple brace replacement -- used for template arguments
// public function argSubstitution($piece, $frame) {
//
// $error = false;
// $parts = $piece['parts'];
// $nameWithSpaces = $frame->expand($piece['title']);
// $argName = trim($nameWithSpaces);
// $Object = false;
// $text = $frame->getArgument($argName);
// if ($text == false && $parts->getLength() > 0
// && ($this->ot['html']
// || $this->ot['pre']
// || ($this->ot['wiki'] && $frame->isTemplate())
// )
// ) {
// // No match in frame, use the supplied default
// $Object = $parts->item(0)->getChildren();
// }
// if (!$this->incrementIncludeSize('arg', strlen($text))) {
// $error = '<!-- WARNING: argument omitted, expansion size too large -->';
// $this->limitationWarn('post-expand-template-argument');
// }
//
// if ($text == false && $Object == false) {
// // No match anywhere
// $Object = $frame->virtualBracketedImplode('{{{', '|', '}}}', $nameWithSpaces, $parts);
// }
// if ($error != false) {
// $text .= $error;
// }
// if ($Object != false) {
// $ret = [ 'Object' => $Object ];
// } else {
// $ret = [ 'text' => $text ];
// }
//
// return $ret;
// }
//
// /**
// \\ Return the text to be used for a given extension tag.
// \\ This is the ghost of strip().
// \\
// \\ @param array $params Associative array of parameters:
// \\ name PPNode for the tag name
// \\ attr PPNode for unparsed text where tag attributes are thought to be
// \\ attributes Optional associative array of parsed attributes
// \\ inner Contents of extension element
// \\ noClose Original text did not have a close tag
// \\ @param PPFrame $frame
// \\
// \\ @throws MWException
// \\ @return String
// \\/
// public function extensionSubstitution($params, $frame) {
// static $errorStr = '<span class="error">';
// static $errorLen = 20;
//
// $name = $frame->expand($params['name']);
// if (substr($name, 0, $errorLen) == $errorStr) {
// // Probably expansion depth or node count exceeded. Just punt the
// // error up.
// return $name;
// }
//
// $attrText = !isset($params['attr']) ? null : $frame->expand($params['attr']);
// if (substr($attrText, 0, $errorLen) == $errorStr) {
// // See above
// return $attrText;
// }
//
// // We can't safely check if the expansion for $content resulted in an
// // error, because the content could happen to be the error String
// // (T149622).
// $content = !isset($params['inner']) ? null : $frame->expand($params['inner']);
//
// $marker = self::MARKER_PREFIX . "-$name-"
// . sprintf('%08X', $this->mMarkerIndex++) . self::MARKER_SUFFIX;
//
// $isFunctionTag = isset($this->mFunctionTagHooks[strtolower($name)]) &&
// ($this->ot['html'] || $this->ot['pre']);
// if ($isFunctionTag) {
// $markerType = 'none';
// } else {
// $markerType = 'general';
// }
// if ($this->ot['html'] || $isFunctionTag) {
// $name = strtolower($name);
// $attributes = Sanitizer::decodeTagAttributes($attrText);
// if (isset($params['attributes'])) {
// $attributes = $attributes + $params['attributes'];
// }
//
// if (isset($this->mTagHooks[$name])) {
// // Workaround for PHP bug 35229 and similar
// if (!is_callable($this->mTagHooks[$name])) {
// throw new MWException("Tag hook for $name is not callable\n");
// }
// $output = call_user_func_array($this->mTagHooks[$name],
// [ $content, $attributes, $this, $frame ]);
// } elseif (isset($this->mFunctionTagHooks[$name])) {
// list($callback,) = $this->mFunctionTagHooks[$name];
// if (!is_callable($callback)) {
// throw new MWException("Tag hook for $name is not callable\n");
// }
//
// $output = call_user_func_array($callback, [ &$this, $frame, $content, $attributes ]);
// } else {
// $output = '<span class="error">Invalid tag extension name: ' .
// htmlspecialchars($name) . '</span>';
// }
//
// if (is_array($output)) {
// // Extract flags to local scope (to override $markerType)
// $flags = $output;
// $output = $flags[0];
// unset($flags[0]);
// extract($flags);
// }
// } else {
// if (is_null($attrText)) {
// $attrText = '';
// }
// if (isset($params['attributes'])) {
// foreach ($params['attributes'] as $attrName => $attrValue) {
// $attrText .= ' ' . htmlspecialchars($attrName) . '="' .
// htmlspecialchars($attrValue) . '"';
// }
// }
// if ($content == null) {
// $output = "<$name$attrText/>";
// } else {
// $close = is_null($params['close']) ? '' : $frame->expand($params['close']);
// if (substr($close, 0, $errorLen) == $errorStr) {
// // See above
// return $close;
// }
// $output = "<$name$attrText>$content$close";
// }
// }
//
// if ($markerType == 'none') {
// return $output;
// } elseif ($markerType == 'nowiki') {
// $this->mStripState->addNoWiki($marker, $output);
// } elseif ($markerType == 'general') {
// $this->mStripState->addGeneral($marker, $output);
// } else {
// throw new MWException(__METHOD__ . ': invalid marker type');
// }
// return $marker;
// }
// }

View File

@@ -0,0 +1,98 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
// Node produced by the preprocessor port; the implementations in this file are text, comment, ext, heading and tplarg.
public interface Xomw_prepro_node {
// presumably the number of sub-nodes held by this node -- TODO confirm against Xomw_prepro_node__base
int Subs__len();
// presumably the sub-node at index i -- TODO confirm against Xomw_prepro_node__base
Xomw_prepro_node Subs__get_at(int i);
// presumably appends sub to this node's sub-nodes -- TODO confirm against Xomw_prepro_node__base
void Subs__add(Xomw_prepro_node sub);
// appends this node's xml-like text form to bfr
void To_xml(Bry_bfr bfr);
}
/** Plain-text node; serializes as its raw bytes with no surrounding tag. */
class Xomw_prepro_node__text extends Xomw_prepro_node__base {
  protected final byte[] bry;

  public Xomw_prepro_node__text(byte[] bry) {
    this.bry = bry;
  }

  /** Raw bytes given to the constructor. */
  public byte[] Bry() {
    return bry;
  }

  /** Appends the raw bytes to bfr. */
  @Override public void To_xml(Bry_bfr bfr) {
    bfr.Add(bry);
  }
}
/** Comment node; serializes as its bytes wrapped in a "comment" element. */
class Xomw_prepro_node__comment extends Xomw_prepro_node__base {
  protected final byte[] bry;

  public Xomw_prepro_node__comment(byte[] bry) {
    this.bry = bry;
  }

  /** Bytes given to the constructor. */
  public byte[] Bry() {
    return bry;
  }

  /** Appends the open tag, the bytes and the close tag to bfr. */
  @Override public void To_xml(Bry_bfr bfr) {
    bfr.Add_str_a7("<comment>");
    bfr.Add(bry);
    bfr.Add_str_a7("</comment>");
  }
}
/** Extension-tag node holding the tag's name, attribute text, inner text and close text. */
class Xomw_prepro_node__ext extends Xomw_prepro_node__base {
  private final byte[] name;
  private final byte[] attr;
  private final byte[] inner;
  private final byte[] close;

  public Xomw_prepro_node__ext(byte[] name, byte[] attr, byte[] inner, byte[] close) {
    this.name = name;
    this.attr = attr;
    this.inner = inner;
    this.close = close;
  }

  public byte[] Name() {
    return name;
  }
  public byte[] Attr() {
    return attr;
  }
  public byte[] Inner() {
    return inner;
  }
  public byte[] Close() {
    return close;
  }

  /** Appends an "ext" element with one child element per field; the attribute child is named "atr". */
  @Override public void To_xml(Bry_bfr bfr) {
    bfr.Add_str_a7("<ext>");
    Xml__elem(bfr, "<name>", name, "</name>");
    Xml__elem(bfr, "<atr>", attr, "</atr>");
    Xml__elem(bfr, "<inner>", inner, "</inner>");
    Xml__elem(bfr, "<close>", close, "</close>");
    bfr.Add_str_a7("</ext>");
  }

  /** Appends open tag, value and close tag as one chained call. */
  private static void Xml__elem(Bry_bfr bfr, String tag_bgn, byte[] val, String tag_end) {
    bfr.Add_str_a7(tag_bgn).Add(val).Add_str_a7(tag_end);
  }
}
/** Heading node: an {@code <h>} element carrying a level and an index attribute. */
class Xomw_prepro_node__heading extends Xomw_prepro_node__base {
	/**
	 * @param heading_index value written to the {@code level} attribute
	 * @param title_index   value written to the {@code i} attribute
	 * @param text          element content
	 */
	public Xomw_prepro_node__heading(int heading_index, int title_index, byte[] text) {
		this.heading_index = heading_index;
		this.title_index = title_index;
		this.text = text;
	}
	public int Heading_index() {return heading_index;} private final int heading_index;
	public int Title_index() {return title_index;} private final int title_index;
	public byte[] Text() {return text;} private final byte[] text;

	/** Appends {@code <h level="N" i="M">text</h>}; the text is written as stored. */
	@Override public void To_xml(Bry_bfr bfr) {
		// FIX: previously wrote "<h " followed by " level=", producing a doubled space ("<h  level=")
		// that differed from the "<h level=" form written by Xomw_prepro_wkr.Preprocess_to_xml
		bfr.Add_str_a7("<h level=\"").Add_int_variable(heading_index);
		bfr.Add_str_a7("\" i=\"").Add_int_variable(title_index);
		bfr.Add_str_a7("\">");
		bfr.Add(text);
		bfr.Add_str_a7("</h>");
	}
}
/** Template-argument node: a {@code <tplarg>} element with a title followed by its parts. */
class Xomw_prepro_node__tplarg extends Xomw_prepro_node__base {
	private final byte[] title;
	private final Xomw_prepro_node__part[] parts;

	public Xomw_prepro_node__tplarg(byte[] title, Xomw_prepro_node__part[] parts) {
		this.title = title;
		this.parts = parts;
	}

	public byte[] Title() {return title;}
	public Xomw_prepro_node__part[] Parts() {return parts;}

	/** Appends {@code <tplarg><title>..</title>}, each part's XML in array order, then {@code </tplarg>}. */
	@Override public void To_xml(Bry_bfr bfr) {
		bfr.Add_str_a7("<tplarg>").Add_str_a7("<title>").Add(title).Add_str_a7("</title>");
		int parts_len = parts.length;
		for (int i = 0; i < parts_len; i++) {
			parts[i].To_xml(bfr);
		}
		bfr.Add_str_a7("</tplarg>");
	}
}

View File

@@ -0,0 +1,28 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
/**
 * Shared child-list handling for preprocessor nodes.
 * The backing list is only allocated when the first child is added.
 */
public abstract class Xomw_prepro_node__base implements Xomw_prepro_node {
	private List_adp subs;	// null until Subs__add is first called

	/** Returns the number of children; 0 when none has been added. */
	public int Subs__len() {
		if (subs == null) return 0;
		return subs.Len();
	}

	/** Returns the child at {@code i}, or null when no child has ever been added. */
	public Xomw_prepro_node Subs__get_at(int i) {
		if (subs == null) return null;
		return (Xomw_prepro_node)subs.Get_at(i);
	}

	/** Appends {@code sub} to the child list, creating the list on first use. */
	public void Subs__add(Xomw_prepro_node sub) {
		if (subs == null) {
			subs = List_adp_.New();
		}
		subs.Add(sub);
	}

	public abstract void To_xml(Bry_bfr bfr);
}

View File

@@ -0,0 +1,45 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
/**
 * One {@code <part>} of a template / template-argument.
 * A part is serialized with an index-only name when {@code idx > 0}, otherwise with an explicit key.
 */
public class Xomw_prepro_node__part extends Xomw_prepro_node__base {
	private final int idx;
	private final byte[] key;
	private final byte[] val;

	public Xomw_prepro_node__part(int idx, byte[] key, byte[] val) {
		this.idx = idx;
		this.key = key;
		this.val = val;
	}

	public int Idx() {return idx;}
	public byte[] Key() {return key;}
	public byte[] Val() {return val;}

	/**
	 * Appends either {@code <part><name index="N" /><value>val</value></part>} (idx > 0)
	 * or {@code <part><name>key</name>=<value>val</value></part>} (idx <= 0).
	 */
	@Override public void To_xml(Bry_bfr bfr) {
		bfr.Add_str_a7("<part>").Add_str_a7("<name");
		if (idx <= 0) {
			// named part: the key is written out, followed by the literal "="
			bfr.Add_str_a7(">").Add(key).Add_str_a7("</name>").Add_str_a7("=");
		}
		else {
			// indexed part: self-closing name element carrying the index
			bfr.Add_str_a7(" index=\"").Add_int_variable(idx).Add_str_a7("\" />");
		}
		bfr.Add_str_a7("<value>").Add(val).Add_str_a7("</value>").Add_str_a7("</part>");
	}
}

View File

@@ -0,0 +1,36 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
/** Template node: a {@code <template>} element with a title followed by its parts. */
public class Xomw_prepro_node__template extends Xomw_prepro_node__base {
	private final byte[] title;
	private final Xomw_prepro_node__part[] parts;
	private final int line_start;

	public Xomw_prepro_node__template(byte[] title, Xomw_prepro_node__part[] parts, int line_start) {
		this.title = title;
		this.parts = parts;
		this.line_start = line_start;
	}

	public byte[] Title() {return title;}
	public Xomw_prepro_node__part[] Parts() {return parts;}
	public int Line_start() {return line_start;}

	/**
	 * Appends {@code <template>} (with a {@code lineStart="N"} attribute only when line_start > 0),
	 * the title element, each part's XML in array order, then {@code </template>}.
	 */
	@Override public void To_xml(Bry_bfr bfr) {
		bfr.Add_str_a7("<template");
		if (line_start > 0) {
			bfr.Add_str_a7(" lineStart=\"").Add_int_variable(line_start).Add_byte_quote();
		}
		bfr.Add_byte(Byte_ascii.Angle_end);
		bfr.Add_str_a7("<title>").Add(title).Add_str_a7("</title>");
		int parts_len = parts.length;
		for (int i = 0; i < parts_len; i++) {
			parts[i].To_xml(bfr);
		}
		bfr.Add_str_a7("</template>");
	}
}

View File

@@ -0,0 +1,66 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
/**
 * Describes one bracket-like construct: its opening and closing byte sequences,
 * a min / max count, and an int[] of name types looked up by index.
 */
class Xomw_prepro_rule {
	public static final int Name__invalid = -1, Name__null = 0, Name__tmpl = 1, Name__targ = 2;
	private static final byte[] Name__tmpl_bry = Bry_.new_a7("template"), Name__targ_bry = Bry_.new_a7("tplarg");

	public final byte[] bgn;	// opening sequence
	public final byte[] end;	// closing sequence
	public final int min;
	public final int max;
	public final int[] names;	// Name__* constant per index

	public Xomw_prepro_rule(byte[] bgn, byte[] end, int min, int max, int[] names) {
		this.bgn = bgn;
		this.end = end;
		this.min = min;
		this.max = max;
		this.names = names;
	}

	/** Returns true when {@code idx} is below names.length and that slot is not Name__invalid. */
	public boolean Names_exist(int idx) {
		if (idx >= names.length) return false;
		return names[idx] != Name__invalid;
	}

	/** Returns the element name for Name__tmpl ("template") or Name__targ ("tplarg"); null for any other type. */
	public static byte[] Name(int type) {
		if (type == Name__tmpl) return Name__tmpl_bry;
		if (type == Name__targ) return Name__targ_bry;
		return null;	// Name__invalid, Name__null and unknown values
	}
}
/** Entry stored in the element tries: an element type, its name, and the precomputed "</name" bytes. */
class Xomw_prepro_elem {
	public static final int Type__comment = 0, Type__other = 1;
	private static final byte[] Bry__tag_end = Bry_.new_a7("</");

	public final int type;				// Type__comment or Type__other
	public final byte[] name;
	public final byte[] tag_end_lhs;	// "</" followed by name

	public Xomw_prepro_elem(int type, byte[] name) {
		this.type = type;
		this.name = name;
		byte[] end_lhs = Bry_.Add(Bry__tag_end, name);
		this.tag_end_lhs = end_lhs;
	}
}
/** Pairs a byte sequence with a one-byte type code. */
class Xomw_prepro_curchar_itm {
	/** The byte sequence. */
	public byte[] bry;
	/** The type code associated with {@link #bry}. */
	public byte type;

	public Xomw_prepro_curchar_itm(byte[] bry, byte type) {
		this.type = type;
		this.bry = bry;
	}
}

View File

@@ -0,0 +1,170 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
/**
 * Stack of open pieces plus the buffer that output is currently accumulated into.
 * Invariant: when the stack is empty the current accumulator is the root accumulator;
 * otherwise it is the buffer of the top piece's current part.
 */
class Xomw_prepro_stack {
	public List_adp stack = List_adp_.New();
	public Xomw_prepro_piece top;	// last pushed piece; null when the stack is empty
	private Bry_bfr root_accum = Bry_bfr_.New(), accum;
	private final Xomw_prepro_flags flags = new Xomw_prepro_flags();	// single instance reused by Get_flags
	public Xomw_prepro_stack() {
		accum = root_accum;
	}
	/**
	 * Resets the stack to its initial state: no pieces, an empty root accumulator,
	 * and the current accumulator pointing at the root accumulator.
	 */
	public void Clear() {
		stack.Clear();
		// FIX: previously only the *current* accumulator was cleared and it was never re-pointed at root_accum;
		// if pieces were still on the stack, later output went into a buffer owned by a discarded piece
		root_accum.Clear();
		accum = root_accum;
		top = null;
	}
	/** Returns the number of pieces on the stack. */
	public int Count() {return stack.Len();}
	/** Returns the buffer that output should currently be appended to. */
	public Bry_bfr Get_accum() {return accum;}
	/** Returns the root (outermost) buffer. */
	public Bry_bfr Get_root_accum() {return root_accum;}
	/** Returns the top piece's current part, or null when the stack is empty. */
	public Xomw_prepro_part Get_current_part() {
		if (top == null) {
			return null;
		}
		else {
			return top.Get_current_part();
		}
	}
	/** Pushes {@code item}, makes it the top piece and switches the accumulator to its current part. */
	public void Push(Xomw_prepro_piece item) {
		stack.Add(item);
		this.top = item;
		accum = top.Get_accum();
	}
	/**
	 * Removes and returns the top piece; the accumulator switches to the new top piece's
	 * current part, or back to the root accumulator when the stack becomes empty.
	 * Throws when the stack is already empty.
	 */
	public Xomw_prepro_piece Pop() {
		int len = stack.Len();
		if (len == 0) {
			throw Err_.new_wo_type("Xomw_prepro_stack: no elements remaining");
		}
		int last_idx = len - 1;
		Xomw_prepro_piece rv = (Xomw_prepro_piece)stack.Get_at(last_idx);
		stack.Del_at(last_idx);
		if (last_idx > 0) {
			this.top = (Xomw_prepro_piece)stack.Get_at(last_idx - 1);
			accum = top.Get_accum();
		} else {
			this.top = null;
			this.accum = root_accum;
		}
		return rv;
	}
	/** Starts a new part on the top piece, seeded with {@code bry}, and switches the accumulator to it. Requires a non-empty stack. */
	public void Add_part(byte[] bry) {
		top.Add_part(bry);
		accum = top.Get_accum();
	}
	/**
	 * Returns the search flags for the current state: all false when the stack is empty,
	 * otherwise whatever the top piece sets.
	 * NOTE: the same flags instance is returned (and overwritten) on every call.
	 */
	public Xomw_prepro_flags Get_flags() {
		if (stack.Len() == 0) {
			flags.Find_eq = false;
			flags.Find_pipe = false;
			flags.In_heading = false;
			return flags;
		}
		else {
			top.Set_flags(flags);
			return flags;
		}
	}
}
/** Mutable holder for the three search flags; every flag starts out false. */
class Xomw_prepro_flags {
	// whether pipe characters are searched for / whether equals signs are searched for / whether inside a heading
	public boolean Find_pipe, Find_eq, In_heading;
}
/**
 * One open construct on the preprocessor stack (heading or bracket-like construct).
 * It owns a list of parts; output is accumulated into the last ("current") part.
 */
class Xomw_prepro_piece {
	public final byte[] open;                 // Opening character (\n for heading)
	public final byte[] close;                // Matching closing char;
	public int count;                         // Number of opening characters found (number of "=" for heading)
	public final boolean line_start;          // True if the open char appeared at the start of the input line; Not set for headings.
	public final int start_pos;
	public List_adp parts = List_adp_.New();
	/** Creates the piece with a single empty part. */
	public Xomw_prepro_piece(byte[] open, byte[] close, int count, int start_pos, boolean line_start) {
		this.open = open;
		this.close = close;
		this.count = count;
		this.start_pos = start_pos;
		this.line_start = line_start;
		parts.Add(new Xomw_prepro_part(Bry_.Empty));
	}
	/** Discards all parts and starts over with a single empty part. */
	public void Parts__renew() {
		parts.Clear();
		this.Add_part(Bry_.Empty);
	}
	/** Returns the last part in the list. */
	public Xomw_prepro_part Get_current_part() {
		return (Xomw_prepro_part)parts.Get_at(parts.Len() - 1);
	}
	/** Returns the buffer of the current part. */
	public Bry_bfr Get_accum() {
		return Get_current_part().bfr;
	}
	/** Appends a new part whose buffer starts with {@code bry}; it becomes the current part. */
	public void Add_part(byte[] bry) {
		parts.Add(new Xomw_prepro_part(bry));
	}
	public static final byte[] Brack_bgn_bry = Bry_.new_a7("[");
	/**
	 * Fills {@code flags} from this piece:
	 * pipes are searched for unless the piece was opened by "\n" or "[";
	 * equals signs are searched for when pipes are, there is more than one part,
	 * and the current part has not recorded an equals position yet;
	 * In_heading is set when the piece was opened by "\n".
	 */
	public void Set_flags(Xomw_prepro_flags flags) {
		int parts_len = parts.Len();
		boolean open_is_nl = Bry_.Eq(open, Byte_ascii.Nl_bry);
		boolean find_pipe = !open_is_nl && !Bry_.Eq(open, Brack_bgn_bry);
		flags.Find_pipe = find_pipe;
		// FIX: condition was inverted ("Eqpos != -1"). Eqpos == -1 means "no equals seen yet in this part",
		// which is when the search must stay on; corresponds to MediaWiki's "!isset( ...->eqpos )"
		flags.Find_eq = find_pipe && parts_len > 1 && Get_current_part().Eqpos == -1;
		flags.In_heading = open_is_nl;
	}
	/**
	 * Get the output String that would result if the close is not found.
	 * @param tmp_bfr       scratch buffer; used and cleared for non-heading pieces
	 * @param opening_count number of opening sequences to emit; -1 means use {@link #count}
	 */
	public byte[] Break_syntax(Bry_bfr tmp_bfr, int opening_count) {
		if (Bry_.Eq(open, Byte_ascii.Nl_bry)) {
			// FIX: read the current part instead of parts[0]. The constructor always adds an empty first part,
			// so when the heading's "=" run is appended as a further part, parts[0] stays empty
			// and the accumulated heading text was dropped. With a single part both are the same.
			return Get_current_part().bfr.To_bry();
		}
		if (opening_count == -1) {
			opening_count = count;
		}
		tmp_bfr.Add(Bry_.Repeat_bry(open, opening_count));
		// concat parts with "|"
		int len = parts.Len();
		for (int i = 0; i < len; i++) {
			if (i > 0) {
				tmp_bfr.Add_byte_pipe();
			}
			Xomw_prepro_part part = (Xomw_prepro_part)parts.Get_at(i);
			tmp_bfr.Add(part.bfr.To_bry());
		}
		return tmp_bfr.To_bry_and_clear();
	}
}
/** One part of a piece: an output buffer plus three position markers that start at -1. */
class Xomw_prepro_part {
	public final Bry_bfr bfr = Bry_bfr_.New();	// accumulated output for this part
	public int Eqpos = -1;			// -1 until set by the parser
	public int comment_end = -1;	// -1 until set by the parser
	public int visual_end = -1;		// -1 until set by the parser

	/** Creates the part with {@code bry} as the initial buffer content. */
	public Xomw_prepro_part(byte[] bry) {
		this.bfr.Add(bry);
	}
}

View File

@@ -0,0 +1,789 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
import gplx.core.btries.*;
import gplx.xowa.mediawiki.includes.utls.*;
public class Xomw_prepro_wkr { // THREAD.UNSAFE: caching for repeated calls
// scratch buffer for assembling short fragments (e.g. the "<close>" and "<h ...>" strings); emptied via To_bry_and_clear after each use
private final Bry_bfr tmp_bfr = Bry_bfr_.New();
// int[2] position pairs, one per comment in a run of adjacent comments; cleared before each run is collected
private final List_adp comments_list = List_adp_.New();
// element-name tries built by Init_by_wiki: "__y" is used when for_inclusion is true, "__n" when it is false
private final Btrie_slim_mgr elements_trie__y = Btrie_slim_mgr.ci_a7(), elements_trie__n = Btrie_slim_mgr.ci_a7();
// tags whose content is allowed to run to the end of the source when no closing tag is found
private final Hash_adp_bry xmlish_allow_missing_end_tag = Hash_adp_bry.cs().Add_many_str("includeonly", "noinclude", "onlyinclude");
// tag names for which a closing-tag search has already failed; cleared at the start of each Preprocess_to_xml call
private final Hash_adp_bry no_more_closing_tag = Hash_adp_bry.cs();
// stack of open pieces; also supplies the current accumulator
private final Xomw_prepro_stack stack = new Xomw_prepro_stack();
// reusable result holder passed to the trie Match_at calls
private final Btrie_rv trv = new Btrie_rv();
// current output buffer; reassigned from stack.Get_accum() whenever the stack changes
private Bry_bfr accum = Bry_bfr_.New();
/**
 * Rebuilds both element tries from the given list of xmlish element names.
 * The two tries differ only in their ignored-tag set and in the one extra element added.
 */
public void Init_by_wiki(String... xmlish_elems_ary) {
	// trie used when for_inclusion is false: extra element is "includeonly"
	Elements_trie__init_by_wiki(elements_trie__n, ignored_tags_n, xmlish_elems_ary, "includeonly");
	// trie used when for_inclusion is true: extra element is "noinclude"
	Elements_trie__init_by_wiki(elements_trie__y, ignored_tags_y, xmlish_elems_ary, "noinclude");
}
/**
 * Clears {@code trie} and registers, in this order: the comment hook "!--",
 * every name in {@code strip_list_ary}, {@code xmlish_elem}, and every entry of {@code ignored_tags}.
 */
private void Elements_trie__init_by_wiki(Btrie_slim_mgr trie, Ordered_hash ignored_tags, String[] strip_list_ary, String xmlish_elem) {
	trie.Clear();

	// comment opener is the only entry registered as a comment
	Elements_trie__add(trie, Bool_.Y, "!--", "comment");

	// PORTED: $xmlishElements = parser->getStripList();
	int strip_list_len = strip_list_ary.length;
	for (int i = 0; i < strip_list_len; i++) {
		String strip_name = strip_list_ary[i];
		Elements_trie__add(trie, Bool_.N, strip_name, strip_name);
	}

	// PORTED: "$xmlishElements[] = 'noinclude';" or "$xmlishElements[] = 'includeonly';"
	Elements_trie__add(trie, Bool_.N, xmlish_elem, xmlish_elem);

	// PORTED: $xmlishRegex = implode( '|', array_merge( $xmlishElements, $ignoredTags ) );
	for (int i = 0, len = ignored_tags.Count(); i < len; i++) {
		String tag_name = String_.new_u8((byte[])ignored_tags.Get_at(i));
		Elements_trie__add(trie, Bool_.N, tag_name, tag_name);
	}
}
/** Registers {@code hook} in {@code trie}, mapped to a new element of comment / other type named {@code name}. */
private static void Elements_trie__add(Btrie_slim_mgr trie, boolean type_is_comment, String hook, String name) {
	int elem_type = Xomw_prepro_elem.Type__other;
	if (type_is_comment) {
		elem_type = Xomw_prepro_elem.Type__comment;
	}
	Xomw_prepro_elem elem = new Xomw_prepro_elem(elem_type, Bry_.new_a7(name));
	trie.Add_obj(hook, elem);
}
public byte[] Preprocess_to_xml(byte[] src, boolean for_inclusion) {
// RELIC.PROC_VAR: forInclusion = $flags & Parser::PTD_FOR_INCLUSION;
// RELIC.INIT_BY_WIKI: $xmlishElements = parser->getStripList();
// RELIC.CLASS_VAR: $xmlishAllowMissingEndTag = [ 'includeonly', 'noinclude', 'onlyinclude' ];
boolean enable_only_include = false;
// PORTED: rewritten so that all add / del is done in INIT_BY_WIKI
Ordered_hash ignored_tags;
Hash_adp ignored_elements;
Btrie_slim_mgr elements_trie;
if (for_inclusion) {
ignored_tags = ignored_tags_y; // RELIC: $ignoredTags = [ 'includeonly', '/includeonly' ];
ignored_elements = ignored_elements__y; // RELIC: $ignoredElements = [ 'noinclude' ];
// RELIC.INIT_BY_WIKI: $xmlishElements[] = 'noinclude';
if ( Bry_.Has(src, Bry__only_include_bgn)
&& Bry_.Has(src, Bry__only_include_end)) {
enable_only_include = true;
}
elements_trie = elements_trie__y;
}
else {
ignored_tags = ignored_tags_n; // $ignoredTags = [ 'noinclude', '/noinclude', 'onlyinclude', '/onlyinclude' ];
ignored_elements = ignored_elements__n; // $ignoredElements = [ 'includeonly' ];
// RELIC.INIT_BY_WIKI: $xmlishElements[] = 'includeonly';
elements_trie = elements_trie__n;
}
// RELIC.INIT_BY_WIKI: $xmlishRegex = implode( '|', array_merge( $xmlishElements, $ignoredTags ) );
// RELIC.REGEX
// Use "A" modifier (anchored) instead of "^", because ^ doesn't work with an offset
// $elementsRegex = "~($xmlishRegex)(?:\s|\/>|>)|(!--)~iA";
stack.Clear();
// RELIC.REGEX:
// $searchBase = "[{<\n"; # }
// RELIC.BRY_FIND
// For fast reverse searches
// $revText = strrev( $text );
// $lengthText = strlen( $text );
// Input pointer, starts out pointing to a pseudo-newline before the start
int i = 0;
// Current accumulator
accum = stack.Get_accum();
accum.Add_str_a7("<root>");
// True to find equals signs in arguments
boolean find_equals = false;
// True to take notice of pipe characters
boolean find_pipe = false;
int heading_index = 1;
// True if $i is inside a possible heading
boolean in_heading = false;
// True if there are no more greater-than (>) signs right of $i
boolean no_more_gt = false;
// Map of tag name => true if there are no more closing tags of given type right of $i
no_more_closing_tag.Clear();
// True to ignore all input up to the next <onlyinclude>
boolean find_only_include = enable_only_include;
// Do a line-start run without outputting an LF character
boolean fake_line_start = true;
// XOWA: init
int src_len = src.length;
int found = -1;
byte[] cur_char = Bry_.Empty;
byte[] cur_closing = Bry_.Empty;
byte[] inner = null;
Xomw_prepro_rule rule = null;
while (true) {
if (find_only_include) {
// Ignore all input up to the next <onlyinclude>
int start_pos = Bry_find_.Find_fwd(src, Bry__only_include_bgn, i, src_len);
if (start_pos == Bry_find_.Not_found) {
// Ignored section runs to the end
accum.Add_str_a7("<ignore>").Add_bry_escape_html(src, i, src_len).Add_str_a7("</ignore>");
break;
}
int tag_end_pos = start_pos + Bry__only_include_bgn.length; // past-the-end
accum.Add_str_a7("<ignore>").Add_bry_escape_html(src, i, tag_end_pos).Add_str_a7("</ignore>");
i = tag_end_pos;
find_only_include = false;
}
if (fake_line_start) {
found = Found__line_bgn;
cur_char = Bry_.Empty;
}
else {
// Find next opening brace, closing brace or pipe
// RELIC.REGEX: $search = $searchBase;
if (stack.top == null) {
cur_closing = Bry_.Empty;
}
else {
cur_closing = stack.top.close;
// RELIC.REGEX: $search .= $currentClosing;
}
if (find_pipe) {
// RELIC.REGEX: $search .= '|';
}
if (find_equals) {
// First equals will be for the template
// RELIC.REGEX: $search .= '=';
}
// Output literal section, advance input counter
// PORTED: "$literalLength = strcspn(src, $search, i)"; NOTE: no trie b/c of frequent changes to $search
int literal_len = 0;
boolean loop_stop = false;
// loop chars until search_char is found
for (int j = i; j < src_len; j++) {
byte b = src[j];
switch (b) { // handle '$searchBase = "[{<\n";'
case Byte_ascii.Brack_bgn:
case Byte_ascii.Curly_bgn:
case Byte_ascii.Angle_bgn:
case Byte_ascii.Nl:
loop_stop = true;
break;
case Byte_ascii.Pipe: // handle "find_pipe"
if (find_pipe) loop_stop = true;
break;
case Byte_ascii.Eq: // handle "find_equals"
if (find_equals) loop_stop = true;
break;
default: // handle "cur_closing"; specified by piece.close and rule.close, so "\n", "}", "]" and "}-"
if (cur_closing != Bry_.Empty) {
byte cur_closing_0 = cur_closing[0];
if (b == cur_closing_0) {
if (cur_closing.length == 1) { // handle "\n", "}", "]"
loop_stop = true;
}
else {// handle "}-"
int nxt_idx = j + 1;
if (nxt_idx < src_len && src[nxt_idx] == Byte_ascii.Dash)
loop_stop = true;
}
}
}
break;
}
if (loop_stop)
break;
else
literal_len++;
}
if (literal_len > 0) {
accum.Add_bry_escape_html(src, i, i + literal_len);
i += literal_len;
}
if (i >= src_len) {
if (Bry_.Eq(cur_closing, Byte_ascii.Nl_bry)) {
// Do a past-the-end run to finish off the heading
cur_char = Bry_.Empty;
found = Found__line_end;
}
else {
// All done
break;
}
}
else {
// PORTED: "if ( $curChar == '|' ) {", etc..
Xomw_prepro_curchar_itm cur_char_itm = (Xomw_prepro_curchar_itm)cur_char_trie.Match_at(trv, src, i, src_len);
if (cur_char_itm != null) {
cur_char = cur_char_itm.bry;
switch (cur_char_itm.type) {
case Byte_ascii.Pipe: found = Found__pipe; break;
case Byte_ascii.Eq: found = Found__equals; break;
case Byte_ascii.Angle_bgn: found = Found__angle; break;
case Byte_ascii.Nl: found = in_heading ? Found__line_end : Found__line_bgn; break;
// PORTED: "elseif ( $curChar == $currentClosing )"
case Byte_ascii.Curly_end: found = Found__close; break;
case Byte_ascii.Brack_end: found = Found__close; break;
case Byte_ascii.At: found = Found__close; break; // NOTE: At is type for "}-"
// PORTED: "elseif ( isset( $this->rules[$curChar] ) )"
case Byte_ascii.Curly_bgn: {found = Found__open; rule = rule_curly; break;}
case Byte_ascii.Brack_bgn: {found = Found__open; rule = rule_brack; break;}
case Byte_ascii.Dash: {found = Found__open; rule = rule_langv; break;}
}
}
else {
i++;
continue;
}
}
}
if (found == Found__angle) {
// Handle </onlyinclude>
if ( enable_only_include
&& Bry_.Eq(src, i, i + Len__only_include_end, Bry__only_include_end)) {
find_only_include = true;
continue;
}
// Determine element name
// PORTED: $elementsRegex = "~($xmlishRegex)(?:\s|\/>|>)|(!--)~iA"; EX: "(pre|ref)(?:\s|\/>|>)|(!--)
Xomw_prepro_elem element = (Xomw_prepro_elem)elements_trie.Match_at(trv, src, i + 1, src_len);
if (element == null) {
// Element name missing or not listed
accum.Add(Bry__escaped_lt);
i++;
continue;
}
// Handle comments
if (element.type == Xomw_prepro_elem.Type__comment) {
// To avoid leaving blank lines, when a sequence of
// space-separated comments is both preceded and followed by
// a newline (ignoring spaces), then
// trim leading and trailing spaces and the trailing newline.
// Find the end
int end_pos = Bry_find_.Find_fwd(src, Bry__comment_end, i + 4, src_len);
if (end_pos == Bry_find_.Not_found) {
// Unclosed comment in input, runs to end
accum.Add_str_a7("<comment>").Add_bry_escape_html(src, i, src_len).Add_str_a7("</comment>");
i = src_len;
}
else {
// Search backwards for leading whitespace
int ws_bgn = i > 0 ? i - Php_str_.Strspn_bwd__space_or_tab(src, i, -1) : 0;
// Search forwards for trailing whitespace
// $wsEnd will be the position of the last space (or the '>' if there's none)
int ws_end = end_pos + 2 + Php_str_.Strspn_fwd__space_or_tab(src, end_pos + 3, -1, src_len);
// Keep looking forward as long as we're finding more
// comments.
comments_list.Clear();
comments_list.Add(new int[] {ws_bgn, ws_end});
while (ws_end + 5 < src_len && Bry_.Eq(src, ws_end + 1, ws_end + 5, Bry__comment_bgn)) {
int cur_char_pos = Bry_find_.Find_fwd(src, Bry__comment_end, ws_end + 4);
if (cur_char_pos == Bry_find_.Not_found) {
break;
}
cur_char_pos = cur_char_pos + 2 + Php_str_.Strspn_fwd__space_or_tab(src, cur_char_pos + 3, -1, src_len);
comments_list.Add(new int[] {ws_end + 1, cur_char_pos});
ws_end = cur_char_pos;
}
// Eat the line if possible
// TODO: This could theoretically be done if $wsStart == 0, i.e. for comments at
// the overall start. That's not how Sanitizer::removeHTMLcomments() did it, but
// it's a possible beneficial b/c break.
int bgn_pos = -1;
if ( ws_bgn > 0
&& Bry_.Eq(src, ws_bgn - 1, ws_bgn , Byte_ascii.Nl_bry)
&& Bry_.Eq(src, ws_end + 1, ws_end + 2, Byte_ascii.Nl_bry)
) {
// Remove leading whitespace from the end of the accumulator
// Sanity check first though
int ws_len = i - ws_bgn;
int accum_len = accum.Len();
if ( ws_len > 0
&& Php_str_.Strspn_fwd__space_or_tab(accum.Bfr(), accum_len - ws_len, -1, accum_len) == ws_len) {
accum.Del_by(ws_len);
}
// Dump all but the last comment to the accumulator
int comments_list_len = comments_list.Len();
for (int j = 0; j < comments_list_len; j++) {
int[] com = (int[])comments_list.Get_at(j);
bgn_pos = com[0];
end_pos = com[1] + 1;
if (j == comments_list_len - 1) {
break;
}
inner = Bry_.Mid(src, bgn_pos, end_pos);
accum.Add_str_a7("<comment>").Add_bry_escape_html(inner).Add_str_a7("</comment>");
}
// Do a line-start run next time to look for headings after the comment
fake_line_start = true;
}
else {
// No line to eat, just take the comment itself
bgn_pos = i;
end_pos += 2;
}
if (stack.top != null) {
Xomw_prepro_part part = stack.top.Get_current_part();
if (!(part.comment_end != -1 && part.comment_end == ws_bgn - 1)) {
part.visual_end = ws_bgn;
}
// Else comments abutting, no change in visual end
part.comment_end = end_pos;
}
i = end_pos + 1;
inner = Bry_.Mid(src, bgn_pos, end_pos + 1);
accum.Add_str_a7("<comment>").Add_bry_escape_html(inner).Add_str_a7("</comment>");
}
continue;
}
byte[] name = element.name;
// RELIC.BTRIE_CI: $lowerName = strtolower( $name );
int atr_bgn = i + name.length + 1;
// Find end of tag
int tag_end_pos = no_more_gt ? Bry_find_.Not_found : Bry_find_.Find_fwd(src, Byte_ascii.Angle_end, atr_bgn);
if (tag_end_pos == Bry_find_.Not_found) {
// Infinite backtrack
// Disable tag search to prevent worst-case O(N^2) performance
no_more_gt = true;
accum.Add(Bry__escaped_lt);
i++;
continue;
}
// Handle ignored tags
if (ignored_tags.Has(name)) {
accum.Add_str_a7("<ignore>").Add_bry_escape_html(src, i, tag_end_pos + 1).Add_str_a7("</ignore>");
i = tag_end_pos + 1;
continue;
}
int tag_bgn_pos = i;
int atr_end = -1;
byte[] close = null;
if (src[tag_end_pos - 1] == Byte_ascii.Slash) {
atr_end = tag_end_pos - 1;
inner = null;
i = tag_end_pos + 1;
close = Bry_.Empty;
}
else {
atr_end = tag_end_pos;
// Find closing tag
// PORTED: `preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",`
boolean elem_end_found = false;
int elem_end_lhs = -1, elem_end_rhs = -1;
int elem_end_cur = tag_end_pos + 1;
while (true) {
// search for "</"
elem_end_lhs = Bry_find_.Find_fwd(src, Bry__end_lhs, elem_end_cur, src_len);
if (elem_end_lhs == Bry_find_.Not_found) {
break;
}
// verify $name
elem_end_cur = elem_end_lhs + 2; // 2="</"
int elem_end_tmp = elem_end_cur + name.length;
if (!Bry_.Eq_ci_a7(name, src, elem_end_cur, elem_end_tmp)) {
continue;
}
// verify "\s*>"
elem_end_cur = elem_end_tmp;
elem_end_cur = Bry_find_.Find_fwd_while(src, elem_end_cur, src_len, Byte_ascii.Space);
if (elem_end_cur == src_len) { // just "\s", but no ">"
break;
}
if (src[elem_end_cur] == Byte_ascii.Gt) {
elem_end_rhs = elem_end_cur + 1;
elem_end_found = true;
break;
}
}
if ( !no_more_closing_tag.Has(name)
&& elem_end_found) {
inner = Bry_.Mid(src, tag_end_pos + 1, elem_end_lhs);
i = elem_end_rhs;
tmp_bfr.Add_str_a7("<close>").Add_bry_escape_html(src, elem_end_lhs, elem_end_rhs).Add_str_a7("</close>");
close = tmp_bfr.To_bry_and_clear();
}
else {
// No end tag
if (xmlish_allow_missing_end_tag.Has(name)) {
// Let it run out to the end of the src.
inner = Bry_.Mid(src, tag_end_pos + 1);
i = src_len;
close = Bry_.Empty;
}
else {
// Don't match the tag, treat opening tag as literal and resume parsing.
i = tag_end_pos + 1;
accum.Add_bry_escape_html(src, tag_bgn_pos, tag_end_pos + 1);
// Cache results, otherwise we have O(N^2) performance for input like <foo><foo><foo>...
no_more_closing_tag.Add_if_dupe_use_nth(name, name);
continue;
}
}
}
// <includeonly> and <noinclude> just become <ignore> tags
if (ignored_elements.Has(name)) {
accum.Add_str_a7("<ignore>").Add_bry_escape_html(src, tag_bgn_pos, i).Add_str_a7("</ignore>");
continue;
}
accum.Add_str_a7("<ext>");
// PORTED:
// if ( $attrEnd <= $attrStart ) {
// $attr = '';
// } else {
// $attr = substr( $text, $attrStart, $attrEnd - $attrStart );
// }
accum.Add_str_a7("<name>").Add(name).Add_str_a7("</name>");
// Note that the attr element contains the whitespace between name and attribute,
// this is necessary for precise reconstruction during pre-save transform.
accum.Add_str_a7("<attr>");
if (atr_end > atr_bgn)
accum.Add_bry_escape_html(src, atr_bgn, atr_end);
accum.Add_str_a7("</attr>");
if (inner != null) {
accum.Add_str_a7("<inner>").Add_bry_escape_html(inner).Add_str_a7("</inner>");
}
accum.Add(close).Add_str_a7("</ext>");
}
else if (found == Found__line_bgn) {
// Is this the start of a heading?
// Line break belongs before the heading element in any case
if (fake_line_start) {
fake_line_start = false;
} else {
accum.Add(cur_char);
i++;
}
int count = Php_str_.Strspn_fwd__byte(src, Byte_ascii.Eq, i, 6, src_len);
if (count == 1 && find_equals) { // EX: "{{a|\n=b=\n"
// DWIM: This looks kind of like a name/value separator.
// Let's let the equals handler have it and break the
// potential heading. This is heuristic, but AFAICT the
// methods for completely correct disambiguation are very
// complex.
}
else if (count > 0) {
Xomw_prepro_piece piece = new Xomw_prepro_piece(Byte_ascii.Nl_bry, Byte_ascii.Nl_bry, count, i, false);
piece.Add_part(Bry_.Repeat(Byte_ascii.Eq, count));
stack.Push(piece);
accum = stack.Get_accum();
Xomw_prepro_flags flags = stack.Get_flags();
find_pipe = flags.Find_pipe;
find_equals = flags.Find_eq;
in_heading = flags.In_heading;
i += count;
}
}
else if (found == Found__line_end) {
Xomw_prepro_piece piece = stack.top;
// A heading must be open, otherwise \n wouldn't have been in the search list
if (!Bry_.Eq(piece.open, Byte_ascii.Nl_bry)) throw Err_.new_wo_type("assertion:piece must start with \\n");
Xomw_prepro_part part = piece.Get_current_part();
// Search back through the input to see if it has a proper close.
// Do this using the reversed String since the other solutions
// (end anchor, etc.) are inefficient.
int ws_len = Php_str_.Strspn_bwd__space_or_tab(src, src_len - i, -1);
int search_bgn = i - ws_len;
if (part.comment_end != -1 && search_bgn -1 == part.comment_end) {
// Comment found at line end
// Search for equals signs before the comment
search_bgn = part.visual_end;
search_bgn = Bry_find_.Find_bwd__while_space_or_tab(src, search_bgn, 0);
search_bgn -= Php_str_.Strspn_bwd__space_or_tab(src, search_bgn, -1);
}
int count = piece.count;
int eq_len = Php_str_.Strspn_bwd__byte(src, Byte_ascii.Eq, search_bgn, -1);
byte[] element = Bry_.Empty;
if (eq_len > 0) {
if (search_bgn - eq_len == piece.start_pos) {
// This is just a single String of equals signs on its own line
// Replicate the doHeadings behavior /={count}(.+)={count}/
// First find out how many equals signs there really are (don't stop at 6)
count = eq_len;
if (count < 3) {
count = 0;
}
else {
count = (count - 1) / 2;
if (count > 6) count = 6;
}
}
else {
if (eq_len < count) count = eq_len; // PORTED: $count = min( $equalsLength, $count );
}
if (count > 0) {
// Normal match, output <h>
element = tmp_bfr.Add_str_a7("<h level=\"").Add_int_variable(count).Add_str_a7("\" i=\"").Add_int_variable(heading_index).Add_str_a7("\">").Add_bfr_and_preserve(accum).Add_str_a7("</h>").To_bry_and_clear();
heading_index++;
} else {
// Single equals sign on its own line, count=0
element = accum.To_bry();
}
}
else {
// No match, no <h>, just pass down the inner src
element = accum.To_bry();
}
// Unwind the stack
stack.Pop();
accum = stack.Get_accum();
Xomw_prepro_flags flags = stack.Get_flags();
find_pipe = flags.Find_pipe;
find_equals = flags.Find_eq;
in_heading = flags.In_heading;
// Append the result to the enclosing accumulator
accum.Add(element);
// Note that we do NOT increment the input pointer.
// This is because the closing linebreak could be the opening linebreak of
// another heading. Infinite loops are avoided because the next iteration MUST
// hit the heading open case above, which unconditionally increments the
// input pointer.
}
else if (found == Found__open) {
// count opening brace characters
int count = Php_str_.Strspn_fwd__byte(src, cur_char[0], i, -1, src_len); // NOTE: don't know how MediaWiki will handle "-{"
// we need to add to stack only if opening brace count is enough for one of the rules
if (count >= rule.min) {
// Add it to the stack
Xomw_prepro_piece piece = new Xomw_prepro_piece(cur_char, rule.end, count, -1, i > 0 && src[i - 1] == Byte_ascii.Nl);
stack.Push(piece);
accum = stack.Get_accum();
Xomw_prepro_flags flags = stack.Get_flags();
find_pipe = flags.Find_pipe;
find_equals = flags.Find_eq;
in_heading = flags.In_heading;
}
else {
// Add literal brace(s)
for (int j = 0; j < count; j++)
accum.Add_bry_escape_html(cur_char);
}
i += count;
}
else if (found == Found__close) {
Xomw_prepro_piece piece = stack.top;
// lets check if there are enough characters for closing brace
int max_count = piece.count;
int count = Php_str_.Strspn_fwd__byte(src, cur_char[0], i, max_count, src_len);
// check for maximum matching characters (if there are 5 closing characters, we will probably need only 3 - depending on the rules)
rule = Get_rule(piece.open);
int matching_count = -1;
if (count > rule.max) {
// The specified maximum exists in the callback array, unless the caller
// has made an error
matching_count = rule.max;
}
else {
// Count is less than the maximum
// Skip any gaps in the callback array to find the true largest match
// Need to use array_key_exists not isset because the callback can be null
matching_count = count;
while (matching_count > 0 && !rule.Names_exist(matching_count)) {
matching_count--;
}
}
if (matching_count <= 0) {
// No matching element found in callback array
// Output a literal closing brace and continue
for (int j = 0; j < count; j++)
accum.Add_bry_escape_html(cur_char);
i += count;
continue;
}
int name_type = rule.names[matching_count];
byte[] element = null;
if (name_type == Xomw_prepro_rule.Name__null) {
// No element, just literal text
tmp_bfr.Add(piece.Break_syntax(tmp_bfr, matching_count));
element = tmp_bfr.Add(Bry_.Repeat_bry(rule.end, matching_count)).To_bry_and_clear();
}
else {
// Create XML element
// Note: $parts is already XML, does not need to be encoded further
List_adp parts = piece.parts;
byte[] title = ((Xomw_prepro_part)parts.Get_at(0)).bfr.To_bry_and_clear();
parts.Del_at(0);
// The invocation is at the start of the line if lineStart is set in
// the stack, and all opening brackets are used up.
byte[] attr = null;
if (max_count == matching_count && piece.line_start) { // RELIC:!empty( $piece->lineStart )
attr = Bry_.new_a7(" lineStart=\"1\"");
}
else {
attr = Bry_.Empty;
}
byte[] name_bry = Xomw_prepro_rule.Name(name_type);
tmp_bfr.Add_str_a7("<").Add(name_bry).Add(attr).Add_str_a7(">");
tmp_bfr.Add_str_a7("<title>").Add(title).Add_str_a7("</title>");
int arg_idx = 1;
int parts_len = parts.Len();
for (int j = 0; j < parts_len; j++) {
Xomw_prepro_part part = (Xomw_prepro_part)parts.Get_at(j);
if (part.Eqpos != -1) {
Bry_bfr part_bfr = part.bfr;
byte[] part_bfr_bry = part_bfr.Bfr();
tmp_bfr.Add_str_a7("<part><name>").Add_mid(part_bfr_bry, 0, part.Eqpos);
tmp_bfr.Add_str_a7("</name>=<value>").Add_mid(part_bfr_bry, part.Eqpos + 1, part_bfr.Len());
tmp_bfr.Add_str_a7("</value></part>");
}
else {
tmp_bfr.Add_str_a7("<part><name index=\"").Add_int_variable(arg_idx).Add_str_a7("\" /><value>").Add(part.bfr.To_bry()).Add_str_a7("</value></part>");
arg_idx++;
}
}
element = tmp_bfr.Add_str_a7("</").Add(name_bry).Add_str_a7(">").To_bry_and_clear();
}
// Advance input pointer
i += matching_count;
// Unwind the stack
stack.Pop();
accum = stack.Get_accum();
// Re-add the old stack element if it still has unmatched opening characters remaining
if (matching_count < piece.count) {
piece.Parts__renew(); // PORTED: piece.parts = [ new PPDPart ];
piece.count -= matching_count;
// do we still qualify for any callback with remaining count?
int min = Get_rule(piece.open).min;
if (piece.count >= min) {
stack.Push(piece);
accum = stack.Get_accum();
}
else {
accum.Add(Bry_.Repeat_bry(piece.open, piece.count));
}
}
Xomw_prepro_flags flags = stack.Get_flags();
find_pipe = flags.Find_pipe;
find_equals = flags.Find_eq;
in_heading = flags.In_heading;
// Add XML element to the enclosing accumulator
accum.Add(element);
}
else if (found == Found__pipe) {
find_equals = true; // shortcut for getFlags()
stack.Add_part(Bry_.Empty);
accum = stack.Get_accum();
i++;
}
else if (found == Found__equals) {
find_equals = false; // shortcut for getFlags()
stack.Get_current_part().Eqpos = accum.Len();
accum.Add_byte(Byte_ascii.Eq);
i++;
}
}
// Output any remaining unclosed brackets
Bry_bfr root_accum = stack.Get_root_accum();
int stack_len = stack.stack.Len();
for (int j = 0; j < stack_len; j++) {
Xomw_prepro_piece piece = (Xomw_prepro_piece)stack.stack.Get_at(j);
root_accum.Add(piece.Break_syntax(tmp_bfr, -1));
}
root_accum.Add_str_a7("</root>");
return root_accum.To_bry_and_clear();
}
// Maps an opening-bracket byte sequence to its rule by comparing it with each rule's `bgn`.
// Rules are probed in a fixed order (curly, bracket, language-variant); anything else is an error.
private Xomw_prepro_rule Get_rule(byte[] bry) {
	if (Bry_.Eq(bry, rule_curly.bgn)) {
		return rule_curly;
	}
	if (Bry_.Eq(bry, rule_brack.bgn)) {
		return rule_brack;
	}
	if (Bry_.Eq(bry, rule_langv.bgn)) {
		return rule_langv;
	}
	// no rule starts with these bytes
	throw Err_.new_unhandled(bry);
}
// Codes for the kind of token located at the current input position.
// The main preprocess loop branches on these (e.g. `found == Found__line_end`, `found == Found__open`).
private static final int
Found__line_bgn = 0
, Found__line_end = 1
, Found__pipe = 2
, Found__equals = 3
, Found__angle = 4
, Found__close = 5
, Found__open = 6
;
// Bracket rules handed out by Get_rule. As used in this file, a rule exposes:
// `bgn` (opening bytes), `end` (closing bytes), `min` / `max` (bracket-count bounds), and
// `names`, which is indexed by the number of matched bracket characters to pick the element type.
// NOTE(review): the two int args are presumably (min, max) in that order -- confirm against Xomw_prepro_rule's constructor.
private static final Xomw_prepro_rule
rule_curly = new Xomw_prepro_rule(Bry_.new_a7("{"), Bry_.new_a7("}") , 2, 3, new int[] {Xomw_prepro_rule.Name__invalid, Xomw_prepro_rule.Name__invalid, Xomw_prepro_rule.Name__tmpl, Xomw_prepro_rule.Name__targ})
, rule_brack = new Xomw_prepro_rule(Bry_.new_a7("["), Bry_.new_a7("]") , 2, 2, new int[] {Xomw_prepro_rule.Name__invalid, Xomw_prepro_rule.Name__invalid, Xomw_prepro_rule.Name__null})
, rule_langv = new Xomw_prepro_rule(Bry_.new_a7("-{"), Bry_.new_a7("}-"), 1, 1, new int[] {Xomw_prepro_rule.Name__invalid, Xomw_prepro_rule.Name__null})
;
// Literal byte sequences for the markup this class looks for or emits.
private static final byte[]
Bry__only_include_bgn = Bry_.new_a7("<onlyinclude>")
, Bry__only_include_end = Bry_.new_a7("</onlyinclude>")
, Bry__comment_bgn = Bry_.new_a7("<!--")
, Bry__comment_end = Bry_.new_a7("-->")
, Bry__escaped_lt = Bry_.new_a7("&lt;")
, Bry__end_lhs = Bry_.new_a7("</")
;
// Byte length of "</onlyinclude>", computed once.
private static final int Len__only_include_end = Bry__only_include_end.length;
// Lookup of the special characters / sequences registered in Cur_char_trie__new.
private static final Btrie_slim_mgr cur_char_trie = Cur_char_trie__new();
// Tag-name tables for include-related tags (closing forms are listed with a leading "/").
// NOTE(review): the `_y` / `_n` suffixes presumably select the table by the for-inclusion flag -- confirm at the use site.
private static final Ordered_hash
ignored_tags_y = Ordered_hash_.New_bry().Add_many_str("includeonly", "/includeonly")
, ignored_tags_n = Ordered_hash_.New_bry().Add_many_str("noinclude", "/noinclude", "onlyinclude", "/onlyinclude");
// Element-name tables built with Hash_adp_bry.cs(); same `_y` / `_n` naming as the tag tables above.
private static final Hash_adp_bry
ignored_elements__y = Hash_adp_bry.cs().Add_many_str("noinclude")
, ignored_elements__n = Hash_adp_bry.cs().Add_many_str("includeonly");
// Builds the trie of special tokens. Every token is registered under its own bytes with an item
// that carries those bytes plus the token's first byte; the "}-" closer is the one exception.
private static Btrie_slim_mgr Cur_char_trie__new() {
	Btrie_slim_mgr trie = Btrie_slim_mgr.ci_a7();
	String[] tokens = new String[] {"|", "=", "<", "\n", "{", "[", "-{", "}", "]"};
	int tokens_len = tokens.length;
	for (int idx = 0; idx < tokens_len; idx++) {
		byte[] token = Bry_.new_a7(tokens[idx]);
		Xomw_prepro_curchar_itm itm = new Xomw_prepro_curchar_itm(token, token[0]);
		trie.Add_obj(token, itm);
	}
	// "}-" cannot be keyed by its first byte (that is plain "}"), so it is tagged with Byte_ascii.At instead
	byte[] langv_close = Bry_.new_a7("}-");
	trie.Add_obj(langv_close, new Xomw_prepro_curchar_itm(langv_close, Byte_ascii.At));
	return trie;
}
}

View File

@@ -0,0 +1,235 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.prepros; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
import org.junit.*;
// Tests for Xomw_prepro_wkr: each case feeds wikitext to the preprocessor and compares the generated XML.
// "COVERS" notes quote the comment / code fragment of the preprocessor branch a case is aimed at.
public class Xomw_prepro_wkr__tst {
	private final Xomw_prepro_wkr__fxt fxt = new Xomw_prepro_wkr__fxt();

	@Test public void Text() {
		fxt.Test__parse("abc", "<root>abc</root>");
	}

	@Test public void Brack() {
		fxt.Test__parse("a[[b]]c", "<root>a[[b]]c</root>");
	}

	// COVERS: "Add literal brace(s)"
	@Test public void Brack__one() {
		fxt.Test__parse("a[b]c", "<root>a[b]c</root>");
	}

	// COVERS: "The specified maximum exists in the callback array, unless the caller"
	@Test public void Brack__max() {
		fxt.Test__parse("a[[[[[b]]]]]c", "<root>a[[[[[b]]]]]c</root>");
	}

	@Test public void Template() {
		fxt.Test__parse("a{{b}}c", "<root>a<template><title>b</title></template>c</root>");
	}

	// positional args get an incrementing index
	@Test public void Template__args__idx() {
		fxt.Test__parse("a{{b|c|d}}e", "<root>a<template><title>b</title><part><name index=\"1\" /><value>c</value></part><part><name index=\"2\" /><value>d</value></part></template>e</root>");
	}

	// named args are split at the equals sign
	@Test public void Template__args__key() {
		fxt.Test__parse("a{{b|c=d}}e", "<root>a<template><title>b</title><part><name>c</name>=<value>d</value></part></template>e</root>");
	}

	// COVERS: "The invocation is at the start of the line if lineStart is set in"
	@Test public void Template__line_start() {
		String src = String_.Concat_lines_nl_skip_last
		( "a"
		, "{{b}}"
		);
		String expd = String_.Concat_lines_nl_skip_last
		( "<root>a"
		, "<template lineStart=\"1\"><title>b</title></template></root>"
		);
		fxt.Test__parse(src, expd);
	}

	// COVERS: "do we still qualify for any callback with remaining count?"
	@Test public void Template__max() {
		fxt.Test__parse("a{{{{{b}}}}}c", "<root>a<template><title><tplarg><title>b</title></tplarg></title></template>c</root>");
	}

	@Test public void Tplarg() {
		fxt.Test__parse("a{{{b}}}c", "<root>a<tplarg><title>b</title></tplarg>c</root>");
	}

	@Test public void Tplarg__dflt() {
		fxt.Test__parse("a{{{b|c}}}d", "<root>a<tplarg><title>b</title><part><name index=\"1\" /><value>c</value></part></tplarg>d</root>");
	}

	@Test public void Comment() {
		fxt.Test__parse("a<!--b-->c", "<root>a<comment>&lt;!--b--&gt;</comment>c</root>");
	}

	// COVERS: "Unclosed comment in input, runs to end"
	@Test public void Comment__dangling() {
		fxt.Test__parse("a<!--b", "<root>a<comment>&lt;!--b</comment></root>");
	}

	// COVERS: "Search backwards for leading whitespace"
	// NOTE: space is outside comment
	@Test public void Comment__ws() {
		fxt.Test__parse("a <!--b--> c", "<root>a <comment>&lt;!--b--&gt;</comment> c</root>");
	}

	// COVERS: "Dump all but the last comment to the accumulator"
	// NOTE: space is outside comment
	@Test public void Comment__many__ws() {
		fxt.Test__parse("a <!--1--> <!--2--> z", "<root>a <comment>&lt;!--1--&gt;</comment> <comment>&lt;!--2--&gt;</comment> z</root>");
	}

	// COVERS: "Eat the line if possible"
	// NOTE: space is inside </comment> if flanked by nl
	@Test public void Comment__nl__ws() {
		String src = String_.Concat_lines_nl_skip_last
		( "a"
		, " <!--1--> "
		, " <!--2--> "
		, "z"
		);
		String expd = String_.Concat_lines_nl_skip_last
		( "<root>a"
		, "<comment> &lt;!--1--&gt; "
		, "</comment><comment> &lt;!--2--&gt; "
		, "</comment>z</root>"
		);
		fxt.Test__parse(src, expd);
	}

	// COVERS.ALSO: "Note that the attr element contains the whitespace between name and attribute,"
	@Test public void Ext() {
		fxt.Test__parse("a<pre id=\"1\">b</pre>c", "<root>a<ext><name>pre</name><attr> id=&quot;1&quot;</attr><inner>b</inner><close>&lt;/pre&gt;</close></ext>c</root>");
	}

	// COVERS: "if ( $text[$tagEndPos - 1] == '/' ) {"
	@Test public void Ext__inline() {
		fxt.Test__parse("a<pre/>b", "<root>a<ext><name>pre</name><attr></attr></ext>b</root>");
		fxt.Test__parse("a<pre />b", "<root>a<ext><name>pre</name><attr> </attr></ext>b</root>");
	}

	// COVERS: "\s*" in `preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",`
	@Test public void Ext__end__pass__space() {
		fxt.Test__parse("a<pre>b</pre >c", "<root>a<ext><name>pre</name><attr></attr><inner>b</inner><close>&lt;/pre &gt;</close></ext>c</root>");
	}

	// COVERS: "\s*" in `preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",`
	@Test public void Ext__end__pass__name() {
		fxt.Test__parse("a<pre>b</pro></pre>c", "<root>a<ext><name>pre</name><attr></attr><inner>b&lt;/pro&gt;</inner><close>&lt;/pre&gt;</close></ext>c</root>");
	}

	// COVERS: "\s*" in `preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",`
	@Test public void Ext__end__fail__angle() {
		fxt.Test__parse("a<pre>b</pre c", "<root>a&lt;pre&gt;b&lt;/pre c</root>");
	}

	// COVERS: "Let it run out to the end of the text."
	@Test public void Ext__dangling() {
		fxt.Test__parse("a<pre>bc", "<root>a&lt;pre&gt;bc</root>");
	}

	// COVERS: "Cache results, otherwise we have O(N^2) performance for input like <foo><foo><foo>..."
	@Test public void Ext__dangling__many() {
		fxt.Test__parse("a<pre><pre><pre>bc", "<root>a&lt;pre&gt;&lt;pre&gt;&lt;pre&gt;bc</root>");
	}

	// COVERS: "Infinite backtrack"
	@Test public void Ext__unclosed() {
		fxt.Test__parse("a<pre bcd", "<root>a&lt;pre bcd</root>");
	}

	// COVERS: "<includeonly> and <noinclude> just become <ignore> tags"
	@Test public void Ext__noinclude() {
		fxt.Init__for_inclusion_(Bool_.N);
		fxt.Test__parse("a<includeonly>b<noinclude>c</noinclude>d</includeonly>e", "<root>a<ignore>&lt;includeonly&gt;b&lt;noinclude&gt;c&lt;/noinclude&gt;d&lt;/includeonly&gt;</ignore>e</root>");
	}

	@Test public void Heading() {
		String src = String_.Concat_lines_nl_skip_last
		( "a"
		, "== b1 =="
		, "z"
		);
		String expd = String_.Concat_lines_nl_skip_last
		( "<root>a"
		, "<h level=\"2\" i=\"1\">== b1 ==</h>"
		, "z</root>"
		);
		fxt.Test__parse(src, expd);
	}

	// heading is the last line and has no trailing newline
	@Test public void Heading__eos__no_nl() {
		String src = String_.Concat_lines_nl_skip_last
		( "a"
		, "== b1 =="
		);
		String expd = String_.Concat_lines_nl_skip_last
		( "<root>a"
		, "<h level=\"2\" i=\"1\">== b1 ==</h></root>"
		);
		fxt.Test__parse(src, expd);
	}

	// COVERS: "Is this the start of a heading?"
	@Test public void Heading__bos__implied_nl() {
		String src = String_.Concat_lines_nl_skip_last
		( "== b1 =="
		, "z"
		);
		String expd = String_.Concat_lines_nl_skip_last
		( "<root><h level=\"2\" i=\"1\">== b1 ==</h>"
		, "z</root>"
		);
		fxt.Test__parse(src, expd);
	}

	// COVERS: "DWIM: This looks kind of like a name/value separator."
	@Test public void Heading__dwim__y() {
		String src = String_.Concat_lines_nl_skip_last
		( "a{{b|"
		, "=c="
		, "}}d"
		);
		String expd = String_.Concat_lines_nl_skip_last
		( "<root>a<template><title>b</title><part><name>"
		, "</name>=<value>c="
		, "</value></part></template>d</root>"
		);
		fxt.Test__parse(src, expd);
	}

	// COVERS: "DWIM: This looks kind of like a name/value separator."
	@Test public void Heading__dwim__n() {
		String src = String_.Concat_lines_nl_skip_last
		( "a{{b|"
		, "==c=="
		, "}}d"
		);
		String expd = String_.Concat_lines_nl_skip_last
		( "<root>a<template><title>b</title><part><name index=\"1\" /><value>"
		, "<h level=\"2\" i=\"1\">==c==</h>"
		, "</value></part></template>d</root>"
		);
		fxt.Test__parse(src, expd);
	}

	// COVERS: "Comment found at line end"
	@Test public void Heading__comment() {
		String src = String_.Concat_lines_nl_skip_last
		( "a"
		, "==b== <!--c-->"
		, ""
		);
		String expd = String_.Concat_lines_nl_skip_last
		( "<root>a"
		, "<h level=\"2\" i=\"1\">==b== <comment>&lt;!--c--&gt;</comment></h>"
		, "</root>"
		);
		fxt.Test__parse(src, expd);
	}

	// COVERS: "This is just a single String of equals signs on its own line"
	@Test public void Heading__consecutive__5() {
		String src = String_.Concat_lines_nl_skip_last
		( "a"
		, "====="
		, ""
		);
		String expd = String_.Concat_lines_nl_skip_last
		( "<root>a"
		, "<h level=\"2\" i=\"1\">=====</h>"
		, "</root>"
		);
		fxt.Test__parse(src, expd);
	}

	// COVERS: "Single equals sign on its own line, count=0"
	@Test public void Heading__consecutive__1() {
		String src = String_.Concat_lines_nl_skip_last
		( "a"
		, "="
		, ""
		);
		String expd = String_.Concat_lines_nl_skip_last
		( "<root>a"
		, "="
		, "</root>"
		);
		fxt.Test__parse(src, expd);
	}

	// COVERS: "No match, no <h>, just pass down the inner src"
	@Test public void Heading__unclosed() {
		String src = String_.Concat_lines_nl_skip_last
		( "a"
		, "===b"
		, ""
		);
		String expd = String_.Concat_lines_nl_skip_last
		( "<root>a"
		, "===b"
		, "</root>"
		);
		fxt.Test__parse(src, expd);
	}

	@Test public void Inclusion__n() {
		fxt.Init__for_inclusion_(Bool_.N);
		fxt.Test__parse("a<onlyinclude>b</onlyinclude>c", "<root>a<ignore>&lt;onlyinclude&gt;</ignore>b<ignore>&lt;/onlyinclude&gt;</ignore>c</root>");
	}

	@Test public void Inclusion__y() {
		fxt.Init__for_inclusion_(Bool_.Y);
		fxt.Test__parse("a<onlyinclude>b</onlyinclude>c", "<root><ignore>a&lt;onlyinclude&gt;</ignore>b<ignore>&lt;/onlyinclude&gt;c</ignore></root>");
	}

	// COVERS: "Handle ignored tags"
	@Test public void Ignored__noinclude() {
		fxt.Init__for_inclusion_(Bool_.N);
		fxt.Test__parse("a<noinclude>b</noinclude>c", "<root>a<ignore>&lt;noinclude&gt;</ignore>b<ignore>&lt;/noinclude&gt;</ignore>c</root>");
	}
}
// Test fixture: owns one preprocessor worker and compares its XML output against an expected string.
class Xomw_prepro_wkr__fxt {
	private final Xomw_prepro_wkr wkr = new Xomw_prepro_wkr();
	// passed straight through to Preprocess_to_xml; stays false unless a test changes it
	private boolean for_inclusion = false;

	public Xomw_prepro_wkr__fxt() {
		wkr.Init_by_wiki("pre");
	}

	public void Init__for_inclusion_(boolean v) {
		this.for_inclusion = v;
	}

	// Runs the preprocessor over src_str and asserts the result, line by line, against expd.
	// src_str is also handed to the assertion as its third argument.
	public void Test__parse(String src_str, String expd) {
		byte[] xml = wkr.Preprocess_to_xml(Bry_.new_u8(src_str), for_inclusion);
		Tfds.Eq_str_lines(expd, String_.new_u8(xml), src_str);
	}
}

View File

@@ -0,0 +1,267 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.quotes; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
import gplx.xowa.mediawiki.includes.utls.*;
import gplx.xowa.parsers.htmls.*;
import gplx.core.primitives.*;
/**
 * Converts runs of wikitext apostrophes ('' / ''' / ''''') into italic / bold HTML tags.
 * Ported from MediaWiki's quote handling (see the REF.MW / PORTED notes inline).
 *
 * Two entry points:
 * - Do_all_quotes: processes a whole parser buffer line by line.
 * - Do_quotes(Bry_bfr, byte[]): processes a single piece of text and returns the result.
 *
 * The worker keeps a scratch buffer (`tmp`, obtained from the parser) that holds text seen after a
 * lone ''''' until the next apostrophe run decides whether bold or italic was opened first.
 */
public class Xomw_quote_wkr {// THREAD.UNSAFE: caching for repeated calls
	private Bry_bfr tmp;
	private final Int_list apos_pos_ary = new Int_list(32);
	public Xomw_quote_wkr(Xomw_parser mgr) {
		this.tmp = mgr.Tmp();
	}
	/**
	 * Applies quote conversion to every line of pbfr's source buffer, writing to its target buffer.
	 * Lines are rejoined with "\n" and the single trailing "\n" is removed at the end.
	 * pctx is currently unused.
	 */
	public void Do_all_quotes(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) {
		Bry_bfr src_bfr = pbfr.Src();
		byte[] src = src_bfr.Bfr();
		int src_bgn = 0;
		int src_end = src_bfr.Len();
		Bry_bfr bfr = pbfr.Trg();
		pbfr.Switch();
		int cur = src_bgn;
		int line_bgn = cur;
		while (true) {
			int line_end = Bry_find_.Find_fwd(src, Byte_ascii.Nl, line_bgn, src_end);
			if (line_end == Bry_find_.Not_found) {
				line_end = src_end;
			}
			Do_quotes(bfr, Bool_.Y, src, line_bgn, line_end);
			if (line_end == src_end)
				break;
			else
				line_bgn = line_end + 1; // 1=\n.length
		}
		// Bry_split_.Split(src, src_bgn, src_end, Byte_ascii.Nl, Bool_.N, this); // PORTED.SPLIT: $lines = StringUtils::explode( "\n", $text );
		if (bfr.Match_end_byt(Byte_ascii.Nl))
			bfr.Del_by_1(); // REF.MW: $outtext = substr( $outtext, 0, -1 );
		apos_pos_ary.Clear();
	}
	/**
	 * Applies quote conversion to a single piece of text.
	 * Returns src itself when it holds no apostrophe run; otherwise returns the converted bytes
	 * (with no trailing newline) and leaves the passed buffer cleared.
	 *
	 * NOTE(review): the passed buffer is used as the output buffer, so it must not be the same
	 * instance as the parser scratch buffer held by this worker -- confirm at call sites.
	 */
	public byte[] Do_quotes(Bry_bfr tmp, byte[] src) {
		boolean found = Do_quotes(tmp, Bool_.N, src, 0, src.length);
		return found ? tmp.To_bry_and_clear() : src;
	}
	/**
	 * Converts the apostrophe runs in src[line_bgn, line_end) and appends the result to bfr.
	 *
	 * all_quotes_mode = true  : called per line from Do_all_quotes; a "\n" is appended after every line,
	 *                           including lines without apostrophes (which are copied verbatim).
	 * all_quotes_mode = false : single-text mode; nothing is written when no apostrophe run exists and
	 *                           no "\n" is appended, so both outcomes are free of a line terminator.
	 *
	 * Returns true if at least one apostrophe run was found.
	 */
	private boolean Do_quotes(Bry_bfr bfr, boolean all_quotes_mode, byte[] src, int line_bgn, int line_end) {
		byte[][] arr = Php_preg_.Split(apos_pos_ary, src, line_bgn, line_end, Wtxt__apos, Bool_.Y); // PORTED.REGX: arr = preg_split("/(''+)/", text, -1, PREG_SPLIT_DELIM_CAPTURE);
		if (arr == null) {
			if (all_quotes_mode) {
				bfr.Add_mid(src, line_bgn, line_end).Add_byte_nl();
			}
			return false;
		}
		// arr alternates: even slots are text, odd slots are apostrophe runs
		int arr_len = arr.length;
		// First, do some preliminary work. This may shift some apostrophes from
		// being mark-up to being text. It also counts the number of occurrences
		// of bold and italics mark-ups.
		int num_bold = 0;
		int num_italics = 0;
		for (int i = 1; i < arr_len; i += 2) {
			int apos_len = arr[i].length;
			// If there are ever four apostrophes, assume the first is supposed to
			// be text, and the remaining three constitute mark-up for bold text.
			// (bug 13227: ''''foo'''' turns into ' ''' foo ' ''')
			if (apos_len == 4) {
				arr[i - 1] = Bry_.Add(arr[i - 1], Byte_ascii.Apos_bry);
				arr[i] = Bry_.new_a7("'''");
				apos_len = 3;
			}
			else if (apos_len > 5) {
				// If there are more than 5 apostrophes in a row, assume they're all
				// text except for the last 5.
				// (bug 13227: ''''''foo'''''' turns into ' ''''' foo ' ''''')
				arr[i - 1] = Bry_.Add(arr[i - 1], Bry_.Repeat(Byte_ascii.Apos, apos_len - 5));
				arr[i] = Bry_.new_a7("'''''");
				apos_len = 5;
			}
			// Count the number of occurrences of bold and italics mark-ups.
			if (apos_len == 2) {
				num_italics++;
			}
			else if (apos_len == 3) {
				num_bold++;
			}
			else if (apos_len == 5) {
				num_italics++;
				num_bold++;
			}
		}
		// If there is an odd number of both bold and italics, it is likely
		// that one of the bold ones was meant to be an apostrophe followed
		// by italics. Which one we cannot know for certain, but it is more
		// likely to be one that has a single-letter word before it.
		// NOTE: this code primarily handles italicized possessives; EX: The ''[[Main Page]]'''s talk page.
		if ((num_bold % 2 == 1) && (num_italics % 2 == 1)) {
			int prv_ends_w_word_1char = -1;
			int prv_ends_w_word_nchar = -1;
			int prv_ends_w_space = -1;
			for (int i = 1; i < arr_len; i += 2) {
				if (arr[i].length == 3) {
					byte[] prv = arr[i - 1];
					byte prv__last_char = Php_str_.Substr_byte(prv, -1);
					byte prv__last_minus_1_char = Php_str_.Substr_byte(prv, -2, 1);
					if (prv__last_char == Byte_ascii.Space) { // NOTE: prv ends in space; EX: "''prv '''"
						if (prv_ends_w_space == -1) {
							prv_ends_w_space = i;
						}
					}
					else if (prv__last_minus_1_char == Byte_ascii.Space) { // NOTE: prv ends in 1-char word; EX: "''prv a'''"
						prv_ends_w_word_1char = i;
						// if $firstsingleletterword is set, we don't
						// look at the other options, so we can bail early.
						break;
					}
					else {
						if (prv_ends_w_word_nchar == -1) {
							prv_ends_w_word_nchar = i;
						}
					}
				}
			}
			// If there is a single-letter word, use it!
			if (prv_ends_w_word_1char > -1) {
				arr[prv_ends_w_word_1char] = Wtxt__apos;
				arr[prv_ends_w_word_1char - 1] = Bry_.Add(arr[prv_ends_w_word_1char - 1], Byte_ascii.Apos);
			}
			else if (prv_ends_w_word_nchar > -1) {
				// If not, but there's a multi-letter word, use that one.
				arr[prv_ends_w_word_nchar] = Wtxt__apos;
				arr[prv_ends_w_word_nchar - 1] = Bry_.Add(arr[prv_ends_w_word_nchar - 1], Byte_ascii.Apos);
			}
			else if (prv_ends_w_space > -1) {
				// ... otherwise use the first one that has neither.
				// (notice that it is possible for all three to be -1 if, for example,
				// there is only one pentuple-apostrophe in the line)
				arr[prv_ends_w_space] = Wtxt__apos;
				arr[prv_ends_w_space - 1] = Bry_.Add(arr[prv_ends_w_space - 1], Byte_ascii.Apos);
			}
		}
		// Now let's actually convert our apostrophic mush to HTML!
		int state = State__empty;
		for (int j = 0; j < arr_len; j++) {
			if ((j % 2) == 0) {
				// text slot: held back in tmp while the tag order of a pending ''''' is undecided
				if (state == State__both) {
					tmp.Add(arr[j]);
				}
				else {
					bfr.Add(arr[j]);
				}
			}
			else {
				int apos_len = arr[j].length;
				if (apos_len == 2) {
					if (state == State__i) {
						bfr.Add_str_a7("</i>");
						state = State__empty;
					}
					else if (state == State__bi) {
						bfr.Add_str_a7("</i>");
						state = State__b;
					}
					else if (state == State__ib) {
						bfr.Add_str_a7("</b></i><b>");
						state = State__b;
					}
					else if (state == State__both) {
						// flush and clear: tmp is not read again until the next ''''' (which clears it anyway),
						// and clearing here keeps stale bytes out of the parser-provided scratch buffer
						bfr.Add_str_a7("<b><i>").Add_bfr_and_clear(tmp).Add_str_a7("</i>");
						state = State__b;
					}
					else { // state can be 'b' or ''
						bfr.Add_str_a7("<i>");
						state = state == State__b ? State__bi : State__i;
					}
				}
				else if (apos_len == 3) {
					if (state == State__b) {
						bfr.Add_str_a7("</b>");
						state = State__empty;
					}
					else if (state == State__bi) {
						bfr.Add_str_a7("</i></b><i>");
						state = State__i;
					}
					else if (state == State__ib) {
						bfr.Add_str_a7("</b>");
						state = State__i;
					}
					else if (state == State__both) {
						bfr.Add_str_a7("<i><b>").Add_bfr_and_clear(tmp).Add_str_a7("</b>");
						state = State__i;
					}
					else { // state can be 'i' or ''
						bfr.Add_str_a7("<b>");
						state = state == State__i ? State__ib : State__b;
					}
				}
				else if (apos_len == 5) {
					if (state == State__b) {
						bfr.Add_str_a7("</b><i>");
						state = State__i;
					}
					else if (state == State__i) {
						bfr.Add_str_a7("</i><b>");
						state = State__b;
					}
					else if (state == State__bi) {
						bfr.Add_str_a7("</i></b>");
						state = State__empty;
					}
					else if (state == State__ib) {
						bfr.Add_str_a7("</b></i>");
						state = State__empty;
					}
					else if (state == State__both) {
						bfr.Add_str_a7("<i><b>").Add_bfr_and_clear(tmp).Add_str_a7("</b></i>");
						state = State__empty;
					}
					else { // (state == '')
						tmp.Clear();
						state = State__both;
					}
				}
			}
		}
		// Now close all remaining tags. Notice that the order is important.
		if (state == State__b || state == State__ib) {
			bfr.Add_str_a7("</b>");
		}
		if (state == State__i || state == State__bi || state == State__ib) {
			bfr.Add_str_a7("</i>");
		}
		if (state == State__bi) {
			bfr.Add_str_a7("</b>");
		}
		// There might be lonely ''''', so make sure we have a buffer
		if (state == State__both && tmp.Len_gt_0()) {
			bfr.Add_str_a7("<b><i>").Add_bfr_and_clear(tmp).Add_str_a7("</i></b>");
		}
		// the line terminator belongs to the line-splitting mode only; single-text mode must not
		// return a trailing "\n" (it does not return one on the not-found path either)
		if (all_quotes_mode) {
			bfr.Add_byte_nl();
		}
		return true;
	}
	// Open-tag states; the two-letter names give the order in which the tags were opened.
	private static final int
	  State__empty = 0 // nothing open
	, State__b = 1     // <b>
	, State__i = 2     // <i>
	, State__bi = 3    // <b><i>
	, State__ib = 4    // <i><b>
	, State__both = 5  // ''''' seen; order undecided, text is buffered in tmp
	;
	private static final byte[] Wtxt__apos = Bry_.new_a7("''");
}

View File

@@ -0,0 +1,45 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.quotes; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
import org.junit.*;
// Tests for Xomw_quote_wkr: each case runs wikitext through Do_all_quotes (via the fixture)
// and compares the resulting HTML. "COVERS" notes quote the branch a case is aimed at.
public class Xomw_quote_wkr__tst {
	private final Xomw_quote_wkr__fxt fxt = new Xomw_quote_wkr__fxt();

	@Test public void Apos__0() {
		fxt.Test__parse("abc", "abc");
	}
	@Test public void Apos__1() {
		fxt.Test__parse("a'b'c", "a'b'c");
	}
	@Test public void Apos__2() {
		fxt.Test__parse("a''b''c", "a<i>b</i>c");
	}
	@Test public void Apos__3() {
		fxt.Test__parse("a'''b'''c", "a<b>b</b>c");
	}
	// COVERS: "If there are ever four apostrophes"
	@Test public void Apos__4() {
		fxt.Test__parse("a''''b''''c", "a'<b>b'</b>c");
	}
	@Test public void Apos__5() {
		fxt.Test__parse("a'''''b'''''c", "a<i><b>b</b></i>c");
	}
	// COVERS: "If there are more than 5 apostrophes in a row"
	@Test public void Apos__7() {
		fxt.Test__parse("a'''''''b'''''''c", "a''<i><b>b''</b></i>c");
	}
	// COVERS: "If there is a single-letter word, use it!"
	@Test public void Mix__single() {
		fxt.Test__parse("''a ''' ''b b''' ''cc'''", "<i>a <b> </b></i><b>b b'<i> </i>cc</b>");
	}
	// COVERS: "If not, but there's a multi-letter word, use that one."
	@Test public void Mix__multi() {
		fxt.Test__parse("''a ''' ''b ''' ''cc'''", "<i>a <b> </b></i><b>b </b> <i>cc'</i>");
	}
	// COVERS: "... otherwise use the first one that has neither."
	@Test public void Mix__space() {
		fxt.Test__parse("''a ''' ''b ''' ''c '''", "<i>a '</i> <i>b <b> </b></i><b>c </b>");
	}
	// COVERS: "if (state == State__b || state == State__ib)"
	@Test public void Dangling__b() {
		fxt.Test__parse("a'''b", "a<b>b</b>");
	}
	// COVERS: "if (state == State__i || state == State__bi || state == State__ib)"
	@Test public void Dangling__i() {
		fxt.Test__parse("a''b", "a<i>b</i>");
	}
	// COVERS: "There might be lonely ''''', so make sure we have a buffer"
	@Test public void Dangling__lone() {
		fxt.Test__parse("a'''''b", "a<b><i>b</i></b>");
	}
	// multiple lines, including an empty one, keep their newlines
	@Test public void Nl__text() {
		fxt.Test__parse("a\nb''c''d\n\ne", "a\nb<i>c</i>d\n\ne");
	}
}
// Test fixture: runs Do_all_quotes over a reusable parser buffer and asserts on the result.
class Xomw_quote_wkr__fxt {
	private final Xomw_quote_wkr wkr = new Xomw_quote_wkr(new Xomw_parser());
	private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr();

	// Loads src_str into the parser buffer, applies quote conversion, and compares the buffer's
	// result (read and cleared in one step) against expd. src_str is also passed to the assertion.
	public void Test__parse(String src_str, String expd) {
		byte[] wtxt = Bry_.new_u8(src_str);
		Xomw_parser_ctx pctx = new Xomw_parser_ctx();
		wkr.Do_all_quotes(pctx, pbfr.Init(wtxt));
		Tfds.Eq_str_lines(expd, pbfr.Rslt().To_str_and_clear(), src_str);
	}
}

View File

@@ -0,0 +1,292 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mediawiki.includes.parsers.tables; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*;
import gplx.xowa.mediawiki.includes.utls.*;
import gplx.xowa.parsers.htmls.*;
import gplx.xowa.mediawiki.includes.libs.*; import gplx.xowa.parsers.uniqs.*;
public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.UNSAFE: caching for repeated calls
private final Bry_bfr tmp;
private Bry_bfr bfr;
private final Xomw_sanitizer sanitizer; private final Xomw_strip_state strip_state;
private final List_adp
td_history = List_adp_.New() // Is currently a td tag open?
, last_tag_history = List_adp_.New() // Save history of last lag activated (td, th or caption)
, tr_history = List_adp_.New() // Is currently a tr tag open?
, tr_attributes = List_adp_.New() // history of tr attributes
, has_opened_tr = List_adp_.New() // Did this table open a <tr> element?
;
private int indent_level = 0; // indent level of the table
private byte[] first_2 = new byte[2];
// Wires the worker to helpers owned by the parser: its scratch buffer, sanitizer and strip-state.
public Xomw_table_wkr(Xomw_parser parser) {
	tmp = parser.Tmp();
	sanitizer = parser.Sanitizer();
	strip_state = parser.Strip_state();
}
// Converts wikitext table markup in pbfr's source buffer to HTML in its target buffer.
// The text is split on "\n" with this object as the split worker (presumably one Split(...) call per
// line -- confirm against Bry_split_); afterwards any table parts still open are closed.
// pctx is not used here.
public void Do_table_stuff(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) {
	Bry_bfr src_bfr = pbfr.Src();
	byte[] src = src_bfr.Bfr();
	int src_len = src_bfr.Len();
	this.bfr = pbfr.Trg();
	pbfr.Switch();
	indent_level = 0;
	Bry_split_.Split(src, 0, src_len, Byte_ascii.Nl, Bool_.N, this); // PORTED.SPLIT: $lines = StringUtils::explode("\n", $text);

	// Closing open td, tr && table
	// one iteration per table still on the history stacks
	while (td_history.Len() > 0) {
		boolean td_open = Php_ary_.Pop_bool_or_n(td_history);
		if (td_open) {
			bfr.Add_str_a7("</td>\n");
		}
		boolean tr_open = Php_ary_.Pop_bool_or_n(tr_history);
		if (tr_open) {
			bfr.Add_str_a7("</tr>\n");
		}
		boolean tr_was_opened = Php_ary_.Pop_bool_or_n(has_opened_tr);
		if (!tr_was_opened) {
			// table never opened a row: emit an empty one
			bfr.Add_str_a7("<tr><td></td></tr>\n");
		}
		bfr.Add_str_a7("</table>\n");
	}

	// Remove trailing line-ending (b/c)
	if (bfr.Get_at_last_or_nil_if_empty() == Byte_ascii.Nl) {
		bfr.Del_by_1();
	}

	// special case: don't return empty table
	// i.e. the output is exactly the Html__tb__empty bytes
	boolean is_empty_table = bfr.Len() == Len__tb__empty
		&& Bry_.Eq(bfr.Bfr(), 0, Len__tb__empty, Html__tb__empty);
	if (is_empty_table) {
		bfr.Clear();
	}
}
public int Split(byte[] src, int itm_bgn, int itm_end) {
byte[] out_line = Bry_.Mid(src, itm_bgn, itm_end); // MW: "$outLine"
byte[] line = Bry_.Trim(out_line); // MW: "$line"
int line_len = line.length;
if (line_len == 0) { // empty line, go to next line
bfr.Add(out_line).Add_byte_nl();
return Bry_split_.Rv__ok;
}
byte first_char = line[0];
first_2[0] = line[0];
first_2[1] = line_len == 1 ? Byte_ascii.Null : line[1];
// PORTED: preg_match('/^(:*)\s*\{\|(.*)$/', $line, $matches)
byte[] tblw_atrs = null;
boolean tblw_bgn_found = false;
int colons_end = Bry_find_.Find_fwd_while(src, 0, line_len, Byte_ascii.Colon);
int tblw_bgn = Bry_find_.Find_fwd_while(line, colons_end, line_len, Byte_ascii.Space);
int tblw_atrs_bgn = tblw_bgn + 2;
if (Bry_.Eq(line, tblw_bgn, tblw_atrs_bgn, Wtxt__tb__bgn)) {
tblw_bgn_found = true;
tblw_atrs = (tblw_atrs_bgn == line_len) ? Bry_.Empty : Bry_.Mid(line, tblw_atrs_bgn, line_len);
}
if (tblw_bgn_found) {
// First check if we are starting a new table
indent_level = colons_end;
tblw_atrs = strip_state.Unstrip_both(tblw_atrs);
// PORTED: out_line = str_repeat('<dl><dd>', $indent_level) . "<table{atrs}>";
for (int j = 0; j < indent_level; j++)
tmp.Add(Html__dl__bgn);
tmp.Add_str_a7("<table");
sanitizer.Fix_tag_attributes(tmp, Name__table, tblw_atrs);
tmp.Add_byte(Byte_ascii.Angle_end);
out_line = tmp.To_bry_and_clear();
td_history.Add(false);
last_tag_history.Add(Bry_.Empty);
tr_history.Add(false);
tr_attributes.Add(Bry_.Empty);
has_opened_tr.Add(false);
}
else if (td_history.Len() == 0) {
// Don't do any of the following
bfr.Add(out_line).Add_byte_nl();
return Bry_split_.Rv__ok;
}
else if (Bry_.Eq(first_2, Wtxt__tb__end)) {
// We are ending a table
line = tmp.Add_str_a7("</table>").Add_mid(line, 2, line.length).To_bry_and_clear();
byte[] last_tag = Php_ary_.Pop_bry_or_null(last_tag_history);
if (!Php_ary_.Pop_bool_or_n(has_opened_tr)) {
line = tmp.Add_str_a7("<tr><td></td></tr>").Add(line).To_bry_and_clear();
}
if (Php_ary_.Pop_bool_or_n(tr_history)) {
line = tmp.Add_str_a7("</tr>").Add(line).To_bry_and_clear();
}
if (Php_ary_.Pop_bool_or_n(td_history)) {
line = tmp.Add_str_a7("</").Add(last_tag).Add_byte(Byte_ascii.Angle_end).Add(line).To_bry_and_clear();
}
Php_ary_.Pop_bry_or_null(tr_attributes);
// PORTED:$outLine = $line . str_repeat( '</dd></dl>', $indent_level );
tmp.Add(line);
for (int j = 0; j < indent_level; j++)
tmp.Add(Html__dl__end);
out_line = tmp.To_bry_and_clear();
}
else if (Bry_.Eq(first_2, Wtxt__tr)) {
// Now we have a table row
line = Bry_.Mid(line, 2); // PORTED: $line = preg_replace('#^\|-+#', '', $line);
// Whats after the tag is now only attributes
byte[] atrs = strip_state.Unstrip_both(line);
sanitizer.Fix_tag_attributes(tmp, Name__tr, atrs);
atrs = tmp.To_bry_and_clear();
Php_ary_.Pop_bry_or_null(tr_attributes);
tr_attributes.Add(atrs);
line = Bry_.Empty;
byte[] last_tag = Php_ary_.Pop_bry_or_null(last_tag_history);
Php_ary_.Pop_bool_or_n(has_opened_tr);
has_opened_tr.Add(true);
if (Php_ary_.Pop_bool_or_n(tr_history)) {
line = Html__tr__end;
}
if (Php_ary_.Pop_bool_or_n(td_history)) {
line = tmp.Add_str_a7("</").Add(last_tag).Add_byte(Byte_ascii.Gt).Add(line).To_bry_and_clear();
}
out_line = line;
tr_history.Add(false);
td_history.Add(false);
last_tag_history.Add(Bry_.Empty);
}
else if ( first_char == Byte_ascii.Pipe
|| first_char == Byte_ascii.Bang
|| Bry_.Eq(first_2, Wtxt__caption)
) {
// This might be cell elements, td, th or captions
if (Bry_.Eq(first_2, Wtxt__caption)) {
first_char = Byte_ascii.Plus;
line = Bry_.Mid(line, 2);
} else {
line = Bry_.Mid(line, 1);
}
// Implies both are valid for table headings.
if (first_char == Byte_ascii.Bang) {
Xomw_string_utils.Replace_markup(line, 0, line.length, Wtxt__th2, Wtxt__td2); // $line = StringUtils::replaceMarkup('!!', '||', $line);
}
// Split up multiple cells on the same line.
// FIXME : This can result in improper nesting of tags processed
// by earlier parser steps.
byte[][] cells = Bry_split_.Split(line, Wtxt__td2);
if (cells.length == 0) cells = Cells__empty; // handle "\n|\n" which should still generate "<tr><td></td></tr>", not ""; see TEST
out_line = Bry_.Empty;
byte[] previous = null;
// Loop through each table cell
int cells_len = cells.length;
for (int j = 0; j < cells_len; j++) {
byte[] cell = cells[j];
previous = Bry_.Empty;
if (first_char != Byte_ascii.Plus) {
byte[] tr_after = Php_ary_.Pop_bry_or_null(tr_attributes);
if (!Php_ary_.Pop_bool_or_n(tr_history)) {
previous = tmp.Add_str_a7("<tr").Add(tr_after).Add_str_a7(">\n").To_bry_and_clear();
}
tr_history.Add(true);
tr_attributes.Add(Bry_.Empty);
Php_ary_.Pop_bool_or_n(has_opened_tr);
has_opened_tr.Add(true);
}
byte[] last_tag = Php_ary_.Pop_bry_or_null(last_tag_history);
if (Php_ary_.Pop_bool_or_n(td_history)) {
previous = tmp.Add_str_a7("</").Add(last_tag).Add_str_a7(">\n").Add(previous).To_bry_and_clear();
}
if (first_char == Byte_ascii.Pipe) {
last_tag = Name__td;
}
else if (first_char == Byte_ascii.Bang) {
last_tag = Name__th;
}
else if (first_char == Byte_ascii.Plus) {
last_tag = Name__caption;
}
else {
last_tag = Bry_.Empty;
}
last_tag_history.Add(last_tag);
// A cell could contain both parameters and data
byte[][] cell_data = Bry_split_.Split_w_max(cell, Byte_ascii.Pipe, 2);
// Bug 553: Note that a '|' inside an invalid link should not
// be mistaken as delimiting cell parameters
byte[] cell_data_0 = cell_data[0];
byte[] cell_data_1 = cell_data[1];
if (Bry_find_.Find_fwd(cell_data_0, Wtxt__lnki__bgn) != Bry_find_.Not_found) {
cell = tmp.Add(previous).Add_byte(Byte_ascii.Angle_bgn).Add(last_tag).Add_byte(Byte_ascii.Angle_end).Add(cell).To_bry_and_clear();
}
else if (cell_data_1 == null) {
cell = tmp.Add(previous).Add_byte(Byte_ascii.Angle_bgn).Add(last_tag).Add_byte(Byte_ascii.Angle_end).Add(cell_data_0).To_bry_and_clear();
}
else {
byte[] atrs = strip_state.Unstrip_both(cell_data_0);
tmp.Add(previous).Add_byte(Byte_ascii.Angle_bgn).Add(last_tag);
sanitizer.Fix_tag_attributes(tmp, last_tag, atrs);
tmp.Add_byte(Byte_ascii.Angle_end).Add(cell_data_1);
cell = tmp.To_bry_and_clear();
}
out_line = Bry_.Add(out_line, cell);
td_history.Add(true);
}
}
bfr.Add(out_line).Add_byte_nl();
return Bry_split_.Rv__ok;
}
private static final byte[]
Wtxt__tb__bgn = Bry_.new_a7("{|")
, Wtxt__tb__end = Bry_.new_a7("|}")
, Wtxt__tr = Bry_.new_a7("|-")
, Wtxt__caption = Bry_.new_a7("|+")
, Wtxt__th2 = Bry_.new_a7("!!")
, Wtxt__td2 = Bry_.new_a7("||")
, Wtxt__lnki__bgn = Bry_.new_a7("[[")
, Name__table = Bry_.new_a7("table")
, Name__tr = Bry_.new_a7("tr")
, Name__td = Bry_.new_a7("td")
, Name__th = Bry_.new_a7("th")
, Name__caption = Bry_.new_a7("caption")
, Html__tr__end = Bry_.new_a7("</tr>")
, Html__dl__bgn = Bry_.new_a7("<dl><dd>")
, Html__dl__end = Bry_.new_a7("</dd></dl>")
, Html__tb__empty = Bry_.new_a7("<table>\n<tr><td></td></tr>\n</table>")
;
private static final int Len__tb__empty = Html__tb__empty.length;
private static final byte[][] Cells__empty = new byte[][] {Bry_.Empty};
}

Some files were not shown because too many files have changed in this diff Show More