1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2026-03-02 03:49:30 +00:00
This commit is contained in:
gnosygnu
2015-07-21 19:52:26 -04:00
parent 99f9c3ccea
commit 9d63f03b3d
31 changed files with 381 additions and 446 deletions

View File

@@ -0,0 +1,65 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.core.regxs; import gplx.*; import gplx.core.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Adapter around a compiled {@link java.util.regex.Pattern}.
 * The pattern is always compiled with DOTALL and UNICODE_CHARACTER_CLASS.
 * If the pattern text cannot be compiled, an empty pattern is used instead and
 * {@link #Pattern_is_invalid()} reports true.
 */
public class Regx_adp {
	// JRE.7:UNICODE_CHARACTER_CLASS; added during %w fix for en.w:A#; DATE:2015-06-10
	private static final int Pattern_flags = Pattern.DOTALL | Pattern.UNICODE_CHARACTER_CLASS;
	private String pattern;
	private boolean pattern_is_invalid = false;
	private Pattern under;

	@gplx.Internal protected Regx_adp(String regx) {Pattern_(regx);}

	/** Returns the raw pattern text most recently passed to the constructor or {@link #Pattern_(String)}. */
	public String Pattern() {return pattern;}
	/** Stores the new pattern text, recompiles the underlying pattern, and returns this instance. */
	public Regx_adp Pattern_(String val) {pattern = val; Under_sync(); return this;}
	/** True if the current pattern text failed to compile and the empty pattern was substituted. */
	public boolean Pattern_is_invalid() {return pattern_is_invalid;}

	/**
	 * Collects successive matches in text, starting the first search at bgn.
	 * @param text string to search
	 * @param bgn  index at which the first search starts
	 * @return all matches found, in order; an empty array if there are none
	 */
	public Regx_match[] Match_all(String text, int bgn) {
		int idx = bgn;
		List_adp rv = List_adp_.new_();
		int len = String_.Len(text);
		while (idx <= len) {				// NOTE: must be <= not < else "a?" will return null instead of ""; PAGE:en.d:民; DATE:2015-01-30
			Regx_match match = this.Match(text, idx);
			if (match.Rslt_none()) break;
			rv.Add(match);
			int find_bgn = match.Find_bgn();
			int find_len = match.Find_len();
			idx = find_len == 0				// find_bgn == find_end
				? find_bgn + 1				// add 1 to resume search from next char; DATE:2014-09-02
				: find_bgn + find_len		// otherwise search after find_end
				;
		}
		return (Regx_match[])rv.To_ary(Regx_match.class);
	}

	/** Recompiles the underlying pattern from the current pattern text and refreshes the invalid flag. */
	void Under_sync() {
		pattern_is_invalid = false;			// reset first; else a prior invalid pattern leaves the flag set after a later valid Pattern_ call
		try {under = Pattern.compile(pattern, Pattern_flags);}
		catch (Exception e) {				// NOTE: if invalid, then default to empty pattern (which should return nothing); EX:d:〆る generates [^]; DATE:2013-10-20
			pattern_is_invalid = true;
			under = Pattern.compile("", Pattern_flags);
		}
	}

	/**
	 * Searches input for the first match at or after bgn.
	 * @param input string to search
	 * @param bgn   index at which the search starts; passed directly to Matcher.find(int)
	 * @return a match whose Rslt() is false and whose positions are String_.Find_none when nothing is found;
	 *         otherwise the match positions plus one Regx_group per capturing group in the pattern
	 */
	public Regx_match Match(String input, int bgn) {
		Matcher match = under.matcher(input);
		boolean success = match.find(bgn);
		int match_bgn = success ? match.start() : String_.Find_none;
		int match_end = success ? match.end() : String_.Find_none;
		Regx_group[] ary = Regx_group.Ary_empty;
		int groups_len = match.groupCount();
		if (success && groups_len > 0) {
			ary = new Regx_group[groups_len];
			// Matcher groups are 1-based; group 0 is the entire match and is not stored in ary
			// NOTE(review): every group is created with rslt = true, including groups that did not take part in the match
			for (int i = 0; i < groups_len; i++)
				ary[i] = new Regx_group(true, match.start(i + 1), match.end(i + 1), match.group(i + 1));
		}
		return new Regx_match(success, match_bgn, match_end, ary);
	}

	/** Replaces every match in input with replace, using Matcher.replaceAll. */
	public String ReplaceAll(String input, String replace) {return under.matcher(input).replaceAll(replace);}
}

View File

@@ -0,0 +1,43 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.core.regxs; import gplx.*; import gplx.core.*;
/** Static helpers for creating and using {@link Regx_adp} instances. */
public class Regx_adp_ {
	/** Creates an adapter for the given pattern text. */
	public static Regx_adp new_(String pattern) {return new Regx_adp(pattern);}

	/**
	 * Finds successive matches of a pattern in input, searching from index 0.
	 * @param input string to search
	 * @param find  regular-expression text
	 * @return list of Regx_match items in order of occurrence; empty if there are none
	 */
	public static List_adp Find_all(String input, String find) {
		Regx_adp regx = Regx_adp_.new_(find);
		int input_len = String_.Len(input);
		int idx = 0;
		List_adp rv = List_adp_.new_();
		while (idx <= input_len) {			// <= so that a pattern can still match at end of input
			Regx_match match = regx.Match(input, idx);
			if (match.Rslt_none()) break;
			rv.Add(match);
			int find_bgn = match.Find_bgn();
			int find_len = match.Find_len();
			idx = find_len == 0
				? find_bgn + 1				// zero-length match: step one char, else the same match is found forever
				: find_bgn + find_len		// otherwise resume after the end of the match
				;
		}
		return rv;
	}

	/** Replaces every match of regx_str in raw with replace. */
	public static String Replace(String raw, String regx_str, String replace) {
		Regx_adp regx = Regx_adp_.new_(regx_str);
		return regx.ReplaceAll(raw, replace);
	}

	/** Returns true if pattern matches anywhere in input. */
	public static boolean Match(String input, String pattern) {
		Regx_adp regx = Regx_adp_.new_(pattern);
		return regx.Match(input, 0).Rslt();
	}
}

View File

@@ -0,0 +1,93 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.core.regxs; import gplx.*; import gplx.core.*;
import org.junit.*;
/** Tests for Regx_adp / Regx_adp_ / Regx_bldr: matching, match-all, replace, whole-word, find-all and groups. */
public class Regx_adp__tst implements TfdsEqListItmStr {
	@Test public void Match() {
		tst_Match("a", "a", true);			// basic
		tst_Match("a", "b", false);			// matchNot
		tst_Match("a", "ab", true);			// matchPart
		tst_Match("a\\+b", "a+b", true);	// matchEscape
		tst_Match("[^a]", "b", true);		// charSet_negate
	}
	void tst_Match(String find, String input, boolean expd) {
		boolean actl = Regx_adp_.Match(input, find);
		Tfds.Eq(expd, actl);
	}

	@Test public void Match_all() {
		tst_Match_all("#REDIRECT [[Template:Error]]", "^\\p{Nd}*", 1);	// handle match = true but len = 0; DATE:2013-04-11
		tst_Match_all("a", "$", 1);										// $ should match once, not zero; DATE:2014-09-02
	}
	void tst_Match_all(String input, String regx, int expd) {
		Regx_match[] found = Regx_adp_.new_(regx).Match_all(input, 0);
		Tfds.Eq(expd, found.length);
	}

	@Test public void Replace() {
		tst_Replace("ab", "a", "b", "bb");		// basic
		tst_Replace("ab", "c", "b", "ab");		// replaceNot
		tst_Replace("aba", "a", "b", "bbb");	// replaceMultiple
	}
	void tst_Replace(String input, String find, String replace, String expd) {
		String actl = Regx_adp_.Replace(input, find, replace);
		Tfds.Eq(expd, actl);
	}

	@Test public void Match_WholeWord() {
		tst_WholeWord("a", "ab a", true);			// pass a
		tst_WholeWord("a", "ab c", false);			// fail ab
		tst_WholeWord("a", "a_", false);			// fail a_
		tst_WholeWord("[a]", "a [a] c", true);		// pass [a]
		tst_WholeWord("[a]", "a[a]c", false);		// fail a[a]c
	}
	void tst_WholeWord(String regx, String text, boolean expd) {
		String whole_word_regx = Regx_bldr.WholeWord(regx);
		Tfds.Eq(expd, Regx_adp_.Match(text, whole_word_regx));
	}

	@Test public void Match_As() {
		tst_Regx("public static [A-Za-z0-9_]+ as_\\(Object obj\\)", "public static Obj1 as_(Object obj) {return obj instanceof Obj1 ? (Obj1)obj : null;}", true);
		tst_Regx("public static [A-Za-z0-9_]+ as_\\(Object obj\\)", "public static boolean Asterisk(Object obj) {}", false);
	}
	void tst_Regx(String regx, String text, boolean expd) {
		boolean actl = Regx_adp_.Match(text, regx);
		Tfds.Eq(expd, actl);
	}

	@Test public void Find() {
		tst_Matches("b", "a b c b a", match_(2, 1), match_(6, 1));
		tst_Matches("d", "a b c b a");
		tst_Matches("b", "a b c b a b b", matches_(2, 6, 10, 12));	// BUGFIX: multiple entries did not work b/c of += instead of +
	}

	@Test public void Groups() {
		tst_Groups("abc def ghi dz", "(d\\p{L}+)", "def", "dz");
	}

	// builds one expected match per start position; the length is left as the "ignore" sentinel
	Regx_match[] matches_(int... bgnAry) {
		int count = Array_.Len(bgnAry);
		Regx_match[] expd_ary = new Regx_match[count];
		for (int pos = 0; pos < count; pos++) {
			expd_ary[pos] = match_(bgnAry[pos]);
		}
		return expd_ary;
	}
	// expected match whose length is Int_.MinValue; XtoStr omits the length for such items
	Regx_match match_(int bgn) {
		return match_(bgn, Int_.MinValue);
	}
	Regx_match match_(int bgn, int len) {
		int end = bgn + len;
		return new Regx_match(true, bgn, end, Regx_group.Ary_empty);
	}

	void tst_Matches(String find, String input, Regx_match... expd) {
		List_adp actl_list = Regx_adp_.Find_all(input, find);
		List_adp expd_list = Array_.XtoList(expd);
		Tfds.Eq_list(expd_list, actl_list, this);
	}
	void tst_Groups(String text, String regx, String... expd) {
		Regx_match[] found = Regx_adp_.new_(regx).Match_all(text, 0);
		Tfds.Eq_ary_str(expd, To_ary(found));
	}

	// flattens the captured group values of every match into one string array
	String[] To_ary(Regx_match[] ary) {
		List_adp vals = List_adp_.new_();
		for (Regx_match itm : ary) {
			Regx_group[] grps = itm.Groups();
			for (Regx_group grp : grps) {
				vals.Add(grp.Val());
			}
		}
		return vals.To_str_ary();
	}

	// renders a match for list comparison: always the start; the length only when the expected item specifies one
	public String XtoStr(Object curObj, Object expdObj) {
		Regx_match actl_match = (Regx_match)curObj;
		Regx_match expd_match = (Regx_match)expdObj;
		String bgn_str = "bgn=" + actl_match.Find_bgn();
		boolean len_is_ignored = expd_match == null || expd_match.Find_len() == Int_.MinValue;
		return len_is_ignored
			? bgn_str
			: bgn_str + " len=" + actl_match.Find_len();
	}
}

View File

@@ -0,0 +1,62 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.core.regxs; import gplx.*; import gplx.core.*;
import gplx.core.strings.*;
/** Helpers for building regular-expression text: character sets, whole-word patterns and metacharacter escaping. */
public class Regx_bldr {
	/** Wraps characters in a character set; EX: "abc" -> "[abc]". The characters are not escaped. */
	public static String Includes(String characters) {return String_.Concat_any(Regx_bldr.Tkn_CharSetBegin, characters, Regx_bldr.Tkn_CharSetEnd);}
	/** Wraps characters in a negated character set; EX: "abc" -> "[^abc]". The characters are not escaped. */
	public static String Excludes(String characters) {return String_.Concat_any(Regx_bldr.Tkn_CharSetBegin, Regx_bldr.Tkn_Not, characters, Regx_bldr.Tkn_CharSetEnd);}
	/**
	 * Builds a pattern that matches word literally, but only where it is neither preceded
	 * nor followed by an ASCII letter, digit or underscore.
	 */
	public static String WholeWord(String word) {return String_.Concat_any("(?<![A-Za-z0-9_])", EscapeAll(word), "(?![A-Za-z0-9_])");}

	/**
	 * Prefixes every metacharacter in text (as decided by RegxChar_chk) with a backslash.
	 * @param text raw text
	 * @return text with each metacharacter escaped
	 */
	public static String EscapeAll(String text) {
		String_bldr sb = String_bldr_.new_();
		int len = String_.Len(text);
		for (int i = 0; i < len; i++) {
			char c = String_.CharAt(text, i);
			if (RegxChar_chk(c))
				sb.Add(Regx_bldr.Tkn_Escape);
			sb.Add(c);
		}
		return sb.XtoStr();
	}

	/** Returns true if c is one of the metacharacter tokens declared in this class. */
	public static boolean RegxChar_chk(char c) {
		return
			(	c == Regx_bldr.Tkn_Escape		|| c == Regx_bldr.Tkn_Or
			||	c == Regx_bldr.Tkn_LineBegin	|| c == Regx_bldr.Tkn_LineEnd
			||	c == Regx_bldr.Tkn_AnyChar		// "." must be escaped too; else EscapeAll("a.b") also matches "axb"
			||	c == Regx_bldr.Tkn_GroupBegin	|| c == Regx_bldr.Tkn_GroupEnd
			||	c == Regx_bldr.Tkn_RepBegin		|| c == Regx_bldr.Tkn_RepEnd
			||	c == Regx_bldr.Tkn_Wild_0Plus	|| c == Regx_bldr.Tkn_Wild_1Plus	|| c == Regx_bldr.Tkn_Wild_0or1
			||	c == Regx_bldr.Tkn_CharSetBegin	|| c == Regx_bldr.Tkn_CharSetEnd
			);
	}

	public static final char
		  Tkn_LineBegin		= '^'
		, Tkn_LineEnd		= '$'
		, Tkn_AnyChar		= '.'	// except newline
		, Tkn_Wild_0Plus	= '*'
		, Tkn_Wild_1Plus	= '+'
		, Tkn_Wild_0or1		= '?'
		, Tkn_CharSetBegin	= '['
		, Tkn_CharSetEnd	= ']'
		, Tkn_GroupBegin	= '('
		, Tkn_GroupEnd		= ')'
		, Tkn_RepBegin		= '{'
		, Tkn_RepEnd		= '}'
		, Tkn_Not			= '^'	// same char as Tkn_LineBegin; negates when first inside a character set
		, Tkn_Or			= '|'
		, Tkn_Escape		= '\\'
		;
}

View File

@@ -0,0 +1,26 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.core.regxs; import gplx.*; import gplx.core.*;
/** Holds the outcome of one capturing group: a result flag, begin / end positions, and the captured text. */
public class Regx_group {
	/** Shared zero-length array for matches that have no groups. */
	public static final Regx_group[] Ary_empty = new Regx_group[0];

	private boolean rslt;
	int bgn;
	int end;
	private String val;

	public Regx_group(boolean rslt, int bgn, int end, String val) {
		this.rslt = rslt;
		this.bgn = bgn;
		this.end = end;
		this.val = val;
	}

	/** Result flag supplied at construction. */
	public boolean Rslt() {
		return rslt;
	}
	/** Begin position supplied at construction. */
	public int Bgn() {
		return bgn;
	}
	/** End position supplied at construction. */
	public int End() {
		return end;
	}
	/** Captured text supplied at construction. */
	public String Val() {
		return val;
	}
}

View File

@@ -0,0 +1,28 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.core.regxs; import gplx.*; import gplx.core.*;
/** Holds the outcome of one search: a result flag, the begin / end positions of the find, and its capture groups. */
public class Regx_match {
	/** Shared zero-length array for "no matches". */
	public static final Regx_match[] Ary_empty = new Regx_match[0];

	private boolean rslt;
	int find_bgn;
	int find_end;
	Regx_group[] groups = Regx_group.Ary_empty;

	public Regx_match(boolean rslt, int find_bgn, int find_end, Regx_group[] groups) {
		this.rslt = rslt;
		this.find_bgn = find_bgn;
		this.find_end = find_end;
		this.groups = groups;
	}

	/** Result flag supplied at construction. */
	public boolean Rslt() {
		return rslt;
	}
	/** Inverse of Rslt(). NOTE: was "|| find_end - find_bgn == 0"; DATE:2013-04-11; DATE:2014-09-02 */
	public boolean Rslt_none() {
		return rslt == false;
	}
	/** Begin position of the find. */
	public int Find_bgn() {
		return find_bgn;
	}
	/** End position of the find. */
	public int Find_end() {
		return find_end;
	}
	/** Length of the find, computed as end minus begin; 0 for a zero-length match. */
	public int Find_len() {
		int len = find_end - find_bgn;
		return len;
	}
	/** Capture groups supplied at construction. */
	public Regx_group[] Groups() {
		return groups;
	}
}