mirror of
https://github.com/gnosygnu/xowa.git
synced 2024-10-27 20:34:16 +00:00
Mw_parse.Apos: Add initial implementation
This commit is contained in:
parent
f8fcb553d5
commit
32a857f062
@ -17,7 +17,10 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.core.primitives; import gplx.*; import gplx.core.*;
|
||||
public class Int_list {
|
||||
private int capacity = 0;
|
||||
private int[] ary = Int_.Ary_empty; private int ary_len, ary_max;
|
||||
public Int_list() {this.capacity = 0; this.ary = Int_.Ary_empty;}
|
||||
public Int_list(int capacity) {this.capacity = capacity; this.ary = new int[capacity];}
|
||||
public void Add(int uid) {
|
||||
int new_len = ary_len + 1;
|
||||
if (new_len > ary_max) {
|
||||
@ -32,9 +35,17 @@ public class Int_list {
|
||||
// Returns ary_len, the element count this list tracks (Clear() resets it to 0).
public int Len() {return ary_len;}
|
||||
// Returns the value stored at index i of the backing array.
// NOTE: no check against ary_len is made here; callers are expected to pass 0 <= i < Len().
public int Get_at(int i) {return ary[i];}
|
||||
public void Clear() {
|
||||
ary = Int_.Ary_empty;
|
||||
if (ary_len > capacity) {
|
||||
ary = (capacity == 0) ? Int_.Ary_empty : new int[capacity];
|
||||
}
|
||||
ary_len = ary_max = 0;
|
||||
}
|
||||
public int[] To_ary() {
|
||||
int[] rv = new int[ary_len];
|
||||
for (int i = 0; i < ary_len; i++)
|
||||
rv[i] = ary[i];
|
||||
return rv;
|
||||
}
|
||||
public static Int_list new_(int... ary) {
|
||||
Int_list rv = new Int_list();
|
||||
int len = ary.length;
|
||||
|
54
400_xowa/src/gplx/langs/phps/utls/Php_preg_.java
Normal file
54
400_xowa/src/gplx/langs/phps/utls/Php_preg_.java
Normal file
@ -0,0 +1,54 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.langs.phps.utls; import gplx.*; import gplx.langs.*; import gplx.langs.phps.*;
|
||||
import gplx.core.primitives.*;
|
||||
public class Php_preg_ {
|
||||
public static byte[][] Split(Int_list list, byte[] src, int src_bgn, int src_end, byte[] dlm, boolean extend) {
|
||||
// find delimiters
|
||||
int dlm_len = dlm.length;
|
||||
byte dlm_nth = dlm[dlm_len - 1];
|
||||
int i = src_bgn;
|
||||
list.Add(src_bgn);
|
||||
while (true) {
|
||||
if (i == src_end) break;
|
||||
int dlm_end = i + dlm_len;
|
||||
if (dlm_end < src_end && Bry_.Eq(src, i, dlm_end, dlm)) {
|
||||
if (extend) {
|
||||
dlm_end = Bry_find_.Find_fwd_while(src, i, src_end, dlm_nth);
|
||||
}
|
||||
list.Add(i);
|
||||
list.Add(dlm_end);
|
||||
i = dlm_end;
|
||||
}
|
||||
else
|
||||
i++;
|
||||
}
|
||||
list.Add(src_end);
|
||||
|
||||
// create brys
|
||||
int rv_len = list.Len() - 1;
|
||||
if (rv_len == 1) return null;
|
||||
byte[][] rv = new byte[rv_len][];
|
||||
for (i = 0; i < rv_len; i += 2) {
|
||||
rv[i ] = Bry_.Mid(src, list.Get_at(i + 0), list.Get_at(i + 1));
|
||||
if (i + 1 == rv_len) break;
|
||||
rv[i + 1] = Bry_.Mid(src, list.Get_at(i + 1), list.Get_at(i + 2));
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
}
|
33
400_xowa/src/gplx/langs/phps/utls/Php_preg___tst.java
Normal file
33
400_xowa/src/gplx/langs/phps/utls/Php_preg___tst.java
Normal file
@ -0,0 +1,33 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.langs.phps.utls; import gplx.*; import gplx.langs.*; import gplx.langs.phps.*;
|
||||
import org.junit.*; import gplx.core.tests.*;
|
||||
public class Php_preg___tst {
|
||||
private final Php_preg___fxt fxt = new Php_preg___fxt();
|
||||
@Test public void Split() {
|
||||
fxt.Test__split("a''b''c", "''", Bool_.N, "a", "''", "b", "''", "c");
|
||||
}
|
||||
}
|
||||
class Php_preg___fxt {
|
||||
public void Test__split(String src, String dlm, boolean extend, String... expd) {Test__split(src, 0, String_.Len(src), dlm, extend, expd);}
|
||||
public void Test__split(String src, int src_bgn, int src_end, String dlm, boolean extend, String... expd) {
|
||||
gplx.core.primitives.Int_list rv = new gplx.core.primitives.Int_list();
|
||||
byte[][] actl = Php_preg_.Split(rv, Bry_.new_u8(src), src_bgn, src_end, Bry_.new_u8(dlm), extend);
|
||||
Gftest.Eq__ary(expd, String_.Ary(actl), "find_failed");
|
||||
}
|
||||
}
|
@ -17,7 +17,12 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.langs.phps.utls; import gplx.*; import gplx.langs.*; import gplx.langs.phps.*;
|
||||
public class Php_str_ {
|
||||
public static byte[] Substr(byte[] src, int bgn, int len) {return Bry_.Mid(src, bgn, bgn + len);}
|
||||
public static byte[] Substr(byte[] src, int bgn) {
|
||||
return src;
|
||||
}
|
||||
public static byte[] Substr(byte[] src, int bgn, int len) {
|
||||
return Bry_.Mid(src, bgn, bgn + len);
|
||||
}
|
||||
public static int Strspn_fwd__byte(byte[] src, byte find, int bgn, int max, int src_len) {
|
||||
if (max == -1) max = src_len;
|
||||
int rv = 0;
|
||||
|
@ -248,4 +248,14 @@ public class Xomw_block_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U
|
||||
}
|
||||
return Bry_split_.Rv__ok;
|
||||
}
|
||||
// private static final int
|
||||
// Para_stack_none = 0 // false
|
||||
// , Para_stack_bgn = 1 // <p>
|
||||
// , Para_stack_mid = 2 // </p><p>
|
||||
// ;
|
||||
// private static final byte
|
||||
// Mode_none = 0 // ''
|
||||
// , Mode_para = 1 // p
|
||||
// , Mode_pre = 2 // pre
|
||||
// ;
|
||||
}
|
||||
|
241
400_xowa/src/gplx/xowa/parsers/mws/quotes/Xomw_quote_wkr.java
Normal file
241
400_xowa/src/gplx/xowa/parsers/mws/quotes/Xomw_quote_wkr.java
Normal file
@ -0,0 +1,241 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.mws.quotes; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
|
||||
import gplx.langs.phps.utls.*;
|
||||
import gplx.xowa.parsers.htmls.*;
|
||||
import gplx.xowa.parsers.mws.utils.*; import gplx.xowa.parsers.uniqs.*;
|
||||
import gplx.core.primitives.*;
|
||||
public class Xomw_quote_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.UNSAFE: caching for repeated calls
|
||||
private final Bry_bfr bfr = Bry_bfr_.New();
|
||||
private final Bry_bfr tmp = Bry_bfr_.New();
|
||||
private final Int_list apos_pos_ary = new Int_list(32);
|
||||
public byte[] Do_all_quotes(byte[] src) {
|
||||
Bry_split_.Split(src, 0, src.length, Byte_ascii.Nl, Bool_.N, this); // PORTED.SPLIT: $lines = StringUtils::explode( "\n", $text );
|
||||
// PORTED: `$outtext .= $this->doQuotes( $line ) . "\n";` NOTE: "\n" is added below
|
||||
bfr.Del_by_1(); // $outtext = substr( $outtext, 0, -1 );
|
||||
apos_pos_ary.Clear();
|
||||
return bfr.To_bry_and_clear();
|
||||
}
|
||||
private static final byte[] Wtxt__apos = Bry_.new_a7("''");
|
||||
public int Split(byte[] src, int itm_bgn, int itm_end) {
|
||||
// PORTED: arr = preg_split("/(''+)/", text, -1, PREG_SPLIT_DELIM_CAPTURE);
|
||||
byte[][] arr = Php_preg_.Split(apos_pos_ary, src, itm_bgn, itm_end, Wtxt__apos, Bool_.Y);
|
||||
if (arr == null) {
|
||||
bfr.Add_mid(src, itm_bgn, itm_end).Add_byte_nl();
|
||||
return Bry_split_.Rv__ok;
|
||||
}
|
||||
int arr_len = arr.length;
|
||||
|
||||
// First, do some preliminary work. This may shift some apostrophes from
|
||||
// being mark-up to being text. It also counts the number of occurrences
|
||||
// of bold and italics mark-ups.
|
||||
int num_bold = 0;
|
||||
int num_italics = 0;
|
||||
for (int i = 1; i < arr_len; i += 2) {
|
||||
int apos_len = arr[i].length;
|
||||
// If there are ever four apostrophes, assume the first is supposed to
|
||||
// be text, and the remaining three constitute mark-up for bold text.
|
||||
// (bug 13227: ''''foo'''' turns into ' ''' foo ' ''')
|
||||
if (apos_len == 4) {
|
||||
arr[i - 1] = Bry_.Add(arr[i - 1], Byte_ascii.Apos_bry);
|
||||
arr[i] = Bry_.new_a7("'''");
|
||||
apos_len = 3;
|
||||
}
|
||||
else if (apos_len > 5) {
|
||||
// If there are more than 5 apostrophes in a row, assume they're all
|
||||
// text except for the last 5.
|
||||
// (bug 13227: ''''''foo'''''' turns into ' ''''' foo ' ''''')
|
||||
arr[i - 1] = Bry_.Add(arr[i - 1], Bry_.Repeat(Byte_ascii.Apos, apos_len - 5));
|
||||
arr[i] = Bry_.new_a7("'''''");
|
||||
apos_len = 5;
|
||||
}
|
||||
// Count the number of occurrences of bold and italics mark-ups.
|
||||
if (apos_len == 2) {
|
||||
num_italics++;
|
||||
}
|
||||
else if (apos_len == 3) {
|
||||
num_bold++;
|
||||
}
|
||||
else if (apos_len == 5) {
|
||||
num_italics++;
|
||||
num_bold++;
|
||||
}
|
||||
}
|
||||
|
||||
// If there is an odd number of both bold and italics, it is likely
|
||||
// that one of the bold ones was meant to be an apostrophe followed
|
||||
// by italics. Which one we cannot know for certain, but it is more
|
||||
// likely to be one that has a single-letter word before it.
|
||||
if ((num_bold % 2 == 1) && (num_italics % 2 == 1)) {
|
||||
int first_word_1 = -1;
|
||||
int first_word_n = -1;
|
||||
int first_space = -1;
|
||||
for (int i = 1; i < arr_len; i += 2) {
|
||||
if (arr[i].length == 3) {
|
||||
byte[] prv = arr[i - 1];
|
||||
byte[] x1 = Php_str_.Substr(prv, -1);
|
||||
byte[] x2 = Php_str_.Substr(prv, -2, 1);
|
||||
if (Bry_.Eq(x1, Byte_ascii.Space_bry)) {
|
||||
if (first_space == -1) {
|
||||
first_space = i;
|
||||
}
|
||||
}
|
||||
else if (Bry_.Eq(x2, Byte_ascii.Space_bry)) {
|
||||
first_word_1 = i;
|
||||
// if $firstsingleletterword is set, we don't
|
||||
// look at the other options, so we can bail early.
|
||||
break;
|
||||
}
|
||||
else {
|
||||
if (first_word_n == -1) {
|
||||
first_word_n = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If there is a single-letter word, use it!
|
||||
if (first_word_1 > -1) {
|
||||
arr[first_word_1] = Wtxt__apos;
|
||||
arr[first_word_1 - 1] = Bry_.Add(arr[first_word_1 - 1], Byte_ascii.Apos);
|
||||
}
|
||||
else if (first_word_n > -1) {
|
||||
// If not, but there's a multi-letter word, use that one.
|
||||
arr[first_word_n] = Wtxt__apos;
|
||||
arr[first_word_n - 1] = Bry_.Add(arr[first_word_n - 1], Byte_ascii.Apos);
|
||||
}
|
||||
else if (first_space > -1) {
|
||||
// ... otherwise use the first one that has neither.
|
||||
// (notice that it is possible for all three to be -1 if, for example,
|
||||
// there is only one pentuple-apostrophe in the line)
|
||||
arr[first_space] = Wtxt__apos;
|
||||
arr[first_space - 1] = Bry_.Add(arr[first_space - 1], Byte_ascii.Apos);
|
||||
}
|
||||
}
|
||||
|
||||
// Now let's actually convert our apostrophic mush to HTML!
|
||||
int state = State__empty;
|
||||
for (int j = 0; j < arr_len; j++) {
|
||||
if ((j % 2) == 0) {
|
||||
if (state == State__both) {
|
||||
tmp.Add(arr[j]);
|
||||
}
|
||||
else {
|
||||
bfr.Add(arr[j]);
|
||||
}
|
||||
}
|
||||
else {
|
||||
int apos_len = 2; // strlen(r);
|
||||
if (apos_len == 2) {
|
||||
if (state == State__i) {
|
||||
bfr.Add_str_a7("</i>");
|
||||
state = State__empty;
|
||||
}
|
||||
else if (state == State__bi) {
|
||||
bfr.Add_str_a7("</i>");
|
||||
state = State__b;
|
||||
}
|
||||
else if (state == State__ib) {
|
||||
bfr.Add_str_a7("</b></i><b>");
|
||||
state = State__b;
|
||||
}
|
||||
else if (state == State__both) {
|
||||
bfr.Add_str_a7("<b><i>").Add_bfr_and_preserve(tmp).Add_str_a7("</i>");
|
||||
state = State__b;
|
||||
}
|
||||
else { // state can be 'b' or ''
|
||||
bfr.Add_str_a7("<i>");
|
||||
state = state == State__b ? State__bi : State__i;
|
||||
}
|
||||
}
|
||||
else if (apos_len == 3) {
|
||||
if (state == State__b) {
|
||||
bfr.Add_str_a7("</b>");
|
||||
state = State__empty;
|
||||
}
|
||||
else if (state == State__bi) {
|
||||
bfr.Add_str_a7("</i></b><i>");
|
||||
state = State__i;
|
||||
}
|
||||
else if (state == State__ib) {
|
||||
bfr.Add_str_a7("</b>");
|
||||
state = State__i;
|
||||
}
|
||||
else if (state == State__both) {
|
||||
bfr.Add_str_a7("<i><b>").Add_bfr_and_preserve(tmp).Add_str_a7("</b>");
|
||||
state = State__i;
|
||||
}
|
||||
else { // state can be 'i' or ''
|
||||
bfr.Add_str_a7("<b>");
|
||||
state = state == State__i ? State__ib : State__b;
|
||||
}
|
||||
}
|
||||
else if (apos_len == 5) {
|
||||
if (state == State__b) {
|
||||
bfr.Add_str_a7("</b><i>");
|
||||
state = State__i;
|
||||
}
|
||||
else if (state == State__i) {
|
||||
bfr.Add_str_a7("</i><b>");
|
||||
state = State__b;
|
||||
}
|
||||
else if (state == State__bi) {
|
||||
bfr.Add_str_a7("</i></b>");
|
||||
state = State__empty;
|
||||
}
|
||||
else if (state == State__ib) {
|
||||
bfr.Add_str_a7("</b></i>");
|
||||
state = State__empty;
|
||||
}
|
||||
else if (state == State__both) {
|
||||
bfr.Add_str_a7("<i><b>' . buffer . '</b></i>");
|
||||
state = State__empty;
|
||||
}
|
||||
else { // (state == '')
|
||||
tmp.Clear();
|
||||
state = State__both;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Now close all remaining tags. Notice that the order is important.
|
||||
if (state == State__b || state == State__ib) {
|
||||
bfr.Add_str_a7("</b>");
|
||||
}
|
||||
if (state == State__i || state == State__bi || state == State__ib) {
|
||||
bfr.Add_str_a7("</i>");
|
||||
}
|
||||
if (state == State__bi) {
|
||||
bfr.Add_str_a7("</b>");
|
||||
}
|
||||
// There might be lonely ''''', so make sure we have a buffer
|
||||
if (state == State__both && tmp.Len_gt_0()) {
|
||||
bfr.Add_str_a7("<b><i>").Add_bfr_and_clear(tmp).Add_str_a7("</i></b>");
|
||||
}
|
||||
bfr.Add_byte_nl();
|
||||
return Bry_split_.Rv__ok;
|
||||
}
|
||||
private static final int
|
||||
State__empty = 0
|
||||
, State__b = 1
|
||||
, State__i = 2
|
||||
, State__bi = 3
|
||||
, State__ib = 4
|
||||
, State__both = 5
|
||||
;
|
||||
}
|
@ -0,0 +1,34 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012 gnosygnu@gmail.com
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package gplx.xowa.parsers.mws.quotes; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
|
||||
import org.junit.*;
|
||||
public class Xomw_quote_wkr__tst {
|
||||
private final Xomw_quote_wkr__fxt fxt = new Xomw_quote_wkr__fxt();
|
||||
@Test public void Basic() {
|
||||
fxt.Test__parse("a''b''c", "a<i>b</i>c");
|
||||
}
|
||||
}
|
||||
class Xomw_quote_wkr__fxt {
|
||||
// private final Xomw_parser_ctx ctx = new Xomw_parser_ctx();
|
||||
private final Xomw_quote_wkr wkr = new Xomw_quote_wkr();
|
||||
public void Test__parse(String src_str, String expd) {
|
||||
byte[] src_bry = Bry_.new_u8(src_str);
|
||||
byte[] actl = wkr.Do_all_quotes(src_bry);
|
||||
Tfds.Eq_str_lines(expd, String_.new_u8(actl), src_str);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user