Mw_parse.Apos: Add initial implementation

pull/620/head
gnosygnu 8 years ago
parent f8fcb553d5
commit 32a857f062

@ -17,7 +17,10 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.core.primitives; import gplx.*; import gplx.core.*;
public class Int_list {
private int capacity = 0;
private int[] ary = Int_.Ary_empty; private int ary_len, ary_max;
/** Creates a list with no preallocated storage (capacity 0, shared empty array). */
public Int_list() {
  this.ary = Int_.Ary_empty;
  this.capacity = 0;
}
/**
 * Creates a list whose backing array is preallocated to hold {@code capacity} items.
 *
 * @param capacity number of slots to preallocate; also the size Clear() shrinks back to
 */
public Int_list(int capacity) {
  this.capacity = capacity;
  this.ary = new int[capacity];
  // record the allocated size; otherwise ary_max stays 0 and the "new_len > ary_max" check in Add
  // treats the preallocated buffer as full on the very first call
  this.ary_max = capacity;
}
public void Add(int uid) {
int new_len = ary_len + 1;
if (new_len > ary_max) {
@ -32,9 +35,17 @@ public class Int_list {
/** Returns the number of items currently stored. */
public int Len() {
  return ary_len;
}
/** Returns the item stored at index {@code i}; the index is not checked against Len(). */
public int Get_at(int i) {
  int itm = ary[i];
  return itm;
}
/**
 * Removes all items.
 * The backing array is kept for reuse unless the list held more items than the configured capacity,
 * in which case it is shrunk back to that capacity (or to the shared empty array when capacity is 0).
 */
public void Clear() {
  // NOTE: no unconditional reset to Int_.Ary_empty here; doing so would throw away the buffer
  // that the check below is trying to retain
  if (ary_len > capacity) {
    ary = (capacity == 0) ? Int_.Ary_empty : new int[capacity];
  }
  ary_len = 0;
  // keep ary_max in sync with whatever buffer is retained so the next Add can reuse it
  ary_max = ary.length;
}
/** Returns a new array holding a copy of the first Len() items. */
public int[] To_ary() {
  int[] copy = new int[ary_len];
  System.arraycopy(ary, 0, copy, 0, ary_len);
  return copy;
}
public static Int_list new_(int... ary) {
Int_list rv = new Int_list();
int len = ary.length;

@ -0,0 +1,54 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.langs.phps.utls; import gplx.*; import gplx.langs.*; import gplx.langs.phps.*;
import gplx.core.primitives.*;
/** Byte-array helpers standing in for PHP preg_* calls. */
public class Php_preg_ {
  /**
   * Splits {@code src[src_bgn..src_end)} on a literal delimiter and keeps the delimiters in the result,
   * so the returned array alternates text / delimiter / text / ... / text.
   * A delimiter at the very start or end of the range yields an empty text element on that side.
   *
   * @param list    scratch list for boundary positions; it is cleared on entry and holds this call's
   *                boundaries on return
   * @param src     bytes to scan
   * @param src_bgn first position to scan (inclusive)
   * @param src_end last position to scan (exclusive)
   * @param dlm     literal delimiter; must not be empty
   * @param extend  if true, a matched delimiter is extended over any immediately following run of
   *                the delimiter's last byte (EX: "''" also captures "'''" and "'''''")
   * @return alternating text / delimiter pieces, or null when no delimiter was found
   */
  public static byte[][] Split(Int_list list, byte[] src, int src_bgn, int src_end, byte[] dlm, boolean extend) {
    // list is reused by callers across calls; drop stale positions so they are not mistaken for this call's boundaries
    list.Clear();

    // find delimiters
    int dlm_len = dlm.length;
    byte dlm_nth = dlm[dlm_len - 1];
    int i = src_bgn;
    list.Add(src_bgn);
    while (i < src_end) {
      int dlm_end = i + dlm_len;
      // "<=" so that a delimiter ending exactly at src_end is still detected
      if (dlm_end <= src_end && Bry_.Eq(src, i, dlm_end, dlm)) {
        if (extend) {
          // scan from the end of the matched delimiter, not from i; scanning from i makes no progress
          // when the delimiter's first byte differs from its last
          // NOTE(review): assumes Find_fwd_while returns the first position whose byte differs from dlm_nth (or src_end) -- confirm
          dlm_end = Bry_find_.Find_fwd_while(src, dlm_end, src_end, dlm_nth);
        }
        list.Add(i);
        list.Add(dlm_end);
        i = dlm_end;
      }
      else {
        i++;
      }
    }
    list.Add(src_end);

    // create brys: each adjacent pair of boundaries is one piece
    int rv_len = list.Len() - 1;
    if (rv_len == 1) return null; // only src_bgn and src_end were recorded; no delimiter found
    byte[][] rv = new byte[rv_len][];
    for (int j = 0; j < rv_len; j++) {
      rv[j] = Bry_.Mid(src, list.Get_at(j), list.Get_at(j + 1));
    }
    return rv;
  }
}

@ -0,0 +1,33 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.langs.phps.utls; import gplx.*; import gplx.langs.*; import gplx.langs.phps.*;
import org.junit.*; import gplx.core.tests.*;
public class Php_preg___tst {
  private final Php_preg___fxt fxt = new Php_preg___fxt();
  // non-extending split: every "''" is returned as its own element between the text pieces
  @Test public void Split() {
    String[] expd = new String[] {"a", "''", "b", "''", "c"};
    fxt.Test__split("a''b''c", "''", Bool_.N, expd);
  }
}
// test fixture: runs Php_preg_.Split on String input and compares the pieces as Strings
class Php_preg___fxt {
  // splits over the entire source String
  public void Test__split(String src, String dlm, boolean extend, String... expd) {
    int src_len = String_.Len(src);
    Test__split(src, 0, src_len, dlm, extend, expd);
  }
  // splits over the given range of the source
  public void Test__split(String src, int src_bgn, int src_end, String dlm, boolean extend, String... expd) {
    gplx.core.primitives.Int_list pos_list = new gplx.core.primitives.Int_list();
    byte[] src_bry = Bry_.new_u8(src);
    byte[] dlm_bry = Bry_.new_u8(dlm);
    byte[][] actl = Php_preg_.Split(pos_list, src_bry, src_bgn, src_end, dlm_bry, extend);
    String[] actl_strs = String_.Ary(actl);
    Gftest.Eq__ary(expd, actl_strs, "find_failed");
  }
}

@ -17,7 +17,12 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.langs.phps.utls; import gplx.*; import gplx.langs.*; import gplx.langs.phps.*;
public class Php_str_ {
/**
 * Port of PHP {@code substr(string, start)}: returns everything from {@code bgn} to the end of {@code src}.
 * A negative {@code bgn} counts back from the end. EX: Substr("abc", -1) -> "c"
 *
 * @param src bytes to slice
 * @param bgn start offset; negative values are relative to the end of src
 * @return the selected bytes; empty when bgn is at or past the end of src
 */
public static byte[] Substr(byte[] src, int bgn) {
  return Substr(src, bgn, src.length);
}
/**
 * Port of PHP {@code substr(string, start, length)}.
 * EX: Substr("abc", -2, 1) -> "b"; Substr("abc", 1, -1) -> "b"; Substr("a", -2, 1) -> "a"
 *
 * @param src bytes to slice
 * @param bgn start offset; negative values are relative to the end of src and are clamped to 0
 * @param len number of bytes to take; a negative value means "stop that many bytes before the end";
 *            a value running past the end is clamped to the end
 * @return the selected bytes; empty when the resulting range is empty or bgn is past the end
 */
public static byte[] Substr(byte[] src, int bgn, int len) {
  int src_len = src.length;
  if (bgn < 0) {
    bgn += src_len;       // negative start is relative to end
    if (bgn < 0) bgn = 0; // start before beginning of String is clamped; EX: ("a", -2, 1)
  }
  // use long so that bgn + len cannot overflow
  long end = len < 0 ? (long)src_len + len : (long)bgn + len;
  if (end > src_len) end = src_len;
  if (end <= bgn) return Bry_.Empty;
  return Bry_.Mid(src, bgn, (int)end);
}
public static int Strspn_fwd__byte(byte[] src, byte find, int bgn, int max, int src_len) {
if (max == -1) max = src_len;
int rv = 0;

@ -248,4 +248,14 @@ public class Xomw_block_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U
}
return Bry_split_.Rv__ok;
}
// private static final int
// Para_stack_none = 0 // false
// , Para_stack_bgn = 1 // <p>
// , Para_stack_mid = 2 // </p><p>
// ;
// private static final byte
// Mode_none = 0 // ''
// , Mode_para = 1 // p
// , Mode_pre = 2 // pre
// ;
}

@ -0,0 +1,241 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.mws.quotes; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
import gplx.langs.phps.utls.*;
import gplx.xowa.parsers.htmls.*;
import gplx.xowa.parsers.mws.utils.*; import gplx.xowa.parsers.uniqs.*;
import gplx.core.primitives.*;
/**
 * Converts wikitext apostrophe runs ('' / ''' / ''''') into {@code <i>} / {@code <b>} HTML, line by line.
 * The source is split on newlines; each line is handed to {@link #Split(byte[], int, int)} which
 * appends the converted line plus "\n" to an internal buffer.
 */
public class Xomw_quote_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.UNSAFE: caching for repeated calls
  private final Bry_bfr bfr = Bry_bfr_.New(); // accumulated output for all lines
  private final Bry_bfr tmp = Bry_bfr_.New(); // text seen after a ''''' whose meaning (<b><i> vs <i><b>) is not yet known
  private final Int_list apos_pos_ary = new Int_list(32); // scratch positions for Php_preg_.Split

  /**
   * Converts all apostrophe markup in {@code src}.
   *
   * @param src wikitext; lines are separated by "\n"
   * @return src with apostrophe markup replaced by HTML tags
   */
  public byte[] Do_all_quotes(byte[] src) {
    Bry_split_.Split(src, 0, src.length, Byte_ascii.Nl, Bool_.N, this); // PORTED.SPLIT: $lines = StringUtils::explode( "\n", $text );
    // PORTED: `$outtext .= $this->doQuotes( $line ) . "\n";` NOTE: "\n" is added below
    if (bfr.Len_gt_0()) // guard: nothing to trim if no line was emitted
      bfr.Del_by_1(); // $outtext = substr( $outtext, 0, -1 );
    apos_pos_ary.Clear();
    tmp.Clear(); // do not carry a preserved ''''' buffer into the next call
    return bfr.To_bry_and_clear();
  }
  private static final byte[]
    Wtxt__apos   = Bry_.new_a7("''")
  , Wtxt__apos_3 = Bry_.new_a7("'''")
  , Wtxt__apos_5 = Bry_.new_a7("'''''")
  ;

  /**
   * Converts one line, {@code src[itm_bgn..itm_end)}, and appends it plus "\n" to the output buffer.
   *
   * @return always Bry_split_.Rv__ok
   */
  public int Split(byte[] src, int itm_bgn, int itm_end) {
    // positions from the previous line must not leak into this one
    apos_pos_ary.Clear();
    // PORTED: arr = preg_split("/(''+)/", text, -1, PREG_SPLIT_DELIM_CAPTURE);
    // arr alternates: even index = text; odd index = run of apostrophes
    byte[][] arr = Php_preg_.Split(apos_pos_ary, src, itm_bgn, itm_end, Wtxt__apos, Bool_.Y);
    if (arr == null) { // no apostrophe runs; copy line as is
      bfr.Add_mid(src, itm_bgn, itm_end).Add_byte_nl();
      return Bry_split_.Rv__ok;
    }
    int arr_len = arr.length;

    // First, do some preliminary work. This may shift some apostrophes from
    // being mark-up to being text. It also counts the number of occurrences
    // of bold and italics mark-ups.
    int num_bold = 0;
    int num_italics = 0;
    for (int i = 1; i < arr_len; i += 2) {
      int apos_len = arr[i].length;
      // If there are ever four apostrophes, assume the first is supposed to
      // be text, and the remaining three constitute mark-up for bold text.
      // (bug 13227: ''''foo'''' turns into ' ''' foo ' ''')
      if (apos_len == 4) {
        arr[i - 1] = Bry_.Add(arr[i - 1], Byte_ascii.Apos_bry);
        arr[i] = Wtxt__apos_3;
        apos_len = 3;
      }
      else if (apos_len > 5) {
        // If there are more than 5 apostrophes in a row, assume they're all
        // text except for the last 5.
        // (bug 13227: ''''''foo'''''' turns into ' ''''' foo ' ''''')
        arr[i - 1] = Bry_.Add(arr[i - 1], Bry_.Repeat(Byte_ascii.Apos, apos_len - 5));
        arr[i] = Wtxt__apos_5;
        apos_len = 5;
      }
      // Count the number of occurrences of bold and italics mark-ups.
      if (apos_len == 2) {
        num_italics++;
      }
      else if (apos_len == 3) {
        num_bold++;
      }
      else if (apos_len == 5) {
        num_italics++;
        num_bold++;
      }
    }

    // If there is an odd number of both bold and italics, it is likely
    // that one of the bold ones was meant to be an apostrophe followed
    // by italics. Which one we cannot know for certain, but it is more
    // likely to be one that has a single-letter word before it.
    if ((num_bold % 2 == 1) && (num_italics % 2 == 1)) {
      Demote_one_bold(arr);
    }

    // Now let's actually convert our apostrophic mush to HTML!
    Write_html(arr);
    bfr.Add_byte_nl();
    return Bry_split_.Rv__ok;
  }

  // Picks one ''' and rewrites it as a literal apostrophe followed by ''.
  // Preference: preceded by a single-letter word > preceded by a multi-letter word > preceded by a space.
  private static void Demote_one_bold(byte[][] arr) {
    int arr_len = arr.length;
    int first_word_1 = -1;
    int first_word_n = -1;
    int first_space = -1;
    for (int i = 1; i < arr_len; i += 2) {
      if (arr[i].length != 3) continue;
      byte[] prv = arr[i - 1];
      int prv_len = prv.length;
      // PORTED: $x1 = substr( $arr[$i - 1], -1 ); $x2 = substr( $arr[$i - 1], -2, 1 );
      // inspected directly as bytes; for a 1-byte prv, PHP's x2 equals x1, which the first branch already handles
      boolean x1_is_space = prv_len > 0 && prv[prv_len - 1] == Byte_ascii.Space;
      boolean x2_is_space = prv_len > 1 && prv[prv_len - 2] == Byte_ascii.Space;
      if (x1_is_space) {
        if (first_space == -1) {
          first_space = i;
        }
      }
      else if (x2_is_space) {
        first_word_1 = i;
        // if $firstsingleletterword is set, we don't
        // look at the other options, so we can bail early.
        break;
      }
      else {
        if (first_word_n == -1) {
          first_word_n = i;
        }
      }
    }

    // If there is a single-letter word, use it!
    int chosen = first_word_1;
    // If not, but there's a multi-letter word, use that one.
    if (chosen == -1) chosen = first_word_n;
    // ... otherwise use the first one that has neither.
    // (notice that it is possible for all three to be -1 if, for example,
    // there is only one pentuple-apostrophe in the line)
    if (chosen == -1) chosen = first_space;
    if (chosen > -1) {
      arr[chosen] = Wtxt__apos;
      arr[chosen - 1] = Bry_.Add(arr[chosen - 1], Byte_ascii.Apos_bry);
    }
  }

  // Walks the alternating text / apostrophe-run array and appends the HTML for it to bfr.
  private void Write_html(byte[][] arr) {
    int arr_len = arr.length;
    int state = State__empty;
    for (int j = 0; j < arr_len; j++) {
      byte[] itm = arr[j];
      if ((j % 2) == 0) { // text: goes to tmp while the tag order of a preceding ''''' is still undecided
        if (state == State__both) {
          tmp.Add(itm);
        }
        else {
          bfr.Add(itm);
        }
        continue;
      }

      int apos_len = itm.length; // PORTED: strlen( $r )
      if (apos_len == 2) {
        if (state == State__i) {
          bfr.Add_str_a7("</i>");
          state = State__empty;
        }
        else if (state == State__bi) {
          bfr.Add_str_a7("</i>");
          state = State__b;
        }
        else if (state == State__ib) {
          bfr.Add_str_a7("</b></i><b>");
          state = State__b;
        }
        else if (state == State__both) {
          bfr.Add_str_a7("<b><i>").Add_bfr_and_preserve(tmp).Add_str_a7("</i>");
          state = State__b;
        }
        else { // state can be 'b' or ''
          bfr.Add_str_a7("<i>");
          state = state == State__b ? State__bi : State__i;
        }
      }
      else if (apos_len == 3) {
        if (state == State__b) {
          bfr.Add_str_a7("</b>");
          state = State__empty;
        }
        else if (state == State__bi) {
          bfr.Add_str_a7("</i></b><i>");
          state = State__i;
        }
        else if (state == State__ib) {
          bfr.Add_str_a7("</b>");
          state = State__i;
        }
        else if (state == State__both) {
          bfr.Add_str_a7("<i><b>").Add_bfr_and_preserve(tmp).Add_str_a7("</b>");
          state = State__i;
        }
        else { // state can be 'i' or ''
          bfr.Add_str_a7("<b>");
          state = state == State__i ? State__ib : State__b;
        }
      }
      else if (apos_len == 5) {
        if (state == State__b) {
          bfr.Add_str_a7("</b><i>");
          state = State__i;
        }
        else if (state == State__i) {
          bfr.Add_str_a7("</i><b>");
          state = State__b;
        }
        else if (state == State__bi) {
          bfr.Add_str_a7("</i></b>");
          state = State__empty;
        }
        else if (state == State__ib) {
          bfr.Add_str_a7("</b></i>");
          state = State__empty;
        }
        else if (state == State__both) {
          // PORTED: $output .= '<i><b>' . $buffer . '</b></i>';
          bfr.Add_str_a7("<i><b>").Add_bfr_and_preserve(tmp).Add_str_a7("</b></i>");
          state = State__empty;
        }
        else { // (state == '')
          tmp.Clear();
          state = State__both;
        }
      }
    }

    // Now close all remaining tags. Notice that the order is important.
    if (state == State__b || state == State__ib) {
      bfr.Add_str_a7("</b>");
    }
    if (state == State__i || state == State__bi || state == State__ib) {
      bfr.Add_str_a7("</i>");
    }
    if (state == State__bi) {
      bfr.Add_str_a7("</b>");
    }
    // There might be lonely ''''', so make sure we have a buffer
    if (state == State__both && tmp.Len_gt_0()) {
      bfr.Add_str_a7("<b><i>").Add_bfr_and_clear(tmp).Add_str_a7("</i></b>");
    }
  }

  // open-tag state; names mirror the PHP state strings
  private static final int
    State__empty = 0 // ''     : no open tags
  , State__b = 1     // 'b'    : <b>
  , State__i = 2     // 'i'    : <i>
  , State__bi = 3    // 'bi'   : <b><i>
  , State__ib = 4    // 'ib'   : <i><b>
  , State__both = 5  // 'both' : ''''' seen; tag order not yet decided
  ;
}

@ -0,0 +1,34 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.parsers.mws.quotes; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
import org.junit.*;
public class Xomw_quote_wkr__tst {
  private final Xomw_quote_wkr__fxt fxt = new Xomw_quote_wkr__fxt();
  // a pair of '' wraps the enclosed text in <i>
  @Test public void Basic() {
    String src = "a''b''c";
    String expd = "a<i>b</i>c";
    fxt.Test__parse(src, expd);
  }
}
// test fixture: runs Xomw_quote_wkr.Do_all_quotes on a String and compares the result line by line
class Xomw_quote_wkr__fxt {
  // private final Xomw_parser_ctx ctx = new Xomw_parser_ctx();
  private final Xomw_quote_wkr wkr = new Xomw_quote_wkr();
  public void Test__parse(String src_str, String expd) {
    byte[] actl_bry = wkr.Do_all_quotes(Bry_.new_u8(src_str));
    String actl = String_.new_u8(actl_bry);
    Tfds.Eq_str_lines(expd, actl, src_str);
  }
}
Loading…
Cancel
Save