From 32a857f06240bd9e63d67f75f045a422e43ba6ad Mon Sep 17 00:00:00 2001
From: gnosygnu
Date: Fri, 13 Jan 2017 13:57:25 -0500
Subject: [PATCH] Mw_parse.Apos: Add initial implementation

---
 .../src/gplx/core/primitives/Int_list.java    |  15 +-
 .../src/gplx/langs/phps/utls/Php_preg_.java   |  54 ++++
 .../gplx/langs/phps/utls/Php_preg___tst.java  |  33 +++
 .../src/gplx/langs/phps/utls/Php_str_.java    |   7 +-
 .../parsers/mws/blocks/Xomw_block_wkr.java    |  10 +
 .../parsers/mws/quotes/Xomw_quote_wkr.java    | 241 ++++++++++++++++++
 .../mws/quotes/Xomw_quote_wkr__tst.java       |  34 +++
 7 files changed, 391 insertions(+), 3 deletions(-)
 create mode 100644 400_xowa/src/gplx/langs/phps/utls/Php_preg_.java
 create mode 100644 400_xowa/src/gplx/langs/phps/utls/Php_preg___tst.java
 create mode 100644 400_xowa/src/gplx/xowa/parsers/mws/quotes/Xomw_quote_wkr.java
 create mode 100644 400_xowa/src/gplx/xowa/parsers/mws/quotes/Xomw_quote_wkr__tst.java

diff --git a/100_core/src/gplx/core/primitives/Int_list.java b/100_core/src/gplx/core/primitives/Int_list.java
index 90108ac19..1adac6a53 100644
--- a/100_core/src/gplx/core/primitives/Int_list.java
+++ b/100_core/src/gplx/core/primitives/Int_list.java
@@ -17,7 +17,10 @@ along with this program. If not, see .
 */
 package gplx.core.primitives; import gplx.*; import gplx.core.*;
 public class Int_list {
+	private int capacity = 0;	// preferred backing-array size; Clear() shrinks back to it after the list has grown past it
 	private int[] ary = Int_.Ary_empty; private int ary_len, ary_max;
+	public Int_list() {this.capacity = 0; this.ary = Int_.Ary_empty;}	// no preallocation: ary_max stays 0
+	public Int_list(int capacity) {this.capacity = capacity; this.ary = new int[capacity]; this.ary_max = capacity;}	// ary_max must match the preallocated array or Add's "new_len > ary_max" check never sees it
 	public void Add(int uid) {
 		int new_len = ary_len + 1;
 		if (new_len > ary_max) {
@@ -32,9 +35,17 @@ public class Int_list {
 	public int Len() {return ary_len;}
 	public int Get_at(int i) {return ary[i];}
 	public void Clear() {
-		ary = Int_.Ary_empty;
+		if (ary_len > capacity) {	// more items than the preferred capacity were added: drop the grown array
+			ary = (capacity == 0) ? Int_.Ary_empty : new int[capacity];
+		}
-		ary_len = ary_max = 0;
+		ary_len = 0; ary_max = ary.length;	// keep ary_max in sync with whichever array is retained; NOTE(review): assumes Add (body outside this hunk) only writes below ary_max - confirm
 	}
+	public int[] To_ary() {	// returns a new array holding only the ary_len items added so far
+		int[] rv = new int[ary_len];
+		for (int i = 0; i < ary_len; i++)
+			rv[i] = ary[i];
+		return rv;
+	}
 	public static Int_list new_(int... ary) {
 		Int_list rv = new Int_list();
 		int len = ary.length;
diff --git a/400_xowa/src/gplx/langs/phps/utls/Php_preg_.java b/400_xowa/src/gplx/langs/phps/utls/Php_preg_.java
new file mode 100644
index 000000000..d2a952d9c
--- /dev/null
+++ b/400_xowa/src/gplx/langs/phps/utls/Php_preg_.java
@@ -0,0 +1,54 @@
+/*
+XOWA: the XOWA Offline Wiki Application
+Copyright (C) 2012 gnosygnu@gmail.com
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see .
+*/
+package gplx.langs.phps.utls; import gplx.*; import gplx.langs.*; import gplx.langs.phps.*;
+import gplx.core.primitives.*;
+public class Php_preg_ {	// stand-in for PHP preg_split(..., PREG_SPLIT_DELIM_CAPTURE) limited to a literal, non-empty delimiter
+	public static byte[][] Split(Int_list list, byte[] src, int src_bgn, int src_end, byte[] dlm, boolean extend) {	// returns text / delimiter / text / ... slices of src[src_bgn..src_end), or null when no delimiter occurs; list is scratch storage for the slice offsets and is overwritten
+		// find delimiters
+		int dlm_len = dlm.length;
+		byte dlm_nth = dlm[dlm_len - 1];	// last delimiter byte; with extend, a match also swallows every following repeat of this byte
+		int i = src_bgn;
+		list.Clear(); list.Add(src_bgn);	// Clear first: callers reuse one list across calls, and stale offsets would corrupt the slices built below
+		while (true) {
+			if (i >= src_end) break;
+			int dlm_end = i + dlm_len;
+			if (dlm_end <= src_end && Bry_.Eq(src, i, dlm_end, dlm)) {	// "<=" so that a delimiter ending exactly at src_end is still matched
+				if (extend) {
+					dlm_end = Bry_find_.Find_fwd_while(src, dlm_end, src_end, dlm_nth);	// scan from the end of the match; scanning from i would stall on a delimiter whose first byte differs from dlm_nth
+				}
+				list.Add(i);
+				list.Add(dlm_end);
+				i = dlm_end;
+			}
+			else
+				i++;
+		}
+		list.Add(src_end);
+
+		// create brys
+		int rv_len = list.Len() - 1;	// list holds [src_bgn, (dlm_bgn, dlm_end)*, src_end]; each adjacent pair of offsets is one slice
+		if (rv_len == 1) return null;	// only src_bgn and src_end were recorded, so no delimiter was found
+		byte[][] rv = new byte[rv_len][];
+		for (i = 0; i < rv_len; i += 2) {
+			rv[i    ] = Bry_.Mid(src, list.Get_at(i + 0), list.Get_at(i + 1));	// even index: text between delimiters
+			if (i + 1 == rv_len) break;
+			rv[i + 1] = Bry_.Mid(src, list.Get_at(i + 1), list.Get_at(i + 2));	// odd index: the delimiter run itself
+		}
+		return rv;
+	}
+}
diff --git a/400_xowa/src/gplx/langs/phps/utls/Php_preg___tst.java b/400_xowa/src/gplx/langs/phps/utls/Php_preg___tst.java
new file mode 100644
index 000000000..4e6ffc103
--- /dev/null
+++ b/400_xowa/src/gplx/langs/phps/utls/Php_preg___tst.java
@@ -0,0 +1,33 @@
+/*
+XOWA: the XOWA Offline Wiki Application
+Copyright (C) 2012 gnosygnu@gmail.com
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program.
If not, see .
+*/
+package gplx.langs.phps.utls; import gplx.*; import gplx.langs.*; import gplx.langs.phps.*;
+import org.junit.*; import gplx.core.tests.*;
+public class Php_preg___tst {	// unit tests for Php_preg_.Split
+	private final Php_preg___fxt fxt = new Php_preg___fxt();
+	@Test  public void Split() {	// non-extended split: text and delimiter slices alternate in the result
+		fxt.Test__split("a''b''c", "''", Bool_.N, "a", "''", "b", "''", "c");
+	}
+}
+class Php_preg___fxt {	// fixture: converts String arguments to byte arrays and compares the resulting slices
+	public void Test__split(String src, String dlm, boolean extend, String... expd) {Test__split(src, 0, String_.Len(src), dlm, extend, expd);}	// whole-string convenience overload
+	public void Test__split(String src, int src_bgn, int src_end, String dlm, boolean extend, String... expd) {
+		byte[] src_bry = Bry_.new_u8(src), dlm_bry = Bry_.new_u8(dlm);
+		byte[][] actl_brys = Php_preg_.Split(new gplx.core.primitives.Int_list(), src_bry, src_bgn, src_end, dlm_bry, extend);	// a fresh scratch list is passed on every call
+		Gftest.Eq__ary(expd, String_.Ary(actl_brys), "find_failed");
+	}
+}
diff --git a/400_xowa/src/gplx/langs/phps/utls/Php_str_.java b/400_xowa/src/gplx/langs/phps/utls/Php_str_.java
index be95e004f..89bf40717 100644
--- a/400_xowa/src/gplx/langs/phps/utls/Php_str_.java
+++ b/400_xowa/src/gplx/langs/phps/utls/Php_str_.java
@@ -17,7 +17,12 @@ along with this program. If not, see .
*/
 package gplx.langs.phps.utls; import gplx.*; import gplx.langs.*; import gplx.langs.phps.*;
 public class Php_str_ {
-	public static byte[] Substr(byte[] src, int bgn, int len) {return Bry_.Mid(src, bgn, bgn + len);}
+	public static byte[] Substr(byte[] src, int bgn) {return Substr(src, bgn, src.length);}	// PORTED: substr($src, $bgn); returns everything from bgn to the end (the 3-arg overload clamps the length)
+	public static byte[] Substr(byte[] src, int bgn, int len) {	// PORTED: substr($src, $bgn, $len); negative bgn counts back from the end, negative len drops that many bytes from the end
+		int src_len = src.length, pos = (bgn < 0) ? Math.max(src_len + bgn, 0) : Math.min(bgn, src_len);	// clamp the start into [0, src_len] so Bry_.Mid never receives an out-of-range offset
+		int end = (len < 0) ? Math.max(src_len + len, pos) : (len < src_len - pos) ? pos + len : src_len;	// never before pos, never past src_len; written without "pos + len" overflow
+		return Bry_.Mid(src, pos, end);
+	}
 	public static int Strspn_fwd__byte(byte[] src, byte find, int bgn, int max, int src_len) {
 		if (max == -1) max = src_len;
 		int rv = 0;
diff --git a/400_xowa/src/gplx/xowa/parsers/mws/blocks/Xomw_block_wkr.java b/400_xowa/src/gplx/xowa/parsers/mws/blocks/Xomw_block_wkr.java
index 3665e4ead..14727a688 100644
--- a/400_xowa/src/gplx/xowa/parsers/mws/blocks/Xomw_block_wkr.java
+++ b/400_xowa/src/gplx/xowa/parsers/mws/blocks/Xomw_block_wkr.java
@@ -248,4 +248,14 @@ public class Xomw_block_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U
 		}
 		return Bry_split_.Rv__ok;
 	}
+//	private static final int
+//		  Para_stack_none = 0 // false
+//		, Para_stack_bgn = 1 //

+// , Para_stack_mid = 2 //

+//		;
+//	private static final byte
+//		  Mode_none = 0 // ''
+//		, Mode_para = 1 // p
+//		, Mode_pre = 2 // pre
+//		;
 }
diff --git a/400_xowa/src/gplx/xowa/parsers/mws/quotes/Xomw_quote_wkr.java b/400_xowa/src/gplx/xowa/parsers/mws/quotes/Xomw_quote_wkr.java
new file mode 100644
index 000000000..56b0cd9f3
--- /dev/null
+++ b/400_xowa/src/gplx/xowa/parsers/mws/quotes/Xomw_quote_wkr.java
@@ -0,0 +1,241 @@
+/*
+XOWA: the XOWA Offline Wiki Application
+Copyright (C) 2012 gnosygnu@gmail.com
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see .
+*/
+package gplx.xowa.parsers.mws.quotes; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
+import gplx.langs.phps.utls.*;
+import gplx.xowa.parsers.htmls.*;
+import gplx.xowa.parsers.mws.utils.*; import gplx.xowa.parsers.uniqs.*;
+import gplx.core.primitives.*;
+public class Xomw_quote_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.UNSAFE: caching for repeated calls
+	private final Bry_bfr bfr = Bry_bfr_.New();	// output for all lines of the current Do_all_quotes call
+	private final Bry_bfr tmp = Bry_bfr_.New();	// text seen after a ''''' whose bold / italic nesting order is not yet known (PHP: $buffer)
+	private final Int_list apos_pos_ary = new Int_list(32);	// scratch offsets for Php_preg_.Split
+	public byte[] Do_all_quotes(byte[] src) {	// PORTED: Parser::doAllQuotes; converts '' / ''' / ''''' mark-up to <i> / <b> one line at a time
+		Bry_split_.Split(src, 0, src.length, Byte_ascii.Nl, Bool_.N, this);	// PORTED.SPLIT: $lines = StringUtils::explode( "\n", $text );
+		// PORTED: `$outtext .= $this->doQuotes( $line ) . "\n";` NOTE: "\n" is added below
+		if (bfr.Len_gt_0()) bfr.Del_by_1(); // $outtext = substr( $outtext, 0, -1 ); guarded so that nothing is deleted when no line was written
+		apos_pos_ary.Clear();
+		return bfr.To_bry_and_clear();
+	}
+	private static final byte[] Wtxt__apos = Bry_.new_a7("''");
+	public int Split(byte[] src, int itm_bgn, int itm_end) {	// PORTED: Parser::doQuotes for the line src[itm_bgn..itm_end); appends the converted line and a "\n" to bfr
+		// PORTED: arr = preg_split("/(''+)/", text, -1, PREG_SPLIT_DELIM_CAPTURE);
+		apos_pos_ary.Clear(); byte[][] arr = Php_preg_.Split(apos_pos_ary, src, itm_bgn, itm_end, Wtxt__apos, Bool_.Y);	// Clear first: the list is shared by every line and must not carry offsets from the previous one
+		if (arr == null) {	// no apostrophe run in this line: copy it through unchanged
+			bfr.Add_mid(src, itm_bgn, itm_end).Add_byte_nl();
+			return Bry_split_.Rv__ok;
+		}
+		int arr_len = arr.length;	// even indexes hold text; odd indexes hold apostrophe runs
+
+		// First, do some preliminary work. This may shift some apostrophes from
+		// being mark-up to being text. It also counts the number of occurrences
+		// of bold and italics mark-ups.
+		int num_bold = 0;
+		int num_italics = 0;
+		for (int i = 1; i < arr_len; i += 2) {
+			int apos_len = arr[i].length;
+			// If there are ever four apostrophes, assume the first is supposed to
+			// be text, and the remaining three constitute mark-up for bold text.
+			// (bug 13227: ''''foo'''' turns into ' ''' foo ' ''')
+			if (apos_len == 4) {
+				arr[i - 1] = Bry_.Add(arr[i - 1], Byte_ascii.Apos_bry);
+				arr[i] = Bry_.new_a7("'''");
+				apos_len = 3;
+			}
+			else if (apos_len > 5) {
+				// If there are more than 5 apostrophes in a row, assume they're all
+				// text except for the last 5.
+				// (bug 13227: ''''''foo'''''' turns into ' ''''' foo ' ''''')
+				arr[i - 1] = Bry_.Add(arr[i - 1], Bry_.Repeat(Byte_ascii.Apos, apos_len - 5));
+				arr[i] = Bry_.new_a7("'''''");
+				apos_len = 5;
+			}
+			// Count the number of occurrences of bold and italics mark-ups.
+			if (apos_len == 2) {
+				num_italics++;
+			}
+			else if (apos_len == 3) {
+				num_bold++;
+			}
+			else if (apos_len == 5) {
+				num_italics++;
+				num_bold++;
+			}
+		}
+
+		// If there is an odd number of both bold and italics, it is likely
+		// that one of the bold ones was meant to be an apostrophe followed
+		// by italics. Which one we cannot know for certain, but it is more
+		// likely to be one that has a single-letter word before it.
+		if ((num_bold % 2 == 1) && (num_italics % 2 == 1)) {
+			int first_word_1 = -1;
+			int first_word_n = -1;
+			int first_space = -1;
+			for (int i = 1; i < arr_len; i += 2) {
+				if (arr[i].length == 3) {
+					byte[] prv = arr[i - 1]; int prv_len = prv.length;
+					boolean x1_is_space = prv_len > 0 && prv[prv_len - 1] == Byte_ascii.Space;	// PORTED: $x1 = substr( $arr[$i - 1], -1 ); read from the bytes directly so an empty prv is safe
+					boolean x2_is_space = prv_len > 1 && prv[prv_len - 2] == Byte_ascii.Space;	// PORTED: $x2 = substr( $arr[$i - 1], -2, 1 ); for a 1-byte prv PHP yields that same byte, which the x1 test has already rejected
+					if (x1_is_space) {
+						if (first_space == -1) {
+							first_space = i;
+						}
+					}
+					else if (x2_is_space) {
+						first_word_1 = i;
+						// if $firstsingleletterword is set, we don't
+						// look at the other options, so we can bail early.
+						break;
+					}
+					else {
+						if (first_word_n == -1) {
+							first_word_n = i;
+						}
+					}
+				}
+			}
+
+			// If there is a single-letter word, use it!
+			if (first_word_1 > -1) {
+				arr[first_word_1] = Wtxt__apos;
+				arr[first_word_1 - 1] = Bry_.Add(arr[first_word_1 - 1], Byte_ascii.Apos);
+			}
+			else if (first_word_n > -1) {
+				// If not, but there's a multi-letter word, use that one.
+				arr[first_word_n] = Wtxt__apos;
+				arr[first_word_n - 1] = Bry_.Add(arr[first_word_n - 1], Byte_ascii.Apos);
+			}
+			else if (first_space > -1) {
+				// ... otherwise use the first one that has neither.
+				// (notice that it is possible for all three to be -1 if, for example,
+				// there is only one pentuple-apostrophe in the line)
+				arr[first_space] = Wtxt__apos;
+				arr[first_space - 1] = Bry_.Add(arr[first_space - 1], Byte_ascii.Apos);
+			}
+		}
+
+		// Now let's actually convert our apostrophic mush to HTML!
+		int state = State__empty;
+		for (int j = 0; j < arr_len; j++) {
+			if ((j % 2) == 0) {
+				if (state == State__both) {
+					tmp.Add(arr[j]);
+				}
+				else {
+					bfr.Add(arr[j]);
+				}
+			}
+			else {
+				int apos_len = arr[j].length; // strlen(r); the run length (2, 3 or 5 after the normalization above) selects the branch below
+				if (apos_len == 2) {
+					if (state == State__i) {
+						bfr.Add_str_a7("</i>");
+						state = State__empty;
+					}
+					else if (state == State__bi) {
+						bfr.Add_str_a7("</i>");
+						state = State__b;
+					}
+					else if (state == State__ib) {
+						bfr.Add_str_a7("</b></i><b>");
+						state = State__b;
+					}
+					else if (state == State__both) {
+						bfr.Add_str_a7("<b><i>").Add_bfr_and_preserve(tmp).Add_str_a7("</i>");
+						state = State__b;
+					}
+					else { // state can be 'b' or ''
+						bfr.Add_str_a7("<i>");
+						state = state == State__b ? State__bi : State__i;
+					}
+				}
+				else if (apos_len == 3) {
+					if (state == State__b) {
+						bfr.Add_str_a7("</b>");
+						state = State__empty;
+					}
+					else if (state == State__bi) {
+						bfr.Add_str_a7("</i></b><i>");
+						state = State__i;
+					}
+					else if (state == State__ib) {
+						bfr.Add_str_a7("</b>");
+						state = State__i;
+					}
+					else if (state == State__both) {
+						bfr.Add_str_a7("<i><b>").Add_bfr_and_preserve(tmp).Add_str_a7("</b>");
+						state = State__i;
+					}
+					else { // state can be 'i' or ''
+						bfr.Add_str_a7("<b>");
+						state = state == State__i ? State__ib : State__b;
+					}
+				}
+				else if (apos_len == 5) {
+					if (state == State__b) {
+						bfr.Add_str_a7("</b><i>");
+						state = State__i;
+					}
+					else if (state == State__i) {
+						bfr.Add_str_a7("</i><b>");
+						state = State__b;
+					}
+					else if (state == State__bi) {
+						bfr.Add_str_a7("</i></b>");
+						state = State__empty;
+					}
+					else if (state == State__ib) {
+						bfr.Add_str_a7("</b></i>");
+						state = State__empty;
+					}
+					else if (state == State__both) {
+						bfr.Add_str_a7("<i><b>").Add_bfr_and_preserve(tmp).Add_str_a7("</b></i>");	// PORTED: '<i><b>' . $buffer . '</b></i>'
+						state = State__empty;
+					}
+					else { // (state == '')
+						tmp.Clear();
+						state = State__both;
+					}
+				}
+			}
+		}
+		// Now close all remaining tags. Notice that the order is important.
+		if (state == State__b || state == State__ib) {
+			bfr.Add_str_a7("</b>");
+		}
+		if (state == State__i || state == State__bi || state == State__ib) {
+			bfr.Add_str_a7("</i>");
+		}
+		if (state == State__bi) {
+			bfr.Add_str_a7("</b>");
+		}
+		// There might be lonely ''''', so make sure we have a buffer
+		if (state == State__both && tmp.Len_gt_0()) {
+			bfr.Add_str_a7("<b><i>").Add_bfr_and_clear(tmp).Add_str_a7("</i></b>");
+		}
+		bfr.Add_byte_nl();
+		return Bry_split_.Rv__ok;
+	}
+	private static final int	// open-tag state of the line being rendered; the comments give the PHP $state string each value stands for
+		  State__empty = 0	// ''
+		, State__b = 1	// 'b'
+		, State__i = 2	// 'i'
+		, State__bi = 3	// 'bi': <b> opened first, then <i>
+		, State__ib = 4	// 'ib': <i> opened first, then <b>
+		, State__both = 5	// 'both': ''''' seen, nesting order still undecided
+		;
+}
diff --git a/400_xowa/src/gplx/xowa/parsers/mws/quotes/Xomw_quote_wkr__tst.java b/400_xowa/src/gplx/xowa/parsers/mws/quotes/Xomw_quote_wkr__tst.java
new file mode 100644
index 000000000..6fe66f3c3
--- /dev/null
+++ b/400_xowa/src/gplx/xowa/parsers/mws/quotes/Xomw_quote_wkr__tst.java
@@ -0,0 +1,34 @@
+/*
+XOWA: the XOWA Offline Wiki Application
+Copyright (C) 2012 gnosygnu@gmail.com
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see .
+*/
+package gplx.xowa.parsers.mws.quotes; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
+import org.junit.*;
+public class Xomw_quote_wkr__tst {
+	private final Xomw_quote_wkr__fxt fxt = new Xomw_quote_wkr__fxt();
+	@Test  public void Basic() {	// one italic pair: '' opens <i> and the next '' closes it
+		fxt.Test__parse("a''b''c", "a<i>b</i>c");
+	}
+}
+class Xomw_quote_wkr__fxt {
+//	private final Xomw_parser_ctx ctx = new Xomw_parser_ctx();
+	private final Xomw_quote_wkr wkr = new Xomw_quote_wkr();
+	public void Test__parse(String src_str, String expd) {	// runs Do_all_quotes on src_str and compares the output with expd
+		byte[] src_bry = Bry_.new_u8(src_str);
+		byte[] actl = wkr.Do_all_quotes(src_bry);
+		Tfds.Eq_str_lines(expd, String_.new_u8(actl), src_str);
+	}
+}