From 6a5c114998e4ecf13aee98e59b20d039108ceada Mon Sep 17 00:00:00 2001 From: gnosygnu Date: Fri, 13 Jan 2017 16:50:43 -0500 Subject: [PATCH] Mw_parse.Apos: Add more tests --- .../gplx/langs/phps/utls/Php_preg___tst.java | 5 +- .../src/gplx/langs/phps/utls/Php_str_.java | 20 +++++-- .../gplx/langs/phps/utls/Php_str___tst.java | 8 +++ .../parsers/mws/quotes/Xomw_quote_wkr.java | 56 +++++++++---------- .../mws/quotes/Xomw_quote_wkr__tst.java | 17 ++++-- 5 files changed, 66 insertions(+), 40 deletions(-) diff --git a/400_xowa/src/gplx/langs/phps/utls/Php_preg___tst.java b/400_xowa/src/gplx/langs/phps/utls/Php_preg___tst.java index 4e6ffc103..ea65075cd 100644 --- a/400_xowa/src/gplx/langs/phps/utls/Php_preg___tst.java +++ b/400_xowa/src/gplx/langs/phps/utls/Php_preg___tst.java @@ -19,9 +19,8 @@ package gplx.langs.phps.utls; import gplx.*; import gplx.langs.*; import gplx.la import org.junit.*; import gplx.core.tests.*; public class Php_preg___tst { private final Php_preg___fxt fxt = new Php_preg___fxt(); - @Test public void Split() { - fxt.Test__split("a''b''c", "''", Bool_.N, "a", "''", "b", "''", "c"); - } + @Test public void Basic() {fxt.Test__split("a''b''c" , "''", Bool_.Y, "a", "''", "b", "''", "c");} + @Test public void Extend() {fxt.Test__split("a'''b'''c" , "''", Bool_.Y, "a", "'''", "b", "'''", "c");} } class Php_preg___fxt { public void Test__split(String src, String dlm, boolean extend, String... expd) {Test__split(src, 0, String_.Len(src), dlm, extend, expd);} diff --git a/400_xowa/src/gplx/langs/phps/utls/Php_str_.java b/400_xowa/src/gplx/langs/phps/utls/Php_str_.java index 89bf40717..52898cc51 100644 --- a/400_xowa/src/gplx/langs/phps/utls/Php_str_.java +++ b/400_xowa/src/gplx/langs/phps/utls/Php_str_.java @@ -17,11 +17,23 @@ along with this program. If not, see . */ package gplx.langs.phps.utls; import gplx.*; import gplx.langs.*; import gplx.langs.phps.*; public class Php_str_ { - public static byte[] Substr(byte[] src, int bgn) { - return src; - } + public static byte[] Substr(byte[] src, int bgn) {return Substr(src, bgn, src.length);} public static byte[] Substr(byte[] src, int bgn, int len) { - return Bry_.Mid(src, bgn, bgn + len); + int src_len = src.length; + if (bgn < 0) bgn = src_len + bgn; // handle negative + if (bgn < 0) bgn = 0; // handle out of bounds; EX: ("a", -1, -1) + int end = len < 0 ? src_len + len : bgn + len; + if (end > src.length) end = src.length;; // handle out of bounds; + return Bry_.Mid(src, bgn, end); + } + public static byte Substr_byte(byte[] src, int bgn) {return Substr_byte(src, bgn, src.length);} + public static byte Substr_byte(byte[] src, int bgn, int len) { + int src_len = src.length; + if (bgn < 0) bgn = src_len + bgn; // handle negative + if (bgn < 0) bgn = 0; // handle out of bounds; EX: ("a", -1, -1) + int end = len < 0 ? src_len + len : bgn + len; + if (end > src.length) end = src.length;; // handle out of bounds; + return src[bgn]; } public static int Strspn_fwd__byte(byte[] src, byte find, int bgn, int max, int src_len) { if (max == -1) max = src_len; diff --git a/400_xowa/src/gplx/langs/phps/utls/Php_str___tst.java b/400_xowa/src/gplx/langs/phps/utls/Php_str___tst.java index b1a23c0b0..87048591b 100644 --- a/400_xowa/src/gplx/langs/phps/utls/Php_str___tst.java +++ b/400_xowa/src/gplx/langs/phps/utls/Php_str___tst.java @@ -39,6 +39,10 @@ public class Php_str___tst { fxt.Test__strspn_bwd__space_or_tab(" a", 4, -1, 4); // bgn fxt.Test__strspn_bwd__space_or_tab(" a", 4, 2, 2); // max } + @Test public void Substr__bgn_is_neg() { + fxt.Test__substr("abcde" , -1, "e"); + fxt.Test__substr("abcde" , -3, -1, "cd"); + } } class Php_str___fxt { public void Test__strspn_fwd__byte(String src_str, byte find, int bgn, int max, int expd) { @@ -55,4 +59,8 @@ class Php_str___fxt { public void Test__strspn_bwd__space_or_tab(String src_str, int bgn, int max, int expd) { Gftest.Eq__int(expd, Php_str_.Strspn_bwd__space_or_tab(Bry_.new_u8(src_str), bgn, max)); } + public void Test__substr(String src_str, int bgn, String expd) {Test__substr(src_str, bgn, String_.Len(src_str), expd);} + public void Test__substr(String src_str, int bgn, int len, String expd) { + Gftest.Eq__str(expd, Php_str_.Substr(Bry_.new_u8(src_str), bgn, len)); + } } diff --git a/400_xowa/src/gplx/xowa/parsers/mws/quotes/Xomw_quote_wkr.java b/400_xowa/src/gplx/xowa/parsers/mws/quotes/Xomw_quote_wkr.java index 56b0cd9f3..a14427fd3 100644 --- a/400_xowa/src/gplx/xowa/parsers/mws/quotes/Xomw_quote_wkr.java +++ b/400_xowa/src/gplx/xowa/parsers/mws/quotes/Xomw_quote_wkr.java @@ -18,7 +18,6 @@ along with this program. If not, see . package gplx.xowa.parsers.mws.quotes; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*; import gplx.langs.phps.utls.*; import gplx.xowa.parsers.htmls.*; -import gplx.xowa.parsers.mws.utils.*; import gplx.xowa.parsers.uniqs.*; import gplx.core.primitives.*; public class Xomw_quote_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.UNSAFE: caching for repeated calls private final Bry_bfr bfr = Bry_bfr_.New(); @@ -26,15 +25,13 @@ public class Xomw_quote_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U private final Int_list apos_pos_ary = new Int_list(32); public byte[] Do_all_quotes(byte[] src) { Bry_split_.Split(src, 0, src.length, Byte_ascii.Nl, Bool_.N, this); // PORTED.SPLIT: $lines = StringUtils::explode( "\n", $text ); - // PORTED: `$outtext .= $this->doQuotes( $line ) . "\n";` NOTE: "\n" is added below - bfr.Del_by_1(); // $outtext = substr( $outtext, 0, -1 ); + bfr.Del_by_1(); // REF.MW: $outtext = substr( $outtext, 0, -1 ); apos_pos_ary.Clear(); return bfr.To_bry_and_clear(); } private static final byte[] Wtxt__apos = Bry_.new_a7("''"); - public int Split(byte[] src, int itm_bgn, int itm_end) { - // PORTED: arr = preg_split("/(''+)/", text, -1, PREG_SPLIT_DELIM_CAPTURE); - byte[][] arr = Php_preg_.Split(apos_pos_ary, src, itm_bgn, itm_end, Wtxt__apos, Bool_.Y); + public int Split(byte[] src, int itm_bgn, int itm_end) { + byte[][] arr = Php_preg_.Split(apos_pos_ary, src, itm_bgn, itm_end, Wtxt__apos, Bool_.Y); // PORTED.REGX: arr = preg_split("/(''+)/", text, -1, PREG_SPLIT_DELIM_CAPTURE); if (arr == null) { bfr.Add_mid(src, itm_bgn, itm_end).Add_byte_nl(); return Bry_split_.Rv__ok; @@ -81,50 +78,51 @@ public class Xomw_quote_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U // that one of the bold ones was meant to be an apostrophe followed // by italics. Which one we cannot know for certain, but it is more // likely to be one that has a single-letter word before it. + // NOTE: this code primarily handles italicized possessives; EX: The ''[[Main Page]]'''s talk page. if ((num_bold % 2 == 1) && (num_italics % 2 == 1)) { - int first_word_1 = -1; - int first_word_n = -1; - int first_space = -1; + int prv_ends_w_word_1char = -1; + int prv_ends_w_word_nchar = -1; + int prv_ends_w_space = -1; for (int i = 1; i < arr_len; i += 2) { if (arr[i].length == 3) { byte[] prv = arr[i - 1]; - byte[] x1 = Php_str_.Substr(prv, -1); - byte[] x2 = Php_str_.Substr(prv, -2, 1); - if (Bry_.Eq(x1, Byte_ascii.Space_bry)) { - if (first_space == -1) { - first_space = i; + byte prv__last_char = Php_str_.Substr_byte(prv, -1); + byte prv__last_minus_1_char = Php_str_.Substr_byte(prv, -2, 1); + if (prv__last_char == Byte_ascii.Space) { // NOTE: prv ends in space; EX: "''prv '''" + if (prv_ends_w_space == -1) { + prv_ends_w_space = i; } } - else if (Bry_.Eq(x2, Byte_ascii.Space_bry)) { - first_word_1 = i; + else if (prv__last_minus_1_char == Byte_ascii.Space) { // NOTE: prv ends in 1-char word; EX: "''prv a'''" + prv_ends_w_word_1char = i; // if $firstsingleletterword is set, we don't // look at the other options, so we can bail early. break; } else { - if (first_word_n == -1) { - first_word_n = i; + if (prv_ends_w_word_nchar == -1) { + prv_ends_w_word_nchar = i; } } } } // If there is a single-letter word, use it! - if (first_word_1 > -1) { - arr[first_word_1] = Wtxt__apos; - arr[first_word_1 - 1] = Bry_.Add(arr[first_word_1 - 1], Byte_ascii.Apos); + if (prv_ends_w_word_1char > -1) { + arr[prv_ends_w_word_1char] = Wtxt__apos; + arr[prv_ends_w_word_1char - 1] = Bry_.Add(arr[prv_ends_w_word_1char - 1], Byte_ascii.Apos); } - else if (first_word_n > -1) { + else if (prv_ends_w_word_nchar > -1) { // If not, but there's a multi-letter word, use that one. - arr[first_word_n] = Wtxt__apos; - arr[first_word_n - 1] = Bry_.Add(arr[first_word_n - 1], Byte_ascii.Apos); + arr[prv_ends_w_word_nchar] = Wtxt__apos; + arr[prv_ends_w_word_nchar - 1] = Bry_.Add(arr[prv_ends_w_word_nchar - 1], Byte_ascii.Apos); } - else if (first_space > -1) { + else if (prv_ends_w_space > -1) { // ... otherwise use the first one that has neither. // (notice that it is possible for all three to be -1 if, for example, // there is only one pentuple-apostrophe in the line) - arr[first_space] = Wtxt__apos; - arr[first_space - 1] = Bry_.Add(arr[first_space - 1], Byte_ascii.Apos); + arr[prv_ends_w_space] = Wtxt__apos; + arr[prv_ends_w_space - 1] = Bry_.Add(arr[prv_ends_w_space - 1], Byte_ascii.Apos); } } @@ -140,7 +138,7 @@ public class Xomw_quote_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U } } else { - int apos_len = 2; // strlen(r); + int apos_len = arr[j].length; if (apos_len == 2) { if (state == State__i) { bfr.Add_str_a7(""); @@ -203,7 +201,7 @@ public class Xomw_quote_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U state = State__empty; } else if (state == State__both) { - bfr.Add_str_a7("' . buffer . '"); + bfr.Add_str_a7("").Add_bfr_and_preserve(tmp).Add_str_a7(""); state = State__empty; } else { // (state == '') diff --git a/400_xowa/src/gplx/xowa/parsers/mws/quotes/Xomw_quote_wkr__tst.java b/400_xowa/src/gplx/xowa/parsers/mws/quotes/Xomw_quote_wkr__tst.java index 6fe66f3c3..e96847945 100644 --- a/400_xowa/src/gplx/xowa/parsers/mws/quotes/Xomw_quote_wkr__tst.java +++ b/400_xowa/src/gplx/xowa/parsers/mws/quotes/Xomw_quote_wkr__tst.java @@ -19,12 +19,21 @@ package gplx.xowa.parsers.mws.quotes; import gplx.*; import gplx.xowa.*; import import org.junit.*; public class Xomw_quote_wkr__tst { private final Xomw_quote_wkr__fxt fxt = new Xomw_quote_wkr__fxt(); - @Test public void Basic() { - fxt.Test__parse("a''b''c", "abc"); - } + @Test public void Apos__0() {fxt.Test__parse("abc" , "abc");} + @Test public void Apos__1() {fxt.Test__parse("a'b'c" , "a'b'c");} + @Test public void Apos__2() {fxt.Test__parse("a''b''c" , "abc");} + @Test public void Apos__3() {fxt.Test__parse("a'''b'''c" , "abc");} + @Test public void Apos__4() {fxt.Test__parse("a''''b''''c" , "a'b'c");} // COVERS: "If there are ever four apostrophes" + @Test public void Apos__5() {fxt.Test__parse("a'''''b'''''c" , "abc");} + @Test public void Apos__7() {fxt.Test__parse("a'''''''b'''''''c" , "a''b''c");} // COVERS: "If there are more than 5 apostrophes in a row" + @Test public void Mix__single() {fxt.Test__parse("''a ''' ''b b''' ''cc'''" , "a b b' cc");} // COVERS: "If there is a single-letter word, use it!" + @Test public void Mix__multi() {fxt.Test__parse("''a ''' ''b ''' ''cc'''" , "a b cc'");} // COVERS: "If not, but there's a multi-letter word, use that one." + @Test public void Mix__space() {fxt.Test__parse("''a ''' ''b ''' ''c '''" , "a ' b c ");} // COVERS: "... otherwise use the first one that has neither." + @Test public void Dangling__b() {fxt.Test__parse("a'''b" , "ab");} // COVERS: "if (state == State__b || state == State__ib)" + @Test public void Dangling__i() {fxt.Test__parse("a''b" , "ab");} // COVERS: "if (state == State__i || state == State__bi || state == State__ib)" + @Test public void Dangling__lone(){fxt.Test__parse("a'''''b" , "ab");} // COVERS: "There might be lonely ''''', so make sure we have a buffer" } class Xomw_quote_wkr__fxt { -// private final Xomw_parser_ctx ctx = new Xomw_parser_ctx(); private final Xomw_quote_wkr wkr = new Xomw_quote_wkr(); public void Test__parse(String src_str, String expd) { byte[] src_bry = Bry_.new_u8(src_str);