Mw_parse.Apos: Add more tests

2025-06-13 12:54:14 +00:00 · 2017-01-13 16:50:43 -05:00 · 2017-01-13 16:50:43 -05:00 · 6a5c114998
commit 6a5c114998
parent 32a857f062
5 changed files with 66 additions and 40 deletions
--- a/400_xowa/src/gplx/langs/phps/utls/Php_preg___tst.java
+++ b/400_xowa/src/gplx/langs/phps/utls/Php_preg___tst.java
@ -19,9 +19,8 @@ package gplx.langs.phps.utls; import gplx.*; import gplx.langs.*; import gplx.la
 import org.junit.*; import gplx.core.tests.*;
 public class Php_preg___tst {
 	private final    Php_preg___fxt fxt = new Php_preg___fxt();
-	@Test  public void Split() {
-		fxt.Test__split("a''b''c", "''", Bool_.N, "a", "''", "b", "''", "c");
-	}
+	@Test  public void Basic()         {fxt.Test__split("a''b''c"          , "''", Bool_.Y, "a", "''", "b", "''", "c");}
+	@Test  public void Extend()        {fxt.Test__split("a'''b'''c"        , "''", Bool_.Y, "a", "'''", "b", "'''", "c");}
 }
 class Php_preg___fxt {
 	public void Test__split(String src, String dlm, boolean extend, String... expd) {Test__split(src, 0, String_.Len(src), dlm, extend, expd);}
--- a/400_xowa/src/gplx/langs/phps/utls/Php_str_.java
+++ b/400_xowa/src/gplx/langs/phps/utls/Php_str_.java
@ -17,11 +17,23 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 package gplx.langs.phps.utls; import gplx.*; import gplx.langs.*; import gplx.langs.phps.*;
 public class Php_str_ {
-	public static byte[] Substr(byte[] src, int bgn) {
-		return src;
-	}
+	public static byte[] Substr(byte[] src, int bgn) {return Substr(src, bgn, src.length);}	// PHP substr($src, $bgn): everything from bgn to end of src
 	public static byte[] Substr(byte[] src, int bgn, int len) {
-		return Bry_.Mid(src, bgn, bgn + len);
+		int src_len = src.length;
+		if (bgn < 0) bgn = src_len + bgn; // handle negative bgn; counts back from end of src
+		if (bgn < 0) bgn = 0; else if (bgn > src_len) bgn = src_len;	// handle out of bounds; EX: ("a", -5, 1), ("a", 5, 1)
+		int end = len < 0 ? src_len + len : bgn + len; // negative len drops bytes from end of src
+		if (end > src_len) end = src_len; else if (end < bgn) end = bgn;	// handle out of bounds; end < bgn is an empty range; EX: ("abc", -1, -2)
+		return Bry_.Mid(src, bgn, end);	// NOTE(review): assumes Bry_.Mid returns an empty array when bgn == end -- confirm
+	}
+	public static byte Substr_byte(byte[] src, int bgn) {return Substr_byte(src, bgn, src.length);}
+	public static byte Substr_byte(byte[] src, int bgn, int len) {	// same range rules as Substr, but returns only the 1st byte of the range; returns 0 if the range is empty
+		int src_len = src.length;
+		if (bgn < 0) bgn = src_len + bgn; // handle negative bgn; counts back from end of src
+		if (bgn < 0) bgn = 0;	// handle out of bounds; EX: ("a", -2, 1)
+		int end = len < 0 ? src_len + len : bgn + len; // negative len drops bytes from end of src
+		if (end > src_len) end = src_len; // handle out of bounds
+		return bgn < end ? src[bgn] : (byte)0;	// empty range has no 1st byte; EX: ("", -1); previously threw ArrayIndexOutOfBoundsException
 	}
 	public static int Strspn_fwd__byte(byte[] src, byte find, int bgn, int max, int src_len) {
 		if (max == -1) max = src_len;
--- a/400_xowa/src/gplx/langs/phps/utls/Php_str___tst.java
+++ b/400_xowa/src/gplx/langs/phps/utls/Php_str___tst.java
@ -39,6 +39,10 @@ public class Php_str___tst {
 		fxt.Test__strspn_bwd__space_or_tab("     a", 4, -1, 4);	// bgn
 		fxt.Test__strspn_bwd__space_or_tab("     a", 4,  2, 2);	// max
 	}
+	@Test   public void Substr__bgn_is_neg() {
+		fxt.Test__substr("abcde"                   , -1, "e");
+		fxt.Test__substr("abcde"                   , -3, -1, "cd");
+	}
 }
 class Php_str___fxt {
 	public void Test__strspn_fwd__byte(String src_str, byte find, int bgn, int max, int expd) {
@ -55,4 +59,8 @@ class Php_str___fxt {
 	public void Test__strspn_bwd__space_or_tab(String src_str, int bgn, int max, int expd) {
 		Gftest.Eq__int(expd, Php_str_.Strspn_bwd__space_or_tab(Bry_.new_u8(src_str), bgn, max));
 	}
+	public void Test__substr(String src_str, int bgn, String expd) {Test__substr(src_str, bgn, String_.Len(src_str), expd);}
+	public void Test__substr(String src_str, int bgn, int len, String expd) {
+		Gftest.Eq__str(expd, Php_str_.Substr(Bry_.new_u8(src_str), bgn, len));
+	}
 }
--- a/400_xowa/src/gplx/xowa/parsers/mws/quotes/Xomw_quote_wkr.java
+++ b/400_xowa/src/gplx/xowa/parsers/mws/quotes/Xomw_quote_wkr.java
@ -18,7 +18,6 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 package gplx.xowa.parsers.mws.quotes; import gplx.*; import gplx.xowa.*; import gplx.xowa.parsers.*; import gplx.xowa.parsers.mws.*;
 import gplx.langs.phps.utls.*;
 import gplx.xowa.parsers.htmls.*;
-import gplx.xowa.parsers.mws.utils.*; import gplx.xowa.parsers.uniqs.*;
 import gplx.core.primitives.*;
 public class Xomw_quote_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.UNSAFE: caching for repeated calls
 	private final    Bry_bfr bfr = Bry_bfr_.New();
@ -26,15 +25,13 @@ public class Xomw_quote_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U
 	private final    Int_list apos_pos_ary = new Int_list(32);
 	public byte[] Do_all_quotes(byte[] src) {
 		Bry_split_.Split(src, 0, src.length, Byte_ascii.Nl, Bool_.N, this); // PORTED.SPLIT: $lines = StringUtils::explode( "\n", $text );
-		// PORTED: `$outtext .= $this->doQuotes( $line ) . "\n";` NOTE: "\n" is added below
-		bfr.Del_by_1(); // $outtext = substr( $outtext, 0, -1 );
+		bfr.Del_by_1(); // REF.MW: $outtext = substr( $outtext, 0, -1 );
 		apos_pos_ary.Clear();
 		return bfr.To_bry_and_clear();
 	}
 	private static final    byte[] Wtxt__apos = Bry_.new_a7("''");
-	public int Split(byte[] src, int itm_bgn, int itm_end) {
-		// PORTED: arr = preg_split("/(''+)/", text, -1, PREG_SPLIT_DELIM_CAPTURE);
-		byte[][] arr = Php_preg_.Split(apos_pos_ary, src, itm_bgn, itm_end, Wtxt__apos, Bool_.Y);
+	public int Split(byte[] src, int itm_bgn, int itm_end) {			
+		byte[][] arr = Php_preg_.Split(apos_pos_ary, src, itm_bgn, itm_end, Wtxt__apos, Bool_.Y);	// PORTED.REGX: arr = preg_split("/(''+)/", text, -1, PREG_SPLIT_DELIM_CAPTURE);
 		if (arr == null) {
 			bfr.Add_mid(src, itm_bgn, itm_end).Add_byte_nl();
 			return Bry_split_.Rv__ok;
@ -81,50 +78,51 @@ public class Xomw_quote_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U
 		// that one of the bold ones was meant to be an apostrophe followed
 		// by italics. Which one we cannot know for certain, but it is more
 		// likely to be one that has a single-letter word before it.
+		// NOTE: this code primarily handles italicized possessives; EX: The ''[[Main Page]]'''s talk page.
 		if ((num_bold % 2 == 1) && (num_italics % 2 == 1)) {
-			int first_word_1 = -1;
-			int first_word_n = -1;
-			int first_space = -1;
+			int prv_ends_w_word_1char = -1;
+			int prv_ends_w_word_nchar = -1;
+			int prv_ends_w_space = -1;
 			for (int i = 1; i < arr_len; i += 2) {
 				if (arr[i].length == 3) {
 					byte[] prv = arr[i - 1];
-					byte[] x1 = Php_str_.Substr(prv, -1);
-					byte[] x2 = Php_str_.Substr(prv, -2, 1);
-					if (Bry_.Eq(x1, Byte_ascii.Space_bry)) {
-						if (first_space == -1) {
-							first_space = i;
+					byte prv__last_char = Php_str_.Substr_byte(prv, -1);
+					byte prv__last_minus_1_char = Php_str_.Substr_byte(prv, -2, 1);
+					if (prv__last_char == Byte_ascii.Space) {              // NOTE: prv ends in space; EX: "''prv '''"
+						if (prv_ends_w_space == -1) {
+							prv_ends_w_space = i;
 						}
 					}
-					else if (Bry_.Eq(x2, Byte_ascii.Space_bry)) {
-						first_word_1 = i;
+					else if (prv__last_minus_1_char == Byte_ascii.Space) { // NOTE: prv ends in 1-char word; EX: "''prv a'''"
+						prv_ends_w_word_1char = i;
 						// if $firstsingleletterword is set, we don't
 						// look at the other options, so we can bail early.
 						break;
 					}
 					else {
-						if (first_word_n == -1) {
-							first_word_n = i;
+						if (prv_ends_w_word_nchar == -1) {
+							prv_ends_w_word_nchar = i;
 						}
 					}
 				}
 			}

 			// If there is a single-letter word, use it!
-			if (first_word_1 > -1) {
-				arr[first_word_1] = Wtxt__apos;
-				arr[first_word_1 - 1] = Bry_.Add(arr[first_word_1 - 1], Byte_ascii.Apos);
+			if (prv_ends_w_word_1char > -1) {
+				arr[prv_ends_w_word_1char] = Wtxt__apos;
+				arr[prv_ends_w_word_1char - 1] = Bry_.Add(arr[prv_ends_w_word_1char - 1], Byte_ascii.Apos);
 			}
-			else if (first_word_n > -1) {
+			else if (prv_ends_w_word_nchar > -1) {
 				// If not, but there's a multi-letter word, use that one.
-				arr[first_word_n] = Wtxt__apos;
-				arr[first_word_n - 1] = Bry_.Add(arr[first_word_n - 1], Byte_ascii.Apos);
+				arr[prv_ends_w_word_nchar] = Wtxt__apos;
+				arr[prv_ends_w_word_nchar - 1] = Bry_.Add(arr[prv_ends_w_word_nchar - 1], Byte_ascii.Apos);
 			}
-			else if (first_space > -1) {
+			else if (prv_ends_w_space > -1) {
 				// ... otherwise use the first one that has neither.
 				// (notice that it is possible for all three to be -1 if, for example,
 				// there is only one pentuple-apostrophe in the line)
-				arr[first_space] = Wtxt__apos;
-				arr[first_space - 1] = Bry_.Add(arr[first_space - 1], Byte_ascii.Apos);
+				arr[prv_ends_w_space] = Wtxt__apos;
+				arr[prv_ends_w_space - 1] = Bry_.Add(arr[prv_ends_w_space - 1], Byte_ascii.Apos);
 			}
 		}

@ -140,7 +138,7 @@ public class Xomw_quote_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U
 				}
 			}
 			else {
-				int apos_len = 2; // strlen(r);
+				int apos_len = arr[j].length;
 				if (apos_len == 2) {
 					if (state == State__i) {
 						bfr.Add_str_a7("</i>");
@ -203,7 +201,7 @@ public class Xomw_quote_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.U
 						state = State__empty;
 					}
 					else if (state == State__both) {
-						bfr.Add_str_a7("<i><b>' . buffer . '</b></i>");
+						bfr.Add_str_a7("<i><b>").Add_bfr_and_preserve(tmp).Add_str_a7("</b></i>");
 						state = State__empty;
 					}
 					else { // (state == '')
--- a/400_xowa/src/gplx/xowa/parsers/mws/quotes/Xomw_quote_wkr__tst.java
+++ b/400_xowa/src/gplx/xowa/parsers/mws/quotes/Xomw_quote_wkr__tst.java
@ -19,12 +19,21 @@ package gplx.xowa.parsers.mws.quotes; import gplx.*; import gplx.xowa.*; import
 import org.junit.*;
 public class Xomw_quote_wkr__tst {
 	private final    Xomw_quote_wkr__fxt fxt = new Xomw_quote_wkr__fxt();
-	@Test  public void Basic() {
-		fxt.Test__parse("a''b''c", "a<i>b</i>c");
-	}
+	@Test  public void Apos__0()       {fxt.Test__parse("abc"                         , "abc");}
+	@Test  public void Apos__1()       {fxt.Test__parse("a'b'c"                       , "a'b'c");}
+	@Test  public void Apos__2()       {fxt.Test__parse("a''b''c"                     , "a<i>b</i>c");}
+	@Test  public void Apos__3()       {fxt.Test__parse("a'''b'''c"                   , "a<b>b</b>c");}
+	@Test  public void Apos__4()       {fxt.Test__parse("a''''b''''c"                 , "a'<b>b'</b>c");}            // COVERS: "If there are ever four apostrophes"
+	@Test  public void Apos__5()       {fxt.Test__parse("a'''''b'''''c"               , "a<i><b>b</b></i>c");}
+	@Test  public void Apos__7()       {fxt.Test__parse("a'''''''b'''''''c"           , "a''<i><b>b''</b></i>c");}   // COVERS: "If there are more than 5 apostrophes in a row"
+	@Test  public void Mix__single()   {fxt.Test__parse("''a ''' ''b b''' ''cc'''"    , "<i>a <b> </b></i><b>b b'<i> </i>cc</b>");}   // COVERS: "If there is a single-letter word, use it!"
+	@Test  public void Mix__multi()    {fxt.Test__parse("''a ''' ''b ''' ''cc'''"     , "<i>a <b> </b></i><b>b </b> <i>cc'</i>");}    // COVERS: "If not, but there's a multi-letter word, use that one."
+	@Test  public void Mix__space()    {fxt.Test__parse("''a ''' ''b ''' ''c '''"     , "<i>a '</i> <i>b <b> </b></i><b>c </b>");}    // COVERS: "... otherwise use the first one that has neither."
+	@Test  public void Dangling__b()   {fxt.Test__parse("a'''b"                       , "a<b>b</b>");}               // COVERS: "if (state == State__b || state == State__ib)"
+	@Test  public void Dangling__i()   {fxt.Test__parse("a''b"                        , "a<i>b</i>");}               // COVERS: "if (state == State__i || state == State__bi || state == State__ib)"
+	@Test  public void Dangling__lone(){fxt.Test__parse("a'''''b"                     , "a<b><i>b</i></b>");}        // COVERS: "There might be lonely ''''', so make sure we have a buffer"
 }
 class Xomw_quote_wkr__fxt {
-//		private final    Xomw_parser_ctx ctx = new Xomw_parser_ctx();
 	private final    Xomw_quote_wkr wkr = new Xomw_quote_wkr();
 	public void Test__parse(String src_str, String expd) {
 		byte[] src_bry = Bry_.new_u8(src_str);