1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2024-09-28 14:30:51 +00:00

Scribunto: Fix script error '=Module:zh-glyph:120 attempt to concatenate nil and string' on a few hundred en.d pages for Chinese chars

This commit is contained in:
gnosygnu 2017-04-23 09:47:58 -04:00
parent eaa83db644
commit 1d6b3779a0
4 changed files with 12 additions and 8 deletions

View File

@ -27,8 +27,8 @@ public class Bry__tst {
fxt.Test_new_u8("a" , Bry_.New_by_ints(97)); // one
fxt.Test_new_u8("abc" , Bry_.New_by_ints(97, 98, 99)); // many
fxt.Test_new_u8("¢" , Bry_.New_by_ints(194, 162)); // bry_len=2; cent
fxt.Test_new_u8("" , Bry_.New_by_ints(226, 130, 172)); // bry_len=3; euro
fxt.Test_new_u8("𤭢" , Bry_.New_by_ints(240, 164, 173, 162)); // bry_len=3; example from en.w:UTF-8
fxt.Test_new_u8("" , Bry_.New_by_ints(226, 130, 172)); // bry_len=3; euro
fxt.Test_new_u8("𤭢" , Bry_.New_by_ints(240, 164, 173, 162)); // bry_len=4; example from en.w:UTF-8
}
@Test public void Add__bry_plus_byte() {
fxt.Test_add("a" , Byte_ascii.Pipe , "a|"); // basic

View File

@ -21,6 +21,7 @@ public class Utf16__tst {
// fxt.Test_encode_decode(162, 194, 162); // cent
// fxt.Test_encode_decode(8364, 226, 130, 172); // euro
fxt.Test_encode_decode(150370, 240, 164, 173, 162); // example from [[UTF-8]]; should be encoded as two bytes
fxt.Test_encode_decode(143489, 240, 163, 130, 129); // EX: 駣𣂁脁 DATE:2017-04-22
}
@Test public void Encode_as_bry_by_hex() {
fxt.Test_Encode_hex_to_bry("00", 0);

View File

@ -56,7 +56,9 @@ public class Scrib_lib_ustring implements Scrib_lib {
synchronized (surrogate_utl) {
byte[] text_bry = Bry_.new_u8(text_str); int text_bry_len = text_bry.length;
bgn_char_idx = Bgn_adjust(text_str, bgn_char_idx);
int bgn_adj = surrogate_utl.Count_surrogates__char_idx(text_bry, text_bry_len, 0, bgn_char_idx); // NOTE: convert from lua / php charidx to java regex codepoint; PAGE:zh.w:南北鐵路 (越南) DATE:2014-08-27
// TOMBSTONE: do not adjust for 2-len chars (surrogates); lua always iterates correctly by chars; DATE:2017-04-23
// int bgn_adj = surrogate_utl.Count_surrogates__char_idx(text_bry, text_bry_len, 0, bgn_char_idx); // NOTE: convert from lua / php charidx to java regex codepoint; PAGE:zh.w:南北鐵路 (越南) DATE:2014-08-27
int bgn_adj = 0;
int bgn_codepoint_idx = bgn_char_idx + bgn_adj;
int bgn_byte_pos = surrogate_utl.Byte_pos();
if (String_.Len_eq_0(regx)) // regx of "" should return (bgn, bgn - 1) regardless of whether plain is true or false

View File

@ -16,7 +16,7 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
package gplx.xowa.xtns.scribunto.libs; import gplx.*; import gplx.xowa.*; import gplx.xowa.xtns.*; import gplx.xowa.xtns.scribunto.*;
import org.junit.*; import gplx.xowa.xtns.scribunto.engines.mocks.*;
public class Scrib_lib_ustring__find__tst {
private final Mock_scrib_fxt fxt = new Mock_scrib_fxt(); private Scrib_lib lib;
private final Mock_scrib_fxt fxt = new Mock_scrib_fxt(); private Scrib_lib lib;
@Before public void init() {
fxt.Clear();
lib = fxt.Core().Lib_ustring().Init();
@ -38,10 +38,11 @@ public class Scrib_lib_ustring__find__tst {
@Test public void Return_int() {
fxt.Test__proc__kvps__vals(lib, Scrib_lib_ustring.Invk_find, Scrib_kv_utl_.base1_many_("a", "()", 2, Bool_.N), 2, 1, 2);
}
@Test public void Surrogate() { // PURPOSE: handle surrogates in Find PAGE:zh.w:南北鐵路_(越南); DATE:2014-08-28
Exec_find("aé𡼾\nbî𡼾\n" , "\n" , 1, Bool_.N, "4;4"); // 4 b/c \n starts at pos 4 (super 1)
Exec_find("aé𡼾\nbî𡼾\n" , "\n" , 5, Bool_.N, "8;8"); // 8 b/c \n starts at pos 8 (super 1)
}
// DELETE: no longer needed after tombstoning surrogate logic; DATE:2017-04-23
// @Test public void Surrogate() { // PURPOSE: handle surrogates in Find PAGE:zh.w:南北鐵路_(越南); DATE:2014-08-28
// Exec_find("aé𡼾\nbî𡼾\n" , "\n" , 1, Bool_.N, "4;4"); // 4 b/c \n starts at pos 4 (super 1)
// Exec_find("aé𡼾\nbî𡼾\n" , "\n" , 5, Bool_.N, "8;8"); // 8 b/c \n starts at pos 8 (super 1)
// }
private void Exec_find(String text, String regx, int bgn, boolean plain, String expd) {
fxt.Test__proc__kvps__flat(lib, Scrib_lib_ustring.Invk_find, Scrib_kv_utl_.base1_many_(text, regx, bgn, plain), expd);
}