1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2026-03-02 03:49:30 +00:00

Scribunto: Do not fail in ustring.find if negative bgn is large [#366]

This commit is contained in:
gnosygnu
2019-02-24 16:14:34 -05:00
parent 91cbb34fa5
commit 3fd759b020
7 changed files with 392 additions and 180 deletions

View File

@@ -1,34 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.core.intls; import gplx.*; import gplx.core.*;
/**
 * Walks UTF-8 bytes and reports how many 4-byte sequences (the ones counted as
 * surrogate pairs) were passed before a requested index was reached.
 *
 * The byte offset at which the most recent walk stopped is kept on the instance
 * and exposed through {@link #Byte_pos()}.
 */
public class String_surrogate_utl {
	/** Byte offset where the last Count_surrogates__* call stopped. */
	public int Byte_pos() {return byte_pos;} int byte_pos;

	/** Walks until {@code char_idx} UTF-8 sequences have been consumed (or src_len is reached). */
	public int Count_surrogates__char_idx(byte[] src, int src_len, int byte_bgn, int char_idx) {
		return Count_surrogates(src, src_len, byte_bgn, Bool_.Y, char_idx);
	}

	/** Walks until the 1-or-2-per-sequence tally equals {@code codepoint_idx} (or src_len is reached). */
	public int Count_surrogates__codepoint_idx1(byte[] src, int src_len, int byte_bgn, int codepoint_idx) {
		return Count_surrogates(src, src_len, byte_bgn, Bool_.N, codepoint_idx);
	}

	/**
	 * Shared walker.
	 *
	 * @param src              UTF-8 bytes
	 * @param src_len          exclusive upper bound for the walk
	 * @param byte_bgn         byte offset to start from
	 * @param stop_idx_is_char selects which tally is compared against stop_idx
	 * @param stop_idx         tally value at which the walk stops
	 * @return difference between the two tallies, i.e. the number of 4-byte sequences consumed
	 */
	private int Count_surrogates(byte[] src, int src_len, int byte_bgn, boolean stop_idx_is_char, int stop_idx) {
		int seq_tally = 0;	// +1 for every UTF-8 sequence
		int unit_tally = 0;	// +2 for a 4-byte sequence; +1 otherwise
		byte_pos = byte_bgn;
		while (byte_pos < src_len) {	// stop at eos; DATE:2014-09-02
			int reached = stop_idx_is_char ? seq_tally : unit_tally;
			if (reached == stop_idx) break;	// requested index found
			int seq_len = gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(src[byte_pos]);
			seq_tally++;
			if (seq_len == 4)
				unit_tally += 2;
			else
				unit_tally += 1;
			byte_pos += seq_len;
		}
		return unit_tally - seq_tally;
	}
}

View File

@@ -1,55 +0,0 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.core.intls; import gplx.*; import gplx.core.*;
import org.junit.*;
/**
 * Tests for String_surrogate_utl.
 *
 * Fixture arguments are: source, starting byte, index to stop at,
 * expected surrogate count, expected ending byte position.
 */
public class String_surrogate_utl_tst {
	// byte layout: a=1, é=2, 𡼾=4 (ends at 7); b=1, î=2, 𡼾=4 (ends at 14)
	private static final String Test_str = "aé𡼾bî𡼾";
	private String_surrogate_utl_fxt fxt = new String_surrogate_utl_fxt();

	@Before public void init() {
		fxt.Clear();
	}

	@Test public void Char_idx() {
		fxt.Test_count_surrogates__char_idx(Test_str,  0, 1, 0,  1);	// a
		fxt.Test_count_surrogates__char_idx(Test_str,  0, 2, 0,  3);	// aé
		fxt.Test_count_surrogates__char_idx(Test_str,  0, 3, 1,  7);	// aé𡼾
		fxt.Test_count_surrogates__char_idx(Test_str,  7, 1, 0,  8);	// b
		fxt.Test_count_surrogates__char_idx(Test_str,  7, 2, 0, 10);	// bî
		fxt.Test_count_surrogates__char_idx(Test_str,  7, 3, 1, 14);	// bî𡼾
		fxt.Test_count_surrogates__char_idx(Test_str,  0, 6, 2, 14);	// aé𡼾bî𡼾
		fxt.Test_count_surrogates__char_idx(Test_str, 14, 7, 0, 14);	// PURPOSE: test out of bounds; DATE:2014-09-02
	}

	@Test public void Codepoint_idx() {
		fxt.Test_count_surrogates__codepoint_idx(Test_str,  0, 1, 0,  1);	// a
		fxt.Test_count_surrogates__codepoint_idx(Test_str,  0, 2, 0,  3);	// aé
		fxt.Test_count_surrogates__codepoint_idx(Test_str,  0, 4, 1,  7);	// aé𡼾
		fxt.Test_count_surrogates__codepoint_idx(Test_str,  7, 1, 0,  8);	// b
		fxt.Test_count_surrogates__codepoint_idx(Test_str,  7, 2, 0, 10);	// bî
		fxt.Test_count_surrogates__codepoint_idx(Test_str,  7, 4, 1, 14);	// bî𡼾
		fxt.Test_count_surrogates__codepoint_idx(Test_str,  0, 8, 2, 14);	// aé𡼾bî𡼾
	}
}
/**
 * Test fixture for String_surrogate_utl: converts the source String to UTF-8 bytes,
 * runs one of the Count_surrogates__* methods, and checks both the returned count
 * and the resulting Byte_pos().
 */
class String_surrogate_utl_fxt {
	private final String_surrogate_utl codepoint_utl = new String_surrogate_utl();
	public void Clear() {}

	/**
	 * @param src_str    text to convert to UTF-8
	 * @param bgn_byte   byte offset to start counting from
	 * @param char_idx   index passed to Count_surrogates__char_idx
	 * @param expd_count expected return value
	 * @param expd_pos   expected Byte_pos() after the call
	 */
	public void Test_count_surrogates__char_idx(String src_str, int bgn_byte, int char_idx, int expd_count, int expd_pos) {
		byte[] src_bry = Bry_.new_u8(src_str);
		int actl_count = codepoint_utl.Count_surrogates__char_idx(src_bry, src_bry.length, bgn_byte, char_idx);
		// label the assertions so a failure says which value was wrong (matches the codepoint_idx variant)
		Tfds.Eq(expd_count, actl_count, "count");
		Tfds.Eq(expd_pos, codepoint_utl.Byte_pos(), "pos");
	}

	/**
	 * @param src_str    text to convert to UTF-8
	 * @param bgn_byte   byte offset to start counting from
	 * @param char_idx   index passed to Count_surrogates__codepoint_idx1
	 * @param expd_count expected return value
	 * @param expd_pos   expected Byte_pos() after the call
	 */
	public void Test_count_surrogates__codepoint_idx(String src_str, int bgn_byte, int char_idx, int expd_count, int expd_pos) {
		byte[] src_bry = Bry_.new_u8(src_str);
		int actl_count = codepoint_utl.Count_surrogates__codepoint_idx1(src_bry, src_bry.length, bgn_byte, char_idx);
		Tfds.Eq(expd_count, actl_count, "count");
		Tfds.Eq(expd_pos, codepoint_utl.Byte_pos(), "pos");
	}
}

View File

@@ -0,0 +1,79 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.core.intls; import gplx.*; import gplx.core.*;
/**
 * Precomputed lookup tables that translate between three ways of indexing the same text:
 * <ul>
 * <li>"byte": offset into the UTF-8 byte array</li>
 * <li>"code": ordinal of the UTF-8 sequence (advances by 1 per sequence)</li>
 * <li>"char": position that advances by 1 or 2 per sequence, depending on its byte length</li>
 * </ul>
 * All four tables live back to back in one int array; each table has
 * {@code src_bry_len + 1} slots so the end-of-string position is also mapped.
 * Slots that do not correspond to the start of a sequence hold {@link #Invalid}.
 *
 * NOTE(review): presumably expects src_bry[0..src_bry_len) to be well-formed UTF-8; a
 * truncated trailing sequence would advance the byte position past src_bry_len — confirm
 * against Utf8_.Len_of_char_by_1st_byte and callers.
 */
public class Utf16_mapper {
	private final int[] ary;		// Dims_total tables of dim_len ints each
	private final int dim_len;		// slots per table (src_bry_len + 1)
	public byte[] Src_bry() {return src_bry;} private final byte[] src_bry;
	public String Src_str() {return src_str;} private final String src_str;
	public int Len_in_codes() {return len_in_codes;} private final int len_in_codes;
	public int Len_in_chars() {return len_in_chars;} private final int len_in_chars;

	/** Returns the mapped position, or {@link #Invalid} when idx is negative, past the table, or unmapped. */
	public int Get_code_for_byte_or_neg1(int idx) {return Get_or_neg1(Dims_code_for_byte, idx);}
	public int Get_byte_for_code_or_neg1(int idx) {return Get_or_neg1(Dims_byte_for_code, idx);}
	public int Get_code_for_char_or_neg1(int idx) {return Get_or_neg1(Dims_code_for_char, idx);}
	public int Get_char_for_code_or_neg1(int idx) {return Get_or_neg1(Dims_char_for_code, idx);}

	/** Returns the mapped position; throws when the lookup yields {@link #Invalid}. */
	public int Get_code_for_byte_or_fail(int idx) {return Get_or_fail(Dims_code_for_byte, "code_for_byte", idx);}
	public int Get_byte_for_code_or_fail(int idx) {return Get_or_fail(Dims_byte_for_code, "byte_for_code", idx);}
	public int Get_code_for_char_or_fail(int idx) {return Get_or_fail(Dims_code_for_char, "code_for_char", idx);}
	public int Get_char_for_code_or_fail(int idx) {return Get_or_fail(Dims_char_for_code, "char_for_code", idx);}

	/** Bounds-checked read of one table; a negative idx must not reach into the preceding table. */
	private int Get_or_neg1(int dim, int idx) {
		return (idx >= 0 && idx < dim_len) ? ary[(dim_len * dim) + idx] : Invalid;
	}

	/** Same as Get_or_neg1 but fails on the lookup RESULT being Invalid (not on the idx argument). */
	private int Get_or_fail(int dim, String type, int idx) {
		int rv = Get_or_neg1(dim, idx);
		if (rv == Invalid) throw Err_.new_wo_type("invalid idx", "src", src_bry, "type", type, "idx", idx);
		return rv;
	}

	/**
	 * Builds all four tables in a single pass over the UTF-8 bytes.
	 *
	 * @param src_str     original text; only stored for Src_str()
	 * @param src_bry     UTF-8 bytes of the text
	 * @param src_bry_len number of bytes of src_bry to map
	 */
	public Utf16_mapper(String src_str, byte[] src_bry, int src_bry_len) {
		this.src_str = src_str;
		this.src_bry = src_bry;
		this.dim_len = src_bry_len + 1; // +1 to capture end + 1
		this.ary = new int[dim_len * Dims_total];
		java.util.Arrays.fill(ary, Invalid);

		int pos_in_bytes = 0, pos_in_chars = 0, pos_in_codes = 0;
		while (true) {
			// record the current position in every table; runs once more at EOS so the end position is mapped too
			ary[(dim_len * Dims_code_for_byte) + pos_in_bytes] = pos_in_codes;
			ary[(dim_len * Dims_byte_for_code) + pos_in_codes] = pos_in_bytes;
			ary[(dim_len * Dims_code_for_char) + pos_in_chars] = pos_in_codes;
			ary[(dim_len * Dims_char_for_code) + pos_in_codes] = pos_in_chars;

			if (pos_in_bytes >= src_bry_len) break;

			int cur_len_in_bytes = Utf8_.Len_of_char_by_1st_byte(src_bry[pos_in_bytes]);
			// NOTE(review): 3-byte UTF-8 sequences encode U+0800..U+FFFF, which fit in ONE UTF-16 code unit;
			// only 4-byte sequences need a surrogate pair. "> 2" counts 3-byte sequences as 2 chars.
			// Left unchanged because existing test expectations pin this result — confirm and change both together.
			int cur_len_in_chars = cur_len_in_bytes > 2 ? 2 : 1;

			pos_in_bytes += cur_len_in_bytes;
			pos_in_chars += cur_len_in_chars;
			pos_in_codes += 1;
		}

		this.len_in_codes = pos_in_codes;
		this.len_in_chars = pos_in_chars;
	}

	public static final int
	  Invalid = -1
	, Dims_total = 4
	, Dims_code_for_byte = 0
	, Dims_byte_for_code = 1
	, Dims_code_for_char = 2
	, Dims_char_for_code = 3
	;
}

View File

@@ -0,0 +1,62 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.core.intls; import gplx.*; import gplx.core.*;
import org.junit.*; import gplx.core.tests.*;
/**
 * Tests the four Utf16_mapper tables against a string holding one sequence of each
 * UTF-8 length: a (1 byte), ¢ (2 bytes), € (3 bytes), 𤭢 (4 bytes); 10 bytes total,
 * so each expected array has 11 slots (the last valid entry maps the end position).
 */
public class Utf16_mapper_tst {
	private final Utf16_mapper_fxt fxt = new Utf16_mapper_fxt();

	@Test public void A() {
		// slot index = byte offset; -1 marks bytes that are not the start of a sequence
		int[] code_for_byte = Int_ary_.New( 0,  1, -1,  2, -1, -1,  3, -1, -1, -1,  4);
		// slot index = code ordinal
		int[] byte_for_code = Int_ary_.New( 0,  1,  3,  6, 10, -1, -1, -1, -1, -1, -1);
		// slot index = char position
		// NOTE(review): these values give € (3 UTF-8 bytes) two char slots; in UTF-16 it is a single code unit — confirm intent
		int[] code_for_char = Int_ary_.New( 0,  1,  2, -1,  3, -1,  4, -1, -1, -1, -1);
		// slot index = code ordinal
		int[] char_for_code = Int_ary_.New( 0,  1,  2,  4,  6, -1, -1, -1, -1, -1, -1);

		fxt.Test__map("a¢€𤭢", code_for_byte, byte_for_code, code_for_char, char_for_code);
	}
}
/**
 * Test fixture for Utf16_mapper: builds a mapper from a String and compares each of its
 * four tables, read through the *_or_neg1 accessors, with an expected int array.
 */
class Utf16_mapper_fxt {
	/** Builds the mapper for src_str and checks every table in turn. */
	public void Test__map(String src_str, int[] expd_code_for_byte, int[] expd_byte_for_code, int[] expd_code_for_char, int[] expd_char_for_code) {
		byte[] bry = Bry_.new_u8(src_str);
		int bry_len = bry.length;
		Utf16_mapper mapper = new Utf16_mapper(src_str, bry, bry_len);

		Test__ary(mapper, bry_len, Utf16_mapper.Dims_code_for_byte, expd_code_for_byte);
		Test__ary(mapper, bry_len, Utf16_mapper.Dims_byte_for_code, expd_byte_for_code);
		Test__ary(mapper, bry_len, Utf16_mapper.Dims_code_for_char, expd_code_for_char);
		Test__ary(mapper, bry_len, Utf16_mapper.Dims_char_for_code, expd_char_for_code);
	}

	/** Reads slots 0..src_len (inclusive) of one table and compares them with expd. */
	private void Test__ary(Utf16_mapper mapper, int src_len, int dim_type, int[] expd) {
		int[] actl = new int[src_len + 1];
		for (int slot = 0; slot < actl.length; slot++) {
			actl[slot] = Lookup(mapper, dim_type, slot);
		}
		Gftest.Eq__ary(expd, actl, Int_.To_str(dim_type));
	}

	/** Dispatches to the accessor matching dim_type; yields -1 for an unrecognized dim_type. */
	private static int Lookup(Utf16_mapper mapper, int dim_type, int slot) {
		if (dim_type == Utf16_mapper.Dims_code_for_byte) return mapper.Get_code_for_byte_or_neg1(slot);
		if (dim_type == Utf16_mapper.Dims_byte_for_code) return mapper.Get_byte_for_code_or_neg1(slot);
		if (dim_type == Utf16_mapper.Dims_code_for_char) return mapper.Get_code_for_char_or_neg1(slot);
		if (dim_type == Utf16_mapper.Dims_char_for_code) return mapper.Get_char_for_code_or_neg1(slot);
		return -1;
	}
}