mirror of https://github.com/gnosygnu/xowa
Scribunto: Do not fail in ustring.find if negative bgn is large [#366]
parent
91cbb34fa5
commit
3fd759b020
@ -1,34 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.core.intls; import gplx.*; import gplx.core.*;
|
||||
public class String_surrogate_utl {
|
||||
public int Byte_pos() {return byte_pos;} int byte_pos;
|
||||
public int Count_surrogates__char_idx(byte[] src, int src_len, int byte_bgn, int char_idx) {return Count_surrogates(src, src_len, byte_bgn, Bool_.Y, char_idx);}
|
||||
public int Count_surrogates__codepoint_idx1(byte[] src, int src_len, int byte_bgn, int codepoint_idx) {return Count_surrogates(src, src_len, byte_bgn, Bool_.N, codepoint_idx);}
|
||||
private int Count_surrogates(byte[] src, int src_len, int byte_bgn, boolean stop_idx_is_char, int stop_idx) {
|
||||
int char_count = 0, codepoint_count = 0;
|
||||
byte_pos = byte_bgn;
|
||||
while (true) {
|
||||
if ( stop_idx == (stop_idx_is_char ? char_count : codepoint_count) // requested # of chars found
|
||||
|| byte_pos >= src_len // eos reached; DATE:2014-09-02
|
||||
) return codepoint_count - char_count;
|
||||
int char_len_in_bytes = gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(src[byte_pos]);
|
||||
++char_count; // char_count always incremented by 1
|
||||
codepoint_count += (char_len_in_bytes == 4) ? 2 : 1; // codepoint_count incremented by 2 if surrogate pair; else 1
|
||||
byte_pos += char_len_in_bytes;
|
||||
}
|
||||
}
|
||||
}
|
@ -1,55 +0,0 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.core.intls; import gplx.*; import gplx.core.*;
|
||||
import org.junit.*;
|
||||
public class String_surrogate_utl_tst {
|
||||
@Before public void init() {fxt.Clear();} private String_surrogate_utl_fxt fxt = new String_surrogate_utl_fxt();
|
||||
@Test public void Char_idx() {
|
||||
String test_str = "aé𡼾bî𡼾";
|
||||
fxt.Test_count_surrogates__char_idx (test_str, 0, 1, 0, 1); // a
|
||||
fxt.Test_count_surrogates__char_idx (test_str, 0, 2, 0, 3); // aé
|
||||
fxt.Test_count_surrogates__char_idx (test_str, 0, 3, 1, 7); // aé𡼾
|
||||
fxt.Test_count_surrogates__char_idx (test_str, 7, 1, 0, 8); // b
|
||||
fxt.Test_count_surrogates__char_idx (test_str, 7, 2, 0, 10); // bî
|
||||
fxt.Test_count_surrogates__char_idx (test_str, 7, 3, 1, 14); // bî𡼾
|
||||
fxt.Test_count_surrogates__char_idx (test_str, 0, 6, 2, 14); // aé𡼾bî𡼾
|
||||
fxt.Test_count_surrogates__char_idx (test_str, 14, 7, 0, 14); // PURPOSE: test out of bounds; DATE:2014-09-02
|
||||
}
|
||||
@Test public void Codepoint_idx() {
|
||||
String test_str = "aé𡼾bî𡼾";
|
||||
fxt.Test_count_surrogates__codepoint_idx (test_str, 0, 1, 0, 1); // a
|
||||
fxt.Test_count_surrogates__codepoint_idx (test_str, 0, 2, 0, 3); // aé
|
||||
fxt.Test_count_surrogates__codepoint_idx (test_str, 0, 4, 1, 7); // aé𡼾
|
||||
fxt.Test_count_surrogates__codepoint_idx (test_str, 7, 1, 0, 8); // b
|
||||
fxt.Test_count_surrogates__codepoint_idx (test_str, 7, 2, 0, 10); // bî
|
||||
fxt.Test_count_surrogates__codepoint_idx (test_str, 7, 4, 1, 14); // bî𡼾
|
||||
fxt.Test_count_surrogates__codepoint_idx (test_str, 0, 8, 2, 14); // aé𡼾bî𡼾
|
||||
}
|
||||
}
|
||||
class String_surrogate_utl_fxt {
|
||||
private String_surrogate_utl codepoint_utl = new String_surrogate_utl();
|
||||
public void Clear() {}
|
||||
public void Test_count_surrogates__char_idx(String src_str, int bgn_byte, int char_idx, int expd_count, int expd_pos) {
|
||||
byte[] src_bry = Bry_.new_u8(src_str); int src_len = src_bry.length;
|
||||
Tfds.Eq(expd_count , codepoint_utl.Count_surrogates__char_idx(src_bry, src_len, bgn_byte, char_idx));
|
||||
Tfds.Eq(expd_pos , codepoint_utl.Byte_pos());
|
||||
}
|
||||
public void Test_count_surrogates__codepoint_idx(String src_str, int bgn_byte, int char_idx, int expd_count, int expd_pos) {
|
||||
byte[] src_bry = Bry_.new_u8(src_str); int src_len = src_bry.length;
|
||||
Tfds.Eq(expd_count , codepoint_utl.Count_surrogates__codepoint_idx1(src_bry, src_len, bgn_byte, char_idx), "count");
|
||||
Tfds.Eq(expd_pos , codepoint_utl.Byte_pos(), "pos");
|
||||
}
|
||||
}
|
@ -0,0 +1,79 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.core.intls; import gplx.*; import gplx.core.*;
|
||||
public class Utf16_mapper {
|
||||
private final int[] ary;
|
||||
private final int dim_len;
|
||||
public byte[] Src_bry() {return src_bry;} private final byte[] src_bry;
|
||||
public String Src_str() {return src_str;} private final String src_str;
|
||||
public int Len_in_codes() {return len_in_codes;} private int len_in_codes;
|
||||
public int Len_in_chars() {return len_in_chars;} private int len_in_chars;
|
||||
public int Get_code_for_byte_or_neg1(int idx) {return idx < dim_len ? ary[(dim_len * Dims_code_for_byte) + idx] : Invalid;}
|
||||
public int Get_byte_for_code_or_neg1(int idx) {return idx < dim_len ? ary[(dim_len * Dims_byte_for_code) + idx] : Invalid;}
|
||||
public int Get_code_for_char_or_neg1(int idx) {return idx < dim_len ? ary[(dim_len * Dims_code_for_char) + idx] : Invalid;}
|
||||
public int Get_char_for_code_or_neg1(int idx) {return idx < dim_len ? ary[(dim_len * Dims_char_for_code) + idx] : Invalid;}
|
||||
public int Get_code_for_byte_or_fail(int idx) {int rv = Get_code_for_byte_or_neg1(idx); if (idx == Invalid) throw Err_.new_wo_type("invalid idx", "src", src_bry, "type", "code_for_byte", "idx", idx); return rv;}
|
||||
public int Get_byte_for_code_or_fail(int idx) {int rv = Get_byte_for_code_or_neg1(idx); if (idx == Invalid) throw Err_.new_wo_type("invalid idx", "src", src_bry, "type", "byte_for_code", "idx", idx); return rv;}
|
||||
public int Get_code_for_char_or_fail(int idx) {int rv = Get_code_for_char_or_neg1(idx); if (idx == Invalid) throw Err_.new_wo_type("invalid idx", "src", src_bry, "type", "code_for_char", "idx", idx); return rv;}
|
||||
public int Get_char_for_code_or_fail(int idx) {int rv = Get_char_for_code_or_neg1(idx); if (idx == Invalid) throw Err_.new_wo_type("invalid idx", "src", src_bry, "type", "char_for_code", "idx", idx); return rv;}
|
||||
public Utf16_mapper(String src_str, byte[] src_bry, int src_bry_len) {
|
||||
// create ary
|
||||
this.src_str = src_str;
|
||||
this.src_bry = src_bry;
|
||||
this.dim_len = src_bry_len + 1; // +1 to capture end + 1
|
||||
int ary_len = dim_len * Dims_total;
|
||||
this.ary = new int[dim_len * Dims_total];
|
||||
for (int i = 0; i < ary_len; i++)
|
||||
ary[i] = Invalid;
|
||||
|
||||
// init
|
||||
int pos_in_bytes = 0, pos_in_chars = 0, pos_in_codes = 0;
|
||||
|
||||
// loop till EOS
|
||||
while (true) {
|
||||
// update
|
||||
ary[(dim_len * Dims_code_for_byte) + pos_in_bytes] = pos_in_codes;
|
||||
ary[(dim_len * Dims_byte_for_code) + pos_in_codes] = pos_in_bytes;
|
||||
ary[(dim_len * Dims_code_for_char) + pos_in_chars] = pos_in_codes;
|
||||
ary[(dim_len * Dims_char_for_code) + pos_in_codes] = pos_in_chars;
|
||||
|
||||
// exit if EOS
|
||||
if (pos_in_bytes >= src_bry_len) break;
|
||||
|
||||
// get lengths
|
||||
int cur_len_in_bytes = Utf8_.Len_of_char_by_1st_byte(src_bry[pos_in_bytes]);
|
||||
int cur_len_in_chars = cur_len_in_bytes > 2 ? 2 : 1;
|
||||
|
||||
// increment
|
||||
pos_in_bytes += cur_len_in_bytes;
|
||||
pos_in_chars += cur_len_in_chars;
|
||||
pos_in_codes += 1;
|
||||
}
|
||||
|
||||
// set lens
|
||||
this.len_in_codes = pos_in_codes;
|
||||
this.len_in_chars = pos_in_chars;
|
||||
}
|
||||
|
||||
public static final int
|
||||
Invalid = -1
|
||||
, Dims_total = 4
|
||||
, Dims_code_for_byte = 0
|
||||
, Dims_byte_for_code = 1
|
||||
, Dims_code_for_char = 2
|
||||
, Dims_char_for_code = 3
|
||||
;
|
||||
}
|
@ -0,0 +1,62 @@
|
||||
/*
|
||||
XOWA: the XOWA Offline Wiki Application
|
||||
Copyright (C) 2012-2017 gnosygnu@gmail.com
|
||||
|
||||
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
|
||||
or alternatively under the terms of the Apache License Version 2.0.
|
||||
|
||||
You may use XOWA according to either of these licenses as is most appropriate
|
||||
for your project on a case-by-case basis.
|
||||
|
||||
The terms of each license can be found in the source code repository:
|
||||
|
||||
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
|
||||
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
|
||||
*/
|
||||
package gplx.core.intls; import gplx.*; import gplx.core.*;
|
||||
import org.junit.*; import gplx.core.tests.*;
|
||||
public class Utf16_mapper_tst {
|
||||
private final Utf16_mapper_fxt fxt = new Utf16_mapper_fxt();
|
||||
@Test public void A() {
|
||||
fxt.Test__map("a¢€𤭢"
|
||||
, Int_ary_.New( 0, 1, -1, 2, -1, -1, 3, -1, -1, -1, 4)
|
||||
, Int_ary_.New( 0, 1, 3, 6, 10, -1, -1, -1, -1, -1, -1)
|
||||
, Int_ary_.New( 0, 1, 2, -1, 3, -1, 4, -1, -1, -1, -1)
|
||||
, Int_ary_.New( 0, 1, 2, 4, 6, -1, -1, -1, -1, -1, -1)
|
||||
);
|
||||
}
|
||||
}
|
||||
class Utf16_mapper_fxt {
|
||||
public void Test__map(String src_str, int[] expd_code_for_byte, int[] expd_byte_for_code, int[] expd_code_for_char, int[] expd_char_for_code) {
|
||||
byte[] src_bry = Bry_.new_u8(src_str);
|
||||
int src_len = src_bry.length;
|
||||
Utf16_mapper mapper = new Utf16_mapper(src_str, src_bry, src_len);
|
||||
Test__ary(mapper, src_len, Utf16_mapper.Dims_code_for_byte, expd_code_for_byte);
|
||||
Test__ary(mapper, src_len, Utf16_mapper.Dims_byte_for_code, expd_byte_for_code);
|
||||
Test__ary(mapper, src_len, Utf16_mapper.Dims_code_for_char, expd_code_for_char);
|
||||
Test__ary(mapper, src_len, Utf16_mapper.Dims_char_for_code, expd_char_for_code);
|
||||
}
|
||||
private void Test__ary(Utf16_mapper mapper, int src_len, int dim_type, int[] expd) {
|
||||
int actl_len = src_len + 1;
|
||||
int[] actl = new int[actl_len];
|
||||
for (int i = 0; i < actl_len; i++) {
|
||||
int v = -1;
|
||||
switch (dim_type) {
|
||||
case Utf16_mapper.Dims_code_for_byte:
|
||||
v = mapper.Get_code_for_byte_or_neg1(i);
|
||||
break;
|
||||
case Utf16_mapper.Dims_byte_for_code:
|
||||
v = mapper.Get_byte_for_code_or_neg1(i);
|
||||
break;
|
||||
case Utf16_mapper.Dims_code_for_char:
|
||||
v = mapper.Get_code_for_char_or_neg1(i);
|
||||
break;
|
||||
case Utf16_mapper.Dims_char_for_code:
|
||||
v = mapper.Get_char_for_code_or_neg1(i);
|
||||
break;
|
||||
}
|
||||
actl[i] = v;
|
||||
}
|
||||
Gftest.Eq__ary(expd, actl, Int_.To_str(dim_type));
|
||||
}
|
||||
}
|
Loading…
Reference in new issue