1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2026-03-02 03:49:30 +00:00
This commit is contained in:
gnosygnu
2014-09-01 00:43:52 -04:00
parent b0a01882de
commit be63adc5af
125 changed files with 1859 additions and 952 deletions

View File

@@ -0,0 +1,35 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.intl; import gplx.*;
public class String_surrogate_utl {
public int Byte_pos() {return byte_pos;} int byte_pos;
public int Count_surrogates__char_idx(byte[] src, int src_len, int byte_bgn, int char_idx) {return Count_surrogates(src, src_len, byte_bgn, Bool_.Y, char_idx);}
public int Count_surrogates__codepoint_idx1(byte[] src, int src_len, int byte_bgn, int codepoint_idx) {return Count_surrogates(src, src_len, byte_bgn, Bool_.N, codepoint_idx);}
private int Count_surrogates(byte[] src, int src_len, int byte_bgn, boolean stop_idx_is_char, int stop_idx) {
int char_count = 0, codepoint_count = 0;
byte_pos = byte_bgn;
while (true) {
if (stop_idx == (stop_idx_is_char ? char_count : codepoint_count)) return codepoint_count - char_count;
if (byte_pos >= src_len) throw Err_.new_("codepoint_idx is not in string; stop_idx={0} stop_idx_is_char={1} byte_bgn={2} string={3}", stop_idx, stop_idx_is_char, byte_bgn, String_.new_utf8_(src));
int char_len_in_bytes = gplx.intl.Utf8_.Len_of_char_by_1st_byte(src[byte_pos]);
++char_count; // char_count always incremented by 1
codepoint_count += (char_len_in_bytes == 4) ? 2 : 1; // codepoint_count incremented by 2 if surrogate pair; else 1
byte_pos += char_len_in_bytes;
}
}
}

View File

@@ -0,0 +1,56 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.intl; import gplx.*;
import org.junit.*;
public class String_surrogate_utl_tst {
@Before public void init() {fxt.Clear();} private String_surrogate_utl_fxt fxt = new String_surrogate_utl_fxt();
@Test public void Char_idx() {
String test_str = "aé𡼾bî𡼾";
fxt.Test_count_surrogates__char_idx (test_str, 0, 1, 0, 1); // a
fxt.Test_count_surrogates__char_idx (test_str, 0, 2, 0, 3); // aé
fxt.Test_count_surrogates__char_idx (test_str, 0, 3, 1, 7); // aé𡼾
fxt.Test_count_surrogates__char_idx (test_str, 7, 1, 0, 8); // b
fxt.Test_count_surrogates__char_idx (test_str, 7, 2, 0, 10); // bî
fxt.Test_count_surrogates__char_idx (test_str, 7, 3, 1, 14); // bî𡼾
fxt.Test_count_surrogates__char_idx (test_str, 0, 6, 2, 14); // aé𡼾bî𡼾
}
@Test public void Codepoint_idx() {
String test_str = "aé𡼾bî𡼾";
fxt.Test_count_surrogates__codepoint_idx (test_str, 0, 1, 0, 1); // a
fxt.Test_count_surrogates__codepoint_idx (test_str, 0, 2, 0, 3); // aé
fxt.Test_count_surrogates__codepoint_idx (test_str, 0, 4, 1, 7); // aé𡼾
fxt.Test_count_surrogates__codepoint_idx (test_str, 7, 1, 0, 8); // b
fxt.Test_count_surrogates__codepoint_idx (test_str, 7, 2, 0, 10); // bî
fxt.Test_count_surrogates__codepoint_idx (test_str, 7, 4, 1, 14); // bî𡼾
fxt.Test_count_surrogates__codepoint_idx (test_str, 0, 8, 2, 14); // aé𡼾bî𡼾
}
}
class String_surrogate_utl_fxt {
private String_surrogate_utl codepoint_utl = new String_surrogate_utl();
public void Clear() {}
public void Test_count_surrogates__char_idx(String src_str, int bgn_byte, int char_idx, int expd_count, int expd_pos) {
byte[] src_bry = Bry_.new_utf8_(src_str); int src_len = src_bry.length;
Tfds.Eq(expd_count , codepoint_utl.Count_surrogates__char_idx(src_bry, src_len, bgn_byte, char_idx));
Tfds.Eq(expd_pos , codepoint_utl.Byte_pos());
}
public void Test_count_surrogates__codepoint_idx(String src_str, int bgn_byte, int char_idx, int expd_count, int expd_pos) {
byte[] src_bry = Bry_.new_utf8_(src_str); int src_len = src_bry.length;
Tfds.Eq(expd_count , codepoint_utl.Count_surrogates__codepoint_idx1(src_bry, src_len, bgn_byte, char_idx), "count");
Tfds.Eq(expd_pos , codepoint_utl.Byte_pos(), "pos");
}
}