1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2026-03-02 03:49:30 +00:00

uca category support and other changes

This commit is contained in:
gnosygnu
2016-10-12 08:57:22 -04:00
parent e3b393650d
commit 3fc2e0741f
187 changed files with 3486 additions and 2984 deletions

View File

@@ -0,0 +1,22 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.core.intls.ucas; import gplx.*; import gplx.core.*; import gplx.core.intls.*;
/**
 * Collator abstraction that turns strings into binary sort keys.
 * Implementations hide the underlying collation library from callers.
 */
public interface Uca_collator {
	/**
	 * Configures the collator. Implementations are expected to be initialized through this method before {@link #Get_sortkey(String)} is used.
	 * @param locale            identifier of the locale whose collation rules should be used
	 * @param numeric_ordering  true to request numeric ordering of digit sequences
	 */
	void Init(String locale, boolean numeric_ordering);

	/**
	 * Builds the sort key for a string.
	 * @param s  text to build the key for
	 * @return   the sort key as raw bytes
	 */
	byte[] Get_sortkey(String s);
}

View File

@@ -0,0 +1,25 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.core.intls.ucas; import gplx.*; import gplx.core.*; import gplx.core.intls.*;
/** Static factory for {@link Uca_collator} instances. */
public class Uca_collator_ {
	/**
	 * Creates the ICU-backed collator and initializes it before handing it out.
	 * @param locale            forwarded unchanged to Init
	 * @param numeric_ordering  forwarded unchanged to Init
	 * @return                  the initialized collator
	 */
	public static Uca_collator New(String locale, boolean numeric_ordering) {
		Uca_collator__icu__4_8 icu_collator = new Uca_collator__icu__4_8();
		icu_collator.Init(locale, numeric_ordering);
		return icu_collator;
	}
}

View File

@@ -0,0 +1,49 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.core.intls.ucas; import gplx.*; import gplx.core.*; import gplx.core.intls.*;
import java.util.Locale;
import com.ibm.icu.text.CollationKey;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
/**
 * {@link Uca_collator} implementation that delegates to an ICU4J Collator.
 * Init must run before Get_sortkey; until then the collator field is null.
 */
class Uca_collator__icu__4_8 implements Uca_collator {
	private Collator collator;
	/**
	 * Creates the ICU collator for the given locale and optionally switches on numeric collation.
	 * Any failure (including a collator that is not a RuleBasedCollator) is rethrown as a "collator init failed" error.
	 * @param locale            language tag handed to Locale.forLanguageTag
	 * @param numeric_ordering  true to enable numeric collation on the collator
	 */
	public void Init(String locale, boolean numeric_ordering) {
		try {
			this.collator = Collator.getInstance(Locale.forLanguageTag(locale));
			if (numeric_ordering) {
				// setNumericCollation is called through RuleBasedCollator, hence the cast; a failed cast is reported by the catch below
				RuleBasedCollator rbc = (RuleBasedCollator)collator;
				rbc.setNumericCollation(true);
			}
		} catch (Exception e) {throw Err_.new_wo_type("collator init failed", "err", Err_.Message_lang(e));}
	}
	/**
	 * Returns the collation key bytes for a string, without the trailing 0 byte when one is present.
	 * @param s  text to build the sort key for
	 * @return   the key bytes; the array from ICU is returned as-is when it does not end in 0
	 */
	public byte[] Get_sortkey(String s) {
		CollationKey key = collator.getCollationKey(s);
		byte[] src = key.toByteArray();
		int src_len = src.length;
		// nothing to trim: empty key, or last byte is not 0
		if (src_len == 0 || src[src_len - 1] != 0) return src;
		// remove last byte if it is 0 (which it often is)
		byte[] rv = new byte[src_len - 1];
		System.arraycopy(src, 0, rv, 0, rv.length);
		return rv;
	}
}

View File

@@ -0,0 +1,51 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.core.intls.ucas; import gplx.*; import gplx.core.*; import gplx.core.intls.*;
/**
 * Extracts the heading ("first letter") for a byte string.
 * Only the numeric rule is implemented: when enabled, any value whose first char is an ASCII digit gets the shared heading "0-9".
 */
public class Uca_ltr_extractor {
	private final boolean numeric;
	private final byte[] numeric_heading;		// "0-9" when numeric is on; else null
	private final Hash_adp_bry numeric_hash;	// holds "0" .. "9" when numeric is on; else null
	/**
	 * @param numeric  true to group all leading digits under the "0-9" heading
	 */
	public Uca_ltr_extractor(boolean numeric) {
		this.numeric = numeric;
		byte[] heading = null;
		Hash_adp_bry digits = null;
		if (numeric) {
			heading = Bry_.new_a7("0-9");
			// register "0", "1", "2", ... so a leading digit can be detected by lookup
			digits = Hash_adp_bry.cs();
			int digit_end = Byte_ascii.Num_0 + 10;
			for (int digit = Byte_ascii.Num_0; digit < digit_end; ++digit) {
				byte[] digit_bry = Bry_.new_by_int(digit);
				digits.Add(digit_bry, digit_bry);
			}
		}
		this.numeric_heading = heading;
		this.numeric_hash = digits;
	}
	/**
	 * Returns the heading for bry.
	 * @param bry  bytes to inspect
	 * @return     Bry_.Empty for empty input; "0-9" when numeric is on and the first char is a registered digit; otherwise the first char as returned by Utf8_.Get_char_at_pos_as_bry
	 */
	public byte[] Get_1st_ltr(byte[] bry) {
		// NOTE: this is simplified and only does numeric logic; MW code loads up all ICU chars via first-letters-root.ser, adds custom chars, sorts them, and then does a binary search to find it; REF:IcuCollation.php!getFirstLetter
		if (bry.length == 0) return Bry_.Empty;
		byte[] first_char = gplx.core.intls.Utf8_.Get_char_at_pos_as_bry(bry, 0);
		return numeric && numeric_hash.Has(first_char) ? numeric_heading : first_char;
	}
}

View File

@@ -22,7 +22,7 @@ public class Io_buffer_rdr_tst {
Io_mgr.Instance.InitEngine_mem();
fil = Io_url_.mem_fil_("mem/byteStreamRdr.txt");
ini_Write("0123456789");
rdr = Io_buffer_rdr.new_(Io_stream_rdr_.file_(fil), 4);
rdr = Io_buffer_rdr.new_(Io_stream_rdr_.New__raw(fil), 4);
} Io_buffer_rdr rdr; Io_url fil;
@After public void teardown() {rdr.Rls();}
@Test public void Bfr_load_all() {

View File

@@ -23,7 +23,7 @@ public class Io_stream_rdr_process implements Io_stream_rdr {
private InputStream stream_read;
private String[] process_args;
Io_stream_rdr_process(Io_url process_exe, Io_url stream_url, String[] process_args) {this.process_exe = process_exe; this.url = stream_url; this.process_args = process_args;}
public byte Tid() {return Io_stream_.Tid_bzip2;} // for now, classify as bzip2; not sure if separate tid is necessary
public byte Tid() {return Io_stream_tid_.Tid__bzip2;} // for now, classify as bzip2; not sure if separate tid is necessary
public boolean Exists() {return this.Len() > 0;}
public Io_url Url() {return url;} public Io_stream_rdr Url_(Io_url v) {url = v; return this;} private Io_url url;
public long Len() {return len;} public Io_stream_rdr Len_(long v) {len = v; return this;} private long len;

View File

@@ -18,16 +18,16 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
package gplx.core.ios; import gplx.*; import gplx.core.*;
import gplx.core.ios.streams.*;
public class Io_stream_zip_mgr {
private Io_stream_wtr wtr_gzip, wtr_zip, wtr_bzip2;
private Io_stream_wtr wtr__gzip, wtr__zip, wtr__bzip2, wtr__xz;
public byte[] Zip(byte type, byte[] val) {
if (type == Io_stream_.Tid_raw) return val;
if (type == Io_stream_tid_.Tid__raw) return val;
Io_stream_wtr wtr = Wtr(type);
wtr.Write(val, 0, val.length);
wtr.Flush();
return wtr.To_ary_and_clear();
}
public byte[] Unzip(byte type, byte[] val) {
if (type == Io_stream_.Tid_raw) return val;
if (type == Io_stream_tid_.Tid__raw) return val;
Io_stream_rdr rdr = Rdr(type);
rdr.Open_mem(val);
return Io_stream_rdr_.Load_all_as_bry(Bry_bfr_.New(), rdr);
@@ -35,20 +35,22 @@ public class Io_stream_zip_mgr {
private Io_stream_wtr Wtr(byte type) {
Bry_bfr bfr = Bry_bfr_.New();
switch (type) {
case Io_stream_.Tid_gzip : if (wtr_gzip == null) wtr_gzip = Io_stream_wtr_.new_by_mem(bfr, Io_stream_.Tid_gzip) ; return wtr_gzip.Open();
case Io_stream_.Tid_zip : if (wtr_zip == null) wtr_zip = Io_stream_wtr_.new_by_mem(bfr, Io_stream_.Tid_zip) ; return wtr_zip.Open();
case Io_stream_.Tid_bzip2 : if (wtr_bzip2 == null) wtr_bzip2 = Io_stream_wtr_.new_by_mem(bfr, Io_stream_.Tid_bzip2) ; return wtr_bzip2.Open();
case Io_stream_.Tid_raw :
default : throw Err_.new_unhandled(type);
case Io_stream_tid_.Tid__gzip: if (wtr__gzip == null) wtr__gzip = Io_stream_wtr_.New_by_mem(bfr, Io_stream_tid_.Tid__gzip); return wtr__gzip.Open();
case Io_stream_tid_.Tid__zip: if (wtr__zip == null) wtr__zip = Io_stream_wtr_.New_by_mem(bfr, Io_stream_tid_.Tid__zip); return wtr__zip.Open();
case Io_stream_tid_.Tid__bzip2: if (wtr__bzip2 == null) wtr__bzip2 = Io_stream_wtr_.New_by_mem(bfr, Io_stream_tid_.Tid__bzip2); return wtr__bzip2.Open();
case Io_stream_tid_.Tid__xz: if (wtr__xz == null) wtr__xz = Io_stream_wtr_.New_by_mem(bfr, Io_stream_tid_.Tid__xz); return wtr__xz.Open();
case Io_stream_tid_.Tid__raw:
default: throw Err_.new_unhandled(type);
}
}
private Io_stream_rdr Rdr(byte type) { // TS.MEM: DATE:2016-07-12
switch (type) {
case Io_stream_.Tid_gzip : return Io_stream_rdr_.new_by_tid_(Io_stream_.Tid_gzip);
case Io_stream_.Tid_zip : return Io_stream_rdr_.new_by_tid_(Io_stream_.Tid_zip);
case Io_stream_.Tid_bzip2 : return Io_stream_rdr_.new_by_tid_(Io_stream_.Tid_bzip2);
case Io_stream_.Tid_raw :
default : throw Err_.new_unhandled(type);
case Io_stream_tid_.Tid__gzip: return Io_stream_rdr_.New_by_tid(Io_stream_tid_.Tid__gzip);
case Io_stream_tid_.Tid__zip: return Io_stream_rdr_.New_by_tid(Io_stream_tid_.Tid__zip);
case Io_stream_tid_.Tid__bzip2: return Io_stream_rdr_.New_by_tid(Io_stream_tid_.Tid__bzip2);
case Io_stream_tid_.Tid__xz: return Io_stream_rdr_.New_by_tid(Io_stream_tid_.Tid__xz);
case Io_stream_tid_.Tid__raw:
default: throw Err_.new_unhandled(type);
}
}
}

View File

@@ -18,22 +18,23 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
package gplx.core.net; import gplx.*; import gplx.core.*;
import gplx.core.net.qargs.*;
public class Gfo_url {
public byte[] Raw() {return raw;} private byte[] raw;
public byte Protocol_tid() {return protocol_tid;} private byte protocol_tid;
public byte[] Protocol_bry() {return protocol_bry;} private byte[] protocol_bry;
public byte[] Anch() {return anch;} private byte[] anch;
public Gfo_qarg_itm[] Qargs() {return qargs;} private Gfo_qarg_itm[] qargs;
public byte[][] Segs() {return segs;} private byte[][] segs; private int segs__len;
public byte[] Segs__get_at(int i) {return i < segs__len ? segs[i] : null;}
public byte[] Segs__get_at_1st() {return segs__len > 0 ? segs[0] : null;}
public byte[] Segs__get_at_nth() {return segs__len > 1 ? segs[segs__len - 1] : null;}
public Gfo_url Ctor(byte[] raw, byte protocol_tid, byte[] protocol_bry, byte[][] segs, Gfo_qarg_itm[] qargs, byte[] anch) {
private final int segs__len;
public Gfo_url(byte[] raw, byte protocol_tid, byte[] protocol_bry, byte[][] segs, Gfo_qarg_itm[] qargs, byte[] anch) {
this.raw = raw;
this.protocol_tid = protocol_tid; this.protocol_bry = protocol_bry;
this.segs = segs; this.segs__len = segs.length;
this.qargs = qargs;
this.anch = anch;
return this;
}
public static final Gfo_url Empty = new Gfo_url().Ctor(Bry_.Empty, Gfo_protocol_itm.Tid_unknown, Bry_.Empty, Bry_.Ary_empty, null, null);
public byte[] Raw() {return raw;} private final byte[] raw;
public byte Protocol_tid() {return protocol_tid;} private final byte protocol_tid;
public byte[] Protocol_bry() {return protocol_bry;} private final byte[] protocol_bry;
public byte[] Anch() {return anch;} private final byte[] anch;
public Gfo_qarg_itm[] Qargs() {return qargs;} private final Gfo_qarg_itm[] qargs;
public byte[][] Segs() {return segs;} private final byte[][] segs;
public byte[] Segs__get_at(int i) {return i < segs__len ? segs[i] : null;}
public byte[] Segs__get_at_1st() {return segs__len > 0 ? segs[0] : null;}
public byte[] Segs__get_at_nth() {return segs__len > 1 ? segs[segs__len - 1] : null;}
public static final Gfo_url Empty = new Gfo_url(Bry_.Empty, Gfo_protocol_itm.Tid_unknown, Bry_.Empty, Bry_.Ary_empty, null, null);
}

View File

@@ -16,20 +16,21 @@ You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.core.net; import gplx.*; import gplx.core.*;
import gplx.core.primitives.*; import gplx.core.btries.*;
import gplx.core.btries.*;
import gplx.core.net.qargs.*;
import gplx.langs.htmls.encoders.*;
public class Gfo_url_parser {
private final Btrie_slim_mgr protocols = Btrie_slim_mgr.ci_a7(); // ASCII:url_protocol; EX:"http:", "ftp:", etc
private final Bry_ary segs_ary = new Bry_ary(4), qargs = new Bry_ary(4);
private final Bry_bfr tmp_bfr = Bry_bfr_.Reset(500);
private final Btrie_rv trv = new Btrie_rv();
public byte[] Relative_url_protocol_bry() {return Gfo_protocol_itm.Itm_https.Key_w_colon_bry();} // NOTE: https b/c any WMF wiki will now default to WMF; DATE:2015-07-26
private final List_adp segs_list = List_adp_.New(), qargs_list = List_adp_.New();
private final Bry_bfr tmp_bfr = Bry_bfr_.Reset(500);
public Gfo_url_parser() {
Init_protocols(Gfo_protocol_itm.Ary());
Init_protocol_itm(Gfo_protocol_itm.Bry_relative, Gfo_protocol_itm.Tid_relative_1);
Init_protocol_itm(Gfo_protocol_itm.Bry_file, Gfo_protocol_itm.Tid_file);
Init_protocol_itm(gplx.xowa.parsers.lnkes.Xop_lnke_wkr.Bry_xowa_protocol, Gfo_protocol_itm.Tid_xowa);
}
public byte[] Relative_url_protocol_bry() {return Gfo_protocol_itm.Itm_https.Key_w_colon_bry();} // NOTE: https b/c any WMF wiki will now default to WMF; DATE:2015-07-26
private void Init_protocols(Gfo_protocol_itm... itms) {
int len = itms.length;
for (int i = 0; i < len; i++) {
@@ -37,9 +38,7 @@ public class Gfo_url_parser {
Init_protocol_itm(itm.Key_w_colon_bry(), itm.Tid());
}
}
public void Init_protocol_itm(byte[] key, byte protocol_tid) {
protocols.Add_bry_byte(key, protocol_tid);
}
public void Init_protocol_itm(byte[] key, byte protocol_tid) {protocols.Add_bry_byte(key, protocol_tid);}
public void Parse_site_fast(Gfo_url_site_data site_data, byte[] src, int src_bgn, int src_end) {
int pos = src_bgn; boolean rel = false;
if (pos + 1 < src_end && src[pos] == Byte_ascii.Slash && src[pos + 1] == Byte_ascii.Slash) { // starts with "//"
@@ -62,199 +61,126 @@ public class Gfo_url_parser {
slash_pos = Bry_.Trim_end_pos(src, slash_pos);
site_data.Atrs_set(rel, pos, slash_pos);
}
private static final int Area__path = 1, Area__qarg_key_1st = 2, Area__qarg_key_nth = 3, Area__qarg_val = 4, Area__anch = 5;
private byte[] src; int src_bgn, src_end;
private int area;
private boolean encoded;
private byte protocol_tid; private byte[] protocol_bry, anch;
private int path_bgn, qarg_key_bgn, qarg_val_bgn, anch_bgn, anch_nth_bgn;
public Gfo_url Parse(byte[] src) {return Parse(new Gfo_url(), src, 0, src.length);}
public Gfo_url Parse(Gfo_url rv, byte[] src, int src_bgn, int src_end) {
this.src = src; this.src_bgn = src_bgn; this.src_end = src_end;
encoded = false;
protocol_tid = Gfo_protocol_itm.Tid_null;
protocol_bry = anch = null;
path_bgn = qarg_key_bgn = qarg_val_bgn = anch_bgn = anch_nth_bgn = -1;
segs_ary.Clear(); qargs.Clear();
int pos = src_bgn;
Object protocol_obj = protocols.Match_at(trv, src, src_bgn, src_end);
pos = trv.Pos();
pos = Bry_find_.Find_fwd_while(src, pos, src_end, Byte_ascii.Slash);
if (protocol_obj == null) {
this.protocol_tid = Gfo_protocol_itm.Tid_unknown;
}
else {
this.protocol_tid = ((Byte_obj_val)protocol_obj).Val();
this.protocol_bry = Make_bry(src_bgn, pos);
}
area = Area__path;
path_bgn = pos;
while (true) {
if (pos == src_end) break;
byte b = src[pos];
public Gfo_url Parse(byte[] src) {return Parse(src, 0, src.length);}
public Gfo_url Parse(byte[] src, int src_bgn, int src_end) {
// protocol
byte protocol_tid = protocols.Match_byte_or(trv, src, src_bgn, src_end, Gfo_protocol_itm.Tid_unknown);
int pos = Bry_find_.Find_fwd_while(src, trv.Pos(), src_end, Byte_ascii.Slash); // set pos after last slash; EX: "https://A" -> position before "A"
byte[] protocol_bry = protocol_tid == Gfo_protocol_itm.Tid_unknown
? null
: Make_bry(false, src, src_bgn, pos);
// loop chars and handle "/", "#", "?", and "%"
boolean encoded = false;
int src_zth = src_end - 1;
int anch_bgn = -1, qarg_bgn = -1, seg_bgn = pos;
for (int i = pos; i < src_end; ++i) {
byte b = src[i];
switch (b) {
case Byte_ascii.Slash: pos = Parse_slash(pos, b); break;
case Byte_ascii.Question: pos = Parse_qarg_key_1st(pos, b); break;
case Byte_ascii.Amp: pos = Parse_qarg_key_nth(pos, b); break;
case Byte_ascii.Eq: pos = Parse_qarg_val(pos, b); break;
case Byte_ascii.Hash: if (anch_bgn == -1) pos = Parse_anch(pos, b); else ++pos; break; // anchor begins at 1st #, not last #; EX:A#B#C has anchor of "B#C" not "C" PAGE:en.w:Grand_Central_Terminal; DATE:2015-12-31
case Byte_ascii.Percent: encoded = true; ++pos; break;
default:
++pos;
case Byte_ascii.Slash:
if (qarg_bgn == -1) { // ignore slash in qargs
segs_list.Add(Make_bry(encoded, src, seg_bgn, i));
encoded = false;
seg_bgn = i + 1; // +1 to skip "/"
}
break;
case Byte_ascii.Hash: // set qarg to first #; also, ignore rest of String; EX: A#B#C -> B#C
if (i == src_zth) continue; // ignore # at EOS; EX: "A#"
anch_bgn = i;
i = src_end;
break;
case Byte_ascii.Question: // set qarg to last "?"; EX: A?B?C -> C
if (i == src_zth) continue; // ignore ? at EOS; EX: "A?"
qarg_bgn = i;
break;
case Byte_ascii.Percent:
encoded = true;
break;
}
}
End_area(pos, Byte_ascii.Null);
rv.Ctor(src, protocol_tid, protocol_bry, segs_ary.To_ary(0), Make_qargs(), anch);
return rv;
}
private int Parse_slash(int pos, byte b) {
switch (area) {
case Area__path: return End_area(pos, b);
default: return pos + 1;
int seg_end = src_end; // set seg_end to src_end; EX: "https://site/A" -> "A"; seg_end may be overriden if "#" or "?" exists
// set anch
byte[] anch = null;
if (anch_bgn != -1) {
seg_end = anch_bgn; // set seg_end to anch_bgn; EX: "https://site/A#B" -> "A" x> "A#B"
anch = Make_bry(encoded, src, anch_bgn + 1, src_end); // +1 to skip "#"
}
}
private int Parse_anch(int pos, byte b) {
switch (area) {
case Area__path:
End_area(pos, b);
area = Area__anch;
anch_bgn = pos + 1;
break;
case Area__anch: // handle double; A#B#C -> "A#B", "C"
Append_to_last_path(Byte_ascii.Hash, Make_bry(anch_bgn, pos));
anch_bgn = pos + 1;
break;
case Area__qarg_val:
case Area__qarg_key_1st:
case Area__qarg_key_nth:
if (anch_nth_bgn == -1)
anch_nth_bgn = Bry_find_.Find_bwd(src, Byte_ascii.Hash, src_end);
if (pos == anch_nth_bgn) {
End_area(pos, b);
area = Area__anch;
anch_bgn = pos + 1;
}
break;
default:
break;
// set qargs
Gfo_qarg_itm[] qarg_ary = Gfo_qarg_itm.Ary_empty;
if (qarg_bgn != -1) {
int qarg_end = anch_bgn == -1
? src_end // # missing; set to src_end; EX: "A?B=C" -> EOS
: anch_bgn; // # exists; set to anch_bgn; EX: "A?B=C#D" -> #
qarg_ary = Make_qarg_ary(src, qarg_bgn, qarg_end);
seg_end = qarg_ary.length == 0
? src_end // set seg_end to src_end if pseudo qarg; EX: "https://site/A?B" -> "A?B" x> "A"
: qarg_bgn; // set seg_end to qarg_bgn; EX: "https://site/A?B=C" -> "A" x> "A#B"; NOTE: overrides anch; "A?B=C#D" -> "A"
}
return pos + 1;
// extract seg_end; note that there will always be a seg_end; if src ends with slash, then it will be ""; EX: "A/" -> "A", ""
segs_list.Add(Make_bry(encoded, src, seg_bgn, seg_end));
// build url and return it
return new Gfo_url(src, protocol_tid, protocol_bry, (byte[][])segs_list.To_ary_and_clear(byte[].class), qarg_ary, anch);
}
private int Parse_qarg_key_1st(int pos, byte b) {
switch (area) {
case Area__path: // only valid way to start qarg; EX: A?B=C
End_area(pos, b);
area = Area__qarg_key_1st;
qarg_key_bgn = pos + 1;
break;
case Area__qarg_key_1st: // handle dupe; EX: A?B?C
case Area__qarg_key_nth: // handle dupe; EX: A?B=C&D?
case Area__qarg_val: // handle dupe; EX: A?B=?
End_area(pos, b);
Append_to_last_path__qargs();
area = Area__qarg_key_1st;
qarg_key_bgn = pos + 1;
break;
private Gfo_qarg_itm[] Make_qarg_ary(byte[] src, int qarg_bgn, int qarg_end) {
// init
int key_bgn = qarg_bgn + 1; // +1 to skip "?"
byte[] key_bry = null;
int val_bgn = -1;
boolean encoded = false;
// loop qarg for "&", "=", "%"
int qarg_pos = qarg_bgn;
while (true) {
boolean b_is_last = qarg_pos == qarg_end;
byte b = b_is_last ? Byte_ascii.Null : src[qarg_pos];
boolean make_qarg = false;
switch (b) {
case Byte_ascii.Amp: // "&" always makes qarg
make_qarg = true;
break;
case Byte_ascii.Null: // "EOS" makes qarg as long as "=" seen or at least one qarg; specifically, "A?B" shouldn't make qarg
if ( val_bgn != -1 // "=" seen; EX: "?A=B"
|| qargs_list.Count() > 0) // at least one qarg exists; EX: "?A=B&C"
make_qarg = true;
break;
case Byte_ascii.Eq:
key_bry = Make_bry(encoded, src, key_bgn, qarg_pos);
encoded = false;
val_bgn = qarg_pos + 1;
break;
case Byte_ascii.Percent:
encoded = true;
break;
}
// make qarg
if (make_qarg) {
byte[] val_bry = null;
if (key_bry == null) // key missing; EX: "&A" -> "A,null"
key_bry = Make_bry(encoded, src, key_bgn, qarg_pos);
else // key exists; EX: "&A=B" -> "A,B"
val_bry = Make_bry(encoded, src, val_bgn, qarg_pos);
encoded = false;
qargs_list.Add(new Gfo_qarg_itm(key_bry, val_bry));
// reset vars
key_bry = null;
key_bgn = qarg_pos + 1;
val_bgn = -1;
}
if (b_is_last) break;
++qarg_pos;
}
return pos + 1;
}
private int Parse_qarg_key_nth(int pos, byte b) {
switch (area) {
case Area__path: // ignore if qarg not started; EX: A&B
break;
case Area__qarg_key_1st: // handle invalid; A?B&C
case Area__qarg_key_nth: // handle invalid; A?B=C&D&E=F
End_area(pos, b);
qargs.Add(null);
area = Area__qarg_key_nth;
qarg_key_bgn = pos + 1;
break;
case Area__qarg_val:
End_area(pos, b);
area = Area__qarg_key_nth;
qarg_key_bgn = pos + 1;
break;
}
return pos + 1;
}
private int Parse_qarg_val(int pos, byte b) {
switch (area) {
case Area__qarg_key_1st:
case Area__qarg_key_nth:
End_area(pos, b); break;
default: break;
}
return pos + 1;
}
private int End_area(int pos, byte b) {
switch (area) {
case Area__path:
segs_ary.Add(Make_bry(path_bgn, pos));
path_bgn = pos + 1;
break;
case Area__qarg_key_1st:
case Area__qarg_key_nth:
if (b == Byte_ascii.Null && qargs.Len() == 0) // handle A?b but not A?b=c&d
Append_to_last_path(Byte_ascii.Question, Make_bry(qarg_key_bgn, src_end));
else {
qargs.Add(Make_bry(qarg_key_bgn, pos));
qarg_val_bgn = pos + 1;
area = Area__qarg_val;
}
break;
case Area__qarg_val:
qargs.Add(Make_bry(qarg_val_bgn, pos));
qarg_key_bgn = pos + 1;
qarg_val_bgn = -1;
area = Area__qarg_key_nth;
break;
case Area__anch:
if (b == Byte_ascii.Null && anch_bgn == src_end) // handle A# but not "A#B"
Append_to_last_path(Byte_ascii.Hash, Make_bry(anch_bgn, src_end));
else
anch = Make_bry(anch_bgn, pos);
break;
default:
break;
}
encoded = false;
return pos + 1;
}
private byte[] Make_bry(int bgn, int end) {
return encoded ? gplx.langs.htmls.encoders.Gfo_url_encoder_.Xourl.Decode(tmp_bfr, Bool_.N, src, bgn, end).To_bry_and_clear() : Bry_.Mid(src, bgn, end);
}
private Gfo_qarg_itm[] Make_qargs() {
int qargs_len = qargs.Len(); if (qargs_len == 0) return Gfo_qarg_itm.Ary_empty;
if (qargs_len % 2 == 1) ++qargs_len; // handle odd qargs; EX: ?A=B&C&D=E
Gfo_qarg_itm[] rv = new Gfo_qarg_itm[qargs_len / 2];
for (int i = 0; i < qargs_len; i += 2) {
byte[] key = qargs.Get_at(i);
int val_idx = i + 1;
byte[] val = val_idx < qargs_len ? qargs.Get_at(val_idx) : null;
rv[i / 2] = new Gfo_qarg_itm(key, val);
}
return rv;
}
private void Append_to_last_path(byte b, byte[] append) {
byte[] last_path = segs_ary.Get_at_last(); if (last_path == null) return;
last_path = Bry_.Add_w_dlm(b, last_path, append);
segs_ary.Set_at_last(last_path);
}
private void Append_to_last_path__qargs() {
byte[] last_path = segs_ary.Get_at_last(); if (last_path == null) return;
tmp_bfr.Add(last_path);
int len = qargs.Len();
if (len % 2 == 1) qargs.Add(null); // handle odd qargs
for (int i = 0; i < len; i += 2) {
tmp_bfr.Add_byte(i == 0 ? Byte_ascii.Question : Byte_ascii.Amp);
tmp_bfr.Add(qargs.Get_at(i));
byte[] qarg_val = qargs.Get_at(i + 1);
if (qarg_val != null) // handle "null" added above
tmp_bfr.Add_byte_eq().Add(qarg_val);
}
qargs.Clear();
segs_ary.Set_at_last(tmp_bfr.To_bry_and_clear());
return (Gfo_qarg_itm[])qargs_list.To_ary_and_clear(Gfo_qarg_itm.class);
}
/**
 * Extracts src[bgn..end) as a new byte array, url-decoding it when the caller flagged the span as encoded.
 * Callers set encoded after seeing a "%" while scanning the span.
 */
private byte[] Make_bry(boolean encoded, byte[] src, int bgn, int end) {
	// no "%" seen: take the span as-is
	if (!encoded) return Bry_.Mid(src, bgn, end);
	// "%" seen: decode through the shared tmp_bfr, which is cleared by To_bry_and_clear
	return Gfo_url_encoder_.Xourl.Decode(tmp_bfr, Bool_.N, src, bgn, end).To_bry_and_clear();
}
public static final byte[] Bry_double_slash = new byte[] {Byte_ascii.Slash, Byte_ascii.Slash};
}

View File

@@ -20,15 +20,19 @@ import gplx.core.net.qargs.*;
class Gfo_url_parser_fxt {
private final Gfo_url_parser parser = new Gfo_url_parser();
private Gfo_url actl;
public Gfo_url_parser_fxt Chk_protocol_tid(byte v) {Tfds.Eq_byte(v, actl.Protocol_tid(), "protocol_tid"); return this;}
public Gfo_url_parser_fxt Chk_protocol_bry(String v) {Tfds.Eq_str(v, actl.Protocol_bry(), "protocol_bry"); return this;}
public Gfo_url_parser_fxt Chk_site(String v) {Tfds.Eq_str(v, actl.Segs__get_at_1st(), "site"); return this;}
public Gfo_url_parser_fxt Chk_page(String v) {Tfds.Eq_str(v, actl.Segs__get_at_nth(), "page"); return this;}
public Gfo_url_parser_fxt Chk_anch(String v) {Tfds.Eq_str(v, actl.Anch(), "anch"); return this;}
public Gfo_url_parser_fxt Chk_segs(String... ary) {Tfds.Eq_int(ary.length, actl.Segs().length, "segs_len"); Tfds.Eq_str_lines(String_.Concat_lines_nl(ary), String_.Concat_lines_nl(String_.Ary(actl.Segs())), "segs"); return this;}
public Gfo_url_parser_fxt Chk_qargs(String... ary) {Tfds.Eq_str_lines(String_.To_str__as_kv_ary(ary), Gfo_qarg_itm.To_str(actl.Qargs()), "qargs"); return this;}
public Gfo_url_parser_fxt Run_parse(String v) {
this.actl = parser.Parse(Bry_.new_u8(v));
public Gfo_url_parser_fxt Test__protocol_tid(byte v) {Tfds.Eq_byte(v, actl.Protocol_tid(), "protocol_tid"); return this;}
public Gfo_url_parser_fxt Test__protocol_bry(String v) {Tfds.Eq_str(v, actl.Protocol_bry(), "protocol_bry"); return this;}
public Gfo_url_parser_fxt Test__site(String v) {Tfds.Eq_str(v, actl.Segs__get_at_1st(), "site"); return this;}
public Gfo_url_parser_fxt Test__page(String v) {Tfds.Eq_str(v, actl.Segs__get_at_nth(), "page"); return this;}
public Gfo_url_parser_fxt Test__anch(String v) {Tfds.Eq_str(v, actl.Anch(), "anch"); return this;}
/**
 * Asserts that the parsed url's segments match ary: first the contents (compared as newline-joined text), then the count.
 * @return this fixture, for chaining
 */
public Gfo_url_parser_fxt Test__segs(String... ary) {
	byte[][] actl_segs = actl.Segs();
	String expd_lines = String_.Concat_lines_nl(ary);
	String actl_lines = String_.Concat_lines_nl(String_.Ary(actl_segs));
	// compare contents before count so a failure shows the differing segments
	Tfds.Eq_str_lines(expd_lines, actl_lines, "segs");
	Tfds.Eq_int(ary.length, actl_segs.length, "segs_len");
	return this;
}
public Gfo_url_parser_fxt Test__qargs(String... ary) {Tfds.Eq_str_lines(String_.To_str__as_kv_ary(ary), Qargs__To_str(actl.Qargs()), "qargs"); return this;}
/**
 * Parses v with the 3-arg Parse overload and stores the result for the Test__ assertions.
 * @param v  raw url text
 * @return   this fixture, for chaining
 */
public Gfo_url_parser_fxt Exec__parse(String v) {
	// Parse works on byte offsets, so the end offset must be the length of the byte array itself;
	// NOTE(review): String_.Len(v) presumably counts chars, which would fall short of the byte count for non-ASCII urls - confirm
	byte[] src = Bry_.new_u8(v);
	this.actl = parser.Parse(src, 0, src.length);
	return this;
}
public void Test_Parse_site_fast(String raw, String expd) {
@@ -37,4 +41,16 @@ class Gfo_url_parser_fxt {
String actl = String_.new_u8(raw_bry, site_data.Site_bgn(), site_data.Site_end());
Tfds.Eq(expd, actl);
} private final Gfo_url_site_data site_data = new Gfo_url_site_data();
/**
 * Renders qargs as text for comparison in tests: one "key=val" line per item, with nothing after "=" when the value is null.
 */
private static String Qargs__To_str(Gfo_qarg_itm[] ary) {
	Bry_bfr bfr = Bry_bfr_.New();
	for (Gfo_qarg_itm itm : ary) {
		bfr.Add(itm.Key_bry()).Add_byte_eq();
		byte[] val = itm.Val_bry();
		if (val != null)	// null val: leave the right side of "=" empty
			bfr.Add(val);
		bfr.Add_byte_nl();
	}
	return bfr.To_str_and_clear();
}
}

View File

@@ -18,102 +18,102 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
package gplx.core.net; import gplx.*; import gplx.core.*;
import org.junit.*;
public class Gfo_url_parser_tst {
private final Gfo_url_parser_fxt tstr = new Gfo_url_parser_fxt();
private final Gfo_url_parser_fxt tstr = new Gfo_url_parser_fxt();
@Test public void Protocol__relative() {
tstr.Run_parse("//en.wikipedia.org").Chk_protocol_tid(Gfo_protocol_itm.Tid_relative_1).Chk_protocol_bry("//").Chk_site("en.wikipedia.org");
tstr.Exec__parse("//en.wikipedia.org").Test__protocol_tid(Gfo_protocol_itm.Tid_relative_1).Test__protocol_bry("//").Test__site("en.wikipedia.org");
}
@Test public void Protocol__none() {
tstr.Run_parse("en.wikipedia.org/wiki/A").Chk_protocol_tid(Gfo_protocol_itm.Tid_unknown).Chk_segs("en.wikipedia.org", "wiki", "A");
tstr.Exec__parse("en.wikipedia.org/wiki/A").Test__protocol_tid(Gfo_protocol_itm.Tid_unknown).Test__segs("en.wikipedia.org", "wiki", "A");
}
@Test public void Site__parts__3() {
tstr.Run_parse("https://en.wikipedia.org").Chk_protocol_tid(Gfo_protocol_itm.Tid_https).Chk_protocol_bry("https://").Chk_segs("en.wikipedia.org");
tstr.Exec__parse("https://en.wikipedia.org").Test__protocol_tid(Gfo_protocol_itm.Tid_https).Test__protocol_bry("https://").Test__segs("en.wikipedia.org");
}
@Test public void Site__parts__2() {
tstr.Run_parse("https://wikipedia.org").Chk_protocol_tid(Gfo_protocol_itm.Tid_https).Chk_segs("wikipedia.org");
tstr.Exec__parse("https://wikipedia.org").Test__protocol_tid(Gfo_protocol_itm.Tid_https).Test__segs("wikipedia.org");
}
@Test public void Site__parts__1() {
tstr.Run_parse("https://wikipedia").Chk_protocol_tid(Gfo_protocol_itm.Tid_https).Chk_segs("wikipedia");
tstr.Exec__parse("https://wikipedia").Test__protocol_tid(Gfo_protocol_itm.Tid_https).Test__segs("wikipedia");
}
@Test public void Site__slash__none() {
tstr.Run_parse("https:site").Chk_protocol_tid(Gfo_protocol_itm.Tid_https).Chk_site("site");
tstr.Exec__parse("https:site").Test__protocol_tid(Gfo_protocol_itm.Tid_https).Test__site("site");
}
@Test public void Site__slash__eos() {
tstr.Exec__parse("https://en.wikipedia.org/").Test__protocol_tid(Gfo_protocol_itm.Tid_https).Test__site("en.wikipedia.org");
}
@Test public void Paths__1() {
tstr.Run_parse("https://site/A").Chk_segs("site", "A");
tstr.Exec__parse("https://site/A").Test__segs("site", "A");
}
@Test public void Paths__2() {
tstr.Run_parse("https://site/wiki/A").Chk_segs("site", "wiki", "A");
tstr.Exec__parse("https://site/wiki/A").Test__segs("site", "wiki", "A");
}
@Test public void Paths__n() {
tstr.Run_parse("https://site/wiki/A/B/C/D").Chk_segs("site", "wiki", "A", "B", "C", "D");
tstr.Exec__parse("https://site/wiki/A/B/C/D").Test__segs("site", "wiki", "A", "B", "C", "D");
}
@Test public void Qargs__1() {
tstr.Run_parse("https://site/A?B=C").Chk_page("A").Chk_qargs("B", "C");
tstr.Exec__parse("https://site/A?B=C").Test__page("A").Test__qargs("B", "C");
}
@Test public void Qargs__2() {
tstr.Run_parse("https://site/A?B=C&D=E").Chk_page("A").Chk_qargs("B", "C", "D", "E");
tstr.Exec__parse("https://site/A?B=C&D=E").Test__page("A").Test__qargs("B", "C", "D", "E");
}
@Test public void Qargs__3() {
tstr.Run_parse("https://site/A?B=C&D=E&F=G").Chk_page("A").Chk_qargs("B", "C", "D", "E", "F", "G");
tstr.Exec__parse("https://site/A?B=C&D=E&F=G").Test__page("A").Test__qargs("B", "C", "D", "E", "F", "G");
}
@Test public void Qargs__ques__dupe__ques() {
tstr.Run_parse("https://site/A?B?Y=Z").Chk_page("A?B").Chk_qargs("Y", "Z");
tstr.Exec__parse("https://site/A?B?Y=Z").Test__page("A?B").Test__qargs("Y", "Z");
}
@Test public void Qargs__ques__dupe__amp() {
tstr.Run_parse("https://site/A?B=C&D?Y=Z").Chk_page("A?B=C&D").Chk_qargs("Y", "Z");
tstr.Exec__parse("https://site/A?B=C&D?Y=Z").Test__page("A?B=C&D").Test__qargs("Y", "Z");
}
@Test public void Qargs__ques__dupe__eq() {
tstr.Run_parse("https://site/A?B=C?Y=Z").Chk_page("A?B=C").Chk_qargs("Y", "Z");
tstr.Exec__parse("https://site/A?B=C?Y=Z").Test__page("A?B=C").Test__qargs("Y", "Z");
}
@Test public void Qargs__amp__dupe__ques() {
tstr.Run_parse("https://site/A?B&Y=Z").Chk_page("A").Chk_qargs("B", null, "Y", "Z");
tstr.Exec__parse("https://site/A?B&Y=Z").Test__page("A").Test__qargs("B", null, "Y", "Z");
}
// Middle qarg "D" has no "=": expected qargs are B=C, D with a null val, and Y=Z; the page is "A".
@Test public void Qargs__amp__dupe__amp() {
tstr.Run_parse("https://site/A?B=C&D&Y=Z").Chk_page("A").Chk_qargs("B", "C", "D", null, "Y", "Z");
tstr.Exec__parse("https://site/A?B=C&D&Y=Z").Test__page("A").Test__qargs("B", "C", "D", null, "Y", "Z");
}
// Trailing "?" with nothing after it: the "?" is expected to remain part of the page ("A?") and no qargs are expected.
@Test public void Qargs__missing_val__0() {
tstr.Run_parse("https://site/A?").Chk_page("A?").Chk_qargs();
tstr.Exec__parse("https://site/A?").Test__page("A?").Test__qargs();
}
// Second of three qargs has no "=val": expected qargs are B=C, D with a null val, and F=G; the page is "A".
@Test public void Qargs__missing_val__2() {
tstr.Run_parse("https://site/A?B=C&D&F=G").Chk_page("A").Chk_qargs("B", "C", "D", null, "F", "G");
tstr.Exec__parse("https://site/A?B=C&D&F=G").Test__page("A").Test__qargs("B", "C", "D", null, "F", "G");
}
// Last qarg has no "=val": expected qargs are B=C, D=E, and F with a null val; the page is "A".
@Test public void Qargs__missing_val__n() {
tstr.Run_parse("https://site/A?B=C&D=E&F").Chk_page("A").Chk_qargs("B", "C", "D", "E", "F", null);
tstr.Exec__parse("https://site/A?B=C&D=E&F").Test__page("A").Test__qargs("B", "C", "D", "E", "F", null);
}
// Input without protocol or "/" and without "=" after the "?": the whole text "A?B" is expected as a single seg, with no qargs.
@Test public void Qargs__site_less__missing__0() {
tstr.Run_parse("A?B").Chk_segs("A?B").Chk_qargs();
tstr.Exec__parse("A?B").Test__segs("A?B").Test__qargs();
}
// Input without protocol or "/": "A" is expected as the site and B=C, D=E as the qargs.
@Test public void Qargs__site_less() {
tstr.Run_parse("A?B=C&D=E").Chk_site("A").Chk_qargs("B", "C", "D", "E");
tstr.Exec__parse("A?B=C&D=E").Test__site("A").Test__qargs("B", "C", "D", "E");
}
// Single "#": expected page is "A" and expected anchor is "B".
// Checked through both Run_parse/Chk_* and Exec__parse/Test__*.
@Test public void Anch__basic() {
tstr.Run_parse("https://site/A#B").Chk_page("A").Chk_anch("B");
tstr.Exec__parse("https://site/A#B").Test__page("A").Test__anch("B");
}
// Two "#": the anchor is expected to start after the first "#" and to keep the later one ("B#C"); the page is "A".
@Test public void Anch__repeat__2() {
tstr.Run_parse("https://site/A#B#C").Chk_page("A").Chk_anch("B#C");
tstr.Exec__parse("https://site/A#B#C").Test__page("A").Test__anch("B#C");
}
// Three "#": the anchor is expected to be everything after the first "#" ("B#C#D"); the page is "A".
@Test public void Anch__repeat__3() {
tstr.Run_parse("https://site/A#B#C#D").Chk_page("A").Chk_anch("B#C#D");
tstr.Exec__parse("https://site/A#B#C#D").Test__page("A").Test__anch("B#C#D");
}
// Trailing "#" with nothing after it: the "#" is expected to remain part of the page ("A#") and the anchor is expected to be null.
@Test public void Anch__missing() {
tstr.Run_parse("https://site/A#").Chk_page("A#").Chk_anch(null);
tstr.Exec__parse("https://site/A#").Test__page("A#").Test__anch(null);
}
// Second "#" at end of string: the anchor is expected to be "B#" (trailing "#" kept) and the page "A".
@Test public void Anch__missing__eos() {
tstr.Run_parse("https://site/A#B#").Chk_page("A").Chk_anch("B#");
tstr.Exec__parse("https://site/A#B#").Test__page("A").Test__anch("B#");
}
// Query args followed by an anchor: expected page is "A", expected qargs are B=C and D=E, expected anchor is "F".
// Checked through both Run_parse/Chk_* and Exec__parse/Test__*, like the other tests in this class.
// The former Anch__qargs__repeat case ("https://site/A?B=C#&D=E#F" expecting qarg val "C#" and anchor "F") is
// intentionally not asserted here: the anchor starts at the first "#", not the last, so that expectation was wrong.
@Test public void Anch__qargs__basic() {
tstr.Run_parse("https://site/A?B=C&D=E#F").Chk_page("A").Chk_qargs("B", "C", "D", "E").Chk_anch("F");
tstr.Exec__parse("https://site/A?B=C&D=E#F").Test__page("A").Test__qargs("B", "C", "D", "E").Test__anch("F");
}
// Input without protocol or "/": "A" is expected as the site and "B" as the anchor.
@Test public void Anch__site_less() {
tstr.Run_parse("A#B").Chk_site("A").Chk_anch("B");
tstr.Exec__parse("A#B").Test__site("A").Test__anch("B");
}
// Percent-encoded page: "%27" in the url is expected to come back as an apostrophe in the page ("A's"); the site is "site".
@Test public void Encode__page() {
tstr.Run_parse("http://site/A%27s").Chk_site("site").Chk_page("A's");
tstr.Exec__parse("http://site/A%27s").Test__site("site").Test__page("A's");
}
// Input with no protocol: expected segs are "Special:Search" and "Earth", expected page is "Earth",
// and the text after "?" is expected as the qarg fulltext=yes.
@Test public void Protocol_less__qargs() {
tstr.Run_parse("Special:Search/Earth?fulltext=yes").Chk_segs("Special:Search", "Earth").Chk_page("Earth").Chk_qargs("fulltext", "yes");
tstr.Exec__parse("Special:Search/Earth?fulltext=yes").Test__segs("Special:Search", "Earth").Test__page("Earth").Test__qargs("fulltext", "yes");
}
// Passes several url forms (with "http:" and protocol-relative "//") to Test_Parse_site_fast; every visible case passes "a.org" as the second argument.
// NOTE(review): the second argument is presumably the expected site -- confirm against the fixture.
// NOTE(review): a diff hunk marker sits inside this body, so some cases between the first call and the "//a.org" calls are not visible here.
@Test public void Parse_site_fast() {
tstr.Test_Parse_site_fast("http://a.org/B" , "a.org");
@@ -121,4 +121,8 @@ public class Gfo_url_parser_tst {
tstr.Test_Parse_site_fast("//a.org/B" , "a.org");
tstr.Test_Parse_site_fast("//a.org/B:C" , "a.org");
}
// DELETED: logic isn't right; anch is first # not last; EX: https://en.wikipedia.org/w/index.php?title=Category:2001_albums&pagefrom=Beautiful+#View#mw-pages; DATE:2016-10-10
// @Test public void Anch__qargs__repeat() {
// tstr.Exec__parse("https://site/A?B=C#&D=E#F").Test__page("A").Test__qargs("B", "C#", "D", "E").Test__anch("F");
// }
}

View File

@@ -17,35 +17,13 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.core.net.qargs; import gplx.*; import gplx.core.*; import gplx.core.net.*;
// One url query argument (qarg): a key and a val, both held as byte arrays.
// The val may be null (To_str below skips a null val), and can be replaced after construction; the key cannot.
public class Gfo_qarg_itm {
	// Stores the given arrays as-is; no copy is made.
	public Gfo_qarg_itm(byte[] key_bry, byte[] val_bry) {
		this.key_bry = key_bry;
		this.val_bry = val_bry;
	}
	public byte[] Key_bry() {return key_bry;} private final byte[] key_bry;
	public byte[] Val_bry() {return val_bry;} private byte[] val_bry;
	// Replaces the val. Returns this so calls can be chained; callers that ignore the result see the same effect as a void setter.
	public Gfo_qarg_itm Val_bry_(byte[] v) {val_bry = v; return this;}
	// Shared zero-length array for "no qargs".
	public static final Gfo_qarg_itm[] Ary_empty = new Gfo_qarg_itm[0];
	// Builds an item for the given key with Bry_.Empty as its val.
	public static Gfo_qarg_itm new_key_(String key) {return new Gfo_qarg_itm(Bry_.new_u8(key), Bry_.Empty);}
	// Builds items from a flat "key, val, key, val, ..." list.
	// The result has kvs.length / 2 items; with an odd number of strings the trailing unpaired key is ignored.
	public static Gfo_qarg_itm[] Ary(String... kvs) {
		int len = kvs.length;
		Gfo_qarg_itm[] rv = new Gfo_qarg_itm[len / 2];
		// step over whole pairs only: kvs[i] is the key, kvs[i + 1] is its val
		for (int i = 0; i + 1 < len; i += 2)
			rv[i / 2] = new Gfo_qarg_itm(Bry_.new_u8(kvs[i]), Bry_.new_u8(kvs[i + 1]));
		return rv;
	}
	// Renders the items as text, one item per line in the form "key=val"; a null val is written as nothing after the "=".
	public static String To_str(Gfo_qarg_itm[] ary) {
		int len = ary.length;
		Bry_bfr bfr = Bry_bfr_.New();
		for (int i = 0; i < len; ++i) {
			Gfo_qarg_itm itm = ary[i];
			bfr.Add(itm.Key_bry()).Add_byte_eq();
			if (itm.Val_bry() != null)
				bfr.Add(itm.Val_bry());
			bfr.Add_byte_nl();
		}
		return bfr.To_str_and_clear();
	}
}