1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2025-06-13 12:54:14 +00:00
gnosygnu_xowa/400_xowa/src/gplx/xowa/Xoa_ttl.java

425 lines
21 KiB
Java
Raw Normal View History

2015-07-13 01:10:02 +00:00
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa; import gplx.*;
2016-07-11 03:35:32 +00:00
import gplx.core.primitives.*; import gplx.core.btries.*;
2016-08-08 01:36:50 +00:00
import gplx.langs.htmls.entitys.*;
2016-07-11 03:35:32 +00:00
import gplx.xowa.wikis.nss.*; import gplx.xowa.wikis.xwikis.*;
import gplx.xowa.parsers.amps.*; import gplx.xowa.parsers.miscs.*;
2016-09-26 02:05:47 +00:00
import gplx.xowa.apps.utls.*; import gplx.langs.htmls.encoders.*; import gplx.xowa.langs.cases.*;
2015-07-13 01:10:02 +00:00
public class Xoa_ttl { // PAGE:en.w:http://en.wikipedia.org/wiki/Help:Link; REF.MW: Ttl.php|secureAndSplit;
2016-07-11 03:35:32 +00:00
// offsets into full_txt for each part of the title; set by the instance Parse; -1 means the part was not found (page_bgn starts at 0 instead)
private int wik_bgn = -1, ns_bgn = -1, page_bgn = 0, leaf_bgn = -1, anch_bgn = -1, root_bgn = -1;
// "Name:" of the counterpart (talk-or-subject) namespace; assigned at the end of the instance Parse; prefixed to Page_txt() by Talk_txt / Subj_txt
private byte[] tors_txt;
// private ctor: within this file, instances are only created by the static Parse
private Xoa_ttl() {}
2015-07-13 01:10:02 +00:00
// namespace of the title; the parser starts from ns_mgr.Ns_main() and replaces it when the text before the 1st colon is a known namespace name
public Xow_ns Ns() {
	return ns;
}
private Xow_ns ns;

// true when the parsed text began with a colon
public boolean ForceLiteralLink() {
	return forceLiteralLink;
}
private boolean forceLiteralLink;
// NOTE: in procs below, all -1 are used to skip previous delimiters; they will only occur for end_pos arguments
// true if both titles have the same Page_db(); namespace, anchor and xwiki are not compared; false if comp is null
public boolean Eq_page_db(Xoa_ttl comp) {
	return comp != null && Bry_.Eq(this.Page_db(), comp.Page_db());
}

// true if both titles have the same Full_db() (namespace + page); anchor and xwiki are not compared; false if comp is null
public boolean Eq_full_db(Xoa_ttl comp) {
	return comp != null && Bry_.Eq(this.Full_db(), comp.Full_db());
}
// the src array that was handed to Parse (the whole array, not just the bgn / end slice)
public byte[] Raw() {
	return raw;
}
private byte[] raw = Bry_.Empty;

// xwiki prefix without its trailing colon, or empty if the title has no xwiki part
public byte[] Wik_txt() {
	if (wik_bgn == -1) return Bry_.Empty;
	int next_part_bgn = ns_bgn == -1 ? page_bgn : ns_bgn;
	return Bry_.Mid(full_txt, wik_bgn, next_part_bgn - 1); // - 1 to skip the ":" delimiter
}

// xwiki item matched by the text before the 1st colon; null if none matched
public Xow_xwiki_itm Wik_itm() {
	return wik_itm;
}
private Xow_xwiki_itm wik_itm;
2016-03-28 03:44:59 +00:00
// Full_db() with underscores converted back to spaces
public byte[] Full_txt_w_ttl_case() {
	byte[] full_db = Full_db();
	return Xoa_ttl.Replace_unders(full_db);
}
public byte[] Full_txt_by_orig() {
2015-07-13 01:10:02 +00:00
int bgn = wik_bgn == -1 ? 0 : ns_bgn == -1 ? page_bgn : ns_bgn;
int end = full_txt.length;
if (anch_bgn != -1) end = anch_bgn - 1;
return Bry_.Mid(full_txt, bgn, end);
}
// the complete normalized text built by Parse (xwiki, ns, page, anchor, qarg); returns the backing array itself, not a copy
public byte[] Full_txt_raw() {
	return full_txt;
}
private byte[] full_txt = Bry_.Empty;
2016-01-18 04:18:07 +00:00
// Returns the parsed text in db form (spaces as underscores) without the leading xwiki prefix; everything after the prefix (incl. anchor / qarg) is kept.
// If there is no xwiki, the whole text is returned. EX: given the offsets set by Parse, "fr:A" -> "A"
public byte[] Full_db_wo_xwiki() {
	// bgn must be the 1st byte AFTER the xwiki's ":"; "- 1" is only valid for end positions (it skips the preceding delimiter),
	// so using it here would make the result start with the ":" itself
	int bgn = wik_bgn == -1 ? 0 : ns_bgn == -1 ? page_bgn : ns_bgn;
	byte[] rv = Bry_.Mid(full_txt, bgn, full_txt.length);
	Bry_.Replace_reuse(rv, Byte_ascii.Space, Byte_ascii.Underline);
	return rv;
}
2015-07-13 01:10:02 +00:00
// page text including any anchor, but stopping before the qarg
public byte[] Page_txt_w_anchor() {
	int end = qarg_bgn == -1 ? full_txt.length : qarg_bgn - 1; // - 1 to skip the "?" delimiter
	return Bry_.Mid(full_txt, page_bgn, end);
}

// page text without namespace and without anchor
public byte[] Page_txt() {
	int end = anch_bgn == -1 ? full_txt.length : anch_bgn - 1; // - 1 to skip the "#" delimiter
	return Bry_.Mid(full_txt, page_bgn, end);
}
// Page_txt() in db form: spaces are replaced by underscores in the array returned by Page_txt()
public byte[] Page_db() {
	byte[] page = this.Page_txt();
	Bry_.Replace_reuse(page, Byte_ascii.Space, Byte_ascii.Underline);
	return page;
}
2015-12-07 04:12:52 +00:00
// Page_db() converted to a String by String_.new_u8
public String Page_db_as_str() {
	byte[] page_db = Page_db();
	return String_.new_u8(page_db);
}
2016-08-01 01:41:19 +00:00
// href-encoded page text including the anchor (text from page_bgn up to the qarg)
public byte[] Page_url_w_anch() {
	synchronized (href_encoder) {	// LOCK:static-obj
		int end = qarg_bgn == -1 ? full_txt.length : qarg_bgn - 1;
		byte[] page_w_anch = Bry_.Mid(full_txt, page_bgn, end);
		return href_encoder.Encode(page_w_anch);
	}
}
2015-07-13 01:10:02 +00:00
// offset in full_txt of the 1st byte after the last "/" seen before any anchor; -1 if none
public int Leaf_bgn() {
	return leaf_bgn;
}
// page text before the last "/"; the whole Page_txt() if the page has no leaf
public byte[] Base_txt() {
	if (leaf_bgn == -1) return Page_txt();
	return Bry_.Mid(full_txt, page_bgn, leaf_bgn - 1); // - 1 to skip the "/" delimiter
}

// page text after the last "/" (anchor excluded); the whole Page_txt() if the page has no leaf
public byte[] Leaf_txt() {
	if (leaf_bgn == -1) return Page_txt();
	int end = anch_bgn == -1 ? full_txt.length : anch_bgn - 1;
	return Bry_.Mid(full_txt, leaf_bgn, end);
}
// 0 if the title has an xwiki prefix; else -1
public int Wik_bgn() {
	return wik_bgn;
}

// NOTE: anch_bgn is not correct when page has trailing ws; EX: [[A #b]] should have anch_bgn of 3 (1st char after #), but instead it is 2
public int Anch_bgn() {
	return anch_bgn;
}

// text after the 1st "#" through the end of full_txt; empty if there is no anchor
public byte[] Anch_txt() {
	if (anch_bgn == -1) return Bry_.Empty;
	return Bry_.Mid(full_txt, anch_bgn, full_txt.length);
}
2016-03-28 03:44:59 +00:00
// title text in the talk namespace: the title itself if already a talk ns; else tors_txt + page
public byte[] Talk_txt() {
	if (ns.Id_is_talk()) return Full_txt_w_ttl_case();
	return Bry_.Add(tors_txt, Page_txt());
}

// title text in the subject namespace: the title itself if already a subject ns; else tors_txt + page
public byte[] Subj_txt() {
	if (ns.Id_is_subj()) return Full_txt_w_ttl_case();
	return Bry_.Add(tors_txt, Page_txt());
}
2016-08-01 01:41:19 +00:00
public byte[] Full_url() {
synchronized (href_encoder) { // LOCK:static-obj
2016-08-08 01:36:50 +00:00
return href_encoder.Encode(full_txt);
2016-08-01 01:41:19 +00:00
}
}
2016-03-28 03:44:59 +00:00
// Full_db() converted to a String by String_.new_u8
public String Full_db_as_str() {
	byte[] full_db = Full_db();
	return String_.new_u8(full_db);
}
2015-11-23 02:39:33 +00:00
// title generated by the namespace from the db form of the page (ns.Gen_ttl)
public byte[] Full_db() {
	byte[] page_db = this.Page_db();
	return ns.Gen_ttl(page_db);
}

// complete parsed text (anchor included) with spaces replaced by underscores
public byte[] Full_db_w_anch() {
	return Xoa_ttl.Replace_spaces(full_txt);
}
2016-08-01 01:41:19 +00:00
// url-encoded Page_txt()
public byte[] Page_url() {
	synchronized (url_encoder) {	// LOCK:static-obj
		byte[] page_txt = this.Page_txt();
		return url_encoder.Encode(page_txt);
	}
}
// url-encoded Leaf_txt()
public byte[] Leaf_url() {
	synchronized (url_encoder) {	// LOCK:static-obj
		byte[] leaf_txt = this.Leaf_txt();
		return url_encoder.Encode(leaf_txt);
	}
}
// url-encoded Base_txt()
public byte[] Base_url() {
	synchronized (url_encoder) {	// LOCK:static-obj
		byte[] base_txt = this.Base_txt();
		return url_encoder.Encode(base_txt);
	}
}
2015-07-13 01:10:02 +00:00
// page text before the 1st "/"; the whole Page_txt() if the page has no "/"
public byte[] Root_txt() {
	if (root_bgn == -1) return Page_txt();
	return Bry_.Mid(full_txt, page_bgn, root_bgn - 1); // - 1 to skip the "/" delimiter
}

// page text after the 1st "/" (anchor excluded); the whole Page_txt() if the page has no "/"
public byte[] Rest_txt() {
	if (root_bgn == -1) return Page_txt();
	int end = anch_bgn == -1 ? full_txt.length : anch_bgn - 1;
	return Bry_.Mid(full_txt, root_bgn, end);
}
2016-08-01 01:41:19 +00:00
// url-encoded Talk_txt()
public byte[] Talk_url() {
	synchronized (url_encoder) {	// LOCK:static-obj
		byte[] talk_txt = this.Talk_txt();
		return url_encoder.Encode(talk_txt);
	}
}
// url-encoded Subj_txt()
public byte[] Subj_url() {
	synchronized (url_encoder) {	// LOCK:static-obj
		byte[] subj_txt = this.Subj_txt();
		return url_encoder.Encode(subj_txt);
	}
}
2015-07-13 01:10:02 +00:00
// offset in full_txt of the 1st byte after the "?" that starts the query args; -1 if none
public int Qarg_bgn() {
	return qarg_bgn;
}
private int qarg_bgn = -1;

// text after the "?" through the end of full_txt; null (not empty) if there are no query args
public byte[] Qarg_txt() {
	int bgn = this.Qarg_bgn();
	if (bgn == -1) return null;
	return Bry_.Mid(full_txt, bgn, full_txt.length);
}
// page text up to the last "/"; if there is no leaf, up to the qarg; else to the end of full_txt
public byte[] Base_txt_wo_qarg() {
	int end;
	if      (leaf_bgn != -1) end = leaf_bgn - 1;
	else if (qarg_bgn != -1) end = qarg_bgn - 1;
	else                     end = full_txt.length;
	return Bry_.Mid(full_txt, page_bgn, end);
}
// leaf text, ending before the anchor if there is one; else before the qarg; else at the end of full_txt
// NOTE(review): when there is no leaf, the slice begins at 0 (start of full_txt), whereas Leaf_txt() begins at page_bgn -- confirm this is intended
public byte[] Leaf_txt_wo_qarg() {
	int end;
	if      (anch_bgn != -1) end = anch_bgn - 1;
	else if (qarg_bgn != -1) end = qarg_bgn - 1;
	else                     end = full_txt.length;
	int bgn = leaf_bgn == -1 ? 0 : leaf_bgn;
	return Bry_.Mid(full_txt, bgn, end);
}
// parsed text without the xwiki prefix, ending before the anchor if there is one; else before the qarg; else at the end of full_txt
public byte[] Full_txt_wo_qarg() {
	int bgn;
	if      (wik_bgn == -1) bgn = 0;
	else if (ns_bgn  == -1) bgn = page_bgn;
	else                    bgn = ns_bgn;
	int end;
	if      (anch_bgn != -1) end = anch_bgn - 1;
	else if (qarg_bgn != -1) end = qarg_bgn - 1;
	else                     end = full_txt.length;
	return Bry_.Mid(full_txt, bgn, end);
}
public byte[] Page_txt_wo_qargs() { // assume that no Special page has non-ascii characters
int full_txt_len = full_txt.length;
2015-09-21 03:43:51 +00:00
int ques_pos = Bry_find_.Find_bwd(full_txt, Byte_ascii.Question, full_txt_len, page_bgn);
2015-11-02 01:50:05 +00:00
return Bry_.Mid(full_txt, page_bgn, ques_pos == Bry_find_.Not_found ? full_txt_len : ques_pos);
2015-07-13 01:10:02 +00:00
}
2016-07-11 03:35:32 +00:00
// separator between subpage segments
public static final byte Subpage_spr = Byte_ascii.Slash; // EX: A/B/C
// named constant for "no title"; note that the static Parse returns null when the text cannot be parsed
public static final Xoa_ttl Null = null;
2015-08-03 04:10:03 +00:00
private static final int Char__bidi = 1, Char__ws = 2;
2016-03-28 03:44:59 +00:00
private static final Btrie_slim_mgr char_trie = Btrie_slim_mgr.cs()
2016-01-04 02:27:38 +00:00
.Add_many_int(Char__bidi , Bry_.New_by_ints(0xE2, 0x80, 0x8E), Bry_.New_by_ints(0xE2, 0x80, 0x8F), Bry_.New_by_ints(0xE2, 0x80, 0xAA), Bry_.New_by_ints(0xE2, 0x80, 0xAB), Bry_.New_by_ints(0xE2, 0x80, 0xAC), Bry_.New_by_ints(0xE2, 0x80, 0xAD), Bry_.New_by_ints(0xE2, 0x80, 0xAE))
2016-07-11 03:35:32 +00:00
.Add_many_int(Char__ws , "\u00A0", "\u1680", "\u180E", "\u2000", "\u2001", "\u2002", "\u2003", "\u2004", "\u2005", "\u2006", "\u2007", "\u2008", "\u2009", "\u200A", "\u2028", "\u2029", "\u202F", "\u205F", "\u3000");
2016-08-01 01:41:19 +00:00
private final static Gfo_url_encoder href_encoder = Gfo_url_encoder_.New__html_href_mw(Bool_.Y).Make();
private final static Xoa_url_encoder url_encoder = new Xoa_url_encoder();
2016-07-11 03:35:32 +00:00
// replaces spaces with underscores via Bry_.Replace
public static byte[] Replace_spaces(byte[] raw) {
	return Bry_.Replace(raw, Byte_ascii.Space, Byte_ascii.Underline);
}

// replaces underscores with spaces over the whole array
public static byte[] Replace_unders(byte[] raw) {
	int raw_len = raw.length;
	return Replace_unders(raw, 0, raw_len);
}

// replaces underscores with spaces; bgn and end are passed through to Bry_.Replace
public static byte[] Replace_unders(byte[] raw, int bgn, int end) {
	return Bry_.Replace(raw, bgn, end, Byte_ascii.Underline, Byte_ascii.Space);
}
// parses the whole of raw with the wiki's settings; returns null if raw is not a valid title
public static Xoa_ttl Parse(Xow_wiki wiki, byte[] raw) {
	int raw_len = raw.length;
	return Parse(wiki, raw, 0, raw_len);
}
public static Xoa_ttl Parse(Xow_wiki wiki, int ns_id, byte[] ttl) {
Xow_ns ns = wiki.Ns_mgr().Ids_get_or_null(ns_id);
byte[] raw = Bry_.Add(ns.Name_db_w_colon(), ttl);
return Parse(wiki, raw, 0, raw.length);
2015-07-13 01:10:02 +00:00
}
2016-07-11 03:35:32 +00:00
// parses src between bgn and end, taking the amp_mgr, case_mgr, xwiki_mgr and ns_mgr from wiki; returns null if invalid
public static Xoa_ttl Parse(Xow_wiki wiki, byte[] src, int bgn, int end) {
	return Parse
		( wiki.App().Parser_amp_mgr()
		, wiki.Lang().Case_mgr()
		, wiki.Xwiki_mgr()
		, wiki.Ns_mgr()
		, src, bgn, end
		);
}
// creates a new title and runs the instance Parse on it with a buffer obtained from Bry_bfr_.Get(); the buffer is always released
// returns the title if the instance Parse succeeds; else null
public static Xoa_ttl Parse(Xop_amp_mgr amp_mgr, Xol_case_mgr case_mgr, Xow_xwiki_mgr xwiki_mgr, Xow_ns_mgr ns_mgr, byte[] src, int bgn, int end) {
	Xoa_ttl ttl = new Xoa_ttl();
	Bry_bfr tmp_bfr = Bry_bfr_.Get();	// changed from bry_mkr.Get_b512(); DATE:2016-07-06
	try {
		boolean valid = ttl.Parse(tmp_bfr, amp_mgr, case_mgr, xwiki_mgr, ns_mgr, src, bgn, end);
		if (valid) return ttl;
		return null;
	}
	finally {
		tmp_bfr.Mkr_rls();
	}
}
private boolean Parse(Bry_bfr bfr, Xop_amp_mgr amp_mgr, Xol_case_mgr case_mgr, Xow_xwiki_mgr xwiki_mgr, Xow_ns_mgr ns_mgr, byte[] src, int bgn, int end) {
2015-07-13 01:10:02 +00:00
/* This proc will
- identify all parts: Wiki, Namespace, Base/Leaf, Anchor; it will also identify Subject/Talk ns
- trim whitespace around part delimiters; EX: "Help : Test" --> "Help:Test"; note that it will trim only if the ns part is real; EX: "Helpx : Test" is unchanged
- replace multiple whitespaces with 1; EX: "Many ws" --> "Many ws"
- capitalize the first letter of the page title
note: a byte[] is needed b/c proc does collapsing and casing
FUTURE:
- "/", "a/" (should be page); "#" (not a page)
- Talk:Help:a disallowed; Category talk:Help:a allowed
- remove invalid characters $rxTc
- forbid ./ /.
- forbid ~~~
- handle ip address urls for User and User talk
*/
2015-11-23 02:39:33 +00:00
Gfo_url_encoder anchor_encoder = null;
2015-07-13 01:10:02 +00:00
Bry_bfr anchor_encoder_bfr = null;
bfr.Clear();
2016-07-11 03:35:32 +00:00
if (end - bgn == 0) return false;
2015-07-13 01:10:02 +00:00
this.raw = src;
ns = ns_mgr.Ns_main();
boolean add_ws = false, ltr_bgn_reset = false;
2016-07-11 03:35:32 +00:00
int ltr_bgn = -1, txt_bb_len = 0, colon_count = 0;
Btrie_slim_mgr amp_trie = amp_mgr.Amp_trie(); Btrie_rv trv = null;
2015-07-13 01:10:02 +00:00
byte[] b_ary = null;
int cur = bgn;
int match_pos = -1;
while (cur != end) {
byte b = src[cur];
switch (b) {
case Byte_ascii.Colon:
if (cur == bgn) { // initial colon; flag; note that " :" is not handled; note that colon_count is not incremented
forceLiteralLink = true;
++cur;
if (cur < end && src[cur] == Byte_ascii.Colon)
++cur;
continue; // do not add to bfr
}
else {
if (ltr_bgn == -1) {// no ltrs seen; treat as literal; occurs for ::fr:wikt:test and fr::Help:test
++colon_count;
break;
}
boolean part_found = false;
if (colon_count == 0) {// 1st colon;
Object o = ns_mgr.Names_get_or_null(bfr.Bfr(), ltr_bgn, txt_bb_len);
if (o == null) { // not ns; try alias
wik_itm = xwiki_mgr.Get_by_mid(bfr.Bfr(), ltr_bgn, txt_bb_len); // check if wiki; note: wiki is not possible for other colons
if (wik_itm != null) {
wik_bgn = 0; // wik_bgn can only start at 0
part_found = true;
anch_bgn = -1; // NOTE: do not allow anchors to begin before wiki_itm; breaks Full_txt for [[:#batch:Main Page]]; DATE:20130102
}
}
else {
ns = (Xow_ns)o;
2015-11-09 04:48:07 +00:00
byte[] ns_name = ns.Name_ui();
2015-07-13 01:10:02 +00:00
int ns_name_len = ns_name.length;
int tmp_bfr_end = bfr.Len();
2015-09-21 03:43:51 +00:00
if (!Bry_.Eq(bfr.Bfr(), ltr_bgn, tmp_bfr_end, ns_name) && ns_name_len == tmp_bfr_end - ltr_bgn) { // if (a) ns_name != bfr_txt (b) both are same length; note that (b) should not happen, but want to safeguard against mismatched arrays
2015-07-13 01:10:02 +00:00
Bry_.Set(bfr.Bfr(), ltr_bgn, tmp_bfr_end, ns_name);
}
ns_bgn = ltr_bgn;
part_found = true;
}
}
if (part_found) {
page_bgn = txt_bb_len + 1; // anticipate page_bgn;
add_ws = false; // if there was an add_ws, ignore; EX: "Category :" should ignore space
ltr_bgn_reset = true; // ltr_bgn_reset
}
colon_count++; // increment colon count
break;
}
2016-01-04 02:27:38 +00:00
case Byte_ascii.Hash:
if (anch_bgn == -1) // anchor begins at 1st #, not last #; EX:A#B#C has anchor of "B#C" not "C" PAGE:en.w:Grand_Central_Terminal; DATE:2015-12-31
anch_bgn = (txt_bb_len) + 1;
break;
2015-07-13 01:10:02 +00:00
case Byte_ascii.Slash:
if (root_bgn == -1)
root_bgn = (txt_bb_len) + 1;
if (anch_bgn == -1) { // only set leaf if anchor found; guards against A#B/C and / setting leaf; DATE:2014-01-14
leaf_bgn = (txt_bb_len) + 1;
qarg_bgn = -1; // always reset qarg; handles ttls which have question_mark which are premptively assumed to be qarg; PAGE:en.w:Portal:Organized_Labour/Did_You_Know?/1 DATE:2014-06-08
}
break; // flag last leaf_bgn
case Byte_ascii.Nl: // NOTE: for now, treat nl just like space; not sure if it should accept "a\nb" or "\nab"; need to handle trailing \n for "Argentina\n\n" in {{Infobox settlement|pushpin_map=Argentina|pushpin_label_position=|pushpin_map_alt=|pushpin_map_caption=Location of Salta in Argentina}};
case Byte_ascii.Space: case Byte_ascii.Tab: case Byte_ascii.Cr: // added \t, \r; DATE:2013-03-27
2016-07-18 01:10:59 +00:00
case Byte_ascii.Underline: if (ltr_bgn != -1) add_ws = true; ++cur;
2015-07-13 01:10:02 +00:00
continue; // only mark add_ws if ltr_seen; this ignores ws at bgn; also, note "continue"
case Byte_ascii.Question:
if (txt_bb_len + 1 < end) // guard against trailing ? (which shouldn't happen)
qarg_bgn = txt_bb_len + 1;
break;
case Byte_ascii.Amp:
2016-07-18 01:10:59 +00:00
int cur2 = cur + 1;
2015-07-13 01:10:02 +00:00
if (cur2 == end) {} // guards against terminating &; EX: [[Bisc &]]; NOTE: needed b/c Match_bgn does not do bounds checking for cur in src; src[src.length] will be called when & is last character;
else {
2016-07-11 03:35:32 +00:00
if (trv == null) trv = new Btrie_rv();
Object html_ent_obj = amp_trie.Match_at(trv, src, cur2, end);
2015-07-13 01:10:02 +00:00
if (html_ent_obj != null) {
2016-08-08 01:36:50 +00:00
Gfh_entity_itm amp_itm = (Gfh_entity_itm)html_ent_obj;
2016-07-11 03:35:32 +00:00
match_pos = trv.Pos();
2016-08-08 01:36:50 +00:00
if (amp_itm.Tid() == Gfh_entity_itm.Tid_name_std) {
2015-07-13 01:10:02 +00:00
switch (amp_itm.Char_int()) {
case 160: // NOTE: &nbsp must convert to space; EX:w:United States [[Image:Dust Bowl&nbsp;- Dallas, South Dakota 1936.jpg|220px|alt=]]
if (ltr_bgn != -1) add_ws = true; // apply same ws rules as Space, NewLine; needed for converting multiple ws into one; EX:" &nbsp; " -> " " x> " "; PAGEen.w:Greek_government-debt_crisis; DATE:2014-09-25
cur = match_pos; // set cur after ";"
continue;
case Byte_ascii.Amp:
2015-08-03 04:10:03 +00:00
b_ary = Byte_ascii.Amp_bry; // NOTE: if &amp; convert to &; PAGE:en.w:Amadou Bagayoko?redirect=n; DATE:2014-09-23
2015-07-13 01:10:02 +00:00
break;
case Byte_ascii.Quote:
case Byte_ascii.Lt:
case Byte_ascii.Gt:
b_ary = amp_itm.Xml_name_bry();
break;
2016-08-08 01:36:50 +00:00
case Gfh_entity_itm.Char_int_null: // &#xx;
2015-09-21 03:43:51 +00:00
int end_pos = Bry_find_.Find_fwd(src, Byte_ascii.Semic, match_pos, end);
2015-11-02 01:50:05 +00:00
if (end_pos == Bry_find_.Not_found) {} // &# but no terminating ";" noop: defaults to current_byte which will be added below;
2015-07-13 01:10:02 +00:00
else {
b_ary = amp_itm.Xml_name_bry();
match_pos = end_pos + 1;
}
break;
default:
2015-08-03 04:10:03 +00:00
b_ary = amp_itm.U8_bry();
2015-07-13 01:10:02 +00:00
break;
}
}
else {
2016-07-04 02:41:56 +00:00
Xop_amp_mgr_rslt amp_rv = new Xop_amp_mgr_rslt();
2016-08-08 01:36:50 +00:00
amp_mgr.Parse_ncr(amp_rv, amp_itm.Tid() == Gfh_entity_itm.Tid_num_hex, src, end, cur2, match_pos);
2016-07-04 02:41:56 +00:00
if (amp_rv.Pass()) {
b_ary = gplx.core.intls.Utf16_.Encode_int_to_bry(amp_rv.Val());
2015-07-13 01:10:02 +00:00
if (b_ary.length == 1 && b_ary[0] == Byte_ascii.Hash) // NOTE: A&#x23;B should be interpreted as A#b; PAGE:en.s:The_English_Constitution_(1894) DATE:2014-09-07
anch_bgn = (txt_bb_len) + 1;
2016-07-04 02:41:56 +00:00
match_pos = amp_rv.Pos();
2015-07-13 01:10:02 +00:00
}
}
}
}
break;
case Byte_ascii.Lt:
if (cur + 3 < end) {
if ( src[cur + 1] == Byte_ascii.Bang
&& src[cur + 2] == Byte_ascii.Dash
&& src[cur + 3] == Byte_ascii.Dash
) {
2016-07-18 01:10:59 +00:00
int cur3 = cur + 3;
2015-09-21 03:43:51 +00:00
int find = Bry_find_.Find_fwd(src, Xop_comm_lxr.End_ary, cur3, end);
2015-07-13 01:10:02 +00:00
if (find != -1) {
cur = find + Xop_comm_lxr.End_ary.length;
continue;
}
2016-07-11 03:35:32 +00:00
else
2015-07-13 01:10:02 +00:00
return false;
}
}
if (anch_bgn != -1) {
if (anchor_encoder == null) {
2015-11-23 02:39:33 +00:00
anchor_encoder = Gfo_url_encoder_.Id;
2016-06-20 03:58:10 +00:00
anchor_encoder_bfr = Bry_bfr_.Reset(32);
2015-07-13 01:10:02 +00:00
}
anchor_encoder.Encode(anchor_encoder_bfr, src, cur, cur + 1);
2015-10-19 02:17:57 +00:00
b_ary = anchor_encoder_bfr.To_bry_and_clear();
2015-07-13 01:10:02 +00:00
match_pos = cur + 1;
}
break;
// NOTE: DefaultSettings.php defines wgLegalTitleChars as " %!\"$&'()*,\\-.\\/0-9:;=?@A-Z\\\\^_`a-z~\\x80-\\xFF+"; the characters above are okay; those below are not
case Byte_ascii.Gt: case Byte_ascii.Pipe:
case Byte_ascii.Brack_bgn: case Byte_ascii.Brack_end: case Byte_ascii.Curly_bgn: case Byte_ascii.Curly_end:
if (anch_bgn != -1) {
if (anchor_encoder == null) {
2015-11-23 02:39:33 +00:00
anchor_encoder = Gfo_url_encoder_.Id;
2016-06-20 03:58:10 +00:00
anchor_encoder_bfr = Bry_bfr_.Reset(32);
2015-07-13 01:10:02 +00:00
}
anchor_encoder.Encode(anchor_encoder_bfr, src, cur, cur + 1);
2015-10-19 02:17:57 +00:00
b_ary = anchor_encoder_bfr.To_bry_and_clear();
2015-07-13 01:10:02 +00:00
match_pos = cur + 1;
}
2016-07-11 03:35:32 +00:00
else
2015-07-13 01:10:02 +00:00
return false;
break;
2015-08-03 04:10:03 +00:00
default:
if ((b & 0xff) > 127) {// PATCH.JAVA:need to convert to unsigned byte
2016-07-11 03:35:32 +00:00
if (trv == null) trv = new Btrie_rv();
Object char_obj = char_trie.Match_at_w_b0(trv, b, src, cur, end);
2015-08-03 04:10:03 +00:00
if (char_obj != null) {
int tid = ((Int_obj_val)(char_obj)).Val();
2016-07-11 03:35:32 +00:00
cur = trv.Pos();
2015-08-03 04:10:03 +00:00
switch (tid) {
case Char__bidi: // ignore bidi
2015-07-13 01:10:02 +00:00
continue;
2015-08-03 04:10:03 +00:00
case Char__ws: // treat extended_ws as space; PAGE:ja.w:Template:Location_map_USA New_York; DATE:2015-07-28
if (ltr_bgn != -1) add_ws = true;
continue;
}
2015-07-13 01:10:02 +00:00
}
}
break;
}
++cur;
if (add_ws) { // add ws and toggle flag
bfr.Add_byte(Byte_ascii.Space); ++txt_bb_len;
add_ws = false;
}
if (ltr_bgn == -1) ltr_bgn = txt_bb_len; // if 1st letter not seen, mark 1st letter
if (b_ary == null) {bfr.Add_byte(b); ++txt_bb_len;} // add to bfr
else {bfr.Add(b_ary); txt_bb_len += b_ary.length; b_ary = null; cur = match_pos;} // NOTE: b_ary != null only for amp_trie
if (ltr_bgn_reset) {// colon found; set ws to bgn mode; note that # and / do not reset
ltr_bgn_reset = false;
ltr_bgn = -1;
}
}
2016-07-11 03:35:32 +00:00
if (txt_bb_len == 0) return false;
if (wik_bgn == -1 && page_bgn == txt_bb_len) return false; // if no wiki, but page_bgn is at end, then ttl is ns only; EX: "Help:"; NOTE: "fr:", "fr:Help" is allowed
2015-10-19 02:17:57 +00:00
full_txt = bfr.To_bry_and_clear();
2015-11-02 01:50:05 +00:00
if ( ns.Case_match() == Xow_ns_case_.Tid__1st
2015-07-13 01:10:02 +00:00
&& wik_bgn == -1 ) { // do not check case if xwiki; EX: "fr:" would have a wik_bgn of 0 (and a wik_end of 3); "A" (and any non-xwiki ttl) would have a wik_bgn == -1
byte char_1st = full_txt[page_bgn];
2015-09-21 03:43:51 +00:00
int char_1st_len = gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(char_1st);
2015-07-13 01:10:02 +00:00
int page_end = page_bgn + char_1st_len;
if ( char_1st_len > 1) { // 1st char is multi-byte char
int full_txt_len = full_txt.length;
if (page_end > full_txt_len) // ttl is too too short for 1st multi-byte char; EX: [[%D0]] is 208 but in utf8, 208 requires at least another char; DATE:2013-11-11
return false; // ttl is invalid
else { // ttl is long enough for 1st mult-byte char; need to use platform uppercasing; Xol_case_mgr_.Utf_8 is not sufficient
2016-07-11 03:35:32 +00:00
Bry_bfr upper_1st = Bry_bfr_.Get();
2015-07-13 01:10:02 +00:00
byte[] page_txt = case_mgr.Case_build_1st_upper(upper_1st, full_txt, page_bgn, full_txt_len); // always build; never reuse; (multi-byte character will expand array)
if (page_bgn == 0) // page only; EX: A
full_txt = page_txt;
else // ns + page; EX: Help:A
full_txt = Bry_.Add(Bry_.Mid(full_txt, 0, page_bgn), page_txt); // add page_txt to exsiting ns
upper_1st.Mkr_rls();
}
}
else
full_txt = case_mgr.Case_reuse_upper(full_txt, page_bgn, page_end);
}
2015-11-09 04:48:07 +00:00
Xow_ns tors_ns = ns.Id_is_talk() ? ns_mgr.Ords_get_at(ns.Ord_subj_id()) : ns_mgr.Ords_get_at(ns.Ord_talk_id());
tors_txt = tors_ns.Name_ui_w_colon();
2016-06-20 03:58:10 +00:00
// tors_txt = tors_ns == null ? Bry_.Empty : tors_ns.Name_ui_w_colon();
2015-07-13 01:10:02 +00:00
return true;
}
}
class Xoa_ttl_trie {
public static Btrie_fast_mgr new_() {
2015-08-17 06:09:16 +00:00
Btrie_fast_mgr rv = Btrie_fast_mgr.cs();
2015-07-13 01:10:02 +00:00
rv.Add(Byte_ascii.Colon , Byte_obj_val.new_(Id_colon));
rv.Add(Byte_ascii.Hash , Byte_obj_val.new_(Id_hash));
rv.Add(Byte_ascii.Slash , Byte_obj_val.new_(Id_slash));
rv.Add(Byte_ascii.Space , Byte_obj_val.new_(Id_space));
rv.Add(Byte_ascii.Underline , Byte_obj_val.new_(Id_underline));
rv.Add(Byte_ascii.Amp , Byte_obj_val.new_(Id_amp));
rv.Add(Xop_comm_lxr.Bgn_ary , Byte_obj_val.new_(Id_comment_bgn));
2016-01-18 04:18:07 +00:00
rv.Add(Byte_ascii.Nl , Byte_obj_val.new_(Id_newLine));
2015-07-13 01:10:02 +00:00
rv.Add(Byte_ascii.Brack_bgn , Byte_obj_val.new_(Id_invalid));
rv.Add(Byte_ascii.Curly_bgn , Byte_obj_val.new_(Id_invalid));
return rv;
}
public static final byte Id_colon = 0, Id_hash = 1, Id_slash = 2, Id_space = 3, Id_underline = 4, Id_amp = 5, Id_comment_bgn = 6, Id_invalid = 7, Id_newLine = 8;
}