1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2026-03-02 03:49:30 +00:00
This commit is contained in:
gnosygnu
2014-09-21 23:32:37 -04:00
parent 985863b224
commit 314de43d76
133 changed files with 4112 additions and 722 deletions

View File

@@ -16,7 +16,7 @@ You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa; import gplx.*;
import gplx.core.btries.*; import gplx.xowa.wikis.xwikis.*; import gplx.xowa.parsers.amps.*;
import gplx.core.btries.*; import gplx.xowa.wikis.xwikis.*; import gplx.xowa.parsers.amps.*; import gplx.xowa.langs.cases.*;
public class Xoa_ttl { // PAGE:en.w:http://en.wikipedia.org/wiki/Help:Link; REF.MW: Ttl.php|secureAndSplit;
public Xow_ns Ns() {return ns;} private Xow_ns ns;
public boolean ForceLiteralLink() {return forceLiteralLink;} private boolean forceLiteralLink;
@@ -90,25 +90,30 @@ public class Xoa_ttl { // PAGE:en.w:http://en.wikipedia.org/wiki/Help:Link; REF.
public static Xoa_ttl parse_(Xow_wiki wiki, int ns_id, byte[] ttl) {
Xow_ns ns = wiki.Ns_mgr().Ids_get_or_null(ns_id);
byte[] raw = Bry_.Add(ns.Name_db_w_colon(), ttl);
return new_(wiki, wiki.App().Msg_log(), raw, raw, 0, raw.length);
return new_(wiki, wiki.App().Msg_log(), raw, 0, raw.length);
}
public static Xoa_ttl parse_(Xow_wiki wiki, byte[] raw) {return new_(wiki, wiki.App().Msg_log(), raw, raw, 0, raw.length);}
public static Xoa_ttl parse_(Xow_wiki wiki, byte[] raw) {return new_(wiki, wiki.App().Msg_log(), raw, 0, raw.length);}
private static final Object thread_lock = new Object();
public static Xoa_ttl new_(Xow_wiki wiki, Gfo_msg_log msg_log, byte[] raw, byte[] src, int bgn, int end) {
public static Xoa_ttl new_(Xow_wiki wiki, Gfo_msg_log msg_log, byte[] src, int bgn, int end) {
Xoa_app app = wiki.App();
Bry_bfr_mkr bry_mkr = app.Utl_bry_bfr_mkr();
return parse(bry_mkr, app.Parser_amp_mgr(), wiki.Lang().Case_mgr(), wiki.Xwiki_mgr(), wiki.Ns_mgr(), msg_log, src, bgn, end);
} private Xoa_ttl() {}
public static Xoa_ttl parse(Bry_bfr_mkr bry_mkr, Xop_amp_mgr amp_mgr, Xol_case_mgr case_mgr, Xow_xwiki_mgr xwiki_mgr, Xow_ns_mgr ns_mgr, Gfo_msg_log msg_log, byte[] src, int bgn, int end) {
synchronized (thread_lock) {
Xoa_ttl rv = new Xoa_ttl();
Bry_bfr bfr = wiki.Utl_bry_bfr_mkr().Get_b512();
Bry_bfr bfr = bry_mkr.Get_b512();
try {
boolean pass = rv.Parse(bfr, wiki, msg_log, raw, src, bgn, end);
boolean pass = rv.Parse(bfr, bry_mkr, amp_mgr, case_mgr, xwiki_mgr, ns_mgr, msg_log, src, bgn, end);
return pass ? rv : null;
}
finally {bfr.Mkr_rls();}
}
} private Xoa_ttl() {}
private boolean Parse(Bry_bfr bfr, Xow_wiki wiki, Gfo_msg_log msg_log, byte[] raw, byte[] src, int bgn, int end) {
}
private boolean Parse(Bry_bfr bfr, Bry_bfr_mkr bry_mkr, Xop_amp_mgr amp_mgr, Xol_case_mgr case_mgr, Xow_xwiki_mgr xwiki_mgr, Xow_ns_mgr ns_mgr, Gfo_msg_log msg_log, byte[] src, int bgn, int end) {
/* This proc will
- identify all parts: Wiki, Namespace, Base/Leaf, Anchor; it will also identify Subject/Talk ns
- trim whitespace around part delimiters; EX: "Help : Test" --> "Help:Test"; note that it will trim only if the part is real; EX: "Helpx : Test" is unchanged
- trim whitespace around part delimiters; EX: "Help : Test" --> "Help:Test"; note that it will trim only if the ns part is real; EX: "Helpx : Test" is unchanged
- replace multiple whitespaces with 1; EX: "Many ws" --> "Many ws"
- capitalize the first letter of the page title
note: a byte[] is needed b/c proc does collapsing and casing
@@ -125,13 +130,11 @@ public class Xoa_ttl { // PAGE:en.w:http://en.wikipedia.org/wiki/Help:Link; REF.
Bry_bfr anchor_encoder_bfr = null;
bfr.Clear();
if (end - bgn == 0) {msg_log.Add_itm_none(Xop_ttl_log.Len_0, src, bgn, bgn); return false;}
this.raw = raw;
Xow_ns_mgr ns_mgr = wiki.Ns_mgr(); ns = ns_mgr.Ns_main();
this.raw = src;
ns = ns_mgr.Ns_main();
boolean add_ws = false, ltr_bgn_reset = false;
int ltr_bgn = -1, txt_bb_len = 0, colon_count = 0; bfr.Clear();
Xop_amp_mgr amp_mgr = wiki.App().Parser_amp_mgr();
Btrie_slim_mgr amp_trie = amp_mgr.Amp_trie();
//Btrie_fast_mgr ttlTrie = wiki.App().TtlTrie();
Btrie_slim_mgr amp_trie = amp_mgr.Amp_trie();
byte[] b_ary = null;
int cur = bgn;
int match_pos = -1;
@@ -155,7 +158,7 @@ public class Xoa_ttl { // PAGE:en.w:http://en.wikipedia.org/wiki/Help:Link; REF.
if (colon_count == 0) {// 1st colon;
Object o = ns_mgr.Names_get_or_null(bfr.Bfr(), ltr_bgn, txt_bb_len);
if (o == null) { // not ns; try alias
wik_itm = wiki.Xwiki_mgr().Get_by_mid(bfr.Bfr(), ltr_bgn, txt_bb_len); // check if wiki; note: wiki is not possible for other colons
wik_itm = xwiki_mgr.Get_by_mid(bfr.Bfr(), ltr_bgn, txt_bb_len); // check if wiki; note: wiki is not possible for other colons
if (wik_itm != null) {
wik_bgn = 0; // wik_bgn can only start at 0
part_found = true;
@@ -271,24 +274,7 @@ public class Xoa_ttl { // PAGE:en.w:http://en.wikipedia.org/wiki/Help:Link; REF.
match_pos = cur + 1;
}
break;
//case Byte_ascii.Bang: case Byte_ascii.Quote: case Byte_ascii.Dollar: case Byte_ascii.Percent: case Byte_ascii.Apos: case Byte_ascii.Paren_bgn:
//case Byte_ascii.Paren_end: case Byte_ascii.Asterisk: case Byte_ascii.Comma: case Byte_ascii.Dash: case Byte_ascii.Dot:
//case Byte_ascii.Semic: case Byte_ascii.Eq: case Byte_ascii.Question: case Byte_ascii.At:
//case Byte_ascii.Num_0: case Byte_ascii.Num_1: case Byte_ascii.Num_2: case Byte_ascii.Num_3: case Byte_ascii.Num_4:
//case Byte_ascii.Num_5: case Byte_ascii.Num_6: case Byte_ascii.Num_7: case Byte_ascii.Num_8: case Byte_ascii.Num_9:
//case Byte_ascii.Ltr_A: case Byte_ascii.Ltr_B: case Byte_ascii.Ltr_C: case Byte_ascii.Ltr_D: case Byte_ascii.Ltr_E:
//case Byte_ascii.Ltr_F: case Byte_ascii.Ltr_G: case Byte_ascii.Ltr_H: case Byte_ascii.Ltr_I: case Byte_ascii.Ltr_J:
//case Byte_ascii.Ltr_K: case Byte_ascii.Ltr_L: case Byte_ascii.Ltr_M: case Byte_ascii.Ltr_N: case Byte_ascii.Ltr_O:
//case Byte_ascii.Ltr_P: case Byte_ascii.Ltr_Q: case Byte_ascii.Ltr_R: case Byte_ascii.Ltr_S: case Byte_ascii.Ltr_T:
//case Byte_ascii.Ltr_U: case Byte_ascii.Ltr_V: case Byte_ascii.Ltr_W: case Byte_ascii.Ltr_X: case Byte_ascii.Ltr_Y: case Byte_ascii.Ltr_Z:
//case Byte_ascii.Ltr_a: case Byte_ascii.Ltr_b: case Byte_ascii.Ltr_c: case Byte_ascii.Ltr_d: case Byte_ascii.Ltr_e:
//case Byte_ascii.Ltr_f: case Byte_ascii.Ltr_g: case Byte_ascii.Ltr_h: case Byte_ascii.Ltr_i: case Byte_ascii.Ltr_j:
//case Byte_ascii.Ltr_k: case Byte_ascii.Ltr_l: case Byte_ascii.Ltr_m: case Byte_ascii.Ltr_n: case Byte_ascii.Ltr_o:
//case Byte_ascii.Ltr_p: case Byte_ascii.Ltr_q: case Byte_ascii.Ltr_r: case Byte_ascii.Ltr_s: case Byte_ascii.Ltr_t:
//case Byte_ascii.Ltr_u: case Byte_ascii.Ltr_v: case Byte_ascii.Ltr_w: case Byte_ascii.Ltr_x: case Byte_ascii.Ltr_y: case Byte_ascii.Ltr_z:
//case Byte_ascii.Backslash: case Byte_ascii.Pow: case Byte_ascii.Tick: case Byte_ascii.Tilde:
//case Byte_ascii.Plus: // NOTE: not in wgLegalTitleChars, but appears in MW titles; EX: [[File:ASLSJSW Aas+.PNG]]
// NOTE: DefaultSettings.php defines wgLegalTitleChars as " %!\"$&'()*,\\-.\\/0-9:;=?@A-Z\\\\^_`a-z~\\x80-\\xFF+"; the characters above are okay; those below are not
// NOTE: DefaultSettings.php defines wgLegalTitleChars as " %!\"$&'()*,\\-.\\/0-9:;=?@A-Z\\\\^_`a-z~\\x80-\\xFF+"; the characters above are okay; those below are not
case Byte_ascii.Gt: case Byte_ascii.Pipe:
case Byte_ascii.Brack_bgn: case Byte_ascii.Brack_end: case Byte_ascii.Curly_bgn: case Byte_ascii.Curly_end:
if (anch_bgn != -1) {
@@ -322,93 +308,8 @@ public class Xoa_ttl { // PAGE:en.w:http://en.wikipedia.org/wiki/Help:Link; REF.
break;
}
++cur;
// Object itm_obj = ttlTrie.Match(b, src, cur, end);
// if (itm_obj == null) ++cur;
// else {
// Byte_obj_val itm = (Byte_obj_val)itm_obj;
// switch (itm.Val()) {
// case Xoa_ttl_trie.Id_invalid:
// msg_log.Add_itm_none(src, bgn, end, Xop_ttl_log.Invalid_char);
// return false;
// case Xoa_ttl_trie.Id_colon:
// if (cur == bgn) { // initial colon; flag; note that " :" is not handled; note that colon_count is not incremented
// forceLiteralLink = true;
// cur = ttlTrie.Match_pos();
// continue; // do not add to bfr
// }
// else {
// if (ltr_bgn == -1) {// no ltrs seen; treat as literal; occurs for ::fr:wikt:test and fr::Help:test
// ++colon_count;
// break;
// }
// boolean part_found = false;
// if (colon_count == 0) {// 1st colon;
// Object o = wiki.Wiki_mgr().GetByMid(bfr.Bfr(), ltr_bgn, txt_bb_len); // check if wiki; note: wiki is not possible for other colons
// if (o != null) {
// wik_bgn = 0; // wik_bgn can only start at 0
// part_found = true;
// }
// else { // not wiki; check if ns; note: if wiki, don't try to extract ns; EX: "fr:Aide:test"
// o = ns_mgr.Trie_match_exact(bfr.Bfr(), ltr_bgn, txt_bb_len);
// if (o != null) {
// ns = (Xow_ns)o;
// ns_bgn = ltr_bgn;
// part_found = true;
// }
// }
// }
// if (part_found) {
// page_bgn = txt_bb_len + 1; // anticipate page_bgn;
// add_ws = false; // if there was an add_ws, ignore; EX: "Category :" should ignore space
// ltr_bgn_reset = true; // ltr_bgn_reset
// }
// colon_count++; // increment colon count
// break;
// }
// case Xoa_ttl_trie.Id_hash: anch_bgn = (txt_bb_len) + 1; break; // flag last anch_bgn
// case Xoa_ttl_trie.Id_slash: leaf_bgn = (txt_bb_len) + 1; break; // flag last leaf_bgn
// case Xoa_ttl_trie.Id_newLine: // NOTE: for now, treat nl just like space; not sure if it should accept "a\nb" or "\nab"; need to handle trailing \n for "Argentina\n\n" in {{Infobox settlement|pushpin_map=Argentina|pushpin_label_position=|pushpin_map_alt=|pushpin_map_caption=Location of Salta in Argentina}};
// case Xoa_ttl_trie.Id_space:
// case Xoa_ttl_trie.Id_underline:if (ltr_bgn != -1) add_ws = true; cur = ttlTrie.Match_pos(); continue; // only mark add_ws if ltr_seen; this ignores ws at bgn; also, note "continue"
// case Xoa_ttl_trie.Id_amp:
// cur = ttlTrie.Match_pos();
// if (cur == end) {} // guards against terminating &; EX: [[Bisc &]]; NOTE: needed b/c Match_bgn does not do bounds checking for cur in src; src[src.length] will be called when & is last character;
// else {
// Object html_ent_obj = wiki.App().Amp_trie().Match_bgn(src, cur, end);
// if (html_ent_obj != null) {
// Xop_amp_trie_itm amp_itm = (Xop_amp_trie_itm)html_ent_obj;
// switch (amp_itm.CharInt()) {
// case Byte_ascii.Amp:
// case Byte_ascii.Quote:
// case Byte_ascii.Lt:
// case Byte_ascii.Gt:
// b_ary = amp_itm.XmlEntityName();
// break;
// default:
// b_ary = amp_itm.CharIntAsAry();
// break;
// }
// }
// }
// break;
// case Xoa_ttl_trie.Id_comment_bgn:
// cur = ttlTrie.Match_pos();
// int find = Bry_finder.Find_fwd(src, Xop_comm_lxr.End_ary, cur, end);
// if (find != -1) {
// cur = find + Xop_comm_lxr.End_ary.length;
// continue;
// }
// else {
// msg_log.Add_itm_none(src, bgn, end, Xop_ttl_log.Comment_eos);
// return false;
// }
// default: break;
// }
// cur = ttlTrie.Match_pos();
// }
if (add_ws) { // add ws and toggle flag
bfr.Add_byte(Byte_ascii.Space); ++txt_bb_len;
// bfr.Add_byte(Byte_ascii.Underline); ++txt_bb_len;
add_ws = false;
}
if (ltr_bgn == -1) ltr_bgn = txt_bb_len; // if 1st letter not seen, mark 1st letter
@@ -420,19 +321,11 @@ public class Xoa_ttl { // PAGE:en.w:http://en.wikipedia.org/wiki/Help:Link; REF.
}
}
if (txt_bb_len == 0) {msg_log.Add_itm_none(Xop_ttl_log.Len_0, src, bgn, end); return false;}
// if ( ((txt_bb_len - ns_len) > 255 && ns.Id() != Xow_ns_.Id_special) // DELETE: will cause multi-byte langs like thai to fail on long links; DATE:2013-02-02
// || ((txt_bb_len - ns_len) > 512 && ns.Id() == Xow_ns_.Id_special)) {msg_log.Add_itm_none(Xop_ttl_log.Len_max, src, bgn, end); return false;}
if (wik_bgn == -1 && page_bgn == txt_bb_len) { // if no wiki, but page_bgn is at end, then ttl is ns only; EX: "Help:"; NOTE: "fr:", "fr:Help" is allowed
msg_log.Add_itm_none(Xop_ttl_log.Ttl_is_ns_only, src, bgn, end);
return false;
}
full_txt = bfr.XtoAryAndClear();
// if (wik_bgn == 0 && txt_bb_len == page_bgn) { // 0-len intrawiki; EX: [[fr:]] on Main Page
// Xoa_ttl cur_page_ttl = wiki.Ctx().Cur_page().Ttl();
// this.raw = cur_page_ttl == null ? Bry_.Empty : Bry_.Copy(cur_page_ttl.Page_txt()); // set raw to current page ttl; note that this is needed for lnki_caption to show correctly (otherwise lnki_caption will be blank); this is a quasi-hack as it depends on the ttl of the current page, but passing in another argument feels sloppy
// bfr.Add(this.raw);
// }
if ( ns.Case_match() == Xow_ns_case_.Id_1st
&& wik_bgn == -1 ) { // do not check case if xwiki; EX: "fr:" would have a wik_bgn of 0 (and a wik_end of 3); "A" (and any non-xwiki ttl) would have a wik_bgn == -1
byte char_1st = full_txt[page_bgn];
@@ -443,8 +336,8 @@ public class Xoa_ttl { // PAGE:en.w:http://en.wikipedia.org/wiki/Help:Link; REF.
if (page_end > full_txt_len) // ttl is too too short for 1st multi-byte char; EX: [[%D0]] is 208 but in utf8, 208 requires at least another char; DATE:2013-11-11
return false; // ttl is invalid
else { // ttl is long enough for 1st mult-byte char; need to use platform uppercasing; Xol_case_mgr_.Utf_8 is not sufficient
Bry_bfr upper_1st = wiki.Utl_bry_bfr_mkr().Get_b512();
byte[] page_txt = wiki.Lang().Case_mgr().Case_build_1st_upper(upper_1st, full_txt, page_bgn, full_txt_len); // always build; never reuse; (multi-byte character will expand array)
Bry_bfr upper_1st = bry_mkr.Get_b512();
byte[] page_txt = case_mgr.Case_build_1st_upper(upper_1st, full_txt, page_bgn, full_txt_len); // always build; never reuse; (multi-byte character will expand array)
if (page_bgn == 0) // page only; EX: A
full_txt = page_txt;
else // ns + page; EX: Help:A
@@ -453,7 +346,7 @@ public class Xoa_ttl { // PAGE:en.w:http://en.wikipedia.org/wiki/Help:Link; REF.
}
}
else
full_txt = wiki.Lang().Case_mgr().Case_reuse_upper(full_txt, page_bgn, page_end);
full_txt = case_mgr.Case_reuse_upper(full_txt, page_bgn, page_end);
}
Xow_ns tors_ns = ns.Id_talk() ? ns_mgr.Ords_get_at(ns.Ord_subj_id()) : ns_mgr.Ords_get_at(ns.Ord_talk_id());
tors_txt = tors_ns.Name_txt_w_colon();