Mw_parse: Add basic implementation for double_underscore

pull/620/head
gnosygnu 8 years ago
parent aa1f1ec801
commit 31ade6aa5f

@ -19,7 +19,7 @@ package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xo
import gplx.core.btries.*; import gplx.core.net.*; import gplx.core.btries.*; import gplx.core.net.*;
import gplx.xowa.mws.parsers.prepros.*; import gplx.xowa.mws.parsers.headings.*; import gplx.xowa.mws.parsers.prepros.*; import gplx.xowa.mws.parsers.headings.*;
import gplx.xowa.mws.parsers.quotes.*; import gplx.xowa.mws.parsers.tables.*; import gplx.xowa.mws.parsers.hrs.*; import gplx.xowa.mws.parsers.nbsps.*; import gplx.xowa.mws.parsers.quotes.*; import gplx.xowa.mws.parsers.tables.*; import gplx.xowa.mws.parsers.hrs.*; import gplx.xowa.mws.parsers.nbsps.*;
import gplx.xowa.mws.parsers.lnkes.*; import gplx.xowa.mws.parsers.lnkis.*; import gplx.xowa.mws.parsers.magiclinks.*; import gplx.xowa.mws.parsers.lnkes.*; import gplx.xowa.mws.parsers.lnkis.*; import gplx.xowa.mws.parsers.magiclinks.*; import gplx.xowa.mws.parsers.doubleunders.*;
import gplx.xowa.mws.utls.*; import gplx.xowa.mws.linkers.*; import gplx.xowa.mws.utls.*; import gplx.xowa.mws.linkers.*;
public class Xomw_parser { public class Xomw_parser {
private final Xomw_parser_ctx pctx = new Xomw_parser_ctx(); private final Xomw_parser_ctx pctx = new Xomw_parser_ctx();
@ -30,11 +30,13 @@ public class Xomw_parser {
private final Xomw_block_level_pass block_wkr = new Xomw_block_level_pass(); private final Xomw_block_level_pass block_wkr = new Xomw_block_level_pass();
private final Xomw_heading_wkr heading_wkr = new Xomw_heading_wkr(); private final Xomw_heading_wkr heading_wkr = new Xomw_heading_wkr();
private final Xomw_magiclinks_wkr magiclinks_wkr = new Xomw_magiclinks_wkr(); private final Xomw_magiclinks_wkr magiclinks_wkr = new Xomw_magiclinks_wkr();
private final Xomw_doubleunder_wkr doubleunder_wkr = new Xomw_doubleunder_wkr();
private final Xomw_link_renderer link_renderer = new Xomw_link_renderer(); private final Xomw_link_renderer link_renderer = new Xomw_link_renderer();
private final Xomw_link_holders holders; private final Xomw_link_holders holders;
private final Xomw_heading_cbk__html heading_wkr_cbk; private final Xomw_heading_cbk__html heading_wkr_cbk;
private final Btrie_slim_mgr protocols_trie; private final Btrie_slim_mgr protocols_trie;
private static Xomw_regex_space regex_space; private static Xomw_regex_space regex_space;
private static Xomw_regex_boundary regex_boundary;
private static Xomw_regex_url regex_url; private static Xomw_regex_url regex_url;
private final Btrie_rv trv = new Btrie_rv(); private final Btrie_rv trv = new Btrie_rv();
private int marker_index = 0; private int marker_index = 0;
@ -57,6 +59,7 @@ public class Xomw_parser {
if (regex_space == null) { if (regex_space == null) {
synchronized (Type_adp_.ClassOf_obj(this)) { synchronized (Type_adp_.ClassOf_obj(this)) {
regex_space = new Xomw_regex_space(); regex_space = new Xomw_regex_space();
regex_boundary = new Xomw_regex_boundary(regex_space);
regex_url = new Xomw_regex_url(regex_space); regex_url = new Xomw_regex_url(regex_space);
} }
} }
@ -65,6 +68,11 @@ public class Xomw_parser {
linker.Init_by_wiki(wiki.Lang().Lnki_trail_mgr().Trie()); linker.Init_by_wiki(wiki.Lang().Lnki_trail_mgr().Trie());
lnke_wkr.Init_by_wiki(protocols_trie, regex_url, regex_space); lnke_wkr.Init_by_wiki(protocols_trie, regex_url, regex_space);
lnki_wkr.Init_by_wiki(wiki); lnki_wkr.Init_by_wiki(wiki);
magiclinks_wkr.Init_by_wiki(linker, regex_boundary, regex_url);
doubleunder_wkr.Init_by_wiki();
}
public void Init_by_page(Xoa_ttl ttl) {
pctx.Init_by_page(ttl);
} }
public void Internal_parse(Xomw_parser_bfr pbfr, byte[] text) { public void Internal_parse(Xomw_parser_bfr pbfr, byte[] text) {
pbfr.Init(text); pbfr.Init(text);
@ -106,7 +114,7 @@ public class Xomw_parser {
table_wkr.Do_table_stuff(pctx, pbfr); table_wkr.Do_table_stuff(pctx, pbfr);
hr_wkr.Replace_hrs(pctx, pbfr); hr_wkr.Replace_hrs(pctx, pbfr);
// text = $this->doDoubleUnderscore(text); doubleunder_wkr.Do_double_underscore(pctx, pbfr);
heading_wkr.Do_headings(pctx, pbfr, heading_wkr_cbk); heading_wkr.Do_headings(pctx, pbfr, heading_wkr_cbk);
lnki_wkr.Replace_internal_links(pctx, pbfr); lnki_wkr.Replace_internal_links(pctx, pbfr);

@ -21,9 +21,41 @@ public class Xomw_parser__tst {
private final Xomw_parser__fxt fxt = new Xomw_parser__fxt(); private final Xomw_parser__fxt fxt = new Xomw_parser__fxt();
@Test public void Basic() { @Test public void Basic() {
fxt.Test__parse(String_.Concat_lines_nl_skip_last fxt.Test__parse(String_.Concat_lines_nl_skip_last
("a https://c.org b" ( "== heading_1 =="
, "para_1"
, "== heading_2 =="
, "para_2"
, "-----"
, "{|"
, "|-"
, "|a"
, "|}"
, "''italics''"
, "__TOC__"
, "[https://a.org b]"
, "[[A|abc]]"
, "https://c.org"
, "a »b«  !important c"
), String_.Concat_lines_nl_skip_last ), String_.Concat_lines_nl_skip_last
( "" ( "<h2> heading_1 </h2>"
, "<p>para_1"
, "</p>"
, "<h2> heading_2 </h2>"
, "<p>para_2"
, "</p>"
, "<hr />"
, "<table>"
, ""
, "<tr>"
, "<td>a"
, "</td></tr></table>"
, "<p><i>italics</i>"
, "<!--MWTOC-->"
, "<a class=\"external text\" rel=\"nofollow\" href=\"https://a.org\">b</a>"
, "<a href=\"/wiki/A\" title=\"A\">abc</a>"
, "<a class=\"external free\" rel=\"nofollow\" href=\"https://c.org\">https://c.org</a>"
, "a&#160;»b«&#160; !important c"
, "</p>"
)); ));
} }
} }
@ -34,6 +66,7 @@ class Xomw_parser__fxt {
Xoae_app app = Xoa_app_fxt.Make__app__edit(); Xoae_app app = Xoa_app_fxt.Make__app__edit();
Xowe_wiki wiki = Xoa_app_fxt.Make__wiki__edit(app); Xowe_wiki wiki = Xoa_app_fxt.Make__wiki__edit(app);
mgr.Init_by_wiki(wiki); mgr.Init_by_wiki(wiki);
mgr.Init_by_page(wiki.Ttl_parse(Bry_.new_a7("Page_1")));
} }
public void Test__parse(String src_str, String expd) { public void Test__parse(String src_str, String expd) {
byte[] src_bry = Bry_.new_u8(src_str); byte[] src_bry = Bry_.new_u8(src_str);

@ -0,0 +1,56 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers.doubleunders; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
/**
 * Mutable bag of boolean flags describing double-underscore state for one parse.
 * All fields are public and default to false; {@link #Reset()} returns every one of them to false.
 */
public class Xomw_doubleunder_data {
	// XO.MW: MW stores these as mDoubleUnderscores in Parser
	public boolean toc, no_toc, force_toc;
	public boolean no_gallery, force_gallery;
	public boolean no_title_convert, no_content_convert;
	public boolean no_edit_section, new_section_link;
	public boolean static_redirect;
	public boolean hidden_cat, index, no_index;

	// XO.MW: MW stores these as member variables in Parser
	public boolean show_toc, force_toc_position;

	/** Clears every flag (both groups above) back to false. */
	public void Reset() {
		// flags mirroring mDoubleUnderscores
		toc = false;
		no_toc = false;
		force_toc = false;
		no_gallery = false;
		force_gallery = false;
		no_title_convert = false;
		no_content_convert = false;
		no_edit_section = false;
		new_section_link = false;
		static_redirect = false;
		hidden_cat = false;
		index = false;
		no_index = false;

		// flags mirroring Parser member variables
		show_toc = false;
		force_toc_position = false;
	}
}

@ -16,69 +16,106 @@ You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
package gplx.xowa.mws.parsers.doubleunders; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*; package gplx.xowa.mws.parsers.doubleunders; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
class Xomw_doubleunder_wkr { import gplx.core.btries.*;
public boolean show_toc; // TODO.CS: handle case sensitive keys; EX: __notoc__ should not match __NOTOC__ if cs is enabled for magic word
public boolean force_toc_position; public class Xomw_doubleunder_wkr {
public boolean output__no_gallery ; private final Btrie_slim_mgr trie = Btrie_slim_mgr.ci_u8();
public Xomw_doubleunder_data doubleunderscore_data = new Xomw_doubleunder_data(); private final Btrie_rv trv = new Btrie_rv();
private void Match_and_remove(byte[] text, Xomw_doubleunder_data doubleunderscore_data) { public Xomw_doubleunder_data data = new Xomw_doubleunder_data();
doubleunderscore_data.Reset(); public void Init_by_wiki() {
// TODO.XO: pull from lang
trie.Add_str_byte("__TOC__", Tid__toc);
trie.Add_str_byte("__NOTOC__", Tid__no_toc);
trie.Add_str_byte("__FORCETOC__", Tid__force_toc);
} }
public void Do_double_underscore(byte[] text) { public void Do_double_underscore(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) { // REF.MW: text = preg_replace('/(^|\n)-----*/', '\\1<hr />', text);
// The position of __TOC__ needs to be recorded // XO.PBFR
// $mw = MagicWord::get( 'toc' ); Bry_bfr src_bfr = pbfr.Src();
// if ( $mw->match( $text ) ) { byte[] src = src_bfr.Bfr();
this.show_toc = true; int src_bgn = 0;
this.force_toc_position = true; int src_end = src_bfr.Len();
Bry_bfr bfr = pbfr.Trg();
// Set a placeholder. At the end we'll fill it in with the TOC. data.Reset();
// $text = $mw->replace( '<!--MWTOC-->', $text, 1 ); // XO.MW: MW does TOC before others; XO does it at the same time
// Only keep the first one.
// $text = $mw->replace( '', $text );
// }
// Now match and remove the rest of them // Now match and remove the rest of them
// $mwa = MagicWord::getDoubleUnderscoreArray(); // XO.MW.BGN: $this->mDoubleUnderscores = $mwa->matchAndRemove( $text );
Match_and_remove(text, doubleunderscore_data); int cur = src_bgn;
int prv = cur;
boolean dirty = false;
while (true) {
if (cur == src_end) {
if (dirty) {
bfr.Add_mid(src, prv, src_end);
}
break;
}
if (doubleunderscore_data.no_gallery) { byte b = src[cur];
output__no_gallery = true; Object o = trie.Match_at_w_b0(trv, b, src, cur, src_end);
} if (o == null) {
if (doubleunderscore_data.no_toc && !force_toc_position) { cur += gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(b);
this.show_toc = false; continue;
} }
if ( doubleunderscore_data.hidden_cat
// && $this->mTitle->getNamespace() == NS_CATEGORY dirty = true;
) { bfr.Add_mid(src, prv, cur);
//$this->addTrackingCategory( 'hidden-category-category' ); byte tid = ((gplx.core.primitives.Byte_obj_val)o).Val();
} switch (tid) {
// (T10068) Allow control over whether robots index a page. case Tid__toc:
// __INDEX__ always overrides __NOINDEX__, see T16899 // The position of __TOC__ needs to be recorded
if (doubleunderscore_data.no_index // && $this->mTitle->canUseNoindex() boolean already_seen = !data.show_toc;
) { data.toc = true;
// $this->mOutput->setIndexPolicy( 'noindex' ); data.show_toc = true;
// $this->addTrackingCategory( 'noindex-category' ); data.force_toc_position = true;
if (already_seen) { // Set a placeholder. At the end we'll fill it in with the TOC.
bfr.Add_str_a7("<!--MWTOC-->");
}
else { // Only keep the first one. XO.MW:ignore by not adding anything to bfr
}
break;
// XO.MW: MW adds boolean to hash_table; XO uses boolean props; note that "remove" is done by not adding to bfr
case Tid__no_toc: data.no_toc = true; break;
case Tid__no_gallery: data.no_gallery = true; break;
case Tid__force_toc: data.force_toc = true; break;
case Tid__no_edit_section: data.no_edit_section = true; break;
case Tid__new_section_link: data.new_section_link = true; break;
case Tid__hidden_cat: data.hidden_cat = true; break;
case Tid__index: data.index = true; break;
case Tid__no_index: data.no_index = true; break;
case Tid__static_redirect: data.static_redirect = true; break;
case Tid__no_title_convert: data.no_title_convert = true; break;
case Tid__no_content_convert: data.no_content_convert = true; break;
default: throw Err_.new_unhandled_default(tid);
}
cur = trv.Pos();
prv = cur;
} }
if (doubleunderscore_data.index //&& $this->mTitle->canUseNoindex() // XO.MW.END: $this->mDoubleUnderscores = $mwa->matchAndRemove( $text );
) {
// $this->mOutput->setIndexPolicy( 'index' ); if (data.no_toc && !data.force_toc_position) {
// $this->addTrackingCategory( 'index-category' ); data.show_toc = false;
} }
// Cache all double underscores in the database // XO.MW.EDIT: hidden_cat, index, noindex are used to add to tracking category
// foreach ( $this->mDoubleUnderscores as $key => $val ) { if (dirty)
// $this->mOutput->setProperty( $key, '' ); pbfr.Switch();
// }
}
}
class Xomw_doubleunder_data {
public boolean no_gallery;
public boolean no_toc;
public boolean hidden_cat;
public boolean no_index;
public boolean index;
public void Reset() {
no_gallery = no_toc = hidden_cat = no_index = index = false;
} }
private static final byte
Tid__no_toc = 0
, Tid__no_gallery = 1
, Tid__force_toc = 2
, Tid__toc = 3
, Tid__no_edit_section = 4
, Tid__new_section_link = 5
, Tid__hidden_cat = 6
, Tid__index = 7
, Tid__no_index = 8
, Tid__static_redirect = 9
, Tid__no_title_convert = 10
, Tid__no_content_convert = 11
;
} }

@ -0,0 +1,50 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.xowa.mws.parsers.doubleunders; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*;
import org.junit.*; import gplx.core.tests.*;
/**
 * Tests for Xomw_doubleunder_wkr: each case parses a snippet, compares the rewritten text,
 * and then checks the flags left in the worker's data object.
 */
public class Xomw_doubleunder_wkr__tst {
	private final Xomw_doubleunder_wkr__fxt fxt = new Xomw_doubleunder_wkr__fxt();
	// NOTE: the worker drops only the matched token itself; the space on each side of it is kept,
	// so removing a word from "a __X__ b" leaves TWO spaces between "a" and "b".
	@Test public void No_match() {fxt.Test__parse("a b c" , "a b c");}
	@Test public void Force_toc() {fxt.Test__parse("a __FORCETOC__ b" , "a  b").Test__prop_y(fxt.data.force_toc);}
	// first __TOC__ becomes the placeholder; the second is removed outright
	@Test public void Toc() {fxt.Test__parse("a __TOC__ b __TOC__ c" , "a <!--MWTOC--> b  c").Test__prop_y(fxt.data.toc, fxt.data.show_toc, fxt.data.force_toc_position);}
	@Test public void Notoc_only() {fxt.Test__parse("a __NOTOC__ b" , "a  b").Test__prop_y(fxt.data.no_toc).Test__prop_n(fxt.data.show_toc);} // show_toc is false
	// __NOTOC__ is still recorded, but an explicit __TOC__ forces the position so show_toc stays true
	@Test public void Notoc_w_toc() {fxt.Test__parse("a __TOC__ b __NOTOC__ c" , "a <!--MWTOC--> b  c").Test__prop_y(fxt.data.toc, fxt.data.no_toc, fxt.data.show_toc, fxt.data.force_toc_position);} // show_toc is true
}
/**
 * Fixture for the double-underscore worker tests.
 * Owns one worker plus the parser ctx / buffer it runs against, and exposes the worker's
 * data object so tests can inspect the flags after a parse. Every Test__* method returns
 * the fixture itself so calls can be chained.
 */
class Xomw_doubleunder_wkr__fxt {
	private final Xomw_parser_ctx pctx = new Xomw_parser_ctx();
	private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr();
	private final Xomw_doubleunder_wkr wkr = new Xomw_doubleunder_wkr();
	public Xomw_doubleunder_data data;

	public Xomw_doubleunder_wkr__fxt() {
		this.wkr.Init_by_wiki();
		this.data = this.wkr.data; // share the worker's instance; tests read flags through it
	}

	/** Runs the worker over src_str and checks the resulting text equals expd (src_str doubles as the failure message). */
	public Xomw_doubleunder_wkr__fxt Test__parse(String src_str, String expd) {
		byte[] raw = Bry_.new_u8(src_str);
		this.wkr.Do_double_underscore(this.pctx, this.pbfr.Init(raw));
		Gftest.Eq__str(expd, this.pbfr.Rslt().To_str_and_clear(), src_str);
		return this;
	}

	/** Checks that every supplied value equals Bool_.Y. */
	public Xomw_doubleunder_wkr__fxt Test__prop_y(boolean... ary) {
		return this.Test__prop(Bool_.Y, ary);
	}

	/** Checks that every supplied value equals Bool_.N. */
	public Xomw_doubleunder_wkr__fxt Test__prop_n(boolean... ary) {
		return this.Test__prop(Bool_.N, ary);
	}

	// shared implementation: compare each value against the expected boolean, in order
	private Xomw_doubleunder_wkr__fxt Test__prop(boolean expd, boolean... ary) {
		int len = ary.length;
		for (int i = 0; i < len; i++) {
			Gftest.Eq__bool(expd, ary[i]);
		}
		return this;
	}
}
Loading…
Cancel
Save