From 31ade6aa5fd5f02c44aa45db1ec0aaf95a3ba8af Mon Sep 17 00:00:00 2001 From: gnosygnu Date: Fri, 27 Jan 2017 11:10:15 -0500 Subject: [PATCH] Mw_parse: Add basic implementation for double_underscore --- .../gplx/xowa/mws/parsers/Xomw_parser.java | 12 +- .../xowa/mws/parsers/Xomw_parser__tst.java | 37 ++++- .../doubleunders/Xomw_doubleunder_data.java | 56 +++++++ .../doubleunders/Xomw_doubleunder_wkr.java | 149 +++++++++++------- .../Xomw_doubleunder_wkr__tst.java | 50 ++++++ 5 files changed, 244 insertions(+), 60 deletions(-) create mode 100644 400_xowa/src/gplx/xowa/mws/parsers/doubleunders/Xomw_doubleunder_data.java create mode 100644 400_xowa/src/gplx/xowa/mws/parsers/doubleunders/Xomw_doubleunder_wkr__tst.java diff --git a/400_xowa/src/gplx/xowa/mws/parsers/Xomw_parser.java b/400_xowa/src/gplx/xowa/mws/parsers/Xomw_parser.java index 9aa4dd415..507299a92 100644 --- a/400_xowa/src/gplx/xowa/mws/parsers/Xomw_parser.java +++ b/400_xowa/src/gplx/xowa/mws/parsers/Xomw_parser.java @@ -19,7 +19,7 @@ package gplx.xowa.mws.parsers; import gplx.*; import gplx.xowa.*; import gplx.xo import gplx.core.btries.*; import gplx.core.net.*; import gplx.xowa.mws.parsers.prepros.*; import gplx.xowa.mws.parsers.headings.*; import gplx.xowa.mws.parsers.quotes.*; import gplx.xowa.mws.parsers.tables.*; import gplx.xowa.mws.parsers.hrs.*; import gplx.xowa.mws.parsers.nbsps.*; -import gplx.xowa.mws.parsers.lnkes.*; import gplx.xowa.mws.parsers.lnkis.*; import gplx.xowa.mws.parsers.magiclinks.*; +import gplx.xowa.mws.parsers.lnkes.*; import gplx.xowa.mws.parsers.lnkis.*; import gplx.xowa.mws.parsers.magiclinks.*; import gplx.xowa.mws.parsers.doubleunders.*; import gplx.xowa.mws.utls.*; import gplx.xowa.mws.linkers.*; public class Xomw_parser { private final Xomw_parser_ctx pctx = new Xomw_parser_ctx(); @@ -30,11 +30,13 @@ public class Xomw_parser { private final Xomw_block_level_pass block_wkr = new Xomw_block_level_pass(); private final Xomw_heading_wkr heading_wkr = new 
Xomw_heading_wkr(); private final Xomw_magiclinks_wkr magiclinks_wkr = new Xomw_magiclinks_wkr(); + private final Xomw_doubleunder_wkr doubleunder_wkr = new Xomw_doubleunder_wkr(); private final Xomw_link_renderer link_renderer = new Xomw_link_renderer(); private final Xomw_link_holders holders; private final Xomw_heading_cbk__html heading_wkr_cbk; private final Btrie_slim_mgr protocols_trie; private static Xomw_regex_space regex_space; + private static Xomw_regex_boundary regex_boundary; private static Xomw_regex_url regex_url; private final Btrie_rv trv = new Btrie_rv(); private int marker_index = 0; @@ -57,6 +59,7 @@ public class Xomw_parser { if (regex_space == null) { synchronized (Type_adp_.ClassOf_obj(this)) { regex_space = new Xomw_regex_space(); + regex_boundary = new Xomw_regex_boundary(regex_space); regex_url = new Xomw_regex_url(regex_space); } } @@ -65,6 +68,11 @@ public class Xomw_parser { linker.Init_by_wiki(wiki.Lang().Lnki_trail_mgr().Trie()); lnke_wkr.Init_by_wiki(protocols_trie, regex_url, regex_space); lnki_wkr.Init_by_wiki(wiki); + magiclinks_wkr.Init_by_wiki(linker, regex_boundary, regex_url); + doubleunder_wkr.Init_by_wiki(); + } + public void Init_by_page(Xoa_ttl ttl) { + pctx.Init_by_page(ttl); } public void Internal_parse(Xomw_parser_bfr pbfr, byte[] text) { pbfr.Init(text); @@ -106,7 +114,7 @@ public class Xomw_parser { table_wkr.Do_table_stuff(pctx, pbfr); hr_wkr.Replace_hrs(pctx, pbfr); - // text = $this->doDoubleUnderscore(text); + doubleunder_wkr.Do_double_underscore(pctx, pbfr); heading_wkr.Do_headings(pctx, pbfr, heading_wkr_cbk); lnki_wkr.Replace_internal_links(pctx, pbfr); diff --git a/400_xowa/src/gplx/xowa/mws/parsers/Xomw_parser__tst.java b/400_xowa/src/gplx/xowa/mws/parsers/Xomw_parser__tst.java index b6a66e4cb..656b9e693 100644 --- a/400_xowa/src/gplx/xowa/mws/parsers/Xomw_parser__tst.java +++ b/400_xowa/src/gplx/xowa/mws/parsers/Xomw_parser__tst.java @@ -21,9 +21,41 @@ public class Xomw_parser__tst { private final 
Xomw_parser__fxt fxt = new Xomw_parser__fxt(); @Test public void Basic() { fxt.Test__parse(String_.Concat_lines_nl_skip_last - ("a https://c.org b" + ( "== heading_1 ==" + , "para_1" + , "== heading_2 ==" + , "para_2" + , "-----" + , "{|" + , "|-" + , "|a" + , "|}" + , "''italics''" + , "__TOC__" + , "[https://a.org b]" + , "[[A|abc]]" + , "https://c.org" + , "a »b«  !important c" ), String_.Concat_lines_nl_skip_last - ( "" + ( "

heading_1

" + , "

para_1" + , "

" + , "

heading_2

" + , "

para_2" + , "

" + , "
" + , "" + , "" + , "" + , "
a" + , "
" + , "

italics" + , "" + , "b" + , "abc" + , "https://c.org" + , "a »b«  !important c" + , "

" )); } } @@ -34,6 +66,7 @@ class Xomw_parser__fxt { Xoae_app app = Xoa_app_fxt.Make__app__edit(); Xowe_wiki wiki = Xoa_app_fxt.Make__wiki__edit(app); mgr.Init_by_wiki(wiki); + mgr.Init_by_page(wiki.Ttl_parse(Bry_.new_a7("Page_1"))); } public void Test__parse(String src_str, String expd) { byte[] src_bry = Bry_.new_u8(src_str); diff --git a/400_xowa/src/gplx/xowa/mws/parsers/doubleunders/Xomw_doubleunder_data.java b/400_xowa/src/gplx/xowa/mws/parsers/doubleunders/Xomw_doubleunder_data.java new file mode 100644 index 000000000..318b587a4 --- /dev/null +++ b/400_xowa/src/gplx/xowa/mws/parsers/doubleunders/Xomw_doubleunder_data.java @@ -0,0 +1,56 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012 gnosygnu@gmail.com + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as +published by the Free Software Foundation, either version 3 of the +License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . 
+*/ +package gplx.xowa.mws.parsers.doubleunders; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*; +public class Xomw_doubleunder_data { + // XO.MW: MW stores these as mDoubleUnderscores in Parser + public boolean toc; + public boolean no_toc; + public boolean force_toc; + + public boolean no_gallery; + public boolean force_gallery; + + public boolean no_title_convert; + public boolean no_content_convert; + + public boolean no_edit_section; + public boolean new_section_link; + + public boolean static_redirect; + + public boolean hidden_cat; + + public boolean index; + public boolean no_index; + + // XO.MW: MW stores these as member variables in Parser + public boolean show_toc; + public boolean force_toc_position; + + public void Reset() { + toc = no_toc = force_toc = + no_gallery = force_gallery = + no_title_convert = no_content_convert = + no_edit_section = new_section_link = + static_redirect = + hidden_cat = index = no_index = + false; + + show_toc = force_toc_position = false; + } +} diff --git a/400_xowa/src/gplx/xowa/mws/parsers/doubleunders/Xomw_doubleunder_wkr.java b/400_xowa/src/gplx/xowa/mws/parsers/doubleunders/Xomw_doubleunder_wkr.java index accd90101..efc4afc40 100644 --- a/400_xowa/src/gplx/xowa/mws/parsers/doubleunders/Xomw_doubleunder_wkr.java +++ b/400_xowa/src/gplx/xowa/mws/parsers/doubleunders/Xomw_doubleunder_wkr.java @@ -16,69 +16,106 @@ You should have received a copy of the GNU Affero General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ package gplx.xowa.mws.parsers.doubleunders; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*; -class Xomw_doubleunder_wkr { - public boolean show_toc; - public boolean force_toc_position; - public boolean output__no_gallery ; - public Xomw_doubleunder_data doubleunderscore_data = new Xomw_doubleunder_data(); - private void Match_and_remove(byte[] text, Xomw_doubleunder_data doubleunderscore_data) { - doubleunderscore_data.Reset(); +import gplx.core.btries.*; +// TODO.CS: handle case sensitive keys; EX: __notoc__ should not match __NOTOC__ if cs is enabled for magic word +public class Xomw_doubleunder_wkr { + private final Btrie_slim_mgr trie = Btrie_slim_mgr.ci_u8(); + private final Btrie_rv trv = new Btrie_rv(); + public Xomw_doubleunder_data data = new Xomw_doubleunder_data(); + public void Init_by_wiki() { + // TODO.XO: pull from lang + trie.Add_str_byte("__TOC__", Tid__toc); + trie.Add_str_byte("__NOTOC__", Tid__no_toc); + trie.Add_str_byte("__FORCETOC__", Tid__force_toc); } - public void Do_double_underscore(byte[] text) { - // The position of __TOC__ needs to be recorded -// $mw = MagicWord::get( 'toc' ); -// if ( $mw->match( $text ) ) { - this.show_toc = true; - this.force_toc_position = true; + public void Do_double_underscore(Xomw_parser_ctx pctx, Xomw_parser_bfr pbfr) { // REF.MW: text = preg_replace('/(^|\n)-----*/', '\\1
', text); + // XO.PBFR + Bry_bfr src_bfr = pbfr.Src(); + byte[] src = src_bfr.Bfr(); + int src_bgn = 0; + int src_end = src_bfr.Len(); + Bry_bfr bfr = pbfr.Trg(); - // Set a placeholder. At the end we'll fill it in with the TOC. -// $text = $mw->replace( '', $text, 1 ); - - // Only keep the first one. -// $text = $mw->replace( '', $text ); -// } + data.Reset(); + // XO.MW: MW does TOC before others; XO does it at the same time // Now match and remove the rest of them -// $mwa = MagicWord::getDoubleUnderscoreArray(); - Match_and_remove(text, doubleunderscore_data); + // XO.MW.BGN: $this->mDoubleUnderscores = $mwa->matchAndRemove( $text ); + int cur = src_bgn; + int prv = cur; + boolean dirty = false; + while (true) { + if (cur == src_end) { + if (dirty) { + bfr.Add_mid(src, prv, src_end); + } + break; + } - if (doubleunderscore_data.no_gallery) { - output__no_gallery = true; - } - if (doubleunderscore_data.no_toc && !force_toc_position) { - this.show_toc = false; - } - if ( doubleunderscore_data.hidden_cat - // && $this->mTitle->getNamespace() == NS_CATEGORY - ) { - //$this->addTrackingCategory( 'hidden-category-category' ); - } - // (T10068) Allow control over whether robots index a page. - // __INDEX__ always overrides __NOINDEX__, see T16899 - if (doubleunderscore_data.no_index // && $this->mTitle->canUseNoindex() - ) { - // $this->mOutput->setIndexPolicy( 'noindex' ); - // $this->addTrackingCategory( 'noindex-category' ); + byte b = src[cur]; + Object o = trie.Match_at_w_b0(trv, b, src, cur, src_end); + if (o == null) { + cur += gplx.core.intls.Utf8_.Len_of_char_by_1st_byte(b); + continue; + } + + dirty = true; + bfr.Add_mid(src, prv, cur); + byte tid = ((gplx.core.primitives.Byte_obj_val)o).Val(); + switch (tid) { + case Tid__toc: + // The position of __TOC__ needs to be recorded + boolean already_seen = !data.show_toc; + data.toc = true; + data.show_toc = true; + data.force_toc_position = true; + + if (already_seen) { // Set a placeholder. 
At the end we'll fill it in with the TOC. + bfr.Add_str_a7(""); + } + else { // Only keep the first one. XO.MW:ignore by not adding anything to bfr + } + break; + // XO.MW: MW adds boolean to hash_table; XO uses boolean props; note that "remove" is done by not adding to bfr + case Tid__no_toc: data.no_toc = true; break; + case Tid__no_gallery: data.no_gallery = true; break; + case Tid__force_toc: data.force_toc = true; break; + case Tid__no_edit_section: data.no_edit_section = true; break; + case Tid__new_section_link: data.new_section_link = true; break; + case Tid__hidden_cat: data.hidden_cat = true; break; + case Tid__index: data.index = true; break; + case Tid__no_index: data.no_index = true; break; + case Tid__static_redirect: data.static_redirect = true; break; + case Tid__no_title_convert: data.no_title_convert = true; break; + case Tid__no_content_convert: data.no_content_convert = true; break; + default: throw Err_.new_unhandled_default(tid); + } + cur = trv.Pos(); + prv = cur; } - if (doubleunderscore_data.index //&& $this->mTitle->canUseNoindex() - ) { - // $this->mOutput->setIndexPolicy( 'index' ); - // $this->addTrackingCategory( 'index-category' ); + // XO.MW.END: $this->mDoubleUnderscores = $mwa->matchAndRemove( $text ); + + if (data.no_toc && !data.force_toc_position) { + data.show_toc = false; } - // Cache all double underscores in the database - // foreach ( $this->mDoubleUnderscores as $key => $val ) { - // $this->mOutput->setProperty( $key, '' ); - // } - } -} -class Xomw_doubleunder_data { - public boolean no_gallery; - public boolean no_toc; - public boolean hidden_cat; - public boolean no_index; - public boolean index; - public void Reset() { - no_gallery = no_toc = hidden_cat = no_index = index = false; + // XO.MW.EDIT: hidden_cat, index, noindex are used to add to tracking category + if (dirty) + pbfr.Switch(); } + + private static final byte + Tid__no_toc = 0 + , Tid__no_gallery = 1 + , Tid__force_toc = 2 + , Tid__toc = 3 + , 
Tid__no_edit_section = 4 + , Tid__new_section_link = 5 + , Tid__hidden_cat = 6 + , Tid__index = 7 + , Tid__no_index = 8 + , Tid__static_redirect = 9 + , Tid__no_title_convert = 10 + , Tid__no_content_convert = 11 + ; } diff --git a/400_xowa/src/gplx/xowa/mws/parsers/doubleunders/Xomw_doubleunder_wkr__tst.java b/400_xowa/src/gplx/xowa/mws/parsers/doubleunders/Xomw_doubleunder_wkr__tst.java new file mode 100644 index 000000000..48c7c3e62 --- /dev/null +++ b/400_xowa/src/gplx/xowa/mws/parsers/doubleunders/Xomw_doubleunder_wkr__tst.java @@ -0,0 +1,50 @@ +/* +XOWA: the XOWA Offline Wiki Application +Copyright (C) 2012 gnosygnu@gmail.com + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as +published by the Free Software Foundation, either version 3 of the +License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ +package gplx.xowa.mws.parsers.doubleunders; import gplx.*; import gplx.xowa.*; import gplx.xowa.mws.*; import gplx.xowa.mws.parsers.*; +import org.junit.*; import gplx.core.tests.*; +public class Xomw_doubleunder_wkr__tst { + private final Xomw_doubleunder_wkr__fxt fxt = new Xomw_doubleunder_wkr__fxt(); + @Test public void No_match() {fxt.Test__parse("a b c" , "a b c");} + @Test public void Force_toc() {fxt.Test__parse("a __FORCETOC__ b" , "a b").Test__prop_y(fxt.data.force_toc);} + @Test public void Toc() {fxt.Test__parse("a __TOC__ b __TOC__ c" , "a b c").Test__prop_y(fxt.data.toc, fxt.data.show_toc, fxt.data.force_toc_position);} + @Test public void Notoc_only() {fxt.Test__parse("a __NOTOC__ b" , "a b").Test__prop_y(fxt.data.no_toc).Test__prop_n(fxt.data.show_toc);} // show_toc is false + @Test public void Notoc_w_toc() {fxt.Test__parse("a __TOC__ b __NOTOC__ c" , "a b c").Test__prop_y(fxt.data.toc, fxt.data.show_toc, fxt.data.force_toc_position);} // show_toc is true +} +class Xomw_doubleunder_wkr__fxt { + private final Xomw_parser_ctx pctx = new Xomw_parser_ctx(); + private final Xomw_parser_bfr pbfr = new Xomw_parser_bfr(); + private final Xomw_doubleunder_wkr wkr = new Xomw_doubleunder_wkr(); + public Xomw_doubleunder_data data; + public Xomw_doubleunder_wkr__fxt() { + wkr.Init_by_wiki(); + data = wkr.data; + } + public Xomw_doubleunder_wkr__fxt Test__parse(String src_str, String expd) { + byte[] src_bry = Bry_.new_u8(src_str); + wkr.Do_double_underscore(pctx, pbfr.Init(src_bry)); + Gftest.Eq__str(expd, pbfr.Rslt().To_str_and_clear(), src_str); + return this; + } + public Xomw_doubleunder_wkr__fxt Test__prop_y(boolean... ary) {return Test__prop(Bool_.Y, ary);} + public Xomw_doubleunder_wkr__fxt Test__prop_n(boolean... ary) {return Test__prop(Bool_.N, ary);} + private Xomw_doubleunder_wkr__fxt Test__prop(boolean expd, boolean... ary) { + for (boolean v : ary) + Gftest.Eq__bool(expd, v); + return this; + } +}