mirror of
https://github.com/gnosygnu/xowa.git
synced 2026-03-02 03:49:30 +00:00
update_make_page
This commit is contained in:
@@ -25,7 +25,18 @@
|
||||
<div id="mw-content-text" lang="en" dir="ltr" class="mw-content-ltr">
|
||||
|
||||
<p>
|
||||
XOWA can generate two types of dumps: file-dumps and html-dumps
|
||||
XOWA can make complete wikis which will have the following:
|
||||
</p>
|
||||
<ul>
|
||||
<li>
|
||||
All images downloaded offline
|
||||
</li>
|
||||
<li>
|
||||
All pages compiled into HTML (pages will load faster)
|
||||
</li>
|
||||
</ul>
|
||||
<p>
|
||||
This process is run by a custom command-line <code>make</code> script.
|
||||
</p>
|
||||
<p>
|
||||
<br>
|
||||
@@ -61,53 +72,74 @@
|
||||
<a href="#Overview"><span class="tocnumber">1</span> <span class="toctext">Overview</span></a>
|
||||
</li>
|
||||
<li class="toclevel-1 tocsection-2">
|
||||
<a href="#Requirements"><span class="tocnumber">2</span> <span class="toctext">Requirements</span></a>
|
||||
<a href="#Process"><span class="tocnumber">2</span> <span class="toctext">Process</span></a>
|
||||
</li>
|
||||
<li class="toclevel-1 tocsection-3">
|
||||
<a href="#Script"><span class="tocnumber">3</span> <span class="toctext">Script</span></a>
|
||||
<ul>
|
||||
<li class="toclevel-2 tocsection-3">
|
||||
<a href="#commons.wikimedia.org"><span class="tocnumber">2.1</span> <span class="toctext">commons.wikimedia.org</span></a>
|
||||
</li>
|
||||
<li class="toclevel-2 tocsection-4">
|
||||
<a href="#www.wikidata.org"><span class="tocnumber">2.2</span> <span class="toctext">www.wikidata.org</span></a>
|
||||
<a href="#make_commons"><span class="tocnumber">3.1</span> <span class="toctext">make_commons</span></a>
|
||||
</li>
|
||||
<li class="toclevel-2 tocsection-5">
|
||||
<a href="#Hardware"><span class="tocnumber">2.3</span> <span class="toctext">Hardware</span></a>
|
||||
<a href="#make_wikidata"><span class="tocnumber">3.2</span> <span class="toctext">make_wikidata</span></a>
|
||||
</li>
|
||||
<li class="toclevel-2 tocsection-6">
|
||||
<a href="#Internet-connectivity_(optional)"><span class="tocnumber">2.4</span> <span class="toctext">Internet-connectivity (optional)</span></a>
|
||||
</li>
|
||||
<li class="toclevel-2 tocsection-7">
|
||||
<a href="#Pre-existing_image_databases_for_your_wiki_(optional)"><span class="tocnumber">2.5</span> <span class="toctext">Pre-existing image databases for your wiki (optional)</span></a>
|
||||
<a href="#make_wiki"><span class="tocnumber">3.3</span> <span class="toctext">make_wiki</span></a>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toclevel-1 tocsection-8">
|
||||
<a href="#gfs"><span class="tocnumber">3</span> <span class="toctext">gfs</span></a>
|
||||
</li>
|
||||
<li class="toclevel-1 tocsection-9">
|
||||
<a href="#Terms"><span class="tocnumber">4</span> <span class="toctext">Terms</span></a>
|
||||
<li class="toclevel-1 tocsection-7">
|
||||
<a href="#Appendix"><span class="tocnumber">4</span> <span class="toctext">Appendix</span></a>
|
||||
<ul>
|
||||
<li class="toclevel-2 tocsection-10">
|
||||
<a href="#lnki"><span class="tocnumber">4.1</span> <span class="toctext">lnki</span></a>
|
||||
</li>
|
||||
<li class="toclevel-2 tocsection-11">
|
||||
<a href="#orig"><span class="tocnumber">4.2</span> <span class="toctext">orig</span></a>
|
||||
<li class="toclevel-2 tocsection-8">
|
||||
<a href="#Requirements"><span class="tocnumber">4.1</span> <span class="toctext">Requirements</span></a>
|
||||
<ul>
|
||||
<li class="toclevel-3 tocsection-9">
|
||||
<a href="#Hardware"><span class="tocnumber">4.1.1</span> <span class="toctext">Hardware</span></a>
|
||||
</li>
|
||||
<li class="toclevel-3 tocsection-10">
|
||||
<a href="#Internet-connectivity"><span class="tocnumber">4.1.2</span> <span class="toctext">Internet-connectivity</span></a>
|
||||
</li>
|
||||
<li class="toclevel-3 tocsection-11">
|
||||
<a href="#Pre-existing_image_databases_for_your_wiki_(optional)"><span class="tocnumber">4.1.3</span> <span class="toctext">Pre-existing image databases for your wiki (optional)</span></a>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toclevel-2 tocsection-12">
|
||||
<a href="#xfer"><span class="tocnumber">4.3</span> <span class="toctext">xfer</span></a>
|
||||
<a href="#gfs_script"><span class="tocnumber">4.2</span> <span class="toctext">gfs script</span></a>
|
||||
</li>
|
||||
<li class="toclevel-2 tocsection-13">
|
||||
<a href="#fsdb"><span class="tocnumber">4.4</span> <span class="toctext">fsdb</span></a>
|
||||
<a href="#Terms"><span class="tocnumber">4.3</span> <span class="toctext">Terms</span></a>
|
||||
<ul>
|
||||
<li class="toclevel-3 tocsection-14">
|
||||
<a href="#lnki"><span class="tocnumber">4.3.1</span> <span class="toctext">lnki</span></a>
|
||||
</li>
|
||||
<li class="toclevel-3 tocsection-15">
|
||||
<a href="#orig"><span class="tocnumber">4.3.2</span> <span class="toctext">orig</span></a>
|
||||
</li>
|
||||
<li class="toclevel-3 tocsection-16">
|
||||
<a href="#xfer"><span class="tocnumber">4.3.3</span> <span class="toctext">xfer</span></a>
|
||||
</li>
|
||||
<li class="toclevel-3 tocsection-17">
|
||||
<a href="#fsdb"><span class="tocnumber">4.3.4</span> <span class="toctext">fsdb</span></a>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toclevel-2 tocsection-18">
|
||||
<a href="#Examples"><span class="tocnumber">4.4</span> <span class="toctext">Examples</span></a>
|
||||
<ul>
|
||||
<li class="toclevel-3 tocsection-19">
|
||||
<a href="#Simple_Wikipedia_example_with_documentation"><span class="tocnumber">4.4.1</span> <span class="toctext">Simple Wikipedia example with documentation</span></a>
|
||||
</li>
|
||||
<li class="toclevel-3 tocsection-20">
|
||||
<a href="#Script:_gnosygnu's_actual_English_Wikipedia_script_(dirty;_provided_for_reference_only)"><span class="tocnumber">4.4.2</span> <span class="toctext">Script: gnosygnu's actual English Wikipedia script (dirty; provided for reference only)</span></a>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toclevel-1 tocsection-14">
|
||||
<a href="#Script:_Simple_Wikipedia_example_with_documentation"><span class="tocnumber">5</span> <span class="toctext">Script: Simple Wikipedia example with documentation</span></a>
|
||||
</li>
|
||||
<li class="toclevel-1 tocsection-15">
|
||||
<a href="#Script:_gnosygnu's_actual_English_Wikipedia_script_(dirty;_provided_for_reference_only)"><span class="tocnumber">6</span> <span class="toctext">Script: gnosygnu's actual English Wikipedia script (dirty; provided for reference only)</span></a>
|
||||
</li>
|
||||
<li class="toclevel-1 tocsection-16">
|
||||
<a href="#Change_log"><span class="tocnumber">7</span> <span class="toctext">Change log</span></a>
|
||||
<li class="toclevel-1 tocsection-21">
|
||||
<a href="#Change_log"><span class="tocnumber">5</span> <span class="toctext">Change log</span></a>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
@@ -115,64 +147,333 @@
|
||||
<span class="mw-headline" id="Overview">Overview</span>
|
||||
</h2>
|
||||
<p>
|
||||
The download-thumbs script downloads all thumbs for pages in a specific wiki. It works in the following way:
|
||||
The <code>make</code> script works in the following way:
|
||||
</p>
|
||||
<ul>
|
||||
<li>
|
||||
It loads a page.
|
||||
Loads the wikitext for a page.
|
||||
</li>
|
||||
<li>
|
||||
It converts the wikitext to HTML
|
||||
Converts the wikitext to HTML and saves it.
|
||||
</li>
|
||||
<li>
|
||||
Gathers a list of [[File]] links.
|
||||
</li>
|
||||
<li>
|
||||
Repeats for each page until there are no more pages
|
||||
</li>
|
||||
<li>
|
||||
Downloads the list of [[File]] to create the XOWA file databases.
|
||||
</li>
|
||||
</ul>
|
||||
<h2>
|
||||
<span class="mw-headline" id="Process">Process</span>
|
||||
</h2>
|
||||
<ul>
|
||||
<li>
|
||||
Open up a terminal
|
||||
<ul>
|
||||
<li>
|
||||
If thumb mode is enabled, it compiles a list of [[File]] links.
|
||||
On Windows, run <code>cmd</code>
|
||||
</li>
|
||||
<li>
|
||||
If HTML-dump mode is enabled, it saves the HTML into XOWA html databases
|
||||
On Linux / Mac OS X, run the Terminal app
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>
|
||||
It repeats until there are no more pages
|
||||
</li>
|
||||
<li>
|
||||
If thumb mode, it does the following additional steps
|
||||
Change to the xowa root directory
|
||||
<ul>
|
||||
<li>
|
||||
It analyzes the list of [[File]] links to come up with a unique list of thumbs.
|
||||
For example, if xowa is setup in <code>C:\xowa</code>, run <code>cd C:\xowa</code>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>
|
||||
Create a text file in your xowa root folder called <code>make_xowa.gfs</code> with a text-editor.
|
||||
<ul>
|
||||
<li>
|
||||
For Windows, Notepad++ is recommended
|
||||
</li>
|
||||
<li>
|
||||
It downloads the thumbs and creates the XOWA file databases.
|
||||
For other systems, you can use a text-editor like Atom, jEdit, or whatever you're most comfortable with
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>
|
||||
Copy each of the scripts below to the text file
|
||||
</li>
|
||||
<li>
|
||||
Run the following command. Make sure to match the jar path and jar file
|
||||
</li>
|
||||
</ul>
|
||||
<p>
|
||||
<code>java -jar C:\xowa\xowa_windows_64.jar --app_mode cmd --cmd_file C:\xowa\make_xowa.gfs --show_license n --show_args n</code>
|
||||
</p>
|
||||
<ul>
|
||||
<li>
|
||||
Wait for the script to complete
|
||||
</li>
|
||||
</ul>
|
||||
<h2>
|
||||
<span class="mw-headline" id="Script">Script</span>
|
||||
</h2>
|
||||
<p>
|
||||
The <code>make</code> script should be run in 3 parts:
|
||||
</p>
|
||||
<ol>
|
||||
<li>
|
||||
<code>make_commons</code> script: Builds <b>commons.wikimedia.org</b> which is needed to provide image metadata for the download
|
||||
</li>
|
||||
<li>
|
||||
<code>make_wikidata</code> script: Builds <b>www.wikidata.org</b> which needed for data from {{#property}} calls or Module code.
|
||||
</li>
|
||||
<li>
|
||||
<code>make_wiki</code> script: Build the actual wiki
|
||||
</li>
|
||||
</ol>
|
||||
<p>
|
||||
Note that other wikis can re-use the same commons and wikidata. For example, if you want to build enwiki and dewiki, you only need to build <code>make_commons</code> and <code>make_wikidata</code> once.
|
||||
</p>
|
||||
<h3>
|
||||
<span class="mw-headline" id="make_commons"><code>make_commons</code></span>
|
||||
</h3>
|
||||
<ul>
|
||||
<li>
|
||||
Copy the following into <code>make_xowa.gfs</code>
|
||||
</li>
|
||||
</ul>
|
||||
<pre class='code'>
|
||||
app.bldr.pause_at_end_('n');
|
||||
app.scripts.run_file_by_type('xowa_cfg_app');
|
||||
app.cfg.set_temp('app', 'xowa.app.web.enabled', 'y');
|
||||
app.cfg.set_temp('app', 'xowa.bldr.db.layout_size.text', '0');
|
||||
app.cfg.set_temp('app', 'xowa.bldr.db.layout_size.html', '0');
|
||||
app.cfg.set_temp('app', 'xowa.bldr.db.layout_size.file', '0');
|
||||
app.bldr.cmds {
|
||||
// build commons database; this only needs to be done once, whenever commons is updated
|
||||
add ('commons.wikimedia.org' , 'util.cleanup') {delete_all = 'y';}
|
||||
add ('commons.wikimedia.org' , 'util.download') {dump_type = 'pages-articles';}
|
||||
add ('commons.wikimedia.org' , 'util.download') {dump_type = 'page_props';}
|
||||
add ('commons.wikimedia.org' , 'util.download') {dump_type = 'image';}
|
||||
add ('commons.wikimedia.org' , 'text.init');
|
||||
add ('commons.wikimedia.org' , 'text.page');
|
||||
add ('commons.wikimedia.org' , 'text.term');
|
||||
add ('commons.wikimedia.org' , 'text.css');
|
||||
add ('commons.wikimedia.org' , 'wiki.page_props');
|
||||
add ('commons.wikimedia.org' , 'wiki.image');
|
||||
add ('commons.wikimedia.org' , 'file.page_regy') {build_commons = 'y'}
|
||||
add ('commons.wikimedia.org' , 'wiki.page_dump.make');
|
||||
add ('commons.wikimedia.org' , 'wiki.redirect') {commit_interval = 1000; progress_interval = 100; cleanup_interval = 100;}
|
||||
add ('commons.wikimedia.org' , 'util.cleanup') {delete_tmp = 'y'; delete_by_match('*.xml|*.sql|*.bz2|*.gz');}
|
||||
}
|
||||
app.bldr.run;
|
||||
</pre>
|
||||
<ul>
|
||||
<li>
|
||||
Run the script using the process above
|
||||
<ul>
|
||||
<li>
|
||||
For 2020-02, this script will take about 7 hours to complete and use 125 GB of disk space.
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
<h3>
|
||||
<span class="mw-headline" id="make_wikidata"><code>make_wikidata</code></span>
|
||||
</h3>
|
||||
<ul>
|
||||
<li>
|
||||
Copy the following into <code>make_xowa.gfs</code>
|
||||
</li>
|
||||
</ul>
|
||||
<pre class='code'>
|
||||
app.bldr.pause_at_end_('n');
|
||||
app.scripts.run_file_by_type('xowa_cfg_app');
|
||||
app.cfg.set_temp('app', 'xowa.app.web.enabled', 'y');
|
||||
app.cfg.set_temp('app', 'xowa.bldr.db.layout_size.text', '0');
|
||||
app.cfg.set_temp('app', 'xowa.bldr.db.layout_size.html', '0');
|
||||
app.cfg.set_temp('app', 'xowa.bldr.db.layout_size.file', '0');
|
||||
app.bldr.cmds {
|
||||
// build wikidata database; this only needs to be done once, whenever wikidata is updated
|
||||
add ('www.wikidata.org' , 'util.cleanup') {delete_all = 'y';}
|
||||
add ('www.wikidata.org' , 'util.download') {dump_type = 'pages-articles';}
|
||||
add ('www.wikidata.org' , 'util.download') {dump_type = 'categorylinks';}
|
||||
add ('www.wikidata.org' , 'util.download') {dump_type = 'page_props';}
|
||||
add ('www.wikidata.org' , 'util.download') {dump_type = 'image';}
|
||||
add ('www.wikidata.org' , 'text.init');
|
||||
add ('www.wikidata.org' , 'text.page');
|
||||
add ('www.wikidata.org' , 'text.term');
|
||||
add ('www.wikidata.org' , 'text.css');
|
||||
add ('www.wikidata.org' , 'wiki.page_props');
|
||||
add ('www.wikidata.org' , 'wiki.categorylinks');
|
||||
add ('www.wikidata.org' , 'util.cleanup') {delete_tmp = 'y'; delete_by_match('*.xml|*.sql|*.bz2|*.gz');}
|
||||
}
|
||||
app.bldr.run;
|
||||
</pre>
|
||||
<ul>
|
||||
<li>
|
||||
Run the script using the process above
|
||||
<ul>
|
||||
<li>
|
||||
For 2020-02, this script can take about 24 hours to complete and use 250 GB of disk space.
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
<h3>
|
||||
<span class="mw-headline" id="make_wiki"><code>make_wiki</code></span>
|
||||
</h3>
|
||||
<ul>
|
||||
<li>
|
||||
Copy the following into <code>make_xowa.gfs</code>
|
||||
</li>
|
||||
</ul>
|
||||
<pre class='code'>
|
||||
app.bldr.pause_at_end_('n');
|
||||
app.scripts.run_file_by_type('xowa_cfg_app');
|
||||
app.cfg.set_temp('app', 'xowa.app.web.enabled', 'y');
|
||||
app.cfg.set_temp('app', 'xowa.bldr.db.layout_size.text', '0');
|
||||
app.cfg.set_temp('app', 'xowa.bldr.db.layout_size.html', '0');
|
||||
app.cfg.set_temp('app', 'xowa.bldr.db.layout_size.file', '0');
|
||||
app.bldr.cmds {
|
||||
// build simple.wikipedia.org
|
||||
add ('simple.wikipedia.org' , 'util.cleanup') {delete_all = 'y';}
|
||||
add ('simple.wikipedia.org' , 'util.download') {dump_type = 'pages-articles';}
|
||||
add ('simple.wikipedia.org' , 'util.download') {dump_type = 'categorylinks';}
|
||||
add ('simple.wikipedia.org' , 'util.download') {dump_type = 'page_props';}
|
||||
add ('simple.wikipedia.org' , 'util.download') {dump_type = 'image';}
|
||||
add ('simple.wikipedia.org' , 'util.download') {dump_type = 'pagelinks';} // needed for sorting search results by PageRank
|
||||
add ('simple.wikipedia.org' , 'util.download') {dump_type = 'imagelinks';}
|
||||
add ('simple.wikipedia.org' , 'text.init');
|
||||
add ('simple.wikipedia.org' , 'text.page') {
|
||||
// calculate redirect_id for #REDIRECT pages. needed for html databases
|
||||
redirect_id_enabled = 'y';
|
||||
}
|
||||
add ('simple.wikipedia.org' , 'text.search');
|
||||
|
||||
// upload desktop css
|
||||
add ('simple.wikipedia.org' , 'text.css');
|
||||
|
||||
// upload mobile css
|
||||
add ('simple.wikipedia.org' , 'text.css') {css_key = 'xowa.mobile'; /* css_dir = 'C:\xowa\user\anonymous\wiki\simple.wikipedia.org-mobile\html\'; */}
|
||||
|
||||
add ('simple.wikipedia.org' , 'text.term');
|
||||
|
||||
add ('simple.wikipedia.org' , 'wiki.page_props');
|
||||
add ('simple.wikipedia.org' , 'wiki.categorylinks');
|
||||
|
||||
// create local "page" tables in each "text" database for "lnki_temp"
|
||||
add ('simple.wikipedia.org' , 'wiki.page_dump.make');
|
||||
|
||||
// create a redirect table for pages in the File namespace
|
||||
add ('simple.wikipedia.org' , 'wiki.redirect') {commit_interval = 1000; progress_interval = 100; cleanup_interval = 100;}
|
||||
|
||||
// create an "image" table to get the metadata for all files in the current wiki
|
||||
add ('simple.wikipedia.org' , 'wiki.image');
|
||||
|
||||
// create an "imagelinks" table to find out which images are used for the wiki
|
||||
add ('simple.wikipedia.org' , 'wiki.imagelinks');
|
||||
|
||||
// parse all page-to-page links
|
||||
add ('simple.wikipedia.org' , 'wiki.page_link');
|
||||
|
||||
// calculate a score for each page using the page-to-page links
|
||||
add ('simple.wikipedia.org' , 'search.page__page_score') {iteration_max = 100;}
|
||||
|
||||
// update link score statistics for the search tables
|
||||
add ('simple.wikipedia.org' , 'search.link__link_score') {page_rank_enabled = 'y';}
|
||||
|
||||
// update word count statistics for the search_word table
|
||||
add ('simple.wikipedia.org' , 'search.word__link_count');
|
||||
|
||||
// cleanup all downloaded files as well as temporary files
|
||||
add ('simple.wikipedia.org' , 'util.cleanup') {delete_tmp = 'y'; delete_by_match('*.xml|*.sql|*.bz2|*.gz');}
|
||||
|
||||
// v2 html generator; allows for multi-threaded / multi-machine builds
|
||||
add ('simple.wikipedia.org' , 'wiki.mass_parse.init') {cfg {ns_ids = '0|4|14|8';}}
|
||||
|
||||
// NOTE: must change manual_now
|
||||
add ('simple.wikipedia.org' , 'wiki.mass_parse.exec') {
|
||||
cfg {
|
||||
num_wkrs = 8; load_all_templates = 'y'; cleanup_interval = 50; hzip_enabled = 'y'; hdiff_enabled ='y'; manual_now = '2020-02-01 01:02:03';
|
||||
load_all_imglinks = 'y';
|
||||
|
||||
// uncomment the following 3 lines if using the build script as a "worker" helping a "server"
|
||||
// num_pages_in_pool = 32000;
|
||||
// mgr_url = '\\server_machine_name\xowa\wiki\en.wikipedia.org\tmp\xomp\';
|
||||
// wkr_machine_name = 'worker_machine_1'
|
||||
}
|
||||
}
|
||||
|
||||
// note that if multi-machine mode is enabled, all worker directories must be manually copied to the server directory (a build command will be added later)
|
||||
add ('simple.wikipedia.org' , 'wiki.mass_parse.make');
|
||||
|
||||
// aggregate the lnkis
|
||||
add ('simple.wikipedia.org' , 'file.lnki_regy');
|
||||
|
||||
// generate orig metadata for files in the current wiki (for example, for pages in en.wikipedia.org/wiki/File:*)
|
||||
add ('simple.wikipedia.org' , 'file.page_regy') {build_commons = 'n';}
|
||||
|
||||
// generate all orig metadata for all lnkis
|
||||
add ('simple.wikipedia.org' , 'file.orig_regy');
|
||||
|
||||
// generate list of files to download based on "orig_regy" and XOWA image code
|
||||
add ('simple.wikipedia.org' , 'file.xfer_temp.thumb');
|
||||
|
||||
// aggregate list one more time
|
||||
add ('simple.wikipedia.org' , 'file.xfer_regy');
|
||||
|
||||
// identify images that have already been downloaded
|
||||
add ('simple.wikipedia.org' , 'file.xfer_regy_update');
|
||||
|
||||
// download images. This step may also take a long time, depending on how many images are needed
|
||||
add ('simple.wikipedia.org' , 'file.fsdb_make') {
|
||||
commit_interval = 1000; progress_interval = 200; select_interval = 10000;
|
||||
ns_ids = '0|4|14';
|
||||
|
||||
// specify whether original wiki databases are v1 (.sqlite3) or v2 (.xowa)
|
||||
src_bin_mgr__fsdb_version = 'v1';
|
||||
|
||||
// always redownload certain files
|
||||
src_bin_mgr__fsdb_skip_wkrs = 'page_gt_1|small_size';
|
||||
|
||||
// allow downloads from wikimedia
|
||||
src_bin_mgr__wmf_enabled = 'y';
|
||||
}
|
||||
|
||||
// generate registry of original metadata by file title
|
||||
add ('simple.wikipedia.org' , 'file.orig_reg');
|
||||
|
||||
// drop page_dump tables
|
||||
add ('simple.wikipedia.org' , 'wiki.page_dump.drop');
|
||||
}
|
||||
app.bldr.run;
|
||||
</pre>
|
||||
<ul>
|
||||
<li>
|
||||
Change the <code>manual_now</code> above to match the first day of the current month. For example, if today is <code>2020-02-16</code>, change it to <code>manual_now = '2020-02-01 01:02:03'</code>.
|
||||
</li>
|
||||
<li>
|
||||
Run the script using the process above
|
||||
<ul>
|
||||
<li>
|
||||
For 2020-02, this script can take about 1 hour to complete and use 5 GB of disk space.
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
<p>
|
||||
The script for simple wikipedia is listed below.
|
||||
</p>
|
||||
<h2>
|
||||
<span class="mw-headline" id="Requirements">Requirements</span>
|
||||
<span class="mw-headline" id="Appendix">Appendix</span>
|
||||
</h2>
|
||||
<h3>
|
||||
<span class="mw-headline" id="commons.wikimedia.org">commons.wikimedia.org</span>
|
||||
<span class="mw-headline" id="Requirements">Requirements</span>
|
||||
</h3>
|
||||
<p>
|
||||
You will need the latest version of commons.wikimedia.org. Note that if you have an older version, you will have missing images or wrong size information.
|
||||
</p>
|
||||
<p>
|
||||
For example, if you have a commons.wikimedia.org from 2015-04-22 and are trying to import a 2015-05-17 English Wikipedia, then any new images added after 2015-04-22 will not be picked up.
|
||||
</p>
|
||||
<h3>
|
||||
<span class="mw-headline" id="www.wikidata.org">www.wikidata.org</span>
|
||||
</h3>
|
||||
<p>
|
||||
You also need to have the latest version of www.wikidata.org. Note that English Wikipedia and other wikis uses Wikidata through the {{#property}} call or Module code. If you have an earlier version, then data will be missing or out of date.
|
||||
</p>
|
||||
<h3>
|
||||
<h4>
|
||||
<span class="mw-headline" id="Hardware">Hardware</span>
|
||||
</h3>
|
||||
</h4>
|
||||
<p>
|
||||
You should have a recent-generation machine with relatively high-performance hardware, especially if you're planning to generate images for English Wikipedia.
|
||||
You should have a recent-generation machine with relatively high-performance hardware, especially if you're planning to run the <code>make</code> script for English Wikipedia.
|
||||
</p>
|
||||
<p>
|
||||
For context, here is my current machine setup for generating the image dumps:
|
||||
@@ -195,20 +496,20 @@
|
||||
(Note: The hardware was assembled in late 2013.)
|
||||
</p>
|
||||
<p>
|
||||
For English Wikipedia, it still takes about 50 hours for the entire process.
|
||||
For English Wikipedia, it takes about 50 hours for the entire process.
|
||||
</p>
|
||||
<h3>
|
||||
<span class="mw-headline" id="Internet-connectivity_(optional)">Internet-connectivity (optional)</span>
|
||||
</h3>
|
||||
<h4>
|
||||
<span class="mw-headline" id="Internet-connectivity">Internet-connectivity</span>
|
||||
</h4>
|
||||
<p>
|
||||
You should have a broadband connection to the internet. The script will need to download dump files from Wikimedia and some dump files (like English Wikipedia) will be in the 10s of GB.
|
||||
You should have a broadband connection to the internet. The script will need to download dump files from Wikimedia and some dump files (like English Wikipedia) will be in the tens of GB.
|
||||
</p>
|
||||
<p>
|
||||
You can opt to download these files separately and place them in the appropriate location beforehand. However, the script below assumes that the machine is always online. If you are offline, you will need to comment the "util.download" lines yourself.
|
||||
<br>
|
||||
</p>
|
||||
<h3>
|
||||
<h4>
|
||||
<span class="mw-headline" id="Pre-existing_image_databases_for_your_wiki_(optional)">Pre-existing image databases for your wiki (optional)</span>
|
||||
</h3>
|
||||
</h4>
|
||||
<p>
|
||||
XOWA will automatically re-use the images from existing image databases so that you do not have to redownload them. This is particularly useful for large wikis where redownloading millions of images would be unwanted.
|
||||
</p>
|
||||
@@ -223,9 +524,9 @@
|
||||
If you have v2 image databases, they should be placed in <code>/xowa/wiki/wiki_domain/prv</code>. For example, English Wikipedia should have <code>/xowa/wiki/en.wikipedia.org/prv/en.wikipedia.org-file-ns.000-db.001.xowa</code>
|
||||
</li>
|
||||
</ul>
|
||||
<h2>
|
||||
<span class="mw-headline" id="gfs">gfs</span>
|
||||
</h2>
|
||||
<h3>
|
||||
<span class="mw-headline" id="gfs_script">gfs script</span>
|
||||
</h3>
|
||||
<p>
|
||||
The script is written in the <code>gfs</code> format. This is a custom scripting format specific to XOWA. It is similar to JSON, but also supports commenting.
|
||||
</p>
|
||||
@@ -258,45 +559,48 @@
|
||||
Statements are grouped with curly braces. ({}). For example: <code>group {procedure1; procedure2; procedure3;}</code>
|
||||
</li>
|
||||
</ul>
|
||||
<h2>
|
||||
<h3>
|
||||
<span class="mw-headline" id="Terms">Terms</span>
|
||||
</h2>
|
||||
<h3>
|
||||
</h3>
|
||||
<h4>
|
||||
<span class="mw-headline" id="lnki">lnki</span>
|
||||
</h3>
|
||||
</h4>
|
||||
<p>
|
||||
A <code>lnki</code> is short for "<b>l</b>i<b>nk</b> <b>i</b>nternal". It refers to all wikitext with the double bracket syntax: [[A]]. A more elaborate example for files would be [[File:A.png|thumb|200x300px|upright=.80]]. Note that the abbreviation was chosen to differentiate it from <code>lnke</code> which is short for "<b>l</b>i<b>nk</b> <b>e</b>nternal". For the purposes of the script, all lnki data comes from the current wiki's data dump
|
||||
A <code>lnki</code> is short for "<b>l</b>i<b>nk</b> <b>i</b>nternal". It refers to all wikitext with the double bracket syntax: [[A]]. A more elaborate example for files would be [[File:A.png|thumb|200x300px|upright=.80]]. Note that the abbreviation was chosen to differentiate it from <code>lnke</code> which is short for "<b>l</b>i<b>nk</b> <b>e</b>nternal".
|
||||
</p>
|
||||
<h3>
|
||||
<p>
|
||||
For the purposes of the script, all lnki data comes from the wikitext in the current wiki's data dump
|
||||
</p>
|
||||
<h4>
|
||||
<span class="mw-headline" id="orig">orig</span>
|
||||
</h3>
|
||||
<ul>
|
||||
<li>
|
||||
An <code>orig</code> is short for "<b>orig</b>inal file". It refers to the original file metadata. For the purposes of this script, all orig data comes from commons.wikimedia.org
|
||||
</li>
|
||||
</ul>
|
||||
<h3>
|
||||
</h4>
|
||||
<p>
|
||||
An <code>orig</code> is short for "<b>orig</b>inal file". It refers to the original file metadata.
|
||||
</p>
|
||||
<p>
|
||||
For the purposes of this script, all orig data comes from commons.wikimedia.org
|
||||
</p>
|
||||
<h4>
|
||||
<span class="mw-headline" id="xfer">xfer</span>
|
||||
</h3>
|
||||
<ul>
|
||||
<li>
|
||||
An <code>xfer</code> is short for "transfer file". It refers to the actual file to be downloaded.
|
||||
</li>
|
||||
</ul>
|
||||
<h3>
|
||||
</h4>
|
||||
<p>
|
||||
An <code>xfer</code> is short for "transfer file". It refers to the actual file to be downloaded.
|
||||
</p>
|
||||
<h4>
|
||||
<span class="mw-headline" id="fsdb">fsdb</span>
|
||||
</h3>
|
||||
<ul>
|
||||
<li>
|
||||
The <code>fsdb</code> is short for "<b>f</b>ile <b>s</b>ystem <b>d</b>ata<b>b</b>ase". It refers to the internal table format of the XOWA image databases.
|
||||
</li>
|
||||
</ul>
|
||||
</h4>
|
||||
<p>
|
||||
The <code>fsdb</code> is short for "<b>f</b>ile <b>s</b>ystem <b>d</b>ata<b>b</b>ase". It refers to the file as it is stored in the internal table format of the XOWA image databases.
|
||||
</p>
|
||||
<p>
|
||||
<br>
|
||||
</p>
|
||||
<h2>
|
||||
<span class="mw-headline" id="Script:_Simple_Wikipedia_example_with_documentation">Script: Simple Wikipedia example with documentation</span>
|
||||
</h2>
|
||||
<h3>
|
||||
<span class="mw-headline" id="Examples">Examples</span>
|
||||
</h3>
|
||||
<h4>
|
||||
<span class="mw-headline" id="Simple_Wikipedia_example_with_documentation">Simple Wikipedia example with documentation</span>
|
||||
</h4>
|
||||
<pre class='code'>
|
||||
app.bldr.pause_at_end_('n');
|
||||
app.scripts.run_file_by_type('xowa_cfg_app');
|
||||
@@ -485,9 +789,9 @@ app.bldr.cmds {
|
||||
}
|
||||
app.bldr.run;
|
||||
</pre>
|
||||
<h2>
|
||||
<h4>
|
||||
<span class="mw-headline" id="Script:_gnosygnu's_actual_English_Wikipedia_script_(dirty;_provided_for_reference_only)">Script: gnosygnu's actual English Wikipedia script (dirty; provided for reference only)</span>
|
||||
</h2>
|
||||
</h4>
|
||||
<pre class='code'>
|
||||
app.bldr.pause_at_end_('n');
|
||||
app.scripts.run_file_by_type('xowa_cfg_app');
|
||||
@@ -621,6 +925,9 @@ app.bldr.run;
|
||||
<li>
|
||||
2017-02-02: updated script for multi-threaded version and new options
|
||||
</li>
|
||||
<li>
|
||||
2020-02-16: rewrote page to provide more explicit step-by-steps. Moved content to glossary
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -25,7 +25,18 @@
|
||||
<div id="mw-content-text" lang="en" dir="ltr" class="mw-content-ltr">
|
||||
|
||||
<p>
|
||||
XOWA can generate two types of dumps: file-dumps and html-dumps
|
||||
XOWA can make complete wikis which will have the following:
|
||||
</p>
|
||||
<ul>
|
||||
<li>
|
||||
All images downloaded offline
|
||||
</li>
|
||||
<li>
|
||||
All pages compiled into HTML (pages will load faster)
|
||||
</li>
|
||||
</ul>
|
||||
<p>
|
||||
This process is run by a custom command-line <code>make</code> script.
|
||||
</p>
|
||||
<p>
|
||||
<br>
|
||||
@@ -61,53 +72,74 @@
|
||||
<a href="#Overview"><span class="tocnumber">1</span> <span class="toctext">Overview</span></a>
|
||||
</li>
|
||||
<li class="toclevel-1 tocsection-2">
|
||||
<a href="#Requirements"><span class="tocnumber">2</span> <span class="toctext">Requirements</span></a>
|
||||
<a href="#Process"><span class="tocnumber">2</span> <span class="toctext">Process</span></a>
|
||||
</li>
|
||||
<li class="toclevel-1 tocsection-3">
|
||||
<a href="#Script"><span class="tocnumber">3</span> <span class="toctext">Script</span></a>
|
||||
<ul>
|
||||
<li class="toclevel-2 tocsection-3">
|
||||
<a href="#commons.wikimedia.org"><span class="tocnumber">2.1</span> <span class="toctext">commons.wikimedia.org</span></a>
|
||||
</li>
|
||||
<li class="toclevel-2 tocsection-4">
|
||||
<a href="#www.wikidata.org"><span class="tocnumber">2.2</span> <span class="toctext">www.wikidata.org</span></a>
|
||||
<a href="#make_commons"><span class="tocnumber">3.1</span> <span class="toctext">make_commons</span></a>
|
||||
</li>
|
||||
<li class="toclevel-2 tocsection-5">
|
||||
<a href="#Hardware"><span class="tocnumber">2.3</span> <span class="toctext">Hardware</span></a>
|
||||
<a href="#make_wikidata"><span class="tocnumber">3.2</span> <span class="toctext">make_wikidata</span></a>
|
||||
</li>
|
||||
<li class="toclevel-2 tocsection-6">
|
||||
<a href="#Internet-connectivity_(optional)"><span class="tocnumber">2.4</span> <span class="toctext">Internet-connectivity (optional)</span></a>
|
||||
</li>
|
||||
<li class="toclevel-2 tocsection-7">
|
||||
<a href="#Pre-existing_image_databases_for_your_wiki_(optional)"><span class="tocnumber">2.5</span> <span class="toctext">Pre-existing image databases for your wiki (optional)</span></a>
|
||||
<a href="#make_wiki"><span class="tocnumber">3.3</span> <span class="toctext">make_wiki</span></a>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toclevel-1 tocsection-8">
|
||||
<a href="#gfs"><span class="tocnumber">3</span> <span class="toctext">gfs</span></a>
|
||||
</li>
|
||||
<li class="toclevel-1 tocsection-9">
|
||||
<a href="#Terms"><span class="tocnumber">4</span> <span class="toctext">Terms</span></a>
|
||||
<li class="toclevel-1 tocsection-7">
|
||||
<a href="#Appendix"><span class="tocnumber">4</span> <span class="toctext">Appendix</span></a>
|
||||
<ul>
|
||||
<li class="toclevel-2 tocsection-10">
|
||||
<a href="#lnki"><span class="tocnumber">4.1</span> <span class="toctext">lnki</span></a>
|
||||
</li>
|
||||
<li class="toclevel-2 tocsection-11">
|
||||
<a href="#orig"><span class="tocnumber">4.2</span> <span class="toctext">orig</span></a>
|
||||
<li class="toclevel-2 tocsection-8">
|
||||
<a href="#Requirements"><span class="tocnumber">4.1</span> <span class="toctext">Requirements</span></a>
|
||||
<ul>
|
||||
<li class="toclevel-3 tocsection-9">
|
||||
<a href="#Hardware"><span class="tocnumber">4.1.1</span> <span class="toctext">Hardware</span></a>
|
||||
</li>
|
||||
<li class="toclevel-3 tocsection-10">
|
||||
<a href="#Internet-connectivity"><span class="tocnumber">4.1.2</span> <span class="toctext">Internet-connectivity</span></a>
|
||||
</li>
|
||||
<li class="toclevel-3 tocsection-11">
|
||||
<a href="#Pre-existing_image_databases_for_your_wiki_(optional)"><span class="tocnumber">4.1.3</span> <span class="toctext">Pre-existing image databases for your wiki (optional)</span></a>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toclevel-2 tocsection-12">
|
||||
<a href="#xfer"><span class="tocnumber">4.3</span> <span class="toctext">xfer</span></a>
|
||||
<a href="#gfs_script"><span class="tocnumber">4.2</span> <span class="toctext">gfs script</span></a>
|
||||
</li>
|
||||
<li class="toclevel-2 tocsection-13">
|
||||
<a href="#fsdb"><span class="tocnumber">4.4</span> <span class="toctext">fsdb</span></a>
|
||||
<a href="#Terms"><span class="tocnumber">4.3</span> <span class="toctext">Terms</span></a>
|
||||
<ul>
|
||||
<li class="toclevel-3 tocsection-14">
|
||||
<a href="#lnki"><span class="tocnumber">4.3.1</span> <span class="toctext">lnki</span></a>
|
||||
</li>
|
||||
<li class="toclevel-3 tocsection-15">
|
||||
<a href="#orig"><span class="tocnumber">4.3.2</span> <span class="toctext">orig</span></a>
|
||||
</li>
|
||||
<li class="toclevel-3 tocsection-16">
|
||||
<a href="#xfer"><span class="tocnumber">4.3.3</span> <span class="toctext">xfer</span></a>
|
||||
</li>
|
||||
<li class="toclevel-3 tocsection-17">
|
||||
<a href="#fsdb"><span class="tocnumber">4.3.4</span> <span class="toctext">fsdb</span></a>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toclevel-2 tocsection-18">
|
||||
<a href="#Examples"><span class="tocnumber">4.4</span> <span class="toctext">Examples</span></a>
|
||||
<ul>
|
||||
<li class="toclevel-3 tocsection-19">
|
||||
<a href="#Simple_Wikipedia_example_with_documentation"><span class="tocnumber">4.4.1</span> <span class="toctext">Simple Wikipedia example with documentation</span></a>
|
||||
</li>
|
||||
<li class="toclevel-3 tocsection-20">
|
||||
<a href="#Script:_gnosygnu's_actual_English_Wikipedia_script_(dirty;_provided_for_reference_only)"><span class="tocnumber">4.4.2</span> <span class="toctext">Script: gnosygnu's actual English Wikipedia script (dirty; provided for reference only)</span></a>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toclevel-1 tocsection-14">
|
||||
<a href="#Script:_Simple_Wikipedia_example_with_documentation"><span class="tocnumber">5</span> <span class="toctext">Script: Simple Wikipedia example with documentation</span></a>
|
||||
</li>
|
||||
<li class="toclevel-1 tocsection-15">
|
||||
<a href="#Script:_gnosygnu's_actual_English_Wikipedia_script_(dirty;_provided_for_reference_only)"><span class="tocnumber">6</span> <span class="toctext">Script: gnosygnu's actual English Wikipedia script (dirty; provided for reference only)</span></a>
|
||||
</li>
|
||||
<li class="toclevel-1 tocsection-16">
|
||||
<a href="#Change_log"><span class="tocnumber">7</span> <span class="toctext">Change log</span></a>
|
||||
<li class="toclevel-1 tocsection-21">
|
||||
<a href="#Change_log"><span class="tocnumber">5</span> <span class="toctext">Change log</span></a>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
@@ -115,64 +147,333 @@
|
||||
<span class="mw-headline" id="Overview">Overview</span>
|
||||
</h2>
|
||||
<p>
|
||||
The download-thumbs script downloads all thumbs for pages in a specific wiki. It works in the following way:
|
||||
The <code>make</code> script works in the following way:
|
||||
</p>
|
||||
<ul>
|
||||
<li>
|
||||
It loads a page.
|
||||
Loads the wikitext for a page.
|
||||
</li>
|
||||
<li>
|
||||
It converts the wikitext to HTML
|
||||
Converts the wikitext to HTML and saves it.
|
||||
</li>
|
||||
<li>
|
||||
Gathers a list of [[File]] links.
|
||||
</li>
|
||||
<li>
|
||||
Repeats for each page until there are no more pages
|
||||
</li>
|
||||
<li>
|
||||
Downloads the list of [[File]] to create the XOWA file databases.
|
||||
</li>
|
||||
</ul>
|
||||
<h2>
|
||||
<span class="mw-headline" id="Process">Process</span>
|
||||
</h2>
|
||||
<ul>
|
||||
<li>
|
||||
Open up a terminal
|
||||
<ul>
|
||||
<li>
|
||||
If thumb mode is enabled, it compiles a list of [[File]] links.
|
||||
On Windows, run <code>cmd</code>
|
||||
</li>
|
||||
<li>
|
||||
If HTML-dump mode is enabled, it saves the HTML into XOWA html databases
|
||||
On Linux / Mac OS X, run the Terminal app
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>
|
||||
It repeats until there are no more pages
|
||||
</li>
|
||||
<li>
|
||||
If thumb mode, it does the following additional steps
|
||||
Change to the xowa root directory
|
||||
<ul>
|
||||
<li>
|
||||
It analyzes the list of [[File]] links to come up with a unique list of thumbs.
|
||||
For example, if xowa is setup in <code>C:\xowa</code>, run <code>cd C:\xowa</code>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>
|
||||
Create a text file in your xowa root folder called <code>make_xowa.gfs</code> with a text-editor.
|
||||
<ul>
|
||||
<li>
|
||||
For Windows, Notepad++ is recommended
|
||||
</li>
|
||||
<li>
|
||||
It downloads the thumbs and creates the XOWA file databases.
|
||||
For other systems, you can use a text-editor like Atom, jEdit, or whatever you're most comfortable with
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>
|
||||
Copy each of the scripts below to the text file
|
||||
</li>
|
||||
<li>
|
||||
Run the following command. Make sure to match the jar path and jar file
|
||||
</li>
|
||||
</ul>
|
||||
<p>
|
||||
<code>java -jar C:\xowa\xowa_windows_64.jar --app_mode cmd --cmd_file C:\xowa\make_xowa.gfs --show_license n --show_args n</code>
|
||||
</p>
|
||||
<ul>
|
||||
<li>
|
||||
Wait for the script to complete
|
||||
</li>
|
||||
</ul>
|
||||
<h2>
|
||||
<span class="mw-headline" id="Script">Script</span>
|
||||
</h2>
|
||||
<p>
|
||||
The <code>make</code> script should be run in 3 parts:
|
||||
</p>
|
||||
<ol>
|
||||
<li>
|
||||
<code>make_commons</code> script: Builds <b>commons.wikimedia.org</b> which is needed to provide image metadata for the download
|
||||
</li>
|
||||
<li>
|
||||
<code>make_wikidata</code> script: Builds <b>www.wikidata.org</b> which is needed for data from {{#property}} calls or Module code.
|
||||
</li>
|
||||
<li>
|
||||
<code>make_wiki</code> script: Builds the actual wiki
|
||||
</li>
|
||||
</ol>
|
||||
<p>
|
||||
Note that other wikis can re-use the same commons and wikidata. For example, if you want to build enwiki and dewiki, you only need to build <code>make_commons</code> and <code>make_wikidata</code> once.
|
||||
</p>
|
||||
<h3>
|
||||
<span class="mw-headline" id="make_commons"><code>make_commons</code></span>
|
||||
</h3>
|
||||
<ul>
|
||||
<li>
|
||||
Copy the following into <code>make_xowa.gfs</code>
|
||||
</li>
|
||||
</ul>
|
||||
<pre class='code'>
|
||||
app.bldr.pause_at_end_('n');
|
||||
app.scripts.run_file_by_type('xowa_cfg_app');
|
||||
app.cfg.set_temp('app', 'xowa.app.web.enabled', 'y');
|
||||
app.cfg.set_temp('app', 'xowa.bldr.db.layout_size.text', '0');
|
||||
app.cfg.set_temp('app', 'xowa.bldr.db.layout_size.html', '0');
|
||||
app.cfg.set_temp('app', 'xowa.bldr.db.layout_size.file', '0');
|
||||
app.bldr.cmds {
|
||||
// build commons database; this only needs to be done once, whenever commons is updated
|
||||
add ('commons.wikimedia.org' , 'util.cleanup') {delete_all = 'y';}
|
||||
add ('commons.wikimedia.org' , 'util.download') {dump_type = 'pages-articles';}
|
||||
add ('commons.wikimedia.org' , 'util.download') {dump_type = 'page_props';}
|
||||
add ('commons.wikimedia.org' , 'util.download') {dump_type = 'image';}
|
||||
add ('commons.wikimedia.org' , 'text.init');
|
||||
add ('commons.wikimedia.org' , 'text.page');
|
||||
add ('commons.wikimedia.org' , 'text.term');
|
||||
add ('commons.wikimedia.org' , 'text.css');
|
||||
add ('commons.wikimedia.org' , 'wiki.page_props');
|
||||
add ('commons.wikimedia.org' , 'wiki.image');
|
||||
add ('commons.wikimedia.org' , 'file.page_regy') {build_commons = 'y'}
|
||||
add ('commons.wikimedia.org' , 'wiki.page_dump.make');
|
||||
add ('commons.wikimedia.org' , 'wiki.redirect') {commit_interval = 1000; progress_interval = 100; cleanup_interval = 100;}
|
||||
add ('commons.wikimedia.org' , 'util.cleanup') {delete_tmp = 'y'; delete_by_match('*.xml|*.sql|*.bz2|*.gz');}
|
||||
}
|
||||
app.bldr.run;
|
||||
</pre>
|
||||
<ul>
|
||||
<li>
|
||||
Run the script using the process above
|
||||
<ul>
|
||||
<li>
|
||||
For 2020-02, this script will take about 7 hours to complete and use 125 GB of disk space.
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
<h3>
|
||||
<span class="mw-headline" id="make_wikidata"><code>make_wikidata</code></span>
|
||||
</h3>
|
||||
<ul>
|
||||
<li>
|
||||
Copy the following into <code>make_xowa.gfs</code>
|
||||
</li>
|
||||
</ul>
|
||||
<pre class='code'>
|
||||
app.bldr.pause_at_end_('n');
|
||||
app.scripts.run_file_by_type('xowa_cfg_app');
|
||||
app.cfg.set_temp('app', 'xowa.app.web.enabled', 'y');
|
||||
app.cfg.set_temp('app', 'xowa.bldr.db.layout_size.text', '0');
|
||||
app.cfg.set_temp('app', 'xowa.bldr.db.layout_size.html', '0');
|
||||
app.cfg.set_temp('app', 'xowa.bldr.db.layout_size.file', '0');
|
||||
app.bldr.cmds {
|
||||
// build wikidata database; this only needs to be done once, whenever wikidata is updated
|
||||
add ('www.wikidata.org' , 'util.cleanup') {delete_all = 'y';}
|
||||
add ('www.wikidata.org' , 'util.download') {dump_type = 'pages-articles';}
|
||||
add ('www.wikidata.org' , 'util.download') {dump_type = 'categorylinks';}
|
||||
add ('www.wikidata.org' , 'util.download') {dump_type = 'page_props';}
|
||||
add ('www.wikidata.org' , 'util.download') {dump_type = 'image';}
|
||||
add ('www.wikidata.org' , 'text.init');
|
||||
add ('www.wikidata.org' , 'text.page');
|
||||
add ('www.wikidata.org' , 'text.term');
|
||||
add ('www.wikidata.org' , 'text.css');
|
||||
add ('www.wikidata.org' , 'wiki.page_props');
|
||||
add ('www.wikidata.org' , 'wiki.categorylinks');
|
||||
add ('www.wikidata.org' , 'util.cleanup') {delete_tmp = 'y'; delete_by_match('*.xml|*.sql|*.bz2|*.gz');}
|
||||
}
|
||||
app.bldr.run;
|
||||
</pre>
|
||||
<ul>
|
||||
<li>
|
||||
Run the script using the process above
|
||||
<ul>
|
||||
<li>
|
||||
For 2020-02, this script can take about 24 hours to complete and use 250 GB of disk space.
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
<h3>
|
||||
<span class="mw-headline" id="make_wiki"><code>make_wiki</code></span>
|
||||
</h3>
|
||||
<ul>
|
||||
<li>
|
||||
Copy the following into <code>make_xowa.gfs</code>
|
||||
</li>
|
||||
</ul>
|
||||
<pre class='code'>
|
||||
app.bldr.pause_at_end_('n');
|
||||
app.scripts.run_file_by_type('xowa_cfg_app');
|
||||
app.cfg.set_temp('app', 'xowa.app.web.enabled', 'y');
|
||||
app.cfg.set_temp('app', 'xowa.bldr.db.layout_size.text', '0');
|
||||
app.cfg.set_temp('app', 'xowa.bldr.db.layout_size.html', '0');
|
||||
app.cfg.set_temp('app', 'xowa.bldr.db.layout_size.file', '0');
|
||||
app.bldr.cmds {
|
||||
// build simple.wikipedia.org
|
||||
add ('simple.wikipedia.org' , 'util.cleanup') {delete_all = 'y';}
|
||||
add ('simple.wikipedia.org' , 'util.download') {dump_type = 'pages-articles';}
|
||||
add ('simple.wikipedia.org' , 'util.download') {dump_type = 'categorylinks';}
|
||||
add ('simple.wikipedia.org' , 'util.download') {dump_type = 'page_props';}
|
||||
add ('simple.wikipedia.org' , 'util.download') {dump_type = 'image';}
|
||||
add ('simple.wikipedia.org' , 'util.download') {dump_type = 'pagelinks';} // needed for sorting search results by PageRank
|
||||
add ('simple.wikipedia.org' , 'util.download') {dump_type = 'imagelinks';}
|
||||
add ('simple.wikipedia.org' , 'text.init');
|
||||
add ('simple.wikipedia.org' , 'text.page') {
|
||||
// calculate redirect_id for #REDIRECT pages. needed for html databases
|
||||
redirect_id_enabled = 'y';
|
||||
}
|
||||
add ('simple.wikipedia.org' , 'text.search');
|
||||
|
||||
// upload desktop css
|
||||
add ('simple.wikipedia.org' , 'text.css');
|
||||
|
||||
// upload mobile css
|
||||
add ('simple.wikipedia.org' , 'text.css') {css_key = 'xowa.mobile'; /* css_dir = 'C:\xowa\user\anonymous\wiki\simple.wikipedia.org-mobile\html\'; */}
|
||||
|
||||
add ('simple.wikipedia.org' , 'text.term');
|
||||
|
||||
add ('simple.wikipedia.org' , 'wiki.page_props');
|
||||
add ('simple.wikipedia.org' , 'wiki.categorylinks');
|
||||
|
||||
// create local "page" tables in each "text" database for "lnki_temp"
|
||||
add ('simple.wikipedia.org' , 'wiki.page_dump.make');
|
||||
|
||||
// create a redirect table for pages in the File namespace
|
||||
add ('simple.wikipedia.org' , 'wiki.redirect') {commit_interval = 1000; progress_interval = 100; cleanup_interval = 100;}
|
||||
|
||||
// create an "image" table to get the metadata for all files in the current wiki
|
||||
add ('simple.wikipedia.org' , 'wiki.image');
|
||||
|
||||
// create an "imagelinks" table to find out which images are used for the wiki
|
||||
add ('simple.wikipedia.org' , 'wiki.imagelinks');
|
||||
|
||||
// parse all page-to-page links
|
||||
add ('simple.wikipedia.org' , 'wiki.page_link');
|
||||
|
||||
// calculate a score for each page using the page-to-page links
|
||||
add ('simple.wikipedia.org' , 'search.page__page_score') {iteration_max = 100;}
|
||||
|
||||
// update link score statistics for the search tables
|
||||
add ('simple.wikipedia.org' , 'search.link__link_score') {page_rank_enabled = 'y';}
|
||||
|
||||
// update word count statistics for the search_word table
|
||||
add ('simple.wikipedia.org' , 'search.word__link_count');
|
||||
|
||||
// cleanup all downloaded files as well as temporary files
|
||||
add ('simple.wikipedia.org' , 'util.cleanup') {delete_tmp = 'y'; delete_by_match('*.xml|*.sql|*.bz2|*.gz');}
|
||||
|
||||
// v2 html generator; allows for multi-threaded / multi-machine builds
|
||||
add ('simple.wikipedia.org' , 'wiki.mass_parse.init') {cfg {ns_ids = '0|4|14|8';}}
|
||||
|
||||
// NOTE: must change manual_now
|
||||
add ('simple.wikipedia.org' , 'wiki.mass_parse.exec') {
|
||||
cfg {
|
||||
num_wkrs = 8; load_all_templates = 'y'; cleanup_interval = 50; hzip_enabled = 'y'; hdiff_enabled ='y'; manual_now = '2020-02-01 01:02:03';
|
||||
load_all_imglinks = 'y';
|
||||
|
||||
// uncomment the following 3 lines if using the build script as a "worker" helping a "server"
|
||||
// num_pages_in_pool = 32000;
|
||||
// mgr_url = '\\server_machine_name\xowa\wiki\en.wikipedia.org\tmp\xomp\';
|
||||
// wkr_machine_name = 'worker_machine_1'
|
||||
}
|
||||
}
|
||||
|
||||
// note that if multi-machine mode is enabled, all worker directories must be manually copied to the server directory (a build command will be added later)
|
||||
add ('simple.wikipedia.org' , 'wiki.mass_parse.make');
|
||||
|
||||
// aggregate the lnkis
|
||||
add ('simple.wikipedia.org' , 'file.lnki_regy');
|
||||
|
||||
// generate orig metadata for files in the current wiki (for example, for pages in en.wikipedia.org/wiki/File:*)
|
||||
add ('simple.wikipedia.org' , 'file.page_regy') {build_commons = 'n';}
|
||||
|
||||
// generate all orig metadata for all lnkis
|
||||
add ('simple.wikipedia.org' , 'file.orig_regy');
|
||||
|
||||
// generate list of files to download based on "orig_regy" and XOWA image code
|
||||
add ('simple.wikipedia.org' , 'file.xfer_temp.thumb');
|
||||
|
||||
// aggregate list one more time
|
||||
add ('simple.wikipedia.org' , 'file.xfer_regy');
|
||||
|
||||
// identify images that have already been downloaded
|
||||
add ('simple.wikipedia.org' , 'file.xfer_regy_update');
|
||||
|
||||
// download images. This step may also take a long time, depending on how many images are needed
|
||||
add ('simple.wikipedia.org' , 'file.fsdb_make') {
|
||||
commit_interval = 1000; progress_interval = 200; select_interval = 10000;
|
||||
ns_ids = '0|4|14';
|
||||
|
||||
// specify whether original wiki databases are v1 (.sqlite3) or v2 (.xowa)
|
||||
src_bin_mgr__fsdb_version = 'v1';
|
||||
|
||||
// always redownload certain files
|
||||
src_bin_mgr__fsdb_skip_wkrs = 'page_gt_1|small_size';
|
||||
|
||||
// allow downloads from wikimedia
|
||||
src_bin_mgr__wmf_enabled = 'y';
|
||||
}
|
||||
|
||||
// generate registry of original metadata by file title
|
||||
add ('simple.wikipedia.org' , 'file.orig_reg');
|
||||
|
||||
// drop page_dump tables
|
||||
add ('simple.wikipedia.org' , 'wiki.page_dump.drop');
|
||||
}
|
||||
app.bldr.run;
|
||||
</pre>
|
||||
<ul>
|
||||
<li>
|
||||
Change the <code>manual_now</code> above to match the first day of the current month. For example, if today is <code>2020-02-16</code>, change it to <code>manual_now = '2020-02-01 01:02:03'</code>.
|
||||
</li>
|
||||
<li>
|
||||
Run the script using the process above
|
||||
<ul>
|
||||
<li>
|
||||
For 2020-02, this script can take about 1 hour to complete and use 5 GB of disk space.
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
<p>
|
||||
The script for Simple Wikipedia is listed below.
|
||||
</p>
|
||||
<h2>
|
||||
<span class="mw-headline" id="Requirements">Requirements</span>
|
||||
<span class="mw-headline" id="Appendix">Appendix</span>
|
||||
</h2>
|
||||
<h3>
|
||||
<span class="mw-headline" id="commons.wikimedia.org">commons.wikimedia.org</span>
|
||||
<span class="mw-headline" id="Requirements">Requirements</span>
|
||||
</h3>
|
||||
<p>
|
||||
You will need the latest version of commons.wikimedia.org. Note that if you have an older version, you will have missing images or wrong size information.
|
||||
</p>
|
||||
<p>
|
||||
For example, if you have a commons.wikimedia.org from 2015-04-22 and are trying to import a 2015-05-17 English Wikipedia, then any new images added after 2015-04-22 will not be picked up.
|
||||
</p>
|
||||
<h3>
|
||||
<span class="mw-headline" id="www.wikidata.org">www.wikidata.org</span>
|
||||
</h3>
|
||||
<p>
|
||||
You also need to have the latest version of www.wikidata.org. Note that English Wikipedia and other wikis use Wikidata through the {{#property}} call or Module code. If you have an earlier version, then data will be missing or out of date.
|
||||
</p>
|
||||
<h3>
|
||||
<h4>
|
||||
<span class="mw-headline" id="Hardware">Hardware</span>
|
||||
</h3>
|
||||
</h4>
|
||||
<p>
|
||||
You should have a recent-generation machine with relatively high-performance hardware, especially if you're planning to generate images for English Wikipedia.
|
||||
You should have a recent-generation machine with relatively high-performance hardware, especially if you're planning to run the <code>make</code> script for English Wikipedia.
|
||||
</p>
|
||||
<p>
|
||||
For context, here is my current machine setup for generating the image dumps:
|
||||
@@ -195,20 +496,20 @@
|
||||
(Note: The hardware was assembled in late 2013.)
|
||||
</p>
|
||||
<p>
|
||||
For English Wikipedia, it still takes about 50 hours for the entire process.
|
||||
For English Wikipedia, it takes about 50 hours for the entire process.
|
||||
</p>
|
||||
<h3>
|
||||
<span class="mw-headline" id="Internet-connectivity_(optional)">Internet-connectivity (optional)</span>
|
||||
</h3>
|
||||
<h4>
|
||||
<span class="mw-headline" id="Internet-connectivity">Internet-connectivity</span>
|
||||
</h4>
|
||||
<p>
|
||||
You should have a broadband connection to the internet. The script will need to download dump files from Wikimedia and some dump files (like English Wikipedia) will be in the 10s of GB.
|
||||
You should have a broadband connection to the internet. The script will need to download dump files from Wikimedia and some dump files (like English Wikipedia) will be in the tens of GB.
|
||||
</p>
|
||||
<p>
|
||||
You can opt to download these files separately and place them in the appropriate location beforehand. However, the script below assumes that the machine is always online. If you are offline, you will need to comment the "util.download" lines yourself.
|
||||
<br>
|
||||
</p>
|
||||
<h3>
|
||||
<h4>
|
||||
<span class="mw-headline" id="Pre-existing_image_databases_for_your_wiki_(optional)">Pre-existing image databases for your wiki (optional)</span>
|
||||
</h3>
|
||||
</h4>
|
||||
<p>
|
||||
XOWA will automatically re-use the images from existing image databases so that you do not have to redownload them. This is particularly useful for large wikis where redownloading millions of images would be unwanted.
|
||||
</p>
|
||||
@@ -223,9 +524,9 @@
|
||||
If you have v2 image databases, they should be placed in <code>/xowa/wiki/wiki_domain/prv</code>. For example, English Wikipedia should have <code>/xowa/wiki/en.wikipedia.org/prv/en.wikipedia.org-file-ns.000-db.001.xowa</code>
|
||||
</li>
|
||||
</ul>
|
||||
<h2>
|
||||
<span class="mw-headline" id="gfs">gfs</span>
|
||||
</h2>
|
||||
<h3>
|
||||
<span class="mw-headline" id="gfs_script">gfs script</span>
|
||||
</h3>
|
||||
<p>
|
||||
The script is written in the <code>gfs</code> format. This is a custom scripting format specific to XOWA. It is similar to JSON, but also supports commenting.
|
||||
</p>
|
||||
@@ -258,45 +559,48 @@
|
||||
Statements are grouped with curly braces. ({}). For example: <code>group {procedure1; procedure2; procedure3;}</code>
|
||||
</li>
|
||||
</ul>
|
||||
<h2>
|
||||
<h3>
|
||||
<span class="mw-headline" id="Terms">Terms</span>
|
||||
</h2>
|
||||
<h3>
|
||||
</h3>
|
||||
<h4>
|
||||
<span class="mw-headline" id="lnki">lnki</span>
|
||||
</h3>
|
||||
</h4>
|
||||
<p>
|
||||
A <code>lnki</code> is short for "<b>l</b>i<b>nk</b> <b>i</b>nternal". It refers to all wikitext with the double bracket syntax: [[A]]. A more elaborate example for files would be [[File:A.png|thumb|200x300px|upright=.80]]. Note that the abbreviation was chosen to differentiate it from <code>lnke</code> which is short for "<b>l</b>i<b>nk</b> <b>e</b>xternal". For the purposes of the script, all lnki data comes from the current wiki's data dump
|
||||
A <code>lnki</code> is short for "<b>l</b>i<b>nk</b> <b>i</b>nternal". It refers to all wikitext with the double bracket syntax: [[A]]. A more elaborate example for files would be [[File:A.png|thumb|200x300px|upright=.80]]. Note that the abbreviation was chosen to differentiate it from <code>lnke</code> which is short for "<b>l</b>i<b>nk</b> <b>e</b>xternal".
|
||||
</p>
|
||||
<h3>
|
||||
<p>
|
||||
For the purposes of the script, all lnki data comes from the wikitext in the current wiki's data dump
|
||||
</p>
|
||||
<h4>
|
||||
<span class="mw-headline" id="orig">orig</span>
|
||||
</h3>
|
||||
<ul>
|
||||
<li>
|
||||
An <code>orig</code> is short for "<b>orig</b>inal file". It refers to the original file metadata. For the purposes of this script, all orig data comes from commons.wikimedia.org
|
||||
</li>
|
||||
</ul>
|
||||
<h3>
|
||||
</h4>
|
||||
<p>
|
||||
An <code>orig</code> is short for "<b>orig</b>inal file". It refers to the original file metadata.
|
||||
</p>
|
||||
<p>
|
||||
For the purposes of this script, all orig data comes from commons.wikimedia.org
|
||||
</p>
|
||||
<h4>
|
||||
<span class="mw-headline" id="xfer">xfer</span>
|
||||
</h3>
|
||||
<ul>
|
||||
<li>
|
||||
An <code>xfer</code> is short for "transfer file". It refers to the actual file to be downloaded.
|
||||
</li>
|
||||
</ul>
|
||||
<h3>
|
||||
</h4>
|
||||
<p>
|
||||
An <code>xfer</code> is short for "transfer file". It refers to the actual file to be downloaded.
|
||||
</p>
|
||||
<h4>
|
||||
<span class="mw-headline" id="fsdb">fsdb</span>
|
||||
</h3>
|
||||
<ul>
|
||||
<li>
|
||||
The <code>fsdb</code> is short for "<b>f</b>ile <b>s</b>ystem <b>d</b>ata<b>b</b>ase". It refers to the internal table format of the XOWA image databases.
|
||||
</li>
|
||||
</ul>
|
||||
</h4>
|
||||
<p>
|
||||
The <code>fsdb</code> is short for "<b>f</b>ile <b>s</b>ystem <b>d</b>ata<b>b</b>ase". It refers to the file as it is stored in the internal table format of the XOWA image databases.
|
||||
</p>
|
||||
<p>
|
||||
<br>
|
||||
</p>
|
||||
<h2>
|
||||
<span class="mw-headline" id="Script:_Simple_Wikipedia_example_with_documentation">Script: Simple Wikipedia example with documentation</span>
|
||||
</h2>
|
||||
<h3>
|
||||
<span class="mw-headline" id="Examples">Examples</span>
|
||||
</h3>
|
||||
<h4>
|
||||
<span class="mw-headline" id="Simple_Wikipedia_example_with_documentation">Simple Wikipedia example with documentation</span>
|
||||
</h4>
|
||||
<pre class='code'>
|
||||
app.bldr.pause_at_end_('n');
|
||||
app.scripts.run_file_by_type('xowa_cfg_app');
|
||||
@@ -485,9 +789,9 @@ app.bldr.cmds {
|
||||
}
|
||||
app.bldr.run;
|
||||
</pre>
|
||||
<h2>
|
||||
<h4>
|
||||
<span class="mw-headline" id="Script:_gnosygnu's_actual_English_Wikipedia_script_(dirty;_provided_for_reference_only)">Script: gnosygnu's actual English Wikipedia script (dirty; provided for reference only)</span>
|
||||
</h2>
|
||||
</h4>
|
||||
<pre class='code'>
|
||||
app.bldr.pause_at_end_('n');
|
||||
app.scripts.run_file_by_type('xowa_cfg_app');
|
||||
@@ -621,6 +925,9 @@ app.bldr.run;
|
||||
<li>
|
||||
2017-02-02: updated script for multi-threaded version and new options
|
||||
</li>
|
||||
<li>
|
||||
2020-02-16: rewrote page to provide more explicit step-by-steps. Moved content to glossary
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -89,7 +89,7 @@
|
||||
</h2>
|
||||
<ul>
|
||||
<li>
|
||||
Navigate to <a href="http://xowa.org/home/wiki/Special:XowaCfg%3Fgrp%3Dxowa.files.general" id="xolnki_2" title="Special:XowaCfg?grp=xowa.files.general">Options: Files - General</a> and check "Download enabled"
|
||||
Navigate to <a href="http://xowa.org/home/wiki/Special:XowaCfg%3Fgrp%3Dxowa.files.general" id="xolnki_2" title="Special:XowaCfg?grp=xowa.files.general" class="xowa-visited">Options: Files - General</a> and check "Download enabled"
|
||||
</li>
|
||||
<li>
|
||||
Restart XOWA and navigate to any page. Any images will be downloaded automatically.
|
||||
|
||||
Reference in New Issue
Block a user