1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2026-03-02 03:49:30 +00:00

buildxowa_make_resume

This commit is contained in:
gnosygnu
2020-03-06 06:40:56 -05:00
parent bda10e7660
commit 4ea81258a1
8 changed files with 900 additions and 249 deletions

View File

@@ -86,59 +86,62 @@
<li class="toclevel-2 tocsection-6">
<a href="#make_wiki"><span class="tocnumber">3.3</span> <span class="toctext">make_wiki</span></a>
</li>
<li class="toclevel-2 tocsection-7">
<a href="#Resuming"><span class="tocnumber">3.4</span> <span class="toctext">Resuming</span></a>
</li>
</ul>
</li>
<li class="toclevel-1 tocsection-7">
<li class="toclevel-1 tocsection-8">
<a href="#Appendix"><span class="tocnumber">4</span> <span class="toctext">Appendix</span></a>
<ul>
<li class="toclevel-2 tocsection-8">
<li class="toclevel-2 tocsection-9">
<a href="#Requirements"><span class="tocnumber">4.1</span> <span class="toctext">Requirements</span></a>
<ul>
<li class="toclevel-3 tocsection-9">
<li class="toclevel-3 tocsection-10">
<a href="#Hardware"><span class="tocnumber">4.1.1</span> <span class="toctext">Hardware</span></a>
</li>
<li class="toclevel-3 tocsection-10">
<li class="toclevel-3 tocsection-11">
<a href="#Internet-connectivity"><span class="tocnumber">4.1.2</span> <span class="toctext">Internet-connectivity</span></a>
</li>
<li class="toclevel-3 tocsection-11">
<li class="toclevel-3 tocsection-12">
<a href="#Pre-existing_image_databases_for_your_wiki_(optional)"><span class="tocnumber">4.1.3</span> <span class="toctext">Pre-existing image databases for your wiki (optional)</span></a>
</li>
</ul>
</li>
<li class="toclevel-2 tocsection-12">
<li class="toclevel-2 tocsection-13">
<a href="#gfs_script"><span class="tocnumber">4.2</span> <span class="toctext">gfs script</span></a>
</li>
<li class="toclevel-2 tocsection-13">
<li class="toclevel-2 tocsection-14">
<a href="#Terms"><span class="tocnumber">4.3</span> <span class="toctext">Terms</span></a>
<ul>
<li class="toclevel-3 tocsection-14">
<li class="toclevel-3 tocsection-15">
<a href="#lnki"><span class="tocnumber">4.3.1</span> <span class="toctext">lnki</span></a>
</li>
<li class="toclevel-3 tocsection-15">
<li class="toclevel-3 tocsection-16">
<a href="#orig"><span class="tocnumber">4.3.2</span> <span class="toctext">orig</span></a>
</li>
<li class="toclevel-3 tocsection-16">
<li class="toclevel-3 tocsection-17">
<a href="#xfer"><span class="tocnumber">4.3.3</span> <span class="toctext">xfer</span></a>
</li>
<li class="toclevel-3 tocsection-17">
<li class="toclevel-3 tocsection-18">
<a href="#fsdb"><span class="tocnumber">4.3.4</span> <span class="toctext">fsdb</span></a>
</li>
</ul>
</li>
<li class="toclevel-2 tocsection-18">
<li class="toclevel-2 tocsection-19">
<a href="#Examples"><span class="tocnumber">4.4</span> <span class="toctext">Examples</span></a>
<ul>
<li class="toclevel-3 tocsection-19">
<li class="toclevel-3 tocsection-20">
<a href="#Simple_Wikipedia_example_with_documentation"><span class="tocnumber">4.4.1</span> <span class="toctext">Simple Wikipedia example with documentation</span></a>
</li>
<li class="toclevel-3 tocsection-20">
<li class="toclevel-3 tocsection-21">
<a href="#Script:_gnosygnu's_actual_English_Wikipedia_script_(dirty;_provided_for_reference_only)"><span class="tocnumber">4.4.2</span> <span class="toctext">Script: gnosygnu's actual English Wikipedia script (dirty; provided for reference only)</span></a>
</li>
</ul>
</li>
</ul>
</li>
<li class="toclevel-1 tocsection-21">
<li class="toclevel-1 tocsection-22">
<a href="#Change_log"><span class="tocnumber">5</span> <span class="toctext">Change log</span></a>
</li>
</ul>
@@ -193,7 +196,7 @@
Create a text file in your xowa root folder called <code>make_xowa.gfs</code> with a text-editor.
<ul>
<li>
For Windows, Notepad++ is recommended
For Windows, Notepad++ is recommended, or any other text editor that does not use Windows line endings. (Do not use Notepad.)
</li>
<li>
For other systems, you can use a text-editor like Atom, jEdit, or whatever you're most comfortable with
@@ -207,9 +210,11 @@
Run the following command. Make sure the jar path and jar file name match your installation
</li>
</ul>
<p>
<code>java -jar C:\xowa\xowa_windows_64.jar --app_mode cmd --cmd_file C:\xowa\make_xowa.gfs --show_license n --show_args n</code>
</p>
<dl>
<dd>
<code>java -jar C:\xowa\xowa_windows_64.jar --app_mode cmd --cmd_file C:\xowa\make_xowa.gfs --show_license n --show_args n</code>
</dd>
</dl>
<ul>
<li>
Wait for the script to complete
@@ -393,13 +398,36 @@ app.bldr.cmds {
// v2 html generator; allows for multi-threaded / multi-machine builds
add ('simple.wikipedia.org' , 'wiki.mass_parse.init') {cfg {ns_ids = '0|4|14|8';}}
// uncomment the next line to resume parsing. See === Resuming === below
// add ('simple.wikipedia.org' , 'wiki.mass_parse.resume');
// NOTE: must change manual_now
add ('simple.wikipedia.org' , 'wiki.mass_parse.exec') {
cfg {
num_wkrs = 8; load_all_templates = 'y'; cleanup_interval = 50; hzip_enabled = 'y'; hdiff_enabled ='y'; manual_now = '2020-02-01 01:02:03';
// locks time to a specific value so all pages use the same time when calling Date.Now()
manual_now = '2020-02-01 01:02:03';
// number of threads; set to 1 to skip multi-threaded behavior
num_wkrs = 8;
// enables building full-text search indexes
indexer_enabled = 'y';
// optimization; loads all templates in memory instead of loading each one from disk
load_all_templates = 'y';
// optimization; loads all imglinks in memory instead of loading each one from disk
// an imglink maps a given image (File:Abc.png) to a repo (commons vs local wiki) as well as a rename
load_all_imglinks = 'y';
// uncomment the following 3 lines if using the build script as a "worker" helping a "server"
// number of pages after which XOWA empties cache
cleanup_interval = 50;
// DEPRECATED: uncomment these 2 lines to use custom HTML zip compression
// hzip_enabled = 'y';
// hdiff_enabled ='y';
// uncomment these 3 lines if using the build script as a "worker" helping a "server"
// num_pages_in_pool = 32000;
// mgr_url = '\\server_machine_name\xowa\wiki\en.wikipedia.org\tmp\xomp\';
// wkr_machine_name = 'worker_machine_1'
@@ -463,6 +491,50 @@ app.bldr.run;
</ul>
</li>
</ul>
<h3>
<span class="mw-headline" id="Resuming">Resuming</span>
</h3>
<p>
The <code>wiki.mass_parse.exec</code> command may take many hours. For English Wikipedia, it can take up to 5 days, even with 8 threads.
</p>
<p>
During this time, the build can be interrupted by any of the following:
</p>
<ul>
<li>
Manual: User presses Ctrl+C
</li>
<li>
Unanticipated: Process dies or machine shuts down
</li>
</ul>
<p>
To resume the build, apply the following steps:
</p>
<ul>
<li>
Comment out all commands before <code>wiki.mass_parse.exec</code> using a block comment
<ul>
<li>
Place a <code>/*</code> before the line with 'util.cleanup'
</li>
<li>
Place a <code>*/</code> after the line with 'wiki.mass_parse.init'
</li>
</ul>
</li>
<li>
Uncomment the line for 'wiki.mass_parse.resume'
</li>
<li>
Run the command-line again
</li>
</ul>
<dl>
<dd>
<code>java -jar C:\xowa\xowa_windows_64.jar --app_mode cmd --cmd_file C:\xowa\make_xowa.gfs --show_license n --show_args n</code>
</dd>
</dl>
<h2>
<span class="mw-headline" id="Appendix">Appendix</span>
</h2>

View File

@@ -86,59 +86,62 @@
<li class="toclevel-2 tocsection-6">
<a href="#make_wiki"><span class="tocnumber">3.3</span> <span class="toctext">make_wiki</span></a>
</li>
<li class="toclevel-2 tocsection-7">
<a href="#Resuming"><span class="tocnumber">3.4</span> <span class="toctext">Resuming</span></a>
</li>
</ul>
</li>
<li class="toclevel-1 tocsection-7">
<li class="toclevel-1 tocsection-8">
<a href="#Appendix"><span class="tocnumber">4</span> <span class="toctext">Appendix</span></a>
<ul>
<li class="toclevel-2 tocsection-8">
<li class="toclevel-2 tocsection-9">
<a href="#Requirements"><span class="tocnumber">4.1</span> <span class="toctext">Requirements</span></a>
<ul>
<li class="toclevel-3 tocsection-9">
<li class="toclevel-3 tocsection-10">
<a href="#Hardware"><span class="tocnumber">4.1.1</span> <span class="toctext">Hardware</span></a>
</li>
<li class="toclevel-3 tocsection-10">
<li class="toclevel-3 tocsection-11">
<a href="#Internet-connectivity"><span class="tocnumber">4.1.2</span> <span class="toctext">Internet-connectivity</span></a>
</li>
<li class="toclevel-3 tocsection-11">
<li class="toclevel-3 tocsection-12">
<a href="#Pre-existing_image_databases_for_your_wiki_(optional)"><span class="tocnumber">4.1.3</span> <span class="toctext">Pre-existing image databases for your wiki (optional)</span></a>
</li>
</ul>
</li>
<li class="toclevel-2 tocsection-12">
<li class="toclevel-2 tocsection-13">
<a href="#gfs_script"><span class="tocnumber">4.2</span> <span class="toctext">gfs script</span></a>
</li>
<li class="toclevel-2 tocsection-13">
<li class="toclevel-2 tocsection-14">
<a href="#Terms"><span class="tocnumber">4.3</span> <span class="toctext">Terms</span></a>
<ul>
<li class="toclevel-3 tocsection-14">
<li class="toclevel-3 tocsection-15">
<a href="#lnki"><span class="tocnumber">4.3.1</span> <span class="toctext">lnki</span></a>
</li>
<li class="toclevel-3 tocsection-15">
<li class="toclevel-3 tocsection-16">
<a href="#orig"><span class="tocnumber">4.3.2</span> <span class="toctext">orig</span></a>
</li>
<li class="toclevel-3 tocsection-16">
<li class="toclevel-3 tocsection-17">
<a href="#xfer"><span class="tocnumber">4.3.3</span> <span class="toctext">xfer</span></a>
</li>
<li class="toclevel-3 tocsection-17">
<li class="toclevel-3 tocsection-18">
<a href="#fsdb"><span class="tocnumber">4.3.4</span> <span class="toctext">fsdb</span></a>
</li>
</ul>
</li>
<li class="toclevel-2 tocsection-18">
<li class="toclevel-2 tocsection-19">
<a href="#Examples"><span class="tocnumber">4.4</span> <span class="toctext">Examples</span></a>
<ul>
<li class="toclevel-3 tocsection-19">
<li class="toclevel-3 tocsection-20">
<a href="#Simple_Wikipedia_example_with_documentation"><span class="tocnumber">4.4.1</span> <span class="toctext">Simple Wikipedia example with documentation</span></a>
</li>
<li class="toclevel-3 tocsection-20">
<li class="toclevel-3 tocsection-21">
<a href="#Script:_gnosygnu's_actual_English_Wikipedia_script_(dirty;_provided_for_reference_only)"><span class="tocnumber">4.4.2</span> <span class="toctext">Script: gnosygnu's actual English Wikipedia script (dirty; provided for reference only)</span></a>
</li>
</ul>
</li>
</ul>
</li>
<li class="toclevel-1 tocsection-21">
<li class="toclevel-1 tocsection-22">
<a href="#Change_log"><span class="tocnumber">5</span> <span class="toctext">Change log</span></a>
</li>
</ul>
@@ -193,7 +196,7 @@
Create a text file in your xowa root folder called <code>make_xowa.gfs</code> with a text-editor.
<ul>
<li>
For Windows, Notepad++ is recommended
For Windows, Notepad++ is recommended, or any other text editor that does not use Windows line endings. (Do not use Notepad.)
</li>
<li>
For other systems, you can use a text-editor like Atom, jEdit, or whatever you're most comfortable with
@@ -207,9 +210,11 @@
Run the following command. Make sure the jar path and jar file name match your installation
</li>
</ul>
<p>
<code>java -jar C:\xowa\xowa_windows_64.jar --app_mode cmd --cmd_file C:\xowa\make_xowa.gfs --show_license n --show_args n</code>
</p>
<dl>
<dd>
<code>java -jar C:\xowa\xowa_windows_64.jar --app_mode cmd --cmd_file C:\xowa\make_xowa.gfs --show_license n --show_args n</code>
</dd>
</dl>
<ul>
<li>
Wait for the script to complete
@@ -393,13 +398,36 @@ app.bldr.cmds {
// v2 html generator; allows for multi-threaded / multi-machine builds
add ('simple.wikipedia.org' , 'wiki.mass_parse.init') {cfg {ns_ids = '0|4|14|8';}}
// uncomment the next line to resume parsing. See === Resuming === below
// add ('simple.wikipedia.org' , 'wiki.mass_parse.resume');
// NOTE: must change manual_now
add ('simple.wikipedia.org' , 'wiki.mass_parse.exec') {
cfg {
num_wkrs = 8; load_all_templates = 'y'; cleanup_interval = 50; hzip_enabled = 'y'; hdiff_enabled ='y'; manual_now = '2020-02-01 01:02:03';
// locks time to a specific value so all pages use the same time when calling Date.Now()
manual_now = '2020-02-01 01:02:03';
// number of threads; set to 1 to skip multi-threaded behavior
num_wkrs = 8;
// enables building full-text search indexes
indexer_enabled = 'y';
// optimization; loads all templates in memory instead of loading each one from disk
load_all_templates = 'y';
// optimization; loads all imglinks in memory instead of loading each one from disk
// an imglink maps a given image (File:Abc.png) to a repo (commons vs local wiki) as well as a rename
load_all_imglinks = 'y';
// uncomment the following 3 lines if using the build script as a "worker" helping a "server"
// number of pages after which XOWA empties cache
cleanup_interval = 50;
// DEPRECATED: uncomment these 2 lines to use custom HTML zip compression
// hzip_enabled = 'y';
// hdiff_enabled ='y';
// uncomment these 3 lines if using the build script as a "worker" helping a "server"
// num_pages_in_pool = 32000;
// mgr_url = '\\server_machine_name\xowa\wiki\en.wikipedia.org\tmp\xomp\';
// wkr_machine_name = 'worker_machine_1'
@@ -463,6 +491,50 @@ app.bldr.run;
</ul>
</li>
</ul>
<h3>
<span class="mw-headline" id="Resuming">Resuming</span>
</h3>
<p>
The <code>wiki.mass_parse.exec</code> command may take many hours. For English Wikipedia, it can take up to 5 days, even with 8 threads.
</p>
<p>
During this time, the build can be interrupted by any of the following:
</p>
<ul>
<li>
Manual: User presses Ctrl+C
</li>
<li>
Unanticipated: Process dies or machine shuts down
</li>
</ul>
<p>
To resume the build, apply the following steps:
</p>
<ul>
<li>
Comment out all commands before <code>wiki.mass_parse.exec</code> using a block comment
<ul>
<li>
Place a <code>/*</code> before the line with 'util.cleanup'
</li>
<li>
Place a <code>*/</code> after the line with 'wiki.mass_parse.init'
</li>
</ul>
</li>
<li>
Uncomment the line for 'wiki.mass_parse.resume'
</li>
<li>
Run the command-line again
</li>
</ul>
<dl>
<dd>
<code>java -jar C:\xowa\xowa_windows_64.jar --app_mode cmd --cmd_file C:\xowa\make_xowa.gfs --show_license n --show_args n</code>
</dd>
</dl>
<h2>
<span class="mw-headline" id="Appendix">Appendix</span>
</h2>