@@ -193,7 +196,7 @@
Create a text file in your xowa root folder called make_xowa.gfs with a text-editor.
- For Windows, Notepad++ is recommended
+ For Windows, Notepad++ is recommended, or any other text editor that does not save files with Windows line endings. (Do not use Notepad)
For other systems, you can use a text-editor like Atom, jEdit, or whatever you're most comfortable with
@@ -207,9 +210,11 @@
Run the following command. Make sure the jar path and jar file name match your installation (a Linux/macOS variant is sketched after this step)
-
- java -jar C:\xowa\xowa_windows_64.jar --app_mode cmd --cmd_file C:\xowa\make_xowa.gfs --show_license n --show_args n
-
+
+
+ java -jar C:\xowa\xowa_windows_64.jar --app_mode cmd --cmd_file C:\xowa\make_xowa.gfs --show_license n --show_args n
+
+
Wait for the script to complete
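 On Linux or macOS the same flags apply; only the jar name and paths change. A minimal sketch, assuming XOWA is unpacked to /home/user/xowa and the Linux 64-bit jar is named xowa_linux_64.jar (both are assumptions; check your own install for the actual folder and jar file):

     java -jar /home/user/xowa/xowa_linux_64.jar --app_mode cmd --cmd_file /home/user/xowa/make_xowa.gfs --show_license n --show_args n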
@@ -393,13 +398,36 @@ app.bldr.cmds {
// v2 html generator; allows for multi-threaded / multi-machine builds
add ('simple.wikipedia.org' , 'wiki.mass_parse.init') {cfg {ns_ids = '0|4|14|8';}}
+// uncomment the next line to resume parsing. See === Resuming === below
+// add ('simple.wikipedia.org' , 'wiki.mass_parse.resume');
+
// NOTE: must change manual_now
add ('simple.wikipedia.org' , 'wiki.mass_parse.exec') {
cfg {
- num_wkrs = 8; load_all_templates = 'y'; cleanup_interval = 50; hzip_enabled = 'y'; hdiff_enabled ='y'; manual_now = '2020-02-01 01:02:03';
+ // locks time to a specific value so all pages use the same time when calling Date.Now()
+ manual_now = '2020-02-01 01:02:03';
+
+ // number of threads; set to 1 to skip multi-threaded behavior
+ num_wkrs = 8;
+
+ // enables building full-text search indexes
+ indexer_enabled = 'y';
+
+ // optimization; loads all templates in memory instead of loading each one from disk
+ load_all_templates = 'y';
+
+ // optimization; loads all imglinks in memory instead of loading each one from disk
+ // an imglink maps a given image (File:Abc.png) to a repo (commons vs local wiki) as well as a rename
load_all_imglinks = 'y';
-
- // uncomment the following 3 lines if using the build script as a "worker" helping a "server"
+
+ // number of pages after which XOWA empties cache
+ cleanup_interval = 50;
+
+ // DEPRECATED: uncomment these 2 lines to use custom HTML zip compression
+ // hzip_enabled = 'y';
+ // hdiff_enabled = 'y';
+
+ // uncomment these 3 lines if using the build script as a "worker" helping a "server"
// num_pages_in_pool = 32000;
// mgr_url = '\\server_machine_name\xowa\wiki\en.wikipedia.org\tmp\xomp\';
// wkr_machine_name = 'worker_machine_1'
@@ -463,6 +491,50 @@ app.bldr.run;
+
+ Resuming
+
+
+ The wiki.mass_parse.exec command may take many hours. For English Wikipedia, it can take up to 5 days, even with 8 threads
+
+
+ During this time, the build may be interrupted in any of the following ways:
+
+
+
+ Manual: User presses Ctrl+C
+
+
+ Unanticipated: Process dies or machine shuts down
+
+
+
+ To resume the build, apply the following steps; a sketch of the edited script is shown after the command below
+
+
+
+ Comment out all commands before wiki.mass_parse.exec using a block comment
+
+
+ Place a /* before the line with 'util.cleanup'
+
+
+ Place a */ after the line with 'wiki.mass_parse.init'
+
+
+
+
+ Uncomment the line for 'wiki.mass_parse.resume'
+
+
+ Run the command line again
+
+
+
+
+ java -jar C:\xowa\xowa_windows_64.jar --app_mode cmd --cmd_file C:\xowa\make_xowa.gfs --show_license n --show_args n
+
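 For reference, a minimal sketch of how the commands section of make_xowa.gfs might look after applying these resume edits. It reuses the simple.wikipedia.org example from above; the lines elided with '...' stand for commands already present in your script and are not literal syntax:

     app.bldr.cmds {
     /*
         ... all original commands, from the line containing 'util.cleanup'
         ... down to and including the 'wiki.mass_parse.init' line, stay inside this block comment
     */
         // now uncommented: picks up parsing where the interrupted run stopped
         add ('simple.wikipedia.org' , 'wiki.mass_parse.resume');

         // NOTE: must change manual_now
         add ('simple.wikipedia.org' , 'wiki.mass_parse.exec') {
             // ... cfg block unchanged from the original script ...
         }

         // ... any commands after wiki.mass_parse.exec also stay unchanged ...
     }
     app.bldr.run;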
-   checked to automatically download images / retrieve images from database
-     ImageMagick and Inkscape must be installed (See Dev/File/Setup)
-     An internet connection must be available, or a local tarball must be set up. (See Archive/Usage/Offline_images)
+   checked: Read tab uses HTML databases. Note that this will be faster, but there may be some inaccuracies. (These inaccuracies are being worked on)
-   unchecked to leave images unprocessed
-   Cache
-     Minimum disk space used
-       Enter a minimum size for the cache to use (in MB)
-       This is an advanced configuration tweak. When the cache reaches its maximum size, it will delete files to free space. It will continue deleting files until the minimum size is reached.
-       For example:
-         If the max is set to 100 MB
-         ... and the current size of all files in the cache is 99.9 MB
-         ... and a 200 KB file is added
-         Then the cache size will be temporarily 100.1 MB
-         ... triggering the delete mechanism
-         ... which will reduce the current size of all files to 75 MB
+   unchecked: Read tab uses Wikitext. Note that this will be slower, but more accurate.
@@ -213,29 +151,117 @@
-   Maximum disk space used
+   HTML tab mode
-     Enter a maximum size for the cache to use (in MB)
+     Choose one of the following to show in the HTML tab:
+       Shown: HTML as generated by XOWA
+       SWT browser: HTML as currently shown in the SWT browser (useful for Special pages and JavaScript calls)
+       Saved for HTML DB: HTML as saved in the HTML database. Note that this is for developer purposes only.
+       Loaded for HTML DB: HTML as saved in the HTML database but post-processed with image URLs. Note that this is for developer purposes only.
+   Indicators
+     Enabled
+       Choose one of the following:
+         checked: Shows an indicator in the top-right corner noting whether the page comes from the wikitext or HTML databases
+         unchecked: Shows nothing
+     HTML when Wikitext DB
+       HTML to show for Wikitext databases
@@ -243,97 +269,29 @@
-   Reduce cache to min
+   HTML when HTML DB
-     Press to reduce the cache to the minimum now (typically 75 MB).