Kiwix
Contents
Kiwix#
With Kiwix it is possible to have a copy of Wikipedia and of other websites and read them offline.
See also
Kiwix lets you access free knowledge – even offline 1
A collection of scripts I have written and/or adapted that I currently use on my systems as automated tasks 2
Base directory for downloading zim files 3
Download large file in python with requests - Stack Overflow 4
python - Performant replacement for IPython !shell-command magic - Stack Overflow 5
shutil — High-level file operations — Python 3.10.3 documentation 6
Setup#
install the dependencies
apt-get install python3-requests python3-bs4 python3-yaml aria2 kiwix-tools kiwix
install fpyutils. See reference
create a new user
useradd --system -s /bin/bash -U kiwix
passwd kiwix
usermod -aG jobs kiwix
create the jobs directories. See reference
mkdir -p /home/jobs/{scripts,services}/by-user/kiwix
create the
script
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# kiwix_manage.py
#
# Install as: /home/jobs/scripts/by-user/kiwix/kiwix_manage.py
#
# Copyright (C) 2020-2022 Franco Masotti (franco \D\o\T masotti {-A-T-} tutanota \D\o\T com)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
r"""kiwix_manage.py.

Download the most recent ZIM files listed on a remote index page and
serve a directory of ZIM files with ``kiwix-serve``.

Usage::

    kiwix_manage.py CONFIGURATION_FILE {--serve,--download}
"""

########
# Main #
########

import datetime
import pathlib
import re
import shlex
import shutil
import subprocess
import sys
import typing
import urllib.parse

import bs4
import fpyutils
import requests
import yaml

# Seconds to wait for the remote server (connection and every single read)
# before giving up, so that an unresponsive server cannot block the job
# forever.
REQUEST_TIMEOUT = 60


def separate_pattern_from_string(string: str, pattern: str) -> tuple:
    r"""Separate a pattern from a string.

    :param string: the string to be split.
    :param pattern: a regular expression.
    :returns: a tuple ``(string_without_pattern, component)`` as computed by
        ``remove_component`` and ``find_component``.
    """
    return remove_component(string, pattern), find_component(string, pattern)


def separate_pattern_from_strings(strings: list, pattern: str) -> dict:
    r"""Separate a batch of strings from their patterns.

    :param strings: a list of strings.
    :param pattern: a regular expression.
    :returns: a dictionary mapping the part of each string which precedes
        the pattern to the list of all the pattern components found for it.
    :raises TypeError: if an element of ``strings`` is not a string. The
        check is done on the whole list before any string is processed.
    """
    for string in strings:
        if not isinstance(string, str):
            raise TypeError('every element must be a string')

    elements = dict()
    for string in strings:
        common, component = separate_pattern_from_string(string, pattern)
        # Group all the components sharing the same name.
        elements.setdefault(common, list()).append(component)

    return elements


def filter_max_elements_in_dict_nested_lists(elements: dict) -> dict:
    r"""Given a dictionary with lists, keep the maximum element of each list.

    :param elements: a dictionary whose values are non-empty lists of
        mutually comparable objects.
    :returns: a new dictionary with the same keys and the maximum of each
        list as value.
    """
    return {key: max(values) for key, values in elements.items()}


def get_most_recent_elements(uris: list, date_regex_pattern: str,
                             date_format_string: str) -> dict:
    r"""Filter elements by date and return the most recent ones.

    :param uris: a list of file names containing a date component.
    :param date_regex_pattern: the regular expression matching the date
        component.
    :param date_format_string: the ``strptime`` format of the date component.
    :returns: a dictionary mapping each file name, stripped of everything
        from the date onwards, to its most recent date as a
        ``datetime.datetime`` object.
    :raises TypeError: if an element of ``uris`` is not a string.
    """
    # The type validation is done by separate_pattern_from_strings.
    elements = separate_pattern_from_strings(uris, date_regex_pattern)
    dated_elements = {
        common: [str_to_datetime(date, date_format_string) for date in dates]
        for common, dates in elements.items()
    }
    return filter_max_elements_in_dict_nested_lists(dated_elements)


def filter_uris_by_pattern(uris: list, pattern: str):
    r"""Filter URIs by regex pattern.

    :param uris: a list of strings.
    :param pattern: a regular expression.
    :returns: the list of URIs where the pattern matches exactly once
        (see ``find_component``).
    """
    return [uri for uri in uris if find_component(uri, pattern) is not None]


def compare_uris(local_uris: dict,
                 remote_uris: dict,
                 date_format_string: str = '%Y-%m') -> tuple:
    r"""Given two sets of URIs select the files to download and to delete.

    :param local_uris: dictionary mapping the date-less local file names to
        their dates.
    :param remote_uris: dictionary mapping the date-less remote file names
        to their dates.
    :param date_format_string: the ``strftime`` format used to rebuild the
        file names. Defaults to ``'%Y-%m'``.
    :returns: a tuple ``(files_to_download, files_to_delete)`` of lists of
        file names. A remote file is downloaded when it does not exist
        locally or when it is more recent than the local one. In the latter
        case the local file is marked for deletion.
    """
    files_to_download = list()
    files_to_delete = list()
    for name, remote_date in remote_uris.items():
        if name not in local_uris:
            files_to_download.append(
                rebuild_uri_with_date(name, remote_date, date_format_string))
        elif local_uris[name] < remote_date:
            # Only download fresh files.
            files_to_download.append(
                rebuild_uri_with_date(name, remote_date, date_format_string))
            files_to_delete.append(
                rebuild_uri_with_date(name, local_uris[name],
                                      date_format_string))

    return files_to_download, files_to_delete


def download_files(files_to_download: list, downloader: str,
                   downloader_args: str, root_url: str, file_directory: str):
    r"""Download a batch of files.

    :param files_to_download: the file names to download.
    :param downloader: the downloader name, see ``download_binary_file``.
    :param downloader_args: raw arguments passed to the downloader.
    :param root_url: the URL the file names are appended to.
    :param file_directory: the local directory where files are saved.
    """
    last_index = len(files_to_download) - 1
    for i, download in enumerate(files_to_download):
        # The temporary directory is shared by all the downloads of the
        # batch: ask for its removal only once the last file is done.
        download_binary_file(rebuild_uri(root_url, download),
                             rebuild_uri(file_directory, download),
                             downloader, downloader_args, 0o700,
                             i == last_index)


def delete_files(files_to_delete: list, file_directory: str):
    r"""Delete a batch of files.

    :param files_to_delete: the file names to delete.
    :param file_directory: the directory containing the files.
    """
    for delete in files_to_delete:
        delete_file(rebuild_uri(file_directory, delete))


#########
# Utils #
#########


def get_relative_path(path: str) -> str:
    r"""Get the last component of a path."""
    return str(pathlib.Path(path).name)


def get_relative_paths(paths: list) -> list:
    r"""Get the last components of a list of paths."""
    return [get_relative_path(path) for path in paths]


def get_last_path_component_from_url(url: str) -> str:
    r"""Get the last component of the path of a URL."""
    return get_relative_path(urllib.parse.urlsplit(url).path)


def remove_component(element: str, pattern: str) -> str:
    r"""Remove the date component from the name.

    :returns: the part of ``element`` which precedes the first match of
        ``pattern``, or the whole ``element`` if there is no match.
    """
    return re.split(pattern, element)[0]


def find_component(element: str, pattern: str) -> typing.Optional[str]:
    r"""Return the date component from the name.

    :returns: the match of ``pattern`` in ``element`` if there is exactly
        one, ``None`` otherwise (no match or ambiguous match).
    """
    matches = re.findall(pattern, element)
    return matches[0] if len(matches) == 1 else None


def str_to_datetime(date: str,
                    date_formatting_string: str) -> datetime.datetime:
    r"""Transform a string into a datetime object."""
    return datetime.datetime.strptime(date, date_formatting_string)


def datetime_to_str(date: datetime.datetime,
                    date_formatting_string: str) -> str:
    r"""Transform a datetime object into a string."""
    return date.strftime(date_formatting_string)


def rebuild_uri(uri_base: str, path: str) -> str:
    """Rebuild a URI by a trailing forward slash if necessary and a path.

    ..note: see https://stackoverflow.com/a/59818095
    """
    uri_base = uri_base if uri_base.endswith('/') else f"{uri_base}/"
    return uri_base + path


def rebuild_uri_with_date(uri_base: str,
                          date: datetime.datetime,
                          date_formatting_string: str,
                          extension: str = '.zim') -> str:
    r"""Rebuild the original URI which has been stripped from the date and file extension components.

    :param uri_base: the file name without date and extension.
    :param date: the date to be appended.
    :param date_formatting_string: the ``strftime`` format of the date.
    :param extension: the file extension, dot included.
    """
    return uri_base + datetime_to_str(date, date_formatting_string) + extension


def get_a_href_elements_from_url(url: str) -> list:
    r"""Given a url, download the file and, if it is an HTML file, find all "a href" elements.

    :returns: the list of the values of the ``href`` attributes.
    :raises requests.HTTPError: if the server answers with an error status.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Do not try to parse an error page.
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    # Anchors without an href attribute are skipped: they would yield None
    # which cannot be matched against a regular expression later on.
    return [link.get('href') for link in soup.find_all('a', href=True)]


def download_binary_file_requests(url: str, destination: str):
    r"""Download a binary file with Python Requests.

    :param url: the remote file.
    :param destination: the local file, overwritten if it exists.
    :raises requests.HTTPError: if the server answers with an error status.
    """
    # See https://stackoverflow.com/questions/16694907/download-large-file-in-python-with-requests/39217788#39217788
    #
    # Copyright (C) 2016 John Zwinck @ Stack Exchange (https://stackoverflow.com/questions/16694907/download-large-file-in-python-with-requests/39217788#39217788)
    # Copyright (C) 2020 Martijn Pieters @ Stack Exchange (https://stackoverflow.com/questions/16694907/download-large-file-in-python-with-requests/39217788#39217788)
    # Copyright (C) 2020 Franco Masotti (franco \D\o\T masotti {-A-T-} tutanota \D\o\T com)
    #
    # This script is licensed under a
    # Creative Commons Attribution-ShareAlike 4.0 International License.
    #
    # You should have received a copy of the license along with this
    # work. If not, see <http://creativecommons.org/licenses/by-sa/4.0/>.

    with requests.get(url, stream=True, timeout=REQUEST_TIMEOUT) as r:
        # Fail before creating the destination so that an HTTP error page
        # is never saved as if it were the requested file.
        r.raise_for_status()
        with open(destination, 'wb') as f:
            shutil.copyfileobj(r.raw, f)


def download_binary_file_aria2c(downloader_args: str,
                                parent_directory: str,
                                url: str,
                                destination: str,
                                temporary_directory: str = 'tmp',
                                delete_temporary_directory: bool = False):
    r"""Download a binary file with aria2.

    The file is first saved in a temporary directory placed inside
    ``parent_directory`` and moved to ``parent_directory`` only when aria2c
    reports success. The program exits with status 1 if aria2c fails.

    :param downloader_args: raw command line arguments for aria2c. These
        are inserted in the command as-is, so they must come from a trusted
        source.
    :param parent_directory: the directory that will contain the file.
    :param url: the remote file.
    :param destination: the name of the local file, without directories.
    :param temporary_directory: the name of the temporary directory. Only
        the last path component is used.
    :param delete_temporary_directory: remove the temporary directory after
        the file has been moved.
    """
    # File system paths are built from the raw values. Shell quoting is
    # applied only to the command string: a quoted string is not a valid
    # path for pathlib or shutil when quotes have been added to it.
    temporary_path = pathlib.Path(parent_directory,
                                  pathlib.Path(temporary_directory).name)
    downloaded_file = pathlib.Path(temporary_path, destination)

    # Save the file to a temporary file so that if the download is interrupted
    # the pipeline does not detect that the file exists.
    command = ('aria2c ' + downloader_args + ' --dir='
               + shlex.quote(str(temporary_path)) + ' --out='
               + shlex.quote(destination) + ' ' + shlex.quote(url))
    try:
        # NOTE(review): the returned value is treated as the exit status of
        # the command. Confirm against the fpyutils documentation.
        return_code = fpyutils.shell.execute_command_live_output(command)
    except subprocess.SubprocessError as e:
        print(e)
        raise

    if return_code != 0:
        sys.exit(1)

    try:
        shutil.move(str(downloaded_file), parent_directory)
    except shutil.Error as e:
        # Best effort: report the problem (for example a file with the same
        # name already in place) and leave the temporary directory intact.
        print(e)
        return

    if delete_temporary_directory:
        # See https://docs.python.org/3/library/shutil.html?highlight=shutil#shutil.rmtree.avoids_symlink_attacks
        if shutil.rmtree.avoids_symlink_attacks:
            try:
                shutil.rmtree(str(temporary_path))
            except shutil.Error as e:
                print(e)
        else:
            print('not removing ' + str(temporary_path)
                  + ': shutil.rmtree is vulnerable to symlink attacks'
                  + ' on this platform')


def get_parent_directory_name(path: str) -> str:
    r"""Get parent directory name."""
    return str(pathlib.Path(path).parent)


def pre_download_hooks(destination: str, permissions: int = 0o700):
    r"""Run selected actions before downloading the files.

    Create the directory which will contain ``destination``, along with the
    missing parents, if it does not exist.

    :param destination: the path of the file to be downloaded.
    :param permissions: the mode passed to ``mkdir`` for the new directory.
    """
    pathlib.Path(destination).parent.mkdir(mode=permissions,
                                           parents=True,
                                           exist_ok=True)


def post_download_hooks(path: str, permissions: int):
    r"""Run selected actions after downloading the files.

    :param path: the downloaded file.
    :param permissions: the mode to be set on the file, for example
        ``0o700``.
    """
    # Change file permissions.
    pathlib.Path(path).chmod(permissions)


def download_binary_file(url: str,
                         destination: str,
                         downloader: str = 'requests',
                         downloader_args: str = str(),
                         permissions: int = 0o700,
                         delete_temporary_directory: bool = False):
    r"""Download a binary file.

    :param url: the remote file.
    :param destination: the full path of the local file.
    :param downloader: either ``'requests'`` or ``'aria2c'``.
    :param downloader_args: raw arguments for the downloader. Used by
        aria2c only.
    :param permissions: the mode of the downloaded file and of the
        directory created to contain it.
    :param delete_temporary_directory: see ``download_binary_file_aria2c``.
        Used by aria2c only.
    :raises ValueError: if the downloader is not supported.
    """
    if downloader not in ['requests', 'aria2c']:
        raise ValueError('unsupported downloader: ' + str(downloader))

    print('Downloading ' + url + ' as ' + destination)
    pre_download_hooks(destination, permissions)
    if downloader == 'requests':
        download_binary_file_requests(url, destination)
    else:
        download_binary_file_aria2c(downloader_args,
                                    get_parent_directory_name(destination),
                                    url, get_relative_path(destination), 'tmp',
                                    delete_temporary_directory)
    post_download_hooks(destination, permissions)


def delete_file(file: str):
    r"""Delete a file."""
    print('Deleting ' + file)
    pathlib.Path(file).unlink()


def list_directory_files(directory: str) -> list:
    r"""Get a list of files in a directory.

    :returns: the paths of the regular files directly contained in
        ``directory``, or an empty list if it is not a directory.
    """
    p = pathlib.Path(directory)
    if not p.is_dir():
        return list()

    return [str(child) for child in p.iterdir() if child.is_file()]


def run_kiwix_server(url_root_location: str, threads: int, port: int,
                     directory: str, options: list):
    r"""Serve the ZIM files.

    :param url_root_location: value of the ``--urlRootLocation`` option.
    :param threads: value of the ``--threads`` option.
    :param port: value of the ``--port`` option.
    :param directory: the directory containing the ZIM files.
    :param options: a list of extra raw options for kiwix-serve.
    """
    quoted_options = ' '.join(shlex.quote(o) for o in options)
    # The '/*.zim' suffix is deliberately left unquoted: it is presumably
    # expanded by the shell which runs the command. Confirm against the
    # fpyutils documentation.
    command = (f'kiwix-serve --urlRootLocation {shlex.quote(url_root_location)}'
               f' --threads {shlex.quote(str(threads))}'
               f' --port {shlex.quote(str(port))}'
               f' {quoted_options} {shlex.quote(directory)}/*.zim')
    fpyutils.shell.execute_command_live_output(command)


def _synchronize_section(section: dict):
    r"""Download the fresh files of one download section and delete the stale ones."""
    date_pattern = section['regex patterns']['date']
    date_format = section['date transformation string']
    download_directory = section['download directory']
    # Make sure the URL ends with a forward slash.
    root_url = rebuild_uri(section['root url'], str())

    remote_uris = get_a_href_elements_from_url(root_url)
    remote_uris = filter_uris_by_pattern(
        remote_uris, section['regex patterns']['files to download'])
    remote_uris = filter_uris_by_pattern(remote_uris, date_pattern)
    remote_uris = get_relative_paths(remote_uris)
    most_recent_remote_uris = get_most_recent_elements(
        remote_uris, date_pattern, date_format)

    local_uris = list_directory_files(download_directory)
    local_uris = filter_uris_by_pattern(local_uris, date_pattern)
    local_uris = get_relative_paths(local_uris)
    most_recent_local_uris = get_most_recent_elements(
        local_uris, date_pattern, date_format)

    files_to_download, files_to_delete = compare_uris(
        most_recent_local_uris, most_recent_remote_uris, date_format)

    # Old files are removed only after the new ones have been downloaded.
    download_files(files_to_download, section['downloader']['name'],
                   section['downloader']['args'], root_url,
                   download_directory)
    delete_files(files_to_delete, download_directory)


def pipeline():
    r"""Run the pipeline.

    Read the configuration file path and the action, ``--serve`` or
    ``--download``, from the command line and run the action.
    """
    if len(sys.argv) < 3:
        sys.exit('usage: ' + sys.argv[0]
                 + ' CONFIGURATION_FILE {--serve,--download}')

    # The arguments are used as a file name and as a plain string: they never
    # reach a shell so they must not be shell-quoted.
    configuration_file = sys.argv[1]
    action = sys.argv[2]

    # Load the configuration.
    with open(configuration_file, 'r', encoding='utf-8') as f:
        config = yaml.load(f, Loader=yaml.SafeLoader)
    serve = config['serve']
    downloads = config['downloads']
    # Handle both a missing and an empty 'options' key.
    options = serve.get('options') or list()

    if action == '--serve':
        run_kiwix_server(serve['url root location'], serve['threads'],
                         serve['port'], serve['directory'], options)
    elif action == '--download':
        for section in downloads:
            _synchronize_section(downloads[section])
    else:
        sys.exit('unknown action: ' + action)


if __name__ == '__main__':
    pipeline()
create a
configuration file
/home/jobs/scripts/by-user/kiwix/kiwix_manage.yaml
#
# kiwix_manage.yaml
#
# Copyright (C) 2020-2022 Franco Masotti (franco \D\o\T masotti {-A-T-} tutanota \D\o\T com)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

serve:
    threads: 24
    port: 8888
    directory: '/data/WEBSERVER/kiwix'
    url root location: '/kiwix'

    # A list of raw options for kiwix-serve.
    # See
    # kiwix-serve --help
    options: []
    #    - '--verbose'

downloads:
    wikipedia en:
        root url: 'https://download.kiwix.org/zim/wikipedia'
        regex patterns:
            files to download: '^(wikipedia_en_physics_mini|wikipedia_en_computer_maxi|wikipedia_en_history_maxi|wikipedia_en_mathematics_maxi|wikipedia_en_medicine_maxi)'
            date: '\d\d\d\d-\d\d'
        date transformation string: '%Y-%m'
        download directory: '/data/WEBSERVER/kiwix'
        downloader:
            # Supported downloaders: {aria2c,requests}
            name: 'aria2c'
            args: '--continue=true --max-concurrent-downloads=3 --max-connection-per-server=3 --split=3 --min-split-size=1M --max-overall-download-limit=256K'
    wikiversity en:
        root url: 'https://download.kiwix.org/zim/wikiversity/'
        regex patterns:
            files to download: '^(wikiversity_en_all_maxi)'
            date: '\d\d\d\d-\d\d'
        date transformation string: '%Y-%m'
        download directory: '/data/WEBSERVER/kiwix'
        downloader:
            name: 'aria2c'
            args: '--continue=true --max-concurrent-downloads=3 --max-connection-per-server=3 --split=3 --min-split-size=1M --max-overall-download-limit=256K'
create the data directory which must be accessible by the kiwix user
mkdir /data/WEBSERVER/kiwix
chmod 700 /data/WEBSERVER/kiwix
chown kiwix:kiwix /data/WEBSERVER/kiwix
use this
Systemd service file
to serve the content
/home/jobs/services/by-user/kiwix/kiwix-manage.serve.service
#
# kiwix-manage.serve.service
#
# Copyright (C) 2020,2022 Franco Masotti (franco \D\o\T masotti {-A-T-} tutanota \D\o\T com)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

[Unit]
Description=Serve kiwix files
Wants=network.target
After=network.target

[Service]
Type=simple
ExecStart=/home/jobs/scripts/by-user/kiwix/kiwix_manage.py /home/jobs/scripts/by-user/kiwix/kiwix_manage.yaml --serve
User=kiwix
Group=kiwix
Restart=always

[Install]
WantedBy=multi-user.target
use this
Systemd service file
to download the content
/home/jobs/services/by-user/kiwix/kiwix-manage.download.service
#
# kiwix-manage.download.service
#
# Copyright (C) 2020 Franco Masotti (franco \D\o\T masotti {-A-T-} tutanota \D\o\T com)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

[Unit]
Description=Download kiwix files
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
ExecStart=/home/jobs/scripts/by-user/kiwix/kiwix_manage.py /home/jobs/scripts/by-user/kiwix/kiwix_manage.yaml --download
User=kiwix
Group=kiwix

[Install]
WantedBy=multi-user.target
fix the permissions
chown -R kiwix:kiwix /home/jobs/{scripts,services}/by-user/kiwix
chmod 700 -R /home/jobs/{scripts,services}/by-user/kiwix
Download#

Download status of a zim file#
run the deploy script
start downloading the files
systemctl start kiwix-manage.download.service
wait for the files to be downloaded before going to the serve section
Serve#

Example of served files by Kiwix#
run the deploy script
modify the reverse proxy port of your webserver configuration with
8888
Important
After downloading new files you must rerun this service
Footnotes
- 1
https://www.kiwix.org/en/ unknown license
- 2
https://software.franco.net.eu.org/frnmst/automated-tasks GNU GPLv3+, copyright (c) 2019-2022, Franco Masotti
- 3
https://download.kiwix.org/zim/ unknown license
- 4
https://stackoverflow.com/a/39217788 CC BY-SA 4.0, copyright (c) 2016, 2021, John Zwinck, Daniel F (at stackoverflow.com)
- 5
https://stackoverflow.com/a/53811881 CC BY-SA 4.0, copyright (c) 2018, Tom Hale (at stackoverflow.com)
- 6
https://docs.python.org/3/library/shutil.html?highlight=shutil#shutil.rmtree.avoids_symlink_attacks Python Software Foundation License Version 2, © Copyright 2001-2022, Python Software Foundation