diff options
| author | Guilhem Moulin <guilhem@fripost.org> | 2026-03-06 10:52:43 +0100 |
|---|---|---|
| committer | Guilhem Moulin <guilhem@fripost.org> | 2026-03-06 16:06:41 +0100 |
| commit | ca91a579770c89d25aefae220079bf336fa88dc9 (patch) | |
| tree | cb1f49adacf12b0cb15b6430b0446fbee2135814 /geodata-download | |
| parent | 94438a900d3fb933a33aed4d2ffeb8809e966c46 (diff) | |
Rename "webmap" to the less generic "geodata".
The database has uses beyond the webmap.
Diffstat (limited to 'geodata-download')
| -rwxr-xr-x | geodata-download | 291 |
1 file changed, 291 insertions, 0 deletions
def download_trystream(url : str, **kwargs) -> requests.Response:
    """GET *url*, retrying on network errors, and return as soon as the
    response headers arrive (the body is left to be streamed by the caller).

    Keyword-only extras consumed here (everything else is forwarded to
    ``.get()``):
      max_retries -- how many attempts before giving up (default 10)
      session     -- an optional requests.Session; falls back to the
                     module-level ``requests`` API when absent
    Raises requests.HTTPError for non-2xx/3xx statuses, and re-raises the
    last Timeout/ConnectionError once the retry budget is exhausted.
    """
    retries_max = kwargs.pop('max_retries', 10)
    getter = kwargs.pop('session', None)
    if getter is None:
        getter = requests
    attempt = 0
    while True:
        attempt += 1
        try:
            resp = getter.get(url, **kwargs, stream=True)
        except (requests.Timeout, requests.ConnectionError):
            # out of retries: propagate the original exception
            if attempt >= retries_max:
                raise
            logging.error('timeout')
            continue
        resp.raise_for_status()
        return resp
class DownloadTooLarge(Exception):
    """Raised when a downloaded payload exceeds the configured max-size.

    The exception message embeds the byte limit that was exceeded.
    """
    # Fix: __init__ initializes the instance and returns None; the original
    # annotated it `-> Never`, which (per typing semantics) claims the call
    # never returns normally — untrue, since constructing the exception is
    # exactly how it is raised (`raise DownloadTooLarge(max_size)`).
    def __init__(self, max_size : int) -> None:
        super().__init__(f'Payload exceeds max-size ({max_size})')
# pylint: disable-next=dangerous-default-value
def download(dest : str,
             dl : dict[str, dict[str, str|int]],
             dir_fd : Optional[int] = None,
             headers : dict[str, str] = {},
             session : Optional[requests.sessions.Session] = None,
             progress = None) -> None:
    """Process a single download recipe: stream dl['url'] to *dest*.

    dest     -- destination path, interpreted relative to *dir_fd* when given
    dl       -- recipe dict; keys used here: 'url', 'max-size' (bytes)
    headers  -- extra HTTP request headers (e.g. If-Modified-Since)
    session  -- optional requests.Session reused across downloads
    progress -- optional progress-bar factory (tqdm-compatible: total,
                leave, unit_scale, unit_divisor, unit kwargs; update()/close())

    Returns without touching *dest* when the recipe has no URL or the server
    answers 304 Not Modified.  Raises DownloadTooLarge when the body exceeds
    'max-size', and propagates network/filesystem errors.  The destination is
    replaced atomically: chunks go to an anonymous O_TMPFILE, which is then
    linked in under a temporary name and rename()d over *dest*.
    """

    url = None if dl is None else dl.get('url', None)
    if url is None:
        logging.error('%s has no source URL, ignoring', dest)
        return
    max_size = dl.get('max-size', 2**26) # 64MiB
    logging.info('Downloading %s…', url)
    dest_path = Path(dest)
    # hidden sibling name, e.g. "dir/.name.ext.new" for "dir/name.ext"
    dest_tmp = str(dest_path.with_stem(f'.{dest_path.stem}.new'))
    try:
        # delete any leftover
        os.unlink(dest_tmp, dir_fd=dir_fd)
    except FileNotFoundError:
        pass

    start = time_monotonic()
    r = download_trystream(url, headers=headers, session=session, timeout=30)
    if r.status_code == 304:
        # server says our cached copy (If-Modified-Since) is still current
        logging.info('%s: %d Not Modified', dest, r.status_code)
        return

    body_size = r.headers.get('Content-Length', None)
    last_modified = r.headers.get('Last-Modified', None)
    if last_modified is not None:
        try:
            # parse the HTTP date to an epoch timestamp for os.utime() below
            last_modified = parsedate_to_datetime(last_modified)
            last_modified = last_modified.timestamp()
        except ValueError:
            logging.exception('Could not parse Last-Modified value')
            last_modified = None

    size = 0
    pbar = None

    # XXX we can't use TemporaryFile as it uses O_EXCL, cf.
    # https://discuss.python.org/t/temporaryfile-contextmanager-that-allows-creating-a-directory-entry-on-success/19094/2
    # O_TMPFILE creates an unnamed file in dest's directory; it only becomes
    # visible via the os.link() call further down.
    fd = os.open(os_path.dirname(dest), O_WRONLY|O_CLOEXEC|O_TMPFILE, mode=0o644, dir_fd=dir_fd)
    try:
        if progress is not None:
            pbar = progress(
                # Content-Length may be absent (chunked encoding); use +inf then
                total=int(body_size) if body_size is not None else float('inf'),
                leave=False,
                unit_scale=True,
                unit_divisor=1024,
                unit='B'
            )
        # closefd=False: the raw fd stays ours, closed in the finally block
        with os.fdopen(fd, mode='wb', closefd=False) as fp:
            for chunk in r.iter_content(chunk_size=2**16):
                chunk_size = len(chunk)
                if pbar is not None:
                    pbar.update(chunk_size)
                size += chunk_size
                if max_size is not None and size > max_size:
                    raise DownloadTooLarge(max_size)
                fp.write(chunk)
        # drop the response reference once the body is fully consumed
        r = None

        if last_modified is not None:
            # mirror the server's Last-Modified so If-Modified-Since works later
            os.utime(fd, times=(last_modified, last_modified))

        # XXX unfortunately there is no way for linkat() to clobber the destination,
        # so we use a temporary file; it's racy, but thanks to O_TMPFILE better
        # (shorter race) than if we were dumping chunks in a named file descriptor
        # (follow_symlinks=True dereferences the /proc/self/fd magic symlink)
        os.link(f'/proc/self/fd/{fd}', dest_tmp, dst_dir_fd=dir_fd, follow_symlinks=True)
    finally:
        os.close(fd)
        if pbar is not None:
            pbar.close()

    try:
        # atomic rename (ensures output is never partially written)
        os.rename(dest_tmp, dest, src_dir_fd=dir_fd, dst_dir_fd=dir_fd)
    except (OSError, ValueError) as e:
        try:
            # best effort: don't leave the temporary name behind
            os.unlink(dest_tmp, dir_fd=dir_fd)
        finally:
            raise e

    elapsed = time_monotonic() - start
    logging.info('%s: Downloaded %s in %s (%s/s)', dest,
                 common.format_bytes(size),
                 common.format_time(elapsed),
                 common.format_bytes(int(size/elapsed)))
def main() -> NoReturn:
    """Entry point: parse arguments/config, then download each source path.

    For every distinct source path referenced by the selected layer groups,
    optionally takes an exclusive flock on a per-path lockfile (when
    --lockdir is given), skips paths younger than their 'max-age' unless
    --force, and otherwise performs a conditional (If-Modified-Since)
    download into --cachedir.  Exits 1 when any download failed and
    --exit-code is in effect (the default), else 0.
    """
    common.init_logger(app=os_path.basename(__file__), level=logging.INFO)

    parser = argparse.ArgumentParser(description='Download or update GIS layers.')
    parser.add_argument('--cachedir', default=os_curdir,
                        help=f'destination directory for downloaded files (default: {os_curdir})')
    parser.add_argument('--lockdir', default=None,
                        help='optional directory for lock files')
    parser.add_argument('--quiet', action='store_true',
                        help='skip progress bars even when stderr is a TTY')
    parser.add_argument('--debug', action='count', default=0,
                        help=argparse.SUPPRESS)
    parser.add_argument('--exit-code', default=True, action=argparse.BooleanOptionalAction,
                        help='whether to exit with status 1 in case of download failures')
    parser.add_argument('--force', default=False, action='store_true',
                        help='always download regardless of age')
    parser.add_argument('groupname', nargs='*', help='group layer name(s) to process')
    args = parser.parse_args()

    if args.debug > 0: # pylint: disable=duplicate-code
        logging.getLogger().setLevel(logging.DEBUG)
        if args.debug > 1:
            # -dd additionally traces HTTP wire traffic via http.client/urllib3
            from http.client import HTTPConnection # pylint: disable=import-outside-toplevel
            HTTPConnection.debuglevel = 1
            requests_log = logging.getLogger('urllib3')
            requests_log.setLevel(logging.DEBUG)
            requests_log.propagate = True

    config = common.parse_config(groupnames=None if args.groupname == [] else args.groupname)
    downloads = parse_config_dl(config.get('downloads', []))

    rv = 0
    # collect the distinct source paths referenced by the selected layers;
    # a set because several layers may share the same source file
    download_paths = set()
    for layername, layerdef in config.get('layers', {}).items():
        sources = layerdef.get('sources', None)
        if sources is None or len(sources) < 1:
            logging.warning('Layer "%s" has no source, ignoring', layername)
            continue
        for idx, source in enumerate(sources):
            if 'source' not in source:
                continue
            source = source['source']
            path = None if source is None else source.get('path', None)
            if path is None:
                logging.error('Source #%d of layer "%s" has no path, ignoring',
                              idx, layername)
                rv = 1
            elif path not in downloads:
                # path must match an entry from the 'downloads' config section
                logging.warning('Ignoring unknown source of path "%s" from layer "%s"',
                                path, layername)
            else:
                download_paths.add(path)

    if args.quiet or not sys.stderr.isatty():
        pbar = None
    else:
        # tqdm is only needed (and imported) for interactive progress bars
        from tqdm import tqdm # pylint: disable=import-outside-toplevel
        pbar = tqdm

    # intentionally leave the dirfd open until the program terminates
    opendir_args = O_RDONLY|O_CLOEXEC|O_PATH|O_DIRECTORY
    destdir_fd = os.open(args.cachedir, opendir_args)
    lockdir_fd = None if args.lockdir is None else os.open(args.lockdir, opendir_args)

    # one session so connections (and cookies) are reused across downloads
    session_requests = requests.Session()

    for dest in download_paths:
        dl = downloads[dest]
        headers = {}
        user_agent = config.get('User-Agent', None)
        if user_agent is not None:
            headers['User-Agent'] = user_agent

        try:
            # create parent directories
            destdir = os_path.dirname(dest)
            common.makedirs(destdir, mode=0o755, dir_fd=destdir_fd, exist_ok=True)

            # place an exclusive lock on a lockfile as the destination can be used by other layers
            # hence might be updated in parallel
            if lockdir_fd is not None:
                # temporarily relax umask so the lockfile is group-writable
                umask = os.umask(0o002)
                lockfile = getSourcePathLockFileName(dest)
                try:
                    # use O_TRUNC to bump lockfile's mtime
                    lock_fd = os.open(lockfile, O_WRONLY|O_CREAT|O_TRUNC|O_CLOEXEC, mode=0o664,
                                      dir_fd=lockdir_fd)
                finally:
                    os.umask(umask)

            try:
                if lockdir_fd is not None:
                    logging.debug('flock("%s", LOCK_EX)', lockfile)
                    # blocks until any concurrent updater releases the lock
                    flock(lock_fd, LOCK_EX)
                try:
                    st = os.stat(dest, dir_fd=destdir_fd)
                except (OSError, ValueError):
                    # the file doesn't exist, or stat() failed for some reason
                    pass
                else:
                    if not args.force:
                        max_age = dl.get('max-age', 6*3600) # 6h
                        if max_age is not None:
                            # remaining freshness window; positive means too young
                            s = max_age + max(st.st_ctime, st.st_mtime) - time()
                            if s > 0:
                                logging.info('%s: Too young, try again in %s', dest,
                                             common.format_time(s))
                                continue
                    # existing file: make the request conditional on its mtime
                    headers['If-Modified-Since'] = formatdate(timeval=st.st_mtime,
                                                              localtime=False, usegmt=True)
                download(dest, dl, dir_fd=destdir_fd,
                         headers=headers, session=session_requests,
                         progress=pbar)
            finally:
                if lockdir_fd is not None:
                    os.close(lock_fd)
        except Exception: # pylint: disable=broad-exception-caught
            # keep going with the remaining paths; record the failure for exit status
            logging.exception('Could not download %s as %s', dl.get('url', '[N/A]'), dest)
            if args.exit_code:
                rv = 1
    sys.exit(rv)

main()
