diff options
author | Guilhem Moulin <guilhem@fripost.org> | 2025-04-18 11:42:07 +0200 |
---|---|---|
committer | Guilhem Moulin <guilhem@fripost.org> | 2025-04-19 19:25:16 +0200 |
commit | c689b2d07828985e881423357c7ab42877f64909 (patch) | |
tree | faaeef9e341f6258d25bba0963b14758eca27b84 /webmap-download | |
parent | 2abf2297aabb355b72c6ae9e0aaf350f7a6cbe9d (diff) |
Factor sources in config.yml.
This avoids duplication when the same source file is used multiple times
(either by the same layer or by multiple layers). This change breaks
webmap-import, but that one will be refactored shortly.
It also breaks webmap-import-mrr.py, which is no longer used since
mineralrattigheter.zip can be downloaded from SGU's site directly.
Diffstat (limited to 'webmap-download')
-rwxr-xr-x | webmap-download | 154 |
1 files changed, 111 insertions, 43 deletions
diff --git a/webmap-download b/webmap-download index 2d31a19..a8a444a 100755 --- a/webmap-download +++ b/webmap-download @@ -2,7 +2,7 @@ #---------------------------------------------------------------------- # Backend utilities for the Klimatanalys Norr project (download common layers) -# Copyright © 2024 Guilhem Moulin <info@guilhem.se> +# Copyright © 2024-2025 Guilhem Moulin <info@guilhem.se> # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -21,7 +21,20 @@ # pylint: disable=invalid-name,missing-module-docstring # pylint: enable=invalid-name -from os import O_RDONLY, O_WRONLY, O_CREAT, O_TRUNC, O_CLOEXEC, O_PATH, O_DIRECTORY, O_TMPFILE +from os import ( + O_RDONLY, + O_WRONLY, + O_CREAT, + O_TRUNC, + O_CLOEXEC, + O_PATH, + O_DIRECTORY, + O_TMPFILE, + path as os_path, + curdir as os_curdir, + pardir as os_pardir, + sep as os_sep +) import os import sys from fcntl import flock, LOCK_EX @@ -32,7 +45,7 @@ import itertools from pathlib import Path from email.utils import parsedate_to_datetime, formatdate from hashlib import sha256 -from typing import Any, Optional, NoReturn, Never +from typing import Optional, NoReturn, Never import requests import common @@ -64,7 +77,7 @@ class DownloadTooLarge(Exception): # pylint: disable-next=dangerous-default-value def download(dest : str, - dl : Optional[dict[str, Any]], + dl : dict[str, dict[str, str|int]], dir_fd : Optional[int] = None, headers : dict[str, str] = {}, session : Optional[requests.sessions.Session] = None, @@ -109,7 +122,7 @@ def download(dest : str, # XXX we can't use TemporaryFile as it uses O_EXCL, cf. 
# https://discuss.python.org/t/temporaryfile-contextmanager-that-allows-creating-a-directory-entry-on-success/19094/2 - fd = os.open(os.path.dirname(dest), O_WRONLY|O_CLOEXEC|O_TMPFILE, mode=0o644, dir_fd=dir_fd) + fd = os.open(os_path.dirname(dest), O_WRONLY|O_CLOEXEC|O_TMPFILE, mode=0o644, dir_fd=dir_fd) try: if progress is not None: pbar = progress( @@ -157,12 +170,77 @@ def download(dest : str, common.format_time(elapsed), common.format_bytes(int(size/elapsed))) +class BadConfiguration(Exception): + """Exception raised when there is a bad configuration""" + def __init__(self, message : str, config_path : Optional[Path] = None) -> Never: + if config_path is not None: + message = str(config_path) + ': ' + message + super().__init__(message) + +def _check_key_type(k : str, v : str, known_keys : list[type, tuple[set[str]]]) -> bool: + for t, ks in known_keys: + if k in ks and isinstance(v, t): + return True + return False + +def parse_config_dl(downloads) -> dict[str, dict[str, str|int]]: + """Parse and validate the "downloads" section from the configuration dictionary""" + + if not isinstance(downloads, list): + raise BadConfiguration(f'Invalid download recipe: {downloads}') + + known_keys = [ + (str, {'path', 'url'}), + (int, {'max-age', 'max-size'}) + ] + + destinations = {} + known_keys_set = {k for _,ks in known_keys for k in ks} + for dl in downloads: + if 'url' in dl: + dls = [dl] + elif 'basedir' in dl and 'baseurl' in dl and 'files' in dl and 'path' not in dl: + dls = [] + for filename in dl['files']: + dl2 = { + 'path' : os_path.join(dl['basedir'], filename), + 'url' : dl['baseurl'] + filename + } + for k, v in dl.items(): + if k not in ('basedir', 'baseurl', 'files'): + dl2[k] = v + dls.append(dl2) + else: + raise BadConfiguration(f'Invalid download recipe: {dl}') + + for dl in dls: + path = dl.get('path', None) + if path is None or path in ('', os_curdir, os_pardir) or path.endswith(os_sep): + raise BadConfiguration(f'Invalid destination path "{path}"') 
+ if path in destinations: + raise BadConfiguration(f'Duplicate download recipe for "{path}"') + dl2 = {} + for k, v in dl.items(): + if k == 'path': + continue + if k not in known_keys_set: + logging.warning('Ignoring unknown setting "%s" in download recipe for "%s"', + k, path) + elif not _check_key_type(k, v, known_keys): + logging.warning('Ignoring setting "%s" in download recipe for "%s"' + ' (invalid type)', k, path) + else: + dl2[k] = v + destinations[path] = dl2 + + return destinations + def main() -> NoReturn: # pylint: disable=missing-function-docstring - common.init_logger(app=os.path.basename(__file__), level=logging.INFO) + common.init_logger(app=os_path.basename(__file__), level=logging.INFO) parser = argparse.ArgumentParser(description='Download or update GIS layers.') - parser.add_argument('--cachedir', default=os.curdir, - help=f'destination directory for downloaded files (default: {os.curdir})') + parser.add_argument('--cachedir', default=os_curdir, + help=f'destination directory for downloaded files (default: {os_curdir})') parser.add_argument('--lockdir', default=None, help='optional directory for lock files') parser.add_argument('--quiet', action='store_true', @@ -184,13 +262,24 @@ def main() -> NoReturn: # pylint: disable=missing-function-docstring requests_log.propagate = True config = common.parse_config(groupnames=None if args.groupname == [] else args.groupname) + downloads = parse_config_dl(config.get('downloads', [])) - sources = [] - for name, layerdefs in config.get('layers', {}).items(): - for layerdef in layerdefs['sources']: - sourcedef = layerdef.get('source', {}) - sourcedef['layername'] = name - sources.append(sourcedef) + rv = 0 + download_paths = set() + for layername, layerdef in config.get('layers', {}).items(): + source = layerdef.get('source', None) + if source is None: + logging.error('Layer "%s" has no source, ignoring', layername) + rv = 1 + continue + path = source.get('path', None) + if path is None: + 
logging.error('Layer "%s" has no source path, ignoring', layername) + rv = 1 + elif path not in downloads: + logging.warning('Ignoring unknown source of path "%s" from layer "%s"', path, layername) + else: + download_paths.add(path) if args.quiet or not sys.stderr.isatty(): pbar = None @@ -205,27 +294,8 @@ def main() -> NoReturn: # pylint: disable=missing-function-docstring session_requests = requests.Session() - rv = 0 - downloads = set() - for source in sources: - dl = source.get('download', None) - dl_module = None if dl is None else dl.get('module', None) - if dl_module is None: - fetch = download - else: - dl_module = __import__(dl_module) - fetch = dl_module.download - - cache = source.get('cache', None) - dest = None if cache is None else cache.get('path', None) - if dest is None: - continue - - dest = str(dest) # convert from Path() - if dest in downloads: - logging.info('%s was already downloaded, skipping', dest) - continue - + for dest in download_paths: + dl = downloads[dest] headers = {} user_agent = config.get('User-Agent', None) if user_agent is not None: @@ -233,7 +303,7 @@ def main() -> NoReturn: # pylint: disable=missing-function-docstring try: # create parent directories - destdir = os.path.dirname(dest) + destdir = os_path.dirname(dest) common.makedirs(destdir, mode=0o755, dir_fd=destdir_fd, exist_ok=True) # place an exclusive lock on a lockfile as the destination can be used by other layers @@ -253,7 +323,7 @@ def main() -> NoReturn: # pylint: disable=missing-function-docstring # the file doesn't exist, or stat() failed for some reason pass else: - max_age = cache.get('max-age', 6*3600) # 6h + max_age = dl.get('max-age', 6*3600) # 6h if max_age is not None: s = max_age + max(st.st_ctime, st.st_mtime) - time() if s > 0: @@ -261,16 +331,14 @@ def main() -> NoReturn: # pylint: disable=missing-function-docstring dest, common.format_time(s)) continue headers['If-Modified-Since'] = formatdate(timeval=st.st_mtime, localtime=False, usegmt=True) - 
fetch(dest, dl, dir_fd=destdir_fd, - headers=headers, session=session_requests, - progress=pbar) - downloads.add(dest) + download(dest, dl, dir_fd=destdir_fd, + headers=headers, session=session_requests, + progress=pbar) finally: if lockdir_fd is not None: os.close(lock_fd) except Exception: # pylint: disable=broad-exception-caught - logging.exception('Could not download %s as %s', - dl.get('url', source['layername']), dest) + logging.exception('Could not download %s as %s', dl.get('url', '[N/A]'), dest) if args.exit_code: rv = 1 sys.exit(rv) |