aboutsummaryrefslogtreecommitdiffstats
path: root/webmap-download
diff options
context:
space:
mode:
authorGuilhem Moulin <guilhem@fripost.org>2025-04-18 11:42:07 +0200
committerGuilhem Moulin <guilhem@fripost.org>2025-04-19 19:25:16 +0200
commitc689b2d07828985e881423357c7ab42877f64909 (patch)
treefaaeef9e341f6258d25bba0963b14758eca27b84 /webmap-download
parent2abf2297aabb355b72c6ae9e0aaf350f7a6cbe9d (diff)
Factor sources in config.yml.
This avoids duplication when the same source file is used multiple times (either by the same layer or by multiple layers). This change breaks webmap-import, but that one will be refactored shortly. It also breaks webmap-import-mrr.py, which is no longer used since mineralrattigheter.zip can be downloaded from SGU's site directly.
Diffstat (limited to 'webmap-download')
-rwxr-xr-xwebmap-download154
1 files changed, 111 insertions, 43 deletions
diff --git a/webmap-download b/webmap-download
index 2d31a19..a8a444a 100755
--- a/webmap-download
+++ b/webmap-download
@@ -2,7 +2,7 @@
#----------------------------------------------------------------------
# Backend utilities for the Klimatanalys Norr project (download common layers)
-# Copyright © 2024 Guilhem Moulin <info@guilhem.se>
+# Copyright © 2024-2025 Guilhem Moulin <info@guilhem.se>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -21,7 +21,20 @@
# pylint: disable=invalid-name,missing-module-docstring
# pylint: enable=invalid-name
-from os import O_RDONLY, O_WRONLY, O_CREAT, O_TRUNC, O_CLOEXEC, O_PATH, O_DIRECTORY, O_TMPFILE
+from os import (
+ O_RDONLY,
+ O_WRONLY,
+ O_CREAT,
+ O_TRUNC,
+ O_CLOEXEC,
+ O_PATH,
+ O_DIRECTORY,
+ O_TMPFILE,
+ path as os_path,
+ curdir as os_curdir,
+ pardir as os_pardir,
+ sep as os_sep
+)
import os
import sys
from fcntl import flock, LOCK_EX
@@ -32,7 +45,7 @@ import itertools
from pathlib import Path
from email.utils import parsedate_to_datetime, formatdate
from hashlib import sha256
-from typing import Any, Optional, NoReturn, Never
+from typing import Optional, NoReturn, Never
import requests
import common
@@ -64,7 +77,7 @@ class DownloadTooLarge(Exception):
# pylint: disable-next=dangerous-default-value
def download(dest : str,
- dl : Optional[dict[str, Any]],
+ dl : dict[str, dict[str, str|int]],
dir_fd : Optional[int] = None,
headers : dict[str, str] = {},
session : Optional[requests.sessions.Session] = None,
@@ -109,7 +122,7 @@ def download(dest : str,
# XXX we can't use TemporaryFile as it uses O_EXCL, cf.
# https://discuss.python.org/t/temporaryfile-contextmanager-that-allows-creating-a-directory-entry-on-success/19094/2
- fd = os.open(os.path.dirname(dest), O_WRONLY|O_CLOEXEC|O_TMPFILE, mode=0o644, dir_fd=dir_fd)
+ fd = os.open(os_path.dirname(dest), O_WRONLY|O_CLOEXEC|O_TMPFILE, mode=0o644, dir_fd=dir_fd)
try:
if progress is not None:
pbar = progress(
@@ -157,12 +170,77 @@ def download(dest : str,
common.format_time(elapsed),
common.format_bytes(int(size/elapsed)))
+class BadConfiguration(Exception):
+ """Exception raised when there is a bad configuration"""
+ def __init__(self, message : str, config_path : Optional[Path] = None) -> Never:
+ if config_path is not None:
+ message = str(config_path) + ': ' + message
+ super().__init__(message)
+
+def _check_key_type(k : str, v : str, known_keys : list[type, tuple[set[str]]]) -> bool:
+ for t, ks in known_keys:
+ if k in ks and isinstance(v, t):
+ return True
+ return False
+
+def parse_config_dl(downloads) -> dict[str, dict[str, str|int]]:
+ """Parse and validate the "downloads" section from the configuration dictionary"""
+
+ if not isinstance(downloads, list):
+ raise BadConfiguration(f'Invalid download recipe: {downloads}')
+
+ known_keys = [
+ (str, {'path', 'url'}),
+ (int, {'max-age', 'max-size'})
+ ]
+
+ destinations = {}
+ known_keys_set = {k for _,ks in known_keys for k in ks}
+ for dl in downloads:
+ if 'url' in dl:
+ dls = [dl]
+ elif 'basedir' in dl and 'baseurl' in dl and 'files' in dl and 'path' not in dl:
+ dls = []
+ for filename in dl['files']:
+ dl2 = {
+ 'path' : os_path.join(dl['basedir'], filename),
+ 'url' : dl['baseurl'] + filename
+ }
+ for k, v in dl.items():
+ if k not in ('basedir', 'baseurl', 'files'):
+ dl2[k] = v
+ dls.append(dl2)
+ else:
+ raise BadConfiguration(f'Invalid download recipe: {dl}')
+
+ for dl in dls:
+ path = dl.get('path', None)
+ if path is None or path in ('', os_curdir, os_pardir) or path.endswith(os_sep):
+ raise BadConfiguration(f'Invalid destination path "{path}"')
+ if path in destinations:
+ raise BadConfiguration(f'Duplicate download recipe for "{path}"')
+ dl2 = {}
+ for k, v in dl.items():
+ if k == 'path':
+ continue
+ if k not in known_keys_set:
+ logging.warning('Ignoring unknown setting "%s" in download recipe for "%s"',
+ k, path)
+ elif not _check_key_type(k, v, known_keys):
+ logging.warning('Ignoring setting "%s" in download recipe for "%s"'
+ ' (invalid type)', k, path)
+ else:
+ dl2[k] = v
+ destinations[path] = dl2
+
+ return destinations
+
def main() -> NoReturn: # pylint: disable=missing-function-docstring
- common.init_logger(app=os.path.basename(__file__), level=logging.INFO)
+ common.init_logger(app=os_path.basename(__file__), level=logging.INFO)
parser = argparse.ArgumentParser(description='Download or update GIS layers.')
- parser.add_argument('--cachedir', default=os.curdir,
- help=f'destination directory for downloaded files (default: {os.curdir})')
+ parser.add_argument('--cachedir', default=os_curdir,
+ help=f'destination directory for downloaded files (default: {os_curdir})')
parser.add_argument('--lockdir', default=None,
help='optional directory for lock files')
parser.add_argument('--quiet', action='store_true',
@@ -184,13 +262,24 @@ def main() -> NoReturn: # pylint: disable=missing-function-docstring
requests_log.propagate = True
config = common.parse_config(groupnames=None if args.groupname == [] else args.groupname)
+ downloads = parse_config_dl(config.get('downloads', []))
- sources = []
- for name, layerdefs in config.get('layers', {}).items():
- for layerdef in layerdefs['sources']:
- sourcedef = layerdef.get('source', {})
- sourcedef['layername'] = name
- sources.append(sourcedef)
+ rv = 0
+ download_paths = set()
+ for layername, layerdef in config.get('layers', {}).items():
+ source = layerdef.get('source', None)
+ if source is None:
+ logging.error('Layer "%s" has no source, ignoring', layername)
+ rv = 1
+ continue
+ path = source.get('path', None)
+ if path is None:
+ logging.error('Layer "%s" has no source path, ignoring', layername)
+ rv = 1
+ elif path not in downloads:
+ logging.warning('Ignoring unknown source of path "%s" from layer "%s"', path, layername)
+ else:
+ download_paths.add(path)
if args.quiet or not sys.stderr.isatty():
pbar = None
@@ -205,27 +294,8 @@ def main() -> NoReturn: # pylint: disable=missing-function-docstring
session_requests = requests.Session()
- rv = 0
- downloads = set()
- for source in sources:
- dl = source.get('download', None)
- dl_module = None if dl is None else dl.get('module', None)
- if dl_module is None:
- fetch = download
- else:
- dl_module = __import__(dl_module)
- fetch = dl_module.download
-
- cache = source.get('cache', None)
- dest = None if cache is None else cache.get('path', None)
- if dest is None:
- continue
-
- dest = str(dest) # convert from Path()
- if dest in downloads:
- logging.info('%s was already downloaded, skipping', dest)
- continue
-
+ for dest in download_paths:
+ dl = downloads[dest]
headers = {}
user_agent = config.get('User-Agent', None)
if user_agent is not None:
@@ -233,7 +303,7 @@ def main() -> NoReturn: # pylint: disable=missing-function-docstring
try:
# create parent directories
- destdir = os.path.dirname(dest)
+ destdir = os_path.dirname(dest)
common.makedirs(destdir, mode=0o755, dir_fd=destdir_fd, exist_ok=True)
# place an exclusive lock on a lockfile as the destination can be used by other layers
@@ -253,7 +323,7 @@ def main() -> NoReturn: # pylint: disable=missing-function-docstring
# the file doesn't exist, or stat() failed for some reason
pass
else:
- max_age = cache.get('max-age', 6*3600) # 6h
+ max_age = dl.get('max-age', 6*3600) # 6h
if max_age is not None:
s = max_age + max(st.st_ctime, st.st_mtime) - time()
if s > 0:
@@ -261,16 +331,14 @@ def main() -> NoReturn: # pylint: disable=missing-function-docstring
dest, common.format_time(s))
continue
headers['If-Modified-Since'] = formatdate(timeval=st.st_mtime, localtime=False, usegmt=True)
- fetch(dest, dl, dir_fd=destdir_fd,
- headers=headers, session=session_requests,
- progress=pbar)
- downloads.add(dest)
+ download(dest, dl, dir_fd=destdir_fd,
+ headers=headers, session=session_requests,
+ progress=pbar)
finally:
if lockdir_fd is not None:
os.close(lock_fd)
except Exception: # pylint: disable=broad-exception-caught
- logging.exception('Could not download %s as %s',
- dl.get('url', source['layername']), dest)
+ logging.exception('Could not download %s as %s', dl.get('url', '[N/A]'), dest)
if args.exit_code:
rv = 1
sys.exit(rv)