def _find_config_file():
    """Return the path of the first existing ``config.yml``, or None.

    The directories are probed in order: the current directory, the
    ``webmap`` sub-directory of the XDG configuration home, and
    ``/etc/webmap``.
    """
    search_dirs = [Path(),
                   Path(xdg_config_home).joinpath('webmap'),
                   PosixPath('/etc').joinpath('webmap')]
    for directory in search_dirs:
        candidate = directory.joinpath('config.yml')
        if candidate.exists():
            return str(candidate)
    return None


def _normalize_source(name, source, main_script, destinations):
    """Normalize one ``source`` mapping of layer *name* in place.

    - ``source['download']`` given as a plain string is expanded to
      ``{'url': <string>}``.
    - ``source['cache']`` given as a string (or missing) is expanded to a
      mapping, and ``source['cache']['path']`` is always replaced by a
      ``Path`` object.  When the configured path is missing, empty, ``.``,
      ``..`` or ends with the path separator it is treated as a directory
      and the file name is taken from the last component of the URL path.

    *destinations* maps each cache path seen so far to its (parsed URL,
    script) pair; an Exception is raised when the same cache path is reused
    with a different pair, or when no file name can be inferred.
    """
    download = source.get('download', None)
    if download is None:
        url = None
        script = None
    elif isinstance(download, str):
        url = download
        script = None
        source['download'] = {'url': url}
    else:
        url = download.get('url', None)
        script = download.get('script', None)
    # urlparse() never returns None, so no further check is needed here
    urlp = None if url is None else urlparse(url)

    cache = source.get('cache', None)
    if cache is None or isinstance(cache, str):
        source['cache'] = {'path': cache}
    else:
        cache = cache.get('path', None)

    if cache is None or cache in ['', os.curdir, os.pardir] or cache.endswith(os.sep):
        # infer filename from the source URL
        if urlp is None or not urlp.path or urlp.path.endswith('/'):
            raise Exception(f'Layer "{name}": Could not infer filename from URL {url}')
        filename = PosixPath(urlp.path).name
        if not filename:
            raise Exception(f'Invalid PosixPath({urlp.path})')
        basedir = Path() if not cache else Path(cache)
        cache = basedir.joinpath(filename)
    else:
        cache = Path(cache)
    source['cache']['path'] = cache

    v = {'url': urlp, 'script': main_script if script is None else script}
    if cache in destinations and destinations[cache] != v:
        # allow destination conflicts, but only when the source URL and script match
        raise Exception(f'Destination conflict for layer "{name}"')
    destinations[cache] = v


def _select_layers(layers, layer_groups, groupnames):
    """Return the subset of *layers* selected by *groupnames*.

    Each name is looked up in *layer_groups* (group name -> shell-style
    pattern or list of patterns matched case-sensitively against layer
    names).  A name that is not a group falls back to an exact layer name;
    if it is neither, an error is logged and the program exits with
    status 1.  A pattern that matches nothing falls back to an exact layer
    name and otherwise only triggers a warning.  The first-selected order
    is preserved and duplicates are dropped.
    """
    layernames = []
    for groupname in groupnames:
        if groupname not in layer_groups:
            if groupname not in layers:
                logging.error('Unknown group/layer name "%s"', groupname)
                # sys.exit() rather than the exit() helper, which is only
                # installed by the site module
                sys.exit(1)
            # fallback to layer names
            layernames.append(groupname)
            continue

        patterns = layer_groups[groupname]
        if isinstance(patterns, str):
            patterns = [patterns]
        for pat in patterns:
            matches = [layername for layername in layers if fnmatchcase(layername, pat)]
            if not matches and pat in layers:
                # fallback to exact match
                matches = [pat]
            if not matches:
                logging.warning('Group name "%s" does not match anything', groupname)
            for layername in matches:
                if layername in layernames:
                    logging.debug('Layer "%s" was already added, skipping', layername)
                else:
                    layernames.append(layername)

    return {name: layers[name] for name in layernames}


def load_config(path=None, groupnames=None):
    """Load and validate the YAML configuration.

    path: configuration file to read; when None the default locations are
        probed (see _find_config_file()).
    groupnames: optional list of group and/or layer names; when given only
        the matching layers are kept.

    Every layer definition is normalized to a list of dictionaries and each
    of its sources is normalized and validated.  The result is stored in
    the ``config`` attribute of this module; nothing is returned.

    Raises FileNotFoundError when *path* is None and no configuration file
    exists in the default locations, and Exception on invalid sources.
    """
    main_script = os.path.basename(main.__file__)
    if path is None:
        path = _find_config_file()
        if path is None:
            # fail with a clear error rather than passing None to open()
            raise FileNotFoundError('Could not find config.yml in any default location')
    with open(path, 'r', encoding='utf-8') as fp:
        # safe_load() yields None for an empty document
        config = yaml.safe_load(fp) or {}
    # a bare "layers:" key is loaded as None
    layers = config.get('layers') or {}

    # validate sources
    destinations = {}
    for name, layer in layers.items():
        if isinstance(layer, dict):
            # single-source layer inlined as a mapping
            layers[name] = layer = [layer]
        for sourcedef in layer:
            source = sourcedef.get('source', None)
            if source is not None:
                _normalize_source(name, source, main_script, destinations)

    # filter layers that are not of interest
    if groupnames is not None:
        layers = _select_layers(layers, config.get('layer-groups') or {}, groupnames)

    config['layers'] = layers
    sys.modules[__name__].config = config


######
# The function definitions below are taken from cpython's source code
# and augmented with dir_fd.

# Is a path a directory?
# (From genericpath.py.)
def isdir(path, dir_fd=None, follow_symlinks=True):
    """Return True if *path* (relative to *dir_fd* when given) is a directory.

    Returns False when the path cannot be stat()'ed.
    """
    try:
        st = os.stat(path, dir_fd=dir_fd, follow_symlinks=follow_symlinks)
    except (OSError, ValueError):
        return False
    return S_ISDIR(st.st_mode)


# Does a path exist?
# (From genericpath.py.)
def exists(path, dir_fd=None, follow_symlinks=True):
    """Return True if *path* (relative to *dir_fd* when given) can be stat()'ed."""
    try:
        os.stat(path, dir_fd=dir_fd, follow_symlinks=follow_symlinks)
    except (OSError, ValueError):
        return False
    return True


# Create a leaf directory and all intermediate ones.
# (From os.py.)
def makedirs(name, mode=0o777, exist_ok=False, dir_fd=None, logging=None):
    """Create directory *name* and any missing parent, relative to *dir_fd*.

    mode: permission bits for the leaf directory (parents use the default).
    exist_ok: when true, an already existing directory is not an error.
    dir_fd: optional directory descriptor that relative paths are resolved
        against.
    logging: optional logger-like object; each mkdir() is reported through
        its debug() method.

    An empty *name* designates the base directory itself, like ``.``, and is
    a no-op.  Raises OSError when a directory cannot be created.
    """
    if not name:
        # os.path.dirname() of a bare file name is ''; nothing to create
        return
    head, tail = path.split(name)
    if not tail:
        head, tail = path.split(head)
    if head and tail and not exists(head, dir_fd=dir_fd):
        try:
            makedirs(head, exist_ok=exist_ok, dir_fd=dir_fd, logging=logging)
        except FileExistsError:
            # Defeats race condition when another thread created the path
            pass
        cdir = os.curdir
        if isinstance(tail, bytes):
            cdir = bytes(os.curdir, 'ASCII')
        if tail == cdir:  # xxx/newdir/. exists if xxx/newdir exists
            return
    if logging is not None:
        logging.debug('mkdir("%s", 0%o)', name, mode)
    try:
        os.mkdir(name, mode, dir_fd=dir_fd)
    except OSError:
        # Cannot rely on checking for EEXIST, since the operating system
        # could give priority to other errors like EACCES or EROFS
        if not exist_ok or not isdir(name, dir_fd=dir_fd):
            raise
Any parent directories are created if +# # needed. +# # If the path is empty or ends with a '/' character then it is treated as a +# # directory and the last component of source:download:url is implicitly +# # used as filename. In that case an error is raised if no filename can +# # be derived from the URL. +# # source:cache can be used as an alias when source:cache:path is its +# # only subkey. +# path: path/to/sub/dir/ +# # source:cache:max-age: Maximum age for caching, in number of seconds +# # ago. If source:cache:path exists and its mtime and/or ctime is newer +# # than this value then no HTTP query is made. +# # (Default: 21600, in other words 6h) +# max-age: 86400 + + 'nvr:TILLTRADESFORBUD': + source: + download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/TILLTRADESFORBUD.zip' + cache: naturvardsregistret/ + + 'nvr:NP': + source: + download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/NP.zip' + cache: naturvardsregistret/ + + 'nvr:NR': + source: + download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/NR.zip' + cache: naturvardsregistret/ + + 'nvr:NVO': + source: + download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/NVO.zip' + cache: naturvardsregistret/ + + 'nvr:DVO': + source: + download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/DVO.zip' + cache: naturvardsregistret/ + + 'nvr:KR': + source: + download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/KR.zip' + cache: naturvardsregistret/ + + 'nvr:VSO': + source: + download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/VSO.zip' + cache: naturvardsregistret/ + + 'nvr:LBSO': + source: + download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/LBSO.zip' + cache: naturvardsregistret/ + + 'nvr:OBO': + source: + download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/OBO.zip' + cache:
naturvardsregistret/ + + 'nvr:NM': + source: + download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/NM.zip' + cache: naturvardsregistret/ + + 'nvr:IF': + source: + download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/IF.zip' + cache: naturvardsregistret/ + + 'nvr:SPA_Rikstackande': + source: + download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/SPA_Rikstackande.zip' + cache: naturvardsregistret/ + + 'nvr:SCI_Rikstackande': + source: + download: + url: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/SCI_Rikstackande.zip' + max-size: 134217728 # 128MiB + cache: naturvardsregistret/ + + 'nvr:HELCOM': + source: + download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/HELCOM.zip' + cache: naturvardsregistret/ + + 'nvr:Ramsar_2018': + source: + download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/Ramsar_2018.zip' + cache: naturvardsregistret/ + + 'nvr:OSPAR': + source: + download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/OSPAR.zip' + cache: naturvardsregistret/ + + 'nvr:Varldsarv': + source: + download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/Varldsarv.zip' + cache: naturvardsregistret/ + + 'nvr:biosfarsomraden': + source: + download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/biosfarsomraden.zip' + cache: naturvardsregistret/ + + 'nvr:NVA': + source: + download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/NVA.zip' + cache: naturvardsregistret/ + + 'sks:AvverkAnm': + source: + download: + url: 'https://geodpags.skogsstyrelsen.se/geodataport/data/sksAvverkAnm.zip' + max-size: 134217728 # 128MiB + cache: sks/ + + 'sks:UtfordAvverk': + - source: + download: + url: 'https://geodpags.skogsstyrelsen.se/geodataport/data/sksUtfordAvverk-2000-2015.zip' + max-size: 805306368 # 768MiB + cache: + path: sks/ + max-age: 2592000 # 
def download_trystream(url, **kwargs):
    """Issue a streaming GET request for *url*, retrying transient failures.

    The optional ``session`` keyword selects the object whose ``get()``
    method is called (default: the ``requests`` module); all other keyword
    arguments are forwarded to ``get()``.  At most ``max_tries`` attempts
    are made when the request times out or the connection fails, after
    which the last exception is re-raised.  ``raise_for_status()`` is
    called on the response before it is returned.
    """
    max_tries = 10
    f = kwargs.pop('session', None)
    if f is None:
        f = requests
    for i in itertools.count(1):
        try:
            r = f.get(url, **kwargs, stream=True)
        except (requests.Timeout, requests.ConnectionError):
            if i < max_tries:
                logging.error('timeout')
                continue
            raise
        else:
            r.raise_for_status()
            return r


def download(url, dest, dir_fd=None, headers=None, max_size=None, session=None):
    """Download *url* and atomically save the payload as *dest*.

    dest: destination path, relative to *dir_fd* when given.
    dir_fd: optional descriptor of the destination base directory.
    headers: optional mapping of extra HTTP request headers.
    max_size: optional maximum payload size in bytes; an Exception is raised
        as soon as the payload exceeds it.
    session: optional ``requests.Session`` (default: the ``requests``
        module).

    Nothing is written when the server replies 304 Not Modified.  Otherwise
    the payload is written to an anonymous (O_TMPFILE) file, linked under a
    temporary name next to *dest*, stamped with the Last-Modified time when
    the header is present and parsable, and finally renamed onto *dest*.
    """
    if headers is None:
        headers = {}
    if session is None:
        session = requests

    logging.info('Downloading %s…', url)
    dest_path = Path(dest)
    dest_tmp = dest_path.with_stem(f'.{dest_path.stem}.new').as_posix()
    try:
        # delete any leftover
        os.unlink(dest_tmp, dir_fd=dir_fd)
    except FileNotFoundError:
        pass

    start = time_monotonic()
    r = download_trystream(url, headers=headers, session=session, timeout=30)
    try:
        if r.status_code == requests.codes.not_modified:
            # XXX shouldn't we call os.utime(dest) to bump its ctime here?
            # otherwise we'll make several queries and get multiple 304
            # replies if the file is used by multiple layers
            logging.info('%s: %d Not Modified', dest, r.status_code)
            return

        last_modified = r.headers.get('Last-Modified', None)
        if last_modified is not None:
            try:
                last_modified = parsedate_to_datetime(last_modified)
                last_modified = last_modified.timestamp()
            except ValueError:
                logging.exception('Could not parse Last-Modified value')
                last_modified = None

        # XXX we can't use TemporaryFile as it uses O_EXCL, cf.
        # https://discuss.python.org/t/temporaryfile-contextmanager-that-allows-creating-a-directory-entry-on-success/19094/2
        # dirname() is '' for a destination without directory component,
        # which open(2) rejects, so fall back to '.'
        fd = os.open(os.path.dirname(dest) or os.curdir,
                     O_WRONLY|O_CLOEXEC|O_TMPFILE, mode=0o644, dir_fd=dir_fd)
        with os.fdopen(fd, mode='wb') as fp:
            size = 0
            for chunk in r.iter_content(chunk_size=2**16):
                size = size + len(chunk)
                if max_size is not None and size > max_size:
                    raise Exception(f'Payload exceeds max-size ({max_size})')
                fp.write(chunk)
            end = time_monotonic()

            # make sure the whole payload is written out before the file
            # gets a name
            fp.flush()

            # XXX unfortunately there is no way for linkat() to clobber the destination,
            # so we use a temporary file; it's racy, but thanks to O_TMPFILE better
            # (shorter race) than if we were dumping chunks in a named file descriptor
            os.link(f'/proc/self/fd/{fp.fileno()}', dest_tmp,
                    dst_dir_fd=dir_fd, follow_symlinks=True)
        # no need to close fd here, it was taken care of by the context manager above
    finally:
        # a streamed response holds on to its connection until it is fully
        # consumed or closed; close it on every path (304, errors, success)
        r.close()

    try:
        if last_modified is not None:
            # XXX os.utime() doesn't work on file descriptors so we set mtime
            # after linkat() instead
            os.utime(dest_tmp, times=(last_modified, last_modified),
                     dir_fd=dir_fd, follow_symlinks=False)
        os.rename(dest_tmp, dest, src_dir_fd=dir_fd, dst_dir_fd=dir_fd)

        elapsed = end - start
        # guard against a zero elapsed time on coarse clocks
        rate = int(size / elapsed) if elapsed > 0 else size
        logging.info("%s: Downloaded %s in %s (%s/s)", dest, format_bytes(size),
                     format_time(elapsed), format_bytes(rate))
    except Exception:
        # best-effort removal of the temporary name, then re-raise the
        # original error with its traceback intact
        try:
            os.unlink(dest_tmp, dir_fd=dir_fd)
        except OSError:
            pass
        raise


def format_bytes(n):
    """Format the byte count *n* using binary unit prefixes.

    A unit is used up to 768 times its size, then the next one takes over
    (so 768 bytes is rendered as '0.75kiB').
    """
    if n < 768:
        return f'{n}B'
    elif n < 786432:
        return f'{n/1024:.2f}kiB'
    elif n < 805306368:
        return f'{n/1048576:.2f}MiB'
    else:
        return f'{n/1073741824:.2f}GiB'


def format_time(s):
    """Format the duration *s* (non-negative seconds) as 'HH:MM:SS.mmm'.

    The value is rounded to milliseconds before being split into fields so
    the seconds field never shows '60.000'.
    """
    secs, millis = divmod(round(s * 1000), 1000)
    m, secs = divmod(secs, 60)
    h, m = divmod(m, 60)
    return f'{h:02d}:{m:02d}:{secs:02d}.{millis:03d}'


def main():
    """Command-line entry point: download or refresh the configured layers.

    Returns the process exit status: 1 if a download failed and --exit-code
    was given, 0 otherwise.
    """
    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

    parser = argparse.ArgumentParser(description='Download or update GIS layers.')
    parser.add_argument('--cachedir', default=None,
        help='destination directory for downloaded files (default: .)')
    parser.add_argument('--lockdir', default=None,
        help='directory for lock files (default: value of --cachedir option)')
    parser.add_argument('--debug', action='store_true', help=argparse.SUPPRESS)
    parser.add_argument('--exit-code', action=argparse.BooleanOptionalAction,
        help='whether to exit with status 1 in case of download failures')
    parser.add_argument('groupname', nargs='*', help='Group(s) to process')
    args = parser.parse_args()

    if args.debug:
        from http.client import HTTPConnection
        HTTPConnection.debuglevel = 1
        logging.getLogger().setLevel(logging.DEBUG)
        requests_log = logging.getLogger("urllib3")
        requests_log.setLevel(logging.DEBUG)
        requests_log.propagate = True

    common.load_config(groupnames=None if args.groupname == [] else args.groupname)

    sources = []
    for name, layerdefs in common.config.get('layers', {}).items():
        for layerdef in layerdefs:
            sourcedef = layerdef.get('source', {})
            sourcedef['layername'] = name
            sources.append(sourcedef)

    # intentionally leave the dirfd open until the program terminates
    opendir_args = O_RDONLY|O_CLOEXEC|O_PATH|O_DIRECTORY
    destdir_fd = None if args.cachedir is None else os.open(args.cachedir, opendir_args)
    lockdir_fd = destdir_fd if args.lockdir is None else os.open(args.lockdir, opendir_args)

    if lockdir_fd == destdir_fd:
        # no --lockdir: both are None or the very same descriptor
        lockdir_is_destdir = True
    else:
        # --lockdir was given; without --cachedir the destination directory
        # is the current directory.  Compare device and inode numbers.
        destdir_st = os.stat(os.curdir) if destdir_fd is None else os.fstat(destdir_fd)
        lockdir_st = os.fstat(lockdir_fd)
        lockdir_is_destdir = os.path.samestat(destdir_st, lockdir_st)

    session_requests = requests.Session()
    user_agent = common.config.get('User-Agent', None)
    this_script = os.path.basename(__file__)

    rv = 0
    downloads = set()
    for source in sources:
        dl = source.get('download', None)
        script = None if dl is None else dl.get('script', None)
        if script is not None and script != this_script:
            logging.info('Layer "%s" is not for us (%s != %s), skipping',
                         source['layername'],
                         script, this_script)
            continue

        url = None if dl is None else dl.get('url', None)
        if url is None:
            logging.error('Layer "%s" has no source URL, ignoring',
                          source['layername'])
            continue

        cache = source.get('cache', None)
        dest = None if cache is None else cache.get('path', None)
        if dest is None:
            raise Exception('Impossible')
        dest = str(dest) # convert from Path()
        # track destinations rather than URLs: a URL saved under two
        # different paths must be fetched for each of them
        if dest in downloads:
            logging.info('%s was already downloaded, skipping', dest)
            continue

        headers = {}
        if user_agent is not None:
            headers['User-Agent'] = user_agent

        try:
            # create parent directories ('.' when dest has no directory part)
            destdir = os.path.dirname(dest) or os.curdir
            common.makedirs(destdir, mode=0o755, dir_fd=destdir_fd, exist_ok=True, logging=logging)

            # place an exclusive lock on a lockfile as the destination can be used by other layers
            # hence might be updated in parallel
            if lockdir_is_destdir:
                lockfile = dest + '.lck'
            else:
                # use a flat hierarchy when lockdir != destdir as this avoids leaving empty directories
                # behind when removing left overs with tmpfiles.d(5)
                lockfile = sha1(dest.encode('utf-8')).hexdigest() + '.lck'
            # use O_TRUNC to bump lockfile's mtime
            lock_fd = os.open(lockfile, O_WRONLY|O_CREAT|O_TRUNC|O_CLOEXEC, mode=0o644, dir_fd=lockdir_fd)
            try:
                logging.debug('flock("%s", LOCK_EX)', lockfile)
                flock(lock_fd, LOCK_EX)
                try:
                    st = os.stat(dest, dir_fd=destdir_fd)
                except (OSError, ValueError):
                    # the file doesn't exist, or stat() failed for some reason
                    pass
                else:
                    max_age = cache.get('max-age', 6*3600) # 6h
                    if max_age is not None:
                        s = max_age + max(st.st_ctime, st.st_mtime) - time()
                        if s > 0:
                            logging.info('%s: Too young, try again in %s',
                                         dest, format_time(s))
                            continue
                    headers['If-Modified-Since'] = formatdate(timeval=st.st_mtime, localtime=False, usegmt=True)
                max_size = dl.get('max-size', 2**26) # 64MiB
                download(url, dest, dir_fd=destdir_fd, max_size=max_size,
                         headers=headers, session=session_requests)
                downloads.add(dest)
            finally:
                # closing the descriptor also releases the flock()
                os.close(lock_fd)
        except Exception:
            logging.exception('Could not download %s as %s', url, dest)
            if args.exit_code:
                rv = 1
    return rv


if __name__ == '__main__':
    sys.exit(main())