Add `webmap-download` script.

author: Guilhem Moulin <guilhem@fripost.org> 2024-05-16 20:14:52 +0200
committer: Guilhem Moulin <guilhem@fripost.org> 2024-06-01 19:21:49 +0200
commit: 4fe584a6e3eb9a32977ed545146db267c83f3788 (patch)
tree: f1478bea7cc935693f7a93d112bfc65b7ec8787c
parent: 141ba1a7dac55532b6d4481ede69248d0aa50f78 (diff)
3 files changed, 585 insertions, 0 deletions
diff --git a/common.py b/common.py
new file mode 100644
index 0000000..e4456af
--- /dev/null
+++ b/common.py
@@ -0,0 +1,171 @@
+import os, sys
+from os import path
+from fnmatch import fnmatchcase
+from pathlib import Path, PosixPath
+from urllib.parse import urlparse, urlunparse
+from stat import S_ISDIR
+from xdg.BaseDirectory import xdg_config_home
+import logging
+import yaml
+import __main__ as main
+
+def load_config(path=None, groupnames=None):
+    """Load the YAML configuration, normalize it and store it as `config`.
+
+    path: configuration file to read.  When None, config.yml is looked up
+        in the current directory, then in $XDG_CONFIG_HOME/webmap, then in
+        /etc/webmap, and the first existing file is used.
+    groupnames: optional list of layer-group names (or plain layer names)
+        restricting which layers are kept.  None keeps every layer.
+
+    Each layer definition is normalized in place:
+      - a layer given as a single dictionary is wrapped into a list;
+      - a string source:download value becomes { 'url': <value> };
+      - a string or missing source:cache value becomes { 'path': <value> };
+      - source:cache:path is turned into a Path, with the filename taken
+        from the last component of the URL when the configured path is
+        empty, '.', '..', or ends with a path separator.
+
+    The resulting dictionary is published as the module attribute `config`;
+    nothing is returned.
+
+    Raises FileNotFoundError when no configuration file can be found, and
+    Exception when a filename cannot be inferred from the URL or when two
+    sources share a cache path but differ in URL or download script.
+    Exits with status 1 when a requested name is neither a group nor a layer.
+    """
+    main_script = os.path.basename(main.__file__)
+    if path is None:
+        candidates = [d.joinpath('config.yml') for d in
+                      [Path(),
+                       Path(xdg_config_home).joinpath('webmap'),
+                       PosixPath('/etc').joinpath('webmap')]]
+        for p in candidates:
+            if p.exists():
+                path = str(p)
+                break
+        else:
+            # fail with a clear message instead of the TypeError that
+            # open(None) would raise
+            raise FileNotFoundError('No configuration file found, tried: ' +
+                                    ', '.join(str(p) for p in candidates))
+    with open(path, 'r') as fp:
+        config = yaml.safe_load(fp)
+    if config is None:
+        # an empty YAML document loads as None
+        config = {}
+    layers = config.get('layers', None)
+    if layers is None:
+        # a 'layers:' key holding only comments loads as None
+        layers = {}
+
+    # validate sources
+    destinations = {}
+    for name, layer in layers.items():
+        if isinstance(layer, dict):
+            # single source inlined: wrap it so that every layer is a list
+            layers[name] = layer = [layer]
+
+        for sourcedef in layer:
+            source = sourcedef.get('source', None)
+            if source is None:
+                continue
+            download = source.get('download', None)
+            if download is None:
+                url = None
+                script = None
+            elif isinstance(download, str):
+                # source:download used as an alias for source:download:url
+                url = download
+                script = None
+                source['download'] = download = { 'url': url }
+            else:
+                url = download.get('url', None)
+                script = download.get('script', None)
+            if url is None:
+                urlp = None
+            else:
+                urlp = urlparse(url)
+                if urlp is None:
+                    raise Exception(f'urlparse({url}) failed')
+
+            cache = source.get('cache', None)
+            if cache is None or isinstance(cache, str):
+                # source:cache used as an alias for source:cache:path
+                source['cache'] = { 'path': cache }
+            else:
+                cache = cache.get('path', None)
+
+            if cache is None or cache in ['', os.curdir, os.pardir] or cache.endswith(os.sep):
+                # infer filename from the source URL
+                if urlp is None or urlp.path is None or urlp.path == '' or urlp.path.endswith('/'):
+                    raise Exception(f'Layer "{name}": Could not infer filename from URL {url}')
+                p = PosixPath(urlp.path)
+                if p is None or p.name is None or p.name == '':
+                    raise Exception(f'Invalid PosixPath({urlp.path})')
+                if cache is None or cache == '':
+                    cache = Path()
+                else:
+                    cache = Path(cache)
+                cache = cache.joinpath(p.name)
+            else:
+                cache = Path(cache)
+            source['cache']['path'] = cache
+
+            v = { 'url': urlp, 'script': main_script if script is None else script }
+            if cache in destinations and destinations[cache] != v:
+                # allow destination conflicts, but only when the source URL and script match
+                raise Exception(f'Destination conflict for layer "{name}"')
+            destinations[cache] = v
+
+    # filter layers that are not of interest
+    if groupnames is not None:
+        layernames = []
+        layer_groups = config.get('layer-groups', {})
+        for groupname in groupnames:
+            if groupname not in layer_groups:
+                if groupname in layers:
+                    # fallback to layer names
+                    layernames.append(groupname)
+                else:
+                    logging.error('Unknown group/layer name "%s"', groupname)
+                    # sys.exit() rather than the exit() helper injected by site
+                    sys.exit(1)
+            else:
+                patterns = layer_groups[groupname]
+                if isinstance(patterns, str):
+                    patterns = [patterns]
+                for pat in patterns:
+                    has_match = False
+                    for layername in layers:
+                        if fnmatchcase(layername, pat):
+                            if layername in layernames:
+                                logging.debug('Layer "%s" was already added, skipping', layername)
+                            else:
+                                layernames.append(layername)
+                            has_match = True
+                    if has_match:
+                        pass
+                    elif pat in layers:
+                        # fallback to exact match
+                        if pat in layernames:
+                            logging.debug('Layer "%s" was already added, skipping', pat)
+                        else:
+                            layernames.append(pat)
+                    else:
+                        logging.warning('Group name "%s" does not match anything', groupname)
+
+        # keep only the selected layers, in selection order
+        layers = { name: layers[name] for name in layernames }
+
+    config['layers'] = layers
+    # expose the result as common.config
+    sys.modules[__name__].config = config
+
+
+######
+# The function definitions below are taken from cpython's source code
+# and augmented with dir_fd.
+
+# Is a path a directory?
+# (From genericpath.py.)
+def isdir(path, dir_fd=None, follow_symlinks=True):
+    """Tell whether `path` is a directory.
+
+    `dir_fd` and `follow_symlinks` are handed to os.stat().  A stat()
+    failure (OSError or ValueError) is reported as False.
+    """
+    try:
+        info = os.stat(path, dir_fd=dir_fd, follow_symlinks=follow_symlinks)
+    except (OSError, ValueError):
+        result = False
+    else:
+        result = S_ISDIR(info.st_mode)
+    return result
+
+# Does a path exist?
+# (From genericpath.py.)
+def exists(path, dir_fd=None, follow_symlinks=True):
+    """Tell whether `path` can be stat()'ed.
+
+    `dir_fd` and `follow_symlinks` are handed to os.stat().  A stat()
+    failure (OSError or ValueError) is reported as False.
+    """
+    try:
+        os.stat(path, dir_fd=dir_fd, follow_symlinks=follow_symlinks)
+    except (OSError, ValueError):
+        found = False
+    else:
+        found = True
+    return found
+
+# Create a leaf directory and all intermediate ones.
+# (From os.py.)
+def makedirs(name, mode=0o777, exist_ok=False, dir_fd=None, logging=None):
+    """Create the leaf directory `name` and any missing intermediate ones.
+
+    Variant of os.makedirs() that hands `dir_fd` to the underlying stat()
+    and mkdir() calls.
+
+    name: path of the directory to create.
+    mode: mode passed to os.mkdir() for the leaf.  The recursive calls that
+        create intermediate directories do not forward `mode`, so those use
+        the default 0o777.
+    exist_ok: when true, an OSError from os.mkdir() is ignored provided
+        `name` is a directory.
+    dir_fd: optional directory file descriptor handed to os.stat() and
+        os.mkdir().
+    logging: optional object whose debug() method is called before each
+        mkdir().  It shadows the `logging` module name inside this function.
+
+    Raises OSError when os.mkdir() fails, unless `exist_ok` is true and
+    `name` is a directory.
+    """
+    head, tail = path.split(name)
+    if not tail:
+        # `name` ended with a separator: split again to get the real leaf
+        head, tail = path.split(head)
+    if head and tail and not exists(head, dir_fd=dir_fd):
+        # the parent is missing: create it (and its own parents) first
+        try:
+            makedirs(head, exist_ok=exist_ok, dir_fd=dir_fd, logging=logging)
+        except FileExistsError:
+            # Defeats race condition when another thread created the path
+            pass
+        cdir = os.curdir
+        if isinstance(tail, bytes):
+            cdir = bytes(os.curdir, 'ASCII')
+        if tail == cdir:           # xxx/newdir/. exists if xxx/newdir exists
+            return
+    if logging is not None:
+        logging.debug('mkdir("%s", 0%o)', name, mode)
+    try:
+        os.mkdir(name, mode, dir_fd=dir_fd)
+    except OSError:
+        # Cannot rely on checking for EEXIST, since the operating system
+        # could give priority to other errors like EACCES or EROFS
+        if not exist_ok or not isdir(name, dir_fd=dir_fd):
+            raise
diff --git a/config.yml b/config.yml
new file mode 100644
index 0000000..6aa7abf
--- /dev/null
+++ b/config.yml
@@ -0,0 +1,179 @@
+---
+# Take User-Agent value from Tor Browser 13.0.15 (based on Mozilla Firefox 115.11.0esr)
+User-Agent: 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
+
+# Map group names to one or more patterns of layer names.  This is a
+# convenience feature for systemd template units.
+layer-groups:
+  nvr: 'nvr:*'
+  sks: 'sks:*'
+
+
+layers:
+#  # Keys are layer names in the output dataset.  Values are lists of dictionaries
+#  # specifying source data and conversion information for these layers.  If a
+#  # layer has a single source, then it can be inlined instead.
+#  layer1_name:
+#    - source:
+#        download:
+#          # source:download:url: URL from where to download the source file.
+#          # source:download can be used as an alias when source:download:url is
+#          # its only subkey.
+#          url: 'https://example.net/path/to/layer.zip'
+#          # source:download:max-size: The maximum size to download in bytes.  An
+#          # error is raised when the payload size exceeds this value.
+#          # (Default: 67108864, in other words 64MiB)
+#          max-size: 1073741824
+#          # source:download:script: Basename of the download script to use for
+#          # that layer.  The entry is ignored when the main script doesn't match.
+#          script: webmap-download
+#        cache:
+#          # source:cache:path: Local path (relative to --cachedir) where to
+#          # (atomically) save the downloaded file.  The same path can be used by
+#          # multiple entries as long as their pairs (source:download:url,
+#          # source:download:script) match.  Any parent directories are created if
+#          # needed.
+#          # If the path is empty or ends with a '/' character then it is treated
+#          # as a directory and the last component of source:download:url is
+#          # implicitly used as filename.  In that case an error is raised if no
+#          # filename can be derived from the URL.
+#          # source:cache can be used as an alias when source:cache:path is its
+#          # only subkey.
+#          path: path/to/sub/dir/
+#          # source:cache:max-age: Maximum age for caching, in number of seconds
+#          # ago.  If source:cache:path exists and its mtime and/or ctime is newer
+#          # than this value then no HTTP query is made.
+#          # (Default: 21600, in other words 6h)
+#          max-age: 86400
+
+  'nvr:TILLTRADESFORBUD':
+    source:
+      download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/TILLTRADESFORBUD.zip'
+      cache: naturvardsregistret/
+
+  'nvr:NP':
+    source:
+      download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/NP.zip'
+      cache: naturvardsregistret/
+
+  'nvr:NR':
+    source:
+      download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/NR.zip'
+      cache: naturvardsregistret/
+
+  'nvr:NVO':
+    source:
+      download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/NVO.zip'
+      cache: naturvardsregistret/
+
+  'nvr:DVO':
+    source:
+      download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/DVO.zip'
+      cache: naturvardsregistret/
+
+  'nvr:KR':
+    source:
+      download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/KR.zip'
+      cache: naturvardsregistret/
+
+  'nvr:VSO':
+    source:
+      download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/VSO.zip'
+      cache: naturvardsregistret/
+
+  'nvr:LBSO':
+    source:
+      download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/LBSO.zip'
+      cache: naturvardsregistret/
+
+  'nvr:OBO':
+    source:
+      download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/OBO.zip'
+      cache: naturvardsregistret/
+
+  'nvr:NM':
+    source:
+      download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/NM.zip'
+      cache: naturvardsregistret/
+
+  'nvr:IF':
+    source:
+      download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/IF.zip'
+      cache: naturvardsregistret/
+
+  'nvr:SPA_Rikstackande':
+    source:
+      download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/SPA_Rikstackande.zip'
+      cache: naturvardsregistret/
+
+  'nvr:SCI_Rikstackande':
+    source:
+      download:
+        url: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/SCI_Rikstackande.zip'
+        max-size: 134217728 # 128MiB
+      cache: naturvardsregistret/
+
+  'nvr:HELCOM':
+    source:
+      download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/HELCOM.zip'
+      cache: naturvardsregistret/
+
+  'nvr:Ramsar_2018':
+    source:
+      download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/Ramsar_2018.zip'
+      cache: naturvardsregistret/
+
+  'nvr:OSPAR':
+    source:
+      download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/OSPAR.zip'
+      cache: naturvardsregistret/
+
+  'nvr:Varldsarv':
+    source:
+      download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/Varldsarv.zip'
+      cache: naturvardsregistret/
+
+  'nvr:biosfarsomraden':
+    source:
+      download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/biosfarsomraden.zip'
+      cache: naturvardsregistret/
+
+  'nvr:NVA':
+    source:
+      download: 'https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/NVA.zip'
+      cache: naturvardsregistret/
+
+  'sks:AvverkAnm':
+    source:
+      download:
+        url: 'https://geodpags.skogsstyrelsen.se/geodataport/data/sksAvverkAnm.zip'
+        max-size: 134217728 # 128MiB
+      cache: sks/
+
+  'sks:UtfordAvverk':
+    - source:
+        download:
+          url: 'https://geodpags.skogsstyrelsen.se/geodataport/data/sksUtfordAvverk-2000-2015.zip'
+          max-size: 805306368 # 768MiB
+        cache:
+          path: sks/
+          max-age: 2592000 # 30d
+    - source:
+        download:
+          url: 'https://geodpags.skogsstyrelsen.se/geodataport/data/sksUtfordAvverk-2016-2019.zip'
+          max-size: 805306368 # 768MiB
+        cache:
+          path: sks/
+          max-age: 2592000 # 30d
+    - source:
+        download:
+          url: 'https://geodpags.skogsstyrelsen.se/geodataport/data/sksUtfordAvverk-2020-2022.zip'
+          max-size: 805306368 # 768MiB
+        cache:
+          path: sks/
+          max-age: 864000 # 10d
+    - source:
+        download:
+          url: 'https://geodpags.skogsstyrelsen.se/geodataport/data/sksUtfordAvverk-2023-.zip'
+          max-size: 1073741824 # 1GiB
+        cache: sks/
diff --git a/webmap-download b/webmap-download
new file mode 100755
index 0000000..39c48c2
--- /dev/null
+++ b/webmap-download
@@ -0,0 +1,235 @@
+#!/usr/bin/python3
+
+from os import O_RDONLY, O_WRONLY, O_CREAT, O_TRUNC, O_CLOEXEC, O_PATH, O_DIRECTORY, O_TMPFILE
+import os, sys
+from fcntl import flock, LOCK_EX
+import logging
+from time import time, monotonic as time_monotonic
+import argparse
+import itertools
+from pathlib import Path
+from email.utils import parsedate_to_datetime, formatdate
+from hashlib import sha1
+from math import modf
+import requests
+
+import common
+
+def download_trystream(url, **kwargs):
+    """Issue a streaming GET request for `url`, retrying on network errors.
+
+    Keyword arguments:
+      session: object whose get() method is used (default: the `requests`
+          module itself).
+      max_tries: maximum number of attempts (default: 10).
+    All other keyword arguments are forwarded to get(), which is always
+    called with stream=True.
+
+    Returns the response once a request went through and
+    raise_for_status() did not raise.  Re-raises requests.Timeout or
+    requests.ConnectionError once `max_tries` attempts have failed.
+    """
+    max_tries = kwargs.pop('max_tries', 10)
+    f = kwargs.pop('session', requests)
+    for i in itertools.count(1):
+        try:
+            r = f.get(url, **kwargs, stream=True)
+        except (requests.Timeout, requests.ConnectionError) as e:
+            if i < max_tries:
+                # report what actually failed: connection errors are retried
+                # too, not only timeouts
+                logging.error('%s: %s (attempt %d/%d), retrying',
+                    type(e).__name__, e, i, max_tries)
+                continue
+            raise
+        else:
+            r.raise_for_status()
+            return r
+
+def download(url, dest, dir_fd=None, headers=None, max_size=None, session=requests):
+    """Download `url` and save it as `dest`.
+
+    url: URL to fetch.
+    dest: destination path, interpreted relative to `dir_fd` when given.
+    dir_fd: optional directory file descriptor used for every filesystem
+        operation on `dest` and on its temporary sibling.
+    headers: optional dictionary of request headers.
+    max_size: optional upper bound on the payload size, in bytes.
+    session: object whose get() method performs the request.
+
+    The payload is written to an unnamed O_TMPFILE file in the directory of
+    `dest`, linked as a hidden '.<stem>.new' sibling, then renamed onto
+    `dest`.  When the response carries a parsable Last-Modified header, it
+    is used as atime and mtime of the file.  On 304 Not Modified, nothing
+    is written.
+
+    Raises Exception when the payload exceeds `max_size`, and lets request
+    and OS errors through.
+    """
+    logging.info('Downloading %s…', url)
+    if headers is None:
+        headers = {}
+    destPath = Path(dest)
+    dest_tmp = destPath.with_stem(f'.{destPath.stem}.new').as_posix()
+    try:
+        # delete any leftover
+        os.unlink(dest_tmp, dir_fd=dir_fd)
+    except FileNotFoundError:
+        pass
+
+    start = time_monotonic()
+    r = download_trystream(url, headers=headers, session=session, timeout=30)
+    try:
+        if r.status_code == requests.codes.not_modified:
+            # XXX shouldn't we call os.utime(dest) to bump its ctime here?
+            # otherwise we'll make several queries and get multiple 304
+            # replies if the file is used by multiple layers
+            logging.info('%s: %d Not Modified', dest, r.status_code)
+            return
+
+        last_modified = r.headers.get('Last-Modified', None)
+        if last_modified is not None:
+            try:
+                last_modified = parsedate_to_datetime(last_modified)
+                last_modified = last_modified.timestamp()
+            except ValueError:
+                logging.exception('Could not parse Last-Modified value')
+                last_modified = None
+
+        # XXX we can't use TemporaryFile as it uses O_EXCL, cf.
+        # https://discuss.python.org/t/temporaryfile-contextmanager-that-allows-creating-a-directory-entry-on-success/19094/2
+        # dirname() is empty when dest has no directory component; fall back to
+        # '.' since open("") fails with ENOENT
+        fd = os.open(os.path.dirname(dest) or os.curdir,
+            O_WRONLY|O_CLOEXEC|O_TMPFILE, mode=0o644, dir_fd=dir_fd)
+        with os.fdopen(fd, mode='wb') as fp:
+            size = 0
+            for chunk in r.iter_content(chunk_size=2**16):
+                size = size + len(chunk)
+                if max_size is not None and size > max_size:
+                    raise Exception(f'Payload exceeds max-size ({max_size})')
+                fp.write(chunk)
+            end = time_monotonic()
+
+            # XXX unfortunately there is no way for linkat() to clobber the destination,
+            # so we use a temporary file; it's racy, but thanks to O_TMPFILE better
+            # (shorter race) than if we were dumping chunks in a named file descriptor
+            os.link(f'/proc/self/fd/{fp.fileno()}', dest_tmp,
+                dst_dir_fd=dir_fd, follow_symlinks=True)
+
+        # no need to close fd here, it was taken care of by the context manager above
+    finally:
+        # release the connection on every path, including 304 replies and
+        # oversized payloads where the body is not read to the end
+        r.close()
+
+    try:
+        if last_modified is not None:
+            # XXX os.utime() doesn't work on file descriptors so we set mtime
+            # after linkat() instead
+            os.utime(dest_tmp, times=(last_modified, last_modified),
+                dir_fd=dir_fd, follow_symlinks=False)
+        os.rename(dest_tmp, dest, src_dir_fd=dir_fd, dst_dir_fd=dir_fd)
+
+        elapsed = end - start
+        # guard against a zero interval on coarse clocks
+        rate = int(size/elapsed) if elapsed > 0 else size
+        logging.info("%s: Downloaded %s in %s (%s/s)", dest, format_bytes(size),
+            format_time(elapsed), format_bytes(rate))
+    except Exception:
+        try:
+            os.unlink(dest_tmp, dir_fd=dir_fd)
+        except Exception:
+            pass
+        raise
+
+def format_bytes(n):
+    """Render the byte count `n` as a short human-readable string.
+
+    Values below 768 are shown as plain bytes; larger ones are scaled to
+    kiB, MiB or GiB with two decimals, moving to the next unit at three
+    quarters of it.
+    """
+    if n < 768:
+        return f'{n}B'
+    # (exclusive upper bound, divisor, unit suffix)
+    for bound, divisor, suffix in ((786432, 1024, 'kiB'),
+                                   (805306368, 1048576, 'MiB')):
+        if n < bound:
+            return f'{n/divisor:.2f}{suffix}'
+    return f'{n/1073741824:.2f}GiB'
+
+def format_time(s):
+    """Render a duration of `s` seconds as 'HH:MM:SS.mmm'."""
+    fraction, whole = modf(s)
+    minutes, seconds = divmod(int(whole), 60)
+    hours, minutes = divmod(minutes, 60)
+    # the fractional part is added back so that seconds show milliseconds
+    return '{:02d}:{:02d}:{:06.3f}'.format(hours, minutes, seconds + fraction)
+
+# Script entry point: download (or refresh) the source file of every
+# selected layer into the cache directory, holding an exclusive lock per
+# destination file while doing so.
+if __name__ == '__main__':
+    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
+
+    parser = argparse.ArgumentParser(description='Download or update GIS layers.')
+    parser.add_argument('--cachedir', default=None,
+        help='destination directory for downloaded files (default: .)')
+    parser.add_argument('--lockdir', default=None,
+        help='directory for lock files (default: value of --cachedir option)')
+    parser.add_argument('--debug', action='store_true', help=argparse.SUPPRESS)
+    parser.add_argument('--exit-code', action=argparse.BooleanOptionalAction,
+        help='whether to exit with status 1 in case of download failures')
+    parser.add_argument('groupname', nargs='*', help='Group(s) to process')
+    args = parser.parse_args()
+
+    if args.debug:
+        from http.client import HTTPConnection
+        HTTPConnection.debuglevel = 1
+        logging.getLogger().setLevel(logging.DEBUG)
+        requests_log = logging.getLogger("urllib3")
+        requests_log.setLevel(logging.DEBUG)
+        requests_log.propagate = True
+
+    common.load_config(groupnames=None if args.groupname == [] else args.groupname)
+
+    # flatten the layer definitions into a list of sources, each tagged
+    # with the name of the layer it belongs to
+    sources = []
+    for name, layerdefs in common.config.get('layers', {}).items():
+        for layerdef in layerdefs:
+            sourcedef = layerdef.get('source', {})
+            sourcedef['layername'] = name
+            sources.append(sourcedef)
+
+    # intentionally leave the dirfd open until the program terminates
+    opendir_args = O_RDONLY|O_CLOEXEC|O_PATH|O_DIRECTORY
+    destdir_fd = None       if args.cachedir is None else os.open(args.cachedir, opendir_args)
+    lockdir_fd = destdir_fd if args.lockdir  is None else os.open(args.lockdir, opendir_args)
+
+    if destdir_fd == lockdir_fd:
+        # either both are None (current directory), or --lockdir is unset
+        # and the cache directory descriptor is reused
+        lockdir_is_destdir = True
+    else:
+        # --lockdir was given; without --cachedir the cache directory is the
+        # current directory, which has no descriptor to fstat()
+        destdir_st = os.stat(os.curdir) if destdir_fd is None else os.fstat(destdir_fd)
+        lockdir_st = os.fstat(lockdir_fd)
+        # inode numbers are only unique per device, so compare both
+        lockdir_is_destdir = ((destdir_st.st_dev, destdir_st.st_ino) ==
+                              (lockdir_st.st_dev, lockdir_st.st_ino))
+
+    sessionRequests = requests.Session()
+
+    rv = 0
+    downloads = set()
+    for source in sources:
+        dl = source.get('download', None)
+        script = None if dl is None else dl.get('script', None)
+        if script is not None and script != os.path.basename(__file__):
+            logging.info('Layer "%s" is not for us (%s != %s), skipping',
+                source['layername'],
+                script, os.path.basename(__file__))
+            continue
+
+        url = None if dl is None else dl.get('url', None)
+        if url is None:
+            logging.error('Layer "%s" has no source URL, ignoring',
+                source['layername'])
+            continue
+
+        cache = source.get('cache', None)
+        dest = None if cache is None else cache.get('path', None)
+        if dest is None:
+            raise Exception('Impossible')
+        elif url in downloads:
+            logging.info('%s was already downloaded, skipping', dest)
+            continue
+
+        headers = {}
+        user_agent = common.config.get('User-Agent', None)
+        if user_agent is not None:
+            headers['User-Agent'] = user_agent
+        dest = str(dest) # convert from Path()
+
+        try:
+            # create parent directories; there is nothing to create when the
+            # destination has no directory component
+            destdir = os.path.dirname(dest)
+            if destdir != '':
+                common.makedirs(destdir, mode=0o755, dir_fd=destdir_fd, exist_ok=True, logging=logging)
+
+            # place an exclusive lock on a lockfile as the destination can be used by other layers
+            # hence might be updated in parallel
+            if lockdir_is_destdir:
+                lockfile = dest + '.lck'
+            else:
+                # use a flat hierarchy when lockdir != destdir as this avoids leaving empty directories
+                # behind when removing left overs with tmpfiles.d(5)
+                lockfile = sha1(dest.encode('utf-8')).hexdigest() + '.lck'
+            # use O_TRUNC to bump lockfile's mtime
+            lock_fd = os.open(lockfile, O_WRONLY|O_CREAT|O_TRUNC|O_CLOEXEC, mode=0o644, dir_fd=lockdir_fd)
+            try:
+                logging.debug('flock("%s", LOCK_EX)', lockfile)
+                flock(lock_fd, LOCK_EX)
+                try:
+                    st = os.stat(dest, dir_fd=destdir_fd)
+                except (OSError, ValueError):
+                    # the file doesn't exist, or stat() failed for some reason
+                    pass
+                else:
+                    max_age = cache.get('max-age', 6*3600) # 6h
+                    if max_age is not None:
+                        s = max_age + max(st.st_ctime, st.st_mtime) - time()
+                        if s > 0:
+                            logging.info('%s: Too young, try again in %s',
+                                dest, format_time(s))
+                            # the finally clause below still releases the lock
+                            continue
+                    headers['If-Modified-Since'] = formatdate(timeval=st.st_mtime, localtime=False, usegmt=True)
+                max_size = dl.get('max-size', 2**26) # 64MiB
+                download(url, dest, dir_fd=destdir_fd, max_size=max_size,
+                    headers=headers, session=sessionRequests)
+                downloads.add(url)
+            finally:
+                # closing the descriptor drops the flock()
+                os.close(lock_fd)
+        except Exception:
+            logging.exception('Could not download %s as %s', url, dest)
+            if args.exit_code:
+                rv = 1
+    # sys.exit() rather than the exit() helper injected by site
+    sys.exit(rv)
author	Guilhem Moulin <guilhem@fripost.org>	2024-05-16 20:14:52 +0200
committer	Guilhem Moulin <guilhem@fripost.org>	2024-06-01 19:21:49 +0200
commit	4fe584a6e3eb9a32977ed545146db267c83f3788 (patch)
tree	f1478bea7cc935693f7a93d112bfc65b7ec8787c
parent	141ba1a7dac55532b6d4481ede69248d0aa50f78 (diff)