Diffstat (limited to 'webmap-download')
-rwxr-xr-x  webmap-download  88
1 file changed, 58 insertions(+), 30 deletions(-)
diff --git a/webmap-download b/webmap-download
index 917f178..2d31a19 100755
--- a/webmap-download
+++ b/webmap-download
@@ -18,8 +18,12 @@
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 #----------------------------------------------------------------------
 
+# pylint: disable=invalid-name,missing-module-docstring
+# pylint: enable=invalid-name
+
 from os import O_RDONLY, O_WRONLY, O_CREAT, O_TRUNC, O_CLOEXEC, O_PATH, O_DIRECTORY, O_TMPFILE
-import os, sys
+import os
+import sys
 from fcntl import flock, LOCK_EX
 import logging
 from time import time, monotonic as time_monotonic
@@ -28,34 +32,53 @@ import itertools
 from pathlib import Path
 from email.utils import parsedate_to_datetime, formatdate
 from hashlib import sha256
+from typing import Any, Optional, NoReturn, Never
 
 import requests
 
 import common
 
-def download_trystream(url, **kwargs):
-    max_tries = 10
-    f = kwargs.pop('session', requests)
+def download_trystream(url : str, **kwargs) -> requests.Response:
+    """GET a url, trying a number of times. Return immediately after the
+    first chunk is received"""
+
+    max_retries = kwargs.pop('max_retries', 10)
+    f = kwargs.pop('session', None)
+    if f is None:
+        f = requests
     for i in itertools.count(1):
         try:
             r = f.get(url, **kwargs, stream=True)
         except (requests.Timeout, requests.ConnectionError):
-            if i < max_tries:
+            if i < max_retries:
                 logging.error('timeout')
                 continue
             raise
-        else:
-            r.raise_for_status()
-            return r
+        r.raise_for_status()
+        return r
+
+class DownloadTooLarge(Exception):
+    """Exception raised when a downloaded file exceeds max-size"""
+    def __init__(self, max_size : int) -> Never:
+        super().__init__(f'Payload exceeds max-size ({max_size})')
 
-def download(url, dest, dir_fd=None, headers={}, session=requests, progress=None):
+# pylint: disable-next=dangerous-default-value
+def download(dest : str,
+             dl : Optional[dict[str, Any]],
+             dir_fd : Optional[int] = None,
+             headers : dict[str, str] = {},
+             session : Optional[requests.sessions.Session] = None,
+             progress = None) -> None:
+    """Process a single download recipe"""
+
+    url = None if dl is None else dl.get('url', None)
     if url is None:
         logging.error('%s has no source URL, ignoring', dest)
         return
     max_size = dl.get('max-size', 2**26) # 64MiB
     logging.info('Downloading %s…', url)
-    destPath = Path(dest)
-    dest_tmp = str(destPath.with_stem(f'.{destPath.stem}.new'))
+    dest_path = Path(dest)
+    dest_tmp = str(dest_path.with_stem(f'.{dest_path.stem}.new'))
     try:
         # delete any leftover
         os.unlink(dest_tmp, dir_fd=dir_fd)
@@ -64,7 +87,7 @@ def download(url, dest, dir_fd=None, headers={}, session=requests, progress=None
     start = time_monotonic()
     r = download_trystream(url, headers=headers, session=session,
                            timeout=30)
-    if r.status_code == requests.codes.not_modified:
+    if r.status_code == 304:
         # XXX shouldn't we call os.utime(dest) to bump its ctime here?
         # otherwise we'll make several queries and get multiple 304
         # replies if the file is used by multiple layers
@@ -103,12 +126,12 @@ def download(url, dest, dir_fd=None, headers={}, session=requests, progress=None
                 pbar.update(chunk_size)
             size += chunk_size
             if max_size is not None and size > max_size:
-                raise Exception(f'Payload exceeds max-size ({max_size})')
+                raise DownloadTooLarge(max_size)
             fp.write(chunk)
     r = None
 
     if last_modified is not None:
-        os.utime(fd, times=(last_modified, last_modified), follow_symlinks=True)
+        os.utime(fd, times=(last_modified, last_modified))
 
     # XXX unfortunately there is no way for linkat() to clobber the destination,
     # so we use a temporary file; it's racy, but thanks to O_TMPFILE better
@@ -129,10 +152,12 @@ def download(url, dest, dir_fd=None, headers={}, session=requests, progress=None
             raise e
 
     elapsed = time_monotonic() - start
-    logging.info("%s: Downloaded %s in %s (%s/s)", dest, common.format_bytes(size),
-                 common.format_time(elapsed), common.format_bytes(int(size/elapsed)))
+    logging.info('%s: Downloaded %s in %s (%s/s)', dest,
+                 common.format_bytes(size),
+                 common.format_time(elapsed),
+                 common.format_bytes(int(size/elapsed)))
 
 
-if __name__ == '__main__':
+def main() -> NoReturn: # pylint: disable=missing-function-docstring
     common.init_logger(app=os.path.basename(__file__), level=logging.INFO)
     parser = argparse.ArgumentParser(description='Download or update GIS layers.')
@@ -152,16 +177,16 @@
     if args.debug > 0:
         logging.getLogger().setLevel(logging.DEBUG)
     if args.debug > 1:
-        from http.client import HTTPConnection
+        from http.client import HTTPConnection # pylint: disable=import-outside-toplevel
        HTTPConnection.debuglevel = 1
-        requests_log = logging.getLogger("urllib3")
+        requests_log = logging.getLogger('urllib3')
         requests_log.setLevel(logging.DEBUG)
         requests_log.propagate = True
 
-    common.load_config(groupnames=None if args.groupname == [] else args.groupname)
+    config = common.parse_config(groupnames=None if args.groupname == [] else args.groupname)
 
     sources = []
-    for name, layerdefs in common.config.get('layers', {}).items():
+    for name, layerdefs in config.get('layers', {}).items():
         for layerdef in layerdefs['sources']:
             sourcedef = layerdef.get('source', {})
             sourcedef['layername'] = name
@@ -170,7 +195,7 @@
     if args.quiet or not sys.stderr.isatty():
         pbar = None
     else:
-        from tqdm import tqdm
+        from tqdm import tqdm # pylint: disable=import-outside-toplevel
         pbar = tqdm
 
     # intentionally leave the dirfd open until the program terminates
@@ -178,7 +203,7 @@
     destdir_fd = os.open(args.cachedir, opendir_args)
     lockdir_fd = None if args.lockdir is None else os.open(args.lockdir, opendir_args)
 
-    sessionRequests = requests.Session()
+    session_requests = requests.Session()
 
     rv = 0
     downloads = set()
@@ -202,21 +227,22 @@
             continue
 
         headers = {}
-        user_agent = common.config.get('User-Agent', None)
+        user_agent = config.get('User-Agent', None)
         if user_agent is not None:
             headers['User-Agent'] = user_agent
 
         try:
             # create parent directories
             destdir = os.path.dirname(dest)
-            common.makedirs(destdir, mode=0o755, dir_fd=destdir_fd, exist_ok=True, logging=logging)
+            common.makedirs(destdir, mode=0o755, dir_fd=destdir_fd, exist_ok=True)
 
             # place an exclusive lock on a lockfile as the destination can be used by other layers
             # hence might be updated in parallel
             if lockdir_fd is not None:
                 lockfile = sha256(dest.encode('utf-8')).hexdigest() + '.lck'
                 # use O_TRUNC to bump lockfile's mtime
-                lock_fd = os.open(lockfile, O_WRONLY|O_CREAT|O_TRUNC|O_CLOEXEC, mode=0o644, dir_fd=lockdir_fd)
+                lock_fd = os.open(lockfile, O_WRONLY|O_CREAT|O_TRUNC|O_CLOEXEC, mode=0o644,
+                                  dir_fd=lockdir_fd)
             try:
                 if lockdir_fd is not None:
                     logging.debug('flock("%s", LOCK_EX)', lockfile)
@@ -235,16 +261,18 @@
                             dest, common.format_time(s))
                     continue
                 headers['If-Modified-Since'] = formatdate(timeval=st.st_mtime, localtime=False, usegmt=True)
-                fetch(dl, dest, dir_fd=destdir_fd,
-                      headers=headers, session=sessionRequests,
+                fetch(dest, dl, dir_fd=destdir_fd,
+                      headers=headers, session=session_requests,
                       progress=pbar)
                 downloads.add(dest)
             finally:
                 if lockdir_fd is not None:
                     os.close(lock_fd)
-        except Exception:
+        except Exception: # pylint: disable=broad-exception-caught
            logging.exception('Could not download %s as %s',
                              dl.get('url', source['layername']), dest)
            if args.exit_code:
                rv = 1
-    exit(rv)
+    sys.exit(rv)
+
+main()
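
Note: the conditional-GET and size-capping logic that this commit touches condenses
to the standalone sketch below. The URL, filenames, chunk size, and the plain
os.rename() (standing in for the script's dirfd and tmpfile handling) are
illustrative only; DownloadTooLarge is restated so the snippet runs on its own.

from email.utils import formatdate
import os
import requests

class DownloadTooLarge(Exception):
    """Exception raised when a downloaded file exceeds max-size"""
    def __init__(self, max_size):
        super().__init__(f'Payload exceeds max-size ({max_size})')

url = 'https://example.org/layer.zip'  # stand-in URL
dest = 'layer.zip'                     # stand-in destination
headers = {}
if os.path.exists(dest):
    # RFC 7231 wants an IMF-fixdate in GMT, which formatdate()
    # produces with usegmt=True
    headers['If-Modified-Since'] = formatdate(
        timeval=os.stat(dest).st_mtime, localtime=False, usegmt=True)

r = requests.get(url, headers=headers, stream=True, timeout=30)
if r.status_code == 304:
    print('not modified, keeping cached copy')
else:
    r.raise_for_status()
    size, max_size = 0, 2**26  # 64MiB, the script's default max-size
    with open(f'.{dest}.new', 'wb') as fp:
        # stream=True defers the body, so the cap aborts the transfer
        # early instead of buffering an oversized payload in memory
        for chunk in r.iter_content(chunk_size=2**16):
            size += len(chunk)
            if size > max_size:
                raise DownloadTooLarge(max_size)
            fp.write(chunk)
    os.rename(f'.{dest}.new', dest)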
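The XXX comment about linkat() refers to the Linux O_TMPFILE idiom: the payload
is written to an unnamed file and only given a name once complete, so readers
never observe a partial download. Because linkat() refuses to replace an
existing name, the file must first be linked under a temporary name and then
rename()d over the destination, which is the small race the comment concedes.
A sketch of the idiom, assuming Linux and a filesystem with O_TMPFILE support;
write_atomically is a hypothetical helper, not part of the script.

import os

def write_atomically(dirpath, name, data):
    """Write data under dirpath/name without ever exposing a partial file."""
    dir_fd = os.open(dirpath, os.O_RDONLY | os.O_DIRECTORY | os.O_CLOEXEC)
    try:
        # anonymous file in the target directory, invisible until linked
        fd = os.open('.', os.O_WRONLY | os.O_TMPFILE, mode=0o644, dir_fd=dir_fd)
        try:
            os.write(fd, data)
            tmp = f'.{name}.new'
            try:
                os.unlink(tmp, dir_fd=dir_fd)  # delete any leftover
            except FileNotFoundError:
                pass
            # materialize the anonymous file; going through /proc/self/fd is
            # the documented unprivileged route (linkat with AT_EMPTY_PATH
            # would need CAP_DAC_READ_SEARCH)
            os.link(f'/proc/self/fd/{fd}', tmp, dst_dir_fd=dir_fd)
            # rename() clobbers atomically, which linkat() cannot do
            os.rename(tmp, name, src_dir_fd=dir_fd, dst_dir_fd=dir_fd)
        finally:
            os.close(fd)
    finally:
        os.close(dir_fd)

write_atomically('.', 'layer.zip', b'payload')  # usage example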
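The per-destination locking is likewise self-contained. A destination can be
shared by several layers and updated in parallel, so each worker takes an
exclusive flock() on a lockfile named after the SHA-256 of the destination
path. A minimal sketch, with the lock directory and destination as placeholder
values:

from fcntl import flock, LOCK_EX
from hashlib import sha256
import os

os.makedirs('/tmp/webmap-locks', exist_ok=True)  # placeholder lock directory
lockdir_fd = os.open('/tmp/webmap-locks',
                     os.O_RDONLY | os.O_DIRECTORY | os.O_CLOEXEC)
dest = 'cache/layer.zip'  # placeholder destination path

# hashing flattens nested destination paths into fixed-length, flat names
lockfile = sha256(dest.encode('utf-8')).hexdigest() + '.lck'
# O_TRUNC bumps the lockfile's mtime, recording when it was last taken
lock_fd = os.open(lockfile, os.O_WRONLY | os.O_CREAT | os.O_TRUNC | os.O_CLOEXEC,
                  mode=0o644, dir_fd=lockdir_fd)
try:
    flock(lock_fd, LOCK_EX)  # blocks while another process holds the lock
    # ... download or update dest here ...
finally:
    os.close(lock_fd)  # closing the descriptor releases the lock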