From ece6ee1ed031d0187fc475c3007d55af5aad74fe Mon Sep 17 00:00:00 2001
From: Mike Saffitz
Date: Sun, 5 Mar 2023 09:56:55 -0800
Subject: [PATCH] Retry file downloads on exception up to 10 times, skip
 already downloaded files.

This addresses common "connection reset by peer" and other potential
connectivity issues on download.

Suggestions for future improvement include a configurable number of
attempts, sleep intervals between attempts, and a common file download
directory (e.g. instead of creating a new one for each export). The
latter would allow more incremental behavior of file downloads.
---
 exporter.py | 39 +++++++++++++++++++++++++++++++--------
 1 file changed, 31 insertions(+), 8 deletions(-)

diff --git a/exporter.py b/exporter.py
index 2149b67..d993cdc 100755
--- a/exporter.py
+++ b/exporter.py
@@ -398,21 +398,44 @@ def parse_replies(threads, users):
     return body
 
 
-def save_files(out_dir):
+def download_file(destination_path, url, attempt=1):
+    if os.path.exists(destination_path):
+        print("Skipping existing %s" % destination_path)
+        return True
+
+    print(f"Downloading file on attempt {attempt} to {destination_path}")
+
+    try:
+        response = requests.get(url, headers=HEADERS, timeout=60)
+        with open(destination_path, "wb") as fh:
+            fh.write(response.content)
+    except Exception as err:
+        print(f"Unexpected error on {destination_path} attempt {attempt}; {err=}, {type(err)=}")
+        return False
+    else:
+        return True
+
+def save_files(file_dir):
     total = 0
     start = default_timer()
     for file_info in get_file_list():
         url = file_info["url_private"]
         file_info["name"] = sanitize_filename(file_info["name"])
         destination_filename = "{id}-{name}".format(**file_info)
-        files_dir = os.path.join(out_dir, "files")
-        os.makedirs(files_dir, exist_ok=True)
-        destination_path = os.path.join(files_dir, destination_filename)
-        print("Downloading file to %s" % destination_path)
-        response = requests.get(url, headers=HEADERS)
-        with open(destination_path, "wb") as fh:
-            fh.write(response.content)
+        os.makedirs(file_dir, exist_ok=True)
+        destination_path = os.path.join(file_dir, destination_filename)
+
+        download_success = False
+        attempt = 1
+        while not download_success and attempt <= 10:
+            download_success = download_file(destination_path, url, attempt)
+            attempt += 1
+
+        if not download_success:
+            raise Exception(f"Failed to download from {url} after {attempt - 1} tries")
+
         total += 1
+
     end = default_timer()
     seconds = int(end - start)
     print("Downloaded %i files in %i seconds" % (total, seconds))