-
Notifications
You must be signed in to change notification settings - Fork 370
Multi-part download for url_download in utils/download.py
#6247
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 4 commits
3aa3af6
52656fa
c97438c
a9d7c22
63a6aec
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,15 +22,19 @@ | |
import shutil
import socket
import sys
import time
import typing
import urllib.error
import urllib.parse
import urllib.request

from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Process
from urllib.error import HTTPError
from urllib.request import urlopen
|
|
||
| from avocado.utils import crypto, output | ||
|
|
||
| log = logging.getLogger(__name__) | ||
|
|
||
# Browser-like User-Agent sent with every request: a UA header is optional per
# the HTTP spec, but many servers block or alter responses for requests with a
# missing or unrecognized User-Agent.
USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
|
||
|
|
||
| def url_open(url, data=None, timeout=5): | ||
| """ | ||
|
|
@@ -83,24 +87,86 @@ def _url_download(url, filename, data): | |
| src_file.close() | ||
|
|
||
|
|
||
| def url_download(url, filename, data=None, timeout=300): | ||
def _download_range(
    url: str,
    start: int,
    end: int,
    part_file: str,
    retries: int = 3,
    delay: int = 1,
    timeout: int = 300,
):
    """
    Download the byte range [start, end] of a URL into a part file.

    :param url: URL to download.
    :param start: first byte of the range (inclusive).
    :param end: last byte of the range (inclusive).
    :param part_file: path of the file that receives this range.
    :param retries: number of attempts before giving up.
    :param delay: seconds to sleep between failed attempts.
    :param timeout: per-request timeout in seconds.
    :raises: the last network error when all attempts fail.
    """
    for attempt in range(1, retries + 1):
        try:
            req = urllib.request.Request(
                url,
                headers={"User-Agent": USER_AGENT, "Range": f"bytes={start}-{end}"},
            )
            # NOTE(review): a server may ignore Range and answer 200 with the
            # full body; callers are expected to verify range support first.
            with urlopen(req, timeout=timeout) as response, open(part_file, "wb") as out:
                shutil.copyfileobj(response, out, length=1024 * 1024)  # 1MB chunks
            return
        # URLError also covers transient connection failures (DNS errors,
        # connection resets) which would otherwise escape the retry loop.
        except (socket.timeout, TimeoutError, HTTPError, urllib.error.URLError):
            if attempt == retries:
                raise  # re-raise the final failure with its original traceback
            time.sleep(delay)
|
|
||
|
|
||
def url_download(
    url: str,
    filename: str,
    data: typing.Optional[bytes] = None,
    timeout: int = 300,
    segments: int = 4,
):
    """
    Multipart downloader: splits the transfer into byte-range segments
    fetched in parallel by a thread pool, then merges them in order.

    Falls back to a plain single-stream download when the server does not
    report a size, does not advertise byte-range support, or when ``data``
    is given (a POST request cannot be split).

    :param url: URL to download.
    :param filename: destination path for the downloaded file.
    :param data: (optional) data for a POST request.
    :param timeout: (optional) per-request timeout in seconds.
    :param segments: number of parallel byte-range segments.
    :return: `None`.
    """
    log.info("Fetching %s -> %s", url, filename)
    # Probe size and range support with a HEAD request.  Any network failure
    # here only degrades to the single-stream path instead of aborting.
    try:
        head_response = urllib.request.urlopen(
            urllib.request.Request(url, method="HEAD"), timeout=timeout
        )
        headers = head_response.headers
        size = int(headers.get("Content-Length", -1))
        supports_range = "bytes" in headers.get("Accept-Ranges", "").lower()
    except (socket.timeout, TimeoutError, HTTPError, urllib.error.URLError):
        size = -1
        supports_range = False

    if size <= 0 or data or not supports_range:
        # Unknown size, no range support, or a POST request: fall back to a
        # single download with urlopen.
        _url_download(url=url, filename=filename, data=data)
        return

    part_size = size // segments

    def _fetch_segment(index: int):
        # The last segment absorbs the remainder of the integer division.
        start = index * part_size
        end = size - 1 if index == segments - 1 else start + part_size - 1
        # NOTE(review): part files are created next to the destination; any
        # pre-existing "<filename>.partN" file will be overwritten.
        part_file = f"{filename}.part{index}"
        _download_range(
            url=url, start=start, end=end, part_file=part_file, timeout=timeout
        )
        return part_file

    with ThreadPoolExecutor(max_workers=segments) as executor:
        part_files = list(executor.map(_fetch_segment, range(segments)))

    # Merge the split files in order and remove them.
    with open(filename, "wb") as merged:
        for part in part_files:
            with open(part, "rb") as part_fd:
                shutil.copyfileobj(part_fd, merged)
            os.remove(part)
|
coderabbitai[bot] marked this conversation as resolved.
Outdated
SOORAJTS2001 marked this conversation as resolved.
Outdated
|
||
|
|
||
|
|
||
| def url_download_interactive(url, output_file, title="", chunk_size=102400): | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a reason for this specific user agent?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not really. Servers do not inherently require a user agent to enable a download, as the User-Agent header is technically optional according to the HTTP specification. However, many servers are configured to block or modify responses for requests without a user agent, or with an unrecognized one.