Skip to content
Open
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 82 additions & 16 deletions avocado/utils/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,19 @@
import shutil
import socket
import sys
import tempfile
import time
import typing
import urllib.parse
import urllib.request
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Process
from urllib.error import HTTPError, URLError
from urllib.request import urlopen

from avocado.utils import crypto, output

log = logging.getLogger(__name__)

USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason for this specific user agent?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not really. Servers do not inherently require a User-Agent header to allow a download — the header is technically optional according to the HTTP specifications. However, many servers are configured to block or modify responses for requests that lack a user agent or carry an unrecognized one.



def url_open(url, data=None, timeout=5):
"""
Expand Down Expand Up @@ -83,24 +87,86 @@ def _url_download(url, filename, data):
src_file.close()


def url_download(url, filename, data=None, timeout=300):
def _download_range(
    url: str,
    start: int,
    end: int,
    part_file: str,
    retries: int = 3,
    delay: int = 1,
    timeout: int = 300,
):
    """
    Download the byte range [start, end] of ``url`` into ``part_file``.

    The request is retried up to ``retries`` times on network-related
    failures, sleeping ``delay`` seconds between attempts; the last
    failure is re-raised when all attempts are exhausted.

    :param url: URL to download.
    :param start: first byte of the range (inclusive).
    :param end: last byte of the range (inclusive).
    :param part_file: path of the file that receives the downloaded range.
    :param retries: number of attempts before giving up.
    :param delay: delay in seconds between attempts.
    :param timeout: per-request timeout in seconds.
    :raises: the last network error when every attempt fails.
    """
    for attempt in range(1, retries + 1):
        try:
            req = urllib.request.Request(
                url, headers={"User-Agent": USER_AGENT, "Range": f"bytes={start}-{end}"}
            )
            with urlopen(req, timeout=timeout) as r, open(part_file, "wb") as f:
                # 1MB chunks keep memory usage bounded for large ranges
                shutil.copyfileobj(r, f, length=1024 * 1024)
            return
        except (socket.timeout, TimeoutError, HTTPError, URLError):
            # URLError covers connection resets/refusals, which the previous
            # except clause let escape without being retried.
            if attempt == retries:
                raise
            time.sleep(delay)


def url_download(
    url: str,
    filename: str,
    data: typing.Optional[bytes] = None,
    timeout: int = 300,
    segments: int = 4,
):
    """
    Download ``url`` into ``filename``, splitting the transfer into
    ``segments`` parallel ranged requests when the server supports it.

    Falls back to a plain single-stream download when the content length
    is unknown, when the server does not advertise byte-range support,
    when ``data`` is given (a POST body cannot be combined with range
    requests), or when the file is too small to be worth splitting.

    :param url: URL to download.
    :param filename: destination path for the downloaded file.
    :param data: (optional) data for a POST request.
    :param timeout: (optional) per-request timeout in seconds.
    :param segments: how many parallel range requests to split the
                     download into.
    :return: `None`.
    """
    log.info("Fetching %s -> %s", url, filename)
    try:
        # A HEAD request reveals the size and whether byte ranges are
        # accepted, without transferring the body.
        head_request = urllib.request.Request(
            url, headers={"User-Agent": USER_AGENT}, method="HEAD"
        )
        headers = urlopen(head_request, timeout=timeout).headers
        size = int(headers.get("Content-Length", -1))
        supports_range = "bytes" in headers.get("Accept-Ranges", "").lower()
    except (socket.timeout, TimeoutError, HTTPError, URLError, ValueError):
        # If the probe fails (or Content-Length is malformed), assume the
        # server cannot serve ranges rather than aborting the download.
        size = -1
        supports_range = False

    if size <= 0 or data or not supports_range or segments <= 1 or size < segments:
        # Single-stream fallback: unknown size, no range support, POST
        # data, a non-splitting segment count, or too few bytes to split.
        _url_download(url=url, filename=filename, data=data)
        return

    part_size = size // segments

    # Keep the partial files in a temporary directory so that concurrent
    # or repeated downloads never clobber each other's part files, and so
    # leftovers are cleaned up even when a segment fails mid-way.
    with tempfile.TemporaryDirectory(prefix="avocado-download-") as tmp_dir:

        def fetch_part(index: int) -> str:
            # The last segment absorbs the remainder of the division.
            start = index * part_size
            end = size - 1 if index == segments - 1 else start + part_size - 1
            part_path = os.path.join(tmp_dir, f"part{index}")
            _download_range(
                url=url, start=start, end=end, part_file=part_path, timeout=timeout
            )
            return part_path

        with ThreadPoolExecutor(max_workers=segments) as executor:
            part_paths = list(executor.map(fetch_part, range(segments)))

        # Merge the segments, in order, into the destination file.
        with open(filename, "wb") as merged:
            for part_path in part_paths:
                with open(part_path, "rb") as part:
                    shutil.copyfileobj(part, merged)
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated
Comment thread
SOORAJTS2001 marked this conversation as resolved.
Outdated


def url_download_interactive(url, output_file, title="", chunk_size=102400):
Expand Down
Loading