Skip to content
25 changes: 22 additions & 3 deletions hca/dss/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from datetime import datetime
from fnmatch import fnmatchcase
import hashlib
import shutil
import os
import re
import tempfile
Expand Down Expand Up @@ -196,7 +197,8 @@ def download(self,
metadata_files=('*',),
data_files=('*',),
num_retries=10,
min_delay_seconds=0.25):
min_delay_seconds=0.25,
delete_cache=False):
"""
Download a bundle and save it to the local filesystem as a directory.

Expand All @@ -216,6 +218,9 @@ def download(self,
:param int num_retries: The initial quota of download failures to accept before exiting due to
failures. The number of retries increases and decreases as file chunks succeed and fail.
:param float min_delay_seconds: The minimum number of seconds to wait in between retries.
:param bool delete_cache: When downloading files, the folder '.hca' contains duplicate hardlinks that serve as
Comment thread
DailyDreaming marked this conversation as resolved.
Outdated
a cache when downloading. Specifying this option will delete that cache after the
files are downloaded.
Comment thread
DailyDreaming marked this conversation as resolved.
Outdated

Download a bundle and save it to the local filesystem as a directory.
By default, all data and metadata files are downloaded. To disable the downloading of data files,
Expand Down Expand Up @@ -249,14 +254,18 @@ def download(self,
if errors:
raise RuntimeError('{} file(s) failed to download'.format(errors))

if delete_cache:
shutil.rmtree(self._dir_path(download_dir))

# FIXME: Formatting of help messages is broken
def download_manifest(self,
manifest,
replica,
layout='none',
num_retries=10,
min_delay_seconds=0.25,
download_dir=''):
download_dir='',
delete_cache=False):
"""
Process the given manifest file in TSV (tab-separated values) format and download the files referenced by it.

Expand All @@ -272,6 +281,9 @@ def download_manifest(self,
:param float min_delay_seconds: The minimum number of seconds to wait in between retries for downloading any
file
:param str download_dir: The directory into which to download
:param bool delete_cache: When downloading files, the folder '.hca' contains duplicate hardlinks that serve as
Comment thread
DailyDreaming marked this conversation as resolved.
Outdated
a cache when downloading. Specifying this option will delete that cache after the
files are downloaded.
Comment thread
DailyDreaming marked this conversation as resolved.
Outdated

Files are always downloaded to a cache / filestore directory called '.hca'. This directory is created in the
current directory where download is initiated. A copy of the manifest used is also written to the current
Expand Down Expand Up @@ -315,6 +327,9 @@ def download_manifest(self,
else:
raise ValueError('Invalid layout {} not one of [none, bundle]'.format(layout))

if delete_cache:
shutil.rmtree(self._dir_path(download_dir))

def _download_manifest_filestore(self,
manifest,
replica,
Expand Down Expand Up @@ -610,6 +625,10 @@ def _do_download_file(self, dss_file, fh, num_retries, min_delay_seconds):
raise
return hasher.hexdigest()

@classmethod
def _dir_path(cls, download_dir):
    """
    Return the path of the '.hca/v2' cache directory beneath `download_dir`.

    This is the directory under which `_file_path` places downloaded files, and the directory that
    `download` and `download_manifest` remove when called with `delete_cache=True`.

    :param str download_dir: The directory into which files are downloaded. An empty string yields a path
                             relative to the current working directory.
    :return: `download_dir` joined with '.hca' and 'v2'
    :rtype: str
    """
    return os.path.join(download_dir, '.hca', 'v2')

@classmethod
def _file_path(cls, checksum, download_dir):
"""
Expand All @@ -620,7 +639,7 @@ def _file_path(cls, checksum, download_dir):
"""
checksum = checksum.lower()
file_prefix = '_'.join(['files'] + list(map(str, cls.DIRECTORY_NAME_LENGTHS)))
path_pieces = [download_dir, '.hca', 'v2', file_prefix]
path_pieces = [cls._dir_path(download_dir), file_prefix]
checksum_index = 0
assert(sum(cls.DIRECTORY_NAME_LENGTHS) <= len(checksum))
for prefix_length in cls.DIRECTORY_NAME_LENGTHS:
Expand Down