storage initializer: fix s3 download
When downloading the specified file, keep the name of the file itself.
When downloading the specified folder, keep the name of the folder itself.

Signed-off-by: JimmyYang20 <[email protected]>
JimmyYang20 committed Oct 29, 2021
1 parent 1c5c537 commit ccb066a
Showing 2 changed files with 42 additions and 29 deletions.
8 changes: 6 additions & 2 deletions scripts/storage-initializer/README.md
@@ -12,8 +12,12 @@ python3 download.py s3://models/classification/model.tar.gz /tmp/models/
 export S3_ENDPOINT_URL=https://play.min.io
 export ACCESS_KEY_ID=Q3AM3UQ867SPQQA43P2F
 export SECRET_ACCESS_KEY=zuf+tfteSlswRu7BJ86wekitnifILbZam1KYY3TG
-python3 download.py s3://datasets/mnist /tmp/mnist
-# we then download the content of mnist directory into /tmp/mnist/
+
+python3 download.py s3://datasets/mnist/1.jpg /tmp
+# we then download the file 1.jpg into /tmp, and the result is /tmp/1.jpg.
+
+python3 download.py s3://datasets/mnist /tmp
+# we then download the folder mnist into /tmp, and the result is /tmp/mnist.

 ```
 3. http server:
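To make the new behaviour concrete, here is a small, self-contained sketch of how the two example commands above map S3 object names to local paths after this change. It is not part of the commit; the helper names and sample object paths are illustrative only, mirroring the path logic of the fixed download.py:

```python
import os

def file_destination(out_dir, object_name):
    # s3://datasets/mnist/1.jpg -> keep only the file's own name under out_dir
    return os.path.join(out_dir, os.path.basename(object_name))

def folder_destination(out_dir, bucket_path, object_name):
    # s3://datasets/mnist -> keep paths relative to the folder's parent,
    # so the folder name itself survives under out_dir
    root_path, _ = os.path.split(os.path.normpath(bucket_path))
    return os.path.join(out_dir, os.path.relpath(object_name, root_path))

print(file_destination("/tmp", "mnist/1.jpg"))             # /tmp/1.jpg
print(folder_destination("/tmp", "mnist", "mnist/1.jpg"))  # /tmp/mnist/1.jpg
print(folder_destination("/tmp", "mnist", "mnist/2.jpg"))  # /tmp/mnist/2.jpg
```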
63 changes: 36 additions & 27 deletions scripts/storage-initializer/download.py
@@ -175,35 +175,44 @@ def download_s3_with_multi_files(download_files,
                                     total_count, base_uri, base_out_dir)


-def _download_s3(client, uri, out_dir):
-    bucket_args = uri.replace(_S3_PREFIX, "", 1).split("/", 1)
+def _download_s3(client, s3_url, out_dir):
+    """
+    Download the specified file or folder to a local directory.
+    This function supports:
+    1. when downloading the specified file, keep the name of the file itself.
+    2. when downloading the specified folder, keep the name of the folder itself.
+
+    Parameters:
+    client: s3 client
+    s3_url (string): S3 URL, e.g. file URL: s3://dev/data/data.txt, directory URL: s3://dev/data
+    out_dir (string): local directory, e.g. /tmp/data/
+    Returns:
+    int: the number of files under s3_url
+    """
+
+    bucket_args = s3_url.replace(_S3_PREFIX, "", 1).split("/", 1)
     bucket_name = bucket_args[0]
-    bucket_path = len(bucket_args) > 1 and bucket_args[1] or ""
-
-    objects = client.list_objects(bucket_name,
-                                  prefix=bucket_path,
-                                  recursive=True,
-                                  use_api_v1=True)
-    count = 0
-
-    for obj in objects:
-        # Replace any prefix from the object key with out_dir
-        subdir_object_key = obj.object_name[len(bucket_path):].strip("/")
-        # fget_object handles directory creation if does not exist
-        if not obj.is_dir:
-            local_file = os.path.join(
-                out_dir,
-                subdir_object_key or os.path.basename(obj.object_name)
-            )
-            LOG.debug("downloading count:%d, file:%s",
-                      count, subdir_object_key)
-            client.fget_object(bucket_name, obj.object_name, local_file)
-            _extract_compress(local_file, out_dir)
-
-            count += 1
-
-    return count
+    bucket_path = len(bucket_args) > 1 and os.path.normpath(bucket_args[1]) or ""
+
+    objects = client.list_objects(bucket_name, prefix=bucket_path, use_api_v1=True)
+    for o in objects:
+        if not o.is_dir:
+            client.fget_object(bucket_name, o.object_name, os.path.join(out_dir, os.path.basename(o.object_name)))
+            return 1
+        else:
+            count = 0
+            objects = client.list_objects(bucket_name, prefix=bucket_path, recursive=True, use_api_v1=True)
+            root_path, _ = os.path.split(os.path.normpath(bucket_path))
+            for obj in objects:
+                if not obj.is_dir:
+                    object_file_path = os.path.join(out_dir, os.path.relpath(obj.object_name, root_path))
+                    client.fget_object(bucket_name, obj.object_name, object_file_path)
+                    count += 1
+
+            return count
+
+    return 0


 def download_local(uri, out_dir=None):
     local_path = uri.replace(_LOCAL_PREFIX, "/", 1)

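For reference, a minimal, hypothetical driver for the updated function, assuming the MinIO Python client (`minio`) that the script already uses and the play.min.io credentials from the README. Importing `download` as a module and calling the private `_download_s3` directly are illustrative shortcuts, not part of the commit:

```python
import os
from urllib.parse import urlparse

from minio import Minio

# Assumes scripts/storage-initializer/download.py is importable as `download`.
from download import _download_s3


def client_from_env():
    # Build a MinIO client from the environment variables shown in the README.
    endpoint = urlparse(os.environ["S3_ENDPOINT_URL"])
    return Minio(
        endpoint.netloc,
        access_key=os.environ["ACCESS_KEY_ID"],
        secret_key=os.environ["SECRET_ACCESS_KEY"],
        secure=(endpoint.scheme == "https"),
    )


if __name__ == "__main__":
    client = client_from_env()
    # Single object: the file keeps its own name, landing at /tmp/1.jpg.
    _download_s3(client, "s3://datasets/mnist/1.jpg", "/tmp")
    # Folder prefix: the folder name is kept, so files land under /tmp/mnist/.
    _download_s3(client, "s3://datasets/mnist", "/tmp")
```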