From ccb066a3df16bb6cb88cc3f33b42d7bfe5fa8c4a Mon Sep 17 00:00:00 2001 From: JimmyYang20 Date: Thu, 28 Oct 2021 17:30:55 +0800 Subject: [PATCH] storage initializer: fix s3 download when downloading the specified file, keep the name of the file itself. when downloading the specified folder, keep the name of the folder itself. Signed-off-by: JimmyYang20 --- scripts/storage-initializer/README.md | 8 +++- scripts/storage-initializer/download.py | 63 ++++++++++++++----------- 2 files changed, 42 insertions(+), 29 deletions(-) diff --git a/scripts/storage-initializer/README.md b/scripts/storage-initializer/README.md index 6f4278208..a2133d533 100644 --- a/scripts/storage-initializer/README.md +++ b/scripts/storage-initializer/README.md @@ -12,8 +12,12 @@ python3 download.py s3://models/classification/model.tar.gz /tmp/models/ export S3_ENDPOINT_URL=https://play.min.io export ACCESS_KEY_ID=Q3AM3UQ867SPQQA43P2F export SECRET_ACCESS_KEY=zuf+tfteSlswRu7BJ86wekitnifILbZam1KYY3TG -python3 download.py s3://datasets/mnist /tmp/mnist -# we then download the content of mnist directory into /tmp/mnist/ + +python3 download.py s3://datasets/mnist/1.jpg /tmp +# we then download the file 1.jpg into /tmp, and the result is /tmp/1.jpg. + +python3 download.py s3://datasets/mnist /tmp +# we then download the folder mnist into /tmp, and the result is /tmp/mnist. ``` 3. http server: diff --git a/scripts/storage-initializer/download.py b/scripts/storage-initializer/download.py index 8d86196e5..5fa17e36c 100644 --- a/scripts/storage-initializer/download.py +++ b/scripts/storage-initializer/download.py @@ -175,35 +175,44 @@ def download_s3_with_multi_files(download_files, total_count, base_uri, base_out_dir) -def _download_s3(client, uri, out_dir): - bucket_args = uri.replace(_S3_PREFIX, "", 1).split("/", 1) +def _download_s3(client, s3_url, out_dir): + """ + Download the specified file or folder to a local directory. + This function supports: + 1. 
when downloading the specified file, keep the name of the file itself. + 2. when downloading the specified folder, keep the name of the folder itself. + + Parameters: + client: s3 client + s3_url(string): url in s3, e.g. file url: s3://dev/data/data.txt, directory url: s3://dev/data + out_dir(string): local directory address, e.g. /tmp/data/ + + Returns: + int: number of files downloaded from s3_url + """ + + bucket_args = s3_url.replace(_S3_PREFIX, "", 1).split("/", 1) bucket_name = bucket_args[0] - bucket_path = len(bucket_args) > 1 and bucket_args[1] or "" - - objects = client.list_objects(bucket_name, - prefix=bucket_path, - recursive=True, - use_api_v1=True) - count = 0 - - for obj in objects: - # Replace any prefix from the object key with out_dir - subdir_object_key = obj.object_name[len(bucket_path):].strip("/") - # fget_object handles directory creation if does not exist - if not obj.is_dir: - local_file = os.path.join( - out_dir, - subdir_object_key or os.path.basename(obj.object_name) - ) - LOG.debug("downloading count:%d, file:%s", - count, subdir_object_key) - client.fget_object(bucket_name, obj.object_name, local_file) - _extract_compress(local_file, out_dir) - - count += 1 - - return count + bucket_path = len(bucket_args) > 1 and os.path.normpath(bucket_args[1]) or "" + objects = client.list_objects(bucket_name, prefix=bucket_path, use_api_v1=True) + for o in objects: + if not o.is_dir: + client.fget_object(bucket_name, o.object_name, os.path.join(out_dir, os.path.basename(o.object_name))) + return 1 + else: + count = 0 + objects = client.list_objects(bucket_name, prefix=bucket_path, recursive=True, use_api_v1=True) + root_path, _ = os.path.split(os.path.normpath(bucket_path)) + for obj in objects: + if not obj.is_dir: + object_file_path = os.path.join(out_dir, os.path.relpath(obj.object_name, root_path)) + client.fget_object(bucket_name, obj.object_name, object_file_path) + count += 1 + + return count + + return 0 def download_local(uri, out_dir=None): 
local_path = uri.replace(_LOCAL_PREFIX, "/", 1)