From db733af9a4531f56305956b5d2dec87d93ab8125 Mon Sep 17 00:00:00 2001
From: Matthijs Piek <2212627+MatthijsPiek@users.noreply.github.com>
Date: Thu, 9 Nov 2023 11:04:20 +0100
Subject: [PATCH] Reduce the amount of ListRequests for a recursive put for
 targets not ending in "/"

---
 s3fs/core.py            | 44 ++++++++++++++++++++---------------------
 s3fs/tests/test_s3fs.py | 37 ++++++++++++++++++++++++++++------
 2 files changed, 53 insertions(+), 28 deletions(-)

diff --git a/s3fs/core.py b/s3fs/core.py
index 5d0247f4..d2d7d159 100644
--- a/s3fs/core.py
+++ b/s3fs/core.py
@@ -718,7 +718,7 @@ async def _lsdir(
             except ClientError as e:
                 raise translate_boto_error(e)
 
-            if delimiter and files and not versions:
+            if delimiter and not versions:
                 self.dircache[path] = files
             return files
         return self.dircache[path]
@@ -1001,13 +1001,18 @@ async def _ls(self, path, detail=False, refresh=False, versions=False):
     def _exists_in_cache(self, path, bucket, key, version_id):
         fullpath = "/".join((bucket, key))
 
+        entries = None
         try:
             entries = self._ls_from_cache(fullpath)
         except FileNotFoundError:
-            return False
+            pass
 
         if entries is None:
-            return None
+            children = [key for key in self.dircache.keys() if key.startswith(fullpath)]
+            if len(children) == 0:
+                return None
+            else:
+                return True  # if children exist, the parent also exists
 
         if not self.version_aware or version_id is None:
             return True
@@ -1036,27 +1041,22 @@ async def _exists(self, path):
                 return True
             except FileNotFoundError:
                 return False
-        elif self.dircache.get(bucket, False):
-            return True
         else:
-            try:
-                if self._ls_from_cache(bucket):
-                    return True
-            except FileNotFoundError:
-                # might still be a bucket we can access but don't own
-                pass
-            try:
-                await self._call_s3(
-                    "list_objects_v2", MaxKeys=1, Bucket=bucket, **self.req_kw
-                )
-                return True
-            except Exception:
-                pass
-            try:
-                await self._call_s3("get_bucket_location", Bucket=bucket, **self.req_kw)
+            if self._exists_in_cache(path, bucket, key, version_id):
                 return True
-            except Exception:
-                return False
+            else:
+                try:
+                    await self._call_s3(
+                        "list_objects_v2", MaxKeys=1, Bucket=bucket, **self.req_kw
+                    )
+                    return True
+                except Exception:
+                    pass
+                try:
+                    await self._call_s3("get_bucket_location", Bucket=bucket, **self.req_kw)
+                    return True
+                except Exception:
+                    return False
 
     exists = sync_wrapper(_exists)
 
diff --git a/s3fs/tests/test_s3fs.py b/s3fs/tests/test_s3fs.py
index b243ff1f..af248431 100644
--- a/s3fs/tests/test_s3fs.py
+++ b/s3fs/tests/test_s3fs.py
@@ -2613,11 +2613,12 @@ def test_get_directory_recursive(s3, tmpdir):
     assert target_fs.find(target) == [os.path.join(target, "file")]
 
 
-def test_put_directory_recursive(s3, tmpdir):
+def test_put_directory_recursive(s3, tmpdir, request_count):
     src = os.path.join(tmpdir, "src")
-    src_file = os.path.join(src, "file")
+    deep_src_path = os.path.join(src, "very", "deep")
+    src_file = os.path.join(deep_src_path, "file")
     source_fs = fsspec.filesystem("file")
-    source_fs.mkdir(src)
+    source_fs.mkdir(deep_src_path)
     source_fs.touch(src_file)
 
     target = test_bucket_name + "/target"
@@ -2629,18 +2630,42 @@ def test_put_directory_recursive(s3, tmpdir, request_count):
         assert s3.isdir(target)
 
         if loop == 0:
-            assert s3.find(target) == [target + "/file"]
+            assert s3.find(target) == [target + "/very/deep/file"]
         else:
-            assert sorted(s3.find(target)) == [target + "/file", target + "/src/file"]
+            assert sorted(s3.find(target)) == [
+                target + "/src/very/deep/file",
+                target + "/very/deep/file",
+            ]
 
     s3.rm(target, recursive=True)
 
     # put with slash
     assert not s3.exists(target)
+
     for loop in range(2):
+        s3.dircache.clear()
+        request_count.clear()
         s3.put(src + "/", target, recursive=True)
+
+        assert request_count.get("PutObject", 0) == 1
+        assert request_count.get("ListObjectsV2", 0) <= 1
         assert s3.isdir(target)
-        assert s3.find(target) == [target + "/file"]
+        assert s3.find(target) == [target + "/very/deep/file"]
+
+
+@pytest.fixture()
+def request_count(s3):
+    count = {}
+
+    def count_requests(request, operation_name, **kwargs):
+        nonlocal count
+        count[operation_name] = count.get(operation_name, 0) + 1
+
+    s3.s3.meta.events.register("request-created.s3", count_requests)
+
+    yield count
+
+    s3.s3.meta.events.unregister("request-created.s3", count_requests)
 
 
 def test_cp_two_files(s3):