From d0ac6fe228217b7429893f634eb7e07793aabe9b Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Fri, 25 Jun 2021 02:03:57 -0700 Subject: [PATCH] Move check on file existence in CAS to backend for better efficiency --- metaflow/datastore/content_addressed_store.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/metaflow/datastore/content_addressed_store.py b/metaflow/datastore/content_addressed_store.py index c7352e502cf..8a854a82efb 100644 --- a/metaflow/datastore/content_addressed_store.py +++ b/metaflow/datastore/content_addressed_store.py @@ -83,9 +83,12 @@ def save_blobs(self, blobs, raw=False): results.append(self.save_blobs_result( uri=self._backend.full_uri(path) if raw else None, key=sha)) - if self._backend.is_file(path): - # This already exists in the backing datastore so we can skip it - continue + # We will not check if the file exists, doing it in the backend + # directly as it will most likely be more efficient (in the S3 + # backend, this is done in parallel for example). We do pay the + # additional packing cost but this should be minor compared to + # an S3 access for example. + # Compute the meta information to store with the file meta = { 'cas_raw': raw, @@ -97,10 +100,7 @@ def save_blobs(self, blobs, raw=False): blob = self._pack_v1(blob) to_save[path] = (blob, meta) - # We don't actually want to overwrite but by saying =True, we avoid - # checking again saving some operations. We are already sure we are not - # sending duplicate files since we already checked. - self._backend.save_bytes(to_save, overwrite=True) + self._backend.save_bytes(to_save, overwrite=False) return results def load_blobs(self, keys, force_raw=False):