neuro-inc · asvetlov · Jan 14, 2022 · Jan 12, 2022 · Jan 13, 2022 · Jan 13, 2022
diff --git a/CHANGELOG.D/2523.bugfix b/CHANGELOG.D/2523.bugfix
@@ -0,0 +1 @@
+Fixed a memory leak in `neuro blob cp`.
diff --git a/neuro-sdk/src/neuro_sdk/_azure_bucket_provider.py b/neuro-sdk/src/neuro_sdk/_azure_bucket_provider.py
@@ -137,7 +137,10 @@ async def head_blob(self, key: str) -> BucketEntry:
             )
 
     async def put_blob(
-        self, key: str, body: Union[AsyncIterator[bytes], bytes]
+        self,
+        key: str,
+        body: Union[AsyncIterator[bytes], bytes],
+        progress: Optional[Callable[[int], Awaitable[None]]] = None,
     ) -> None:
         blob_client = self._client.get_blob_client(key)
         if isinstance(body, bytes):
@@ -147,6 +150,8 @@ async def put_blob(
             async for data in body:
                 block_id = secrets.token_hex(16)
                 await blob_client.stage_block(block_id, data)
+                if progress:
+                    await progress(len(data))
                 blocks.append(BlobBlock(block_id=block_id))
             await blob_client.commit_block_list(blocks)
 

diff --git a/neuro-sdk/src/neuro_sdk/_bucket_base.py b/neuro-sdk/src/neuro_sdk/_bucket_base.py
@@ -107,6 +107,7 @@ async def put_blob(
         self,
         key: str,
         body: Union[AsyncIterator[bytes], bytes],
+        progress: Optional[Callable[[int], Awaitable[None]]] = None,
     ) -> None:
         pass
 

diff --git a/neuro-sdk/src/neuro_sdk/_buckets.py b/neuro-sdk/src/neuro_sdk/_buckets.py
@@ -6,6 +6,8 @@
     AbstractSet,
     Any,
     AsyncIterator,
+    Awaitable,
+    Callable,
     Dict,
     Iterable,
     Mapping,
@@ -116,10 +118,14 @@ async def read_chunks(
                 yield chunk
 
     async def write_chunks(
-        self, path: PurePosixPath, body: AsyncIterator[bytes], offset: int = 0
+        self,
+        path: PurePosixPath,
+        body: AsyncIterator[bytes],
+        offset: int = 0,
+        progress: Optional[Callable[[int], Awaitable[None]]] = None,
     ) -> None:
         assert offset == 0, "Buckets do not support offset write"
-        await self._provider.put_blob(self._as_file_key(path), body)
+        await self._provider.put_blob(self._as_file_key(path), body, progress)
 
     @asyncgeneratorcontextmanager
     async def iter_dir(self, path: PurePosixPath) -> AsyncIterator[PurePosixPath]:

diff --git a/neuro-sdk/src/neuro_sdk/_file_utils.py b/neuro-sdk/src/neuro_sdk/_file_utils.py
@@ -2,9 +2,19 @@
 import asyncio
 import errno
 import logging
+from contextlib import asynccontextmanager
 from dataclasses import dataclass
 from pathlib import Path
-from typing import AbstractSet, AsyncIterator, Generic, Optional, Tuple, TypeVar
+from typing import (
+    AbstractSet,
+    AsyncIterator,
+    Awaitable,
+    Callable,
+    Generic,
+    Optional,
+    Tuple,
+    TypeVar,
+)
 
 from yarl import URL
 
@@ -31,7 +41,7 @@
 
 
 TIME_THRESHOLD = 1.0
-MAX_OPEN_FILES = 20
+MAX_OPEN_FILES = 10
 READ_SIZE = 2 ** 20  # 1 MiB
 
 
@@ -85,7 +95,11 @@ async def read(self, path: FS_PATH) -> bytes:
 
     @abc.abstractmethod
     async def write_chunks(
-        self, path: FS_PATH, body: AsyncIterator[bytes], offset: int = 0
+        self,
+        path: FS_PATH,
+        body: AsyncIterator[bytes],
+        offset: int = 0,
+        progress: Optional[Callable[[int], Awaitable[None]]] = None,
     ) -> None:
         pass
 
@@ -153,26 +167,42 @@ async def stat(self, path: Path) -> "FileSystem.BasicStat[Path]":
             size=stat.st_size,
         )
 
-    @asyncgeneratorcontextmanager
-    async def read_chunks(self, path: Path, offset: int = 0) -> AsyncIterator[bytes]:
+    @asynccontextmanager
+    async def read_chunks(
+        self, path: Path, offset: int = 0
+    ) -> AsyncIterator[AsyncIterator[bytes]]:
         loop = asyncio.get_event_loop()
         async with self._file_sem:
-            with path.open("rb") as stream:
-                stream.seek(offset)
-                chunk = await loop.run_in_executor(None, stream.read, READ_SIZE)
-                while chunk:
-                    yield chunk
+
+            async def _gen() -> AsyncIterator[bytes]:
+                with path.open("rb") as stream:
+                    stream.seek(offset)
                     chunk = await loop.run_in_executor(None, stream.read, READ_SIZE)
+                    while chunk:
+                        yield chunk
+                        chunk = await loop.run_in_executor(None, stream.read, READ_SIZE)
+
+            gen = _gen()
+            try:
+                yield gen
+            finally:
+                await gen.aclose()  # type: ignore
 
     async def write_chunks(
-        self, path: Path, body: AsyncIterator[bytes], offset: int = 0
+        self,
+        path: Path,
+        body: AsyncIterator[bytes],
+        offset: int = 0,
+        progress: Optional[Callable[[int], Awaitable[None]]] = None,
     ) -> None:
         loop = asyncio.get_event_loop()
         with path.open("rb+" if offset else "wb") as stream:
             if offset:
                 stream.seek(offset)
             async for chunk in body:
                 await loop.run_in_executor(None, stream.write, chunk)
+                if progress:
+                    await progress(len(chunk))
 
     @asyncgeneratorcontextmanager
     async def iter_dir(self, path: Path) -> AsyncIterator[Path]:
@@ -339,39 +369,28 @@ async def transfer_file(
             queue, self._transfer_file(src, dst, offset=offset, progress=async_progress)
         )
 
-    @asyncgeneratorcontextmanager
-    async def _iterate_file_with_progress(
+    async def _transfer_file(
         self,
         src: S_PATH,
         dst: D_PATH,
         *,
         offset: int = 0,
         progress: _AsyncAbstractFileProgress,
-    ) -> AsyncIterator[bytes]:
+    ) -> None:
         src_url = self.src_fs.to_url(src)
         dst_url = self.dst_fs.to_url(dst)
         size = (await self.src_fs.stat(src)).size
         async with self.src_fs.read_chunks(src, offset) as chunks:
             await progress.start(StorageProgressStart(src_url, dst_url, size))
             pos = offset
-            async for chunk in chunks:
-                pos += len(chunk)
+
+            async def _progress(bytes_sent: int) -> None:
+                nonlocal pos
+                pos += bytes_sent
                 await progress.step(StorageProgressStep(src_url, dst_url, pos, size))
-                yield chunk
-            await progress.complete(StorageProgressComplete(src_url, dst_url, size))
 
-    async def _transfer_file(
-        self,
-        src: S_PATH,
-        dst: D_PATH,
-        *,
-        offset: int = 0,
-        progress: _AsyncAbstractFileProgress,
-    ) -> None:
-        async with self._iterate_file_with_progress(
-            src, dst, offset=offset, progress=progress
-        ) as body:
-            await self.dst_fs.write_chunks(dst, body, offset)
+            await self.dst_fs.write_chunks(dst, chunks, offset, _progress)
+            await progress.complete(StorageProgressComplete(src_url, dst_url, size))
 
     async def transfer_dir(
         self,

diff --git a/neuro-sdk/src/neuro_sdk/_gcs_bucket_provider.py b/neuro-sdk/src/neuro_sdk/_gcs_bucket_provider.py
@@ -245,7 +245,10 @@ async def head_blob(self, key: str) -> BucketEntry:
             )
 
     async def put_blob(
-        self, key: str, body: Union[AsyncIterator[bytes], bytes]
+        self,
+        key: str,
+        body: Union[AsyncIterator[bytes], bytes],
+        progress: Optional[Callable[[int], Awaitable[None]]] = None,
     ) -> None:
         # Step 1: initiate multipart upload
         url = f"{self.UPLOAD_BASE_URL}/b/{self._gcs_bucket_name}/o"
@@ -281,6 +284,8 @@ async def _upload_chunk(*, final: bool = False) -> None:
                 headers={"Content-Range": (f"bytes {data_range}/{total}")},
             ):
                 pass
+            if progress:
+                await progress(size)
             uploaded_bytes += size
             buffer = b""