Retry storage operations in case of ClientError. #1107

Merged · 5 commits · Oct 8, 2019
Changes from 2 commits
1 change: 1 addition & 0 deletions CHANGELOG.D/1107.feature
@@ -0,0 +1 @@
+Retry storage operations in case of some errors.
79 changes: 64 additions & 15 deletions neuromation/api/storage.py
@@ -1,8 +1,10 @@
 import asyncio
+import contextlib
 import datetime
 import enum
 import errno
 import fnmatch
+import logging
 import os
 import re
 import time
@@ -15,8 +17,10 @@
     AsyncIterator,
     Awaitable,
     Callable,
+    ContextManager,
     Dict,
     Iterable,
+    Iterator,
     List,
     Optional,
     Tuple,
@@ -52,6 +56,9 @@
 MAX_OPEN_FILES = 100
 READ_SIZE = 2 ** 20  # 1 MiB
 TIME_THRESHOLD = 1.0
+ATTEMPTS = 10
+
+log = logging.getLogger(__name__)

 Printer = Callable[[str], None]
 ProgressQueueItem = Optional[Tuple[Callable[[Any], None], Any]]
@@ -384,9 +391,13 @@ async def _upload_file(
         progress: AbstractFileProgress,
         queue: "asyncio.Queue[ProgressQueueItem]",
     ) -> None:
-        await self.create(
-            dst, self._iterate_file(src_path, dst, progress=progress, queue=queue)
-        )
+        for retry in retries(f"Fail to upload {dst}"):
+            with retry:
+                await self.create(
+                    dst,
+                    self._iterate_file(src_path, dst, progress=progress, queue=queue),
+                )
+                break

     async def upload_dir(
         self,
@@ -428,14 +439,22 @@ async def _upload_dir(
             exists = False
             if update:
                 try:
-                    dst_files = {
-                        item.name: item for item in await self.ls(dst) if item.is_file()
-                    }
+                    for retry in retries(f"Fail to list {dst}"):
+                        with retry:
+                            dst_files = {
+                                item.name: item
+                                for item in await self.ls(dst)
+                                if item.is_file()
+                            }
+                            break
                     exists = True
                 except ResourceNotFound:
                     update = False
             if not exists:
-                await self.mkdir(dst, exist_ok=True)
+                for retry in retries(f"Fail to create {dst}"):
+                    with retry:
+                        await self.mkdir(dst, exist_ok=True)
+                        break
         except (FileExistsError, neuromation.api.IllegalArgumentError):
             raise NotADirectoryError(errno.ENOTDIR, "Not a directory", str(dst))
         await queue.put((progress.enter, StorageProgressEnterDir(src, dst)))
@@ -532,13 +551,19 @@ async def _download_file(
         async with self._file_sem:
             with dst_path.open("wb") as stream:
                 await queue.put((progress.start, StorageProgressStart(src, dst, size)))
-                pos = 0
-                async for chunk in self.open(src):
-                    pos += len(chunk)
-                    await queue.put(
-                        (progress.step, StorageProgressStep(src, dst, pos, size))
-                    )
-                    await loop.run_in_executor(None, stream.write, chunk)
+                for retry in retries(f"Fail to download {src}"):
+                    with retry:
+                        pos = 0
+                        async for chunk in self.open(src):
+                            pos += len(chunk)
+                            await queue.put(
+                                (
+                                    progress.step,
+                                    StorageProgressStep(src, dst, pos, size),
+                                )
+                            )
+                            await loop.run_in_executor(None, stream.write, chunk)
+                        break
                 await queue.put(
                     (progress.complete, StorageProgressComplete(src, dst, size))
                 )
@@ -586,7 +611,12 @@ async def _download_dir(
                         item.name: item for item in dst_path.iterdir() if item.is_file()
                     },
                 )
-        folder = await self.ls(src)
+
+        for retry in retries(f"Fail to list {src}"):
+            with retry:
+                folder = await self.ls(src)
+                break
+
         for child in folder:
             name = child.name
             if child.is_file():
@@ -713,3 +743,22 @@ def leave(self, data: StorageProgressLeaveDir) -> None:

     def fail(self, data: StorageProgressFail) -> None:
         pass
+
+
+def retries(msg: str) -> Iterator[ContextManager[None]]:
+    sleeptime = 0.0
+    for r in range(ATTEMPTS)[::-1]:
Contributor:
Why is the iteration reversed here?
I believe this could be implemented more simply:

for r in range(ATTEMPTS):
    ...
    yield retry()
else:
    raise last_error

Contributor Author:
Good idea!

+        if r:
+
+            @contextlib.contextmanager
+            def retry() -> Iterator[None]:
Contributor:
Shouldn't the function be async?
IIUC, an exception raised during the work (not on the first coroutine call but in the middle of the coroutine's execution) is not handled.

It would be nice to have a test for retries() to check this.

Contributor Author:

You are right. There is asyncio.sleep() inside, so the context manager should be asynchronous.

+                try:
+                    yield
+                except aiohttp.ClientError as err:
+                    log.debug(f"{msg}: {err}. Retry...")
+                    asyncio.sleep(sleeptime)
+
+            sleeptime += 0.1
+            yield retry()
+        else:
+            yield contextlib.ExitStack()  # type: ignore
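
For illustration, a minimal sketch of what retries() might look like with both review suggestions applied: forward iteration that re-raises the last error once every attempt has been consumed, and an asynchronous context manager so that asyncio.sleep() is actually awaited. This is a sketch only, not the code that was eventually merged; last_err and the delay default argument are names introduced here for the example.

import asyncio
import contextlib
import logging
from typing import AsyncContextManager, AsyncIterator, Iterator, Optional

import aiohttp

log = logging.getLogger(__name__)
ATTEMPTS = 10


def retries(msg: str) -> Iterator[AsyncContextManager[None]]:
    """Yield up to ATTEMPTS async context managers; each one swallows
    aiohttp.ClientError, waits a bit, and lets the caller try again."""
    last_err: Optional[aiohttp.ClientError] = None
    sleeptime = 0.0

    for _ in range(ATTEMPTS):  # forward iteration, as suggested above
        sleeptime += 0.1

        @contextlib.asynccontextmanager
        async def retry(delay: float = sleeptime) -> AsyncIterator[None]:
            nonlocal last_err
            try:
                yield
            except aiohttp.ClientError as err:
                last_err = err
                log.debug(f"{msg}: {err}. Retry...")
                await asyncio.sleep(delay)  # the sleep is now actually awaited

        yield retry()

    # Reached only when the caller has exhausted every attempt without
    # breaking out of its loop, i.e. every attempt failed.
    if last_err is not None:
        raise last_err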
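
If retries() becomes asynchronous, the call sites in this diff would switch from "with retry:" to "async with retry:". A hypothetical call site built on the sketch above, reusing its imports and retries(); upload_with_retries, session, url and payload are placeholder names, not part of this PR:

async def upload_with_retries(
    session: aiohttp.ClientSession, url: str, payload: bytes
) -> None:
    for retry in retries(f"Fail to upload {url}"):
        async with retry:  # 'async with' once retry is an async context manager
            async with session.put(url, data=payload) as resp:
                resp.raise_for_status()
            break  # success: stop retrying

Keeping the break inside the retry block matters: on success it ends the loop, while an aiohttp.ClientError is swallowed by the context manager before the break is reached, so the loop simply moves on to the next attempt; once the attempts run out, retries() re-raises the last error.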