Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AWS: Retry status fetch on 'Unable to locate credentials' error. #1988

Merged
merged 2 commits into from
May 27, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 29 additions & 5 deletions sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,15 @@
import json
import os
import pathlib
import random
import re
import subprocess
import tempfile
import textwrap
import time
import typing
from typing import Any, Dict, List, Mapping, Optional, Sequence, Set, Tuple, Union
from typing import (Any, Dict, List, Mapping, Optional, Sequence, Set, Tuple,
Union)
from typing_extensions import Literal
import uuid

Expand Down Expand Up @@ -1527,19 +1529,25 @@ def check_network_connection():


def _process_cli_query(
cloud: str, cluster: str, query_cmd: str, deliminiator: str,
status_map: Mapping[str, Optional[global_user_state.ClusterStatus]]
cloud: str,
cluster: str,
query_cmd: str,
deliminator: str,
status_map: Mapping[str, Optional[global_user_state.ClusterStatus]],
max_retries: int = 3,
) -> List[global_user_state.ClusterStatus]:
"""Run the cloud CLI query and returns cluster status.

Args:
cloud: The cloud provider name.
cluster: The cluster name.
query_cmd: The cloud CLI query command.
deliminiator: The deliminiator separating the status in the output
deliminator: The delimiter separating the per-node statuses in the output
of the query command.
status_map: A map from the CLI status string to the corresponding
global_user_state.ClusterStatus.
max_retries: Maximum number of retries before giving up. For AWS only.

Returns:
A list of global_user_state.ClusterStatus of all existing nodes in the
cluster. The list can be empty if none of the nodes in the clusters are
Expand All @@ -1554,12 +1562,28 @@ def _process_cli_query(
f'{stdout}\n'
'**** STDERR ****\n'
f'{stderr}')

# Cloud-specific error handling.
if (cloud == str(clouds.Azure()) and returncode == 2 and
'argument --ids: expected at least one argument' in stderr):
# Azure CLI has a returncode 2 when the cluster is not found, as
# --ids <empty> is passed to the query command. In that case, the
# cluster should be considered as DOWN.
return []
if (cloud == str(clouds.AWS()) and returncode != 0 and
'Unable to locate credentials. You can configure credentials by '
'running "aws configure"' in stdout + stderr):
# AWS: we have run into this rare error on the spot controller (which
# uses an assumed IAM role and works fine most of the time).
#
# We do not know the root cause. The current hypothesis is that the
# instance metadata service is temporarily unavailable, so we retry the
# query.
if max_retries > 0:
logger.info('Encountered AWS "Unable to locate credentials" '
'error. Retrying.')
time.sleep(random.uniform(0, 1) * 2)
return _process_cli_query(cloud, cluster, query_cmd, deliminator,
status_map, max_retries - 1)

if returncode != 0:
with ux_utils.print_exception_no_traceback():
Expand All @@ -1572,7 +1596,7 @@ def _process_cli_query(
return []

statuses = []
for s in cluster_status.split(deliminiator):
for s in cluster_status.split(deliminator):
node_status = status_map[s]
if node_status is not None:
statuses.append(node_status)
Expand Down