skypilot-org · concretevitamin · Apr 8, 2023 · Apr 8, 2023 · Apr 8, 2023 · Michaelvll
diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py
@@ -657,6 +657,15 @@ def _update_blocklist_on_gcp_error(
                     # {'code': 8, 'message': 'There is no more capacity in the zone "europe-west4-a"; you can try in another zone where Cloud TPU Nodes are offered (see https://cloud.google.com/tpu/docs/regions) [EID: 0x1bc8f9d790be9142]'} # pylint: disable=line-too-long
                     self._blocked_resources.add(
                         launchable_resources.copy(zone=zone.name))
+                elif code == 'RESOURCE_NOT_FOUND':
+                    # https://github.com/skypilot-org/skypilot/issues/1797
+                    # The VM may be alive on console. In the inner provision
+                    # loop we have used retries to recover but failed. The
+                    # provision loop will terminate the potentially live VMs
+                    # and move onto the next zone. Since the VM may have been
+                    # provisioned in this zone, it doesn't seem right to block
+                    # the current zone.
+                    pass
                 else:
                     assert False, error
         elif len(httperror_str) >= 1:
@@ -1524,6 +1533,18 @@ def need_ray_up(
                         'Retrying due to list request rate limit exceeded.')
                     return True
 
+                # https://github.com/skypilot-org/skypilot/issues/1797
+                # "The resource 'projects/xxx/zones/us-central1-b/instances/ray-yyy-head-<hash>-compute' was not found" # pylint: disable=line-too-long
+                pattern = (r'\'code\': \'RESOURCE_NOT_FOUND\'.*The resource'
+                           r'.*instances\/.*-compute\' was not found')
+                result = re.search(pattern, stderr)
+                if result is not None:
+                    # Retry. Unlikely will succeed if it's due to no capacity.
+                    logger.info(
+                        'Retrying due to the possibly flaky RESOURCE_NOT_FOUND '
+                        'error.')
+                    return True
+
             if ('Processing file mounts' in stdout and
                     'Running setup commands' not in stdout and
                     'Failed to setup head node.' in stderr):