Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Core] Fix Inferentia job scheduling #2969

Merged
merged 3 commits into from
Jan 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
from sky.skylet import job_lib
from sky.skylet import log_lib
from sky.usage import usage_lib
from sky.utils import accelerator_registry
from sky.utils import command_runner
from sky.utils import common_utils
from sky.utils import controller_utils
Expand Down Expand Up @@ -324,9 +325,11 @@ def add_gang_scheduling_placement_group_and_setup(
acc_name, acc_count = list(resources_dict.items())[0]
gpu_dict = {'GPU': acc_count}
# gpu_dict should be empty when the accelerator is not GPU.
# FIXME: This is a hack to make sure that we do not reserve
# GPU when requesting TPU.
if 'tpu' in acc_name.lower():
# TODO(zongheng,zhanghao): an alternative is to start the remote
# cluster with custom resource 'GPU': <n> even if the accelerator(s)
# are not GPU. We opt for the current solution for now.
if accelerator_registry.is_schedulable_non_gpu_accelerator(
acc_name):
gpu_dict = {}
for bundle in bundles:
bundle.update({
Expand Down Expand Up @@ -488,7 +491,8 @@ def add_ray_task(self,
options.append(f'resources={json.dumps(ray_resources_dict)}')

resources_key = list(ray_resources_dict.keys())[0]
if 'tpu' not in resources_key.lower():
if not accelerator_registry.is_schedulable_non_gpu_accelerator(
resources_key):
# `num_gpus` should be empty when the accelerator is not GPU.
# The accelerator_registry check above replaces the old hack of
# substring-matching 'tpu' in the resources key.

Expand Down
16 changes: 16 additions & 0 deletions sky/utils/accelerator_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,22 @@
'H100',
]

# Non-GPU accelerators that our backend supports for job queue scheduling.
# Matching is done by case-insensitive substring, so e.g. 'tpu-v3-8',
# 'Inferentia' and 'Trainium' all match.
_SCHEDULABLE_NON_GPU_ACCELERATORS = [
    'tpu',
    'inferentia',
    'trainium',
]


def is_schedulable_non_gpu_accelerator(accelerator_name: str) -> bool:
    """Returns if this accelerator is a 'schedulable' non-GPU accelerator."""
    lowered = accelerator_name.lower()
    return any(keyword in lowered
               for keyword in _SCHEDULABLE_NON_GPU_ACCELERATORS)


def canonicalize_accelerator_name(accelerator: str) -> str:
"""Returns the canonical accelerator name."""
Expand Down
17 changes: 17 additions & 0 deletions tests/test_smoke.py
Original file line number Diff line number Diff line change
Expand Up @@ -1401,6 +1401,23 @@ def test_scp_huggingface(generic_cloud: str):
run_one_test(test)


# ---------- Inferentia. ----------
@pytest.mark.aws
def test_inferentia():
    """Smoke test: launch an inf2 instance and schedule an Inferentia job."""
    name = _get_cluster_name()
    commands = [
        f'sky launch -y -c {name} -t inf2.xlarge -- echo hi',
        f'sky exec {name} --gpus Inferentia:1 echo hi',
        # Ensure both the launch job and the exec job succeeded.
        f'sky logs {name} 1 --status',
        f'sky logs {name} 2 --status',
    ]
    test = Test('test_inferentia', commands, f'sky down -y {name}')
    run_one_test(test)


# ---------- TPU. ----------
@pytest.mark.gcp
@pytest.mark.tpu
Expand Down