[Onprem] Support for Different Type of GPUs + Small Bugfix #1356

Merged (3 commits) on Nov 4, 2022
6 changes: 5 additions & 1 deletion sky/backends/backend_utils.py
@@ -157,6 +157,7 @@ def fill_template(template_name: str,
     cluster_name = variables.get('cluster_name')
     output_path = _get_yaml_path_from_cluster_name(cluster_name,
                                                    output_prefix)
+    output_path = os.path.abspath(output_path)

     # Add yaml file path to the template variables.
@@ -758,7 +759,10 @@ def write_cluster_config(
     yaml_path = _get_yaml_path_from_cluster_name(cluster_name)

     # Use a tmp file path to avoid incomplete YAML file being re-used in the future.
-    tmp_yaml_path = yaml_path + '.tmp'
+    if isinstance(cloud, clouds.Local):
+        tmp_yaml_path = yaml_path
+    else:
+        tmp_yaml_path = yaml_path + '.tmp'
     tmp_yaml_path = fill_template(
         cluster_config_template,
         dict(
13 changes: 10 additions & 3 deletions sky/backends/onprem_utils.py
@@ -266,7 +266,11 @@ def get_local_cluster_accelerators(
         'T4',
         'P4',
         'K80',
-        'A100',]
+        'A100',
+        '1080',
+        '2080',
+        'A5000',
+        'A6000']
Member:

Do we want to add TITAN Xp (used in a RISE server)?

Also, do you think nvidia-smi --list-gpus is more robust for detecting the number of GPUs, since it's agnostic to GPU names?

weichiang@freddie:~$ nvidia-smi --list-gpus
GPU 0: NVIDIA TITAN Xp (UUID: GPU-67995609-af7b-27e6-2024-0f5f3c837c1a)
GPU 1: NVIDIA TITAN Xp (UUID: GPU-7dc58de3-9a7a-3e5f-6231-c1ac72af9e0d)
GPU 2: NVIDIA TITAN Xp (UUID: GPU-6dcf1597-36aa-b71c-5e4d-713931d79f9b)
GPU 3: NVIDIA TITAN Xp (UUID: GPU-a35c7b48-c7a8-016a-aa46-f82a654bc9a4)
GPU 4: NVIDIA TITAN Xp (UUID: GPU-72bfeabf-998e-ae6e-7ab2-4b0c066869ec)
GPU 5: NVIDIA TITAN Xp (UUID: GPU-6b3b0b2a-89db-24b4-46b5-aab244bcaad8)

weichiang@blaze:~$ nvidia-smi --list-gpus
GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-7942cb79-543a-c8fe-03e8-172ed446f612)
GPU 1: Tesla P100-PCIE-16GB (UUID: GPU-09e76941-9a87-b1b4-8ef7-864c8d98c4af)
GPU 2: Tesla P100-PCIE-16GB (UUID: GPU-a9edaeaa-5c02-2b26-2ad8-7a5d51f0d1ac)
GPU 3: Tesla P100-PCIE-16GB (UUID: GPU-49a19b7d-5a7e-7ee5-8168-c4e77d648998)
GPU 4: Tesla P100-PCIE-16GB (UUID: GPU-b3b998a7-b941-040e-68f6-5f533bcf5636)
GPU 5: Tesla P100-PCIE-16GB (UUID: GPU-8fc7187f-350a-1b62-2ba2-51466153dd06)
GPU 6: Tesla P100-PCIE-16GB (UUID: GPU-7c8c9f76-d06e-2d3d-2f63-873fc9925ecf)
GPU 7: Tesla P100-PCIE-16GB (UUID: GPU-90773d83-1578-d5fb-ccc0-a30b7c0b4e66)
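
A minimal sketch of this nvidia-smi-based detection, for discussion only (the helper below is hypothetical and not part of this PR):

# Hypothetical sketch: count GPUs per model by parsing `nvidia-smi --list-gpus`,
# which avoids maintaining an accelerator whitelist.
import collections
import subprocess
from typing import Dict


def detect_gpus_with_nvidia_smi() -> Dict[str, int]:
    """Returns a mapping such as {'NVIDIA TITAN Xp': 6}."""
    try:
        output = subprocess.run(['nvidia-smi', '--list-gpus'],
                                capture_output=True,
                                text=True,
                                check=True).stdout
    except (FileNotFoundError, subprocess.CalledProcessError):
        return {}  # nvidia-smi is missing or the driver is not loaded.
    counts: Dict[str, int] = collections.defaultdict(int)
    for line in output.splitlines():
        # Each line looks like: "GPU 0: NVIDIA TITAN Xp (UUID: GPU-...)".
        if line.startswith('GPU '):
            model = line.split(':', 1)[1].split('(UUID')[0].strip()
            counts[model] += 1
    return dict(counts)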

Collaborator:

+1, we should think about how we can do away with this list and just use nvidia-smi to parse the available resources. If this PR is time-sensitive, we can do it in a different PR.

Collaborator (Author):

We can do this in a separate PR, imo. There has to be a better way to detect GPUs, even on an on-prem cluster without nvidia-smi installed (which was this code's assumption; it works on all Linux/Unix machines). Very interested in discussing this.

Member:

I think it should be a safe assumption? If nvidia-smi is not installed, Ray will very likely also fail to schedule CUDA jobs.

If we want fewer assumptions, Ray's approach of listing /proc/driver/nvidia/gpus should be better:
https://github.com/ray-project/ray/blob/f39d323ed5916b67042407231f5b91851e8fa0b5/python/ray/_private/resource_spec.py#L281
It looks pretty robust (similar to lspci; it only assumes the NVIDIA driver is installed) and name-agnostic.

weichiang@blaze:~$ ls /proc/driver/nvidia/gpus
0000:04:00.0  0000:06:00.0  0000:07:00.0  0000:08:00.0  0000:0c:00.0  0000:0d:00.0  0000:0e:00.0  0000:0f:00.0
weichiang@blaze:~$ cat /proc/driver/nvidia/gpus/0000\:04\:00.0/information
Model: 		 Tesla P100-PCIE-16GB
IRQ:   		 722
GPU UUID: 	 GPU-7942cb79-543a-c8fe-03e8-172ed446f612
Video BIOS: 	 86.00.41.00.06
Bus Type: 	 PCIe
DMA Size: 	 47 bits
DMA Mask: 	 0x7fffffffffff
Bus Location: 	 0000:04:00.0
Device Minor: 	 0
GPU Excluded:	 No

This would automatically cover TITAN Xp and TITAN V, which are currently not in the list but are used in RISE's and Woosuk's servers. Does that make sense?
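
A minimal sketch of that /proc-based detection, for discussion only (the helper below is hypothetical and not part of this PR; it assumes a Linux host with the NVIDIA driver loaded):

# Hypothetical sketch: enumerate GPUs via /proc/driver/nvidia/gpus, similar in
# spirit to Ray's detection; it only assumes the NVIDIA driver is installed.
import collections
import os
from typing import Dict


def detect_gpus_from_proc() -> Dict[str, int]:
    """Returns a mapping such as {'Tesla P100-PCIE-16GB': 8}."""
    proc_gpus_path = '/proc/driver/nvidia/gpus'
    if not os.path.isdir(proc_gpus_path):
        return {}  # No NVIDIA driver, hence no NVIDIA GPUs to report.
    counts: Dict[str, int] = collections.defaultdict(int)
    for bus_id in os.listdir(proc_gpus_path):
        info_path = os.path.join(proc_gpus_path, bus_id, 'information')
        model = 'unknown'
        with open(info_path) as f:
            for line in f:
                # Lines look like: "Model:           Tesla P100-PCIE-16GB".
                if line.startswith('Model:'):
                    model = line.split(':', 1)[1].strip()
                    break
        counts[model] += 1
    return dict(counts)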

Collaborator (Author):

Ray seems to handle this a bit differently on Windows, which is the OS used by one of my family members (where nvidia-smi is not available despite the GPU being usable). I'll look deeper into it for a future PR.

    accelerators_dict = {}
    for acc in all_accelerators:
        output_str = os.popen(f'lspci | grep \'{acc}\'').read()
@@ -358,9 +362,10 @@ def _stop_ray_workers(runner: command_runner.SSHCommandRunner):

    # Launching Ray on the head node.
    head_resources = json.dumps(custom_resources[0], separators=(',', ':'))
+    head_gpu_count = sum(list(custom_resources[0].values()))
    head_cmd = ('ray start --head --port=6379 '
                '--object-manager-port=8076 --dashboard-port 8265 '
-                f'--resources={head_resources!r}')
+                f'--resources={head_resources!r} --num-gpus={head_gpu_count}')

    with console.status('[bold cyan]Launching ray cluster on head'):
        backend_utils.run_command_and_handle_ssh_failure(
@@ -399,9 +404,11 @@ def _start_ray_workers(

        worker_resources = json.dumps(custom_resources[idx + 1],
                                      separators=(',', ':'))
+        worker_gpu_count = sum(list(custom_resources[idx + 1].values()))
        worker_cmd = (f'ray start --address={head_ip}:6379 '
                      '--object-manager-port=8076 --dashboard-port 8265 '
-                      f'--resources={worker_resources!r}')
+                      f'--resources={worker_resources!r} '
+                      f'--num-gpus={worker_gpu_count}')
        backend_utils.run_command_and_handle_ssh_failure(
            runner,
            worker_cmd,
7 changes: 6 additions & 1 deletion sky/resources.py
@@ -204,7 +204,12 @@ def _set_accelerators(
            except ValueError:
                with ux_utils.print_exception_no_traceback():
                    raise ValueError(parse_error) from None
-            assert len(accelerators) == 1, accelerators
+
+            # Ignore check for the local cloud case.
+            # It is possible the accelerators dict can contain multiple
+            # types of accelerators for some on-prem clusters.
+            if not isinstance(self._cloud, clouds.Local):
+                assert len(accelerators) == 1, accelerators

        # Canonicalize the accelerator names.
        accelerators = {
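
For context on the relaxed assertion above, a hypothetical illustration (values made up, not part of this PR) of why an on-prem accelerators dict can hold more than one key:

# Hypothetical illustration (not in this PR): a heterogeneous on-prem cluster
# can report several accelerator types at once, e.g. via the lspci detection
# in onprem_utils.py, so only non-Local clouds keep the single-type assertion.
local_cluster_accelerators = {'V100': 4, '1080': 2}
assert len(local_cluster_accelerators) >= 1  # May legitimately exceed 1 here.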