[Feature] support spot pod on RunPod #4447

Merged · 13 commits · Dec 9, 2024
Changes from 5 commits
20 changes: 12 additions & 8 deletions sky/clouds/runpod.py
@@ -24,8 +24,6 @@ class RunPod(clouds.Cloud):
_REPR = 'RunPod'
_CLOUD_UNSUPPORTED_FEATURES = {
clouds.CloudImplementationFeatures.STOP: 'Stopping not supported.',
clouds.CloudImplementationFeatures.SPOT_INSTANCE:
('Spot is not supported, as runpod API does not implement spot.'),
clouds.CloudImplementationFeatures.MULTI_NODE:
('Multi-node not supported yet, as the interconnection among nodes '
'are non-trivial on RunPod.'),
@@ -70,11 +68,8 @@ def regions_with_offering(cls, instance_type: str,
zone: Optional[str]) -> List[clouds.Region]:
assert zone is None, 'RunPod does not support zones.'
del accelerators, zone # unused
if use_spot:
return []
else:
regions = service_catalog.get_region_zones_for_instance_type(
instance_type, use_spot, 'runpod')
regions = service_catalog.get_region_zones_for_instance_type(
instance_type, use_spot, 'runpod')

if region is not None:
regions = [r for r in regions if r.name == region]
@@ -176,11 +171,20 @@ def make_deploy_resources_variables(
else:
image_id = r.image_id[r.region]

instance_type = resources.instance_type
use_spot = resources.use_spot

hourly_cost = r.cloud.instance_type_to_hourly_cost(

    Collaborator (review comment):

        Suggested change:
        -    hourly_cost = r.cloud.instance_type_to_hourly_cost(
        +    hourly_cost = self.instance_type_to_hourly_cost(

    Contributor Author:

        updated

instance_type=instance_type, use_spot=use_spot
)

return {
'instance_type': resources.instance_type,
'instance_type': instance_type,
'custom_resources': custom_resources,
'region': region.name,
'image_id': image_id,
'use_spot': use_spot,
'bid_per_gpu': hourly_cost,
}

def _get_feasible_launchable_resources(
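Taken together, the two hunks above let RunPod spot offerings resolve through the normal catalog path and reuse the catalog's spot hourly price as the pod's bid. The snippet below is an illustrative sketch, not part of the diff: it assumes SkyPilot is installed with its RunPod catalog, and the instance type name is a hypothetical placeholder.

# Illustrative only: exercising the updated RunPod cloud class.
# '1x_RTX3070_SECURE' is a hypothetical instance type name.
from sky.clouds.runpod import RunPod

cloud = RunPod()

# With SPOT_INSTANCE removed from _CLOUD_UNSUPPORTED_FEATURES, spot regions
# now come from the service catalog instead of an unconditional empty list.
regions = RunPod.regions_with_offering(
    instance_type='1x_RTX3070_SECURE',
    accelerators=None,
    use_spot=True,
    region=None,
    zone=None)

# The spot bid passed to the provisioner is simply the catalog hourly price.
bid_per_gpu = cloud.instance_type_to_hourly_cost(
    instance_type='1x_RTX3070_SECURE', use_spot=True)
print(regions, bid_per_gpu)
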
3 changes: 3 additions & 0 deletions sky/provision/runpod/api/__init__.py
@@ -0,0 +1,3 @@
"""RunPod low level API support for spot pod."""

from sky.provision.runpod.api.commands import create_spot_pod
119 changes: 119 additions & 0 deletions sky/provision/runpod/api/commands.py
@@ -0,0 +1,119 @@
from typing import List, Optional

from runpod import get_gpu
from runpod import get_user
from runpod.api.graphql import run_graphql_query

from sky.provision.runpod.api.pods import generate_spot_pod_deployment_mutation


def create_spot_pod(
name: str,
image_name: str,
gpu_type_id: str,
bid_per_gpu: float,
cloud_type: str = 'ALL',
gpu_count: Optional[int] = None,
min_memory_in_gb: Optional[int] = None,
min_vcpu_count: Optional[int] = None,
container_disk_in_gb: Optional[int] = None,
volume_in_gb: Optional[int] = None,
volume_mount_path: Optional[str] = None,
ports: Optional[str] = None,
start_ssh: Optional[bool] = True,
start_jupyter: Optional[bool] = False,
env: Optional[dict] = None,
docker_args: Optional[str] = None,
support_public_ip: Optional[bool] = True,
terminate_after: Optional[str] = None,
stop_after: Optional[str] = None,
data_center_id: Optional[str] = None,
country_code: Optional[str] = None,
network_volume_id: Optional[str] = None,
allowed_cuda_versions: Optional[List[str]] = None,
min_download: Optional[int] = None,
min_upload: Optional[int] = None,
cuda_version: Optional[str] = None,
template_id: Optional[str] = None,
volume_key: Optional[str] = None,
) -> dict:
"""
Create a Spot pod.

Parameters:
name (str): The name of the Pod.
image_name (str): The Docker image to use for the Pod environment.
gpu_type_id (str): The type of GPU required, e.g., 'NVIDIA RTX A6000'.
bid_per_gpu (float): The bid price per GPU for the spot instance.
cloud_type (str, optional): The type of cloud resources, default is 'ALL'.
gpu_count (int, optional): The number of GPUs required.
min_memory_in_gb (int, optional): Minimum memory (in GB) required for the instance.
min_vcpu_count (int, optional): Minimum number of virtual CPUs required.
container_disk_in_gb (int, optional): Size of the container disk in GB.
volume_in_gb (int, optional): Size of the volume in GB.
volume_mount_path (str, optional): Mount path for the volume, e.g., '/workspace'.
ports (str, optional): Ports to expose, formatted as 'port/protocol', e.g., '8888/http'.
start_ssh (bool, optional): Whether to enable SSH access to the Pod. Default is True.
start_jupyter (bool, optional): Whether to enable Jupyter Notebook in the Pod. Default is False.
env (dict, optional): Environment variables to set, provided as a dictionary of key-value pairs.
docker_args (str, optional): Additional Docker runtime arguments for the Pod.
support_public_ip (bool, optional): Whether to support public IP for the Pod. Default is True.
terminate_after (str, optional): Time limit after which the Pod will automatically terminate, e.g., '1h'.
stop_after (str, optional): Time limit after which the Pod will automatically stop, e.g., '1h'.
data_center_id (str, optional): Specific data center ID to target for deployment.
country_code (str, optional): Country code for regional targeting of deployment.
network_volume_id (str, optional): ID of the network volume to attach.
allowed_cuda_versions (List[str], optional): List of compatible CUDA versions for the Pod.
min_download (int, optional): Minimum network download speed (in Mbps) required.
min_upload (int, optional): Minimum network upload speed (in Mbps) required.
cuda_version (str, optional): Preferred CUDA version for the Pod.
template_id (str, optional): ID of the Pod template to use for deployment.
volume_key (str, optional): Encryption key for the Pod's attached volume.
    Example:

>>> pod_id = create_spot_pod('test', 'runpod/stack', 'NVIDIA GeForce RTX 3070', bid_per_gpu=0.3)
"""
get_gpu(gpu_type_id)
# refer to https://graphql-spec.runpod.io/#definition-CloudTypeEnum
if cloud_type not in ['ALL', 'COMMUNITY', 'SECURE']:
raise ValueError('cloud_type must be one of ALL, COMMUNITY or SECURE')

if network_volume_id and data_center_id is None:
user_info = get_user()
for network_volume in user_info['networkVolumes']:
if network_volume['id'] == network_volume_id:
data_center_id = network_volume['dataCenterId']
break

if container_disk_in_gb is None and template_id is None:
container_disk_in_gb = 10
mutation = generate_spot_pod_deployment_mutation(
name,
image_name,
gpu_type_id,
bid_per_gpu,
cloud_type,
gpu_count,
min_memory_in_gb,
min_vcpu_count,
container_disk_in_gb,
volume_in_gb,
volume_mount_path,
ports,
start_ssh,
start_jupyter,
env,
docker_args,
support_public_ip,
terminate_after,
stop_after,
data_center_id,
country_code,
network_volume_id,
allowed_cuda_versions,
        # Pass by keyword: several optional parameters precede volume_key in
        # generate_spot_pod_deployment_mutation's signature.
        volume_key=volume_key,
)

raw_response = run_graphql_query(mutation)
cleaned_response = raw_response['data']['podRentInterruptable']
return cleaned_response
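
For reviewers, an end-to-end call of the new helper might look like the following. This is a sketch, not part of the diff: it assumes a RunPod API key is already configured for the runpod SDK, and the pod name, environment variables, and image are placeholders.

from sky.provision.runpod.api import create_spot_pod

pod = create_spot_pod(
    name='sky-spot-demo',                # placeholder pod name
    image_name='runpod/stack',           # image from the docstring example
    gpu_type_id='NVIDIA GeForce RTX 3070',
    bid_per_gpu=0.3,                     # max hourly bid per GPU, in USD
    gpu_count=1,
    ports='22/tcp',
    env={'MY_VAR': 'value'},             # hypothetical environment variable
)
# The cleaned response is the podRentInterruptable payload requested by the
# mutation: id, desiredStatus, imageName, env, machineId, machine.podHostId.
print(pod['id'], pod['desiredStatus'])
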
128 changes: 128 additions & 0 deletions sky/provision/runpod/api/pods.py
@@ -0,0 +1,128 @@
"""This module provides functions to generate GraphQL mutations for deploying
spot instance Pods on RunPod.
"""

from typing import List, Optional

# Fields are defined at https://graphql-spec.runpod.io/#definition-PodRentInterruptableInput
def generate_spot_pod_deployment_mutation(
name: str,
image_name: str,
gpu_type_id: str,
bid_per_gpu: float,
cloud_type: str = "ALL",
gpu_count: Optional[int] = None,
min_memory_in_gb: Optional[int] = None,
min_vcpu_count: Optional[int] = None,
container_disk_in_gb: Optional[int] = None,
volume_in_gb: Optional[int] = None,
volume_mount_path: Optional[str] = None,
ports: Optional[str] = None,
start_ssh: Optional[bool] = True,
start_jupyter: Optional[bool] = False,
env: Optional[dict] = None,
docker_args: Optional[str] = None,
support_public_ip: Optional[bool] = True,
terminate_after: Optional[str] = None,
stop_after: Optional[str] = None,
data_center_id: Optional[str] = None,
country_code: Optional[str] = None,
network_volume_id: Optional[str] = None,
allowed_cuda_versions: Optional[List[str]] = None,
min_download: Optional[int] = None,
min_upload: Optional[int] = None,
cuda_version: Optional[str] = None,
template_id: Optional[str] = None,
volume_key: Optional[str] = None,
) -> str:
input_fields = []

# Required Fields
input_fields.append(f'name: "{name}"')
input_fields.append(f'imageName: "{image_name}"')
input_fields.append(f'gpuTypeId: "{gpu_type_id}"')
input_fields.append(f'bidPerGpu: {bid_per_gpu}')

# Default Fields
input_fields.append(f'cloudType: {cloud_type}')

if start_ssh:
input_fields.append("startSsh: true")
if start_jupyter:
input_fields.append("startJupyter: true")
if support_public_ip:
input_fields.append("supportPublicIp: true")
else:
input_fields.append("supportPublicIp: false")

# Optional Fields
if gpu_count is not None:
input_fields.append(f"gpuCount: {gpu_count}")
if min_memory_in_gb is not None:
input_fields.append(f"minMemoryInGb: {min_memory_in_gb}")
if min_vcpu_count is not None:
input_fields.append(f"minVcpuCount: {min_vcpu_count}")
if container_disk_in_gb is not None:
input_fields.append(f"containerDiskInGb: {container_disk_in_gb}")
if volume_in_gb is not None:
input_fields.append(f"volumeInGb: {volume_in_gb}")
if volume_mount_path is not None:
input_fields.append(f'volumeMountPath: "{volume_mount_path}"')
if ports is not None:
ports = ports.replace(" ", "")
input_fields.append(f'ports: "{ports}"')
if docker_args is not None:
input_fields.append(f'dockerArgs: "{docker_args}"')
if terminate_after is not None:
input_fields.append(f'terminateAfter: "{terminate_after}"')
if stop_after is not None:
input_fields.append(f'stopAfter: "{stop_after}"')
if data_center_id is not None:
input_fields.append(f'dataCenterId: "{data_center_id}"')
if country_code is not None:
input_fields.append(f'countryCode: "{country_code}"')
if network_volume_id is not None:
input_fields.append(f'networkVolumeId: "{network_volume_id}"')
if allowed_cuda_versions is not None:
allowed_cuda_versions_string = ", ".join(
[f'"{version}"' for version in allowed_cuda_versions]
)
input_fields.append(f"allowedCudaVersions: [{allowed_cuda_versions_string}]")
if min_download is not None:
input_fields.append(f'minDownload: {min_download}')
if min_upload is not None:
input_fields.append(f'minUpload: {min_upload}')
if cuda_version is not None:
input_fields.append(f'cudaVersion: "{cuda_version}"')
if template_id is not None:
input_fields.append(f'templateId: "{template_id}"')
if volume_key is not None:
input_fields.append(f'volumeKey: "{volume_key}"')

if env is not None:
env_string = ", ".join(
[f'{{ key: "{key}", value: "{value}" }}' for key, value in env.items()]
)
input_fields.append(f"env: [{env_string}]")

# Format input fields
input_string = ", ".join(input_fields)
return f"""
mutation {{
podRentInterruptable(
input: {{
{input_string}
}}
) {{
id
desiredStatus
imageName
env
machineId
machine {{
podHostId
}}
}}
}}
"""
5 changes: 4 additions & 1 deletion sky/provision/runpod/instance.py
@@ -89,7 +89,10 @@ def run_instances(region: str, cluster_name_on_cloud: str,
disk_size=config.node_config['DiskSize'],
image_name=config.node_config['ImageId'],
ports=config.ports_to_open_on_launch,
public_key=config.node_config['PublicKey'])
public_key=config.node_config['PublicKey'],
preemptible=config.node_config['Preemptible'],
bid_per_gpu=config.node_config['bid_per_gpu'],
weih1121 marked this conversation as resolved.

    Collaborator (review comment):

        Should we keep the same naming style, i.e. BidPerGPU?

    Contributor Author:

        Good catch! Updated!

)
except Exception as e: # pylint: disable=broad-except
logger.warning(f'run_instances error: {e}')
raise
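For context, the keys this call reads from the node config in this revision of the diff are sketched below as a hypothetical fragment (the review thread above notes the bid key's naming may still change); the real node config is generated by SkyPilot from the cluster YAML, and unrelated keys are omitted.

# Hypothetical node_config fragment with just the keys read by the hunk above.
node_config = {
    'DiskSize': 100,                               # disk size in GB
    'ImageId': 'runpod/stack',                     # placeholder image
    'PublicKey': 'ssh-ed25519 AAAA... user@host',  # placeholder SSH key
    'Preemptible': True,                           # launch as a spot pod
    'bid_per_gpu': 0.30,                           # max hourly bid per GPU, USD
}
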
61 changes: 42 additions & 19 deletions sky/provision/runpod/utils.py
@@ -6,6 +6,7 @@

from sky import sky_logging
from sky.adaptors import runpod
from sky.provision.runpod.api.commands import create_spot_pod
from sky.skylet import constants
from sky.utils import common_utils

@@ -100,7 +101,8 @@ def list_instances() -> Dict[str, Dict[str, Any]]:


def launch(name: str, instance_type: str, region: str, disk_size: int,
image_name: str, ports: Optional[List[int]], public_key: str) -> str:
image_name: str, ports: Optional[List[int]], public_key: str,
preemptible: Optional[bool], bid_per_gpu: float) -> str:
"""Launches an instance with the given parameters.

Converts the instance_type to the RunPod GPU name, finds the specs for the
@@ -141,24 +143,45 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
custom_ports_str = ''
if ports is not None:
custom_ports_str = ''.join([f'{p}/tcp,' for p in ports])

new_instance = runpod.runpod.create_pod(
name=name,
image_name=image_name,
gpu_type_id=gpu_type,
cloud_type=cloud_type,
container_disk_in_gb=disk_size,
min_vcpu_count=4 * gpu_quantity,
min_memory_in_gb=gpu_specs['memoryInGb'] * gpu_quantity,
gpu_count=gpu_quantity,
country_code=region,
ports=(f'22/tcp,'
f'{custom_ports_str}'
f'{constants.SKY_REMOTE_RAY_DASHBOARD_PORT}/http,'
f'{constants.SKY_REMOTE_RAY_PORT}/http'),
support_public_ip=True,
docker_args=
f'bash -c \'echo {encoded} | base64 --decode > init.sh; bash init.sh\'')
if preemptible is None or not preemptible:
new_instance = runpod.runpod.create_pod(
name=name,
image_name=image_name,
gpu_type_id=gpu_type,
cloud_type=cloud_type,
container_disk_in_gb=disk_size,
min_vcpu_count=4 * gpu_quantity,
min_memory_in_gb=gpu_specs['memoryInGb'] * gpu_quantity,
gpu_count=gpu_quantity,
country_code=region,
ports=(f'22/tcp,'
f'{custom_ports_str}'
f'{constants.SKY_REMOTE_RAY_DASHBOARD_PORT}/http,'
f'{constants.SKY_REMOTE_RAY_PORT}/http'),
support_public_ip=True,
docker_args=
f'bash -c \'echo {encoded} | base64 --decode > init.sh; bash init.sh\'')
else:
new_instance = create_spot_pod(
name=name,
image_name=image_name,
gpu_type_id=gpu_type,
cloud_type=cloud_type,
bid_per_gpu=bid_per_gpu,
container_disk_in_gb=disk_size,
volume_in_gb=disk_size,
min_vcpu_count=4 * gpu_quantity,
min_memory_in_gb=gpu_specs['memoryInGb'] * gpu_quantity,
gpu_count=gpu_quantity,
# country_code=region,
ports=(f'22/tcp,'
f'{custom_ports_str}'
f'{constants.SKY_REMOTE_RAY_DASHBOARD_PORT}/http,'
f'{constants.SKY_REMOTE_RAY_PORT}/http'),
support_public_ip=True,
docker_args=
f'bash -c \'echo {encoded} | base64 --decode > init.sh; bash init.sh\'',
)

return new_instance['id']

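A sketch of how the provisioner-side entry point is called with the two new parameters; all values below are hypothetical and assume valid RunPod credentials and a matching catalog entry.

from sky.provision.runpod import utils

instance_id = utils.launch(
    name='sky-demo-head',
    instance_type='1x_RTX3070_SECURE',   # hypothetical catalog instance type
    region='CA',                         # country code (used for on-demand; not forwarded for spot in this revision)
    disk_size=100,
    image_name='runpod/stack',
    ports=[8888],
    public_key='ssh-ed25519 AAAA... user@host',   # placeholder key
    preemptible=True,                    # True routes through create_spot_pod()
    bid_per_gpu=0.30)                    # max hourly bid per GPU, in USD
print(instance_id)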