[Feature] support spot pod on RunPod #4447

Merged · 13 commits · Dec 9, 2024
Changes from 5 commits
20 changes: 12 additions & 8 deletions sky/clouds/runpod.py
@@ -24,8 +24,6 @@ class RunPod(clouds.Cloud):
_REPR = 'RunPod'
_CLOUD_UNSUPPORTED_FEATURES = {
clouds.CloudImplementationFeatures.STOP: 'Stopping not supported.',
clouds.CloudImplementationFeatures.SPOT_INSTANCE:
('Spot is not supported, as runpod API does not implement spot.'),
clouds.CloudImplementationFeatures.MULTI_NODE:
('Multi-node not supported yet, as the interconnection among nodes '
'are non-trivial on RunPod.'),
@@ -70,11 +68,8 @@ def regions_with_offering(cls, instance_type: str,
zone: Optional[str]) -> List[clouds.Region]:
assert zone is None, 'RunPod does not support zones.'
del accelerators, zone # unused
if use_spot:
return []
else:
regions = service_catalog.get_region_zones_for_instance_type(
instance_type, use_spot, 'runpod')
regions = service_catalog.get_region_zones_for_instance_type(
instance_type, use_spot, 'runpod')

if region is not None:
regions = [r for r in regions if r.name == region]
@@ -176,11 +171,20 @@ def make_deploy_resources_variables(
else:
image_id = r.image_id[r.region]

instance_type = resources.instance_type
use_spot = resources.use_spot

hourly_cost = r.cloud.instance_type_to_hourly_cost(

    Collaborator (review comment):

        Suggested change:
        -    hourly_cost = r.cloud.instance_type_to_hourly_cost(
        +    hourly_cost = self.instance_type_to_hourly_cost(

    Contributor Author:

        updated

instance_type=instance_type, use_spot=use_spot
)

return {
'instance_type': resources.instance_type,
'instance_type': instance_type,
'custom_resources': custom_resources,
'region': region.name,
'image_id': image_id,
'use_spot': use_spot,
'bid_per_gpu': hourly_cost,
}

def _get_feasible_launchable_resources(
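Taken together, the two hunks above let RunPod spot offerings resolve through the normal catalog path and reuse the catalog's spot hourly price as the pod's bid. The snippet below is an illustrative sketch, not part of the diff: it assumes SkyPilot is installed with its RunPod catalog, and the instance type name is a hypothetical placeholder.

# Illustrative only: exercising the updated RunPod cloud class.
# '1x_RTX3070_SECURE' is a hypothetical instance type name.
from sky.clouds.runpod import RunPod

cloud = RunPod()

# With SPOT_INSTANCE removed from _CLOUD_UNSUPPORTED_FEATURES, spot regions
# now come from the service catalog instead of an unconditional empty list.
regions = RunPod.regions_with_offering(
    instance_type='1x_RTX3070_SECURE',
    accelerators=None,
    use_spot=True,
    region=None,
    zone=None)

# The spot bid passed to the provisioner is simply the catalog hourly price.
bid_per_gpu = cloud.instance_type_to_hourly_cost(
    instance_type='1x_RTX3070_SECURE', use_spot=True)
print(regions, bid_per_gpu)
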
3 changes: 3 additions & 0 deletions sky/provision/runpod/api/__init__.py
@@ -0,0 +1,3 @@
"""RunPod low level API support for spot pod."""

from sky.provision.runpod.api.commands import create_spot_pod
119 changes: 119 additions & 0 deletions sky/provision/runpod/api/commands.py
@@ -0,0 +1,119 @@
from typing import List, Optional

from runpod import get_gpu
from runpod import get_user
from runpod.api.graphql import run_graphql_query

from sky.provision.runpod.api.pods import generate_spot_pod_deployment_mutation


def create_spot_pod(
name: str,
image_name: str,
gpu_type_id: str,
bid_per_gpu: float,
cloud_type: str = 'ALL',
gpu_count: Optional[int] = None,
min_memory_in_gb: Optional[int] = None,
min_vcpu_count: Optional[int] = None,
container_disk_in_gb: Optional[int] = None,
volume_in_gb: Optional[int] = None,
volume_mount_path: Optional[str] = None,
ports: Optional[str] = None,
start_ssh: Optional[bool] = True,
start_jupyter: Optional[bool] = False,
env: Optional[dict] = None,
docker_args: Optional[str] = None,
support_public_ip: Optional[bool] = True,
terminate_after: Optional[str] = None,
stop_after: Optional[str] = None,
data_center_id: Optional[str] = None,
country_code: Optional[str] = None,
network_volume_id: Optional[str] = None,
allowed_cuda_versions: Optional[List[str]] = None,
min_download: Optional[int] = None,
min_upload: Optional[int] = None,
cuda_version: Optional[str] = None,
template_id: Optional[str] = None,
volume_key: Optional[str] = None,
) -> dict:
"""
Create a Spot pod.

Parameters:
name (str): The name of the Pod.
image_name (str): The Docker image to use for the Pod environment.
gpu_type_id (str): The type of GPU required, e.g., 'NVIDIA RTX A6000'.
bid_per_gpu (float): The bid price per GPU for the spot instance.
cloud_type (str, optional): The type of cloud resources, default is 'ALL'.
gpu_count (int, optional): The number of GPUs required.
min_memory_in_gb (int, optional): Minimum memory (in GB) required for the instance.
min_vcpu_count (int, optional): Minimum number of virtual CPUs required.
container_disk_in_gb (int, optional): Size of the container disk in GB.
volume_in_gb (int, optional): Size of the volume in GB.
volume_mount_path (str, optional): Mount path for the volume, e.g., '/workspace'.
ports (str, optional): Ports to expose, formatted as 'port/protocol', e.g., '8888/http'.
start_ssh (bool, optional): Whether to enable SSH access to the Pod. Default is True.
start_jupyter (bool, optional): Whether to enable Jupyter Notebook in the Pod. Default is False.
env (dict, optional): Environment variables to set, provided as a dictionary of key-value pairs.
docker_args (str, optional): Additional Docker runtime arguments for the Pod.
support_public_ip (bool, optional): Whether to support public IP for the Pod. Default is True.
terminate_after (str, optional): Time limit after which the Pod will automatically terminate, e.g., '1h'.
stop_after (str, optional): Time limit after which the Pod will automatically stop, e.g., '1h'.
data_center_id (str, optional): Specific data center ID to target for deployment.
country_code (str, optional): Country code for regional targeting of deployment.
network_volume_id (str, optional): ID of the network volume to attach.
allowed_cuda_versions (List[str], optional): List of compatible CUDA versions for the Pod.
min_download (int, optional): Minimum network download speed (in Mbps) required.
min_upload (int, optional): Minimum network upload speed (in Mbps) required.
cuda_version (str, optional): Preferred CUDA version for the Pod.
template_id (str, optional): ID of the Pod template to use for deployment.
volume_key (str, optional): Encryption key for the Pod's attached volume.
    Example:

>>> pod_id = create_spot_pod('test', 'runpod/stack', 'NVIDIA GeForce RTX 3070', bid_per_gpu=0.3)
"""
get_gpu(gpu_type_id)
# refer to https://graphql-spec.runpod.io/#definition-CloudTypeEnum
if cloud_type not in ['ALL', 'COMMUNITY', 'SECURE']:
raise ValueError('cloud_type must be one of ALL, COMMUNITY or SECURE')

if network_volume_id and data_center_id is None:
user_info = get_user()
for network_volume in user_info['networkVolumes']:
if network_volume['id'] == network_volume_id:
data_center_id = network_volume['dataCenterId']
break

if container_disk_in_gb is None and template_id is None:
container_disk_in_gb = 10
mutation = generate_spot_pod_deployment_mutation(
name,
image_name,
gpu_type_id,
bid_per_gpu,
cloud_type,
gpu_count,
min_memory_in_gb,
min_vcpu_count,
container_disk_in_gb,
volume_in_gb,
volume_mount_path,
ports,
start_ssh,
start_jupyter,
env,
docker_args,
support_public_ip,
terminate_after,
stop_after,
data_center_id,
country_code,
network_volume_id,
allowed_cuda_versions,
        # Pass by keyword: several optional parameters precede volume_key in
        # generate_spot_pod_deployment_mutation's signature.
        volume_key=volume_key,
)

raw_response = run_graphql_query(mutation)
cleaned_response = raw_response['data']['podRentInterruptable']
return cleaned_response
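
For reviewers, an end-to-end call of the new helper might look like the following. This is a sketch, not part of the diff: it assumes a RunPod API key is already configured for the runpod SDK, and the pod name, environment variables, and image are placeholders.

from sky.provision.runpod.api import create_spot_pod

pod = create_spot_pod(
    name='sky-spot-demo',                # placeholder pod name
    image_name='runpod/stack',           # image from the docstring example
    gpu_type_id='NVIDIA GeForce RTX 3070',
    bid_per_gpu=0.3,                     # max hourly bid per GPU, in USD
    gpu_count=1,
    ports='22/tcp',
    env={'MY_VAR': 'value'},             # hypothetical environment variable
)
# The cleaned response is the podRentInterruptable payload requested by the
# mutation: id, desiredStatus, imageName, env, machineId, machine.podHostId.
print(pod['id'], pod['desiredStatus'])
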
128 changes: 128 additions & 0 deletions sky/provision/runpod/api/pods.py
@@ -0,0 +1,128 @@
"""This module provides functions to generate GraphQL mutations for deploying
spot instance Pods on RunPod.
"""

from typing import List, Optional

# Fields are defined at https://graphql-spec.runpod.io/#definition-PodRentInterruptableInput
def generate_spot_pod_deployment_mutation(
name: str,
image_name: str,
gpu_type_id: str,
bid_per_gpu: float,
cloud_type: str = "ALL",
gpu_count: Optional[int] = None,
min_memory_in_gb: Optional[int] = None,
min_vcpu_count: Optional[int] = None,
container_disk_in_gb: Optional[int] = None,
volume_in_gb: Optional[int] = None,
volume_mount_path: Optional[str] = None,
ports: Optional[str] = None,
start_ssh: Optional[bool] = True,
start_jupyter: Optional[bool] = False,
env: Optional[dict] = None,
docker_args: Optional[str] = None,
support_public_ip: Optional[bool] = True,
terminate_after: Optional[str] = None,
stop_after: Optional[str] = None,
data_center_id: Optional[str] = None,
country_code: Optional[str] = None,
network_volume_id: Optional[str] = None,
allowed_cuda_versions: Optional[List[str]] = None,
min_download: Optional[int] = None,
min_upload: Optional[int] = None,
cuda_version: Optional[str] = None,
template_id: Optional[str] = None,
volume_key: Optional[str] = None,
) -> str:
input_fields = []

# Required Fields
input_fields.append(f'name: "{name}"')
input_fields.append(f'imageName: "{image_name}"')
input_fields.append(f'gpuTypeId: "{gpu_type_id}"')
input_fields.append(f'bidPerGpu: {bid_per_gpu}')

# Default Fields
input_fields.append(f'cloudType: {cloud_type}')

if start_ssh:
input_fields.append("startSsh: true")
if start_jupyter:
input_fields.append("startJupyter: true")
if support_public_ip:
input_fields.append("supportPublicIp: true")
else:
input_fields.append("supportPublicIp: false")

# Optional Fields
if gpu_count is not None:
input_fields.append(f"gpuCount: {gpu_count}")
if min_memory_in_gb is not None:
input_fields.append(f"minMemoryInGb: {min_memory_in_gb}")
if min_vcpu_count is not None:
input_fields.append(f"minVcpuCount: {min_vcpu_count}")
if container_disk_in_gb is not None:
input_fields.append(f"containerDiskInGb: {container_disk_in_gb}")
if volume_in_gb is not None:
input_fields.append(f"volumeInGb: {volume_in_gb}")
if volume_mount_path is not None:
input_fields.append(f'volumeMountPath: "{volume_mount_path}"')
if ports is not None:
ports = ports.replace(" ", "")
input_fields.append(f'ports: "{ports}"')
if docker_args is not None:
input_fields.append(f'dockerArgs: "{docker_args}"')
if terminate_after is not None:
input_fields.append(f'terminateAfter: "{terminate_after}"')
if stop_after is not None:
input_fields.append(f'stopAfter: "{stop_after}"')
if data_center_id is not None:
input_fields.append(f'dataCenterId: "{data_center_id}"')
if country_code is not None:
input_fields.append(f'countryCode: "{country_code}"')
if network_volume_id is not None:
input_fields.append(f'networkVolumeId: "{network_volume_id}"')
if allowed_cuda_versions is not None:
allowed_cuda_versions_string = ", ".join(
[f'"{version}"' for version in allowed_cuda_versions]
)
input_fields.append(f"allowedCudaVersions: [{allowed_cuda_versions_string}]")
if min_download is not None:
input_fields.append(f'minDownload: {min_download}')
if min_upload is not None:
input_fields.append(f'minUpload: {min_upload}')
if cuda_version is not None:
input_fields.append(f'cudaVersion: "{cuda_version}"')
if template_id is not None:
input_fields.append(f'templateId: "{template_id}"')
if volume_key is not None:
input_fields.append(f'volumeKey: "{volume_key}"')

if env is not None:
env_string = ", ".join(
[f'{{ key: "{key}", value: "{value}" }}' for key, value in env.items()]
)
input_fields.append(f"env: [{env_string}]")

# Format input fields
input_string = ", ".join(input_fields)
return f"""
mutation {{
podRentInterruptable(
input: {{
{input_string}
}}
) {{
id
desiredStatus
imageName
env
machineId
machine {{
podHostId
}}
}}
}}
"""
5 changes: 4 additions & 1 deletion sky/provision/runpod/instance.py
@@ -89,7 +89,10 @@ def run_instances(region: str, cluster_name_on_cloud: str,
disk_size=config.node_config['DiskSize'],
image_name=config.node_config['ImageId'],
ports=config.ports_to_open_on_launch,
public_key=config.node_config['PublicKey'])
public_key=config.node_config['PublicKey'],
preemptible=config.node_config['Preemptible'],
bid_per_gpu=config.node_config['bid_per_gpu'],
weih1121 marked this conversation as resolved.

    Collaborator (review comment):

        Should we keep the same naming style, i.e. BidPerGPU?

    Contributor Author:

        Good catch! Updated!

)
except Exception as e: # pylint: disable=broad-except
logger.warning(f'run_instances error: {e}')
raise
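For context, the keys this call reads from the node config in this revision of the diff are sketched below as a hypothetical fragment (the review thread above notes the bid key's naming may still change); the real node config is generated by SkyPilot from the cluster YAML, and unrelated keys are omitted.

# Hypothetical node_config fragment with just the keys read by the hunk above.
node_config = {
    'DiskSize': 100,                               # disk size in GB
    'ImageId': 'runpod/stack',                     # placeholder image
    'PublicKey': 'ssh-ed25519 AAAA... user@host',  # placeholder SSH key
    'Preemptible': True,                           # launch as a spot pod
    'bid_per_gpu': 0.30,                           # max hourly bid per GPU, USD
}
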
61 changes: 42 additions & 19 deletions sky/provision/runpod/utils.py
@@ -6,6 +6,7 @@

from sky import sky_logging
from sky.adaptors import runpod
from sky.provision.runpod.api.commands import create_spot_pod
from sky.skylet import constants
from sky.utils import common_utils

@@ -100,7 +101,8 @@ def list_instances() -> Dict[str, Dict[str, Any]]:


def launch(name: str, instance_type: str, region: str, disk_size: int,
image_name: str, ports: Optional[List[int]], public_key: str) -> str:
image_name: str, ports: Optional[List[int]], public_key: str,
preemptible: Optional[bool], bid_per_gpu: float) -> str:
"""Launches an instance with the given parameters.

Converts the instance_type to the RunPod GPU name, finds the specs for the
@@ -141,24 +143,45 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
custom_ports_str = ''
if ports is not None:
custom_ports_str = ''.join([f'{p}/tcp,' for p in ports])

new_instance = runpod.runpod.create_pod(
name=name,
image_name=image_name,
gpu_type_id=gpu_type,
cloud_type=cloud_type,
container_disk_in_gb=disk_size,
min_vcpu_count=4 * gpu_quantity,
min_memory_in_gb=gpu_specs['memoryInGb'] * gpu_quantity,
gpu_count=gpu_quantity,
country_code=region,
ports=(f'22/tcp,'
f'{custom_ports_str}'
f'{constants.SKY_REMOTE_RAY_DASHBOARD_PORT}/http,'
f'{constants.SKY_REMOTE_RAY_PORT}/http'),
support_public_ip=True,
docker_args=
f'bash -c \'echo {encoded} | base64 --decode > init.sh; bash init.sh\'')
if preemptible is None or not preemptible:
new_instance = runpod.runpod.create_pod(
name=name,
image_name=image_name,
gpu_type_id=gpu_type,
cloud_type=cloud_type,
container_disk_in_gb=disk_size,
min_vcpu_count=4 * gpu_quantity,
min_memory_in_gb=gpu_specs['memoryInGb'] * gpu_quantity,
gpu_count=gpu_quantity,
country_code=region,
ports=(f'22/tcp,'
f'{custom_ports_str}'
f'{constants.SKY_REMOTE_RAY_DASHBOARD_PORT}/http,'
f'{constants.SKY_REMOTE_RAY_PORT}/http'),
support_public_ip=True,
docker_args=
f'bash -c \'echo {encoded} | base64 --decode > init.sh; bash init.sh\'')
else:
new_instance = create_spot_pod(
name=name,
image_name=image_name,
gpu_type_id=gpu_type,
cloud_type=cloud_type,
bid_per_gpu=bid_per_gpu,
container_disk_in_gb=disk_size,
volume_in_gb=disk_size,
min_vcpu_count=4 * gpu_quantity,
min_memory_in_gb=gpu_specs['memoryInGb'] * gpu_quantity,
gpu_count=gpu_quantity,
# country_code=region,
ports=(f'22/tcp,'
f'{custom_ports_str}'
f'{constants.SKY_REMOTE_RAY_DASHBOARD_PORT}/http,'
f'{constants.SKY_REMOTE_RAY_PORT}/http'),
support_public_ip=True,
docker_args=
f'bash -c \'echo {encoded} | base64 --decode > init.sh; bash init.sh\'',
)

return new_instance['id']

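A sketch of how the provisioner-side entry point is called with the two new parameters; all values below are hypothetical and assume valid RunPod credentials and a matching catalog entry.

from sky.provision.runpod import utils

instance_id = utils.launch(
    name='sky-demo-head',
    instance_type='1x_RTX3070_SECURE',   # hypothetical catalog instance type
    region='CA',                         # country code (used for on-demand; not forwarded for spot in this revision)
    disk_size=100,
    image_name='runpod/stack',
    ports=[8888],
    public_key='ssh-ed25519 AAAA... user@host',   # placeholder key
    preemptible=True,                    # True routes through create_spot_pod()
    bid_per_gpu=0.30)                    # max hourly bid per GPU, in USD
print(instance_id)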