Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
romilbhardwaj committed Dec 20, 2024
2 parents c3e8da7 + a73a9cb commit 6d3e654
Show file tree
Hide file tree
Showing 8 changed files with 76 additions and 26 deletions.
14 changes: 7 additions & 7 deletions .buildkite/generate_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
tests/smoke_tests
├── test_*.py -> release pipeline
├── test_pre_merge.py -> pre-merge pipeline
├── test_quick_tests_core.py -> run quick tests on PR before merging
run `PYTHONPATH=$(pwd)/tests:$PYTHONPATH python .buildkite/generate_pipeline.py`
to generate the pipeline for testing. The CI will run this script as a pre-step,
Expand Down Expand Up @@ -208,8 +208,8 @@ def _convert_release(test_files: List[str]):
extra_env={cloud: '1' for cloud in CLOUD_QUEUE_MAP})


def _convert_pre_merge(test_files: List[str]):
yaml_file_path = '.buildkite/pipeline_smoke_tests_pre_merge.yaml'
def _convert_quick_tests_core(test_files: List[str]):
yaml_file_path = '.buildkite/pipeline_smoke_tests_quick_tests_core.yaml'
output_file_pipelines = []
for test_file in test_files:
print(f'Converting {test_file} to {yaml_file_path}')
Expand All @@ -234,18 +234,18 @@ def _convert_pre_merge(test_files: List[str]):
def main():
test_files = os.listdir('tests/smoke_tests')
release_files = []
pre_merge_files = []
quick_tests_core_files = []
for test_file in test_files:
if not test_file.startswith('test_'):
continue
test_file_path = os.path.join('tests/smoke_tests', test_file)
if "test_pre_merge" in test_file:
pre_merge_files.append(test_file_path)
if "test_quick_tests_core" in test_file:
quick_tests_core_files.append(test_file_path)
else:
release_files.append(test_file_path)

_convert_release(release_files)
_convert_pre_merge(pre_merge_files)
_convert_quick_tests_core(quick_tests_core_files)


if __name__ == '__main__':
Expand Down
2 changes: 1 addition & 1 deletion sky/clouds/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,7 @@ def make_deploy_resources_variables(
tpu_requested = True
k8s_resource_key = kubernetes_utils.TPU_RESOURCE_KEY
else:
k8s_resource_key = kubernetes_utils.GPU_RESOURCE_KEY
k8s_resource_key = kubernetes_utils.get_gpu_resource_key()

port_mode = network_utils.get_port_mode(None)

Expand Down
12 changes: 7 additions & 5 deletions sky/provision/kubernetes/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
# case we will need to update this logic.
# TODO(Doyoung): Update the error message raised
# with the multi-host TPU support.
gpu_resource_key = kubernetes_utils.get_gpu_resource_key() # pylint: disable=line-too-long
if 'Insufficient google.com/tpu' in event_message:
extra_msg = (
f'Verify if '
Expand All @@ -192,14 +193,15 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
pod,
extra_msg,
details=event_message))
elif (('Insufficient nvidia.com/gpu'
elif ((f'Insufficient {gpu_resource_key}'
in event_message) or
('didn\'t match Pod\'s node affinity/selector'
in event_message)):
extra_msg = (
f'Verify if '
f'{pod.spec.node_selector[label_key]}'
' is available in the cluster.')
f'Verify if any node matching label '
f'{pod.spec.node_selector[label_key]} and '
f'sufficient resource {gpu_resource_key} '
f'is available in the cluster.')
raise config_lib.KubernetesError(
_lack_resource_msg('GPU',
pod,
Expand Down Expand Up @@ -728,7 +730,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
limits = pod_spec['spec']['containers'][0].get('resources',
{}).get('limits')
if limits is not None:
needs_gpus = limits.get(kubernetes_utils.GPU_RESOURCE_KEY, 0) > 0
needs_gpus = limits.get(kubernetes_utils.get_gpu_resource_key(), 0) > 0

# TPU pods provisioned on GKE use the default containerd runtime.
# Reference: https://cloud.google.com/kubernetes-engine/docs/how-to/migrate-containerd#overview # pylint: disable=line-too-long
Expand Down
24 changes: 20 additions & 4 deletions sky/provision/kubernetes/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,7 +438,7 @@ def detect_accelerator_resource(
nodes = get_kubernetes_nodes(context)
for node in nodes:
cluster_resources.update(node.status.allocatable.keys())
has_accelerator = (GPU_RESOURCE_KEY in cluster_resources or
has_accelerator = (get_gpu_resource_key() in cluster_resources or
TPU_RESOURCE_KEY in cluster_resources)

return has_accelerator, cluster_resources
Expand Down Expand Up @@ -2253,10 +2253,11 @@ def get_node_accelerator_count(attribute_dict: dict) -> int:
Number of accelerators allocated or available from the node. If no
resource is found, it returns 0.
"""
assert not (GPU_RESOURCE_KEY in attribute_dict and
gpu_resource_name = get_gpu_resource_key()
assert not (gpu_resource_name in attribute_dict and
TPU_RESOURCE_KEY in attribute_dict)
if GPU_RESOURCE_KEY in attribute_dict:
return int(attribute_dict[GPU_RESOURCE_KEY])
if gpu_resource_name in attribute_dict:
return int(attribute_dict[gpu_resource_name])
elif TPU_RESOURCE_KEY in attribute_dict:
return int(attribute_dict[TPU_RESOURCE_KEY])
return 0
Expand Down Expand Up @@ -2415,3 +2416,18 @@ def process_skypilot_pods(
num_pods = len(cluster.pods)
cluster.resources_str = f'{num_pods}x {cluster.resources}'
return list(clusters.values()), jobs_controllers, serve_controllers


def get_gpu_resource_key():
"""Get the GPU resource name to use in kubernetes.
The function first checks for an environment variable.
If defined, it uses its value; otherwise, it returns the default value.
Args:
name (str): Default GPU resource name, default is "nvidia.com/gpu".
Returns:
str: The selected GPU resource name.
"""
# Retrieve GPU resource name from environment variable, if set.
# Else use default.
# E.g., can be nvidia.com/gpu-h100, amd.com/gpu etc.
return os.getenv('CUSTOM_GPU_RESOURCE_KEY', default=GPU_RESOURCE_KEY)
4 changes: 2 additions & 2 deletions sky/utils/kubernetes/gpu_labeler.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def label():
# Get the list of nodes with GPUs
gpu_nodes = []
for node in nodes:
if kubernetes_utils.GPU_RESOURCE_KEY in node.status.capacity:
if kubernetes_utils.get_gpu_resource_key() in node.status.capacity:
gpu_nodes.append(node)

print(f'Found {len(gpu_nodes)} GPU nodes in the cluster')
Expand Down Expand Up @@ -142,7 +142,7 @@ def label():
if len(gpu_nodes) == 0:
print('No GPU nodes found in the cluster. If you have GPU nodes, '
'please ensure that they have the label '
f'`{kubernetes_utils.GPU_RESOURCE_KEY}: <number of GPUs>`')
f'`{kubernetes_utils.get_gpu_resource_key()}: <number of GPUs>`')
else:
print('GPU labeling started - this may take 10 min or more to complete.'
'\nTo check the status of GPU labeling jobs, run '
Expand Down
28 changes: 28 additions & 0 deletions tests/smoke_tests/test_cluster_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -1299,6 +1299,34 @@ def test_use_spot(generic_cloud: str):
smoke_tests_utils.run_one_test(test)


@pytest.mark.azure
def test_azure_spot_instance_verification():
"""Test Azure spot instance provisioning with explicit verification.
This test verifies that when --use-spot is specified for Azure:
1. The cluster launches successfully
2. The instances are actually provisioned as spot instances
"""
name = smoke_tests_utils.get_cluster_name()
test = smoke_tests_utils.Test(
'azure-spot-verification',
[
f'sky launch -c {name} --cloud azure tests/test_yamls/minimal.yaml --use-spot -y',
f'sky logs {name} 1 --status', f'TARGET_VM_NAME="{name}"; '
'VM_INFO=$(az vm list --query "[?contains(name, \'$TARGET_VM_NAME\')].{Name:name, ResourceGroup:resourceGroup}" -o tsv); '
'[[ -z "$VM_INFO" ]] && exit 1; '
'FULL_VM_NAME=$(echo "$VM_INFO" | awk \'{print $1}\'); '
'RESOURCE_GROUP=$(echo "$VM_INFO" | awk \'{print $2}\'); '
'VM_DETAILS=$(az vm list --resource-group "$RESOURCE_GROUP" '
'--query "[?name==\'$FULL_VM_NAME\'].{Name:name, Location:location, Priority:priority}" -o table); '
'[[ -z "$VM_DETAILS" ]] && exit 1; '
'echo "VM Details:"; echo "$VM_DETAILS"; '
'echo "$VM_DETAILS" | grep -qw "Spot" && exit 0 || exit 1'
],
f'sky down -y {name}',
)
smoke_tests_utils.run_one_test(test)


@pytest.mark.gcp
def test_stop_gcp_spot():
"""Test GCP spot can be stopped, autostopped, restarted."""
Expand Down
Original file line number Diff line number Diff line change
@@ -1,23 +1,27 @@
# Smoke tests for SkyPilot required before merging
# If the change includes an interface modification or touches the core API,
# the reviewer could decide it’s necessary to trigger a pre-merge test and
# leave a comment /quicktest-core will then trigger this test.
#
# Default options are set in pyproject.toml
# Example usage:
# Run all tests except for AWS and Lambda Cloud
# > pytest tests/smoke_tests/test_pre_merge.py
# > pytest tests/smoke_tests/test_quick_tests_core.py
#
# Terminate failed clusters after test finishes
# > pytest tests/smoke_tests/test_pre_merge.py --terminate-on-failure
# > pytest tests/smoke_tests/test_quick_tests_core.py --terminate-on-failure
#
# Re-run last failed tests
# > pytest --lf
#
# Run one of the smoke tests
# > pytest tests/smoke_tests/test_pre_merge.py::test_yaml_launch_and_mount
# > pytest tests/smoke_tests/test_quick_tests_core.py::test_yaml_launch_and_mount
#
# Only run test for AWS + generic tests
# > pytest tests/smoke_tests/test_pre_merge.py --aws
# > pytest tests/smoke_tests/test_quick_tests_core.py --aws
#
# Change cloud for generic tests to aws
# > pytest tests/smoke_tests/test_pre_merge.py --generic-cloud aws
# > pytest tests/smoke_tests/test_quick_tests_core.py --generic-cloud aws

from smoke_tests import smoke_tests_utils

Expand All @@ -29,7 +33,7 @@ def test_yaml_launch_and_mount(generic_cloud: str):
test = smoke_tests_utils.Test(
'test_yaml_launch_and_mount',
[
f'sky launch -y -c {name} tests/test_yamls/minimal_test_pre_merge.yaml',
f'sky launch -y -c {name} tests/test_yamls/minimal_test_quick_tests_core.yaml',
smoke_tests_utils.
get_cmd_wait_until_job_status_contains_matching_job_id(
cluster_name=name,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,4 @@ workdir: .
num_nodes: 1

run: |
ls -l ~/aws/tests/test_yamls/minimal_test_pre_merge.yaml
ls -l ~/aws/tests/test_yamls/minimal_test_quick_tests_core.yaml

0 comments on commit 6d3e654

Please sign in to comment.