-
Notifications
You must be signed in to change notification settings - Fork 6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[core][autoscaler] Autoscaler doesn't scale up correctly when the KubeRay RayCluster is not in the goal state #48909
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -492,6 +492,124 @@ def test_pending_deletes(self): | |
}, | ||
] | ||
|
||
def test_increase_min_replicas_to_scale_up(self): | ||
# Simulate the case where users manually increase the `minReplicas` field | ||
# from 0 to $num_pods. KubeRay will create $num_pods worker Pods to meet the new | ||
# `minReplicas`, even though the `replicas` field is still 0. | ||
small_group = "small-group" | ||
num_pods = 0 | ||
assert ( | ||
self.mock_client._ray_cluster["spec"]["workerGroupSpecs"][0]["groupName"] | ||
== small_group | ||
) | ||
for pod in self.mock_client._pod_list["items"]: | ||
if pod["metadata"]["labels"]["ray.io/group"] == small_group: | ||
num_pods += 1 | ||
assert num_pods > 0 | ||
self.mock_client._ray_cluster["spec"]["workerGroupSpecs"][0]["replicas"] = 0 | ||
self.mock_client._ray_cluster["spec"]["workerGroupSpecs"][0][ | ||
"minReplicas" | ||
] = num_pods | ||
|
||
# Launching a new node and `replicas` should be | ||
# `max(replicas, minReplicas) + 1`. | ||
self.provider.launch(shape={small_group: 1}, request_id="launch-1") | ||
patches = self.mock_client.get_patches( | ||
f"rayclusters/{self.provider._cluster_name}" | ||
) | ||
assert len(patches) == 1 | ||
assert patches[0] == { | ||
"op": "replace", | ||
"path": "/spec/workerGroupSpecs/0/replicas", | ||
"value": num_pods + 1, | ||
} | ||
|
||
def test_inconsistent_pods_raycr_scale_up(self): | ||
""" | ||
Test the case where the cluster state has not yet reached the desired state. | ||
Specifically, the replicas field in the RayCluster CR does not match the actual | ||
number of Pods. | ||
""" | ||
# Check the assumptions of the test | ||
small_group = "small-group" | ||
num_pods = 0 | ||
for pod in self.mock_client._pod_list["items"]: | ||
if pod["metadata"]["labels"]["ray.io/group"] == small_group: | ||
num_pods += 1 | ||
|
||
assert ( | ||
self.mock_client._ray_cluster["spec"]["workerGroupSpecs"][0]["groupName"] | ||
== small_group | ||
) | ||
desired_replicas = num_pods + 1 | ||
self.mock_client._ray_cluster["spec"]["workerGroupSpecs"][0][ | ||
"replicas" | ||
] = desired_replicas | ||
|
||
# Launch a new node. The replicas field should be incremented by 1, even though | ||
# the cluster state has not yet reached the goal state. | ||
launch_request = {"small-group": 1} | ||
self.provider.launch(shape=launch_request, request_id="launch-1") | ||
|
||
patches = self.mock_client.get_patches( | ||
f"rayclusters/{self.provider._cluster_name}" | ||
) | ||
assert len(patches) == 1 | ||
assert patches[0] == { | ||
"op": "replace", | ||
"path": "/spec/workerGroupSpecs/0/replicas", | ||
"value": desired_replicas + 1, | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: is it possible to also add tests that to delete workers are handled correctly when calculating the goal state by the change for regression? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added 24ba62a |
||
|
||
def test_inconsistent_pods_raycr_scale_down(self): | ||
""" | ||
Test the case where the cluster state has not yet reached the desired state. | ||
Specifically, the replicas field in the RayCluster CR does not match the actual | ||
number of Pods. | ||
""" | ||
# Check the assumptions of the test | ||
small_group = "small-group" | ||
num_pods = 0 | ||
pod_to_delete = None | ||
for pod in self.mock_client._pod_list["items"]: | ||
if pod["metadata"]["labels"]["ray.io/group"] == small_group: | ||
num_pods += 1 | ||
pod_to_delete = pod["metadata"]["name"] | ||
assert pod_to_delete is not None | ||
|
||
assert ( | ||
self.mock_client._ray_cluster["spec"]["workerGroupSpecs"][0]["groupName"] | ||
== small_group | ||
) | ||
desired_replicas = num_pods + 1 | ||
self.mock_client._ray_cluster["spec"]["workerGroupSpecs"][0][ | ||
"replicas" | ||
] = desired_replicas | ||
|
||
# Terminate a node. The replicas field should be decremented by 1, even though | ||
# the cluster state has not yet reached the goal state. | ||
self.provider.terminate(ids=[pod_to_delete], request_id="term-1") | ||
patches = self.mock_client.get_patches( | ||
f"rayclusters/{self.provider._cluster_name}" | ||
) | ||
assert len(patches) == 2 | ||
assert patches == [ | ||
{ | ||
"op": "replace", | ||
"path": "/spec/workerGroupSpecs/0/replicas", | ||
"value": desired_replicas - 1, | ||
}, | ||
{ | ||
"op": "replace", | ||
"path": "/spec/workerGroupSpecs/0/scaleStrategy", | ||
"value": { | ||
"workersToDelete": [ | ||
pod_to_delete, | ||
] | ||
}, | ||
}, | ||
] | ||
|
||
|
||
if __name__ == "__main__": | ||
if os.environ.get("PARALLEL_CI"): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: function signature as well.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
rename the function?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
renamed the function: 24ba62a
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
oops, sorry, i meant the return types. it's currently saying it's returning a tuple of 2, while it's actually 3.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
good catch! updated 2c9c3d0