-
Notifications
You must be signed in to change notification settings - Fork 6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Take into account queue length in autoscaling #5684
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change
---|---|---
|
@@ -108,6 +108,9 @@ def xray_heartbeat_batch_handler(self, unused_channel, data): | |||||
message = ray.gcs_utils.HeartbeatBatchTableData.FromString( | ||||||
heartbeat_data) | ||||||
for heartbeat_message in message.batch: | ||||||
resource_load = dict( | ||||||
zip(heartbeat_message.resource_load_label, | ||||||
heartbeat_message.resource_load_capacity)) | ||||||
total_resources = dict( | ||||||
zip(heartbeat_message.resources_total_label, | ||||||
heartbeat_message.resources_total_capacity)) | ||||||
|
@@ -122,7 +125,7 @@ def xray_heartbeat_batch_handler(self, unused_channel, data): | |||||
ip = self.raylet_id_to_ip_map.get(client_id) | ||||||
if ip: | ||||||
self.load_metrics.update(ip, total_resources, | ||||||
available_resources) | ||||||
available_resources, resource_load) | ||||||
else: | ||||||
logger.warning( | ||||||
"Monitor: " | ||||||
|
@@ -357,6 +360,7 @@ def run(self): | |||||
try: | ||||||
self._run() | ||||||
except Exception: | ||||||
logger.exception("Error in monitor loop") | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't believe in periods in log messages.
||||||
if self.autoscaler: | ||||||
self.autoscaler.kill_workers() | ||||||
raise | ||||||
|
Original file line number | Diff line number | Diff line change
---|---|---
|
@@ -142,45 +142,56 @@ def terminate_node(self, node_id): | |
class LoadMetricsTest(unittest.TestCase): | ||
def testUpdate(self): | ||
lm = LoadMetrics() | ||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 1}) | ||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 1}, {}) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add some test cases for the new metric. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, there are a couple entries below in testLoadMessages.
||
assert lm.approx_workers_used() == 0.5 | ||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 0}) | ||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 0}, {}) | ||
assert lm.approx_workers_used() == 1.0 | ||
lm.update("2.2.2.2", {"CPU": 2}, {"CPU": 0}) | ||
lm.update("2.2.2.2", {"CPU": 2}, {"CPU": 0}, {}) | ||
assert lm.approx_workers_used() == 2.0 | ||
|
||
def testLoadMessages(self): | ||
lm = LoadMetrics() | ||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 1}, {}) | ||
assert lm.approx_workers_used() == 0.5 | ||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 1}, {"CPU": 1}) | ||
assert lm.approx_workers_used() == 1.0 | ||
lm.update("2.2.2.2", {"CPU": 2}, {"CPU": 1}, {}) | ||
assert lm.approx_workers_used() == 1.5 | ||
lm.update("2.2.2.2", {"CPU": 2}, {"CPU": 1}, {"GPU": 1}) | ||
assert lm.approx_workers_used() == 2.0 | ||
|
||
def testPruneByNodeIp(self): | ||
lm = LoadMetrics() | ||
lm.update("1.1.1.1", {"CPU": 1}, {"CPU": 0}) | ||
lm.update("2.2.2.2", {"CPU": 1}, {"CPU": 0}) | ||
lm.update("1.1.1.1", {"CPU": 1}, {"CPU": 0}, {}) | ||
lm.update("2.2.2.2", {"CPU": 1}, {"CPU": 0}, {}) | ||
lm.prune_active_ips({"1.1.1.1", "4.4.4.4"}) | ||
assert lm.approx_workers_used() == 1.0 | ||
|
||
def testBottleneckResource(self): | ||
lm = LoadMetrics() | ||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 0}) | ||
lm.update("2.2.2.2", {"CPU": 2, "GPU": 16}, {"CPU": 2, "GPU": 2}) | ||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 0}, {}) | ||
lm.update("2.2.2.2", {"CPU": 2, "GPU": 16}, {"CPU": 2, "GPU": 2}, {}) | ||
assert lm.approx_workers_used() == 1.88 | ||
|
||
def testHeartbeat(self): | ||
lm = LoadMetrics() | ||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 1}) | ||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 1}, {}) | ||
lm.mark_active("2.2.2.2") | ||
assert "1.1.1.1" in lm.last_heartbeat_time_by_ip | ||
assert "2.2.2.2" in lm.last_heartbeat_time_by_ip | ||
assert "3.3.3.3" not in lm.last_heartbeat_time_by_ip | ||
|
||
def testDebugString(self): | ||
lm = LoadMetrics() | ||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 0}) | ||
lm.update("2.2.2.2", {"CPU": 2, "GPU": 16}, {"CPU": 2, "GPU": 2}) | ||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 0}, {}) | ||
lm.update("2.2.2.2", {"CPU": 2, "GPU": 16}, {"CPU": 2, "GPU": 2}, {}) | ||
lm.update("3.3.3.3", { | ||
"memory": 20, | ||
"object_store_memory": 40 | ||
}, { | ||
"memory": 0, | ||
"object_store_memory": 20 | ||
}) | ||
}, {}) | ||
debug = lm.info_string() | ||
assert ("ResourceUsage=2.0/4.0 CPU, 14.0/16.0 GPU, " | ||
"1.05 GiB/1.05 GiB memory, " | ||
|
@@ -418,8 +429,8 @@ def testAggressiveAutoscaling(self): | |
tag_filters={TAG_RAY_NODE_TYPE: "worker"}, ) | ||
addrs += head_ip | ||
for addr in addrs: | ||
lm.update(addr, {"CPU": 2}, {"CPU": 0}) | ||
lm.update(addr, {"CPU": 2}, {"CPU": 2}) | ||
lm.update(addr, {"CPU": 2}, {"CPU": 0}, {}) | ||
lm.update(addr, {"CPU": 2}, {"CPU": 2}, {}) | ||
assert autoscaler.bringup | ||
autoscaler.update() | ||
|
||
|
@@ -428,7 +439,7 @@ def testAggressiveAutoscaling(self): | |
self.waitForNodes(1) | ||
|
||
# All of the nodes are down. Simulate some load on the head node | ||
lm.update(head_ip, {"CPU": 2}, {"CPU": 0}) | ||
lm.update(head_ip, {"CPU": 2}, {"CPU": 0}, {}) | ||
|
||
autoscaler.update() | ||
self.waitForNodes(6) # expected due to batch sizes and concurrency | ||
|
@@ -702,17 +713,17 @@ def testScaleUpBasedOnLoad(self): | |
|
||
# Scales up as nodes are reported as used | ||
local_ip = services.get_node_ip_address() | ||
lm.update(local_ip, {"CPU": 2}, {"CPU": 0}) # head | ||
lm.update("172.0.0.0", {"CPU": 2}, {"CPU": 0}) # worker 1 | ||
lm.update(local_ip, {"CPU": 2}, {"CPU": 0}, {}) # head | ||
lm.update("172.0.0.0", {"CPU": 2}, {"CPU": 0}, {}) # worker 1 | ||
autoscaler.update() | ||
self.waitForNodes(3) | ||
lm.update("172.0.0.1", {"CPU": 2}, {"CPU": 0}) | ||
lm.update("172.0.0.1", {"CPU": 2}, {"CPU": 0}, {}) | ||
autoscaler.update() | ||
self.waitForNodes(5) | ||
|
||
# Holds steady when load is removed | ||
lm.update("172.0.0.0", {"CPU": 2}, {"CPU": 2}) | ||
lm.update("172.0.0.1", {"CPU": 2}, {"CPU": 2}) | ||
lm.update("172.0.0.0", {"CPU": 2}, {"CPU": 2}, {}) | ||
lm.update("172.0.0.1", {"CPU": 2}, {"CPU": 2}, {}) | ||
autoscaler.update() | ||
assert autoscaler.num_launches_pending.value == 0 | ||
assert len(self.provider.non_terminated_nodes({})) == 5 | ||
|
@@ -746,20 +757,20 @@ def testDontScaleBelowTarget(self): | |
|
||
# Scales up as nodes are reported as used | ||
local_ip = services.get_node_ip_address() | ||
lm.update(local_ip, {"CPU": 2}, {"CPU": 0}) # head | ||
lm.update(local_ip, {"CPU": 2}, {"CPU": 0}, {}) # head | ||
# 1.0 nodes used => target nodes = 2 => target workers = 1 | ||
autoscaler.update() | ||
self.waitForNodes(1) | ||
|
||
# Make new node idle, and never used. | ||
# Should hold steady as target is still 2. | ||
lm.update("172.0.0.0", {"CPU": 0}, {"CPU": 0}) | ||
lm.update("172.0.0.0", {"CPU": 0}, {"CPU": 0}, {}) | ||
lm.last_used_time_by_ip["172.0.0.0"] = 0 | ||
autoscaler.update() | ||
assert len(self.provider.non_terminated_nodes({})) == 1 | ||
|
||
# Reduce load on head => target nodes = 1 => target workers = 0 | ||
lm.update(local_ip, {"CPU": 2}, {"CPU": 1}) | ||
lm.update(local_ip, {"CPU": 2}, {"CPU": 1}, {}) | ||
autoscaler.update() | ||
assert len(self.provider.non_terminated_nodes({})) == 0 | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If you don't do this, the error message is lost forever trying to terminate nodes.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this mean nodes probably weren't cleaned up? If so, would be good to print that in the error message.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's done below actually