Unify ThreadPoolExecutor usage in all Dashboard(Agent)Head. #47160
Changes from all commits
@@ -7,6 +7,7 @@
 import pathlib
 import signal
 import sys
+from concurrent.futures import ThreadPoolExecutor

 import ray
 import ray._private.ray_constants as ray_constants
@@ -47,6 +48,10 @@ def __init__(
         # Public attributes are accessible for all agent modules.
         self.ip = node_ip_address
         self.minimal = minimal
+        self.thread_pool_executor = ThreadPoolExecutor(
+            max_workers=dashboard_consts.RAY_AGENT_THREAD_POOL_MAX_WORKERS,
+            thread_name_prefix="dashboard_agent_tpe",
+        )

         assert gcs_address is not None
         self.gcs_address = gcs_address

Review comment on the new ThreadPoolExecutor: Please check my comment above. We need to …
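Stripped of the Ray specifics, the shape of the agent-side change is roughly the sketch below: the agent owns one bounded, named executor that any agent module can borrow for blocking work. `DashboardAgentLike`, its `run_blocking()` helper, and the plain `max_workers` argument are illustrative stand-ins, not Ray's real API or constants.

```python
import asyncio
from concurrent.futures import ThreadPoolExecutor


class DashboardAgentLike:
    def __init__(self, max_workers: int = 1):
        # One executor per process: thread usage stays predictable, and the
        # name prefix makes the worker threads easy to spot in stack dumps.
        self.thread_pool_executor = ThreadPoolExecutor(
            max_workers=max_workers,
            thread_name_prefix="dashboard_agent_tpe",
        )

    async def run_blocking(self, fn, *args):
        # Helper for modules: offload a blocking call and yield to the event loop.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(self.thread_pool_executor, fn, *args)


async def _demo():
    agent = DashboardAgentLike()
    # Stand-in for a CPU-bound or file-reading helper from some agent module.
    print(await agent.run_blocking(sum, range(1_000_000)))


if __name__ == "__main__":
    asyncio.run(_demo())
```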
@@ -3,6 +3,7 @@
 from typing import Any, List, Optional

 import ray.dashboard.consts as dashboard_consts
+from ray._private.utils import get_or_create_event_loop
 from ray.dashboard.utils import (
     Dict,
     MutableNotificationDict,
@@ -13,6 +14,7 @@
 logger = logging.getLogger(__name__)


+# NOT thread safe. Every assignment must be on the main event loop thread.
 class DataSource:
     # {node id hex(str): node stats(dict of GetNodeStatsReply
     # in node_manager.proto)}
@@ -64,12 +66,29 @@ async def purge():

     @classmethod
     @async_loop_forever(dashboard_consts.RAY_DASHBOARD_STATS_UPDATING_INTERVAL)
-    async def organize(cls):
+    async def organize(cls, thread_pool_executor):
+        """
+        Organizes data: read from (node_physical_stats, node_stats) and updates
+        (node_workers, node_worker_stats).
+
+        This method is not really async, but DataSource is not thread safe, so we
+        need to make sure it runs on the main event loop thread. To avoid blocking
+        the main event loop, we yield after each node is processed.
+        """
         node_workers = {}
         core_worker_stats = {}
-        # await inside for loop, so we create a copy of keys().
+        # nodes may change during processing, so we create a copy of keys().
         for node_id in list(DataSource.nodes.keys()):
-            workers = await cls.get_node_workers(node_id)
+            node_physical_stats = DataSource.node_physical_stats.get(node_id, {})
+            node_stats = DataSource.node_stats.get(node_id, {})
+            # Offloads the blocking operation to a thread pool executor. This also
+            # yields to the event loop.
+            workers = await get_or_create_event_loop().run_in_executor(
+                thread_pool_executor,
+                cls.merge_workers_for_node,
+                node_physical_stats,
+                node_stats,
+            )
             for worker in workers:
                 stats = worker.get("coreWorkerStats", {})
                 if stats:
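The organize() change boils down to the following pattern: per-node snapshots are read on the event loop thread, the CPU-heavy merge runs on the executor, and awaiting `run_in_executor` gives the loop a chance to serve other tasks between nodes. The module-level dicts and `merge_stats()` below are stand-ins for DataSource and `merge_workers_for_node`, not Ray's code.

```python
import asyncio
from concurrent.futures import ThreadPoolExecutor

# Stand-ins for DataSource.nodes / node_physical_stats / node_stats (illustrative only).
nodes = {"node-1": {}, "node-2": {}}
node_physical_stats = {"node-1": {"cpu": 50}, "node-2": {"cpu": 10}}
node_stats = {"node-1": {"workers": 3}, "node-2": {"workers": 1}}


def merge_stats(physical, stats):
    # CPU-bound merge; safe to run off the event loop because it only reads its arguments.
    return {**physical, **stats}


async def organize(executor):
    loop = asyncio.get_running_loop()
    merged = {}
    # Copy keys() because we await inside the loop and the dict may change meanwhile.
    for node_id in list(nodes.keys()):
        physical = node_physical_stats.get(node_id, {})
        stats = node_stats.get(node_id, {})
        # Offload the blocking work; awaiting here also yields control to the event loop.
        merged[node_id] = await loop.run_in_executor(executor, merge_stats, physical, stats)
    return merged


if __name__ == "__main__":
    with ThreadPoolExecutor(max_workers=1, thread_name_prefix="sketch_tpe") as tpe:
        print(asyncio.run(organize(tpe)))
```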
@@ -80,10 +99,8 @@ async def organize(cls):
         DataSource.core_worker_stats.reset(core_worker_stats)

     @classmethod
-    async def get_node_workers(cls, node_id):
+    def merge_workers_for_node(cls, node_physical_stats, node_stats):
         workers = []
-        node_physical_stats = DataSource.node_physical_stats.get(node_id, {})
-        node_stats = DataSource.node_stats.get(node_id, {})
         # Merge coreWorkerStats (node stats) to workers (node physical stats)
         pid_to_worker_stats = {}
         pid_to_language = {}

Review comment: Is DataSource thread safe given now it's accessed in another thread as well?
Reply: not sure...
Reply: +1 These custom data-structures aren't thread-safe. If we're planning to run this on TPE we need to cover these with locks.
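For context on the thread above: the hunk moves the DataSource reads out of the worker function and onto the event loop thread, so the worker receives plain snapshots as arguments. The alternative the reviewers mention would look roughly like the sketch below, where a hypothetical `ThreadSafeDict` guards a mapping with a lock; this is not part of the PR, just an illustration of the suggestion.

```python
import threading


class ThreadSafeDict:
    """Illustrative lock-guarded mapping; not Ray's MutableNotificationDict."""

    def __init__(self):
        self._lock = threading.Lock()
        self._data = {}

    def get(self, key, default=None):
        with self._lock:
            return self._data.get(key, default)

    def reset(self, new_data):
        # Replace the whole mapping atomically, mirroring the reset() call in the diff.
        with self._lock:
            self._data = dict(new_data)
```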
@@ -1,5 +1,6 @@
 import asyncio
 import logging
+from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
 from typing import Optional, Set

@@ -96,6 +97,13 @@ def __init__(
         self._modules_to_load = modules_to_load
         self._modules_loaded = False

+        # A TPE holding background, compute-heavy, latency-tolerant jobs, typically
+        # state updates.
+        self._thread_pool_executor = ThreadPoolExecutor(
+            max_workers=dashboard_consts.RAY_DASHBOARD_THREAD_POOL_MAX_WORKERS,
+            thread_name_prefix="dashboard_head_tpe",
+        )
+
         self.gcs_address = None
         assert gcs_address is not None
         self.gcs_address = gcs_address

Review comment on the new ThreadPoolExecutor: Check comment above.
@@ -312,7 +320,7 @@ async def _async_notify():
             self._gcs_check_alive(),
             _async_notify(),
             DataOrganizer.purge(),
-            DataOrganizer.organize(),
+            DataOrganizer.organize(self._thread_pool_executor),
         ]
         for m in modules:
             concurrent_tasks.append(m.run(self.server))
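Putting the two head.py hunks together, the flow is approximately this: the head builds one executor and threads it into the long-running organize coroutine that it gathers alongside its other background tasks. Everything below (`run_head`, `organize_like`, the placeholder work and the bounded loop) is an illustrative sketch, not Ray's DashboardHead.

```python
import asyncio
from concurrent.futures import ThreadPoolExecutor


async def organize_like(executor, iterations=3):
    # Stands in for DataOrganizer.organize, which loops forever via async_loop_forever;
    # bounded here so the sketch terminates.
    loop = asyncio.get_running_loop()
    for _ in range(iterations):
        # Placeholder for the compute-heavy, latency-tolerant work.
        await loop.run_in_executor(executor, sum, range(100_000))
        await asyncio.sleep(0.1)


async def run_head():
    executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="dashboard_head_tpe")
    concurrent_tasks = [
        organize_like(executor),
        # ...other background coroutines and each module's run() would go here...
    ]
    await asyncio.gather(*concurrent_tasks)


if __name__ == "__main__":
    asyncio.run(run_head())
```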
@@ -3,7 +3,6 @@
 import json
 import logging
 import traceback
-from concurrent.futures import ThreadPoolExecutor
 from random import sample
 from typing import AsyncIterator, List, Optional

@@ -163,9 +162,6 @@ def __init__(self, dashboard_head):
         super().__init__(dashboard_head)
         self._gcs_aio_client = dashboard_head.gcs_aio_client
         self._job_info_client = None
-        self._upload_package_thread_pool_executor = ThreadPoolExecutor(
-            thread_name_prefix="job_head.upload_package"
-        )

         # It contains all `JobAgentSubmissionClient` that
         # `JobHead` has ever used, and will not be deleted
@@ -317,7 +313,7 @@ async def upload_package(self, req: Request):
         try:
             data = await req.read()
             await get_or_create_event_loop().run_in_executor(
-                self._upload_package_thread_pool_executor,
+                self._dashboard_head._thread_pool_executor,
                 upload_package_to_gcs,
                 package_uri,
                 data,

Review comment: Is it critical enough to deserve its own thread pool?
Reply: I don't think so because it's not latency critical either.
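The net effect of the job_head.py changes: the handler keeps offloading the blocking GCS upload, but onto the head's shared executor instead of a module-private one. The sketch below keeps the names from the diff (`upload_package_to_gcs`, `package_uri`, `_thread_pool_executor`), while the surrounding classes and the no-op upload stub are illustrative, not Ray's real aiohttp handler.

```python
import asyncio
from concurrent.futures import ThreadPoolExecutor


def upload_package_to_gcs(package_uri: str, data: bytes) -> None:
    # Stand-in for Ray's blocking upload helper.
    print(f"uploading {len(data)} bytes to {package_uri}")


class HeadLike:
    def __init__(self):
        self._thread_pool_executor = ThreadPoolExecutor(
            max_workers=1, thread_name_prefix="dashboard_head_tpe"
        )


class JobHeadLike:
    def __init__(self, dashboard_head: HeadLike):
        # No private executor anymore; borrow the head's shared one.
        self._dashboard_head = dashboard_head

    async def upload_package(self, package_uri: str, data: bytes) -> None:
        loop = asyncio.get_running_loop()
        # The blocking upload runs on the shared executor; the HTTP handler
        # (an aiohttp route in the real code) stays responsive meanwhile.
        await loop.run_in_executor(
            self._dashboard_head._thread_pool_executor,
            upload_package_to_gcs,
            package_uri,
            data,
        )


if __name__ == "__main__":
    head = HeadLike()
    asyncio.run(JobHeadLike(head).upload_package("gcs://pkg.zip", b"\x00" * 1024))
```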
Review comment: Provided that we're planning on offloading CPU-bound tasks that still hold the GIL, we should limit the number of threads in the TPE (by default a TPE provisions up to # of CPUs + 4 threads).
Reply: changed to 1
Reply: changed in head.py, I mean
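For reference, ThreadPoolExecutor's default `max_workers` has been `min(32, os.cpu_count() + 4)` since Python 3.8, which is why the reviewer asks for an explicit bound for GIL-holding, CPU-bound work. A bounded, env-overridable setup could look like the sketch below; the constant name mirrors the one in the diff, but the env-var fallback of 1 is an assumption, not a quote of dashboard_consts.

```python
import os
from concurrent.futures import ThreadPoolExecutor

# Assumed definition: the real constant comes from ray.dashboard.consts; here it
# simply defaults to a single worker, matching the "changed to 1" reply above.
RAY_DASHBOARD_THREAD_POOL_MAX_WORKERS = int(
    os.environ.get("RAY_DASHBOARD_THREAD_POOL_MAX_WORKERS", "1")
)

# With max_workers pinned, CPU-bound jobs queue up behind one another instead of
# spawning many threads that would mostly contend on the GIL.
executor = ThreadPoolExecutor(
    max_workers=RAY_DASHBOARD_THREAD_POOL_MAX_WORKERS,
    thread_name_prefix="dashboard_head_tpe",
)
```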