-
Notifications
You must be signed in to change notification settings - Fork 199
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Abstract more block handling from HighThroughputExecutor and share with WorkQueue #2071
Changes from all commits
800b490
d84591c
7130e29
7fccd2c
40a8c75
3011963
a0602d0
ce0f086
0bda48f
c32e3e6
e8c835d
7c46c16
ab1a8e0
99e163d
876b862
64c9ef1
2a01ba7
3f3b0a5
66e4b89
de9cc56
05d6db2
10a6817
eb00704
662fae3
98d3c74
dca99b9
0f07554
47d440b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,29 +1,57 @@ | ||
import logging | ||
import threading | ||
from itertools import compress | ||
from abc import abstractmethod | ||
from abc import abstractmethod, abstractproperty | ||
from concurrent.futures import Future | ||
from typing import List, Any, Dict, Tuple | ||
from typing import List, Any, Dict, Optional, Tuple, Union | ||
|
||
import parsl # noqa F401 | ||
from parsl.executors.base import ParslExecutor | ||
from parsl.executors.errors import ScalingFailed | ||
from parsl.providers.provider_base import JobStatus, ExecutionProvider, JobState | ||
|
||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class StatusHandlingExecutor(ParslExecutor): | ||
def __init__(self, provider): | ||
class BlockProviderExecutor(ParslExecutor): | ||
"""A base class for executors which scale using blocks. | ||
|
||
This base class is intended to help with executors which: | ||
|
||
- use blocks of workers to execute tasks | ||
- blocks of workers are launched on a batch system through | ||
an `ExecutionProvider` | ||
|
||
An implementing class should implement the abstract methods required by | ||
`ParslExecutor` to submit tasks, as well as BlockProviderExecutor | ||
abstract methods to provide the executor-specific command to start a block | ||
of workers (the ``_get_launch_command`` method), and some basic scaling | ||
information (``outstanding`` and ``workers_per_node`` properties). | ||
|
||
This base class provides a ``scale_out`` method which will launch new | ||
blocks. It does not provide a ``scale_in`` method, because scale-in | ||
behaviour is not well defined in the Parsl scaling model and so behaviour | ||
is left to individual executors. | ||
|
||
Parsl scaling will provide scaling between min_blocks and max_blocks by | ||
invoking scale_out, but it will not initialize the blocks requested by | ||
any init_blocks parameter. Subclasses must implement that behaviour | ||
themselves. | ||
""" | ||
def __init__(self, provider: ExecutionProvider):
    """Initialize block-tracking state for a block-scaling executor.

    :param provider: the ExecutionProvider used to submit blocks of workers.
    """
    super().__init__()
    self._provider = provider
    # errors can happen during the submit call to the provider; this is used
    # to keep track of such errors so that they can be handled in one place
    # together with errors reported by status()
    self._simulated_status: Dict[Any, JobStatus] = {}
    self._executor_bad_state = threading.Event()
    self._executor_exception: Optional[Exception] = None
    self._generated_block_id_counter = 1
    # task id -> future for submitted tasks
    self._tasks: Dict[object, Future] = {}
    # block id -> provider job id
    self.blocks: Dict[str, str] = {}
    # provider job id -> block id (reverse of self.blocks)
    self.block_mapping: Dict[str, str] = {}
|
||
def _make_status_dict(self, block_ids: List[str], status_list: List[JobStatus]) -> Dict[str, JobStatus]: | ||
"""Given a list of block ids and a list of corresponding status strings, | ||
|
@@ -51,11 +79,6 @@ def status_polling_interval(self): | |
else: | ||
return self._provider.status_polling_interval | ||
|
||
@abstractmethod | ||
def _get_block_and_job_ids(self) -> Tuple[List[str], List[Any]]: | ||
raise NotImplementedError("Classes inheriting from StatusHandlingExecutor must implement " | ||
"_get_block_and_job_ids()") | ||
|
||
def _fail_job_async(self, block_id: Any, message: str): | ||
"""Marks a job that has failed to start but would not otherwise be included in status() | ||
as failed and report it in status() | ||
|
@@ -65,6 +88,13 @@ def _fail_job_async(self, block_id: Any, message: str): | |
self._generated_block_id_counter += 1 | ||
self._simulated_status[block_id] = JobStatus(JobState.FAILED, message) | ||
|
||
@abstractproperty
def outstanding(self) -> int:
    """Count of tasks handed to this executor that have not finished:
    both tasks waiting to run and tasks currently running.
    """
    raise NotImplementedError(
        "Classes inheriting from BlockProviderExecutor must implement "
        "outstanding()")
|
||
def status(self) -> Dict[str, JobStatus]: | ||
"""Return status of all blocks.""" | ||
|
||
|
@@ -124,6 +154,51 @@ def _filter_scale_in_ids(self, to_kill, killed): | |
# Filters first iterable by bool values in second | ||
return list(compress(to_kill, killed)) | ||
|
||
def scale_out(self, blocks: int = 1) -> List[str]:
    """Scale out by launching ``blocks`` new blocks via the provider.

    Launch failures do not raise here: they are recorded with
    _fail_job_async so they surface later through status().

    :param blocks: number of new blocks to launch.
    :return: list of block IDs that were successfully launched.
    :raises ScalingFailed: if no execution provider is configured.
    """
    if not self.provider:
        # fixed: redundant parentheses around the raise expression
        raise ScalingFailed(None, "No execution provider available")
    block_ids = []
    for _ in range(blocks):  # loop index was unused
        # NOTE(review): if a launch fails, len(self.blocks) does not grow, so
        # the next iteration reuses the same block_id string — confirm this is
        # intended (failed ids are tracked separately in _simulated_status).
        block_id = str(len(self.blocks))
        try:
            job_id = self._launch_block(block_id)
            self.blocks[block_id] = job_id
            self.block_mapping[job_id] = block_id
            block_ids.append(block_id)
        except Exception as ex:
            self._fail_job_async(block_id,
                                 "Failed to start block {}: {}".format(block_id, ex))
    return block_ids
|
||
def _launch_block(self, block_id: str) -> Any:
    """Submit one block of workers to the provider.

    :param block_id: executor-assigned identifier for the new block.
    :return: the provider's job id for the submitted block.
    :raises ScalingFailed: if the provider returned a falsy job id.
    """
    launch_cmd = self._get_launch_command(block_id)
    job_id = self.provider.submit(launch_cmd, 1)
    logger.debug("Launched block {}->{}".format(block_id, job_id))
    if not job_id:
        # fixed: "raise(" — missing space and redundant parentheses
        raise ScalingFailed(self.provider.label,
                            "Attempts to provision nodes via provider has failed")
    return job_id
|
||
@abstractmethod | ||
def _get_launch_command(self, block_id: str) -> str: | ||
pass | ||
|
||
def _get_block_and_job_ids(self) -> Tuple[List[str], List[Any]]: | ||
# Not using self.blocks.keys() and self.blocks.values() simultaneously | ||
# The dictionary may be changed during invoking this function | ||
# As scale_in and scale_out are invoked in multiple threads | ||
block_ids = list(self.blocks.keys()) | ||
job_ids = [] # types: List[Any] | ||
for bid in block_ids: | ||
job_ids.append(self.blocks[bid]) | ||
return block_ids, job_ids | ||
|
||
@abstractproperty
def workers_per_node(self) -> Union[int, float]:
    """Workers provided per node (int or float, per the type annotation)."""
|
||
|
||
class NoStatusHandlingExecutor(ParslExecutor): | ||
def __init__(self): | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment; the reason will be displayed to describe this comment to others. Learn more.

I wonder if `BlockManagingExecutor` might be a clearer name.