2470 report batch times automlsearch #3577
Changes from 23 commits
@@ -62,6 +62,7 @@
 from evalml.utils.gen_utils import contains_all_ts_parameters
 from evalml.utils.logger import (
     get_logger,
+    log_batch_times,
     log_subtitle,
     log_title,
     time_elapsed,
@@ -857,16 +858,34 @@ def _handle_keyboard_interrupt(self):
         else:
             leading_char = ""

-    def search(self, show_iteration_plot=True):
+    def search(self, show_iteration_plot=True, timing=None):
         """Find the best pipeline for the data set.

         Args:
             show_iteration_plot (boolean, True): Shows an iteration vs. score plot in Jupyter notebook.
                 Disabled by default in non-Jupyter environments.
+            timing (str, None): Shows timing of the batches and the individual timings of each pipeline.
+                Default: None
+                log=prints out batch/pipeline timing to console.
Review comment:
Since the only options are "log" or None, I vote that we switch to a boolean flag for this.

Review comment:
agreed as well!
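The flag name the first reviewer proposed was cut off in this capture, so here is only a rough sketch of the boolean-flag alternative; `verbose_timing` is a stand-in name, not the reviewer's actual suggestion:

class AutoMLSearch:
    # Hypothetical: the "log"/None string option replaced with a boolean
    # flag, per the comment above. `verbose_timing` is a stand-in name.
    def search(self, show_iteration_plot=True, verbose_timing=False):
        batch_times = {}
        # ... run batches and populate batch_times, as in the diff below ...
        if verbose_timing:
            print("Batch Time Stats")  # stand-in for log_title/log_batch_times
        return batch_times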
+
         Raises:
             AutoMLSearchException: If all pipelines in the current AutoML batch produced a score of np.nan on the primary objective.
+            ValueError: If timing is not set to a correct value.
+
+        Returns:
+            Dict[int, Dict[str, Timestamp]]: Returns dict.
+                Key=batch #, value=Dict[key=pipeline name, value=timestamp of pipeline].
+                Inner dict has key called "Total time of batch" with value=total time of batch.
Review comment:
This is really hard to understand without reading closely. I'd refactor it.
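To make the documented structure concrete, this is the shape of the returned dict as the docstring describes it; the pipeline names and elapsed-time strings below are invented for illustration:

# Illustrative shape of the dict returned by search(). The keys come from
# self._get_batch_number() and pipeline.name in the diff below; all values
# here are made up.
batch_times = {
    1: {
        "Mode Baseline Binary Classification Pipeline": "00:00:01",
        "Total time of batch": "00:00:01",
    },
    2: {
        "Random Forest Classifier w/ Imputer": "00:00:08",
        "Logistic Regression Classifier w/ Imputer": "00:00:05",
        "Total time of batch": "00:00:14",
    },
}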
         """
+        batch_times = {}
+        if timing is not None:
+            timing = timing.lower()
+
+        if timing != "log" and timing is not None:
+            raise ValueError(
+                """Timing isn't set to a correct value! Please try again using "log"."""
+            )
+
         if self._searched:
             self.logger.error(
                 "AutoMLSearch.search() has already been run and will not run again on the same instance. Re-initialize AutoMLSearch to search again."
@@ -924,7 +943,10 @@ def search(self, show_iteration_plot=True):
         current_batch_pipeline_scores = []
         new_pipeline_ids = []
         loop_interrupted = False

         while self._should_continue():
+            pipeline_times = {}
+            start_batch_time = time.time()
             computations = []
             try:
                 if not loop_interrupted:
@@ -952,6 +974,7 @@
                         current_computation_index
                     ]
                     if computation.done() and not has_been_processed:
+                        start_pipeline_time = time.time()
                         evaluation = computation.get_result()
                         data, cached_data, pipeline, job_log = (
                             evaluation.get("scores"),
@@ -962,6 +985,9 @@
                         pipeline_id = self._post_evaluation_callback(
                             pipeline, data, cached_data, job_log
                         )
+                        pipeline_times[pipeline.name] = time_elapsed(
+                            start_pipeline_time
+                        )
                         new_pipeline_ids.append(pipeline_id)
                         computations[current_computation_index] = (computation, True)
                         computations_left_to_process -= 1
@@ -983,20 +1009,28 @@
             current_batch_pipeline_scores = full_rankings[current_batch_idx][
                 "validation_score"
             ]

             if (
                 len(current_batch_pipeline_scores)
                 and current_batch_pipeline_scores.isna().all()
             ):
                 raise AutoMLSearchException(
                     f"All pipelines in the current AutoML batch produced a score of np.nan on the primary objective {self.objective}."
                 )
+            if len(pipeline_times) > 0:
+                pipeline_times["Total time of batch"] = time_elapsed(start_batch_time)
+                batch_times[self._get_batch_number()] = pipeline_times

         self.search_duration = time.time() - self._start
         elapsed_time = time_elapsed(self._start)
         desc = f"\nSearch finished after {elapsed_time}"
         desc = desc.ljust(self._MAX_NAME_LEN)
         self.logger.info(desc)

+        if timing == "log":
+            log_title(self.logger, "Batch Time Stats")
+            log_batch_times(self.logger, batch_times)
Review comment:
I would move the call to
+
         self._find_best_pipeline()
         if self._best_pipeline is not None:
             best_pipeline = self.rankings.iloc[0]
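log_batch_times, like time_elapsed, is only imported in this diff; its implementation lives elsewhere in evalml.utils.logger. As a guess at its contract given the batch_times structure built above (a sketch, not the PR's actual helper):

def log_batch_times(logger, batch_times):
    # Sketch only: walk the {batch number: {pipeline name: elapsed}} dict
    # produced by search() and emit one line per pipeline, per batch.
    for batch_number, pipeline_times in batch_times.items():
        logger.info(f"Batch {batch_number} time stats:")
        for name, elapsed in pipeline_times.items():
            logger.info(f"{name}: {elapsed}")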
@@ -1006,6 +1040,7 @@ def search(self, show_iteration_plot=True):
                 f"Best pipeline {self.objective.name}: {best_pipeline['validation_score']:3f}"
             )
         self._searched = True
+        return batch_times

     def _find_best_pipeline(self):
         """Finds the best pipeline in the rankings. If self._best_pipeline already exists, check to make sure it is different from the current best pipeline before training and thresholding."""
Review comment (eccabay):
I think we should move this to be an argument in AutoMLSearch.__init__ instead of AutoMLSearch.search. Reason being, we have two ways for users to run search. This is one of them, but we're trying to move more over to running the top-level search method instead of manually instantiating AutoMLSearch first. With the argument living here, users have no access to the argument. If we move the arg to AutoMLSearch.__init__ and add it to the top-level search methods as well, that will ensure users have full access to controlling this.

Review comment:
agreed - thanks for covering this @eccabay!
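A sketch of the restructuring eccabay proposes, with timing accepted at construction so both entry points can honor it; everything beyond the timing handling is pared down and illustrative, not the PR's final API:

class AutoMLSearch:
    def __init__(self, timing=None):
        # Validate once at construction so AutoMLSearch.search() and the
        # top-level search() helper can share the same check.
        if timing is not None:
            timing = timing.lower()
        if timing not in (None, "log"):
            raise ValueError(
                """Timing isn't set to a correct value! Please try again using "log"."""
            )
        self.timing = timing

    def search(self, show_iteration_plot=True):
        batch_times = {}
        # ... run batches and fill batch_times, as in the diff above ...
        if self.timing == "log":
            print("Batch Time Stats")  # stand-in for log_title/log_batch_times
        return batch_times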