Skip to content

Commit

Permalink
Merge #4499 and #4481 into chrome branch (#4505)
Browse files Browse the repository at this point in the history
Running CI checks with a PR prior to deployment
  • Loading branch information
vitorguidi authored Dec 16, 2024
1 parent 1461978 commit 19fea40
Show file tree
Hide file tree
Showing 5 changed files with 115 additions and 14 deletions.
64 changes: 58 additions & 6 deletions src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,50 @@ def __init__(self, subtask: _Subtask):
self._subtask = subtask
self._labels = None
self.utask_main_failure = None
self._utask_success_conditions = [
uworker_msg_pb2.ErrorType.NO_ERROR, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.ANALYZE_NO_CRASH, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.PROGRESSION_BAD_STATE_MIN_MAX, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.REGRESSION_NO_CRASH, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.REGRESSION_LOW_CONFIDENCE_IN_REGRESSION_RANGE, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.MINIMIZE_UNREPRODUCIBLE_CRASH, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.MINIMIZE_CRASH_TOO_FLAKY, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.LIBFUZZER_MINIMIZATION_UNREPRODUCIBLE, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.ANALYZE_CLOSE_INVALID_UPLOADED, # pylint: disable=no-member
]
self._utask_maybe_retry_conditions = [
uworker_msg_pb2.ErrorType.ANALYZE_BUILD_SETUP, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.ANALYZE_NO_REVISIONS_LIST, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.TESTCASE_SETUP, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.MINIMIZE_SETUP, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.FUZZ_DATA_BUNDLE_SETUP_FAILURE, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.FUZZ_NO_FUZZ_TARGET_SELECTED, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.PROGRESSION_NO_CRASH, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.PROGRESSION_TIMEOUT, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.PROGRESSION_BUILD_SETUP_ERROR, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.REGRESSION_BUILD_SETUP_ERROR, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.REGRESSION_TIMEOUT_ERROR, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.SYMBOLIZE_BUILD_SETUP_ERROR, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.MINIMIZE_DEADLINE_EXCEEDED, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.MINIMIZE_DEADLINE_EXCEEDED_IN_MAIN_FILE_PHASE, # pylint: disable=no-member
]
self._utask_failure_conditions = [
uworker_msg_pb2.ErrorType.ANALYZE_NO_REVISION_INDEX, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.UNHANDLED, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.VARIANT_BUILD_SETUP, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.FUZZ_BUILD_SETUP_FAILURE, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.FUZZ_NO_FUZZER, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.PROGRESSION_REVISION_LIST_ERROR, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.PROGRESSION_BUILD_NOT_FOUND, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.PROGRESSION_BAD_BUILD, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.REGRESSION_REVISION_LIST_ERROR, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.REGRESSION_BUILD_NOT_FOUND, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.REGRESSION_BAD_BUILD_ERROR, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.LIBFUZZER_MINIMIZATION_FAILED, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.CORPUS_PRUNING_FUZZER_SETUP_FAILED, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.CORPUS_PRUNING_ERROR, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.FUZZ_BAD_BUILD, # pylint: disable=no-member
]

if subtask == _Subtask.PREPROCESS:
self._preprocess_start_time_ns = self.start_time_ns
Expand Down Expand Up @@ -125,6 +169,18 @@ def set_task_details(self,
# Ensure we always have a value after this method returns.
assert self._preprocess_start_time_ns is not None

def _infer_uworker_main_outcome(self, exc_type, uworker_error):
'''Infers, on a best effort basis, whether an uworker output implies
success or failure. If an unequivocal response is not possible,
classifies as maybe_retry.'''
if exc_type or uworker_error in self._utask_failure_conditions:
outcome = 'error'
elif uworker_error in self._utask_maybe_retry_conditions:
outcome = 'maybe_retry'
else:
outcome = 'success'
return outcome

def __exit__(self, _exc_type, _exc_value, _traceback):
# Ignore exception details, let Python continue unwinding the stack.

Expand All @@ -145,7 +201,8 @@ def __exit__(self, _exc_type, _exc_value, _traceback):
# The only case where a task might fail without throwing, is in
# utask_main, by returning an ErrorType proto which indicates
# failure.
outcome = 'error' if _exc_type or self.utask_main_failure else 'success'
outcome = self._infer_uworker_main_outcome(_exc_type,
self.utask_main_failure)
monitoring_metrics.TASK_OUTCOME_COUNT.increment({
**self._labels, 'outcome': outcome
})
Expand All @@ -166,11 +223,6 @@ def __exit__(self, _exc_type, _exc_value, _traceback):
monitoring_metrics.TASK_OUTCOME_COUNT_BY_ERROR_TYPE.increment(
trimmed_labels)

if error_condition != 'UNHANDLED_EXCEPTION':
task = self._labels['task']
subtask = self._labels['subtask']
logs.info(f'Task {task}, at subtask {subtask}, finished successfully.')


def ensure_uworker_env_type_safety(uworker_env):
"""Converts all values in |uworker_env| to str types.
Expand Down
24 changes: 19 additions & 5 deletions src/clusterfuzz/_internal/common/testcase_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@


def emit_testcase_triage_duration_metric(testcase_id: int, step: str):
'''Finds out if a testcase is fuzzer generated or manually uploaded,
and emits the TESTCASE_UPLOAD_TRIAGE_DURATION metric.'''
testcase_upload_metadata = get_testcase_upload_metadata(testcase_id)
if not testcase_upload_metadata:
logs.warning(f'No upload metadata found for testcase {testcase_id},'
Expand All @@ -45,9 +47,6 @@ def emit_testcase_triage_duration_metric(testcase_id: int, step: str):
'analyze_launched', 'analyze_completed', 'minimize_completed',
'regression_completed', 'impact_completed', 'issue_updated'
]
elapsed_time_since_upload = datetime.datetime.utcnow()
elapsed_time_since_upload -= testcase_upload_metadata.timestamp
elapsed_time_since_upload = elapsed_time_since_upload.total_seconds() / 3600

testcase = data_handler.get_testcase_by_id(testcase_id)

Expand All @@ -61,15 +60,30 @@ def emit_testcase_triage_duration_metric(testcase_id: int, step: str):
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
return

from_fuzzer = not get_testcase_upload_metadata(testcase_id)

assert step in [
'analyze_launched', 'analyze_completed', 'minimize_completed',
'regression_completed', 'impact_completed', 'issue_updated'
]

if not testcase.get_age_in_seconds():
logs.warning(f'No timestamp associated to testcase {testcase_id},'
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
return

testcase_age_in_hours = testcase.get_age_in_seconds() / 3600

logs.info('Emiting TESTCASE_UPLOAD_TRIAGE_DURATION metric for testcase '
f'{testcase_id} (age = {elapsed_time_since_upload}) '
f'{testcase_id} (age = {testcase_age_in_hours} hours.) '
'in step {step}.')

monitoring_metrics.TESTCASE_UPLOAD_TRIAGE_DURATION.add(
elapsed_time_since_upload,
testcase_age_in_hours,
labels={
'job': testcase.job_type,
'step': step,
'origin': 'fuzzer' if from_fuzzer else 'manually_uploaded'
})


Expand Down
30 changes: 29 additions & 1 deletion src/clusterfuzz/_internal/cron/triage.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,13 @@ def _file_issue(testcase, issue_tracker, throttler):
return filed


def _set_testcase_stuck_state(testcase: data_types.Testcase, state: bool):
  """Records whether |testcase| is currently stuck in triage.

  Only writes to datastore when the flag actually changes, so repeated
  triage passes over an unchanged testcase do not issue redundant puts.
  """
  if testcase.stuck_in_triage != state:
    testcase.stuck_in_triage = state
    testcase.put()


untriaged_testcases = {}


Expand All @@ -322,7 +329,8 @@ def _increment_untriaged_testcase_count(job, status):
def _emit_untriaged_testcase_count_metric():
  """Flushes the accumulated per-(job, status) untriaged testcase counts
  to the UNTRIAGED_TESTCASE_COUNT gauge metric."""
  for (job, status), count in untriaged_testcases.items():
    monitoring_metrics.UNTRIAGED_TESTCASE_COUNT.set(
        count,
        labels={
            'job': job,
            'status': status,
        })
Expand Down Expand Up @@ -383,14 +391,21 @@ def main():

# Skip if testcase's job is removed.
if testcase.job_type not in all_jobs:
_set_testcase_stuck_state(testcase, False)
logs.info(f'Skipping testcase {testcase_id}, since its job was removed '
f' ({testcase.job_type})')
continue

# Skip if testcase's job is in exclusions list.
if testcase.job_type in excluded_jobs:
_set_testcase_stuck_state(testcase, False)
logs.info(f'Skipping testcase {testcase_id}, since its job is in the'
f' exclusion list ({testcase.job_type})')
continue

# Skip if we are running progression task at this time.
if testcase.get_metadata('progression_pending'):
_set_testcase_stuck_state(testcase, True)
logs.info(f'Skipping testcase {testcase_id}, progression pending')
_emit_untriaged_testcase_age_metric(testcase)
_increment_untriaged_testcase_count(testcase.job_type,
Expand All @@ -399,17 +414,24 @@ def main():

# If the testcase has a bug filed already, no triage is needed.
if _is_bug_filed(testcase):
_set_testcase_stuck_state(testcase, False)
logs.info(
f'Skipping testcase {testcase_id}, since a bug was already filed.')
continue

# Check if the crash is important, i.e. it is either a reproducible crash
# or an unreproducible crash happening frequently.
if not _is_crash_important(testcase):
_set_testcase_stuck_state(testcase, False)
logs.info(
f'Skipping testcase {testcase_id}, since the crash is not important.')
continue

# Require that all tasks like minimizaton, regression testing, etc have
# finished.
if not critical_tasks_completed:
_emit_untriaged_testcase_age_metric(testcase)
_set_testcase_stuck_state(testcase, True)
_increment_untriaged_testcase_count(testcase.job_type,
PENDING_CRITICAL_TASKS)
logs.info(
Expand All @@ -429,13 +451,15 @@ def main():
if not testcase.group_id and not dates.time_has_expired(
testcase.timestamp, hours=data_types.MIN_ELAPSED_TIME_SINCE_REPORT):
_emit_untriaged_testcase_age_metric(testcase)
_set_testcase_stuck_state(testcase, True)
_increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
continue

if not testcase.get_metadata('ran_grouper'):
# Testcase should be considered by the grouper first before filing.
_emit_untriaged_testcase_age_metric(testcase)
_set_testcase_stuck_state(testcase, True)
_increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
continue
Expand All @@ -450,13 +474,17 @@ def main():
# If there are similar issues to this test case already filed or recently
# closed, skip filing a duplicate bug.
if _check_and_update_similar_bug(testcase, issue_tracker):
_set_testcase_stuck_state(testcase, False)
logs.info(f'Skipping testcase {testcase_id}, since a similar bug'
' was already filed.')
continue

# Clean up old triage messages that would be not applicable now.
testcase.delete_metadata(TRIAGE_MESSAGE_KEY, update_testcase=False)

# A testcase is untriaged, until immediately before a bug is opened
_emit_untriaged_testcase_age_metric(testcase)
_set_testcase_stuck_state(testcase, False)
_increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING)

# File the bug first and then create filed bug metadata.
Expand Down
5 changes: 5 additions & 0 deletions src/clusterfuzz/_internal/datastore/data_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -580,6 +580,9 @@ class Testcase(Model):
# corpus.
trusted = ndb.BooleanProperty(default=False)

# Tracks if a testcase is stuck during triage.
stuck_in_triage = ndb.BooleanProperty(default=False)

def is_chromium(self):
return self.project_name in ('chromium', 'chromium-testing')

Expand Down Expand Up @@ -686,6 +689,8 @@ def get_created_time(self) -> ndb.DateTimeProperty:

def get_age_in_seconds(self):
  """Returns the testcase age in seconds, measured from its creation
  time to the current UTC time, or None when the creation time is
  unavailable."""
  created_time = self.get_created_time()
  # Explicit None check (not truthiness) and a single call to
  # get_created_time(): the original called it twice and relied on the
  # truthiness of a datetime value.
  if created_time is None:
    return None
  testcase_age = datetime.datetime.utcnow() - created_time
  return testcase_age.total_seconds()

Expand Down
6 changes: 4 additions & 2 deletions src/clusterfuzz/_internal/metrics/monitoring_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,12 +223,14 @@
TESTCASE_UPLOAD_TRIAGE_DURATION = monitor.CumulativeDistributionMetric(
'uploaded_testcase_analysis/triage_duration_secs',
description=('Time elapsed between testcase upload and completion'
' of relevant tasks in the testcase upload lifecycle, '
'in hours.'),
' of relevant tasks in the testcase upload lifecycle.'
' Origin can be either from a fuzzer, or a manual'
' upload. Measured in hours.'),
bucketer=monitor.GeometricBucketer(),
field_spec=[
monitor.StringField('step'),
monitor.StringField('job'),
monitor.StringField('origin'),
],
)

Expand Down

0 comments on commit 19fea40

Please sign in to comment.