Skip to content

Commit

Permalink
[Monitoring] Enable job and status granularity in UNTRIAGED_TESTCASE_…
Browse files Browse the repository at this point in the history
…COUNT (#4500)

### Motivation

The UNTRIAGED_TESTCASE_COUNT metric worked as expected; however, it was
hard to figure out which job a testcase belongs to and why the testcase
is stuck. This PR adds a job label and a status label; the status can be
one of four values:

PENDING_CRITICAL_TASKS = 'pending_critical_tasks'
PENDING_PROGRESSION = 'pending_progression'
PENDING_GROUPING = 'pending_grouping'
PENDING_FILING = 'pending_filing'

Since (job, status) tuples might disappear from one triage run to the
next, the period of time for which the gauge is valid should be, at most,
the interval at which the triage cronjob runs; otherwise, the metric
might overcount testcases.

Part of #4271
  • Loading branch information
vitorguidi authored and jonathanmetzman committed Jan 8, 2025
1 parent 4ecbd36 commit d4b7c81
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 11 deletions.
43 changes: 33 additions & 10 deletions src/clusterfuzz/_internal/cron/triage.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,25 @@ def _file_issue(testcase, issue_tracker, throttler):
return filed


# Maps (job, status) tuples to the number of untriaged testcases counted for
# that pair.
untriaged_testcases = {}


def _increment_untriaged_testcase_count(job, status):
  """Adds one to the untriaged testcase tally for the given job and status."""
  key = (job, status)
  # A pair seen for the first time starts from zero.
  untriaged_testcases[key] = untriaged_testcases.get(key, 0) + 1


def _emit_untriaged_testcase_count_metric():
  """Emits UNTRIAGED_TESTCASE_COUNT once per (job, status) pair.

  Reads the module-level `untriaged_testcases` tally and, for every
  (job, status) key in it, sets the metric to that key's count, labelled with
  the corresponding job and status.
  """
  for (job, status), count in untriaged_testcases.items():
    # Pass this pair's own count rather than the whole tally dict.
    monitoring_metrics.UNTRIAGED_TESTCASE_COUNT.set(
        count, labels={
            'job': job,
            'status': status,
        })


def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase):
"""Emmits a metric to track age of untriaged testcases."""
if not testcase.timestamp:
Expand All @@ -324,6 +343,12 @@ def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase):
})


# Status values recorded alongside the job when counting untriaged testcases;
# each names the triage stage a testcase is still waiting on.
# Critical tasks for the testcase have not completed yet.
PENDING_CRITICAL_TASKS = 'pending_critical_tasks'
# The testcase metadata has 'progression_pending' set.
PENDING_PROGRESSION = 'pending_progression'
# The testcase has not been grouped or considered by the grouper yet.
PENDING_GROUPING = 'pending_grouping'
# The testcase passed the earlier checks and is counted right before filing.
PENDING_FILING = 'pending_filing'


def main():
"""Files bugs."""
try:
Expand All @@ -346,8 +371,6 @@ def main():

throttler = Throttler()

untriaged_testcases = 0

for testcase_id in data_handler.get_open_testcase_id_iterator():
logs.info(f'Triaging {testcase_id}')
try:
Expand Down Expand Up @@ -376,7 +399,8 @@ def main():
if testcase.get_metadata('progression_pending'):
logs.info(f'Skipping testcase {testcase_id}, progression pending')
_emit_untriaged_testcase_age_metric(testcase)
untriaged_testcases += 1
_increment_untriaged_testcase_count(testcase.job_type,
PENDING_PROGRESSION)
continue

# If the testcase has a bug filed already, no triage is needed.
Expand All @@ -396,7 +420,8 @@ def main():
# finished.
if not critical_tasks_completed:
_emit_untriaged_testcase_age_metric(testcase)
untriaged_testcases += 1
_increment_untriaged_testcase_count(testcase.job_type,
PENDING_CRITICAL_TASKS)
logs.info(
f'Skipping testcase {testcase_id}, critical tasks still pending.')
continue
Expand All @@ -414,14 +439,14 @@ def main():
if not testcase.group_id and not dates.time_has_expired(
testcase.timestamp, hours=data_types.MIN_ELAPSED_TIME_SINCE_REPORT):
_emit_untriaged_testcase_age_metric(testcase)
untriaged_testcases += 1
_increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
continue

if not testcase.get_metadata('ran_grouper'):
# Testcase should be considered by the grouper first before filing.
_emit_untriaged_testcase_age_metric(testcase)
untriaged_testcases += 1
_increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
continue

Expand Down Expand Up @@ -450,7 +475,7 @@ def main():

# A testcase is untriaged, until immediately before a bug is opened
_emit_untriaged_testcase_age_metric(testcase)
untriaged_testcases += 1
_increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING)

# File the bug first and then create filed bug metadata.
if not _file_issue(testcase, issue_tracker, throttler):
Expand All @@ -463,9 +488,7 @@ def main():
logs.info('Filed new issue %s for testcase %d.' % (testcase.bug_information,
testcase_id))

monitoring_metrics.UNTRIAGED_TESTCASE_COUNT.set(
untriaged_testcases, labels={})

_emit_untriaged_testcase_count_metric()
logs.info('Triage testcases succeeded.')
return True

Expand Down
5 changes: 4 additions & 1 deletion src/clusterfuzz/_internal/metrics/monitoring_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,10 @@
description='Number of testcases that were not yet triaged '
'(have not yet completed analyze, regression,'
' minimization, impact task), in hours.',
field_spec=[],
field_spec=[
monitor.StringField('job'),
monitor.StringField('status'),
],
)

ANALYZE_TASK_REPRODUCIBILITY = monitor.CounterMetric(
Expand Down

0 comments on commit d4b7c81

Please sign in to comment.