Skip to content

Commit

Permalink
Adding instrumentation for bugs opened/closed (#4322)
Browse files Browse the repository at this point in the history
### Motivation

Opening and closing bugs are the final steps in the ClusterFuzz user
journey, and we currently do not have that instrumentation. This PR:

* Enables monitoring in the kubernetes environment
* Enables monitoring on cron jobs
* Moves the wrap_with_monitoring context manager from run_bot.py to
monitoring.py, so it can get reused in run_cron.py
* Collects bugs opened metrics from the triage cronjob
* Collects bugs closed metrics from the cleanup cronjob

The only relevant label for these metrics is the fuzzer name.

For bug filing, we measure how many attempts:
* Succeeded
* Failed
* Got throttled

For bug closing, we measure how many attempts:
* Succeeded
* Failed

Part of #4271
  • Loading branch information
vitorguidi authored Oct 14, 2024
1 parent 0a8754d commit 3df5dfd
Show file tree
Hide file tree
Showing 8 changed files with 107 additions and 33 deletions.
22 changes: 18 additions & 4 deletions src/clusterfuzz/_internal/cron/cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
from clusterfuzz._internal.issue_management import issue_tracker_utils
from clusterfuzz._internal.metrics import crash_stats
from clusterfuzz._internal.metrics import logs
from clusterfuzz._internal.metrics import monitoring_metrics

GENERIC_INCORRECT_COMMENT = (
'\n\nIf this is incorrect, please add the {label_text}')
Expand Down Expand Up @@ -483,10 +484,23 @@ def mark_issue_as_closed_if_testcase_is_fixed(policy, testcase, issue):
if not skip_auto_close:
issue.status = policy.status('verified')

issue.save(new_comment=comment, notify=True)
logs.info(f'Mark issue {issue.id} as verified for '
f'fixed testcase {testcase.key.id()}.')
issue_filer.notify_issue_update(testcase, 'verified')
try:
issue.save(new_comment=comment, notify=True)
logs.info(f'Mark issue {issue.id} as verified for '
f'fixed testcase {testcase.key.id()}.')
issue_filer.notify_issue_update(testcase, 'verified')
monitoring_metrics.ISSUE_CLOSING_SUCCESS.increment({
'fuzzer_name': testcase.fuzzer_name
})
except Exception as e:
logs.error(
f'Failed to mark issue {issue.id} as verified for '
f'fixed testcase {testcase.key.id()}.',
extras={'exception': e})
monitoring_metrics.ISSUE_CLOSING_FAILED.increment({
'fuzzer_name': testcase.fuzzer_name
})
raise e


def mark_unreproducible_testcase_as_fixed_if_issue_is_closed(testcase, issue):
Expand Down
10 changes: 10 additions & 0 deletions src/clusterfuzz/_internal/cron/triage.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from clusterfuzz._internal.issue_management import issue_tracker_utils
from clusterfuzz._internal.metrics import crash_stats
from clusterfuzz._internal.metrics import logs
from clusterfuzz._internal.metrics import monitoring_metrics

from . import grouper

Expand Down Expand Up @@ -261,6 +262,9 @@ def _file_issue(testcase, issue_tracker, throttler):

if throttler.should_throttle(testcase):
_add_triage_message(testcase, 'Skipping filing as it is throttled.')
monitoring_metrics.ISSUE_FILING_THROTTLED.increment({
'fuzzer_name': testcase.fuzzer_name
})
return False

if crash_analyzer.is_experimental_crash(testcase.crash_type):
Expand All @@ -273,11 +277,17 @@ def _file_issue(testcase, issue_tracker, throttler):
try:
_, file_exception = issue_filer.file_issue(testcase, issue_tracker)
filed = True
monitoring_metrics.ISSSUE_FILING_SUCCESS.increment({
'fuzzer_name': testcase.fuzzer_name
})
except Exception as e:
file_exception = e

if file_exception:
logs.error(f'Failed to file issue for testcase {testcase.key.id()}.')
monitoring_metrics.ISSUE_FILING_FAILED.increment({
'fuzzer_name': testcase.fuzzer_name
})
_add_triage_message(
testcase,
f'Failed to file issue due to exception: {str(file_exception)}')
Expand Down
11 changes: 6 additions & 5 deletions src/clusterfuzz/_internal/metrics/logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@
_default_extras = {}


def _is_running_on_k8s():
"""Returns whether or not we're running on K8s."""
# We do this here to avoid circular imports with environment.
return os.getenv('IS_K8S_ENV') == 'true'


def _increment_error_count():
""""Increment the error count metric."""
if _is_running_on_k8s():
Expand Down Expand Up @@ -60,11 +66,6 @@ def _is_running_on_app_engine():
os.getenv('SERVER_SOFTWARE').startswith('Google App Engine/')))


def _is_running_on_k8s():
"""Returns whether or not we're running on K8s."""
return os.getenv('IS_K8S_ENV') == 'true'


def _console_logging_enabled():
"""Return bool on where console logging is enabled, usually for tests."""
return bool(os.getenv('LOG_TO_CONSOLE'))
Expand Down
31 changes: 27 additions & 4 deletions src/clusterfuzz/_internal/metrics/monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,11 @@

import bisect
import collections
import contextlib
import functools
import itertools
import re
import signal
import threading
import time
from typing import List
Expand Down Expand Up @@ -132,6 +134,23 @@ def _flush_metrics():
logs.error(f'Failed to flush metrics: {e}')


def handle_sigterm(signo, stack_frame):  #pylint: disable=unused-argument
  """SIGTERM handler: stops the monitoring daemon so pending metrics flush.

  Args:
    signo: Signal number (unused; required by the signal handler protocol).
    stack_frame: Current stack frame (unused; required by the protocol).
  """
  logs.info('Handling sigterm, stopping monitoring daemon.')
  # stop() shuts down the flushing daemon, which flushes buffered metrics
  # before the process is terminated (e.g. on pod preemption).
  stop()
  logs.info('Sigterm handled, metrics flushed.')


@contextlib.contextmanager
def wrap_with_monitoring():
  """Context manager that starts monitoring and flushes metrics on exit.

  Starts the monitoring daemon, installs a SIGTERM handler so an external
  termination still flushes metrics, then yields to the wrapped body.
  """
  try:
    initialize()
    signal.signal(signal.SIGTERM, handle_sigterm)
    yield
  finally:
    # Always stop the daemon so buffered metrics are flushed, even when
    # initialize() or the wrapped body raised.
    stop()


class _MonitoringDaemon():
"""Wrapper for the daemon threads responsible for flushing metrics."""

Expand Down Expand Up @@ -294,9 +313,9 @@ def monitoring_v3_metric(self, metric, labels=None):
for key, value in labels.items():
metric.labels[key] = str(value)

# Default labels.
bot_name = environment.get_value('BOT_NAME')
metric.labels['region'] = _get_region(bot_name)
if not environment.is_running_on_k8s():
bot_name = environment.get_value('BOT_NAME', None)
metric.labels['region'] = _get_region(bot_name)

return metric

Expand Down Expand Up @@ -548,7 +567,11 @@ def _initialize_monitored_resource():
_monitored_resource.labels['project_id'] = utils.get_application_id()

# Use bot name here instance as that's more useful to us.
_monitored_resource.labels['instance_id'] = environment.get_value('BOT_NAME')
if environment.is_running_on_k8s():
instance_name = environment.get_value('HOSTNAME')
else:
instance_name = environment.get_value('BOT_NAME')
_monitored_resource.labels['instance_id'] = instance_name

if compute_metadata.is_gce():
# Returned in the form projects/{id}/zones/{zone}
Expand Down
38 changes: 38 additions & 0 deletions src/clusterfuzz/_internal/metrics/monitoring_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,3 +200,41 @@
monitor.StringField('platform'),
],
)

# Metrics related to issue lifecycle

# NOTE: the name below is typo'd ("ISSSUE") but is referenced by callers
# (e.g. triage.py); it is kept for backward compatibility, with a
# correctly-spelled alias defined right after it.
ISSSUE_FILING_SUCCESS = monitor.CounterMetric(
    'issues/filing/success',
    description='Bugs opened through triage task.',
    field_spec=[
        monitor.StringField('fuzzer_name'),
    ])

# Correctly-spelled alias; new callers should prefer this name.
ISSUE_FILING_SUCCESS = ISSSUE_FILING_SUCCESS

ISSUE_FILING_THROTTLED = monitor.CounterMetric(
    'issues/filing/throttled',
    description='Bug creation attempts throttled during triage task.',
    field_spec=[
        # Only fuzzer_name is supplied by callers; the stray empty
        # StringField('') has been removed.
        monitor.StringField('fuzzer_name'),
    ])

ISSUE_FILING_FAILED = monitor.CounterMetric(
    # Fixed: previously duplicated 'issues/filing/throttled', which would
    # collide with ISSUE_FILING_THROTTLED's metric name.
    'issues/filing/failed',
    description='Bugs that failed to be opened through triage task.',
    field_spec=[
        monitor.StringField('fuzzer_name'),
    ])

ISSUE_CLOSING_SUCCESS = monitor.CounterMetric(
    'issues/closing/success',
    description='Bugs closed during cleanup task.',
    field_spec=[
        monitor.StringField('fuzzer_name'),
    ])

ISSUE_CLOSING_FAILED = monitor.CounterMetric(
    'issues/closing/failed',
    description='Bugs failed to be closed through cleanup task.',
    field_spec=[
        monitor.StringField('fuzzer_name'),
    ])
5 changes: 5 additions & 0 deletions src/clusterfuzz/_internal/system/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -760,6 +760,11 @@ def parse_environment_definition(environment_string):
return values


def is_running_on_k8s():
  """Return True when running inside the Kubernetes (K8s) environment."""
  # The K8s deployment sets IS_K8S_ENV to the literal string 'true'.
  flag_value = os.getenv('IS_K8S_ENV')
  return flag_value == 'true'


def base_platform(override):
  """Return the platform portion of an override of the form 'base[:rest]'."""
  # partition() splits on the first ':' only; the head is the base platform.
  base, _, _ = override.partition(':')
  return base
Expand Down
20 changes: 1 addition & 19 deletions src/python/bot/startup/run_bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
import contextlib
import multiprocessing
import os
import signal
import sys
import time
import traceback
Expand Down Expand Up @@ -85,23 +84,6 @@ def lease_all_tasks(task_list):
yield


def handle_sigterm(signo, stack_frame):  #pylint: disable=unused-argument
  """SIGTERM handler: stops the monitoring daemon so pending metrics flush.

  Args:
    signo: Signal number (unused; required by the signal handler protocol).
    stack_frame: Current stack frame (unused; required by the protocol).
  """
  logs.info('Handling sigterm, stopping monitoring daemon.')
  # Stopping the monitor flushes buffered metrics before process exit.
  monitor.stop()
  logs.info('Sigterm handled, metrics flushed.')


@contextlib.contextmanager
def wrap_with_monitoring():
  """Context manager that starts monitoring and flushes metrics on exit.

  Initializes the monitoring daemon and installs a SIGTERM handler so an
  external termination still flushes metrics, then yields to the body.
  """
  try:
    monitor.initialize()
    signal.signal(signal.SIGTERM, handle_sigterm)
    yield
  finally:
    # Always stop the daemon so buffered metrics are flushed, even when
    # initialization or the wrapped body raised.
    monitor.stop()


def schedule_utask_mains():
"""Schedules utask_mains from preprocessed utasks on Google Cloud Batch."""
from clusterfuzz._internal.google_cloud_utils import batch
Expand Down Expand Up @@ -262,7 +244,7 @@ def main():
multiprocessing.set_start_method('spawn')

try:
with wrap_with_monitoring(), ndb_init.context():
with monitor.wrap_with_monitoring(), ndb_init.context():
main()
exit_code = 0
except Exception:
Expand Down
3 changes: 2 additions & 1 deletion src/python/bot/startup/run_cron.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from clusterfuzz._internal.config import local_config
from clusterfuzz._internal.datastore import ndb_init
from clusterfuzz._internal.metrics import logs
from clusterfuzz._internal.metrics import monitor
from clusterfuzz._internal.system import environment


Expand Down Expand Up @@ -58,7 +59,7 @@ def main():
task = sys.argv[1]

task_module_name = f'clusterfuzz._internal.cron.{task}'
with ndb_init.context():
with monitor.wrap_with_monitoring(), ndb_init.context():
task_module = importlib.import_module(task_module_name)
return 0 if task_module.main() else 1

Expand Down

0 comments on commit 3df5dfd

Please sign in to comment.