Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add process/cpu live metrics #34735

Merged
merged 24 commits into from
Mar 13, 2024
Merged
Show file tree
Hide file tree
Changes from 23 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

- Add live metrics collection of requests/dependencies/exceptions
([#34673](https://github.com/Azure/azure-sdk-for-python/pull/34673))
- Add live metrics collection of cpu time/process memory
([#34735](https://github.com/Azure/azure-sdk-for-python/pull/34735))

### Breaking Changes

Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from typing import Any, Iterable, Optional

import platform
from typing import Any, Optional
import psutil

from opentelemetry.metrics import CallbackOptions, Observation
from opentelemetry.sdk._logs import LogData
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.resources import Resource
Expand All @@ -13,10 +16,12 @@

from azure.monitor.opentelemetry.exporter._generated.models import ContextTagKeys
from azure.monitor.opentelemetry.exporter._quickpulse._constants import (
_COMMITTED_BYTES_NAME,
_DEPENDENCY_DURATION_NAME,
_DEPENDENCY_FAILURE_RATE_NAME,
_DEPENDENCY_RATE_NAME,
_EXCEPTION_RATE_NAME,
_PROCESSOR_TIME_NAME,
_REQUEST_DURATION_NAME,
_REQUEST_FAILURE_RATE_NAME,
_REQUEST_RATE_NAME,
Expand All @@ -43,6 +48,8 @@
)


PROCESS = psutil.Process()

def enable_live_metrics(**kwargs: Any) -> None:
"""Live metrics entry point.

Expand Down Expand Up @@ -113,6 +120,14 @@ def __init__(self, connection_string: Optional[str], resource: Optional[Resource
"exc/sec",
"live metrics exception rate per second"
)
self._process_memory_gauge = self._meter.create_observable_gauge(
_COMMITTED_BYTES_NAME[0],
[_get_process_memory],
)
self._processor_time_gauge = self._meter.create_observable_gauge(
_PROCESSOR_TIME_NAME[0],
[_get_processor_time],
)

def _record_span(self, span: ReadableSpan) -> None:
# Only record if in post state
Expand Down Expand Up @@ -150,3 +165,21 @@ def _record_log_record(self, log_data: LogData) -> None:
exc_message = log_record.attributes.get(SpanAttributes.EXCEPTION_MESSAGE)
if exc_type is not None or exc_message is not None:
self._exception_rate_counter.add(1)


# pylint: disable=unused-argument
def _get_process_memory(options: CallbackOptions) -> Iterable[Observation]:
# rss is non-swapped physical memory a process has used
yield Observation(
PROCESS.memory_info().rss,
{},
)


# pylint: disable=unused-argument
def _get_processor_time(options: CallbackOptions) -> Iterable[Observation]:
# Processor time does not include idle time
yield Observation(
lzchen marked this conversation as resolved.
Show resolved Hide resolved
100 - psutil.cpu_times_percent().idle,
{},
)
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@
"msrest>=0.6.10",
"opentelemetry-api~=1.21",
"opentelemetry-sdk~=1.21",
"psutil>=5.9.8",
],
entry_points={
"opentelemetry_traces_exporter": [
Expand Down
Original file line number Diff line number Diff line change
@@ -1,22 +1,42 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import collections
import platform
import unittest
from unittest import mock

from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics import (
Counter,
Histogram,
Meter,
MeterProvider,
ObservableGauge,
)
from opentelemetry.sdk.resources import Resource, ResourceAttributes
from opentelemetry.semconv.trace import SpanAttributes
from opentelemetry.trace import SpanKind

from azure.monitor.opentelemetry.exporter._generated.models import ContextTagKeys
from azure.monitor.opentelemetry.exporter._quickpulse._constants import (
_COMMITTED_BYTES_NAME,
_DEPENDENCY_DURATION_NAME,
_DEPENDENCY_FAILURE_RATE_NAME,
_DEPENDENCY_RATE_NAME,
_EXCEPTION_RATE_NAME,
_PROCESSOR_TIME_NAME,
_REQUEST_DURATION_NAME,
_REQUEST_FAILURE_RATE_NAME,
_REQUEST_RATE_NAME,
)
from azure.monitor.opentelemetry.exporter._quickpulse._exporter import (
_QuickpulseExporter,
_QuickpulseMetricReader,
)
from azure.monitor.opentelemetry.exporter._quickpulse._live_metrics import (
enable_live_metrics,
_get_process_memory,
_get_processor_time,
_QuickpulseManager,
)
from azure.monitor.opentelemetry.exporter._quickpulse._state import (
Expand Down Expand Up @@ -92,6 +112,28 @@ def test_init(self, generator_mock):
self.assertEqual(qpm._reader._base_monitoring_data_point, qpm._base_monitoring_data_point)
self.assertTrue(isinstance(qpm._meter_provider, MeterProvider))
self.assertEqual(qpm._meter_provider._sdk_config.metric_readers, [qpm._reader])
self.assertTrue(isinstance(qpm._meter, Meter))
self.assertEqual(qpm._meter.name, "azure_monitor_live_metrics")
self.assertTrue(isinstance(qpm._request_duration, Histogram))
self.assertEqual(qpm._request_duration.name, _REQUEST_DURATION_NAME[0])
self.assertTrue(isinstance(qpm._dependency_duration, Histogram))
self.assertEqual(qpm._dependency_duration.name, _DEPENDENCY_DURATION_NAME[0])
self.assertTrue(isinstance(qpm._request_rate_counter, Counter))
self.assertEqual(qpm._request_rate_counter.name, _REQUEST_RATE_NAME[0])
self.assertTrue(isinstance(qpm._request_failed_rate_counter, Counter))
self.assertEqual(qpm._request_failed_rate_counter.name, _REQUEST_FAILURE_RATE_NAME[0])
self.assertTrue(isinstance(qpm._dependency_rate_counter, Counter))
self.assertEqual(qpm._dependency_rate_counter.name, _DEPENDENCY_RATE_NAME[0])
self.assertTrue(isinstance(qpm._dependency_failure_rate_counter, Counter))
self.assertEqual(qpm._dependency_failure_rate_counter.name, _DEPENDENCY_FAILURE_RATE_NAME[0])
self.assertTrue(isinstance(qpm._exception_rate_counter, Counter))
self.assertEqual(qpm._exception_rate_counter.name, _EXCEPTION_RATE_NAME[0])
self.assertTrue(isinstance(qpm._process_memory_gauge, ObservableGauge))
self.assertEqual(qpm._process_memory_gauge.name, _COMMITTED_BYTES_NAME[0])
self.assertEqual(qpm._process_memory_gauge._callbacks, [_get_process_memory])
self.assertTrue(isinstance(qpm._processor_time_gauge, ObservableGauge))
self.assertEqual(qpm._processor_time_gauge.name, _PROCESSOR_TIME_NAME[0])
self.assertEqual(qpm._processor_time_gauge._callbacks, [_get_processor_time])


def test_singleton(self):
Expand Down Expand Up @@ -247,3 +289,21 @@ def test_record_log_exception(self, post_state_mock, log_doc_mock, append_doc_mo
qpm._record_log_record(log_data_mock)
append_doc_mock.assert_called_once_with(log_record_doc)
qpm._exception_rate_counter.add.assert_called_once_with(1)

def test_process_memory(self):
with mock.patch("azure.monitor.opentelemetry.exporter._quickpulse._live_metrics.PROCESS") as process_mock:
memory = collections.namedtuple('memory', 'rss')
pmem = memory(rss=40)
process_mock.memory_info.return_value = pmem
mem = _get_process_memory(None)
obs = next(mem)
self.assertEqual(obs.value, 40)

@mock.patch("psutil.cpu_times_percent")
def test_processor_time(self, processor_mock):
cpu = collections.namedtuple('cpu', 'idle')
cpu_times = cpu(idle=94.5)
processor_mock.return_value = cpu_times
time = _get_processor_time(None)
obs = next(time)
self.assertEqual(obs.value, 5.5)
Loading