Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Runtime instrumentation #207

Merged
merged 20 commits into from
Mar 3, 2022
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions .github/workflows/package-Instrumentation.Runtime.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
name: Pack OpenTelemetry.Contrib.Instrumentation.Runtime

on:
workflow_dispatch:
inputs:
logLevel:
description: 'Log level'
required: true
default: 'warning'
push:
tags:
- 'Instrumentation.Runtime-*' # trigger when we create a tag with prefix "Instrumentation.Runtime-"

jobs:
build-test-pack:
runs-on: ${{ matrix.os }}
env:
PROJECT: OpenTelemetry.Contrib.Instrumentation.Runtime

strategy:
matrix:
os: [windows-latest]

steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0 # fetching all

- name: Install dependencies
run: dotnet restore

- name: dotnet build ${{env.PROJECT}}
run: dotnet build src/${{env.PROJECT}} --configuration Release --no-restore -p:Deterministic=true

- name: dotnet test ${{env.PROJECT}}
run: dotnet test test/${{env.PROJECT}}.Tests

- name: dotnet pack ${{env.PROJECT}}
run: dotnet pack src/${{env.PROJECT}} --configuration Release --no-build

- name: Publish Artifacts
uses: actions/upload-artifact@v2
with:
name: ${{env.PROJECT}}-packages
path: '**/${{env.PROJECT}}/bin/**/*.*nupkg'

- name: Publish Nuget
run: |
nuget push **/${{env.PROJECT}}/bin/**/*.nupkg -Source https://api.nuget.org/v3/index.json -ApiKey ${{ secrets.NUGET_TOKEN }} -SymbolApiKey ${{ secrets.NUGET_TOKEN }}
2 changes: 1 addition & 1 deletion .github/workflows/pr_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:
fail-fast: false # ensures the entire test matrix is run, even if one permutation fails
matrix:
os: [windows-latest,ubuntu-latest]
version: [net461,netcoreapp3.1,net5.0]
version: [net461,netcoreapp3.1,net5.0,net6.0]
exclude:
- os: ubuntu-latest
version: net461
Expand Down
18 changes: 16 additions & 2 deletions opentelemetry-dotnet-contrib.sln
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,10 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "OpenTelemetry.Contrib.Exten
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "OpenTelemetry.Contrib.Extensions.PersistentStorage.Tests", "test\OpenTelemetry.Contrib.Extensions.PersistentStorage.Tests\OpenTelemetry.Contrib.Extensions.PersistentStorage.Tests.csproj", "{72EBA81D-2933-417C-8F21-D4CFFD72F530}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "OpenTelemetry.Contrib.Instrumentation.Runtime", "src\OpenTelemetry.Contrib.Instrumentation.Runtime\OpenTelemetry.Contrib.Instrumentation.Runtime.csproj", "{F01E8C75-2791-4DBE-BD7A-5510871EBF56}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "OpenTelemetry.Contrib.Instrumentation.Runtime.Tests", "test\OpenTelemetry.Contrib.Instrumentation.Runtime.Tests\OpenTelemetry.Contrib.Instrumentation.Runtime.Tests.csproj", "{FB907DF7-F3F3-4A07-885D-E5FECAE36BDA}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -305,7 +309,15 @@ Global
{72EBA81D-2933-417C-8F21-D4CFFD72F530}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{72EBA81D-2933-417C-8F21-D4CFFD72F530}.Debug|Any CPU.Build.0 = Debug|Any CPU
{72EBA81D-2933-417C-8F21-D4CFFD72F530}.Release|Any CPU.ActiveCfg = Release|Any CPU
{72EBA81D-2933-417C-8F21-D4CFFD72F530}.Release|Any CPU.Build.0 = Release|Any CPU
{72EBA81D-2933-417C-8F21-D4CFFD72F530}.Release|Any CPU.Build.0 = Release|Any CPU
{F01E8C75-2791-4DBE-BD7A-5510871EBF56}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{F01E8C75-2791-4DBE-BD7A-5510871EBF56}.Debug|Any CPU.Build.0 = Debug|Any CPU
{F01E8C75-2791-4DBE-BD7A-5510871EBF56}.Release|Any CPU.ActiveCfg = Release|Any CPU
{F01E8C75-2791-4DBE-BD7A-5510871EBF56}.Release|Any CPU.Build.0 = Release|Any CPU
{FB907DF7-F3F3-4A07-885D-E5FECAE36BDA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{FB907DF7-F3F3-4A07-885D-E5FECAE36BDA}.Debug|Any CPU.Build.0 = Debug|Any CPU
{FB907DF7-F3F3-4A07-885D-E5FECAE36BDA}.Release|Any CPU.ActiveCfg = Release|Any CPU
{FB907DF7-F3F3-4A07-885D-E5FECAE36BDA}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down Expand Up @@ -352,7 +364,9 @@ Global
{6B3AA3F2-89A7-433F-918A-1E5E6AAF8423} = {8D11A34C-D0EF-4DE1-8230-32168E67044D}
{D52558C8-B7BF-4F59-A0FA-9AA629E68012} = {2097345F-4DD3-477D-BC54-A922F9B2B402}
{433C59A3-D535-421E-BA7F-9BCE0D4A3D25} = {22DF5DC0-1290-4E83-A9D8-6BB7DE3B3E63}
{72EBA81D-2933-417C-8F21-D4CFFD72F530} = {2097345F-4DD3-477D-BC54-A922F9B2B402}
{72EBA81D-2933-417C-8F21-D4CFFD72F530} = {2097345F-4DD3-477D-BC54-A922F9B2B402}
{F01E8C75-2791-4DBE-BD7A-5510871EBF56} = {22DF5DC0-1290-4E83-A9D8-6BB7DE3B3E63}
{FB907DF7-F3F3-4A07-885D-E5FECAE36BDA} = {2097345F-4DD3-477D-BC54-A922F9B2B402}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {B0816796-CDB3-47D7-8C3C-946434DE3B66}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// <copyright file="AssemblyInfo.cs" company="OpenTelemetry Authors">
// Copyright The OpenTelemetry Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// </copyright>

using System.Runtime.CompilerServices;

#if SIGNED
[assembly: InternalsVisibleTo("OpenTelemetry.Contrib.Instrumentation.Runtime.Tests, PublicKey=002400000480000094000000060200000024000052534131000400000100010051c1562a090fb0c9f391012a32198b5e5d9a60e9b80fa2d7b434c9e5ccb7259bd606e66f9660676afc6692b8cdc6793d190904551d2103b7b22fa636dcbb8208839785ba402ea08fc00c8f1500ccef28bbf599aa64ffb1e1d5dc1bf3420a3777badfe697856e9d52070a50c3ea5821c80bef17ca3acffa28f89dd413f096f898")]
#else
[assembly: InternalsVisibleTo("OpenTelemetry.Contrib.Instrumentation.Runtime.Tests")]
#endif
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// <copyright file="AssembliesInstrumentation.cs" company="OpenTelemetry Authors">
// Copyright The OpenTelemetry Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// </copyright>

using System;
using System.Diagnostics.Metrics;

namespace OpenTelemetry.Contrib.Instrumentation.Runtime.Implementation
{
internal class AssembliesInstrumentation : IRuntimeInstrumentation
{
private readonly Meter meter;
private readonly ObservableCounter<int> assemblyCounter;

public AssembliesInstrumentation(Meter meter)
{
this.meter = meter;
cijothomas marked this conversation as resolved.
Show resolved Hide resolved
this.assemblyCounter = meter.CreateObservableCounter("assembly-count", () => AppDomain.CurrentDomain.GetAssemblies().Length, description: "Number of Assemblies Loaded");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the names for the metrics in this PR should be prefixed with the process.runtime.dotnet. namespace. In this case something like process.runtime.dotnet.assembly_count (using an _ rather than - since I think that's more commonly what I've seen in other semantic conventions).

The spec has some general guidance here for runtime metrics: https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/metrics/semantic_conventions/runtime-environment-metrics.md#runtime-environment-specific-metrics---processruntimeenvironment

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you. Names are now prefixed.

}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
// <copyright file="EventCounterListener.cs" company="OpenTelemetry Authors">
// Copyright The OpenTelemetry Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// </copyright>

using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Diagnostics.Tracing;

namespace OpenTelemetry.Contrib.Instrumentation.Runtime.Implementation
{
/// <summary>
/// EventCounterListener that subscribes to EventSource Events.
/// </summary>
internal class EventCounterListener : EventListener
{
private const string EVENTSOURCENAME = "System.Runtime";

private readonly ConcurrentQueue<EventSource> allEventSourcesCreated = new ConcurrentQueue<EventSource>();
private readonly Dictionary<string, string> refreshIntervalDictionary;
private readonly bool isInitialized = false;

// EventSourceNames from which counters are to be collected are the keys for this IDictionary.
// The value will be the corresponding ICollection of counter names.
private readonly IDictionary<string, ICollection<string>> countersToCollect = new Dictionary<string, ICollection<string>>();
private readonly IEventCounterStore eventCounterStore;

/// <summary>
/// Initializes a new instance of the <see cref="EventCounterListener"/> class.
/// </summary>
/// <param name="eventCounterStore">Instance to store the data to.</param>
public EventCounterListener(IEventCounterStore eventCounterStore)
{
this.eventCounterStore = eventCounterStore ?? throw new ArgumentNullException(nameof(eventCounterStore));

try
{
this.refreshIntervalDictionary = new Dictionary<string, string>();
this.refreshIntervalDictionary.Add("EventCounterIntervalSec", "1");

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@twenzel this is doing exactly same as #177 as far as "configure the polling schedule" - except its hardcoded to 1 second. This PR is making this specific to runtime metrics as opposed to any available event counters including what other libraries/ customers might already have in place. @reyang is this what you had in mind for #207 - to restrict to runtime metrics?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This PR is dedicated to the runtime metrics and is using the API directly to retrieve the values.
Unfortunately sometimes the API is internal, so we've to fallback to the EventCounter.
The advantage of a this instrumentation is that it can use the direct API as soon as the runtime make it public. It will perform better as retrieving the values from event counters with less overhead.
The EventCounter instrumentation from #177 is a general purpose implementation to retrieve any existing EventCounter.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think ideally we want this "Runtime Instrumentation" to use the OpenTelemetry Metrics API rather than relying on the EventCounter. The main reason is efficiency and accuracy - especially considering Prometheus scenario that using an asynchronous instrument will make sure we only invoke the callback when Prometheus is scaping metrics, rather than having to poll the EventCounter based on some predetermined schedule.

If there are cases where we cannot use OpenTelemetry Metrics API (e.g. there is no proper API - such as ObservableUpDownCounter) or we cannot access the internal functionality of .NET runtime (e.g. if a certain EventCounter is implemented by calling some non-public API), we should discuss and see if we can find better approaches.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately sometimes the API is internal, so we've to fallback to the EventCounter.

I suggest that we call out the list of APIs, and we should work with the .NET Runtime team to see if it makes sense to make these public.

Copy link
Member

@reyang reyang Feb 24, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@hananiel the goal for #204 is to have a pure OpenTelemetry Metrics API based instrumentation library for .NET runtime - that does not rely on EventCounter at all (as a consequence, provides better efficiency, sampling accuracy and probably better mode - e.g. different gen of GC could be modeled as metric dimensions/attributes rather than each modeled as a separate metric).

Copy link
Member

@reyang reyang Feb 28, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@reyang These are the APIs that are internal and should be made public:

  • GC.GetLastGCPercentTimeInGC()
  • GC.GetGenerationSize() // for generation 0-4
  • System.Reflection.Assembly.GetAssemblyCount()
  • Exception.GetExceptionCount()
  • RuntimeEventSourceHelper.GetCpuUsage()

Should I make PRs or are you discuss these requests internally?

@cijothomas and I had a discussion with the .NET runtime team (@maryamariyan, @noahfalk and @tarekgh), here goes a quick summary:

  1. Some of the methods (e.g. RuntimeEventSourceHelper.GetCpuUsage()) were introduced into the runtime to reduce dependencies, there is likely going to be public APIs in other .NET runtime assemblies that meet the requirements here.
  2. Some methods might have no good alternatives that are public. From .NET perspective, @noahfalk can help to see what's the direction - e.g. we might make certain API public in the future version of runtime, or design some public APIs that provide the same or even more feature.
  3. Given this OpenTelemetry package will have to work with existing versions of the runtime, we need use reflection (probably in combination with some conditional compilation or runtime detection), for the functions that we took reflection approach here, .NET team could add comment in the source code to avoid breaking OpenTelemetry accidentally - before we can fully move to public APIs.
  4. All of the above doesn't need to block this PR, we can address these 5 missing instruments in separate PRs.

@noahfalk to share more insights.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • GC.GetGenerationSize() could be replaced with GC.GetGCMemoryInfo().GenerationInfo[i].SizeAfterBytes

  • GetCpuUsage() is a bit messy design-wise because the computation depends on how long it has been since the API was last called. It works OK if there is only one consumer of the counter, but imagine what happens if there are two exporters and each of them invokes the callback. If we imagine each of them is polling on the same interval (lets say once per minute), then one of them will be reporting the average CPU usage over the last minute and the other one will be reporting the average CPU usage over the last ~1ms since the previous exporter invoked the callback. That tiny sampling window could give very skewed results. We might need to redefine the metric or reconsider how it should be computed. The underlying calculation should be something like:
    At the beginning and end of an interval evaluate System.Diagnostics.Process.GetCurrentProcess().TotalProcessorTime, subtract the end from the beginning, and divide by (time_interval * Environment.ProcessorCount).

For the other three APIs I created a runtime issue to track adding them: dotnet/runtime#65989

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@reyang @cijothomas - Talking with Maoni about the percent-time-in-gc counter a similar issue arises that I see with the cpu % counter. It seems like what we'd want to do is represent these things as ObservableCounters which measure the number of seconds where something was true:
CPU = seconds/interval where CPU was in use
GC = seconds/interval where GC had all threads paused

Copy link
Member

@reyang reyang Mar 1, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Take an example to make sure I understand the proposal:

Time since program started Total time when GC had all threads paused
1.5 seconds 0.1 second
6.5 seconds 1.1 second
16.5 seconds 2.1 second
  • T0: 0s
  • T1: 1.5s
  • T2: 6.5s
  • T3: 16.5s

Option 1:

  • ObservableCounter("process.runtime.dotnet.gc.time")
  • at T1, record 0.1 // (T0, T1]
  • at T2, record 1.1 // (T0, T2]
  • at T3, record 2.1 // (T0, T3]

The consumer can configure the exporter to use delta temporality, so proper rate can be calculated downstream.

Option 2:

  • at T1, record 0.067 // 0.1 / 1.5
  • at T2, record 0.2 // (1.1 - 0.1) / (6.5 - 1.5)
  • at T3, record 0.1 // (2.1 - 1.1) / (16.5 - 6.5)

It seems Option 2 won't work, as the data being recorded won't fit any existing instrument (unless we want to pick something like async Gauge).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep exactly, option 1 is what I was suggesting we should start doing and option 2 is the problematic path that the cpu-usage EventCounter is doing right now.

Option (2) works ok with an ObservableGauge only if there is guaranteed exactly one observer/exporter, allowing the callback to infer the interval by timestamping when it was last called. As soon as there is a 2nd observer then the callback can't distinguish if the call at T2 is a new observer trying to measure the interval [0, T2] or the same observer that called at T1 now trying to measure [T1, T2].

At risk of getting a little into the weeds, the current runtime implementation of GetLastGCPercentTimeInGC() doesn't capture a timestamp of when it was last called, but the way it avoids doing that produces a measurement people are likely to find confusing. In your example above, imagine that the various GCs occured at:
GC 1: 1.0-1.2 (0.1 sec paused)
GC 2: 2.5-3.9 (0.9 sec paused)
GC 3: 4.0-4.1 (0.1 sec paused)
GC 4: 15.0-16.2 (1 sec paused)
You can see GCs can also do some work while the threads aren't paused. Adding up the pause times gives the same 0.1, 1.1 and 2.1 measurements as above when observed at T=1.5, 6.5, and 16.5.
If you measure at some time T, GetLastGCPercentTimeInGC() finds the most recent GC X that finished before time T and calculates GC_x_pause/ ( GC_x_end - GC_x-1_end). For example if you query at T=6.5 it determines GC 3 was the most recent completed GC before that point which ended at 4.1. Before that GC 2 ended at 3.9. That gives 0.1 pause time/(4.1-3.9) = 50% time in GC. Users were probably hoping to see (1.1-0.1)/(6.5-1.5)=20% time in GC and will be surprised not to see that. So the current GC stats don't suffer non-determinism when subjected to multiple observers, but only by using a surprising calculation that isn't necessarily a good approximation of what users really wanted.


RuntimeInstrumentationEventSource.Log.EventListenerInitializeSuccess();
this.isInitialized = true;

// Go over every EventSource created before we finished initialization, and enable if required.
// This will take care of all EventSources created before initialization was done.
foreach (var eventSource in this.allEventSourcesCreated)
{
this.EnableEventSource(eventSource);
}
}
catch (Exception ex)
{
RuntimeInstrumentationEventSource.Log.EventListenerError("EventCounterListener Constructor", ex.Message);
}
}

/// <summary>
/// Processes a new EventSource event.
/// </summary>
/// <param name="eventData">Event to process.</param>
protected override void OnEventWritten(EventWrittenEventArgs eventData)
{
// Ignore events if initialization not done yet. We may lose the 1st event if it happens before initialization, in multi-thread situations.
// Since these are counters, losing the 1st event will not have noticeable impact.
if (!this.isInitialized)
{
return;
}

try
{
if (eventData.EventSource.Name == EVENTSOURCENAME)
{
if (eventData.Payload.Count > 0 && eventData.Payload[0] is IDictionary<string, object> eventPayload)
{
this.ExtractValue(eventPayload);
}
else
{
RuntimeInstrumentationEventSource.Log.IgnoreEventWrittenAsEventPayloadNotParseable(eventData.EventName);
}
}
}
catch (Exception ex)
{
RuntimeInstrumentationEventSource.Log.ErrorEventCounter(eventData.EventName, ex.ToString());
}
}

/// <inheritdoc/>
protected override void OnEventSourceCreated(EventSource eventSource)
{
// Keeping track of all EventSources here, as this call may happen before initialization.
this.allEventSourcesCreated.Enqueue(eventSource);

// options might be null when this method gets called
// before our constructor was completed
if (this.isInitialized)
{
this.EnableEventSource(eventSource);
}
}

private void EnableEventSource(EventSource eventSource)
{
try
{
if (eventSource.Name == EVENTSOURCENAME)
{
this.EnableEvents(eventSource, EventLevel.Verbose, EventKeywords.All, this.refreshIntervalDictionary);
}
}
catch (Exception ex)
{
RuntimeInstrumentationEventSource.Log.EventListenerError(eventSource.Name, ex.Message);
}
}

private void ExtractValue(IDictionary<string, object> eventPayload)
{
try
{
if (eventPayload.TryGetValue("Name", out var p) && p is string counterName)
{
if (!this.eventCounterStore.HasSubscription(counterName))
{
RuntimeInstrumentationEventSource.Log.IgnoreEventWrittenAsCounterNotInConfiguredList(counterName);
return;
}

this.eventCounterStore.WriteValue(counterName, eventPayload);
}
}
catch (Exception ex)
{
RuntimeInstrumentationEventSource.Log.EventCountersInstrumentationWarning(ex.Message);
}
}
}
}
Loading