From f831c6dc997913188afc6a9bccb6291497c82a0e Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Thu, 8 Apr 2021 11:56:38 -0700 Subject: [PATCH 01/20] Updated FO version for sample plugin. etwLogger is now a static type. --- .../Utilities/Logger.cs | 2 +- SampleObserverPlugin/SampleNewObserver.cs | 447 +++++++++++++----- .../SampleObserverPlugin.csproj | 2 +- 3 files changed, 337 insertions(+), 114 deletions(-) diff --git a/FabricObserver.Extensibility/Utilities/Logger.cs b/FabricObserver.Extensibility/Utilities/Logger.cs index 7116b4c3..e81f7d3c 100644 --- a/FabricObserver.Extensibility/Utilities/Logger.cs +++ b/FabricObserver.Extensibility/Utilities/Logger.cs @@ -22,7 +22,7 @@ namespace FabricObserver.Observers.Utilities public sealed class Logger : IObserverLogger { private const int Retries = 5; - private EventSource etwLogger = null; + private static EventSource etwLogger = null; // Text file logger for observers - info/warn/error. private ILogger OLogger diff --git a/SampleObserverPlugin/SampleNewObserver.cs b/SampleObserverPlugin/SampleNewObserver.cs index 4ce49a17..1f4dca8b 100644 --- a/SampleObserverPlugin/SampleNewObserver.cs +++ b/SampleObserverPlugin/SampleNewObserver.cs @@ -4,163 +4,386 @@ // ------------------------------------------------------------ using System; +using System.Collections.Generic; using System.Diagnostics; -using System.Fabric; using System.Fabric.Health; -using System.Linq; using System.Text; using System.Threading; using System.Threading.Tasks; using FabricObserver.Observers.Utilities; -using FabricObserver.Observers.Utilities.Telemetry; +using System.Linq; +using System.Fabric; namespace FabricObserver.Observers { - public class SampleNewObserver : ObserverBase + /// + /// Over all idea of how ASCRPObserver works: + /// It scans for the service instance of serviceType "CapsServiceType" under each ASCRP applicationType in the ObserverAsync function + /// After the scanning is done we use reportAsync() to report to SFX and Generate 
ETW traces to our logs + /// + + public class ASCRPObserver : ObserverBase { - private readonly StringBuilder message; - public SampleNewObserver(FabricClient fabricClient, StatelessServiceContext context) + private readonly StringBuilder message; // The message that will go into our ETW traces + private readonly IDictionary numberOfAscServiceInstancesRecorded = new Dictionary(); // This Dict is used to hold the number of services from the previous scan + private readonly int mintuesToWarn = 10; //Minutes to put the warning on the SFX + private readonly string appName = "fabric:/Ascrp"; //ApplicationName + private readonly string appType = "ASCRP"; //ApplicationType + private readonly string serviceName = "RPP_CapsServiceType_"; //ServiceName + private readonly string serviceType = "CapsServiceType";//ServiceType + private static int num_of_Instances_To_Warn; // This is the number of instances variable that will tell us after which number we want to send an alert + private bool serviceInstancesUpdated; //Flag variable that indicates if the serviceInstances got increased in a scan + private bool should_warn = false; //Another flag variable that indicates if the warning has to be pushed + private int totalNumberofAscServices = 0; //Temp variable that will hold the value of services in every loop + private const int warn_Threshold_lessThanOrEqual2 = 75; + private const int warn_Threshold_greaterThan2 = 30; + + public ASCRPObserver(FabricClient fabricClient, StatelessServiceContext context) : base(fabricClient, context) { - message = new StringBuilder(); + this.message = new StringBuilder(); + serviceInstancesUpdated = false; + // Key is our ServiceType which is "CapsServiceType" and 2 because I think there will be a minimum of two instances under each app on SF + numberOfAscServiceInstancesRecorded.Add(serviceType, 0); } + /// + /// ObserveAsync(): How it works: + /// Mainly counts the serviceInstances under each ASCRP application in each scan + /// Proactively checks 
with the recorded value in our dict, if its increased we trigger the flag variable serviceInstancesUpdated to true + /// if that number crosses our alert threshold (num_of_Instances_To_Warn) we switch the should_warn variable to true + /// All this is reported to the ReportAsync() to take action on the observations. + /// public override async Task ObserveAsync(CancellationToken token) { - // If set, this observer will only run during the supplied interval. - // See Settings.xml, CertificateObserverConfiguration section, RunInterval parameter for an example. - if (RunInterval > TimeSpan.MinValue - && DateTime.Now.Subtract(LastRunDateTime) < RunInterval) - { - return; - } - - Stopwatch stopwatch = Stopwatch.StartNew(); - int totalNumberOfDeployedSFApps = 0, totalNumberOfDeployedServices = 0, totalNumberOfPartitions = 0, totalNumberOfReplicas = 0; - int appsInWarningError = 0, servicesInWarningError = 0, partitionsInWarningError = 0, replicasInWarningError = 0; - - var apps = await FabricClientInstance.QueryManager.GetApplicationListAsync( - null, - AsyncClusterOperationTimeoutSeconds, - token).ConfigureAwait(false); - - totalNumberOfDeployedSFApps = apps.Count; - appsInWarningError = apps.Where(a => a.HealthState == HealthState.Warning || a.HealthState == HealthState.Error).Count(); - - foreach (var app in apps) + this.message.AppendLine($"ASCRPObserver: Entering observerAsync: updated 1106"); + try { - var services = await FabricClientInstance.QueryManager.GetServiceListAsync( - app.ApplicationName, + if (RunInterval > TimeSpan.MinValue && DateTime.Now.Subtract(LastRunDateTime) < RunInterval) + { + return; + } + Stopwatch stopwatch = Stopwatch.StartNew(); + var apps = await FabricClientInstance.QueryManager.GetApplicationListAsync( null, AsyncClusterOperationTimeoutSeconds, token).ConfigureAwait(false); - - totalNumberOfDeployedServices += services.Count; - servicesInWarningError += services.Where(s => s.HealthState == HealthState.Warning || s.HealthState == 
HealthState.Error).Count(); - - foreach (var service in services) + // This is where we decide the threshold number for the alerts + var appsOfType = apps.Where(app => app.ApplicationTypeName == appType).ToList(); + int count_appTypes = appsOfType.Count; + if (count_appTypes > 0) + { + if (count_appTypes <= 2) + { + num_of_Instances_To_Warn = warn_Threshold_lessThanOrEqual2; + } + else if (count_appTypes > 2) + { + num_of_Instances_To_Warn = warn_Threshold_greaterThan2; + } + } + this.message.AppendLine($"ASCRPObserver:Slices count: {count_appTypes} "); + this.message.AppendLine($"ASCRPObserver: Alert therhsold : {num_of_Instances_To_Warn} "); + this.message.AppendLine($"ASCRPObserver:Going to start scanning for Apps with type : {appType} "); + //Starting to scan for the services + foreach (var app in apps) { - var partitions = await FabricClientInstance.QueryManager.GetPartitionListAsync( - service.ServiceName, + string app_applicationName = app.ApplicationName.OriginalString; + bool hasAscrpApp = app_applicationName.Contains(appName); + string appTypeName = app.ApplicationTypeName; + bool hasAscrpTypeApp = appTypeName.Equals(appTypeName); + this.message.AppendLine($"ASCRPObserver: Application type name: {appTypeName}, flag contains value : {hasAscrpTypeApp} "); + if (hasAscrpTypeApp) + { + var services = await FabricClientInstance.QueryManager.GetServiceListAsync( + app.ApplicationName, null, AsyncClusterOperationTimeoutSeconds, token).ConfigureAwait(false); + foreach (var service in services) + { + string service_name = service.ServiceName.OriginalString; + bool hasAscrpServiceName = service_name.Contains(serviceName); + string serviceTypeName = service.ServiceTypeName; + bool hasSerivceType = serviceTypeName.Equals(serviceType); + this.message.AppendLine($"ASCRPObserver: Service type name: {serviceTypeName}, flag contains value : {hasAscrpTypeApp} "); + if (hasSerivceType) + { + totalNumberofAscServices += 1; + } + if (numberOfAscServiceInstancesRecorded != 
null && numberOfAscServiceInstancesRecorded.Count > 0) + { + for (int i = 0; i < numberOfAscServiceInstancesRecorded.Count; i++) + { + KeyValuePair ascrpServiceInstances = numberOfAscServiceInstancesRecorded.ElementAt(i); + this.message.AppendLine($"ASCRPObserver: Number of service instances before scan:{ascrpServiceInstances.Value}"); + if (ascrpServiceInstances.Key.Contains(serviceType)) + { + if (ascrpServiceInstances.Value < totalNumberofAscServices) + { + this.message.AppendLine($"ASCRPObserver: Looks like the service instances have increased since last scan."); + this.message.AppendLine($"ASCRPObserver: Updating the record."); + numberOfAscServiceInstancesRecorded[ascrpServiceInstances.Key] = totalNumberofAscServices; + serviceInstancesUpdated = true; + if (totalNumberofAscServices >= num_of_Instances_To_Warn) + { + this.message.AppendLine($"ASCRPObserver: Number of Services now:{totalNumberofAscServices}, Global threshold: {num_of_Instances_To_Warn}."); + this.message.AppendLine($"ASCRPObserver: Looks like we crossed the global maximum, hence initaiting warnings and hence alerts."); + should_warn = true; + } - totalNumberOfPartitions += partitions.Count; - partitionsInWarningError += partitions.Where(p => p.HealthState == HealthState.Warning || p.HealthState == HealthState.Error).Count(); + } + else + { + this.message.AppendLine($"ASCRPObserver: No change in the no of service instances since our last scan"); + serviceInstancesUpdated = false; - foreach (var partition in partitions) - { - var replicas = await FabricClientInstance.QueryManager.GetReplicaListAsync( - partition.PartitionInformation.Id, - null, - AsyncClusterOperationTimeoutSeconds, - token).ConfigureAwait(false); - - totalNumberOfReplicas += replicas.Count; - replicasInWarningError += replicas.Where(r => r.HealthState == HealthState.Warning || r.HealthState == HealthState.Error).Count(); + } + this.message.AppendLine($"ASCRPObserver: Now starting to report."); + await ReportAsync(token); + } + 
this.message.AppendLine($"ASCRPObserver: Number of service instances after scan:{ascrpServiceInstances.Value}"); + } + } + else + { + this.message.AppendLine($"ASCRPObserver: Dict is null, can't proceed."); + } + } } + //Clearing the temp variables + this.message.AppendLine($"ASCRPObserver: Clearing the temp variables totalNumberofAscServices"); + totalNumberofAscServices = 0; + serviceInstancesUpdated = false; + should_warn = false; } + stopwatch.Stop(); + RunDuration = stopwatch.Elapsed; + this.message.AppendLine($"ASCRPObserver: After loop report: total number of ASC Services: {totalNumberofAscServices}"); + this.message.AppendLine($"ASCRPObserver: Time it took to run {ObserverName}.ObserveAsync: {RunDuration}"); + await ReportAsync(token); + LastRunDateTime = DateTime.Now; } + catch (Exception e) when + (e is FabricException || + e is OperationCanceledException || + e is TaskCanceledException || + e is TimeoutException) + { + // These can happen, transiently. Ignore them. + } + catch (Exception exception) + { + string msg = $"ASCRPObserver : Report Async : Exception occured :{Environment.NewLine}{exception}"; + ObserverLogger.LogEtw( + ObserverConstants.FabricObserverETWEventName, + new + { + Code = AscFOErrorAndWarningCodes.AscrpObserverError, //This error code shows that ASCRPObserver just crashed due to some unexecpted error + Level = "Error", + Description = msg, + Source = ObserverName, + Node = NodeName, + }); + throw; - message.AppendLine($"Total number of Applications: {totalNumberOfDeployedSFApps}"); - message.AppendLine($"Total number of Applications in Warning or Error: {appsInWarningError}"); - message.AppendLine($"Total number of Services: {totalNumberOfDeployedServices}"); - message.AppendLine($"Total number of Services in Warning or Error: {servicesInWarningError}"); - message.AppendLine($"Total number of Partitions: {totalNumberOfPartitions}"); - message.AppendLine($"Total number of Partitions in Warning or Error: {partitionsInWarningError}"); - 
message.AppendLine($"Total number of Replicas: {totalNumberOfReplicas}"); - message.AppendLine($"Total number of Replicas in Warning or Error: {replicasInWarningError}"); - - // The time it took to run ObserveAsync; for use in computing HealthReport TTL. - stopwatch.Stop(); - RunDuration = stopwatch.Elapsed; - - message.AppendLine($"Time it took to run {base.ObserverName}.ObserveAsync: {RunDuration}"); + } - await ReportAsync(token); - LastRunDateTime = DateTime.Now; } + /// + /// ReportAsync() how it works + /// After the scanning is done we land here: based on the two state variables should_warn and serviceInstancesUpdated, we decide what has to be done next. + /// if serviceInstancesUpdated is switched on, we add a trace with warning code "FOASC002". + /// In addition to serviceInstancesUpdated, if should_warn is switched on, we put the SFX into warning mode for "mintuesToWarn" mintues + /// This also adds the trace with warning code "FOASC003" that will eventually trigger the monitor to the send the alert (IcM). + /// + /// + /// public override Task ReportAsync(CancellationToken token) { - // Local log. + + // Local log. This should only be used while debugging. In general, don't enable verbose logging for observer. +#if DEBUG ObserverLogger.LogInfo(message.ToString()); +#endif + try + { + /* Report to Fabric */ + var healthReporter = new ObserverHealthReporter(ObserverLogger, FabricClientInstance); + Utilities.HealthReport healthReport = null; + if (totalNumberofAscServices >= 0) + { + if (serviceInstancesUpdated) + { + if (should_warn) // Create the Warning. + { + this.message.AppendLine("The TTL for the warning is:" + mintuesToWarn.ToString()); - /* Report to Fabric */ + // Provide a SourceId and Property for use in the health event clearing. 
+ // This means the related health event will go from Warning -> Ok, with the Warning state removed from the entity + // (the fabric Node will be green in SFX, in this case since the ReportType is Node) + healthReport = new Utilities.HealthReport + { + Code = AscFOErrorAndWarningCodes.AscServiceInstanceSendAlert, + HealthMessage = this.message.ToString(), + NodeName = NodeName, + Observer = ObserverName, + Property = $"{NodeName}_ServiceInstancesUpdated", + ReportType = HealthReportType.Node, + State = HealthState.Warning, + HealthReportTimeToLive = TimeSpan.FromMinutes(mintuesToWarn), + SourceId = $"{ObserverName}({AscFOErrorAndWarningCodes.AscServiceInstanceSendAlert})", + }; - var healthReporter = new ObserverHealthReporter(ObserverLogger, FabricClientInstance); - var healthReport = new Utilities.HealthReport - { - Code = FOErrorWarningCodes.Ok, - HealthMessage = message.ToString(), - NodeName = NodeName, - Observer = ObserverName, - Property = "SomeUniquePropertyForMyHealthEvent", - ReportType = HealthReportType.Node, - State = HealthState.Ok, - }; - - healthReporter.ReportHealthToServiceFabric(healthReport); - - // Emit Telemetry - This will use whatever telemetry provider you have configured in FabricObserver Settings.xml. - var telemetryData = new TelemetryData(FabricClientInstance, Token) - { - Code = FOErrorWarningCodes.Ok, - Description = message.ToString(), - HealthState = "Ok", - NodeName = NodeName, - ObserverName = ObserverName, - Source = ObserverConstants.FabricObserverName, - }; - - if (IsTelemetryEnabled) + // This property is used to keep track of this observer's latest known warning state. + // If it was in warning from the last time it ran and then it is no longer detects the issue the next time it runs, + // then this property would be used to clear the existing warning, which requires: + // 1. Same SourceID for each event (Warning and Ok) + // 2. 
Same Property for each event (Warning and Ok) + HasActiveFabricErrorOrWarning = true; + } + else // Create a new Information Event (HealthState = Ok). + { + healthReport = new Utilities.HealthReport + { + Code = AscFOErrorAndWarningCodes.AscServiceInstanceWarningInstanceIncreased, + HealthMessage = this.message.ToString(), + NodeName = NodeName, + Observer = ObserverName, + Property = $"{NodeName}_ServiceInstancesUpdated", + ReportType = HealthReportType.Node, + State = HealthState.Ok, + SourceId = $"{ObserverName}({AscFOErrorAndWarningCodes.AscServiceInstanceWarningInstanceIncreased})", + }; + HasActiveFabricErrorOrWarning = false; + } + } + else // Clear the Warning with an Ok clear. + { + if (HasActiveFabricErrorOrWarning) + { + healthReport = new Utilities.HealthReport + { + Code = AscFOErrorAndWarningCodes.AscServiceInstanceOK, + HealthMessage = this.message.ToString(), + NodeName = NodeName, + Observer = ObserverName, + Property = $"{NodeName}_ServiceInstancesUpdated", + ReportType = HealthReportType.Node, + State = HealthState.Ok, + SourceId = $"{ObserverName}({AscFOErrorAndWarningCodes.AscServiceInstanceSendAlert})", + }; + + HasActiveFabricErrorOrWarning = false; + } + } + } + if (healthReport != null) + { + healthReporter.ReportHealthToServiceFabric(healthReport); //Reports the health state on SFX + /*Report to logs */ + string code_toBeAppended; + // ETW. EventSource tracing. 
This is very vital for triggering the FO's backend alert system + if (IsEtwEnabled) + { + if (healthReport.State == HealthState.Ok) + { + if (HasActiveFabricErrorOrWarning) + { + ObserverLogger.LogEtw( + ObserverConstants.FabricObserverETWEventName, + new + { + Code = AscFOErrorAndWarningCodes.AscServiceInstanceSendAlert, + HealthEventDescription = this.message.ToString(), + HealthState = "Warning", + NodeName, + ObserverName, + Source = ObserverConstants.FabricObserverName, + }); + } + else + { + code_toBeAppended = null; + if (serviceInstancesUpdated) + { + code_toBeAppended = AscFOErrorAndWarningCodes.AscServiceInstanceWarningInstanceIncreased; + } + else + { + code_toBeAppended = AscFOErrorAndWarningCodes.AscServiceInstanceOK; + } + ObserverLogger.LogEtw( + ObserverConstants.FabricObserverETWEventName, + new + { + Code = code_toBeAppended, + HealthEventDescription = this.message.ToString(), + HealthState = "Ok", + NodeName, + ObserverName, + Source = ObserverConstants.FabricObserverName, + }); + } + } + else + { + ObserverLogger.LogEtw( + ObserverConstants.FabricObserverETWEventName, + new + { + Code = AscFOErrorAndWarningCodes.AscServiceInstanceSendAlert, + HealthEventDescription = this.message.ToString(), + HealthState = "Warning", + NodeName, + ObserverName, + Source = ObserverConstants.FabricObserverName, + }); + } + } + } + this.message.Clear(); + return Task.CompletedTask; + } + catch (Exception e) when + (e is FabricException || + e is OperationCanceledException || + e is TaskCanceledException || + e is TimeoutException) { - _ = TelemetryClient?.ReportHealthAsync( - telemetryData, - Token); + // These can happen, transiently. Ignore them. } - - // ETW. - if (IsEtwEnabled) + catch (Exception e) { + // This will take down our observer and FO will not recreate it. We will have to redeploy FO. + //This will let us fix bugs that cause unhandled exceptions and then handle them if we can. 
+ // Leave this in place when we deploy to test and staging, so that we could fix bugs that could take down ASCRPObserver. + string msg = $"ASCRPObserver : Report Async : Exception occured :{Environment.NewLine}{e}"; ObserverLogger.LogEtw( - ObserverConstants.FabricObserverETWEventName, - new - { - Code = FOErrorWarningCodes.Ok, - HealthEventDescription = message.ToString(), - HealthState = "Ok", - NodeName, - ObserverName, - Source = ObserverConstants.FabricObserverName, - }); - } - - message.Clear(); + ObserverConstants.FabricObserverETWEventName, + new + { + Code = AscFOErrorAndWarningCodes.AscrpObserverError, //This error code shows that ASCRPObserver just crashed due to some unexecpted error + Level = "Error", + Description = msg, + Source = ObserverName, + Node = NodeName, + }); + throw; + } return Task.CompletedTask; } } + + public sealed class AscFOErrorAndWarningCodes + { + public const string AscServiceInstanceOK = "FOASC001"; + public const string AscServiceInstanceWarningInstanceIncreased = "FOASC002"; + public const string AscServiceInstanceSendAlert = "FOASC003"; + public const string AscServiceInstanceError = "FOASC005"; + public const string AscrpObserverError = "FOASC006"; + } } \ No newline at end of file diff --git a/SampleObserverPlugin/SampleObserverPlugin.csproj b/SampleObserverPlugin/SampleObserverPlugin.csproj index 7134e5d2..c8b68837 100644 --- a/SampleObserverPlugin/SampleObserverPlugin.csproj +++ b/SampleObserverPlugin/SampleObserverPlugin.csproj @@ -10,6 +10,6 @@ - + From 492efd8855ca7145fc1dc529178c07c3528faa3e Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Thu, 8 Apr 2021 12:06:26 -0700 Subject: [PATCH 02/20] SampleObs code. 
--- SampleObserverPlugin/SampleNewObserver.cs | 447 ++++++---------------- 1 file changed, 112 insertions(+), 335 deletions(-) diff --git a/SampleObserverPlugin/SampleNewObserver.cs b/SampleObserverPlugin/SampleNewObserver.cs index 1f4dca8b..1f3cbdfd 100644 --- a/SampleObserverPlugin/SampleNewObserver.cs +++ b/SampleObserverPlugin/SampleNewObserver.cs @@ -4,386 +4,163 @@ // ------------------------------------------------------------ using System; -using System.Collections.Generic; using System.Diagnostics; +using System.Fabric; using System.Fabric.Health; +using System.Linq; using System.Text; using System.Threading; using System.Threading.Tasks; using FabricObserver.Observers.Utilities; -using System.Linq; -using System.Fabric; +using FabricObserver.Observers.Utilities.Telemetry; namespace FabricObserver.Observers { - /// - /// Over all idea of how ASCRPObserver works: - /// It scans for the service instance of serviceType "CapsServiceType" under each ASCRP applicationType in the ObserverAsync function - /// After the scanning is done we use reportAsync() to report to SFX and Generate ETW traces to our logs - /// - - public class ASCRPObserver : ObserverBase + public class SampleNewObserver : ObserverBase { + private readonly StringBuilder message; - private readonly StringBuilder message; // The message that will go into our ETW traces - private readonly IDictionary numberOfAscServiceInstancesRecorded = new Dictionary(); // This Dict is used to hold the number of services from the previous scan - private readonly int mintuesToWarn = 10; //Minutes to put the warning on the SFX - private readonly string appName = "fabric:/Ascrp"; //ApplicationName - private readonly string appType = "ASCRP"; //ApplicationType - private readonly string serviceName = "RPP_CapsServiceType_"; //ServiceName - private readonly string serviceType = "CapsServiceType";//ServiceType - private static int num_of_Instances_To_Warn; // This is the number of instances variable that will tell us 
after which number we want to send an alert - private bool serviceInstancesUpdated; //Flag variable that indicates if the serviceInstances got increased in a scan - private bool should_warn = false; //Another flag variable that indicates if the warning has to be pushed - private int totalNumberofAscServices = 0; //Temp variable that will hold the value of services in every loop - private const int warn_Threshold_lessThanOrEqual2 = 75; - private const int warn_Threshold_greaterThan2 = 30; - - public ASCRPObserver(FabricClient fabricClient, StatelessServiceContext context) + public SampleNewObserver(FabricClient fabricClient, StatelessServiceContext context) : base(fabricClient, context) { - this.message = new StringBuilder(); - serviceInstancesUpdated = false; - // Key is our ServiceType which is "CapsServiceType" and 2 because I think there will be a minimum of two instances under each app on SF - numberOfAscServiceInstancesRecorded.Add(serviceType, 0); + message = new StringBuilder(); } - /// - /// ObserveAsync(): How it works: - /// Mainly counts the serviceInstances under each ASCRP application in each scan - /// Proactively checks with the recorded value in our dict, if its increased we trigger the flag variable serviceInstancesUpdated to true - /// if that number crosses our alert threshold (num_of_Instances_To_Warn) we switch the should_warn variable to true - /// All this is reported to the ReportAsync() to take action on the observations. - /// public override async Task ObserveAsync(CancellationToken token) { - this.message.AppendLine($"ASCRPObserver: Entering observerAsync: updated 1106"); - try + // If set, this observer will only run during the supplied interval. + // See Settings.xml, CertificateObserverConfiguration section, RunInterval parameter for an example. 
+ if (RunInterval > TimeSpan.MinValue + && DateTime.Now.Subtract(LastRunDateTime) < RunInterval) { - if (RunInterval > TimeSpan.MinValue && DateTime.Now.Subtract(LastRunDateTime) < RunInterval) - { - return; - } - Stopwatch stopwatch = Stopwatch.StartNew(); - var apps = await FabricClientInstance.QueryManager.GetApplicationListAsync( + return; + } + + Stopwatch stopwatch = Stopwatch.StartNew(); + int totalNumberOfDeployedSFApps = 0, totalNumberOfDeployedServices = 0, totalNumberOfPartitions = 0, totalNumberOfReplicas = 0; + int appsInWarningError = 0, servicesInWarningError = 0, partitionsInWarningError = 0, replicasInWarningError = 0; + + var apps = await FabricClientInstance.QueryManager.GetApplicationListAsync( + null, + AsyncClusterOperationTimeoutSeconds, + token).ConfigureAwait(false); + + totalNumberOfDeployedSFApps = apps.Count; + appsInWarningError = apps.Where(a => a.HealthState == HealthState.Warning || a.HealthState == HealthState.Error).Count(); + + foreach (var app in apps) + { + var services = await FabricClientInstance.QueryManager.GetServiceListAsync( + app.ApplicationName, null, AsyncClusterOperationTimeoutSeconds, token).ConfigureAwait(false); - // This is where we decide the threshold number for the alerts - var appsOfType = apps.Where(app => app.ApplicationTypeName == appType).ToList(); - int count_appTypes = appsOfType.Count; - if (count_appTypes > 0) - { - if (count_appTypes <= 2) - { - num_of_Instances_To_Warn = warn_Threshold_lessThanOrEqual2; - } - else if (count_appTypes > 2) - { - num_of_Instances_To_Warn = warn_Threshold_greaterThan2; - } - } - this.message.AppendLine($"ASCRPObserver:Slices count: {count_appTypes} "); - this.message.AppendLine($"ASCRPObserver: Alert therhsold : {num_of_Instances_To_Warn} "); - this.message.AppendLine($"ASCRPObserver:Going to start scanning for Apps with type : {appType} "); - //Starting to scan for the services - foreach (var app in apps) + + totalNumberOfDeployedServices += services.Count; + 
servicesInWarningError += services.Where(s => s.HealthState == HealthState.Warning || s.HealthState == HealthState.Error).Count(); + + foreach (var service in services) { - string app_applicationName = app.ApplicationName.OriginalString; - bool hasAscrpApp = app_applicationName.Contains(appName); - string appTypeName = app.ApplicationTypeName; - bool hasAscrpTypeApp = appTypeName.Equals(appTypeName); - this.message.AppendLine($"ASCRPObserver: Application type name: {appTypeName}, flag contains value : {hasAscrpTypeApp} "); - if (hasAscrpTypeApp) - { - var services = await FabricClientInstance.QueryManager.GetServiceListAsync( - app.ApplicationName, + var partitions = await FabricClientInstance.QueryManager.GetPartitionListAsync( + service.ServiceName, null, AsyncClusterOperationTimeoutSeconds, token).ConfigureAwait(false); - foreach (var service in services) - { - string service_name = service.ServiceName.OriginalString; - bool hasAscrpServiceName = service_name.Contains(serviceName); - string serviceTypeName = service.ServiceTypeName; - bool hasSerivceType = serviceTypeName.Equals(serviceType); - this.message.AppendLine($"ASCRPObserver: Service type name: {serviceTypeName}, flag contains value : {hasAscrpTypeApp} "); - if (hasSerivceType) - { - totalNumberofAscServices += 1; - } - if (numberOfAscServiceInstancesRecorded != null && numberOfAscServiceInstancesRecorded.Count > 0) - { - for (int i = 0; i < numberOfAscServiceInstancesRecorded.Count; i++) - { - KeyValuePair ascrpServiceInstances = numberOfAscServiceInstancesRecorded.ElementAt(i); - this.message.AppendLine($"ASCRPObserver: Number of service instances before scan:{ascrpServiceInstances.Value}"); - if (ascrpServiceInstances.Key.Contains(serviceType)) - { - if (ascrpServiceInstances.Value < totalNumberofAscServices) - { - this.message.AppendLine($"ASCRPObserver: Looks like the service instances have increased since last scan."); - this.message.AppendLine($"ASCRPObserver: Updating the record."); - 
numberOfAscServiceInstancesRecorded[ascrpServiceInstances.Key] = totalNumberofAscServices; - serviceInstancesUpdated = true; - if (totalNumberofAscServices >= num_of_Instances_To_Warn) - { - this.message.AppendLine($"ASCRPObserver: Number of Services now:{totalNumberofAscServices}, Global threshold: {num_of_Instances_To_Warn}."); - this.message.AppendLine($"ASCRPObserver: Looks like we crossed the global maximum, hence initaiting warnings and hence alerts."); - should_warn = true; - } - } - else - { - this.message.AppendLine($"ASCRPObserver: No change in the no of service instances since our last scan"); - serviceInstancesUpdated = false; + totalNumberOfPartitions += partitions.Count; + partitionsInWarningError += partitions.Where(p => p.HealthState == HealthState.Warning || p.HealthState == HealthState.Error).Count(); - } - this.message.AppendLine($"ASCRPObserver: Now starting to report."); - await ReportAsync(token); - } - this.message.AppendLine($"ASCRPObserver: Number of service instances after scan:{ascrpServiceInstances.Value}"); - } - } - else - { - this.message.AppendLine($"ASCRPObserver: Dict is null, can't proceed."); - } - } + foreach (var partition in partitions) + { + var replicas = await FabricClientInstance.QueryManager.GetReplicaListAsync( + partition.PartitionInformation.Id, + null, + AsyncClusterOperationTimeoutSeconds, + token).ConfigureAwait(false); + + totalNumberOfReplicas += replicas.Count; + replicasInWarningError += replicas.Where(r => r.HealthState == HealthState.Warning || r.HealthState == HealthState.Error).Count(); } - //Clearing the temp variables - this.message.AppendLine($"ASCRPObserver: Clearing the temp variables totalNumberofAscServices"); - totalNumberofAscServices = 0; - serviceInstancesUpdated = false; - should_warn = false; } - stopwatch.Stop(); - RunDuration = stopwatch.Elapsed; - this.message.AppendLine($"ASCRPObserver: After loop report: total number of ASC Services: {totalNumberofAscServices}"); - 
this.message.AppendLine($"ASCRPObserver: Time it took to run {ObserverName}.ObserveAsync: {RunDuration}"); - await ReportAsync(token); - LastRunDateTime = DateTime.Now; } - catch (Exception e) when - (e is FabricException || - e is OperationCanceledException || - e is TaskCanceledException || - e is TimeoutException) - { - // These can happen, transiently. Ignore them. - } - catch (Exception exception) - { - string msg = $"ASCRPObserver : Report Async : Exception occured :{Environment.NewLine}{exception}"; - ObserverLogger.LogEtw( - ObserverConstants.FabricObserverETWEventName, - new - { - Code = AscFOErrorAndWarningCodes.AscrpObserverError, //This error code shows that ASCRPObserver just crashed due to some unexecpted error - Level = "Error", - Description = msg, - Source = ObserverName, - Node = NodeName, - }); - throw; - } + message.AppendLine($"Total number of Applications: {totalNumberOfDeployedSFApps}"); + message.AppendLine($"Total number of Applications in Warning or Error: {appsInWarningError}"); + message.AppendLine($"Total number of Services: {totalNumberOfDeployedServices}"); + message.AppendLine($"Total number of Services in Warning or Error: {servicesInWarningError}"); + message.AppendLine($"Total number of Partitions: {totalNumberOfPartitions}"); + message.AppendLine($"Total number of Partitions in Warning or Error: {partitionsInWarningError}"); + message.AppendLine($"Total number of Replicas: {totalNumberOfReplicas}"); + message.AppendLine($"Total number of Replicas in Warning or Error: {replicasInWarningError}"); + + // The time it took to run ObserveAsync; for use in computing HealthReport TTL. 
+ stopwatch.Stop(); + RunDuration = stopwatch.Elapsed; + message.AppendLine($"Time it took to run {base.ObserverName}.ObserveAsync: {RunDuration}"); + + await ReportAsync(token); + LastRunDateTime = DateTime.Now; } - /// - /// ReportAsync() how it works - /// After the scanning is done we land here: based on the two state variables should_warn and serviceInstancesUpdated, we decide what has to be done next. - /// if serviceInstancesUpdated is switched on, we add a trace with warning code "FOASC002". - /// In addition to serviceInstancesUpdated, if should_warn is switched on, we put the SFX into warning mode for "mintuesToWarn" mintues - /// This also adds the trace with warning code "FOASC003" that will eventually trigger the monitor to the send the alert (IcM). - /// - /// - /// public override Task ReportAsync(CancellationToken token) { - - // Local log. This should only be used while debugging. In general, don't enable verbose logging for observer. -#if DEBUG + // Local log. ObserverLogger.LogInfo(message.ToString()); -#endif - try - { - /* Report to Fabric */ - var healthReporter = new ObserverHealthReporter(ObserverLogger, FabricClientInstance); - Utilities.HealthReport healthReport = null; - if (totalNumberofAscServices >= 0) - { - if (serviceInstancesUpdated) - { - if (should_warn) // Create the Warning. - { - this.message.AppendLine("The TTL for the warning is:" + mintuesToWarn.ToString()); - // Provide a SourceId and Property for use in the health event clearing. 
- // This means the related health event will go from Warning -> Ok, with the Warning state removed from the entity - // (the fabric Node will be green in SFX, in this case since the ReportType is Node) - healthReport = new Utilities.HealthReport - { - Code = AscFOErrorAndWarningCodes.AscServiceInstanceSendAlert, - HealthMessage = this.message.ToString(), - NodeName = NodeName, - Observer = ObserverName, - Property = $"{NodeName}_ServiceInstancesUpdated", - ReportType = HealthReportType.Node, - State = HealthState.Warning, - HealthReportTimeToLive = TimeSpan.FromMinutes(mintuesToWarn), - SourceId = $"{ObserverName}({AscFOErrorAndWarningCodes.AscServiceInstanceSendAlert})", - }; + /* Report to Fabric */ - // This property is used to keep track of this observer's latest known warning state. - // If it was in warning from the last time it ran and then it is no longer detects the issue the next time it runs, - // then this property would be used to clear the existing warning, which requires: - // 1. Same SourceID for each event (Warning and Ok) - // 2. Same Property for each event (Warning and Ok) - HasActiveFabricErrorOrWarning = true; - } - else // Create a new Information Event (HealthState = Ok). - { - healthReport = new Utilities.HealthReport - { - Code = AscFOErrorAndWarningCodes.AscServiceInstanceWarningInstanceIncreased, - HealthMessage = this.message.ToString(), - NodeName = NodeName, - Observer = ObserverName, - Property = $"{NodeName}_ServiceInstancesUpdated", - ReportType = HealthReportType.Node, - State = HealthState.Ok, - SourceId = $"{ObserverName}({AscFOErrorAndWarningCodes.AscServiceInstanceWarningInstanceIncreased})", - }; - HasActiveFabricErrorOrWarning = false; - } - } - else // Clear the Warning with an Ok clear. 
- { - if (HasActiveFabricErrorOrWarning) - { - healthReport = new Utilities.HealthReport - { - Code = AscFOErrorAndWarningCodes.AscServiceInstanceOK, - HealthMessage = this.message.ToString(), - NodeName = NodeName, - Observer = ObserverName, - Property = $"{NodeName}_ServiceInstancesUpdated", - ReportType = HealthReportType.Node, - State = HealthState.Ok, - SourceId = $"{ObserverName}({AscFOErrorAndWarningCodes.AscServiceInstanceSendAlert})", - }; - - HasActiveFabricErrorOrWarning = false; - } - } - } - if (healthReport != null) - { - healthReporter.ReportHealthToServiceFabric(healthReport); //Reports the health state on SFX - /*Report to logs */ - string code_toBeAppended; - // ETW. EventSource tracing. This is very vital for triggering the FO's backend alert system - if (IsEtwEnabled) - { - if (healthReport.State == HealthState.Ok) - { - if (HasActiveFabricErrorOrWarning) - { - ObserverLogger.LogEtw( - ObserverConstants.FabricObserverETWEventName, - new - { - Code = AscFOErrorAndWarningCodes.AscServiceInstanceSendAlert, - HealthEventDescription = this.message.ToString(), - HealthState = "Warning", - NodeName, - ObserverName, - Source = ObserverConstants.FabricObserverName, - }); - } - else - { - code_toBeAppended = null; - if (serviceInstancesUpdated) - { - code_toBeAppended = AscFOErrorAndWarningCodes.AscServiceInstanceWarningInstanceIncreased; - } - else - { - code_toBeAppended = AscFOErrorAndWarningCodes.AscServiceInstanceOK; - } - ObserverLogger.LogEtw( - ObserverConstants.FabricObserverETWEventName, - new - { - Code = code_toBeAppended, - HealthEventDescription = this.message.ToString(), - HealthState = "Ok", - NodeName, - ObserverName, - Source = ObserverConstants.FabricObserverName, - }); - } - } - else - { - ObserverLogger.LogEtw( - ObserverConstants.FabricObserverETWEventName, - new - { - Code = AscFOErrorAndWarningCodes.AscServiceInstanceSendAlert, - HealthEventDescription = this.message.ToString(), - HealthState = "Warning", - NodeName, - 
ObserverName, - Source = ObserverConstants.FabricObserverName, - }); - } - } - } - this.message.Clear(); - return Task.CompletedTask; - } - catch (Exception e) when - (e is FabricException || - e is OperationCanceledException || - e is TaskCanceledException || - e is TimeoutException) + var healthReporter = new ObserverHealthReporter(ObserverLogger, FabricClientInstance); + var healthReport = new Utilities.HealthReport + { + Code = FOErrorWarningCodes.Ok, + HealthMessage = message.ToString(), + NodeName = NodeName, + Observer = ObserverName, + Property = "SomeUniquePropertyForMyHealthEvent", + ReportType = HealthReportType.Node, + State = HealthState.Ok, + }; + + healthReporter.ReportHealthToServiceFabric(healthReport); + + // Emit Telemetry - This will use whatever telemetry provider you have configured in FabricObserver Settings.xml. + var telemetryData = new TelemetryData(FabricClientInstance, Token) { - // These can happen, transiently. Ignore them. + Code = FOErrorWarningCodes.Ok, + Description = message.ToString(), + HealthState = "Ok", + NodeName = NodeName, + ObserverName = ObserverName, + Source = ObserverConstants.FabricObserverName, + }; + + if (IsTelemetryEnabled) + { + _ = TelemetryClient?.ReportHealthAsync( + telemetryData, + Token); } - catch (Exception e) + + // ETW. + if (IsEtwEnabled) { - // This will take down our observer and FO will not recreate it. We will have to redeploy FO. - //This will let us fix bugs that cause unhandled exceptions and then handle them if we can. - // Leave this in place when we deploy to test and staging, so that we could fix bugs that could take down ASCRPObserver. 
- string msg = $"ASCRPObserver : Report Async : Exception occured :{Environment.NewLine}{e}"; ObserverLogger.LogEtw( - ObserverConstants.FabricObserverETWEventName, - new - { - Code = AscFOErrorAndWarningCodes.AscrpObserverError, //This error code shows that ASCRPObserver just crashed due to some unexecpted error - Level = "Error", - Description = msg, - Source = ObserverName, - Node = NodeName, - }); - - throw; + ObserverConstants.FabricObserverETWEventName, + new + { + Code = FOErrorWarningCodes.Ok, + HealthEventDescription = message.ToString(), + HealthState = "Ok", + NodeName, + ObserverName, + Source = ObserverConstants.FabricObserverName, + }); } + + message.Clear(); + return Task.CompletedTask; } } - - public sealed class AscFOErrorAndWarningCodes - { - public const string AscServiceInstanceOK = "FOASC001"; - public const string AscServiceInstanceWarningInstanceIncreased = "FOASC002"; - public const string AscServiceInstanceSendAlert = "FOASC003"; - public const string AscServiceInstanceError = "FOASC005"; - public const string AscrpObserverError = "FOASC006"; - } } \ No newline at end of file From ba560ebd2d9757abde792f4720459ee0bed62e92 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Thu, 8 Apr 2021 13:35:55 -0700 Subject: [PATCH 03/20] Updated sample setting.. 
--- FabricObserver/PackageRoot/Config/AppObserver.config.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/FabricObserver/PackageRoot/Config/AppObserver.config.json b/FabricObserver/PackageRoot/Config/AppObserver.config.json index 1e4e294a..db8fb24f 100644 --- a/FabricObserver/PackageRoot/Config/AppObserver.config.json +++ b/FabricObserver/PackageRoot/Config/AppObserver.config.json @@ -9,7 +9,7 @@ }, { "targetAppType": "SomeAppType", - "memoryWarningLimitMb": 1048, - "networkWarningEphemeralPorts": 7500 + "memoryWarningLimitMb": 500, + "networkWarningEphemeralPorts": 5000 } ] \ No newline at end of file From dabf2ee1d6fd246cdcbcfbe9932c21876b0c8c6e Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Fri, 9 Apr 2021 13:24:13 -0700 Subject: [PATCH 04/20] 3.1.9 (in process) - Telemetry updates. --- ClusterObserver/ApplicationInsights.config | 3 +- ClusterObserver/ClusterObserver.cs | 206 +++++++----------- ClusterObserver/ClusterObserverManager.cs | 106 +++++---- .../PackageRoot/Config/Settings.xml | 13 +- .../Telemetry/AppInsightsTelemetry.cs | 103 ++++----- FabricObserver.Extensibility/ObserverBase.cs | 8 +- .../Utilities/DataTableFileLogger.cs | 3 +- .../Utilities/Logger.cs | 6 +- .../Telemetry/AppInsightsTelemetry.cs | 196 +++++++++-------- FabricObserver/ApplicationInsights.config | 3 +- .../Observers/CertificateObserver.cs | 90 ++++---- FabricObserver/Observers/NetworkObserver.cs | 52 ++--- FabricObserver/Observers/OSObserver.cs | 8 +- 13 files changed, 358 insertions(+), 439 deletions(-) diff --git a/ClusterObserver/ApplicationInsights.config b/ClusterObserver/ApplicationInsights.config index f64501d6..bba013fa 100644 --- a/ClusterObserver/ApplicationInsights.config +++ b/ClusterObserver/ApplicationInsights.config @@ -1,7 +1,6 @@  - + diff --git a/ClusterObserver/ClusterObserver.cs b/ClusterObserver/ClusterObserver.cs index 96dce491..a2eace69 100644 --- a/ClusterObserver/ClusterObserver.cs +++ b/ClusterObserver/ClusterObserver.cs @@ 
-196,8 +196,8 @@ private async Task ReportClusterHealthAsync(CancellationToken token) } telemetryDescription += - $"Note: There are currently one or more Repair Tasks processing in the cluster.{Environment.NewLine}" + - $"{ids}"; + $"Note: There are currently one or more Repair Tasks processing in the cluster.{Environment.NewLine}" + + $"{ids}"; } int udInClusterUpgrade = await UpgradeChecker.GetUdsWhereFabricUpgradeInProgressAsync(FabricClientInstance, token); @@ -275,11 +275,11 @@ private async Task ReportClusterHealthAsync(CancellationToken token) await ProcessNodeHealthAsync(clusterHealth.NodeHealthStates, token).ConfigureAwait(false); } catch (Exception e) when - (e is FabricException || - e is OperationCanceledException || - e is TimeoutException) + (e is FabricException || + e is OperationCanceledException || + e is TimeoutException) { - ObserverLogger.LogWarning($"Handled exception in ReportClusterHealthAsync:{Environment.NewLine}{e}"); + continue; } } else if (evaluation.Kind == HealthEvaluationKind.Application @@ -291,11 +291,11 @@ e is OperationCanceledException || await ProcessApplicationHealthAsync(clusterHealth.ApplicationHealthStates, token).ConfigureAwait(false); } catch (Exception e) when - (e is FabricException || - e is OperationCanceledException || - e is TimeoutException) + (e is FabricException || + e is OperationCanceledException || + e is TimeoutException) { - ObserverLogger.LogWarning($"Handled exception in ReportClusterHealthAsync:{Environment.NewLine}{e}"); + continue; } } else @@ -305,11 +305,11 @@ e is OperationCanceledException || await ProcessGenericEntityHealthAsync(evaluation, token).ConfigureAwait(false); } catch (Exception e) when - (e is FabricException || - e is TimeoutException || - e is OperationCanceledException) + (e is FabricException || + e is TimeoutException || + e is OperationCanceledException) { - ObserverLogger.LogWarning($"Handled exception in ReportClusterHealthAsync:{Environment.NewLine}{e}"); + continue; } } } @@ 
-320,7 +320,7 @@ e is TimeoutException || } catch (Exception e) when (e is FabricException || e is OperationCanceledException || e is TaskCanceledException || e is TimeoutException) { - // Handled by ignoring. + } catch (Exception e) { @@ -417,134 +417,94 @@ private async Task ProcessApplicationHealthAsync(IList a if (appHealthEvents.Count() == 0) { - var evals = appHealth.UnhealthyEvaluations; + continue; + } - if (evals.Count == 0) + foreach (HealthEvent healthEvent in appHealthEvents.OrderByDescending(f => f.SourceUtcTimestamp)) + { + var foTelemetryData = TryGetFOHealthStateEventData(healthEvent, HealthScope.Application); + + // From FabricObserver? + if (foTelemetryData != null) { - continue; - } - - var eval = appHealth.UnhealthyEvaluations[0]; + // Telemetry. + if (TelemetryEnabled && ObserverTelemetryClient != null) + { + await ObserverTelemetryClient.ReportHealthAsync(foTelemetryData, token); + } - telemetryDescription += eval.Description; - - // Telemetry. - if (TelemetryEnabled && ObserverTelemetryClient != null) - { - var telemetryData = new TelemetryData(FabricClientInstance, token) + // ETW. + if (etwEnabled) { - ApplicationName = appName.OriginalString, - HealthState = Enum.GetName(typeof(HealthState), appHealth.AggregatedHealthState), - Description = telemetryDescription, - Source = ObserverName, - }; + double value = double.TryParse(foTelemetryData.Value?.ToString(), out double val) ? 
val : -1; - await ObserverTelemetryClient.ReportHealthAsync(telemetryData, token); - } + Logger.EtwLogger?.Write( + ObserverConstants.ClusterObserverETWEventName, + new + { + foTelemetryData.ApplicationName, + foTelemetryData.ServiceName, + foTelemetryData.HealthState, + foTelemetryData.Description, + foTelemetryData.Metric, + foTelemetryData.ObserverName, + foTelemetryData.NodeName, + Source = ObserverName, + foTelemetryData.PartitionId, + foTelemetryData.ProcessId, + foTelemetryData.ReplicaId, + foTelemetryData.SystemServiceProcessName, + // 0 could be a real value, thus defaulting to -1 when tryparse returns false (see above).. + Value = value > -1 ? value : 0, + }); + } - // ETW. - if (etwEnabled) - { - Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - ApplicationName = appName.OriginalString, - HealthState = Enum.GetName(typeof(HealthState), appHealth.AggregatedHealthState), - HealthEventDescription = telemetryDescription, - Source = ObserverName, - }); + // Reset + telemetryDescription = string.Empty; } - - // Reset - telemetryDescription = string.Empty; - } - else - { - // We only care about the latest (most recent) health event - there can be a very large number of events in the Health Event Store. - foreach (HealthEvent healthEvent in appHealthEvents.OrderByDescending(f => f.SourceUtcTimestamp).Take(1)) + else { - var foTelemetryData = TryGetFOHealthStateEventData(healthEvent, HealthScope.Application); - - // From FabricObserver? - if (foTelemetryData != null) + if (!string.IsNullOrWhiteSpace(healthEvent.HealthInformation.Description)) { - // Telemetry. - if (TelemetryEnabled && ObserverTelemetryClient != null) - { - await ObserverTelemetryClient.ReportHealthAsync(foTelemetryData, token); - } - - // ETW. - if (etwEnabled) - { - double value = double.TryParse(foTelemetryData.Value?.ToString(), out double val) ? 
val : -1; - - Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - foTelemetryData.ApplicationName, - foTelemetryData.HealthState, - foTelemetryData.Description, - foTelemetryData.Metric, - foTelemetryData.ObserverName, - foTelemetryData.NodeName, - Source = ObserverName, - foTelemetryData.PartitionId, - foTelemetryData.ReplicaId, - foTelemetryData.SystemServiceProcessName, - Value = value, - }); - } - - // Reset - telemetryDescription = string.Empty; + telemetryDescription += healthEvent.HealthInformation.Description; } else { - if (!string.IsNullOrWhiteSpace(healthEvent.HealthInformation.Description)) - { - telemetryDescription += healthEvent.HealthInformation.Description; - } - else - { - telemetryDescription += string.Join($"{Environment.NewLine}", appHealth.UnhealthyEvaluations); - } + telemetryDescription += string.Join($"{Environment.NewLine}", appHealth.UnhealthyEvaluations); + } - // Telemetry. - if (TelemetryEnabled && ObserverTelemetryClient != null) + // Telemetry. + if (TelemetryEnabled && ObserverTelemetryClient != null) + { + var telemetryData = new TelemetryData(FabricClientInstance, token) { - var telemetryData = new TelemetryData(FabricClientInstance, token) + ApplicationName = appName.OriginalString, + HealthState = Enum.GetName(typeof(HealthState), appHealth.AggregatedHealthState), + Description = telemetryDescription, + Source = ObserverName, + }; + + await ObserverTelemetryClient.ReportHealthAsync(telemetryData, token); + } + + // ETW. + if (etwEnabled) + { + Logger.EtwLogger?.Write( + ObserverConstants.ClusterObserverETWEventName, + new { ApplicationName = appName.OriginalString, HealthState = Enum.GetName(typeof(HealthState), appHealth.AggregatedHealthState), - Description = telemetryDescription, + HealthEventDescription = telemetryDescription, Source = ObserverName, - }; - - await ObserverTelemetryClient.ReportHealthAsync(telemetryData, token); - } - - // ETW. 
- if (etwEnabled) - { - Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - ApplicationName = appName.OriginalString, - HealthState = Enum.GetName(typeof(HealthState), appHealth.AggregatedHealthState), - HealthEventDescription = telemetryDescription, - Source = ObserverName, - }); - } - - // Reset - telemetryDescription = string.Empty; + }); } + + // Reset + telemetryDescription = string.Empty; } - } + } } } diff --git a/ClusterObserver/ClusterObserverManager.cs b/ClusterObserver/ClusterObserverManager.cs index 184b4969..3bed2962 100644 --- a/ClusterObserver/ClusterObserverManager.cs +++ b/ClusterObserver/ClusterObserverManager.cs @@ -334,27 +334,27 @@ public async Task StartAsync() if (TelemetryEnabled) { _ = TelemetryClient?.ReportHealthAsync( - HealthScope.Application, - "ClusterObserverServiceHealth", - HealthState.Warning, - message, - ObserverConstants.ObserverManagerName, - token); + HealthScope.Application, + "ClusterObserverServiceHealth", + HealthState.Warning, + message, + ObserverConstants.ObserverManagerName, + token); } // ETW. if (EtwEnabled) { Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - HealthScope = "Application", - HealthState = "Warning", - HealthEventDescription = message, - Metric = "ClusterObserverServiceHealth", - Source = ObserverConstants.ClusterObserverName, - }); + ObserverConstants.ClusterObserverETWEventName, + new + { + HealthScope = "Application", + HealthState = "Warning", + HealthEventDescription = message, + Metric = "ClusterObserverServiceHealth", + Source = ObserverConstants.ClusterObserverName, + }); } // Don't swallow the unhandled exception. Fix the bug. 
@@ -388,27 +388,27 @@ private Task SignalAbortToRunningObserverAsync() if (TelemetryEnabled) { _ = TelemetryClient?.ReportHealthAsync( - HealthScope.Application, - "ClusterObserverServiceHealth", - HealthState.Warning, - $"{e}", - ObserverConstants.ObserverManagerName, - token); + HealthScope.Application, + "ClusterObserverServiceHealth", + HealthState.Warning, + $"{e}", + ObserverConstants.ObserverManagerName, + token); } // ETW. if (EtwEnabled) { Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - HealthScope = "Application", - HealthState = "Warning", - HealthEventDescription = $"{e}", - Metric = "ClusterObserverServiceHealth", - Source = ObserverConstants.ClusterObserverName, - }); + ObserverConstants.ClusterObserverETWEventName, + new + { + HealthScope = "Application", + HealthState = "Warning", + HealthEventDescription = $"{e}", + Metric = "ClusterObserverServiceHealth", + Source = ObserverConstants.ClusterObserverName, + }); } } @@ -445,12 +445,12 @@ private async Task RunObserverAync() if (TelemetryEnabled) { _ = TelemetryClient?.ReportHealthAsync( - HealthScope.Application, - "ObserverHealthReport", - HealthState.Warning, - observerHealthWarning, - ObserverConstants.ObserverManagerName, - token); + HealthScope.Application, + "ObserverHealthReport", + HealthState.Warning, + observerHealthWarning, + ObserverConstants.ObserverManagerName, + token); } if (EtwEnabled) @@ -489,26 +489,26 @@ ex.InnerException is TaskCanceledException || if (TelemetryEnabled) { _ = TelemetryClient?.ReportHealthAsync( - HealthScope.Application, - "ObserverHealthReport", - HealthState.Warning, - msg, - ObserverConstants.ObserverManagerName, - token); + HealthScope.Application, + "ObserverHealthReport", + HealthState.Warning, + msg, + ObserverConstants.ObserverManagerName, + token); } if (EtwEnabled) { Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - HealthScope = "Application", - HealthState = "Warning", - 
HealthEventDescription = msg, - Metric = "ClusterObserverServiceHealth", - Source = ObserverConstants.ClusterObserverName, - }); + ObserverConstants.ClusterObserverETWEventName, + new + { + HealthScope = "Application", + HealthState = "Warning", + HealthEventDescription = msg, + Metric = "ClusterObserverServiceHealth", + Source = ObserverConstants.ClusterObserverName, + }); } throw; @@ -517,9 +517,7 @@ ex.InnerException is TaskCanceledException || IsObserverRunning = false; } - private void CodePackageActivationContext_ConfigurationPackageModifiedEvent( - object sender, - PackageModifiedEventArgs e) + private void CodePackageActivationContext_ConfigurationPackageModifiedEvent(object sender, PackageModifiedEventArgs e) { appParamsUpdating = true; Logger.LogInfo("Application Parameter upgrade started..."); diff --git a/ClusterObserver/PackageRoot/Config/Settings.xml b/ClusterObserver/PackageRoot/Config/Settings.xml index 9570a3c3..03d25595 100644 --- a/ClusterObserver/PackageRoot/Config/Settings.xml +++ b/ClusterObserver/PackageRoot/Config/Settings.xml @@ -1,12 +1,13 @@ 
- - + + - + - - + + - + diff --git a/ClusterObserver/Utilities/Telemetry/AppInsightsTelemetry.cs b/ClusterObserver/Utilities/Telemetry/AppInsightsTelemetry.cs index beef5084..6d3b5aab 100644 --- a/ClusterObserver/Utilities/Telemetry/AppInsightsTelemetry.cs +++ b/ClusterObserver/Utilities/Telemetry/AppInsightsTelemetry.cs @@ -57,14 +57,8 @@ public AppInsightsTelemetry(string key) /// public string Key { - get - { - return telemetryClient?.InstrumentationKey; - } - set - { - telemetryClient.InstrumentationKey = value; - } + get => telemetryClient?.InstrumentationKey; + set => telemetryClient.InstrumentationKey = value; } /// @@ -81,15 +75,15 @@ public string Key /// CancellationToken instance. /// A representing the asynchronous operation. public Task ReportAvailabilityAsync( - Uri serviceName, - string instance, - string testName, - DateTimeOffset captured, - TimeSpan duration, - string location, - bool success, - CancellationToken cancellationToken, - string message = null) + Uri serviceName, + string instance, + string testName, + DateTimeOffset captured, + TimeSpan duration, + string location, + bool success, + CancellationToken cancellationToken, + string message = null) { if (!IsEnabled || cancellationToken.IsCancellationRequested) { @@ -117,13 +111,9 @@ public Task ReportAvailabilityAsync( /// TelemetryData instance. /// CancellationToken instance. /// a Task. - public Task ReportHealthAsync( - TelemetryData telemetryData, - CancellationToken cancellationToken) + public Task ReportHealthAsync(TelemetryData telemetryData, CancellationToken cancellationToken) { - if (!IsEnabled - || cancellationToken.IsCancellationRequested - || telemetryData == null) + if (!IsEnabled || cancellationToken.IsCancellationRequested || telemetryData == null) { return Task.FromResult(1); } @@ -141,29 +131,28 @@ public Task ReportHealthAsync( Dictionary properties = new Dictionary { - { "Application", telemetryData.ApplicationName ?? 
string.Empty }, { "ClusterId", telemetryData.ClusterId ?? string.Empty }, + { "HealthState", telemetryData.HealthState ?? string.Empty }, + { "Application", telemetryData.ApplicationName ?? string.Empty }, + { "Service", telemetryData.ServiceName ?? string.Empty }, + { "SystemServiceProcessName", telemetryData.SystemServiceProcessName ?? string.Empty }, + { "ProcessId", telemetryData.ProcessId ?? string.Empty }, { "ErrorCode", telemetryData.Code ?? string.Empty }, { "Description", telemetryData.Description ?? string.Empty }, - { "HealthState", telemetryData.HealthState ?? string.Empty }, { "Metric", telemetryData.Metric ?? string.Empty }, - { "NodeName", telemetryData.NodeName ?? string.Empty }, - { "OSPlatform", telemetryData.OS }, - { "Partition", $"{telemetryData.PartitionId}" }, - { "Replica", $"{telemetryData.ReplicaId}" }, - { "Source", telemetryData.Source ?? ObserverConstants.ClusterObserverName }, { "Value", value ?? string.Empty }, + { "Partition", telemetryData.PartitionId }, + { "Replica", telemetryData.ReplicaId }, + { "Source", telemetryData.ObserverName }, + { "NodeName", telemetryData.NodeName ?? string.Empty }, + { "OS", telemetryData.OS ?? string.Empty }, }; telemetryClient.TrackEvent(ObserverConstants.ClusterObserverETWEventName, properties); } catch (Exception e) { - logger.LogWarning( - $"Unhandled exception in TelemetryClient.ReportHealthAsync:" + - $"{Environment.NewLine}{e}"); - - throw; + logger.LogWarning($"Unhandled exception in TelemetryClient.ReportHealthAsync:{Environment.NewLine}{e}"); } return Task.FromResult(0); @@ -182,14 +171,14 @@ public Task ReportHealthAsync( /// Optional: TraceTelemetry context cloud instance name. /// A representing the asynchronous operation. 
public Task ReportHealthAsync( - HealthScope scope, - string propertyName, - HealthState state, - string unhealthyEvaluations, - string source, - CancellationToken cancellationToken, - string serviceName = null, - string instanceName = null) + HealthScope scope, + string propertyName, + HealthState state, + string unhealthyEvaluations, + string source, + CancellationToken cancellationToken, + string serviceName = null, + string instanceName = null) { if (!IsEnabled || cancellationToken.IsCancellationRequested) { @@ -219,7 +208,6 @@ public Task ReportHealthAsync( catch (Exception e) { logger.LogWarning($"Unhandled exception in TelemetryClient.ReportHealthAsync:{Environment.NewLine}{e}"); - throw; } return Task.FromResult(0); @@ -232,10 +220,7 @@ public Task ReportHealthAsync( /// Value of the property. /// CancellationToken instance. /// Task of bool. - public Task ReportMetricAsync( - string name, - T value, - CancellationToken cancellationToken) + public Task ReportMetricAsync(string name, T value, CancellationToken cancellationToken) { if (!IsEnabled || cancellationToken.IsCancellationRequested || string.IsNullOrEmpty(name)) { @@ -317,17 +302,17 @@ public async Task ReportMetricAsync(string role, long id, string name, long valu /// CancellationToken instance. /// A representing the asynchronous operation. 
public Task ReportMetricAsync( - string roleName, - string instance, - string name, - long value, - int count, - long min, - long max, - long sum, - double deviation, - IDictionary properties, - CancellationToken cancellationToken) + string roleName, + string instance, + string name, + long value, + int count, + long min, + long max, + long sum, + double deviation, + IDictionary properties, + CancellationToken cancellationToken) { if (!IsEnabled || cancellationToken.IsCancellationRequested) { diff --git a/FabricObserver.Extensibility/ObserverBase.cs b/FabricObserver.Extensibility/ObserverBase.cs index 0f594485..61c8f909 100644 --- a/FabricObserver.Extensibility/ObserverBase.cs +++ b/FabricObserver.Extensibility/ObserverBase.cs @@ -683,8 +683,6 @@ public void ProcessResourceDataReportHealth( { ApplicationName = appName?.OriginalString ?? string.Empty, NodeName = NodeName, - Code = string.Empty, - HealthState = string.Empty, ObserverName = ObserverName, Metric = data.Property, Value = Math.Round(data.AverageDataValue, 0), @@ -758,8 +756,6 @@ public void ProcessResourceDataReportHealth( // of user telemetry settings. telemetryData = new TelemetryData(FabricClientInstance, Token) { - Code = string.Empty, - HealthState = string.Empty, NodeName = NodeName, ObserverName = ObserverName, Metric = $"{drive}{data.Property}", @@ -778,8 +774,6 @@ public void ProcessResourceDataReportHealth( ObserverConstants.FabricObserverETWEventName, new { - Code = string.Empty, - HealthState = string.Empty, NodeName, ObserverName, Metric = $"{drive}{data.Property}", @@ -1042,7 +1036,7 @@ public void ProcessResourceDataReportHealth( // Telemetry if (IsTelemetryEnabled) { - _ = TelemetryClient?.ReportMetricAsync(telemetryData, Token); + _ = TelemetryClient?.ReportHealthAsync(telemetryData, Token); } // ETW. 
diff --git a/FabricObserver.Extensibility/Utilities/DataTableFileLogger.cs b/FabricObserver.Extensibility/Utilities/DataTableFileLogger.cs index e3c5fc52..03873e29 100644 --- a/FabricObserver.Extensibility/Utilities/DataTableFileLogger.cs +++ b/FabricObserver.Extensibility/Utilities/DataTableFileLogger.cs @@ -6,7 +6,6 @@ using System; using System.Collections.Generic; using System.IO; -using System.Linq; using System.Runtime.InteropServices; using FabricObserver.Interfaces; using NLog; @@ -41,7 +40,7 @@ public CsvFileWriteFormat FileWriteFormat } /// - /// The maximum number of archive files that will be stored. + /// The maximum number of days that archive files will be stored. /// 0 means there is no limit set. /// public int MaxArchiveCsvFileLifetimeDays diff --git a/FabricObserver.Extensibility/Utilities/Logger.cs b/FabricObserver.Extensibility/Utilities/Logger.cs index e81f7d3c..21543792 100644 --- a/FabricObserver.Extensibility/Utilities/Logger.cs +++ b/FabricObserver.Extensibility/Utilities/Logger.cs @@ -22,6 +22,8 @@ namespace FabricObserver.Observers.Utilities public sealed class Logger : IObserverLogger { private const int Retries = 5; + + // This needs to be static to prevent internal EventSource instantiation errors. private static EventSource etwLogger = null; // Text file logger for observers - info/warn/error. @@ -76,7 +78,7 @@ public string Filename } /// - /// The maximum number of archive files that will be stored. + /// The maximum number of days that archive files will be stored. /// 0 means there is no limit set. /// public int MaxArchiveFileLifetimeDays @@ -276,8 +278,8 @@ public void InitializeLoggers() FileName = file, Layout = "${longdate}--${uppercase:${level}}--${message}", OpenFileCacheTimeout = 5, - ArchiveNumbering = ArchiveNumberingMode.DateAndSequence, ArchiveEvery = FileArchivePeriod.Day, + ArchiveNumbering = ArchiveNumberingMode.DateAndSequence, MaxArchiveDays = MaxArchiveFileLifetimeDays <= 0 ? 
7 : MaxArchiveFileLifetimeDays, AutoFlush = true, }; diff --git a/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs b/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs index 637fc1a1..3a0be3a3 100644 --- a/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs +++ b/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs @@ -72,15 +72,15 @@ public string Key /// Error message on availability test run failure. /// A representing the asynchronous operation. public Task ReportAvailabilityAsync( - Uri serviceName, - string instance, - string testName, - DateTimeOffset captured, - TimeSpan duration, - string location, - bool success, - CancellationToken cancellationToken, - string message = null) + Uri serviceName, + string instance, + string testName, + DateTimeOffset captured, + TimeSpan duration, + string location, + bool success, + CancellationToken cancellationToken, + string message = null) { if (!IsEnabled || cancellationToken.IsCancellationRequested) { @@ -110,14 +110,14 @@ public Task ReportAvailabilityAsync( /// Optional: TraceTelemetry context cloud instance name. /// A representing the asynchronous operation. 
public Task ReportHealthAsync( - HealthScope scope, - string propertyName, - HealthState state, - string unhealthyEvaluations, - string source, - CancellationToken cancellationToken, - string serviceName = null, - string instanceName = null) + HealthScope scope, + string propertyName, + HealthState state, + string unhealthyEvaluations, + string source, + CancellationToken cancellationToken, + string serviceName = null, + string instanceName = null) { if (!IsEnabled || cancellationToken.IsCancellationRequested) { @@ -147,7 +147,6 @@ public Task ReportHealthAsync( catch (Exception e) { logger.LogWarning($"Unhandled exception in TelemetryClient.ReportHealthAsync:{Environment.NewLine}{e}"); - throw; } return Task.CompletedTask; @@ -159,15 +158,11 @@ public Task ReportHealthAsync( /// TelemetryData instance. /// CancellationToken instance. /// a Task. - public Task ReportHealthAsync( - TelemetryData telemetryData, - CancellationToken cancellationToken) + public Task ReportHealthAsync(TelemetryData telemetryData, CancellationToken cancellationToken) { - if (!IsEnabled - || cancellationToken.IsCancellationRequested - || telemetryData == null) + if (!IsEnabled || cancellationToken.IsCancellationRequested || telemetryData == null) { - return Task.FromResult(1); + return Task.CompletedTask; } try @@ -183,31 +178,28 @@ public Task ReportHealthAsync( Dictionary properties = new Dictionary { - { "Application", telemetryData.ApplicationName ?? string.Empty }, { "ClusterId", telemetryData.ClusterId ?? string.Empty }, - { "ErrorCode", telemetryData.Code ?? string.Empty }, - { "HealthEventDescription", telemetryData.Description ?? string.Empty }, { "HealthState", telemetryData.HealthState ?? string.Empty }, + { "Application", telemetryData.ApplicationName ?? string.Empty }, + { "Service", telemetryData.ServiceName ?? string.Empty }, + { "SystemServiceProcessName", telemetryData.SystemServiceProcessName ?? string.Empty }, + { "ProcessId", telemetryData.ProcessId ?? 
string.Empty }, + { "ErrorCode", telemetryData.Code ?? string.Empty }, + { "Description", telemetryData.Description ?? string.Empty }, { "Metric", telemetryData.Metric ?? string.Empty }, - { "NodeName", telemetryData.NodeName ?? string.Empty }, - { "OSPlatform", telemetryData.OS }, - { "Partition", $"{telemetryData.PartitionId}" }, - { "Replica", $"{telemetryData.ReplicaId}" }, - { "Source", telemetryData.Source ?? string.Empty }, { "Value", value ?? string.Empty }, + { "Partition", telemetryData.PartitionId }, + { "Replica", telemetryData.ReplicaId }, + { "Source", telemetryData.ObserverName }, + { "NodeName", telemetryData.NodeName ?? string.Empty }, + { "OS", telemetryData.OS ?? string.Empty }, }; - telemetryClient.TrackEvent( - $"{telemetryData.ObserverName ?? "ClusterObserver"}DataEvent", - properties); + telemetryClient.TrackEvent(ObserverConstants.FabricObserverETWEventName, properties); } catch (Exception e) { - logger.LogWarning( - $"Unhandled exception in TelemetryClient.ReportHealthAsync:" + - $"{Environment.NewLine}{e}"); - - throw; + logger.LogWarning($"Unhandled exception in TelemetryClient.ReportHealthAsync:{Environment.NewLine}{e}"); } return Task.CompletedTask; @@ -222,10 +214,10 @@ public Task ReportHealthAsync( /// cancellation token. /// A Task of bool. public Task ReportMetricAsync( - string metric, - T value, - string source, - CancellationToken cancellationToken) + string metric, + T value, + string source, + CancellationToken cancellationToken) { if (!IsEnabled || string.IsNullOrEmpty(metric) || cancellationToken.IsCancellationRequested) { @@ -233,8 +225,8 @@ public Task ReportMetricAsync( } telemetryClient?.TrackEvent( - string.IsNullOrEmpty(source) ? ObserverConstants.FabricObserverETWEventName : source, - new Dictionary { { metric, value?.ToString() } }); + string.IsNullOrEmpty(source) ? 
ObserverConstants.FabricObserverETWEventName : source, + new Dictionary { { metric, value?.ToString() } }); return Task.FromResult(true); } @@ -245,35 +237,44 @@ public Task ReportMetricAsync( /// TelemetryData instance. /// Cancellation token. /// A task. - public Task ReportMetricAsync( - TelemetryData telemetryData, - CancellationToken cancellationToken) + public Task ReportMetricAsync(TelemetryData telemetryData, CancellationToken cancellationToken) { if (telemetryData == null) { return Task.CompletedTask; } - Dictionary properties = new Dictionary + string value = null; + + if (telemetryData.Value != null) { - { "Application", telemetryData.ApplicationName ?? string.Empty }, - { "ClusterId", telemetryData.ClusterId ?? string.Empty }, - { "ErrorCode", telemetryData.Code ?? string.Empty }, - { "HealthEventDescription", telemetryData.Description ?? string.Empty }, - { "HealthState", telemetryData.HealthState ?? string.Empty }, - { "Metric", telemetryData.Metric ?? string.Empty }, - { "NodeName", telemetryData.NodeName ?? string.Empty }, - { "ObserverName", telemetryData.ObserverName ?? string.Empty }, - { "OSPlatform", telemetryData.OS }, - { "Partition", telemetryData.PartitionId ?? string.Empty }, - { "Replica", telemetryData.ReplicaId ?? string.Empty }, - { "Source", telemetryData.Source ?? string.Empty }, - { "Value", telemetryData.Value?.ToString() ?? string.Empty }, - }; + value = telemetryData.Value.ToString(); + } + + try + { + Dictionary properties = new Dictionary + { + { "ClusterId", telemetryData.ClusterId ?? string.Empty }, + { "Application", telemetryData.ApplicationName ?? string.Empty }, + { "Service", telemetryData.ServiceName ?? string.Empty }, + { "ProcessId", telemetryData.ProcessId ?? string.Empty }, + { "SystemServiceProcessName", telemetryData.SystemServiceProcessName ?? string.Empty }, + { "Metric", telemetryData.Metric ?? string.Empty }, + { "Value", value ?? 
string.Empty }, + { "Partition", telemetryData.PartitionId }, + { "Replica", telemetryData.ReplicaId }, + { "Source", telemetryData.ObserverName }, + { "NodeName", telemetryData.NodeName ?? string.Empty }, + { "OS", telemetryData.OS ?? string.Empty }, + }; - telemetryClient.TrackEvent( - $"{telemetryData.ObserverName ?? "FabricObserver"}DataEvent", - properties); + telemetryClient.TrackEvent(ObserverConstants.FabricObserverETWEventName, properties); + } + catch (Exception e) + { + logger.LogWarning($"Unhandled exception in TelemetryClient.ReportMetricAsync:{Environment.NewLine}{e}"); + } return Task.CompletedTask; } @@ -284,42 +285,45 @@ public Task ReportMetricAsync( /// TelemetryData instance. /// Cancellation token. /// A task. - public Task ReportMetricAsync( - MachineTelemetryData telemetryData, - CancellationToken cancellationToken) + public Task ReportMetricAsync(MachineTelemetryData telemetryData, CancellationToken cancellationToken) { if (telemetryData == null || cancellationToken.IsCancellationRequested) { return Task.CompletedTask; } - Dictionary properties = new Dictionary + try { - { "ActiveEphemeralPorts", telemetryData.ActiveEphemeralPorts.ToString() }, - { "ActiveFirewallRules", telemetryData.ActiveFirewallRules.ToString() }, - { "ActivePorts", telemetryData.ActivePorts.ToString() }, - { "AvailablePhysicalMemory", telemetryData.AvailablePhysicalMemoryGB.ToString() }, - { "AvailableVirtualMemory", telemetryData.AvailableVirtualMemoryGB.ToString() }, - { "DriveInfo", telemetryData.DriveInfo }, - { "FabricAppPortRange", telemetryData.FabricAppPortRange.ToString() }, - { "HotFixes", telemetryData.HotFixes.ToString() }, - { "LastBootUpTime", telemetryData.LastBootUpTime.ToString() }, - { "Level", telemetryData.HealthState.ToString() }, - { "LogicalDriveCount", telemetryData.LogicalDriveCount.ToString() }, - { "LogicalProcessorCount", telemetryData.LogicalProcessorCount.ToString() }, - { "Node", telemetryData.Node.ToString() }, - { 
"NumberOfRunningProcesses", telemetryData.NumberOfRunningProcesses.ToString() }, - { "Observer", telemetryData.Observer }, - { "OS", telemetryData.OS }, - { "OSInstallDate", telemetryData.OSInstallDate }, - { "OSVersion", telemetryData.OSVersion }, - { "TotalMemorySizeGB", telemetryData.TotalMemorySizeGB.ToString() }, - { "WindowsDynamicPortRange", telemetryData.WindowsDynamicPortRange }, - }; + Dictionary properties = new Dictionary + { + { "ActiveEphemeralPorts", telemetryData.ActiveEphemeralPorts.ToString() }, + { "ActiveFirewallRules", telemetryData.ActiveFirewallRules.ToString() }, + { "ActivePorts", telemetryData.ActivePorts.ToString() }, + { "AvailablePhysicalMemory", telemetryData.AvailablePhysicalMemoryGB.ToString() }, + { "AvailableVirtualMemory", telemetryData.AvailableVirtualMemoryGB.ToString() }, + { "DriveInfo", telemetryData.DriveInfo }, + { "FabricAppPortRange", telemetryData.FabricAppPortRange.ToString() }, + { "HotFixes", telemetryData.HotFixes.ToString() }, + { "LastBootUpTime", telemetryData.LastBootUpTime.ToString() }, + { "Level", telemetryData.HealthState.ToString() }, + { "LogicalDriveCount", telemetryData.LogicalDriveCount.ToString() }, + { "LogicalProcessorCount", telemetryData.LogicalProcessorCount.ToString() }, + { "Node", telemetryData.Node.ToString() }, + { "NumberOfRunningProcesses", telemetryData.NumberOfRunningProcesses.ToString() }, + { "Observer", telemetryData.Observer }, + { "OS", telemetryData.OS }, + { "OSInstallDate", telemetryData.OSInstallDate }, + { "OSVersion", telemetryData.OSVersion }, + { "TotalMemorySizeGB", telemetryData.TotalMemorySizeGB.ToString() }, + { "WindowsDynamicPortRange", telemetryData.WindowsDynamicPortRange }, + }; - telemetryClient.TrackEvent( - $"{telemetryData.Observer ?? 
"FabricObserver"}DataEvent", - properties); + telemetryClient.TrackEvent(ObserverConstants.FabricObserverETWEventName, properties); + } + catch (Exception e) + { + logger.LogWarning($"Unhandled exception in TelemetryClient.ReportMetricAsync:{Environment.NewLine}{e}"); + } return Task.CompletedTask; } diff --git a/FabricObserver/ApplicationInsights.config b/FabricObserver/ApplicationInsights.config index e18496a5..bba013fa 100644 --- a/FabricObserver/ApplicationInsights.config +++ b/FabricObserver/ApplicationInsights.config @@ -1,7 +1,6 @@  - + diff --git a/FabricObserver/Observers/CertificateObserver.cs b/FabricObserver/Observers/CertificateObserver.cs index 38ee28d8..701d8b78 100644 --- a/FabricObserver/Observers/CertificateObserver.cs +++ b/FabricObserver/Observers/CertificateObserver.cs @@ -165,18 +165,14 @@ public override Task ReportAsync(CancellationToken token) token.ThrowIfCancellationRequested(); // Someone calling without observing first, must be run after a new run of ObserveAsync - if (ExpiringWarnings == null || - ExpiredWarnings == null || - NotFoundWarnings == null) + if (ExpiringWarnings == null || ExpiredWarnings == null || NotFoundWarnings == null) { return Task.CompletedTask; } HealthReport healthReport; - if (ExpiringWarnings.Count == 0 - && ExpiredWarnings.Count == 0 - && NotFoundWarnings.Count == 0) + if (ExpiringWarnings.Count == 0 && ExpiredWarnings.Count == 0 && NotFoundWarnings.Count == 0) { healthReport = new HealthReport { @@ -226,27 +222,25 @@ public override Task ReportAsync(CancellationToken token) Value = FOErrorWarningCodes.GetErrorWarningNameFromFOCode(FOErrorWarningCodes.WarningCertificateExpiration), }; - _ = TelemetryClient?.ReportMetricAsync( - telemetryData, - Token); + _ = TelemetryClient?.ReportHealthAsync(telemetryData, Token); } if (IsEtwEnabled) { ObserverLogger.LogEtw( - ObserverConstants.FabricObserverETWEventName, - new - { - Code = FOErrorWarningCodes.WarningCertificateExpiration, - HealthState = "Warning", - 
NodeName, - Metric = ErrorWarningProperty.CertificateExpiration, - HealthEventDescription = healthMessage, - ObserverName, - OS = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "Windows" : "Linux", - Source = ObserverConstants.FabricObserverName, - Value = FOErrorWarningCodes.GetErrorWarningNameFromFOCode(FOErrorWarningCodes.WarningCertificateExpiration), - }); + ObserverConstants.FabricObserverETWEventName, + new + { + Code = FOErrorWarningCodes.WarningCertificateExpiration, + HealthState = "Warning", + NodeName, + Metric = ErrorWarningProperty.CertificateExpiration, + HealthEventDescription = healthMessage, + ObserverName, + OS = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "Windows" : "Linux", + Source = ObserverConstants.FabricObserverName, + Value = FOErrorWarningCodes.GetErrorWarningNameFromFOCode(FOErrorWarningCodes.WarningCertificateExpiration), + }); } } @@ -300,14 +294,14 @@ private async Task Initialize(CancellationToken token) token.ThrowIfCancellationRequested(); var daysUntilClusterExpireWarningThreshold = GetSettingParameterValue( - ConfigurationSectionName, - ObserverConstants.CertificateObserverDaysUntilClusterExpiryWarningThreshold); + ConfigurationSectionName, + ObserverConstants.CertificateObserverDaysUntilClusterExpiryWarningThreshold); DaysUntilClusterExpireWarningThreshold = !string.IsNullOrEmpty(daysUntilClusterExpireWarningThreshold) ? int.Parse(daysUntilClusterExpireWarningThreshold) : 14; var daysUntilAppExpireWarningClusterThreshold = GetSettingParameterValue( - ConfigurationSectionName, - ObserverConstants.CertificateObserverDaysUntilAppExpiryWarningThreshold); + ConfigurationSectionName, + ObserverConstants.CertificateObserverDaysUntilAppExpiryWarningThreshold); DaysUntilAppExpireWarningThreshold = !string.IsNullOrEmpty(daysUntilAppExpireWarningClusterThreshold) ? 
int.Parse(daysUntilAppExpireWarningClusterThreshold) : 14; @@ -323,8 +317,8 @@ private async Task Initialize(CancellationToken token) if (AppCertificateCommonNamesToObserve == null) { var appCommonNamesToObserve = GetSettingParameterValue( - ConfigurationSectionName, - ObserverConstants.CertificateObserverAppCertificateCommonNames); + ConfigurationSectionName, + ObserverConstants.CertificateObserverAppCertificateCommonNames); AppCertificateCommonNamesToObserve = !string.IsNullOrEmpty(appCommonNamesToObserve) ? JsonHelper.ConvertFromString>(appCommonNamesToObserve) : new List(); } @@ -337,11 +331,7 @@ private async Task GetSecurityTypes(CancellationToken token) token.ThrowIfCancellationRequested(); SecurityConfiguration = new SecurityConfiguration(); - - string clusterManifestXml = await FabricClientInstance.ClusterManager.GetClusterManifestAsync( - AsyncClusterOperationTimeoutSeconds, - Token).ConfigureAwait(true); - + string clusterManifestXml = await FabricClientInstance.ClusterManager.GetClusterManifestAsync(AsyncClusterOperationTimeoutSeconds, Token).ConfigureAwait(true); XmlReader xreader = null; StringReader sreader = null; @@ -391,12 +381,12 @@ private async Task GetSecurityTypes(CancellationToken token) } } } - catch (Exception e) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { WriteToLogWithLevel( - ObserverName, - $"There was an issue parsing the cluster manifest. Observer cannot run.\nError Details:\n{e}", - LogLevel.Error); + ObserverName, + $"There was an issue parsing the cluster manifest. Observer cannot run. 
Error Details:{Environment.NewLine}{e}", + LogLevel.Error); throw; } @@ -459,11 +449,7 @@ private void CheckLatestBySubjectName(X509Store store, string subjectName, int w private void CheckByThumbprint(X509Store store, string thumbprint, int warningThreshold) { - X509Certificate2Collection certificates = store.Certificates.Find( - X509FindType.FindByThumbprint, - thumbprint, - validOnly: false); - + X509Certificate2Collection certificates = store.Certificates.Find(X509FindType.FindByThumbprint, thumbprint, validOnly: false); X509Certificate2 certificate; if (certificates.Count == 0) @@ -473,15 +459,15 @@ private void CheckByThumbprint(X509Store store, string thumbprint, int warningTh if (!TryFindCertificate("/var/lib/sfcerts", thumbprint, out certificate) && !TryFindCertificate("/var/lib/waagent", thumbprint, out certificate)) { - NotFoundWarnings.Add( - $"Could not find requested certificate with thumbprint: {thumbprint} in /var/lib/sfcerts, /var/lib/waagent, and LocalMachine/Root"); + NotFoundWarnings.Add($"Could not find requested certificate with thumbprint: {thumbprint} in /var/lib/sfcerts, /var/lib/waagent, and LocalMachine/Root"); + return; } } else { - NotFoundWarnings.Add( - $"Could not find requested certificate with thumbprint: {thumbprint} in LocalMachine/My"); + NotFoundWarnings.Add($"Could not find requested certificate with thumbprint: {thumbprint} in LocalMachine/My"); + return; } } @@ -502,16 +488,16 @@ private void CheckByThumbprint(X509Store store, string thumbprint, int warningTh if (timeUntilExpiry.TotalMilliseconds < 0) { ExpiredWarnings.Add($"Certificate Expired on {expiry.ToShortDateString()}: " + - $"Thumbprint: {certificate.Thumbprint} " + - $"Issuer {certificate.Issuer}, " + - $"Subject: {certificate.Subject}{Environment.NewLine}{message}"); + $"Thumbprint: {certificate.Thumbprint} " + + $"Issuer {certificate.Issuer}, " + + $"Subject: {certificate.Subject}{Environment.NewLine}{message}"); } else if (timeUntilExpiry.TotalDays < 
warningThreshold) { ExpiringWarnings.Add($"Certificate Expiring on {expiry.ToShortDateString()}: " + - $"Thumbprint: {certificate.Thumbprint} " + - $"Issuer {certificate.Issuer}, " + - $"Subject: {certificate.Subject}{Environment.NewLine}{message}"); + $"Thumbprint: {certificate.Thumbprint} " + + $"Issuer {certificate.Issuer}, " + + $"Subject: {certificate.Subject}{Environment.NewLine}{message}"); } } } diff --git a/FabricObserver/Observers/NetworkObserver.cs b/FabricObserver/Observers/NetworkObserver.cs index 5f51b2a3..990827cd 100644 --- a/FabricObserver/Observers/NetworkObserver.cs +++ b/FabricObserver/Observers/NetworkObserver.cs @@ -165,9 +165,7 @@ public override Task ReportAsync(CancellationToken token) if (IsTelemetryEnabled) { - _ = TelemetryClient?.ReportMetricAsync( - telemetryData, - Token); + _ = TelemetryClient?.ReportHealthAsync(telemetryData, Token); } var report = new HealthReport @@ -195,17 +193,17 @@ public override Task ReportAsync(CancellationToken token) if (IsEtwEnabled) { ObserverLogger.LogEtw( - ObserverConstants.FabricObserverETWEventName, - new - { - ApplicationName = conn.TargetApp, - Code = FOErrorWarningCodes.AppWarningNetworkEndpointUnreachable, - HealthState = "Warning", - HealthEventDescription = healthMessage, - ObserverName, - Metric = ErrorWarningProperty.InternetConnectionFailure, - NodeName, - }); + ObserverConstants.FabricObserverETWEventName, + new + { + ApplicationName = conn.TargetApp, + Code = FOErrorWarningCodes.AppWarningNetworkEndpointUnreachable, + HealthState = "Warning", + HealthEventDescription = healthMessage, + ObserverName, + Metric = ErrorWarningProperty.InternetConnectionFailure, + NodeName, + }); } } else @@ -249,26 +247,24 @@ public override Task ReportAsync(CancellationToken token) NodeName = NodeName, }; - _ = TelemetryClient?.ReportMetricAsync( - telemetryData, - Token); + _ = TelemetryClient?.ReportHealthAsync(telemetryData, Token); } // ETW. 
if (IsEtwEnabled) { ObserverLogger.LogEtw( - ObserverConstants.FabricObserverETWEventName, - new - { - ApplicationName = conn.TargetApp, - Code = FOErrorWarningCodes.Ok, - HealthState = "Ok", - HealthEventDescription = healthMessage, - ObserverName, - Metric = "Internet Connection State", - NodeName, - }); + ObserverConstants.FabricObserverETWEventName, + new + { + ApplicationName = conn.TargetApp, + Code = FOErrorWarningCodes.Ok, + HealthState = "Ok", + HealthEventDescription = healthMessage, + ObserverName, + Metric = "Internet Connection State", + NodeName, + }); } // Reset health state. diff --git a/FabricObserver/Observers/OSObserver.cs b/FabricObserver/Observers/OSObserver.cs index 2fa5cac0..4ef15dd4 100644 --- a/FabricObserver/Observers/OSObserver.cs +++ b/FabricObserver/Observers/OSObserver.cs @@ -188,9 +188,7 @@ public override Task ReportAsync(CancellationToken token) HealthReporter.ReportHealthToServiceFabric(report); - if (IsTelemetryProviderEnabled - && IsTelemetryEnabled - && RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + if (IsEtwEnabled && RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { // Send Health Report as Telemetry (perhaps it signals an Alert from App Insights, for example.). var telemetryData = new TelemetryData(FabricClientInstance, token) @@ -204,9 +202,7 @@ public override Task ReportAsync(CancellationToken token) Source = ObserverConstants.FabricObserverName, }; - _ = TelemetryClient?.ReportMetricAsync( - telemetryData, - Token); + _ = TelemetryClient?.ReportHealthAsync(telemetryData, Token); } // ETW. 
From 378b3402cfb35049d48f4dee02cca8ff532ebc80 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Mon, 12 Apr 2021 11:57:21 -0700 Subject: [PATCH 05/20] FO 3.1.9, CO 2.1.7 (no merge) --- Build-COSFPkgs.ps1 | 8 +- Build-FOWSFPkgs.ps1 | 8 +- Build-SFPkgs.ps1 | 8 +- ClusterObserver.nuspec.template | 4 +- ClusterObserver/ClusterObserver.cs | 215 ++++++++---------- ClusterObserver/ClusterObserverManager.cs | 171 +++++--------- ClusterObserver/FabricClusterObserver.cs | 16 +- .../PackageRoot/Config/Settings.xml | 7 +- .../PackageRoot/ServiceManifest.xml | 6 +- ClusterObserver/Readme.md | 2 +- .../Telemetry/AppInsightsTelemetry.cs | 35 +-- .../Telemetry/LogAnalyticsTelemetry.cs | 33 ++- .../ApplicationManifest.xml | 6 +- FabricObserver.nuspec.template | 4 +- .../PackageRoot/ServiceManifest._linux.xml | 8 +- .../PackageRoot/ServiceManifest.xml | 8 +- .../ApplicationManifest.xml | 4 +- README.md | 2 +- 18 files changed, 199 insertions(+), 346 deletions(-) diff --git a/Build-COSFPkgs.ps1 b/Build-COSFPkgs.ps1 index 1875d66a..f2c42ac9 100644 --- a/Build-COSFPkgs.ps1 +++ b/Build-COSFPkgs.ps1 @@ -23,11 +23,11 @@ function Build-SFPkg { try { Push-Location $scriptPath - Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Linux.SelfContained.2.1.6" "$scriptPath\bin\release\ClusterObserver\linux-x64\self-contained\ClusterObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Linux.FrameworkDependent.2.1.6" "$scriptPath\bin\release\ClusterObserver\linux-x64\framework-dependent\ClusterObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Linux.SelfContained.2.1.7" "$scriptPath\bin\release\ClusterObserver\linux-x64\self-contained\ClusterObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Linux.FrameworkDependent.2.1.7" "$scriptPath\bin\release\ClusterObserver\linux-x64\framework-dependent\ClusterObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Windows.SelfContained.2.1.6" 
"$scriptPath\bin\release\ClusterObserver\win-x64\self-contained\ClusterObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Windows.FrameworkDependent.2.1.6" "$scriptPath\bin\release\ClusterObserver\win-x64\framework-dependent\ClusterObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Windows.SelfContained.2.1.7" "$scriptPath\bin\release\ClusterObserver\win-x64\self-contained\ClusterObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Windows.FrameworkDependent.2.1.7" "$scriptPath\bin\release\ClusterObserver\win-x64\framework-dependent\ClusterObserverType" } finally { Pop-Location diff --git a/Build-FOWSFPkgs.ps1 b/Build-FOWSFPkgs.ps1 index 51c39465..128897d4 100644 --- a/Build-FOWSFPkgs.ps1 +++ b/Build-FOWSFPkgs.ps1 @@ -23,11 +23,11 @@ function Build-SFPkg { try { Push-Location $scriptPath - Build-SFPkg "FabricObserverWeb.Linux.SelfContained.2.0.1" "$scriptPath\bin\release\FabricObserverWeb\linux-x64\self-contained\FabricObserverWebApiType" - Build-SFPkg "FabricObserverWeb.Linux.FrameworkDependent.2.0.1" "$scriptPath\bin\release\FabricObserverWeb\linux-x64\framework-dependent\FabricObserverWebApiType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserverWeb.Linux.SelfContained.2.0.1" "$scriptPath\bin\release\FabricObserverWeb\linux-x64\self-contained\FabricObserverWebApiType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserverWeb.Linux.FrameworkDependent.2.0.1" "$scriptPath\bin\release\FabricObserverWeb\linux-x64\framework-dependent\FabricObserverWebApiType" - Build-SFPkg "FabricObserverWeb.Windows.SelfContained.2.0.1" "$scriptPath\bin\release\FabricObserverWeb\win-x64\self-contained\FabricObserverWebApiType" - Build-SFPkg "FabricObserverWeb.Windows.FrameworkDependent.2.0.1" "$scriptPath\bin\release\FabricObserverWeb\win-x64\framework-dependent\FabricObserverWebApiType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserverWeb.Windows.SelfContained.2.0.1" 
"$scriptPath\bin\release\FabricObserverWeb\win-x64\self-contained\FabricObserverWebApiType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserverWeb.Windows.FrameworkDependent.2.0.1" "$scriptPath\bin\release\FabricObserverWeb\win-x64\framework-dependent\FabricObserverWebApiType" } finally { Pop-Location diff --git a/Build-SFPkgs.ps1 b/Build-SFPkgs.ps1 index dd79d5e3..cf32cd67 100644 --- a/Build-SFPkgs.ps1 +++ b/Build-SFPkgs.ps1 @@ -23,11 +23,11 @@ function Build-SFPkg { try { Push-Location $scriptPath - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.SelfContained.3.1.8" "$scriptPath\bin\release\FabricObserver\linux-x64\self-contained\FabricObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.FrameworkDependent.3.1.8" "$scriptPath\bin\release\FabricObserver\linux-x64\framework-dependent\FabricObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.SelfContained.3.1.9" "$scriptPath\bin\release\FabricObserver\linux-x64\self-contained\FabricObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.FrameworkDependent.3.1.9" "$scriptPath\bin\release\FabricObserver\linux-x64\framework-dependent\FabricObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.SelfContained.3.1.8" "$scriptPath\bin\release\FabricObserver\win-x64\self-contained\FabricObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.FrameworkDependent.3.1.8" "$scriptPath\bin\release\FabricObserver\win-x64\framework-dependent\FabricObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.SelfContained.3.1.9" "$scriptPath\bin\release\FabricObserver\win-x64\self-contained\FabricObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.FrameworkDependent.3.1.9" "$scriptPath\bin\release\FabricObserver\win-x64\framework-dependent\FabricObserverType" } finally { Pop-Location diff --git a/ClusterObserver.nuspec.template 
b/ClusterObserver.nuspec.template index d6c6b081..17217cdf 100644 --- a/ClusterObserver.nuspec.template +++ b/ClusterObserver.nuspec.template @@ -2,8 +2,8 @@ %PACKAGE_ID% - 2.1.6 - FO 3.1.8 support. Crashing bug fix. + 2.1.7 + Bug fix in Application monitoring logic. Microsoft MIT false diff --git a/ClusterObserver/ClusterObserver.cs b/ClusterObserver/ClusterObserver.cs index a2eace69..43ae9428 100644 --- a/ClusterObserver/ClusterObserver.cs +++ b/ClusterObserver/ClusterObserver.cs @@ -37,7 +37,7 @@ private HealthState LastKnownClusterHealthState private Dictionary NodeStatusDictionary { get; - } = new Dictionary(); + } protected bool TelemetryEnabled => ClusterObserverManager.TelemetryEnabled; @@ -50,11 +50,6 @@ public ConfigSettings ConfigSettings get; set; } - public bool IsTestRun - { - get; set; - } = false; - public string ObserverName { get; set; @@ -134,6 +129,7 @@ public ClusterObserver(ConfigurationSettings settings = null) FabricServiceContext = ClusterObserverManager.FabricServiceContext; NodeName = FabricServiceContext.NodeContext.NodeName; NodeType = FabricServiceContext.NodeContext.NodeType; + NodeStatusDictionary = new Dictionary(); if (settings == null) { @@ -191,13 +187,10 @@ private async Task ReportClusterHealthAsync(CancellationToken token) foreach (var repair in repairsInProgress) { - ids += $"TaskId: {repair.TaskId}{Environment.NewLine}" + - $"State: {repair.State}{Environment.NewLine}"; + ids += $"TaskId: {repair.TaskId}{Environment.NewLine}State: {repair.State}{Environment.NewLine}"; } - telemetryDescription += - $"Note: There are currently one or more Repair Tasks processing in the cluster.{Environment.NewLine}" + - $"{ids}"; + telemetryDescription += $"Note: There are currently one or more Repair Tasks processing in the cluster.{Environment.NewLine}{ids}"; } int udInClusterUpgrade = await UpgradeChecker.GetUdsWhereFabricUpgradeInProgressAsync(FabricClientInstance, token); @@ -231,15 +224,15 @@ private async Task 
ReportClusterHealthAsync(CancellationToken token) if (etwEnabled) { Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - HealthScope = "Cluster", - HealthState = "Ok", - HealthEventDescription = "Cluster has recovered from previous Error/Warning state.", - Metric = "AggregatedClusterHealth", - Source = ObserverName, - }); + ObserverConstants.ClusterObserverETWEventName, + new + { + HealthScope = "Cluster", + HealthState = "Ok", + HealthEventDescription = "Cluster has recovered from previous Error/Warning state.", + Metric = "AggregatedClusterHealth", + Source = ObserverName, + }); } } else @@ -274,10 +267,7 @@ private async Task ReportClusterHealthAsync(CancellationToken token) { await ProcessNodeHealthAsync(clusterHealth.NodeHealthStates, token).ConfigureAwait(false); } - catch (Exception e) when - (e is FabricException || - e is OperationCanceledException || - e is TimeoutException) + catch (Exception e) when (e is FabricException || e is TimeoutException) { continue; } @@ -290,10 +280,7 @@ e is OperationCanceledException || { await ProcessApplicationHealthAsync(clusterHealth.ApplicationHealthStates, token).ConfigureAwait(false); } - catch (Exception e) when - (e is FabricException || - e is OperationCanceledException || - e is TimeoutException) + catch (Exception e) when (e is FabricException || e is TimeoutException) { continue; } @@ -304,10 +291,7 @@ e is OperationCanceledException || { await ProcessGenericEntityHealthAsync(evaluation, token).ConfigureAwait(false); } - catch (Exception e) when - (e is FabricException || - e is TimeoutException || - e is OperationCanceledException) + catch (Exception e) when (e is FabricException || e is TimeoutException) { continue; } @@ -318,11 +302,14 @@ e is TimeoutException || LastKnownClusterHealthState = clusterHealth.AggregatedHealthState; } } - catch (Exception e) when (e is FabricException || e is OperationCanceledException || e is TaskCanceledException || e is TimeoutException) + 
catch (FabricException fe) // This can happen when running CO unit test. In production, this is very rare. { - + string msg = $"Handled transient FabricException in ReportClusterHealthAsync:{Environment.NewLine}{fe}"; + + // Log it locally. + ObserverLogger.LogWarning(msg); } - catch (Exception e) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { string msg = $"Unhandled exception in ReportClusterHealthAsync:{Environment.NewLine}{e}"; @@ -345,12 +332,12 @@ e is TimeoutException || if (etwEnabled) { Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - HealthState = "Warning", - HealthEventDescription = msg, - }); + ObserverConstants.ClusterObserverETWEventName, + new + { + HealthState = "Warning", + HealthEventDescription = msg, + }); } // Fix the bug. @@ -386,18 +373,13 @@ private async Task ProcessApplicationHealthAsync(IList a // Check upgrade status of unhealthy application. Note, this doesn't apply to System applications as they update as part of a platform update. 
if (appName.OriginalString != "fabric:/System") { - var appUpgradeStatus = - await FabricClientInstance.ApplicationManager.GetApplicationUpgradeProgressAsync(appName); + var appUpgradeStatus = await FabricClientInstance.ApplicationManager.GetApplicationUpgradeProgressAsync(appName); if (appUpgradeStatus.UpgradeState == ApplicationUpgradeState.RollingBackInProgress || appUpgradeStatus.UpgradeState == ApplicationUpgradeState.RollingForwardInProgress || appUpgradeStatus.UpgradeState == ApplicationUpgradeState.RollingForwardPending) { - var udInAppUpgrade = await UpgradeChecker.GetUdsWhereApplicationUpgradeInProgressAsync( - FabricClientInstance, - token, - appName); - + List udInAppUpgrade = await UpgradeChecker.GetUdsWhereApplicationUpgradeInProgressAsync(FabricClientInstance, token, appName); string udText = string.Empty; // -1 means no upgrade in progress for application @@ -412,8 +394,7 @@ private async Task ProcessApplicationHealthAsync(IList a } } - var appHealthEvents = - appHealth.HealthEvents.Where(e => e.HealthInformation.HealthState == HealthState.Error || e.HealthInformation.HealthState == HealthState.Warning); + var appHealthEvents = appHealth.HealthEvents.Where(e => e.HealthInformation.HealthState == HealthState.Error || e.HealthInformation.HealthState == HealthState.Warning); if (appHealthEvents.Count() == 0) { @@ -439,24 +420,24 @@ private async Task ProcessApplicationHealthAsync(IList a double value = double.TryParse(foTelemetryData.Value?.ToString(), out double val) ? 
val : -1; Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - foTelemetryData.ApplicationName, - foTelemetryData.ServiceName, - foTelemetryData.HealthState, - foTelemetryData.Description, - foTelemetryData.Metric, - foTelemetryData.ObserverName, - foTelemetryData.NodeName, - Source = ObserverName, - foTelemetryData.PartitionId, - foTelemetryData.ProcessId, - foTelemetryData.ReplicaId, - foTelemetryData.SystemServiceProcessName, - // 0 could be a real value, thus defaulting to -1 when tryparse returns false (see above).. - Value = value > -1 ? value : 0, - }); + ObserverConstants.ClusterObserverETWEventName, + new + { + foTelemetryData.ApplicationName, + foTelemetryData.ServiceName, + foTelemetryData.HealthState, + foTelemetryData.Description, + foTelemetryData.Metric, + foTelemetryData.ObserverName, + foTelemetryData.NodeName, + Source = ObserverName, + foTelemetryData.PartitionId, + foTelemetryData.ProcessId, + foTelemetryData.ReplicaId, + foTelemetryData.SystemServiceProcessName, + // 0 could be a real value, thus defaulting to -1 when tryparse returns false (see above).. + Value = value > -1 ? value : 0, + }); } // Reset @@ -491,14 +472,14 @@ private async Task ProcessApplicationHealthAsync(IList a if (etwEnabled) { Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - ApplicationName = appName.OriginalString, - HealthState = Enum.GetName(typeof(HealthState), appHealth.AggregatedHealthState), - HealthEventDescription = telemetryDescription, - Source = ObserverName, - }); + ObserverConstants.ClusterObserverETWEventName, + new + { + ApplicationName = appName.OriginalString, + HealthState = Enum.GetName(typeof(HealthState), appHealth.AggregatedHealthState), + HealthEventDescription = telemetryDescription, + Source = ObserverName, + }); } // Reset @@ -512,9 +493,7 @@ private async Task ProcessNodeHealthAsync(IList nodeHealthState { // Check cluster upgrade status. 
int udInClusterUpgrade = await UpgradeChecker.GetUdsWhereFabricUpgradeInProgressAsync(FabricClientInstance, token).ConfigureAwait(false); - - var supportedNodeHealthStates = - nodeHealthStates.Where( a => a.AggregatedHealthState == HealthState.Warning || a.AggregatedHealthState == HealthState.Error); + var supportedNodeHealthStates = nodeHealthStates.Where( a => a.AggregatedHealthState == HealthState.Warning || a.AggregatedHealthState == HealthState.Error); foreach (var node in supportedNodeHealthStates) { @@ -640,13 +619,13 @@ private async Task ProcessGenericEntityHealthAsync(HealthEvaluation evaluation, if (etwEnabled) { Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - HealthEventDescription = telemetryDescription, - HealthState = healthState, - Source = ObserverName, - }); + ObserverConstants.ClusterObserverETWEventName, + new + { + HealthEventDescription = telemetryDescription, + HealthState = healthState, + Source = ObserverName, + }); } } @@ -655,19 +634,14 @@ private async Task MonitorNodeStatusAsync(CancellationToken token) // If a node's NodeStatus is Disabling, Disabled, or Down // for at or above the specified maximum time (in Settings.xml), // then CO will emit a Warning signal. - var nodeList = - await FabricClientInstance.QueryManager.GetNodeListAsync( - null, - ConfigSettings.AsyncTimeout, - token).ConfigureAwait(true); + var nodeList = await FabricClientInstance.QueryManager.GetNodeListAsync(null, ConfigSettings.AsyncTimeout, token).ConfigureAwait(true); // Are any of the nodes that were previously in non-Up status, now Up? 
if (NodeStatusDictionary.Count > 0) { foreach (var nodeDictItem in NodeStatusDictionary) { - if (!nodeList.Any(n => n.NodeName == nodeDictItem.Key - && n.NodeStatus == NodeStatus.Up)) + if (!nodeList.Any(n => n.NodeName == nodeDictItem.Key && n.NodeStatus == NodeStatus.Up)) { continue; } @@ -692,17 +666,17 @@ await FabricClientInstance.QueryManager.GetNodeListAsync( if (etwEnabled) { Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - HealthScope = "Node", - HealthState = "Ok", - HealthEventDescription = $"{nodeDictItem.Key} is now Up.", - Metric = "NodeStatus", - NodeName = nodeDictItem.Key, - NodeStatus = "Up", - Source = ObserverName, - }); + ObserverConstants.ClusterObserverETWEventName, + new + { + HealthScope = "Node", + HealthState = "Ok", + HealthEventDescription = $"{nodeDictItem.Key} is now Up.", + Metric = "NodeStatus", + NodeName = nodeDictItem.Key, + NodeStatus = "Up", + Source = ObserverName, + }); } // Clear dictionary entry. @@ -710,9 +684,7 @@ await FabricClientInstance.QueryManager.GetNodeListAsync( } } - if (!nodeList.All( - n => - n.NodeStatus == NodeStatus.Up)) + if (!nodeList.All(n => n.NodeStatus == NodeStatus.Up)) { var filteredList = nodeList.Where( node => node.NodeStatus == NodeStatus.Disabled @@ -723,14 +695,11 @@ await FabricClientInstance.QueryManager.GetNodeListAsync( { if (!NodeStatusDictionary.ContainsKey(node.NodeName)) { - NodeStatusDictionary.Add( - node.NodeName, - (node.NodeStatus, DateTime.Now, DateTime.Now)); + NodeStatusDictionary.Add(node.NodeName, (node.NodeStatus, DateTime.Now, DateTime.Now)); } else { - if (NodeStatusDictionary.TryGetValue( - node.NodeName, out var tuple)) + if (NodeStatusDictionary.TryGetValue(node.NodeName, out var tuple)) { NodeStatusDictionary[node.NodeName] = (node.NodeStatus, tuple.FirstDetectedTime, DateTime.Now); } @@ -771,17 +740,17 @@ await FabricClientInstance.QueryManager.GetNodeListAsync( if (etwEnabled) { Logger.EtwLogger?.Write( - 
ObserverConstants.ClusterObserverETWEventName, - new - { - HealthScope = "Node", - HealthState = "Warning", - HealthEventDescription = message, - Metric = "NodeStatus", - NodeName = kvp.Key, - NodeStatus = $"{kvp.Value.NodeStatus}", - Source = ObserverName, - }); + ObserverConstants.ClusterObserverETWEventName, + new + { + HealthScope = "Node", + HealthState = "Warning", + HealthEventDescription = message, + Metric = "NodeStatus", + NodeName = kvp.Key, + NodeStatus = $"{kvp.Value.NodeStatus}", + Source = ObserverName, + }); } } } diff --git a/ClusterObserver/ClusterObserverManager.cs b/ClusterObserver/ClusterObserverManager.cs index 3bed2962..b0c88519 100644 --- a/ClusterObserver/ClusterObserverManager.cs +++ b/ClusterObserver/ClusterObserverManager.cs @@ -20,18 +20,17 @@ namespace ClusterObserver { public class ClusterObserverManager : IDisposable { + private static bool etwEnabled; private readonly string nodeName; + private readonly CancellationTokenSource linkedSFRuntimeObserverTokenSource; + private readonly CancellationToken token; private ClusterObserver observer; - private EventWaitHandle globalShutdownEventHandle; - private volatile bool shutdownSignaled; private int shutdownGracePeriodInSeconds = 2; private TimeSpan observerExecTimeout = TimeSpan.FromMinutes(30); - private CancellationToken token; private CancellationTokenSource cts; - private CancellationTokenSource linkedSFRuntimeObserverTokenSource; + private volatile bool shutdownSignaled; private bool hasDisposed; private bool appParamsUpdating; - private static bool etwEnabled; public bool IsObserverRunning { @@ -133,6 +132,7 @@ private static string GetConfigSettingValue(string parameterName) } catch (Exception e) when (e is KeyNotFoundException || e is FabricElementNotFoundException) { + } return null; @@ -145,84 +145,38 @@ private async void ShutdownHandler(object sender, ConsoleCancelEventArgs console return; } - Thread.Sleep(shutdownGracePeriodInSeconds * 1000); + await 
Task.Delay(shutdownGracePeriodInSeconds).ConfigureAwait(false); shutdownSignaled = true; - _ = globalShutdownEventHandle?.Set(); await StopAsync(); } - // This impl is to ensure FCO exits if shutdown is requested while the over loop is sleeping - // So, instead of blocking with a Thread.Sleep, for example, ThreadSleep is used to ensure - // we can receive signals and act accordingly during thread sleep state. - private void ThreadSleep(EventWaitHandle ewh, TimeSpan timeout) - { - // if timeout is <= 0, return. 0 is infinite, and negative is not valid - if (timeout.TotalMilliseconds <= 0) - { - return; - } - - var elapsedTime = new TimeSpan(0, 0, 0); - var stopwatch = new Stopwatch(); - - while (!shutdownSignaled && - !token.IsCancellationRequested && - timeout > elapsedTime) - { - stopwatch.Start(); - - // The event can be signaled by CtrlC, - // Exit ASAP when the program terminates (i.e., shutdown/abort is signaled.) - _ = ewh.WaitOne(timeout.Subtract(elapsedTime)); - stopwatch.Stop(); - - elapsedTime = stopwatch.Elapsed; - } - - if (stopwatch.IsRunning) - { - stopwatch.Stop(); - } - } - private void SetPropertiesFromConfigurationParameters() { // Observer - if (int.TryParse( - GetConfigSettingValue(ObserverConstants.ObserverExecutionTimeout), - out int result)) + if (int.TryParse(GetConfigSettingValue(ObserverConstants.ObserverExecutionTimeout), out int result)) { observerExecTimeout = TimeSpan.FromSeconds(result); } // Logger - if (bool.TryParse( - GetConfigSettingValue(ObserverConstants.EnableVerboseLoggingParameter), - out bool enableVerboseLogging)) + if (bool.TryParse(GetConfigSettingValue(ObserverConstants.EnableVerboseLoggingParameter), out bool enableVerboseLogging)) { Logger.EnableVerboseLogging = enableVerboseLogging; } - if (int.TryParse( - GetConfigSettingValue(ObserverConstants.ObserverLoopSleepTimeSeconds), - out int execFrequency)) + if (int.TryParse(GetConfigSettingValue(ObserverConstants.ObserverLoopSleepTimeSeconds), out int execFrequency)) { 
ObserverExecutionLoopSleepSeconds = execFrequency; - - Logger.LogInfo($"ExecutionFrequency is {ObserverExecutionLoopSleepSeconds} Seconds"); } // Shutdown - if (int.TryParse( - GetConfigSettingValue(ObserverConstants.ObserverShutdownGracePeriodInSeconds), - out int gracePeriodInSeconds)) + if (int.TryParse(GetConfigSettingValue(ObserverConstants.ObserverShutdownGracePeriodInSeconds), out int gracePeriodInSeconds)) { shutdownGracePeriodInSeconds = gracePeriodInSeconds; } - if (int.TryParse(GetConfigSettingValue(ObserverConstants.AsyncOperationTimeoutSeconds), - out int asyncTimeout)) + if (int.TryParse(GetConfigSettingValue(ObserverConstants.AsyncOperationTimeoutSeconds), out int asyncTimeout)) { AsyncOperationTimeoutSeconds = asyncTimeout; } @@ -240,14 +194,12 @@ private void SetPropertiesFromConfigurationParameters() if (string.IsNullOrEmpty(telemetryProviderType)) { TelemetryEnabled = false; - return; } if (!Enum.TryParse(telemetryProviderType, out TelemetryProviderType telemetryProvider)) { TelemetryEnabled = false; - return; } @@ -255,29 +207,22 @@ private void SetPropertiesFromConfigurationParameters() { case TelemetryProviderType.AzureLogAnalytics: - var logAnalyticsLogType = - GetConfigSettingValue(ObserverConstants.LogAnalyticsLogTypeParameter) ?? "Application"; - - var logAnalyticsSharedKey = - GetConfigSettingValue(ObserverConstants.LogAnalyticsSharedKeyParameter); + string logAnalyticsLogType = GetConfigSettingValue(ObserverConstants.LogAnalyticsLogTypeParameter) ?? 
"Application"; + string logAnalyticsSharedKey = GetConfigSettingValue(ObserverConstants.LogAnalyticsSharedKeyParameter); + string logAnalyticsWorkspaceId = GetConfigSettingValue(ObserverConstants.LogAnalyticsWorkspaceIdParameter); - var logAnalyticsWorkspaceId = - GetConfigSettingValue(ObserverConstants.LogAnalyticsWorkspaceIdParameter); - - if (string.IsNullOrEmpty(logAnalyticsSharedKey) - || string.IsNullOrEmpty(logAnalyticsWorkspaceId)) + if (string.IsNullOrEmpty(logAnalyticsSharedKey) || string.IsNullOrEmpty(logAnalyticsWorkspaceId)) { TelemetryEnabled = false; - return; } TelemetryClient = new LogAnalyticsTelemetry( - logAnalyticsWorkspaceId, - logAnalyticsSharedKey, - logAnalyticsLogType, - FabricClientInstance, - token); + logAnalyticsWorkspaceId, + logAnalyticsSharedKey, + logAnalyticsLogType, + FabricClientInstance, + token); break; @@ -288,7 +233,6 @@ private void SetPropertiesFromConfigurationParameters() if (string.IsNullOrEmpty(aiKey)) { TelemetryEnabled = false; - return; } @@ -303,31 +247,30 @@ public async Task StartAsync() { try { - if (globalShutdownEventHandle == null) - { - globalShutdownEventHandle = new EventWaitHandle(false, EventResetMode.ManualReset); - } - while (true) { if (!appParamsUpdating && (shutdownSignaled || token.IsCancellationRequested)) { - _ = globalShutdownEventHandle.Set(); Logger.LogInfo("Shutdown signaled. Stopping."); + await StopAsync().ConfigureAwait(false); break; } await RunObserverAync().ConfigureAwait(false); - Logger.LogInfo($"Sleeping for {(ObserverExecutionLoopSleepSeconds > 0 ? ObserverExecutionLoopSleepSeconds : 10)} seconds before running again."); - ThreadSleep(globalShutdownEventHandle, TimeSpan.FromSeconds(ObserverExecutionLoopSleepSeconds > 0 ? ObserverExecutionLoopSleepSeconds : 10)); + Logger.LogInfo($"Sleeping for {(ObserverExecutionLoopSleepSeconds > 0 ? 
ObserverExecutionLoopSleepSeconds : 30)} seconds before running again."); + await Task.Delay(TimeSpan.FromSeconds(ObserverExecutionLoopSleepSeconds > 0 ? ObserverExecutionLoopSleepSeconds : 30), token); Logger.Flush(); } } - catch (Exception ex) + catch (Exception e) when (e is OperationCanceledException || e is TaskCanceledException) + { + + } + catch (Exception e) { - var message = $"Unhanded Exception in ClusterObserverManager on node {nodeName}. Taking down CO process. Error info:{Environment.NewLine}{ex}"; + string message = $"Unhanded Exception in ClusterObserverManager on node {nodeName}. Taking down CO process. Error info:{Environment.NewLine}{e}"; Logger.LogError(message); // Telemetry. @@ -422,23 +365,20 @@ private async Task RunObserverAync() return; } - var exceptionBuilder = new StringBuilder(); - try { Logger.LogInfo($"Starting {observer.ObserverName}"); IsObserverRunning = true; // Synchronous call. - var isCompleted = observer.ObserveAsync( - linkedSFRuntimeObserverTokenSource != null ? linkedSFRuntimeObserverTokenSource.Token : token).Wait(observerExecTimeout); + var isCompleted = observer.ObserveAsync(linkedSFRuntimeObserverTokenSource != null ? linkedSFRuntimeObserverTokenSource.Token : token).Wait(observerExecTimeout); // The observer is taking too long (hung?) if (!isCompleted) { string observerHealthWarning = $"{observer.ObserverName} has exceeded its specified run time of {observerExecTimeout.TotalSeconds} seconds. 
Aborting."; - await SignalAbortToRunningObserverAsync(); + await SignalAbortToRunningObserverAsync().ConfigureAwait(false); Logger.LogWarning(observerHealthWarning); @@ -456,34 +396,25 @@ private async Task RunObserverAync() if (EtwEnabled) { Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - HealthScope = "Application", - HealthState = "Warning", - HealthEventDescription = observerHealthWarning, - Metric = "ClusterObserverServiceHealth", - Source = ObserverConstants.ClusterObserverName, - }); + ObserverConstants.ClusterObserverETWEventName, + new + { + HealthScope = "Application", + HealthState = "Warning", + HealthEventDescription = observerHealthWarning, + Metric = "ClusterObserverServiceHealth", + Source = ObserverConstants.ClusterObserverName, + }); } + // Create new instance of CO. observer = new ClusterObserver(); cts = new CancellationTokenSource(); } } - catch (AggregateException ex) when ( - ex.InnerException is OperationCanceledException || - ex.InnerException is TaskCanceledException || - ex.InnerException is TimeoutException) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - IsObserverRunning = false; - _ = exceptionBuilder.AppendLine($"Handled Exception from {observer.ObserverName}:{Environment.NewLine}{ex.InnerException}"); - Logger.LogError(exceptionBuilder.ToString()); - _ = exceptionBuilder.Clear(); - } - catch (Exception e) - { - string msg = $"Unhandled exception in ClusterObserverManager.Run(). Taking down process. Error info:{Environment.NewLine}{e}"; + string msg = $"Unhandled exception in ClusterObserverManager.RunObserverAync(). Taking down process. 
Error info:{Environment.NewLine}{e}"; Logger.LogError(msg); if (TelemetryEnabled) @@ -517,13 +448,21 @@ ex.InnerException is TaskCanceledException || IsObserverRunning = false; } - private void CodePackageActivationContext_ConfigurationPackageModifiedEvent(object sender, PackageModifiedEventArgs e) + /// + /// App parameter config update handler. This will recreate CO instance with new ConfigSettings applied. + /// + /// + /// + private async void CodePackageActivationContext_ConfigurationPackageModifiedEvent(object sender, PackageModifiedEventArgs e) { appParamsUpdating = true; Logger.LogInfo("Application Parameter upgrade started..."); - SignalAbortToRunningObserverAsync(); + + await SignalAbortToRunningObserverAsync(); + observer = new ClusterObserver(e.NewPackage.Settings); cts = new CancellationTokenSource(); + Logger.LogInfo("Application Parameter upgrade complete..."); appParamsUpdating = false; } @@ -545,8 +484,6 @@ protected virtual void Dispose(bool disposing) StopAsync().GetAwaiter().GetResult(); } - globalShutdownEventHandle?.Dispose(); - if (FabricClientInstance != null) { FabricClientInstance.Dispose(); @@ -562,9 +499,7 @@ protected virtual void Dispose(bool disposing) // Flush and Dispose all NLog targets. No more logging. 
Logger.Flush(); Logger.ShutDown(); - FabricServiceContext.CodePackageActivationContext.ConfigurationPackageModifiedEvent -= CodePackageActivationContext_ConfigurationPackageModifiedEvent; - hasDisposed = true; } diff --git a/ClusterObserver/FabricClusterObserver.cs b/ClusterObserver/FabricClusterObserver.cs index 74a61c2a..1d34b5d0 100644 --- a/ClusterObserver/FabricClusterObserver.cs +++ b/ClusterObserver/FabricClusterObserver.cs @@ -10,8 +10,6 @@ namespace ClusterObserver /// internal sealed class FabricClusterObserver : StatelessService { - private ClusterObserverManager observerManager; - public FabricClusterObserver(StatelessServiceContext context) : base(context) { @@ -24,20 +22,8 @@ public FabricClusterObserver(StatelessServiceContext context) /// Canceled when Service Fabric needs to shut down this service instance. protected override async Task RunAsync(CancellationToken cancellationToken) { - observerManager = new ClusterObserverManager(Context, cancellationToken); - + using var observerManager = new ClusterObserverManager(Context, cancellationToken); await observerManager.StartAsync().ConfigureAwait(true); } - - - protected override Task OnCloseAsync(CancellationToken cancellationToken) - { - if (observerManager != null) - { - observerManager.Dispose(); - } - - return base.OnCloseAsync(cancellationToken); - } } } diff --git a/ClusterObserver/PackageRoot/Config/Settings.xml b/ClusterObserver/PackageRoot/Config/Settings.xml index 03d25595..302cbdf8 100644 --- a/ClusterObserver/PackageRoot/Config/Settings.xml +++ b/ClusterObserver/PackageRoot/Config/Settings.xml @@ -1,9 +1,8 @@ 
- - + + @@ -33,7 +32,7 @@ - +
- + ClusterObserver @@ -21,7 +21,7 @@ - + diff --git a/ClusterObserver/Readme.md b/ClusterObserver/Readme.md index 1da6d140..7c30aa6f 100644 --- a/ClusterObserver/Readme.md +++ b/ClusterObserver/Readme.md @@ -1,4 +1,4 @@ -### ClusterObserver 2.1.0 +### ClusterObserver 2.1.7 ClusterObserver (CO) is a standalone SF singleton stateless service that runs on one node (1) and is independent from FabricObserver, which runs on all nodes (-1). CO observes cluster health (aggregated) and sends telemetry when cluster is in Error (and optionally in Warning). CO shares a very small subset of FabricObserver's (FO) code. It is designed to be completely independent from FO sources, diff --git a/ClusterObserver/Utilities/Telemetry/AppInsightsTelemetry.cs b/ClusterObserver/Utilities/Telemetry/AppInsightsTelemetry.cs index 6d3b5aab..7b479269 100644 --- a/ClusterObserver/Utilities/Telemetry/AppInsightsTelemetry.cs +++ b/ClusterObserver/Utilities/Telemetry/AppInsightsTelemetry.cs @@ -19,7 +19,7 @@ namespace ClusterObserver.Utilities.Telemetry /// Abstracts the ApplicationInsights telemetry API calls allowing /// other telemetry providers to be plugged in. /// - public class AppInsightsTelemetry : ITelemetryProvider, IDisposable + public class AppInsightsTelemetry : ITelemetryProvider { /// /// ApplicationInsights telemetry client. @@ -344,38 +344,5 @@ public Task ReportMetricAsync( return Task.FromResult(0); } - - private bool disposedValue; // To detect redundant calls - - protected virtual void Dispose(bool disposing) - { - if (!disposedValue) - { - if (disposing) - { - } - - disposedValue = true; - } - } - - // TODO: override a finalizer only if Dispose(bool disposing) above has code to free unmanaged resources. - // ~AppInsightsTelemetry() - // { - // // Do not change this code. Put cleanup code in Dispose(bool disposing) above. - // Dispose(false); - // } - - // This code added to correctly implement the disposable pattern. 
- - - public void Dispose() - { - // Do not change this code. Put cleanup code in Dispose(bool disposing) above. - Dispose(true); - - // TODO: uncomment the following line if the finalizer is overridden above. - // GC.SuppressFinalize(this); - } } } \ No newline at end of file diff --git a/ClusterObserver/Utilities/Telemetry/LogAnalyticsTelemetry.cs b/ClusterObserver/Utilities/Telemetry/LogAnalyticsTelemetry.cs index 31cdb896..a25ff4ed 100644 --- a/ClusterObserver/Utilities/Telemetry/LogAnalyticsTelemetry.cs +++ b/ClusterObserver/Utilities/Telemetry/LogAnalyticsTelemetry.cs @@ -121,13 +121,11 @@ private async Task SendTelemetryAsync(string payload, CancellationToken token) logger.LogWarning($"Unexpected response from server in LogAnalyticsTelemetry.SendTelemetryAsync:{Environment.NewLine}{responseAsync.StatusCode}: {responseAsync.StatusDescription}"); } -#pragma warning disable CA1031 // Do not take down process due to unhandled exception during telemetry transmission. Log it, fix the bug. catch (Exception e) { // An Exception during telemetry data submission should never take down CO process. Log it. Don't throw it. Fix it. logger.LogWarning($"Handled Exception in LogAnalyticsTelemetry.SendTelemetryAsync:{Environment.NewLine}{e}"); } -#pragma warning restore CA1031 // It's ok here. if (retries < MaxRetries) { @@ -178,24 +176,23 @@ public async Task ReportHealthAsync( string serviceName = null, string instanceName = null) { - var (clusterId, _) = - await ClusterIdentificationUtility.TupleGetClusterIdAndTypeAsync(fabricClient, token).ConfigureAwait(true); + var (clusterId, _) = await ClusterIdentificationUtility.TupleGetClusterIdAndTypeAsync(fabricClient, token).ConfigureAwait(true); string jsonPayload = JsonConvert.SerializeObject( - new - { - id = $"CO_{Guid.NewGuid()}", - datetime = DateTime.UtcNow, - clusterId = clusterId ?? 
string.Empty, - source = ObserverConstants.ClusterObserverName, - property = propertyName, - healthScope = scope.ToString(), - healthState = state.ToString(), - healthEvaluation = unhealthyEvaluations, - serviceName = serviceName ?? string.Empty, - instanceName = instanceName ?? string.Empty, - osPlatform = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "Windows" : "Linux", - }); + new + { + id = $"CO_{Guid.NewGuid()}", + datetime = DateTime.UtcNow, + clusterId = clusterId ?? string.Empty, + source = ObserverConstants.ClusterObserverName, + property = propertyName, + healthScope = scope.ToString(), + healthState = state.ToString(), + healthEvaluation = unhealthyEvaluations, + serviceName = serviceName ?? string.Empty, + instanceName = instanceName ?? string.Empty, + osPlatform = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "Windows" : "Linux", + }); await SendTelemetryAsync(jsonPayload, cancellationToken).ConfigureAwait(false); } diff --git a/ClusterObserverApp/ApplicationPackageRoot/ApplicationManifest.xml b/ClusterObserverApp/ApplicationPackageRoot/ApplicationManifest.xml index bcd7086c..12cc57f5 100644 --- a/ClusterObserverApp/ApplicationPackageRoot/ApplicationManifest.xml +++ b/ClusterObserverApp/ApplicationPackageRoot/ApplicationManifest.xml @@ -1,5 +1,5 @@  - + @@ -7,14 +7,14 @@ - + - + diff --git a/FabricObserver.nuspec.template b/FabricObserver.nuspec.template index 7ca5fc2b..5c363f2e 100644 --- a/FabricObserver.nuspec.template +++ b/FabricObserver.nuspec.template @@ -2,8 +2,8 @@ %PACKAGE_ID% - 3.1.8 - Updated Windows port monitoring code (ephemeral port range). Added new member to TelemetryData, ProcessId. Observers will no longer locally log Error or Warning Health Events when EnableVerboseLogging is set to false - and it should be false, generally. Only enable EnableVerboseLogging for debugging purposes. You should emit health telemetry to an external service as a general rule. 
FO supports ApplicationInsights and LogAnalytics out of the box. If you are using the FabricObserverWebApi service, you will still get local Error/Warning logs, regardless of EnableVerboseLogging setting. + 3.1.9 + Minor bug fixes: ETW (internal EventSource error), Telemetry provider impl (ApplicationInsights). Microsoft MIT false diff --git a/FabricObserver/PackageRoot/ServiceManifest._linux.xml b/FabricObserver/PackageRoot/ServiceManifest._linux.xml index 4cdd5c8a..fd1b4533 100644 --- a/FabricObserver/PackageRoot/ServiceManifest._linux.xml +++ b/FabricObserver/PackageRoot/ServiceManifest._linux.xml @@ -1,6 +1,6 @@  @@ -11,7 +11,7 @@ - + setcaps.sh @@ -27,11 +27,11 @@ - + - + \ No newline at end of file diff --git a/FabricObserver/PackageRoot/ServiceManifest.xml b/FabricObserver/PackageRoot/ServiceManifest.xml index 907f7c8c..d8d7bace 100644 --- a/FabricObserver/PackageRoot/ServiceManifest.xml +++ b/FabricObserver/PackageRoot/ServiceManifest.xml @@ -1,6 +1,6 @@  @@ -11,7 +11,7 @@ - + FabricObserver @@ -21,11 +21,11 @@ - + - + \ No newline at end of file diff --git a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml index bcd16400..8af6d042 100644 --- a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml +++ b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml @@ -1,5 +1,5 @@  - + - + diff --git a/README.md b/README.md index fe041748..754d7c0d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# FabricObserver 3.1.8 +# FabricObserver 3.1.9 [**FabricObserver (FO)**](https://github.com/microsoft/service-fabric-observer/releases) is a complete implementation of a generic resource usage watchdog service written as a stateless, singleton Service Fabric .NET Core 3.1 application that 1. Monitors a broad range of resources that tend to be important to all Service Fabric applications, like disk, CPU, memory, networking, and cluster certificates out-of-the-box. 
From fd15321e4bc0c9a6920603cd2960c1ee84c70c70 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Mon, 12 Apr 2021 18:00:33 -0700 Subject: [PATCH 06/20] Added support for app list paging. Code maintenance. --- .../Utilities/FabricClientRetryErrors.cs | 25 ++-- .../Utilities/FabricClientRetryHelper.cs | 51 +++---- FabricObserver/Observers/AppObserver.cs | 51 +++++-- FabricObserver/Observers/NetworkObserver.cs | 132 +++++++----------- 4 files changed, 121 insertions(+), 138 deletions(-) diff --git a/FabricObserver.Extensibility/Utilities/FabricClientRetryErrors.cs b/FabricObserver.Extensibility/Utilities/FabricClientRetryErrors.cs index 209fb4d2..9d32655e 100644 --- a/FabricObserver.Extensibility/Utilities/FabricClientRetryErrors.cs +++ b/FabricObserver.Extensibility/Utilities/FabricClientRetryErrors.cs @@ -30,7 +30,7 @@ public class FabricClientRetryErrors public static readonly Lazy MoveSecondaryFabricErrors = new Lazy(() => { var retryErrors = new FabricClientRetryErrors(); - retryErrors.RetrySuccesSFabricErrorCodes.Add(FabricErrorCode.AlreadySecondaryReplica); + retryErrors.RetrySuccessFabricErrorCodes.Add(FabricErrorCode.AlreadySecondaryReplica); retryErrors.RetryableFabricErrorCodes.Add(FabricErrorCode.PLBNotReady); return retryErrors; }); @@ -41,7 +41,7 @@ public class FabricClientRetryErrors public static readonly Lazy MovePrimaryFabricErrors = new Lazy(() => { var retryErrors = new FabricClientRetryErrors(); - retryErrors.RetrySuccesSFabricErrorCodes.Add(FabricErrorCode.AlreadyPrimaryReplica); + retryErrors.RetrySuccessFabricErrorCodes.Add(FabricErrorCode.AlreadyPrimaryReplica); retryErrors.RetryableFabricErrorCodes.Add(FabricErrorCode.PLBNotReady); return retryErrors; }); @@ -92,7 +92,7 @@ public class FabricClientRetryErrors public static readonly Lazy ProvisionFabricErrors = new Lazy(() => { var retryErrors = new FabricClientRetryErrors(); - retryErrors.RetrySuccesSFabricErrorCodes.Add(FabricErrorCode.FabricVersionAlreadyExists); + 
retryErrors.RetrySuccessFabricErrorCodes.Add(FabricErrorCode.FabricVersionAlreadyExists); return retryErrors; }); @@ -102,8 +102,8 @@ public class FabricClientRetryErrors public static readonly Lazy UpgradeFabricErrors = new Lazy(() => { var retryErrors = new FabricClientRetryErrors(); - retryErrors.RetrySuccesSFabricErrorCodes.Add(FabricErrorCode.FabricUpgradeInProgress); - retryErrors.RetrySuccesSFabricErrorCodes.Add(FabricErrorCode.FabricAlreadyInTargetVersion); + retryErrors.RetrySuccessFabricErrorCodes.Add(FabricErrorCode.FabricUpgradeInProgress); + retryErrors.RetrySuccessFabricErrorCodes.Add(FabricErrorCode.FabricAlreadyInTargetVersion); return retryErrors; }); @@ -113,7 +113,7 @@ public class FabricClientRetryErrors public static readonly Lazy RemoveUnreliableTransportBehaviorErrors = new Lazy(() => { var retryErrors = new FabricClientRetryErrors(); - retryErrors.InternalRetrySuccesSFabricErrorCodes.Add(2147949808); + retryErrors.InternalRetrySuccessFabricErrorCodes.Add(2147949808); return retryErrors; }); @@ -123,7 +123,7 @@ public class FabricClientRetryErrors public static readonly Lazy CreateAppErrors = new Lazy(() => { var retryErrors = new FabricClientRetryErrors(); - retryErrors.RetrySuccesSFabricErrorCodes.Add(FabricErrorCode.ApplicationAlreadyExists); + retryErrors.RetrySuccessFabricErrorCodes.Add(FabricErrorCode.ApplicationAlreadyExists); return retryErrors; }); @@ -133,7 +133,7 @@ public class FabricClientRetryErrors public static readonly Lazy DeleteAppErrors = new Lazy(() => { var retryErrors = new FabricClientRetryErrors(); - retryErrors.RetrySuccesSFabricErrorCodes.Add(FabricErrorCode.ApplicationNotFound); + retryErrors.RetrySuccessFabricErrorCodes.Add(FabricErrorCode.ApplicationNotFound); return retryErrors; }); @@ -146,9 +146,8 @@ public FabricClientRetryErrors() RetryableExceptions = new List(); RetryableFabricErrorCodes = new List(); RetrySuccessExceptions = new List(); - RetrySuccesSFabricErrorCodes = new List(); - - 
InternalRetrySuccesSFabricErrorCodes = new List(); + RetrySuccessFabricErrorCodes = new List(); + InternalRetrySuccessFabricErrorCodes = new List(); PopulateDefaultValues(); } @@ -180,7 +179,7 @@ public IList RetrySuccessExceptions /// /// Gets list of success error codes that are retry-able. /// - public IList RetrySuccesSFabricErrorCodes + public IList RetrySuccessFabricErrorCodes { get; private set; } @@ -188,7 +187,7 @@ public IList RetrySuccesSFabricErrorCodes /// /// Gets list of internal success error codes that are retry-able. /// - internal IList InternalRetrySuccesSFabricErrorCodes + internal IList InternalRetrySuccessFabricErrorCodes { get; private set; } diff --git a/FabricObserver.Extensibility/Utilities/FabricClientRetryHelper.cs b/FabricObserver.Extensibility/Utilities/FabricClientRetryHelper.cs index ef2fda25..4711df9b 100644 --- a/FabricObserver.Extensibility/Utilities/FabricClientRetryHelper.cs +++ b/FabricObserver.Extensibility/Utilities/FabricClientRetryHelper.cs @@ -26,15 +26,13 @@ public static class FabricClientRetryHelper /// Action to be performed. /// Cancellation token for Async operation. /// Task object. - public static async Task ExecuteFabricActionWithRetryAsync( - Func> function, - CancellationToken cancellationToken) + public static async Task ExecuteFabricActionWithRetryAsync(Func> function, CancellationToken cancellationToken) { return await ExecuteFabricActionWithRetryAsync( - function, - new FabricClientRetryErrors(), - DefaultOperationTimeout, - cancellationToken).ConfigureAwait(false); + function, + new FabricClientRetryErrors(), + DefaultOperationTimeout, + cancellationToken).ConfigureAwait(false); } /// @@ -46,10 +44,10 @@ public static async Task ExecuteFabricActionWithRetryAsync( /// Cancellation token for Async operation. /// Task object. 
public static async Task ExecuteFabricActionWithRetryAsync( - Func> function, - FabricClientRetryErrors errors, - TimeSpan operationTimeout, - CancellationToken cancellationToken) + Func> function, + FabricClientRetryErrors errors, + TimeSpan operationTimeout, + CancellationToken cancellationToken) { bool needToWait = false; var watch = new Stopwatch(); @@ -77,16 +75,15 @@ public static async Task ExecuteFabricActionWithRetryAsync( if (retryElseSuccess) { - Logger.LogInfo( - $"ExecuteFabricActionWithRetryAsync: Retrying due to Exception: {e}"); + Logger.LogInfo($"ExecuteFabricActionWithRetryAsync: Retrying due to Exception: {e}"); if (watch.Elapsed > operationTimeout) { Logger.LogWarning( - "ExecuteFabricActionWithRetryAsync: Done Retrying. " + - $"Time Elapsed: {watch.Elapsed.TotalSeconds}, " + - $"Timeout: {operationTimeout.TotalSeconds}. " + - $"Throwing Exception: {e}"); + "ExecuteFabricActionWithRetryAsync: Done Retrying. " + + $"Time Elapsed: {watch.Elapsed.TotalSeconds}, " + + $"Timeout: {operationTimeout.TotalSeconds}. 
" + + $"Throwing Exception: {e}"); throw; } @@ -96,70 +93,58 @@ public static async Task ExecuteFabricActionWithRetryAsync( continue; } - Logger.LogInfo( - $"ExecuteFabricActionWithRetryAsync: Exception {e} Handled but No Retry."); + Logger.LogInfo($"ExecuteFabricActionWithRetryAsync: Exception {e} Handled but No Retry."); return default; } } } - private static bool HandleException( - Exception e, - FabricClientRetryErrors errors, - out bool retryElseSuccess) + private static bool HandleException(Exception e, FabricClientRetryErrors errors, out bool retryElseSuccess) { var fabricException = e as FabricException; if (errors.RetryableExceptions.Contains(e.GetType())) { retryElseSuccess = true /*retry*/; - return true; } if (fabricException != null && errors.RetryableFabricErrorCodes.Contains(fabricException.ErrorCode)) { retryElseSuccess = true /*retry*/; - return true; } if (errors.RetrySuccessExceptions.Contains(e.GetType())) { retryElseSuccess = false /*success*/; - return true; } if (fabricException != null - && errors.RetrySuccesSFabricErrorCodes.Contains(fabricException.ErrorCode)) + && errors.RetrySuccessFabricErrorCodes.Contains(fabricException.ErrorCode)) { retryElseSuccess = false /*success*/; - return true; } if (e.GetType() == typeof(FabricTransientException)) { retryElseSuccess = true /*retry*/; - return true; } if (fabricException?.InnerException != null) { - if (fabricException.InnerException is COMException ex - && errors.InternalRetrySuccesSFabricErrorCodes.Contains((uint)ex.ErrorCode)) + if (fabricException.InnerException is COMException ex && errors.InternalRetrySuccessFabricErrorCodes.Contains((uint)ex.ErrorCode)) { retryElseSuccess = false /*success*/; - return true; } } retryElseSuccess = false; - return false; } } diff --git a/FabricObserver/Observers/AppObserver.cs b/FabricObserver/Observers/AppObserver.cs index d24beb04..13c3c003 100644 --- a/FabricObserver/Observers/AppObserver.cs +++ b/FabricObserver/Observers/AppObserver.cs @@ -8,6 +8,7 @@ 
using System.ComponentModel; using System.Diagnostics; using System.Fabric; +using System.Fabric.Description; using System.Fabric.Health; using System.Fabric.Query; using System.IO; @@ -87,10 +88,10 @@ public override async Task ObserveAsync(CancellationToken token) if (!initialized) { HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.OriginalString, - ObserverName, - HealthState.Warning, - "This observer was unable to initialize correctly due to missing configuration info."); + FabricServiceContext.ServiceName.OriginalString, + ObserverName, + HealthState.Warning, + "This observer was unable to initialize correctly due to missing configuration info."); stopwatch.Stop(); stopwatch.Reset(); @@ -312,9 +313,8 @@ private async Task InitializeAsync() ConfigurationSectionName, "AppObserverDataFileName"); - var appObserverConfigFileName = Path.Combine( - ConfigPackagePath ?? string.Empty, - configSettings.AppObserverConfigFileName ?? string.Empty); + // Unit tests may have null path and filename, thus the null equivalence operations. + var appObserverConfigFileName = Path.Combine(ConfigPackagePath ?? string.Empty, configSettings.AppObserverConfigFileName ?? string.Empty); if (!File.Exists(appObserverConfigFileName)) { @@ -366,14 +366,39 @@ private async Task InitializeAsync() { ApplicationInfo application = userTargetList.Find(app => app.TargetApp?.ToLower() == "all" || app.TargetApp == "*"); - // TODO: This should be paged for cases where a node has hundreds of apps. - var appList = await FabricClientInstance.QueryManager.GetDeployedApplicationListAsync( - NodeName, - null, + // Let's make sure that we page through app lists that are huge (like 4MB result set (that's a lot of apps)). 
+ var deployedAppQueryDesc = new PagedDeployedApplicationQueryDescription(NodeName) + { + IncludeHealthState = false, + MaxResults = 150, + }; + + var appList = await FabricClientInstance.QueryManager.GetDeployedApplicationPagedListAsync( + deployedAppQueryDesc, + ConfigurationSettings.AsyncTimeout, + Token).ConfigureAwait(false); + + // DeployedApplicationList is a wrapper around List, but does not support AddRange.. Thus, cast it ToList and add to the temp list, then iterate through it. + // In reality, this list will never be greater than, say, 1000 apps deployed to a node, but it's a good idea to be prepared since AppObserver supports + // all-app service process monitoring with a very simple configuration pattern. + var apps = appList.ToList(); + + // The GetDeployedApplicationPagedList api will set a continuation token value if it knows it did not return all the results in one swoop. + // Check that it is not null, and make a new query passing back the token it gave you. + while (appList.ContinuationToken != null) + { + Token.ThrowIfCancellationRequested(); + + deployedAppQueryDesc.ContinuationToken = appList.ContinuationToken; + + appList = await FabricClientInstance.QueryManager.GetDeployedApplicationPagedListAsync( + deployedAppQueryDesc, ConfigurationSettings.AsyncTimeout, Token).ConfigureAwait(false); + apps.AddRange(appList.ToList()); + } - foreach (var app in appList) + foreach (var app in apps) { Token.ThrowIfCancellationRequested(); @@ -452,6 +477,8 @@ private async Task InitializeAsync() // Remove the All or * config item. 
userTargetList.Remove(application); + apps.Clear(); + apps = null; } int settingSFail = 0; diff --git a/FabricObserver/Observers/NetworkObserver.cs b/FabricObserver/Observers/NetworkObserver.cs index 990827cd..0f20dacf 100644 --- a/FabricObserver/Observers/NetworkObserver.cs +++ b/FabricObserver/Observers/NetworkObserver.cs @@ -109,10 +109,7 @@ public override async Task ObserveAsync(CancellationToken token) stopwatch.Start(); // Run conn tests. - Retry.Do( - InternetConnectionStateIsConnected, - TimeSpan.FromSeconds(10), - token); + Retry.Do(InternetConnectionStateIsConnected, TimeSpan.FromSeconds(10), token); await ReportAsync(token).ConfigureAwait(true); @@ -161,6 +158,7 @@ public override Task ReportAsync(CancellationToken token) ObserverName = ObserverName, Metric = ErrorWarningProperty.InternetConnectionFailure, NodeName = NodeName, + Source = ObserverConstants.FabricObserverName, }; if (IsTelemetryEnabled) @@ -203,6 +201,7 @@ public override Task ReportAsync(CancellationToken token) ObserverName, Metric = ErrorWarningProperty.InternetConnectionFailure, NodeName, + Source = ObserverConstants.FabricObserverName, }); } } @@ -245,6 +244,7 @@ public override Task ReportAsync(CancellationToken token) ObserverName = ObserverName, Metric = "Internet Connection State", NodeName = NodeName, + Source = ObserverConstants.FabricObserverName, }; _ = TelemetryClient?.ReportHealthAsync(telemetryData, Token); @@ -264,6 +264,7 @@ public override Task ReportAsync(CancellationToken token) ObserverName, Metric = "Internet Connection State", NodeName, + Source = ObserverConstants.FabricObserverName, }); } @@ -293,16 +294,15 @@ private static string GetNetworkInterfaceInfo(CancellationToken token) return string.Empty; } - var interfaceInfo = new StringBuilder( - $"Network Interface information for {iPGlobalProperties.HostName}:\n "); + var interfaceInfo = new StringBuilder($"Network Interface information for {iPGlobalProperties.HostName}:{Environment.NewLine} "); foreach (var nic 
in nics) { token.ThrowIfCancellationRequested(); - _ = interfaceInfo.Append("\n" + nic.Description + "\n"); - _ = interfaceInfo.AppendFormat(" Interface type : {0}\n", nic.NetworkInterfaceType); - _ = interfaceInfo.AppendFormat(" Operational status: {0}\n", nic.OperationalStatus); + _ = interfaceInfo.Append($"{Environment.NewLine}{nic.Description}{Environment.NewLine}"); + _ = interfaceInfo.Append($" Interface type : {nic.NetworkInterfaceType}{Environment.NewLine}"); + _ = interfaceInfo.Append($" Operational status: {nic.OperationalStatus}{Environment.NewLine}"); // Traffic. if (nic.OperationalStatus != OperationalStatus.Up) @@ -314,10 +314,10 @@ private static string GetNetworkInterfaceInfo(CancellationToken token) var stats = nic.GetIPv4Statistics(); - _ = interfaceInfo.AppendFormat(" Bytes received: {0}\n", stats.BytesReceived); - _ = interfaceInfo.AppendFormat(" Bytes sent: {0}\n", stats.BytesSent); - _ = interfaceInfo.AppendFormat(" Incoming Packets With Errors: {0}\n", stats.IncomingPacketsWithErrors); - _ = interfaceInfo.AppendFormat(" Outgoing Packets With Errors: {0}\n", stats.OutgoingPacketsWithErrors); + _ = interfaceInfo.Append($" Bytes received: {stats.BytesReceived}{Environment.NewLine}"); + _ = interfaceInfo.Append($" Bytes sent: {stats.BytesSent}{Environment.NewLine}"); + _ = interfaceInfo.Append($" Incoming Packets With Errors: {stats.IncomingPacketsWithErrors}{Environment.NewLine}"); + _ = interfaceInfo.Append($" Outgoing Packets With Errors: {stats.OutgoingPacketsWithErrors}{Environment.NewLine}"); _ = interfaceInfo.AppendLine(); } @@ -334,11 +334,6 @@ private static string GetNetworkInterfaceInfo(CancellationToken token) private async Task InitializeAsync() { - WriteToLogWithLevel( - ObserverName, - $"Initializing {ObserverName} for network monitoring. | {NodeName}", - LogLevel.Information); - cancellationToken.ThrowIfCancellationRequested(); // This only needs to be logged once. 
@@ -352,54 +347,38 @@ private async Task InitializeAsync() if (!ObserverLogger.TryWriteLogFile(logPath, GetNetworkInterfaceInfo(cancellationToken))) { HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.OriginalString, - ObserverName, - HealthState.Warning, - "Unable to create NetInfo.txt file."); + FabricServiceContext.ServiceName.OriginalString, + ObserverName, + HealthState.Warning, + "Unable to create NetInfo.txt file."); } } - var settings = - FabricServiceContext.CodePackageActivationContext.GetConfigurationPackageObject( - ObserverConstants.ObserverConfigurationPackageName)?.Settings; - - configSettings.Initialize( - settings, - ConfigurationSectionName, - "NetworkObserverDataFileName"); - - var networkObserverConfigFileName = - Path.Combine(dataPackagePath ?? string.Empty, configSettings.NetworkObserverConfigFileName); + var settings = FabricServiceContext.CodePackageActivationContext.GetConfigurationPackageObject(ObserverConstants.ObserverConfigurationPackageName)?.Settings; + configSettings.Initialize(settings, ConfigurationSectionName, "NetworkObserverDataFileName"); + var networkObserverConfigFileName = Path.Combine(dataPackagePath ?? string.Empty, configSettings.NetworkObserverConfigFileName); if (string.IsNullOrWhiteSpace(networkObserverConfigFileName)) { ObserverLogger.LogWarning("NetworkObserver configuration file path not specified. Exiting."); - return false; } if (!File.Exists(networkObserverConfigFileName)) { ObserverLogger.LogWarning("NetworkObserver configuration file not found. 
Exiting."); - return false; } if (userConfig.Count == 0) { - using (Stream stream = new FileStream( - networkObserverConfigFileName, - FileMode.Open, - FileAccess.Read, - FileShare.Read)) + using (Stream stream = new FileStream(networkObserverConfigFileName, FileMode.Open, FileAccess.Read, FileShare.Read)) { var configs = JsonHelper.ReadFromJsonStream(stream); foreach (var netConfig in configs) { - var deployedApps = await FabricClientInstance.QueryManager.GetDeployedApplicationListAsync( - NodeName, - new Uri(netConfig.TargetApp)).ConfigureAwait(false); + var deployedApps = await FabricClientInstance.QueryManager.GetDeployedApplicationListAsync(NodeName, new Uri(netConfig.TargetApp)).ConfigureAwait(false); if (deployedApps == null || deployedApps.Count < 1) { @@ -495,8 +474,7 @@ private void InternetConnectionStateIsConnected() } catch (IOException ie) { - if (ie.InnerException != null - && ie.InnerException is ProtocolViolationException) + if (ie.InnerException != null && ie.InnerException is ProtocolViolationException) { passed = true; } @@ -527,10 +505,10 @@ private void InternetConnectionStateIsConnected() catch (Exception e) { HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.OriginalString, - ObserverName, - HealthState.Warning, - e.ToString()); + FabricServiceContext.ServiceName.OriginalString, + ObserverName, + HealthState.Warning, + e.ToString()); // Fix the bug.. 
throw; @@ -582,8 +560,7 @@ private bool TcpEndpointDoConnectionTest(string hostName, int port) { var se = ie.InnerException as SocketException; - if (se.SocketErrorCode == SocketError.ConnectionRefused - || se.SocketErrorCode == SocketError.ConnectionReset) + if (se.SocketErrorCode == SocketError.ConnectionRefused || se.SocketErrorCode == SocketError.ConnectionReset) { if (tcpConnTestRetried <= MaxTcpConnTestRetries) { @@ -600,8 +577,7 @@ private bool TcpEndpointDoConnectionTest(string hostName, int port) } catch (SocketException se) { - if (se.SocketErrorCode == SocketError.ConnectionRefused - || se.SocketErrorCode == SocketError.ConnectionReset) + if (se.SocketErrorCode == SocketError.ConnectionRefused || se.SocketErrorCode == SocketError.ConnectionReset) { if (tcpConnTestRetried < MaxTcpConnTestRetries) { @@ -629,47 +605,43 @@ private void SetHealthState(Endpoint endpoint, string targetApp, bool passed) { if (passed) { - if (healthState == HealthState.Warning && - connectionStatus.Any(conn => conn.HostName == endpoint.HostName && - conn.Health == HealthState.Warning)) + if (healthState == HealthState.Warning && connectionStatus.Any(conn => conn.HostName == endpoint.HostName && conn.Health == HealthState.Warning)) { _ = connectionStatus.RemoveAll(conn => conn.HostName == endpoint.HostName); connectionStatus.Add( - new ConnectionState - { - HostName = endpoint.HostName, - Connected = true, - Health = HealthState.Warning, - TargetApp = targetApp, - }); + new ConnectionState + { + HostName = endpoint.HostName, + Connected = true, + Health = HealthState.Warning, + TargetApp = targetApp, + }); } else { connectionStatus.Add( - new ConnectionState - { - HostName = endpoint.HostName, - Connected = true, - Health = HealthState.Ok, - TargetApp = targetApp, - }); + new ConnectionState + { + HostName = endpoint.HostName, + Connected = true, + Health = HealthState.Ok, + TargetApp = targetApp, + }); } } else { - if (!connectionStatus.Any(conn => conn.HostName == 
endpoint.HostName && - conn.TargetApp == targetApp && - conn.Health == HealthState.Warning)) + if (!connectionStatus.Any(conn => conn.HostName == endpoint.HostName && conn.TargetApp == targetApp && conn.Health == HealthState.Warning)) { connectionStatus.Add( - new ConnectionState - { - HostName = endpoint.HostName, - Connected = false, - Health = HealthState.Warning, - TargetApp = targetApp, - }); + new ConnectionState + { + HostName = endpoint.HostName, + Connected = false, + Health = HealthState.Warning, + TargetApp = targetApp, + }); if (!AppNames.Contains(targetApp)) { From ff61a1cb8bb3a1bbf6fef9bdc75db8ae60840934 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Tue, 13 Apr 2021 10:15:47 -0700 Subject: [PATCH 07/20] Formatting --- FabricObserver.Extensibility/ObserverBase.cs | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/FabricObserver.Extensibility/ObserverBase.cs b/FabricObserver.Extensibility/ObserverBase.cs index 61c8f909..f93c2a32 100644 --- a/FabricObserver.Extensibility/ObserverBase.cs +++ b/FabricObserver.Extensibility/ObserverBase.cs @@ -1189,7 +1189,7 @@ private void SetObserverConfiguration() IsEtwProviderEnabled = etwProviderEnabled; } - // (Assuming Diagnostics/Analytics cloud service implemented) Telemetry. + // Telemetry. 
if (bool.TryParse(GetSettingParameterValue(ObserverConstants.ObserverManagerConfigurationSectionName, ObserverConstants.TelemetryEnabled), out bool telemEnabled)) { IsTelemetryProviderEnabled = telemEnabled; @@ -1226,11 +1226,9 @@ private void SetObserverConfiguration() string logAnalyticsWorkspaceId = GetSettingParameterValue(ObserverConstants.ObserverManagerConfigurationSectionName, ObserverConstants.LogAnalyticsWorkspaceIdParameter); - if (string.IsNullOrEmpty(logAnalyticsWorkspaceId) - || string.IsNullOrEmpty(logAnalyticsSharedKey)) + if (string.IsNullOrEmpty(logAnalyticsWorkspaceId) || string.IsNullOrEmpty(logAnalyticsSharedKey)) { IsTelemetryProviderEnabled = false; - return; } @@ -1250,7 +1248,6 @@ private void SetObserverConfiguration() if (string.IsNullOrEmpty(aiKey)) { IsTelemetryProviderEnabled = false; - return; } @@ -1268,8 +1265,6 @@ private void SetObserverConfiguration() private void InitializeCsvLogger() { - // This could be called from app paramter-only update handler. - // You can turn CSV data logging on and off with app parameter updates for 3 observers: AppObserver, FabricSystemObserver and NodeObserver. if (CsvFileLogger != null) { return; @@ -1314,6 +1309,7 @@ private bool IsObserverWebApiAppInstalled() } catch (Exception e) when (e is FabricException || e is TimeoutException) { + } return false; From 4fe6a7cc4627c2f131c398af90bfee12517cd5e5 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Tue, 13 Apr 2021 13:40:28 -0700 Subject: [PATCH 08/20] Add: Windows port monitoring retry logic. Function rename. 
--- .../OperatingSystemInfo/LinuxInfoProvider.cs | 2 +- .../OperatingSystemInfoProvider.cs | 2 +- .../WindowsInfoProvider.cs | 400 ++++++++++-------- .../Utilities/Retry.cs | 12 +- FabricObserver/Observers/AppObserver.cs | 2 +- .../Observers/FabricSystemObserver.cs | 2 +- FabricObserver/Observers/NodeObserver.cs | 2 +- FabricObserver/Observers/OSObserver.cs | 2 +- .../Observers/SFConfigurationObserver.cs | 2 +- FabricObserverTests/ObserverTest.cs | 5 +- 10 files changed, 233 insertions(+), 198 deletions(-) diff --git a/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/LinuxInfoProvider.cs b/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/LinuxInfoProvider.cs index 71849c33..6a11a38d 100644 --- a/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/LinuxInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/LinuxInfoProvider.cs @@ -30,7 +30,7 @@ public override (long TotalMemory, double PercentInUse) TupleGetTotalPhysicalMem return (totalMem, Math.Round(pctUsed, 2)); } - public override int GetActivePortCount(int processId = -1, ServiceContext context = null) + public override int GetActiveTcpPortCount(int processId = -1, ServiceContext context = null) { int count = GetPortCount(processId, predicate: (line) => true, context); return count; diff --git a/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/OperatingSystemInfoProvider.cs b/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/OperatingSystemInfoProvider.cs index eda5b721..9c7cde60 100644 --- a/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/OperatingSystemInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/OperatingSystemInfoProvider.cs @@ -53,7 +53,7 @@ protected Logger Logger public abstract (long TotalMemory, double PercentInUse) TupleGetTotalPhysicalMemorySizeAndPercentInUse(); - public abstract int GetActivePortCount(int processId = -1, ServiceContext context = null); + public abstract int 
GetActiveTcpPortCount(int processId = -1, ServiceContext context = null); public abstract int GetActiveEphemeralPortCount(int processId = -1, ServiceContext context = null); diff --git a/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/WindowsInfoProvider.cs b/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/WindowsInfoProvider.cs index 53d19cef..7ae15ecd 100644 --- a/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/WindowsInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/WindowsInfoProvider.cs @@ -78,13 +78,205 @@ e is FormatException return (-1L, -1); } + public override (int LowPort, int HighPort) TupleGetDynamicPortRange() + { + using (var p = new Process()) + { + try + { + var ps = new ProcessStartInfo + { + Arguments = $"/c netsh int ipv4 show dynamicportrange {TcpProtocol} | find /i \"port\"", + FileName = $"{Environment.GetFolderPath(Environment.SpecialFolder.System)}\\cmd.exe", + UseShellExecute = false, + WindowStyle = ProcessWindowStyle.Hidden, + RedirectStandardInput = true, + RedirectStandardOutput = true, + }; + + p.StartInfo = ps; + _ = p.Start(); + + var stdOutput = p.StandardOutput; + string output = stdOutput.ReadToEnd(); + Match match = Regex.Match( + output, + @"Start Port\s+:\s+(?\d+).+?Number of Ports\s+:\s+(?\d+)", + RegexOptions.Singleline | RegexOptions.IgnoreCase); + + string startPort = match.Groups["startPort"].Value; + string portCount = match.Groups["numberOfPorts"].Value; + string exitStatus = p.ExitCode.ToString(); + stdOutput.Close(); + + if (exitStatus != "0") + { + return (-1, -1); + } + + int lowPortRange = int.Parse(startPort); + int highPortRange = lowPortRange + int.Parse(portCount); + + return (lowPortRange, highPortRange); + } + catch (Exception e) when ( + e is ArgumentException + || e is IOException + || e is InvalidOperationException + || e is RegexMatchTimeoutException + || e is Win32Exception) + { + } + } + + return (-1, -1); + } + /// - /// Compute count of 
active ports in dynamic range. + /// Compute count of active TCP ports in dynamic range. /// /// Optional: If supplied, then return the number of ephemeral ports in use by the process. - /// Optional (this is used by Linux callers only): If supplied, will use the ServiceContext to find the Linux Capabilities binary to run this command. - /// + /// Optional (this is used by Linux callers only - see LinuxInfoProvider.cs): + /// If supplied, will use the ServiceContext to find the Linux Capabilities binary to run this command. + /// number of active Epehemeral TCP ports as int value public override int GetActiveEphemeralPortCount(int processId = -1, ServiceContext context = null) + { + int count; + try + { + count = Retry.Do(() => GetEphemeralPortCount(processId), TimeSpan.FromSeconds(5), CancellationToken.None); + } + catch (AggregateException ae) + { + Logger.LogWarning($"Retry failed for GetActiveEphemeralPortCount:{Environment.NewLine}{ae.InnerException}"); + count = -1; + } + + return count; + } + + /// + /// Compute count of active TCP ports. + /// + /// Optional: If supplied, then return the number of tcp ports in use by the process. + /// Optional (this is used by Linux callers only - see LinuxInfoProvider.cs): If supplied, will use the ServiceContext to find the Linux Capabilities binary to run this command. 
+ /// number of active TCP ports as int value + public override int GetActiveTcpPortCount(int processId = -1, ServiceContext context = null) + { + int count; + try + { + count = Retry.Do(() => GetTcpPortCount(processId), TimeSpan.FromSeconds(5), CancellationToken.None); + } + catch (AggregateException ae) + { + Logger.LogWarning($"Retry failed for GetActivePortCount:{Environment.NewLine}{ae.InnerException}"); + count = -1; + } + + return count; + } + + public override Task GetOSInfoAsync(CancellationToken cancellationToken) + { + ManagementObjectSearcher win32OsInfo = null; + ManagementObjectCollection results = null; + + OSInfo osInfo = default; + + try + { + win32OsInfo = new ManagementObjectSearcher("SELECT Caption,Version,Status,OSLanguage,NumberOfProcesses,FreePhysicalMemory,FreeVirtualMemory,TotalVirtualMemorySize,TotalVisibleMemorySize,InstallDate,LastBootUpTime FROM Win32_OperatingSystem"); + results = win32OsInfo.Get(); + + foreach (var prop in results) + { + cancellationToken.ThrowIfCancellationRequested(); + + foreach (var p in prop.Properties) + { + cancellationToken.ThrowIfCancellationRequested(); + + string name = p.Name; + string value = p.Value.ToString(); + + if (string.IsNullOrEmpty(name) || string.IsNullOrEmpty(value)) + { + continue; + } + + switch (name.ToLowerInvariant()) + { + case "caption": + osInfo.Name = value; + break; + case "numberofprocesses": + if (int.TryParse(value, out int numProcesses)) + { + osInfo.NumberOfProcesses = numProcesses; + } + else + { + osInfo.NumberOfProcesses = -1; + } + + break; + case "status": + osInfo.Status = value; + break; + case "oslanguage": + osInfo.Language = value; + break; + case "version": + osInfo.Version = value; + break; + case "installdate": + osInfo.InstallDate = ManagementDateTimeConverter.ToDateTime(value).ToUniversalTime().ToString("o"); + break; + case "lastbootuptime": + osInfo.LastBootUpTime = ManagementDateTimeConverter.ToDateTime(value).ToUniversalTime().ToString("o"); + break; + case 
"freephysicalmemory": + osInfo.AvailableMemoryKB = ulong.Parse(value); + break; + case "freevirtualmemory": + osInfo.FreeVirtualMemoryKB = ulong.Parse(value); + break; + case "totalvirtualmemorysize": + osInfo.TotalVirtualMemorySizeKB = ulong.Parse(value); + break; + case "totalvisiblememorysize": + osInfo.TotalVisibleMemorySizeKB = ulong.Parse(value); + break; + } + } + } + } + catch (ManagementException) + { + } + finally + { + results?.Dispose(); + win32OsInfo?.Dispose(); + } + + return Task.FromResult(osInfo); + } + + // Not implemented. No Windows support. + public override int GetMaximumConfiguredFileHandlesCount() + { + return -1; + } + + // Not implemented. No Windows support. + public override int GetTotalAllocatedFileHandlesCount() + { + return -1; + } + + private int GetEphemeralPortCount(int processId = -1) { try { @@ -113,7 +305,7 @@ public override int GetActiveEphemeralPortCount(int processId = -1, ServiceConte p.StartInfo = ps; _ = p.Start(); var stdOutput = p.StandardOutput; - + (int lowPortRange, int highPortRange) = TupleGetDynamicPortRange(); string portRow; while ((portRow = stdOutput.ReadLine()) != null) @@ -169,77 +361,26 @@ public override int GetActiveEphemeralPortCount(int processId = -1, ServiceConte if (exitStatus != 0) { - return -1; - } - } - - return count; - } - catch (Exception e) when ( - e is ArgumentException - || e is InvalidOperationException - || e is Win32Exception) - { - } - - return -1; - } - - public override (int LowPort, int HighPort) TupleGetDynamicPortRange() - { - using (var p = new Process()) - { - try - { - var ps = new ProcessStartInfo - { - Arguments = $"/c netsh int ipv4 show dynamicportrange {TcpProtocol} | find /i \"port\"", - FileName = $"{Environment.GetFolderPath(Environment.SpecialFolder.System)}\\cmd.exe", - UseShellExecute = false, - WindowStyle = ProcessWindowStyle.Hidden, - RedirectStandardInput = true, - RedirectStandardOutput = true, - }; - - p.StartInfo = ps; - _ = p.Start(); - - var stdOutput = 
p.StandardOutput; - string output = stdOutput.ReadToEnd(); - Match match = Regex.Match( - output, - @"Start Port\s+:\s+(?\d+).+?Number of Ports\s+:\s+(?\d+)", - RegexOptions.Singleline | RegexOptions.IgnoreCase); - - string startPort = match.Groups["startPort"].Value; - string portCount = match.Groups["numberOfPorts"].Value; - string exitStatus = p.ExitCode.ToString(); - stdOutput.Close(); + string msg = $"netstat failure: {exitStatus}"; + Logger.LogWarning(msg); - if (exitStatus != "0") - { - return (-1, -1); + // this will be handled by Retry.Do(). + throw new Exception(msg); } - int lowPortRange = int.Parse(startPort); - int highPortRange = lowPortRange + int.Parse(portCount); - - return (lowPortRange, highPortRange); - } - catch (Exception e) when ( - e is ArgumentException - || e is IOException - || e is InvalidOperationException - || e is RegexMatchTimeoutException - || e is Win32Exception) - { + return count; } } - - return (-1, -1); + catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is Win32Exception) + { + Logger.LogWarning($"Handled Exception in GetEphemeralPortCount:{Environment.NewLine}{e}"); + + // This will be handled by Retry.Do(). 
+ throw; + } } - public override int GetActivePortCount(int processId = -1, ServiceContext context = null) + private int GetTcpPortCount(int processId = -1) { try { @@ -283,7 +424,7 @@ public override int GetActivePortCount(int processId = -1, ServiceContext contex if (processId > 0) { List stats = portRow.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries).ToList(); - + if (stats.Count != 5 || !int.TryParse(stats[4], out int pidPart)) { continue; @@ -310,114 +451,29 @@ public override int GetActivePortCount(int processId = -1, ServiceContext contex output = tempLocalPortData.Count; p.WaitForExit(); - string exitStatus = p.ExitCode.ToString(); + int exitStatus = p.ExitCode; stdOutput.Close(); tempLocalPortData.Clear(); - if (exitStatus != "0") + if (exitStatus != 0) { - return -1; - } + string msg = $"netstat failure: {exitStatus}"; + Logger.LogWarning(msg); + // this will be handled by Retry.Do(). + throw new Exception(msg); + } + return output; } } - catch (Exception e) when ( - e is ArgumentException - || e is InvalidOperationException - || e is Win32Exception) - { - - } - - return -1; - } - - public override Task GetOSInfoAsync(CancellationToken cancellationToken) - { - ManagementObjectSearcher win32OsInfo = null; - ManagementObjectCollection results = null; - - OSInfo osInfo = default; - - try + catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is Win32Exception) { - win32OsInfo = new ManagementObjectSearcher("SELECT Caption,Version,Status,OSLanguage,NumberOfProcesses,FreePhysicalMemory,FreeVirtualMemory,TotalVirtualMemorySize,TotalVisibleMemorySize,InstallDate,LastBootUpTime FROM Win32_OperatingSystem"); - results = win32OsInfo.Get(); - - foreach (var prop in results) - { - cancellationToken.ThrowIfCancellationRequested(); - - foreach (var p in prop.Properties) - { - cancellationToken.ThrowIfCancellationRequested(); - - string name = p.Name; - string value = p.Value.ToString(); + Logger.LogWarning($"Handled 
Exception in GetTcpPortCount:{Environment.NewLine}{e}"); - if (string.IsNullOrEmpty(name) || string.IsNullOrEmpty(value)) - { - continue; - } - - switch (name.ToLowerInvariant()) - { - case "caption": - osInfo.Name = value; - break; - case "numberofprocesses": - if (int.TryParse(value, out int numProcesses)) - { - osInfo.NumberOfProcesses = numProcesses; - } - else - { - osInfo.NumberOfProcesses = -1; - } - - break; - case "status": - osInfo.Status = value; - break; - case "oslanguage": - osInfo.Language = value; - break; - case "version": - osInfo.Version = value; - break; - case "installdate": - osInfo.InstallDate = ManagementDateTimeConverter.ToDateTime(value).ToUniversalTime().ToString("o"); - break; - case "lastbootuptime": - osInfo.LastBootUpTime = ManagementDateTimeConverter.ToDateTime(value).ToUniversalTime().ToString("o"); - break; - case "freephysicalmemory": - osInfo.AvailableMemoryKB = ulong.Parse(value); - break; - case "freevirtualmemory": - osInfo.FreeVirtualMemoryKB = ulong.Parse(value); - break; - case "totalvirtualmemorysize": - osInfo.TotalVirtualMemorySizeKB = ulong.Parse(value); - break; - case "totalvisiblememorysize": - osInfo.TotalVisibleMemorySizeKB = ulong.Parse(value); - break; - } - } - } + // This will be handled by Retry.Do(). + throw; } - catch (ManagementException) - { - } - finally - { - results?.Dispose(); - win32OsInfo?.Dispose(); - } - - return Task.FromResult(osInfo); } private int GetLocalPortFromConsoleOutputRow(string portRow) @@ -446,17 +502,5 @@ private int GetLocalPortFromConsoleOutputRow(string portRow) return int.Parse(localPort); } - - // Not implemented. No Windows support. - public override int GetMaximumConfiguredFileHandlesCount() - { - return -1; - } - - // Not implemented. No Windows support. 
- public override int GetTotalAllocatedFileHandlesCount() - { - return -1; - } } } diff --git a/FabricObserver.Extensibility/Utilities/Retry.cs b/FabricObserver.Extensibility/Utilities/Retry.cs index 08c645ab..ac825b71 100644 --- a/FabricObserver.Extensibility/Utilities/Retry.cs +++ b/FabricObserver.Extensibility/Utilities/Retry.cs @@ -12,11 +12,7 @@ namespace FabricObserver.Observers.Utilities // https://stackoverflow.com/questions/1563191/cleanest-way-to-write-retry-logic public static class Retry { - public static void Do( - Action action, - TimeSpan retryInterval, - CancellationToken token, - int maxAttempts = 3) + public static void Do(Action action, TimeSpan retryInterval, CancellationToken token, int maxAttempts = 3) { _ = Do( () => @@ -30,11 +26,7 @@ public static void Do( maxAttempts); } - public static T Do( - Func action, - TimeSpan retryInterval, - CancellationToken token, - int maxAttemptCount = 3) + public static T Do(Func action, TimeSpan retryInterval, CancellationToken token, int maxAttemptCount = 3) { var exceptions = new List(); diff --git a/FabricObserver/Observers/AppObserver.cs b/FabricObserver/Observers/AppObserver.cs index 13c3c003..832795f0 100644 --- a/FabricObserver/Observers/AppObserver.cs +++ b/FabricObserver/Observers/AppObserver.cs @@ -636,7 +636,7 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) // Measure Total and Ephemeral ports. 
if (checkAllPorts) { - AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == id).Data.Add(OperatingSystemInfoProvider.Instance.GetActivePortCount(currentProcess.Id, FabricServiceContext)); + AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == id).Data.Add(OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(currentProcess.Id, FabricServiceContext)); } if (checkEphemeralPorts) diff --git a/FabricObserver/Observers/FabricSystemObserver.cs b/FabricObserver/Observers/FabricSystemObserver.cs index e3a844ad..3629ee58 100644 --- a/FabricObserver/Observers/FabricSystemObserver.cs +++ b/FabricObserver/Observers/FabricSystemObserver.cs @@ -796,7 +796,7 @@ private async Task GetProcessInfoAsync(string procName) Token.ThrowIfCancellationRequested(); // Ports - Active TCP All - int activePortCount = OperatingSystemInfoProvider.Instance.GetActivePortCount(process.Id, FabricServiceContext); + int activePortCount = OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(process.Id, FabricServiceContext); // This is used for info report. TotalActivePortCountAllSystemServices += activePortCount; diff --git a/FabricObserver/Observers/NodeObserver.cs b/FabricObserver/Observers/NodeObserver.cs index 6e8aa946..13565f45 100644 --- a/FabricObserver/Observers/NodeObserver.cs +++ b/FabricObserver/Observers/NodeObserver.cs @@ -631,7 +631,7 @@ private async Task GetSystemCpuMemoryValuesAsync(CancellationToken token) // Ports. 
if (ActivePortsData != null && (ActivePortsErrorThreshold > 0 || ActivePortsWarningThreshold > 0)) { - int activePortCountTotal = OperatingSystemInfoProvider.Instance.GetActivePortCount(); + int activePortCountTotal = OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(); ActivePortsData.Data.Add(activePortCountTotal); } diff --git a/FabricObserver/Observers/OSObserver.cs b/FabricObserver/Observers/OSObserver.cs index 4ef15dd4..d3f52817 100644 --- a/FabricObserver/Observers/OSObserver.cs +++ b/FabricObserver/Observers/OSObserver.cs @@ -373,7 +373,7 @@ private async Task GetComputerInfoAsync(CancellationToken token) osStatus = osInfo.Status; // Active, bound ports. - int activePorts = OperatingSystemInfoProvider.Instance.GetActivePortCount(); + int activePorts = OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(); // Active, ephemeral ports. int activeEphemeralPorts = OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(); diff --git a/FabricObserver/Observers/SFConfigurationObserver.cs b/FabricObserver/Observers/SFConfigurationObserver.cs index b7ce261f..1e38f839 100644 --- a/FabricObserver/Observers/SFConfigurationObserver.cs +++ b/FabricObserver/Observers/SFConfigurationObserver.cs @@ -333,7 +333,7 @@ private async Task GetDeployedAppsInfoAsync(CancellationToken token) if (procId > -1) { - ports = OperatingSystemInfoProvider.Instance.GetActivePortCount(procId); + ports = OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(procId); ephemeralPorts = OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(procId); } diff --git a/FabricObserverTests/ObserverTest.cs b/FabricObserverTests/ObserverTest.cs index d2a3a6eb..d8be24c3 100644 --- a/FabricObserverTests/ObserverTest.cs +++ b/FabricObserverTests/ObserverTest.cs @@ -1113,8 +1113,6 @@ public async Task NetworkObserver_ObserveAsync_Successful_Observer_WritesLocalFi string outputFilePath = Path.Combine(Environment.CurrentDirectory, "observer_logs", "NetInfo.txt"); - 
Console.WriteLine($"outputFilePath: {outputFilePath}"); - // Output log file was created successfully during test. Assert.IsTrue(File.Exists(outputFilePath) && File.GetLastWriteTime(outputFilePath) > startDateTime @@ -1151,6 +1149,7 @@ public async Task NodeObserver_ObserveAsync_Successful_Observer_IsHealthy_Warnin UseCircularBuffer = true, CpuWarningUsageThresholdPct = 10, MemWarningUsageThresholdMb = 1, // This will generate Warning for sure. + ActivePortsWarningThreshold = 100, // This will generate Warning for sure. }; var obsMgr = new ObserverManager(obs, fabricClient); @@ -1171,7 +1170,7 @@ public async Task NodeObserver_ObserveAsync_Successful_Observer_IsHealthy_Warnin Assert.IsFalse(obs.HasActiveFabricErrorOrWarning); obs.Dispose(); - await CleanupTestHealthReportsAsync(); + //await CleanupTestHealthReportsAsync(); } /// From f3b283d616b0b2903cd71c1fc7b6ae6913971cea Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Tue, 13 Apr 2021 16:42:57 -0700 Subject: [PATCH 09/20] Foramtting. Logging. --- .../Utilities/ObserverHealthReporter.cs | 6 ++---- .../WindowsInfoProvider.cs | 21 ++++++++++--------- FabricObserver/Observers/NetworkObserver.cs | 3 +++ 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/FabricObserver.Extensibility/Utilities/ObserverHealthReporter.cs b/FabricObserver.Extensibility/Utilities/ObserverHealthReporter.cs index 437f4ceb..f41e3ac0 100644 --- a/FabricObserver.Extensibility/Utilities/ObserverHealthReporter.cs +++ b/FabricObserver.Extensibility/Utilities/ObserverHealthReporter.cs @@ -86,16 +86,14 @@ public void ReportHealthToServiceFabric(HealthReport healthReport) string errWarnPreamble = string.Empty; - if (healthReport.State == HealthState.Error - || healthReport.State == HealthState.Warning) + if (healthReport.State == HealthState.Error || healthReport.State == HealthState.Warning) { errWarnPreamble = $"{healthReport.Observer} detected " + $"{Enum.GetName(typeof(HealthState), healthReport.State)} threshold breach. 
"; // OSObserver does not monitor resources and therefore does not support related usage threshold configuration. - if (healthReport.Observer == ObserverConstants.OSObserverName - && healthReport.Property == "OSConfiguration") + if (healthReport.Observer == ObserverConstants.OSObserverName && healthReport.Property == "OSConfiguration") { errWarnPreamble = $"{ObserverConstants.OSObserverName} detected potential problem with OS configuration: "; } diff --git a/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/WindowsInfoProvider.cs b/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/WindowsInfoProvider.cs index 7ae15ecd..6d1aa25c 100644 --- a/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/WindowsInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/WindowsInfoProvider.cs @@ -63,11 +63,9 @@ public override (long TotalMemory, double PercentInUse) TupleGetTotalPhysicalMem return (visibleTotal / 1024 / 1024, Math.Round(usedPct, 2)); } } - catch (Exception e) when ( - e is FormatException - || e is InvalidCastException - || e is ManagementException) + catch (Exception e) when (e is FormatException || e is InvalidCastException || e is ManagementException) { + Logger.LogWarning($"Handled failure in TupleGetTotalPhysicalMemorySizeAndPercentInUse:{Environment.NewLine}{e}"); } finally { @@ -100,9 +98,9 @@ public override (int LowPort, int HighPort) TupleGetDynamicPortRange() var stdOutput = p.StandardOutput; string output = stdOutput.ReadToEnd(); Match match = Regex.Match( - output, - @"Start Port\s+:\s+(?\d+).+?Number of Ports\s+:\s+(?\d+)", - RegexOptions.Singleline | RegexOptions.IgnoreCase); + output, + @"Start Port\s+:\s+(?\d+).+?Number of Ports\s+:\s+(?\d+)", + RegexOptions.Singleline | RegexOptions.IgnoreCase); string startPort = match.Groups["startPort"].Value; string portCount = match.Groups["numberOfPorts"].Value; @@ -142,9 +140,10 @@ e is ArgumentException public override int GetActiveEphemeralPortCount(int 
processId = -1, ServiceContext context = null) { int count; + try { - count = Retry.Do(() => GetEphemeralPortCount(processId), TimeSpan.FromSeconds(5), CancellationToken.None); + count = Retry.Do(() => GetEphemeralPortCount(processId), TimeSpan.FromSeconds(3), CancellationToken.None); } catch (AggregateException ae) { @@ -159,14 +158,16 @@ public override int GetActiveEphemeralPortCount(int processId = -1, ServiceConte /// Compute count of active TCP ports. /// /// Optional: If supplied, then return the number of tcp ports in use by the process. - /// Optional (this is used by Linux callers only - see LinuxInfoProvider.cs): If supplied, will use the ServiceContext to find the Linux Capabilities binary to run this command. + /// Optional (this is used by Linux callers only - see LinuxInfoProvider.cs): + /// If supplied, will use the ServiceContext to find the Linux Capabilities binary to run this command. /// number of active TCP ports as int value public override int GetActiveTcpPortCount(int processId = -1, ServiceContext context = null) { int count; + try { - count = Retry.Do(() => GetTcpPortCount(processId), TimeSpan.FromSeconds(5), CancellationToken.None); + count = Retry.Do(() => GetTcpPortCount(processId), TimeSpan.FromSeconds(3), CancellationToken.None); } catch (AggregateException ae) { diff --git a/FabricObserver/Observers/NetworkObserver.cs b/FabricObserver/Observers/NetworkObserver.cs index 0f20dacf..76764a9f 100644 --- a/FabricObserver/Observers/NetworkObserver.cs +++ b/FabricObserver/Observers/NetworkObserver.cs @@ -169,10 +169,12 @@ public override Task ReportAsync(CancellationToken token) var report = new HealthReport { AppName = new Uri(conn.TargetApp), + Code = FOErrorWarningCodes.AppWarningNetworkEndpointUnreachable, EmitLogEvent = EnableVerboseLogging || IsObserverWebApiAppDeployed, HealthData = telemetryData, HealthMessage = healthMessage, HealthReportTimeToLive = timeToLiveWarning, + SourceId = 
$"{ObserverConstants.NetworkObserverName}({FOErrorWarningCodes.AppWarningNetworkEndpointUnreachable})", State = healthState, NodeName = NodeName, Observer = ObserverName, @@ -223,6 +225,7 @@ public override Task ReportAsync(CancellationToken token) EmitLogEvent = EnableVerboseLogging || IsObserverWebApiAppDeployed, HealthMessage = healthMessage, HealthReportTimeToLive = default, + SourceId = $"{ObserverConstants.NetworkObserverName}({FOErrorWarningCodes.AppWarningNetworkEndpointUnreachable})", State = HealthState.Ok, NodeName = NodeName, Observer = ObserverName, From b85276ab08d8688377250152beb3b97e59f8f247 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Tue, 13 Apr 2021 16:54:30 -0700 Subject: [PATCH 10/20] Removed unused usings. --- ClusterObserver/ClusterObserverManager.cs | 2 -- ClusterObserver/Utilities/ConfigSettings.cs | 1 - ClusterObserver/Utilities/Telemetry/TelemetryProdiverType.cs | 2 -- FabricObserver.Extensibility/Utilities/MemInfoConstants.cs | 5 ++++- .../Utilities/ProcessInfo/IProcessInfoProvider.cs | 2 -- .../Utilities/ProcessInfo/LinuxProcFS.cs | 1 - .../Utilities/ProcessInfo/ProcessInfoProvider.cs | 2 -- .../Utilities/Telemetry/TelemetryProdiverType.cs | 2 -- FabricObserver/Observers/ObserverManager.cs | 1 - TelemetryLib/ITelemetryEventSource.cs | 2 -- 10 files changed, 4 insertions(+), 16 deletions(-) diff --git a/ClusterObserver/ClusterObserverManager.cs b/ClusterObserver/ClusterObserverManager.cs index b0c88519..32fc9151 100644 --- a/ClusterObserver/ClusterObserverManager.cs +++ b/ClusterObserver/ClusterObserverManager.cs @@ -5,11 +5,9 @@ using System; using System.Collections.Generic; -using System.Diagnostics; using System.Fabric; using System.Fabric.Health; using System.IO; -using System.Text; using System.Threading; using System.Threading.Tasks; using ClusterObserver.Interfaces; diff --git a/ClusterObserver/Utilities/ConfigSettings.cs b/ClusterObserver/Utilities/ConfigSettings.cs index e9c61c99..e729eade 100644 --- 
a/ClusterObserver/Utilities/ConfigSettings.cs +++ b/ClusterObserver/Utilities/ConfigSettings.cs @@ -8,7 +8,6 @@ using System.Fabric; using System.Fabric.Description; using System.Linq; -using System.Threading; namespace ClusterObserver.Utilities { diff --git a/ClusterObserver/Utilities/Telemetry/TelemetryProdiverType.cs b/ClusterObserver/Utilities/Telemetry/TelemetryProdiverType.cs index 54dd37c0..93e15fcb 100644 --- a/ClusterObserver/Utilities/Telemetry/TelemetryProdiverType.cs +++ b/ClusterObserver/Utilities/Telemetry/TelemetryProdiverType.cs @@ -3,8 +3,6 @@ // Licensed under the MIT License (MIT). See License.txt in the repo root for license information. // ------------------------------------------------------------ -using System; - namespace ClusterObserver.Utilities.Telemetry { public enum TelemetryProviderType diff --git a/FabricObserver.Extensibility/Utilities/MemInfoConstants.cs b/FabricObserver.Extensibility/Utilities/MemInfoConstants.cs index 9f2fd5ee..c4401d97 100644 --- a/FabricObserver.Extensibility/Utilities/MemInfoConstants.cs +++ b/FabricObserver.Extensibility/Utilities/MemInfoConstants.cs @@ -1,4 +1,7 @@ -using NLog.LayoutRenderers; +// ------------------------------------------------------------ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License (MIT). See License.txt in the repo root for license information. 
+// ------------------------------------------------------------ namespace FabricObserver.Observers.Utilities { diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/IProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/IProcessInfoProvider.cs index f9070a6e..61e4ea43 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/IProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/IProcessInfoProvider.cs @@ -4,8 +4,6 @@ // ------------------------------------------------------------ using System.Fabric; -using System.Threading; -using System.Threading.Tasks; namespace FabricObserver.Observers.Utilities { diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcFS.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcFS.cs index 3e3cc8c1..57734004 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcFS.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcFS.cs @@ -3,7 +3,6 @@ // Licensed under the MIT License (MIT). See License.txt in the repo root for license information. 
// ------------------------------------------------------------ -using System; using System.Collections.Generic; using System.IO; using System.Text; diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/ProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/ProcessInfoProvider.cs index 2f000b6b..a430ec2e 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/ProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/ProcessInfoProvider.cs @@ -5,8 +5,6 @@ using System.Fabric; using System.Runtime.InteropServices; -using System.Threading; -using System.Threading.Tasks; namespace FabricObserver.Observers.Utilities { diff --git a/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryProdiverType.cs b/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryProdiverType.cs index 5dfbf99d..60cea80b 100644 --- a/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryProdiverType.cs +++ b/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryProdiverType.cs @@ -3,8 +3,6 @@ // Licensed under the MIT License (MIT). See License.txt in the repo root for license information. // ------------------------------------------------------------ -using System; - namespace FabricObserver.Observers.Utilities.Telemetry { public enum TelemetryProviderType diff --git a/FabricObserver/Observers/ObserverManager.cs b/FabricObserver/Observers/ObserverManager.cs index 0191930a..57daf06b 100644 --- a/FabricObserver/Observers/ObserverManager.cs +++ b/FabricObserver/Observers/ObserverManager.cs @@ -5,7 +5,6 @@ using System; using System.Collections.Generic; -using System.Diagnostics; using System.Fabric; using System.Fabric.Health; using System.IO; diff --git a/TelemetryLib/ITelemetryEventSource.cs b/TelemetryLib/ITelemetryEventSource.cs index ff137a71..23a5045d 100644 --- a/TelemetryLib/ITelemetryEventSource.cs +++ b/TelemetryLib/ITelemetryEventSource.cs @@ -3,8 +3,6 @@ // Licensed under the MIT License (MIT). 
See License.txt in the repo root for license information. // ------------------------------------------------------------ -using System; - namespace Microsoft.ServiceFabric.TelemetryLib { /// From 538c3b74b85accb822c869ae7fc88963617a5dbe Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Tue, 13 Apr 2021 17:21:01 -0700 Subject: [PATCH 11/20] FO 3.1.9, CO 2.1.7 nuspecs --- ClusterObserver.nuspec.template | 6 +++++- FabricObserver.nuspec.template | 8 +++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/ClusterObserver.nuspec.template b/ClusterObserver.nuspec.template index 17217cdf..5c0d98c7 100644 --- a/ClusterObserver.nuspec.template +++ b/ClusterObserver.nuspec.template @@ -3,7 +3,11 @@ %PACKAGE_ID% 2.1.7 - Bug fix in Application monitoring logic. + + - Fixed bug in Application health query processor. + - ApplicationInsights TelemetryProvider impl update. + - Improvements in service close code. + Microsoft MIT false diff --git a/FabricObserver.nuspec.template b/FabricObserver.nuspec.template index 5c363f2e..1739ce8f 100644 --- a/FabricObserver.nuspec.template +++ b/FabricObserver.nuspec.template @@ -3,7 +3,13 @@ %PACKAGE_ID% 3.1.9 - Minor bug fixes: ETW (internal EventSource error), Telemetry provider impl (ApplicationInsights). + + - Fixed EventSource logger bug. + - Added retry logic and logging to Windows port monitoring code. + - Added result set paging support in AppObserver's DeployedApplication query logic. + - Updated ApplicationInsights telemetry provider impl. + - Code maintenance. + Microsoft MIT false From 2d2e68665a6613234cf1dde3f07e316a1cab7082 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Wed, 14 Apr 2021 17:22:14 -0700 Subject: [PATCH 12/20] Code maintenance, fabric api retries. 
--- .../WindowsInfoProvider.cs | 8 +- FabricObserver/Observers/AppObserver.cs | 112 ++++++---- .../Observers/CertificateObserver.cs | 3 +- FabricObserver/Observers/DiskObserver.cs | 8 +- .../Observers/FabricSystemObserver.cs | 31 +-- FabricObserver/Observers/NetworkObserver.cs | 9 +- FabricObserver/Observers/NodeObserver.cs | 15 +- FabricObserver/Observers/OSObserver.cs | 207 ++++++++++-------- FabricObserver/Observers/ObserverManager.cs | 16 +- .../Observers/SFConfigurationObserver.cs | 68 +++--- FabricObserverTests/ObserverTest.cs | 4 +- README.md | 2 +- 12 files changed, 247 insertions(+), 236 deletions(-) diff --git a/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/WindowsInfoProvider.cs b/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/WindowsInfoProvider.cs index 6d1aa25c..31caba21 100644 --- a/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/WindowsInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/WindowsInfoProvider.cs @@ -147,7 +147,7 @@ public override int GetActiveEphemeralPortCount(int processId = -1, ServiceConte } catch (AggregateException ae) { - Logger.LogWarning($"Retry failed for GetActiveEphemeralPortCount:{Environment.NewLine}{ae.InnerException}"); + Logger.LogWarning($"Retry failed for GetActiveEphemeralPortCount:{Environment.NewLine}{ae}"); count = -1; } @@ -171,7 +171,7 @@ public override int GetActiveTcpPortCount(int processId = -1, ServiceContext con } catch (AggregateException ae) { - Logger.LogWarning($"Retry failed for GetActivePortCount:{Environment.NewLine}{ae.InnerException}"); + Logger.LogWarning($"Retry failed for GetActivePortCount:{Environment.NewLine}{ae}"); count = -1; } @@ -322,6 +322,8 @@ private int GetEphemeralPortCount(int processId = -1) // would artificially increase the count of ports that FO computes. if (processId > 0) { + /* A pid could be a subset of a port number, so make sure that we only match pid. 
*/ + List stats = portRow.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries).ToList(); if (stats.Count != 5 || !int.TryParse(stats[4], out int pidPart)) @@ -424,6 +426,8 @@ private int GetTcpPortCount(int processId = -1) // Only add unique pid (if supplied in call) and local port data to list. if (processId > 0) { + /* A pid could be a subset of a port number, so make sure that we only match pid. */ + List stats = portRow.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries).ToList(); if (stats.Count != 5 || !int.TryParse(stats[4], out int pidPart)) diff --git a/FabricObserver/Observers/AppObserver.cs b/FabricObserver/Observers/AppObserver.cs index 832795f0..b235a9f2 100644 --- a/FabricObserver/Observers/AppObserver.cs +++ b/FabricObserver/Observers/AppObserver.cs @@ -308,10 +308,10 @@ private async Task InitializeAsync() } configSettings.Initialize( - FabricServiceContext.CodePackageActivationContext.GetConfigurationPackageObject( - ObserverConstants.ObserverConfigurationPackageName)?.Settings, - ConfigurationSectionName, - "AppObserverDataFileName"); + FabricServiceContext.CodePackageActivationContext.GetConfigurationPackageObject( + ObserverConstants.ObserverConfigurationPackageName)?.Settings, + ConfigurationSectionName, + "AppObserverDataFileName"); // Unit tests may have null path and filename, thus the null equivalence operations. var appObserverConfigFileName = Path.Combine(ConfigPackagePath ?? string.Empty, configSettings.AppObserverConfigFileName ?? 
string.Empty); @@ -373,10 +373,12 @@ private async Task InitializeAsync() MaxResults = 150, }; - var appList = await FabricClientInstance.QueryManager.GetDeployedApplicationPagedListAsync( - deployedAppQueryDesc, - ConfigurationSettings.AsyncTimeout, - Token).ConfigureAwait(false); + var appList = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( + () => FabricClientInstance.QueryManager.GetDeployedApplicationPagedListAsync( + deployedAppQueryDesc, + ConfigurationSettings.AsyncTimeout, + Token), + Token); // DeployedApplicationList is a wrapper around List, but does not support AddRange.. Thus, cast it ToList and add to the temp list, then iterate through it. // In reality, this list will never be greater than, say, 1000 apps deployed to a node, but it's a good idea to be prepared since AppObserver supports @@ -391,10 +393,13 @@ private async Task InitializeAsync() deployedAppQueryDesc.ContinuationToken = appList.ContinuationToken; - appList = await FabricClientInstance.QueryManager.GetDeployedApplicationPagedListAsync( - deployedAppQueryDesc, - ConfigurationSettings.AsyncTimeout, - Token).ConfigureAwait(false); + appList = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( + () => FabricClientInstance.QueryManager.GetDeployedApplicationPagedListAsync( + deployedAppQueryDesc, + ConfigurationSettings.AsyncTimeout, + Token), + Token); + apps.AddRange(appList.ToList()); } @@ -749,14 +754,8 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) #endif continue; } - catch (Exception e) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - // ObserverManager handles these. 
- if (e is OperationCanceledException || e is TaskCanceledException) - { - throw; - } - WriteToLogWithLevel( ObserverName, $"Unhandled exception in MonitorDeployedAppsAsync:{Environment.NewLine}{e}", @@ -775,31 +774,53 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) private async Task SetDeployedApplicationReplicaOrInstanceListAsync(Uri applicationNameFilter = null, string applicationType = null) { - DeployedApplicationList deployedApps; + List deployedApps = new List(); if (applicationNameFilter != null) { - deployedApps = await FabricClientInstance.QueryManager.GetDeployedApplicationListAsync(NodeName, applicationNameFilter).ConfigureAwait(true); + var app = await FabricClientInstance.QueryManager.GetDeployedApplicationListAsync(NodeName, applicationNameFilter).ConfigureAwait(false); + deployedApps = app.ToList(); } - else + else if (!string.IsNullOrWhiteSpace(applicationType)) { - deployedApps = await FabricClientInstance.QueryManager.GetDeployedApplicationListAsync(NodeName).ConfigureAwait(true); + // Let's make sure that we page through app lists that are huge (like 4MB result set (that's a lot of apps)). + var deployedAppQueryDesc = new PagedDeployedApplicationQueryDescription(NodeName) + { + IncludeHealthState = false, + MaxResults = 150, + }; + + var appList = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( + () => FabricClientInstance.QueryManager.GetDeployedApplicationPagedListAsync( + deployedAppQueryDesc, + ConfigurationSettings.AsyncTimeout, + Token), + Token); + + // DeployedApplicationList is a wrapper around List, but does not support AddRange.. Thus, cast it ToList and add to the temp list, then iterate through it. + // In reality, this list will never be greater than, say, 1000 apps deployed to a node, but it's a good idea to be prepared since AppObserver supports + // all-app service process monitoring with a very simple configuration pattern. 
+ deployedApps = appList.ToList(); - if (deployedApps.Count > 0 && !string.IsNullOrWhiteSpace(applicationType)) + // The GetDeployedApplicationPagedList api will set a continuation token value if it knows it did not return all the results in one swoop. + // Check that it is not null, and make a new query passing back the token it gave you. + while (appList.ContinuationToken != null) { - for (int i = 0; i < deployedApps.Count; i++) - { - Token.ThrowIfCancellationRequested(); + Token.ThrowIfCancellationRequested(); - if (deployedApps[i].ApplicationTypeName == applicationType) - { - continue; - } + deployedAppQueryDesc.ContinuationToken = appList.ContinuationToken; - deployedApps.Remove(deployedApps[i]); - --i; - } + appList = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( + () => FabricClientInstance.QueryManager.GetDeployedApplicationPagedListAsync( + deployedAppQueryDesc, + ConfigurationSettings.AsyncTimeout, + Token), + Token); + + deployedApps.AddRange(appList.ToList()); } + + deployedApps = deployedApps.Where(a => a.ApplicationTypeName == applicationType)?.ToList(); } var currentReplicaInfoList = new List(); @@ -856,16 +877,19 @@ private async Task> GetDeployedPrimaryRepl ServiceFilterType filterType = ServiceFilterType.None, string appTypeName = null) { - var deployedReplicaList = await FabricClientInstance.QueryManager.GetDeployedReplicaListAsync(NodeName, appName).ConfigureAwait(true); + var deployedReplicaList = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( + () => FabricClientInstance.QueryManager.GetDeployedReplicaListAsync(NodeName, appName), + Token); + var replicaMonitoringList = new List(); SetInstanceOrReplicaMonitoringList( - appName, - serviceFilterList, - filterType, - appTypeName, - deployedReplicaList, - ref replicaMonitoringList); + appName, + serviceFilterList, + filterType, + appTypeName, + deployedReplicaList, + ref replicaMonitoringList); return replicaMonitoringList; } @@ -884,8 +908,7 @@ private void 
SetInstanceOrReplicaMonitoringList( ReplicaOrInstanceMonitoringInfo replicaInfo = null; - if (deployedReplica is DeployedStatefulServiceReplica statefulReplica - && statefulReplica.ReplicaRole == ReplicaRole.Primary) + if (deployedReplica is DeployedStatefulServiceReplica statefulReplica && statefulReplica.ReplicaRole == ReplicaRole.Primary) { replicaInfo = new ReplicaOrInstanceMonitoringInfo() { @@ -921,8 +944,7 @@ private void SetInstanceOrReplicaMonitoringList( ServiceName = statelessInstance.ServiceName, }; - if (filterList != null - && filterType != ServiceFilterType.None) + if (filterList != null && filterType != ServiceFilterType.None) { bool isInFilterList = filterList.Any(s => statelessInstance.ServiceName.OriginalString.ToLower().Contains(s.ToLower())); diff --git a/FabricObserver/Observers/CertificateObserver.cs b/FabricObserver/Observers/CertificateObserver.cs index 701d8b78..1354afe9 100644 --- a/FabricObserver/Observers/CertificateObserver.cs +++ b/FabricObserver/Observers/CertificateObserver.cs @@ -79,8 +79,7 @@ public override async Task ObserveAsync(CancellationToken token) { // Only run once per specified time in Settings.xml. (default is already set to 1 day for CertificateObserver) // See Settings.xml, CertificateObserverConfiguration section, RunInterval parameter. - if (RunInterval > TimeSpan.MinValue - && DateTime.Now.Subtract(LastRunDateTime) < RunInterval) + if (RunInterval > TimeSpan.MinValue && DateTime.Now.Subtract(LastRunDateTime) < RunInterval) { return; } diff --git a/FabricObserver/Observers/DiskObserver.cs b/FabricObserver/Observers/DiskObserver.cs index 716e7b9f..32277cfe 100644 --- a/FabricObserver/Observers/DiskObserver.cs +++ b/FabricObserver/Observers/DiskObserver.cs @@ -181,14 +181,8 @@ public override async Task ObserveAsync(CancellationToken token) } } } - catch (Exception e) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - // ObserverManager handles these. 
- if (e is OperationCanceledException || e is TaskCanceledException) - { - throw; - } - WriteToLogWithLevel( ObserverName, $"Unhandled exception in ObserveAsync:{Environment.NewLine}{e}", diff --git a/FabricObserver/Observers/FabricSystemObserver.cs b/FabricObserver/Observers/FabricSystemObserver.cs index 3629ee58..9c82a120 100644 --- a/FabricObserver/Observers/FabricSystemObserver.cs +++ b/FabricObserver/Observers/FabricSystemObserver.cs @@ -193,14 +193,8 @@ public override async Task ObserveAsync(CancellationToken token) } } } - catch (Exception e) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - // ObserverManager handles these. - if (e is OperationCanceledException || e is TaskCanceledException) - { - throw; - } - WriteToLogWithLevel( ObserverName, $"Unhandled exception in ObserveAsync:{Environment.NewLine}{e}", @@ -386,14 +380,8 @@ public override Task ReportAsync(CancellationToken token) } } } - catch (Exception e) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - // ObserverManager handles these. - if (e is OperationCanceledException || e is TaskCanceledException) - { - throw; - } - WriteToLogWithLevel( ObserverName, $"Unhandled exception in ReportAsync:{Environment.NewLine}{e}", @@ -877,14 +865,8 @@ private async Task GetProcessInfoAsync(string procName) await Task.Delay(250, Token).ConfigureAwait(false); } - catch (Exception e) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - // ObserverManager handles these. 
- if (e is OperationCanceledException || e is TaskCanceledException) - { - throw; - } - WriteToLogWithLevel( ObserverName, $"Unhandled Exception thrown in GetProcessInfoAsync:{Environment.NewLine}{e}", @@ -908,13 +890,8 @@ private async Task GetProcessInfoAsync(string procName) #endif continue; } - catch (Exception e) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - if (e is OperationCanceledException || e is TaskCanceledException) - { - throw; - } - WriteToLogWithLevel( ObserverName, $"Unhandled exception in GetProcessInfoAsync:{Environment.NewLine}{e}", diff --git a/FabricObserver/Observers/NetworkObserver.cs b/FabricObserver/Observers/NetworkObserver.cs index 76764a9f..2210e571 100644 --- a/FabricObserver/Observers/NetworkObserver.cs +++ b/FabricObserver/Observers/NetworkObserver.cs @@ -417,6 +417,8 @@ private void InternetConnectionStateIsConnected() foreach (var endpoint in config.Endpoints) { + cancellationToken.ThrowIfCancellationRequested(); + if (string.IsNullOrEmpty(endpoint.HostName)) { continue; @@ -430,16 +432,13 @@ private void InternetConnectionStateIsConnected() } bool passed = false; - cancellationToken.ThrowIfCancellationRequested(); // SQL Azure, other database services that are addressable over direct TCP. if (endpoint.Protocol == DirectInternetProtocol.Tcp) { passed = TcpEndpointDoConnectionTest(endpoint.HostName, endpoint.Port); } - - // Default is http. - else + else // Default is http. { // Service REST endpoints, CosmosDB REST endpoint, etc. // Http protocol means any enpoint/port pair that is addressable over HTTP/s. 
@@ -505,7 +504,7 @@ private void InternetConnectionStateIsConnected() passed = true; } } - catch (Exception e) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { HealthReporter.ReportFabricObserverServiceHealth( FabricServiceContext.ServiceName.OriginalString, diff --git a/FabricObserver/Observers/NodeObserver.cs b/FabricObserver/Observers/NodeObserver.cs index 13565f45..7a5efb54 100644 --- a/FabricObserver/Observers/NodeObserver.cs +++ b/FabricObserver/Observers/NodeObserver.cs @@ -394,13 +394,8 @@ public override Task ReportAsync(CancellationToken token) return Task.CompletedTask; } - catch (Exception e) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - if (e is OperationCanceledException || e is TaskCanceledException) - { - throw; - } - HealthReporter.ReportFabricObserverServiceHealth( FabricServiceContext.ServiceName.OriginalString, ObserverName, @@ -731,14 +726,8 @@ error on these conditions. */ timer.Stop(); timer.Reset(); } - catch (Exception e) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - // ObserverManager handles these. - if (e is OperationCanceledException || e is TaskCanceledException) - { - throw; - } - HealthReporter.ReportFabricObserverServiceHealth( FabricServiceContext.ServiceName.OriginalString, ObserverName, diff --git a/FabricObserver/Observers/OSObserver.cs b/FabricObserver/Observers/OSObserver.cs index d3f52817..4b4157f4 100644 --- a/FabricObserver/Observers/OSObserver.cs +++ b/FabricObserver/Observers/OSObserver.cs @@ -84,10 +84,10 @@ public override async Task ObserveAsync(CancellationToken token) public override Task ReportAsync(CancellationToken token) { + token.ThrowIfCancellationRequested(); + try { - token.ThrowIfCancellationRequested(); - // OS Health. 
if (osStatus != null && !string.Equals(osStatus, "OK", StringComparison.OrdinalIgnoreCase)) { @@ -110,12 +110,26 @@ public override Task ReportAsync(CancellationToken token) if (IsTelemetryEnabled) { _ = TelemetryClient?.ReportHealthAsync( - HealthScope.Application, - FabricRuntime.GetActivationContext().ApplicationName, - HealthState.Error, - $"{NodeName} - OS reporting unhealthy: {osStatus}", - ObserverName, - Token); + HealthScope.Node, + ObserverConstants.FabricObserverName, + HealthState.Error, + healthMessage, + ObserverName, + Token); + } + + if (IsEtwEnabled) + { + ObserverLogger.LogEtw( + ObserverConstants.FabricObserverETWEventName, + new + { + HealthScope = HealthScope.Node, + Source = ObserverConstants.FabricObserverName, + HealthState = HealthState.Error, + Description = healthMessage, + ObserverName + }); } } else if (HasActiveFabricErrorOrWarning && string.Equals(osStatus, "OK", StringComparison.OrdinalIgnoreCase)) @@ -129,11 +143,36 @@ public override Task ReportAsync(CancellationToken token) NodeName = NodeName, HealthMessage = healthMessage, State = HealthState.Ok, - HealthReportTimeToLive = default, + HealthReportTimeToLive = GetHealthReportTimeToLive(), }; HealthReporter.ReportHealthToServiceFabric(healthReport); + if (IsTelemetryEnabled) + { + _ = TelemetryClient?.ReportHealthAsync( + HealthScope.Node, + ObserverConstants.FabricObserverName, + HealthState.Error, + healthMessage, + ObserverName, + Token); + } + + if (IsEtwEnabled) + { + ObserverLogger.LogEtw( + ObserverConstants.FabricObserverETWEventName, + new + { + HealthScope = HealthScope.Node, + Source = ObserverConstants.FabricObserverName, + HealthState = HealthState.Error, + Description = healthMessage, + ObserverName + }); + } + // Reset internal health state. 
HasActiveFabricErrorOrWarning = false; } @@ -209,27 +248,21 @@ public override Task ReportAsync(CancellationToken token) if (IsEtwProviderEnabled && RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { ObserverLogger.LogEtw( - ObserverConstants.FabricObserverETWEventName, - new - { - HealthState = "Warning", - HealthEventDescription = auServiceEnabledMessage, - ObserverName, - Metric = "WUAutoDownloadEnabled", - Value = isAUAutomaticDownloadEnabled, - NodeName, - }); + ObserverConstants.FabricObserverETWEventName, + new + { + HealthState = "Warning", + HealthEventDescription = auServiceEnabledMessage, + ObserverName, + Metric = "WUAutoDownloadEnabled", + Value = isAUAutomaticDownloadEnabled, + NodeName, + }); } } } - catch (Exception e) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - // ObserverManager handles these. - if (e is OperationCanceledException || e is TaskCanceledException) - { - throw; - } - HealthReporter.ReportFabricObserverServiceHealth( FabricServiceContext.ServiceName.OriginalString, ObserverName, @@ -265,16 +298,16 @@ private async Task> GetInfrastructureServiceInstancesAsync( await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( () => FabricClientInstance.QueryManager.GetServiceListAsync( - new Uri("fabric:/System"), - null, - ConfigurationSettings.AsyncTimeout, - Token), + new Uri("fabric:/System"), + null, + ConfigurationSettings.AsyncTimeout, + Token), Token).ConfigureAwait(false); var infraInstances = allSystemServices.Where( - i => i.ServiceTypeName.Equals( - "InfrastructureServiceType", - StringComparison.InvariantCultureIgnoreCase)); + i => i.ServiceTypeName.Equals( + "InfrastructureServiceType", + StringComparison.InvariantCultureIgnoreCase)); return infraInstances; } @@ -531,32 +564,32 @@ private async Task GetComputerInfoAsync(CancellationToken token) } ObserverLogger.LogEtw( - ObserverConstants.FabricObserverETWEventName, - new - { - HealthState = "Ok", - Node = NodeName, - 
Observer = ObserverName, - OS = osInfo.Name, - OSVersion = osInfo.Version, - OSInstallDate = osInfo.InstallDate, - AutoUpdateEnabled = auStateUnknown ? "Unknown" : isAUAutomaticDownloadEnabled.ToString(), - osInfo.LastBootUpTime, - WindowsAutoUpdateEnabled = isAUAutomaticDownloadEnabled, - TotalMemorySizeGB = (int)(osInfo.TotalVisibleMemorySizeKB / 1048576), - AvailablePhysicalMemoryGB = Math.Round(osInfo.FreePhysicalMemoryKB / 1048576.0, 2), - AvailableVirtualMemoryGB = Math.Round(osInfo.FreeVirtualMemoryKB / 1048576.0, 2), - LogicalProcessorCount = logicalProcessorCount, - LogicalDriveCount = logicalDriveCount, - DriveInfo = driveInfo, - NumberOfRunningProcesses = osInfo.NumberOfProcesses, - ActiveFirewallRules = firewalls, - ActivePorts = activePorts, - ActiveEphemeralPorts = activeEphemeralPorts, - WindowsDynamicPortRange = osEphemeralPortRange, - FabricAppPortRange = fabricAppPortRange, - HotFixes = hotFixes, - }); + ObserverConstants.FabricObserverETWEventName, + new + { + HealthState = "Ok", + Node = NodeName, + Observer = ObserverName, + OS = osInfo.Name, + OSVersion = osInfo.Version, + OSInstallDate = osInfo.InstallDate, + AutoUpdateEnabled = auStateUnknown ? 
"Unknown" : isAUAutomaticDownloadEnabled.ToString(), + osInfo.LastBootUpTime, + WindowsAutoUpdateEnabled = isAUAutomaticDownloadEnabled, + TotalMemorySizeGB = (int)(osInfo.TotalVisibleMemorySizeKB / 1048576), + AvailablePhysicalMemoryGB = Math.Round(osInfo.FreePhysicalMemoryKB / 1048576.0, 2), + AvailableVirtualMemoryGB = Math.Round(osInfo.FreeVirtualMemoryKB / 1048576.0, 2), + LogicalProcessorCount = logicalProcessorCount, + LogicalDriveCount = logicalDriveCount, + DriveInfo = driveInfo, + NumberOfRunningProcesses = osInfo.NumberOfProcesses, + ActiveFirewallRules = firewalls, + ActivePorts = activePorts, + ActiveEphemeralPorts = activeEphemeralPorts, + WindowsDynamicPortRange = osEphemeralPortRange, + FabricAppPortRange = fabricAppPortRange, + HotFixes = hotFixes, + }); } // Telemetry @@ -568,40 +601,34 @@ private async Task GetComputerInfoAsync(CancellationToken token) } TelemetryClient?.ReportMetricAsync( - new MachineTelemetryData - { - HealthState = "Ok", - Node = NodeName, - Observer = ObserverName, - OS = osInfo.Name, - OSVersion = osInfo.Version, - OSInstallDate = osInfo.InstallDate, - LastBootUpTime = osInfo.LastBootUpTime, - WindowsUpdateAutoDownloadEnabled = isAUAutomaticDownloadEnabled, - TotalMemorySizeGB = (int)osInfo.TotalVisibleMemorySizeKB / 1048576, - AvailablePhysicalMemoryGB = Math.Round(osInfo.FreePhysicalMemoryKB / 1048576.0, 2), - AvailableVirtualMemoryGB = Math.Round(osInfo.FreeVirtualMemoryKB / 1048576.0, 2), - LogicalProcessorCount = logicalProcessorCount, - LogicalDriveCount = logicalDriveCount, - DriveInfo = driveInfo, - NumberOfRunningProcesses = osInfo.NumberOfProcesses, - ActiveFirewallRules = firewalls, - ActivePorts = activePorts, - ActiveEphemeralPorts = activeEphemeralPorts, - WindowsDynamicPortRange = osEphemeralPortRange, - FabricAppPortRange = fabricAppPortRange, - HotFixes = hotFixes, - }, Token); + new MachineTelemetryData + { + HealthState = "Ok", + Node = NodeName, + Observer = ObserverName, + OS = osInfo.Name, + OSVersion 
= osInfo.Version, + OSInstallDate = osInfo.InstallDate, + LastBootUpTime = osInfo.LastBootUpTime, + WindowsUpdateAutoDownloadEnabled = isAUAutomaticDownloadEnabled, + TotalMemorySizeGB = (int)osInfo.TotalVisibleMemorySizeKB / 1048576, + AvailablePhysicalMemoryGB = Math.Round(osInfo.FreePhysicalMemoryKB / 1048576.0, 2), + AvailableVirtualMemoryGB = Math.Round(osInfo.FreeVirtualMemoryKB / 1048576.0, 2), + LogicalProcessorCount = logicalProcessorCount, + LogicalDriveCount = logicalDriveCount, + DriveInfo = driveInfo, + NumberOfRunningProcesses = osInfo.NumberOfProcesses, + ActiveFirewallRules = firewalls, + ActivePorts = activePorts, + ActiveEphemeralPorts = activeEphemeralPorts, + WindowsDynamicPortRange = osEphemeralPortRange, + FabricAppPortRange = fabricAppPortRange, + HotFixes = hotFixes, + }, Token); } } - catch (Exception e) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - // ObserverManager handles these. - if (e is OperationCanceledException || e is TaskCanceledException) - { - throw; - } - HealthReporter.ReportFabricObserverServiceHealth( FabricServiceContext.ServiceName.OriginalString, ObserverName, diff --git a/FabricObserver/Observers/ObserverManager.cs b/FabricObserver/Observers/ObserverManager.cs index 57daf06b..e7825cda 100644 --- a/FabricObserver/Observers/ObserverManager.cs +++ b/FabricObserver/Observers/ObserverManager.cs @@ -121,7 +121,7 @@ private int MaxArchivedLogFileLifetimeDays /// /// Initializes a new instance of the class. - /// This is used for unit testing. + /// This is only used by unit tests. /// /// Observer instance. 
public ObserverManager(ObserverBase observer, FabricClient fabricClient) @@ -207,10 +207,10 @@ public ObserverManager(IServiceProvider serviceProvider, FabricClient fabricClie } #endif telemetryEvents = new TelemetryEvents( - FabricClientInstance, - FabricServiceContext, - ServiceEventSource.Current, - this.token); + FabricClientInstance, + FabricServiceContext, + ServiceEventSource.Current, + this.token); if (telemetryEvents.FabricObserverRuntimeNodeEvent(codePkgVersion, GetFabricObserverInternalConfiguration(), "HealthState.Initialized")) { @@ -355,8 +355,9 @@ public async Task StopObserversAsync(bool shutdownSignaled = true, bool isConfig await Task.Delay(250).ConfigureAwait(false); } } - catch (Exception) + catch (FabricException) { + } await Task.Delay(250).ConfigureAwait(false); @@ -388,6 +389,7 @@ public async Task StopObserversAsync(bool shutdownSignaled = true, bool isConfig } catch (FabricException) { + } await Task.Delay(250).ConfigureAwait(false); @@ -451,6 +453,7 @@ private static bool IsObserverWebApiAppInstalled() } catch (Exception e) when (e is FabricException || e is TimeoutException) { + } return false; @@ -468,6 +471,7 @@ private static string GetConfigSettingValue(string parameterName) } catch (Exception e) when (e is KeyNotFoundException || e is FabricElementNotFoundException) { + } return null; diff --git a/FabricObserver/Observers/SFConfigurationObserver.cs b/FabricObserver/Observers/SFConfigurationObserver.cs index 1e38f839..2dd80ba4 100644 --- a/FabricObserver/Observers/SFConfigurationObserver.cs +++ b/FabricObserver/Observers/SFConfigurationObserver.cs @@ -100,17 +100,13 @@ public override async Task ObserveAsync(CancellationToken token) { } - catch (Exception e) when (e is OperationCanceledException || e is TaskCanceledException) - { - return; - } - catch (Exception e) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { HealthReporter.ReportFabricObserverServiceHealth( - 
FabricServiceContext.ServiceName.OriginalString, - ObserverName, - HealthState.Warning, - $"this.NodeName | Unhandled Exception trying to read registry value:\n{e}"); + FabricServiceContext.ServiceName.OriginalString, + ObserverName, + HealthState.Warning, + $"Unhandled Exception in ObserveAsync:{Environment.NewLine}{e}"); throw; } @@ -124,7 +120,7 @@ public override async Task ReportAsync(CancellationToken token) var sb = new StringBuilder(); - _ = sb.AppendLine("\nService Fabric information:\n"); + _ = sb.AppendLine($"{Environment.NewLine}Service Fabric information:{Environment.NewLine}"); if (!string.IsNullOrEmpty(SFVersion)) { @@ -182,10 +178,10 @@ public override async Task ReportAsync(CancellationToken token) if (!ObserverLogger.TryWriteLogFile(logPath, sb.ToString())) { HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.OriginalString, - ObserverName, - HealthState.Warning, - "Unable to create SFInfraInfo.txt file."); + FabricServiceContext.ServiceName.OriginalString, + ObserverName, + HealthState.Warning, + "Unable to create SFInfraInfo.txt file."); } _ = sb.Clear(); @@ -257,7 +253,7 @@ private async Task GetDeployedAppsInfoAsync(CancellationToken token) token.ThrowIfCancellationRequested(); // Node Information. - _ = sb.AppendLine("\nNode Info:\n"); + _ = sb.AppendLine($"{Environment.NewLine}Node Info:{Environment.NewLine}"); _ = sb.AppendLine($"Node Name: {NodeName}"); _ = sb.AppendLine($"Node Id: {FabricServiceContext.NodeContext.NodeId}"); _ = sb.AppendLine($"Node Instance Id: {FabricServiceContext.NodeContext.NodeInstanceId}"); @@ -288,7 +284,7 @@ private async Task GetDeployedAppsInfoAsync(CancellationToken token) // Application Info. 
if (appList != null) { - _ = sb.AppendLine("\nDeployed Apps:\n"); + _ = sb.AppendLine($"{Environment.NewLine}Deployed Apps:{Environment.NewLine}"); foreach (var app in appList) { @@ -307,7 +303,7 @@ private async Task GetDeployedAppsInfoAsync(CancellationToken token) _ = sb.AppendLine("Status: " + status); // Service(s). - _ = sb.AppendLine("\n\tServices:"); + _ = sb.AppendLine($"{Environment.NewLine}\tServices:"); var serviceList = await FabricClientInstance.QueryManager.GetServiceListAsync(app.ApplicationName).ConfigureAwait(true); var replicaList = await FabricClientInstance.QueryManager.GetDeployedReplicaListAsync(NodeName, app.ApplicationName).ConfigureAwait(true); @@ -359,25 +355,25 @@ private async Task GetDeployedAppsInfoAsync(CancellationToken token) if (IsEtwEnabled) { ObserverLogger.LogEtw( - ObserverConstants.FabricObserverETWEventName, - new - { - Level = 0, // Info - Node = NodeName, - Observer = ObserverName, - AppName = appName, - AppType = appType, - AppVersion = appVersion, - AppHealthState = healthState, - AppStatus = status, - ServiceName = serviceName.OriginalString, - ServiceTypeName = type, - Kind = kind, - ProcessModel = processModel, - ServiceManifestVersion = serviceManifestVersion, - ActivePorts = ports, - EphemeralPorts = ephemeralPorts, - }); + ObserverConstants.FabricObserverETWEventName, + new + { + Level = 0, // Info + Node = NodeName, + Observer = ObserverName, + AppName = appName, + AppType = appType, + AppVersion = appVersion, + AppHealthState = healthState, + AppStatus = status, + ServiceName = serviceName.OriginalString, + ServiceTypeName = type, + Kind = kind, + ProcessModel = processModel, + ServiceManifestVersion = serviceManifestVersion, + ActivePorts = ports, + EphemeralPorts = ephemeralPorts, + }); } break; diff --git a/FabricObserverTests/ObserverTest.cs b/FabricObserverTests/ObserverTest.cs index d8be24c3..64f8709f 100644 --- a/FabricObserverTests/ObserverTest.cs +++ b/FabricObserverTests/ObserverTest.cs @@ -392,8 +392,8 
@@ public async Task Successful_CertificateObserver_Run_Cancellation_Via_ObserverMa obsMgr.Dispose(); } - /* NOTE: These test can be flaky due to the Test infra. Try running them as a group, after running all the other tests as a group. - If any fail, then re-run the failed ones.. */ + /* NOTE: These tests are flaky due to the Test infra. Try running them as a group, after running all the other tests as a group. + If any fail, then re-run the failed ones. */ [TestMethod] public async Task Successful_AppObserver_Run_Cancellation_Via_ObserverManager() diff --git a/README.md b/README.md index 754d7c0d..6e6616e0 100644 --- a/README.md +++ b/README.md @@ -56,7 +56,7 @@ For more information about **the design of FabricObserver**, please see the [Des 2. Install [.NET Core 3.1](https://dotnet.microsoft.com/download/dotnet-core/3.1) 3. Build. -Note: There is no need to run FO as system on Windows or root on Linux. +***Note: By default, FO runs as NetworkUser on Windows and sfappsuser on Linux. If you want to monitor SF service processes that run as elevated (System) on Windows, then you must also run FO as System on Windows. There is no reason to run as root on Linux under any circumstances (see the Capabilities binaries implementations, which allow for FO to run as sfappsuser and successfully execute specific commands that require elevated privilege).*** For Linux deployments, we have ensured that FO will work as expected as normal user (non-root user). In order for us to do this, we had to implement a setup script that sets [Capabilities](https://man7.org/linux/man-pages/man7/capabilities.7.html) on a proxy binary which can run netstat -tnap elevated. If you deploy from VS, then you will need to use FabricObserver/PackageRoot/ServiceManifest.linux.xml (just copy its contents into ServiceManifest.xml or add the new piece which is simply a SetupEntryPoint section). 
You will also need to do the same with ApplicationManifest.xml (see FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.linux.xml for required changes). If you use our build scripts, they will take care of these modifications automatically for linux build output. From 1ace3e9c4e35e2745274c32fdd518b00ef46a1a8 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Wed, 14 Apr 2021 17:28:34 -0700 Subject: [PATCH 13/20] updated test (nodeObs) --- FabricObserverTests/ObserverTest.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FabricObserverTests/ObserverTest.cs b/FabricObserverTests/ObserverTest.cs index 64f8709f..8a885496 100644 --- a/FabricObserverTests/ObserverTest.cs +++ b/FabricObserverTests/ObserverTest.cs @@ -1170,7 +1170,7 @@ public async Task NodeObserver_ObserveAsync_Successful_Observer_IsHealthy_Warnin Assert.IsFalse(obs.HasActiveFabricErrorOrWarning); obs.Dispose(); - //await CleanupTestHealthReportsAsync(); + await CleanupTestHealthReportsAsync(); } /// From 64a979feffc15a58df6de21a0abb1ce9fea0254a Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Thu, 15 Apr 2021 14:04:27 -0700 Subject: [PATCH 14/20] Updated AppObs targetApp value handling. Updated CSVLogger file management. Updated tests. 
--- .../Utilities/DataTableFileLogger.cs | 34 +---- .../Utilities/ObserverHealthReporter.cs | 16 +- .../ProcessInfo/WindowsProcessInfoProvider.cs | 24 +-- FabricObserver/Observers/AppObserver.cs | 144 +++++++++++------- FabricObserverTests/ObserverTest.cs | 43 +----- .../Config/AppObserver.config.json | 17 ++- .../Config/NetworkObserver.config.json | 22 ++- 7 files changed, 143 insertions(+), 157 deletions(-) diff --git a/FabricObserver.Extensibility/Utilities/DataTableFileLogger.cs b/FabricObserver.Extensibility/Utilities/DataTableFileLogger.cs index 03873e29..e6053e66 100644 --- a/FabricObserver.Extensibility/Utilities/DataTableFileLogger.cs +++ b/FabricObserver.Extensibility/Utilities/DataTableFileLogger.cs @@ -4,7 +4,6 @@ // ------------------------------------------------------------ using System; -using System.Collections.Generic; using System.IO; using System.Runtime.InteropServices; using FabricObserver.Interfaces; @@ -22,8 +21,6 @@ private static ILogger DataLogger get; set; } - private readonly Dictionary FolderCleanedState; - public string DataLogFolder { get; set; @@ -50,7 +47,7 @@ public int MaxArchiveCsvFileLifetimeDays public DataTableFileLogger() { - FolderCleanedState = new Dictionary(); + } public void ConfigureLogger(string filename) @@ -102,22 +99,10 @@ public void ConfigureLogger(string filename) var csvPath = Path.Combine(logFullPath, filename + ".csv"); - // Clean out old files. + // Clean out old files if written as MultipleFilesNoArchives. if (MaxArchiveCsvFileLifetimeDays > 0 && FileWriteFormat == CsvFileWriteFormat.MultipleFilesNoArchives) { - // Add folder path to state dictionary. - if (!FolderCleanedState.ContainsKey(logFullPath)) - { - FolderCleanedState.Add(logFullPath, DateTime.UtcNow); - } - else - { - // Only clean a folder that hasn't been cleaned for MaxArchiveCsvFileLifetimeDays days. 
- if (DateTime.UtcNow.Subtract(FolderCleanedState[logFullPath]) >= TimeSpan.FromDays(MaxArchiveCsvFileLifetimeDays)) - { - CleanLogFolder(logFullPath, TimeSpan.FromDays(MaxArchiveCsvFileLifetimeDays)); - } - } + TryCleanLogFolder(logFullPath, TimeSpan.FromDays(MaxArchiveCsvFileLifetimeDays)); } if (DataLogger == null) @@ -180,11 +165,9 @@ public static void Flush() LogManager.Flush(); } - private void CleanLogFolder(string folderPath, TimeSpan maxAge) + private void TryCleanLogFolder(string folderPath, TimeSpan maxAge) { - int count = 0; - - if (Directory.Exists(folderPath)) + if (Directory.Exists(folderPath) && DateTime.UtcNow.Subtract(Directory.GetLastWriteTimeUtc(folderPath)) >= maxAge) { string[] files = Directory.GetFiles(folderPath, "*", SearchOption.AllDirectories); @@ -195,7 +178,6 @@ private void CleanLogFolder(string folderPath, TimeSpan maxAge) if (DateTime.UtcNow.Subtract(File.GetCreationTime(file)) >= maxAge) { File.Delete(file); - count++; } } catch (Exception e) when (e is ArgumentException || e is IOException || e is UnauthorizedAccessException || e is PathTooLongException) @@ -203,12 +185,6 @@ private void CleanLogFolder(string folderPath, TimeSpan maxAge) } } - - if (count > 0) - { - // The dictionary will always contain the folderPath key. See calling code. - FolderCleanedState[folderPath] = DateTime.UtcNow; - } } } } diff --git a/FabricObserver.Extensibility/Utilities/ObserverHealthReporter.cs b/FabricObserver.Extensibility/Utilities/ObserverHealthReporter.cs index f41e3ac0..7af5cd8f 100644 --- a/FabricObserver.Extensibility/Utilities/ObserverHealthReporter.cs +++ b/FabricObserver.Extensibility/Utilities/ObserverHealthReporter.cs @@ -38,19 +38,21 @@ public ObserverHealthReporter(Logger logger, FabricClient fabricClient) /// Name of the health property. /// Health state (Ok, Error, etc). /// Description of the health condition. 
- public void ReportFabricObserverServiceHealth( - string serviceName, - string propertyName, - HealthState healthState, - string description) + public void ReportFabricObserverServiceHealth(string serviceName, string propertyName, HealthState healthState, string description) { + string msg = $"{propertyName} reporting {healthState}: {description}"; + if (healthState == HealthState.Error) { - logger.LogError("FabricObserver service health error: " + serviceName + " | " + propertyName + " | {0}", description); + logger.LogError(msg); } else if (healthState == HealthState.Warning) { - logger.LogWarning("FabricObserver service health warning: " + serviceName + " | " + propertyName + " | {0}", description); + logger.LogWarning(msg); + } + else if (logger.EnableVerboseLogging) + { + logger.LogInfo(msg); } } diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs index f4721dc5..099799ce 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs @@ -28,10 +28,10 @@ public override float GetProcessPrivateWorkingSetInMB(int processId) processName = process.ProcessName; } } - catch (ArgumentException ex) + catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is NotSupportedException) { // "Process with an Id of 12314 is not running." - Logger.LogError(ex.Message); + Logger.LogWarning($"Handled Exception in GetProcessPrivateWorkingSetInMB: {e.Message}"); return 0F; } @@ -48,15 +48,11 @@ public override float GetProcessPrivateWorkingSetInMB(int processId) InstanceName = processName }; - // warm up counter. 
- _ = memProcessPrivateWorkingSetCounter.NextValue(); - return memProcessPrivateWorkingSetCounter.NextValue() / (1024 * 1024); } - catch (Exception e) when (e is ArgumentNullException || e is PlatformNotSupportedException || - e is Win32Exception || e is UnauthorizedAccessException) + catch (Exception e) when (e is ArgumentNullException || e is Win32Exception || e is UnauthorizedAccessException) { - Logger.LogError($"{CategoryName} {WorkingSetCounterName} PerfCounter handled error:{Environment.NewLine}{e}"); + Logger.LogWarning($"{CategoryName} {WorkingSetCounterName} PerfCounter handled error:{Environment.NewLine}{e}"); // Don't throw. return 0F; @@ -93,10 +89,10 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService processName = process.ProcessName; } } - catch (ArgumentException ex) + catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is NotSupportedException) { // "Process with an Id of 12314 is not running." - Logger.LogError(ex.Message); + Logger.LogWarning($"Handled Exception in GetProcessAllocatedHandles: {e.Message}"); return -1F; } @@ -113,15 +109,11 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService InstanceName = processName }; - // warm up counter. - _ = processFileHandleCounter.NextValue(); - return processFileHandleCounter.NextValue(); } - catch (Exception e) when (e is ArgumentNullException || e is PlatformNotSupportedException || - e is Win32Exception || e is UnauthorizedAccessException) + catch (Exception e) when (e is InvalidOperationException || e is Win32Exception || e is UnauthorizedAccessException) { - Logger.LogError($"{CategoryName} {FileHandlesCounterName} PerfCounter handled error:{Environment.NewLine}{e}"); + Logger.LogWarning($"{CategoryName} {FileHandlesCounterName} PerfCounter handled error:{Environment.NewLine}{e}"); // Don't throw. 
return -1F; diff --git a/FabricObserver/Observers/AppObserver.cs b/FabricObserver/Observers/AppObserver.cs index b235a9f2..588db4d9 100644 --- a/FabricObserver/Observers/AppObserver.cs +++ b/FabricObserver/Observers/AppObserver.cs @@ -361,7 +361,7 @@ private async Task InitializeAsync() return false; } - // Support for specifying single configuration item for any or all or * applications. + // Support for specifying single configuration item for all or * applications. if (userTargetList != null && userTargetList.Any(app => app.TargetApp?.ToLower() == "all" || app.TargetApp == "*")) { ApplicationInfo application = userTargetList.Find(app => app.TargetApp?.ToLower() == "all" || app.TargetApp == "*"); @@ -486,27 +486,57 @@ private async Task InitializeAsync() apps = null; } - int settingSFail = 0; + int settingsFail = 0; - foreach (var application in userTargetList) + for (int i = 0; i < userTargetList.Count; i++) { Token.ThrowIfCancellationRequested(); + Uri appUri = null; + ApplicationInfo application = userTargetList[i]; + if (string.IsNullOrWhiteSpace(application.TargetApp) && string.IsNullOrWhiteSpace(application.TargetAppType)) { HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.ToString(), - ObserverName, - HealthState.Warning, - $"Initialize() | {application.TargetApp}: Required setting, target, is not set."); - - settingSFail++; + FabricServiceContext.ServiceName.ToString(), + ObserverName, + HealthState.Warning, + $"InitializeAsync() | {application.TargetApp}: Required setting, target, is not set."); + settingsFail++; continue; } + else if (!string.IsNullOrWhiteSpace(application.TargetApp)) + { + try + { + if (!application.TargetApp.StartsWith("fabric:/")) + { + application.TargetApp = application.TargetApp.Insert(0, $"fabric:/"); + } + + if (application.TargetApp.Contains(" ")) + { + application.TargetApp = application.TargetApp.Replace(" ", string.Empty); + } + + appUri = new Uri(application.TargetApp); + } + catch 
(Exception e) when (e is ArgumentException || e is UriFormatException) + { + HealthReporter.ReportFabricObserverServiceHealth( + FabricServiceContext.ServiceName.ToString(), + ObserverName, + HealthState.Warning, + $"InitializeAsync() | {application.TargetApp}: Invalid TargetApp value. Value must be a valid Uri string of format \"fabric:/MyApp\", for example."); + + settingsFail++; + continue; + } + } // No required settings supplied for deployed application(s). - if (settingSFail == userTargetList.Count) + if (settingsFail == userTargetList.Count) { return false; } @@ -517,7 +547,7 @@ private async Task InitializeAsync() } else { - await SetDeployedApplicationReplicaOrInstanceListAsync(new Uri(application.TargetApp)).ConfigureAwait(false); + await SetDeployedApplicationReplicaOrInstanceListAsync(appUri).ConfigureAwait(false); } } @@ -528,7 +558,7 @@ private async Task InitializeAsync() try { // For hosted container apps, the host service is Fabric. AppObserver can't monitor these types of services. - // Please use ContainerObserver for SF container app service monitoring. + // Please use ContainerObserver for SF container app service monitoring. https://github.com/gittorre/ContainerObserver using Process p = Process.GetProcessById((int)rep.HostProcessId); if (p.ProcessName == "Fabric") @@ -668,19 +698,30 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) /* CPU and Memory Usage */ - TimeSpan duration = TimeSpan.FromSeconds(10); + TimeSpan duration = TimeSpan.FromSeconds(3); if (MonitorDuration > TimeSpan.MinValue) { duration = MonitorDuration; } - // Warm up the counters. + /* Warm up counters. 
*/ + if (checkCpu) { _ = cpuUsage.GetCpuUsagePercentageProcess(currentProcess); } + if (checkHandles) + { + _ = ProcessInfoProvider.Instance.GetProcessAllocatedHandles(currentProcess.Id, FabricServiceContext); + } + + if (checkMemMb || checkMemPct) + { + _ = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(currentProcess.Id); + } + timer.Start(); while (!currentProcess.HasExited && timer.Elapsed.Seconds <= duration.Seconds) @@ -779,7 +820,7 @@ private async Task SetDeployedApplicationReplicaOrInstanceListAsync(Uri applicat if (applicationNameFilter != null) { var app = await FabricClientInstance.QueryManager.GetDeployedApplicationListAsync(NodeName, applicationNameFilter).ConfigureAwait(false); - deployedApps = app.ToList(); + deployedApps.AddRange(app.ToList()); } else if (!string.IsNullOrWhiteSpace(applicationType)) { @@ -791,11 +832,11 @@ private async Task SetDeployedApplicationReplicaOrInstanceListAsync(Uri applicat }; var appList = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( - () => FabricClientInstance.QueryManager.GetDeployedApplicationPagedListAsync( - deployedAppQueryDesc, - ConfigurationSettings.AsyncTimeout, - Token), - Token); + () => FabricClientInstance.QueryManager.GetDeployedApplicationPagedListAsync( + deployedAppQueryDesc, + ConfigurationSettings.AsyncTimeout, + Token), + Token); // DeployedApplicationList is a wrapper around List, but does not support AddRange.. Thus, cast it ToList and add to the temp list, then iterate through it. 
// In reality, this list will never be greater than, say, 1000 apps deployed to a node, but it's a good idea to be prepared since AppObserver supports @@ -811,11 +852,11 @@ private async Task SetDeployedApplicationReplicaOrInstanceListAsync(Uri applicat deployedAppQueryDesc.ContinuationToken = appList.ContinuationToken; appList = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( - () => FabricClientInstance.QueryManager.GetDeployedApplicationPagedListAsync( - deployedAppQueryDesc, - ConfigurationSettings.AsyncTimeout, - Token), - Token); + () => FabricClientInstance.QueryManager.GetDeployedApplicationPagedListAsync( + deployedAppQueryDesc, + ConfigurationSettings.AsyncTimeout, + Token), + Token); deployedApps.AddRange(appList.ToList()); } @@ -832,10 +873,9 @@ private async Task SetDeployedApplicationReplicaOrInstanceListAsync(Uri applicat List filteredServiceList = null; // Filter service list if ServiceExcludeList/ServiceIncludeList config setting is non-empty. - var serviceFilter = userTargetList.Find(x => (x.TargetApp != null || x.TargetAppType != null) - && (x.TargetApp?.ToLower() == deployedApp.ApplicationName?.OriginalString.ToLower() - || x.TargetAppType?.ToLower() == deployedApp.ApplicationTypeName?.ToLower()) - && (!string.IsNullOrWhiteSpace(x.ServiceExcludeList) || !string.IsNullOrWhiteSpace(x.ServiceIncludeList))); + var serviceFilter = userTargetList.Find(x => (x.TargetApp?.ToLower() == deployedApp.ApplicationName?.OriginalString.ToLower() + || x.TargetAppType?.ToLower() == deployedApp.ApplicationTypeName?.ToLower()) + && (!string.IsNullOrWhiteSpace(x.ServiceExcludeList) || !string.IsNullOrWhiteSpace(x.ServiceIncludeList))); ServiceFilterType filterType = ServiceFilterType.None; @@ -884,12 +924,12 @@ private async Task> GetDeployedPrimaryRepl var replicaMonitoringList = new List(); SetInstanceOrReplicaMonitoringList( - appName, - serviceFilterList, - filterType, - appTypeName, - deployedReplicaList, - ref replicaMonitoringList); + 
appName, + serviceFilterList, + filterType, + appTypeName, + deployedReplicaList, + ref replicaMonitoringList); return replicaMonitoringList; } @@ -910,6 +950,18 @@ private void SetInstanceOrReplicaMonitoringList( if (deployedReplica is DeployedStatefulServiceReplica statefulReplica && statefulReplica.ReplicaRole == ReplicaRole.Primary) { + if (filterList != null && filterType != ServiceFilterType.None) + { + bool isInFilterList = filterList.Any(s => statefulReplica.ServiceName.OriginalString.ToLower().Contains(s.ToLower())); + + switch (filterType) + { + case ServiceFilterType.Include when !isInFilterList: + case ServiceFilterType.Exclude when isInFilterList: + continue; + } + } + replicaInfo = new ReplicaOrInstanceMonitoringInfo() { ApplicationName = appName, @@ -919,10 +971,12 @@ private void SetInstanceOrReplicaMonitoringList( PartitionId = statefulReplica.Partitionid, ServiceName = statefulReplica.ServiceName, }; - + } + else if (deployedReplica is DeployedStatelessServiceInstance statelessInstance) + { if (filterList != null && filterType != ServiceFilterType.None) { - bool isInFilterList = filterList.Any(s => statefulReplica.ServiceName.OriginalString.ToLower().Contains(s.ToLower())); + bool isInFilterList = filterList.Any(s => statelessInstance.ServiceName.OriginalString.ToLower().Contains(s.ToLower())); switch (filterType) { @@ -931,9 +985,7 @@ private void SetInstanceOrReplicaMonitoringList( continue; } } - } - else if (deployedReplica is DeployedStatelessServiceInstance statelessInstance) - { + replicaInfo = new ReplicaOrInstanceMonitoringInfo() { ApplicationName = appName, @@ -943,18 +995,6 @@ private void SetInstanceOrReplicaMonitoringList( PartitionId = statelessInstance.Partitionid, ServiceName = statelessInstance.ServiceName, }; - - if (filterList != null && filterType != ServiceFilterType.None) - { - bool isInFilterList = filterList.Any(s => statelessInstance.ServiceName.OriginalString.ToLower().Contains(s.ToLower())); - - switch (filterType) - { - 
case ServiceFilterType.Include when !isInFilterList: - case ServiceFilterType.Exclude when isInFilterList: - continue; - } - } } if (replicaInfo != null) diff --git a/FabricObserverTests/ObserverTest.cs b/FabricObserverTests/ObserverTest.cs index 8a885496..0f5eba2f 100644 --- a/FabricObserverTests/ObserverTest.cs +++ b/FabricObserverTests/ObserverTest.cs @@ -280,47 +280,7 @@ public async Task AppObserver_ObserveAsync_Successful_Observer_IsHealthy() var obs = new AppObserver(fabricClient, context) { - MonitorDuration = TimeSpan.FromSeconds(5), - ConfigPackagePath = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "AppObserver.config.json"), - ReplicaOrInstanceList = new List(), - }; - - await obs.ObserveAsync(token).ConfigureAwait(true); - - // observer ran to completion with no errors. - Assert.IsTrue(obs.LastRunDateTime > startDateTime); - - // observer detected no warning conditions. - Assert.IsFalse(obs.HasActiveFabricErrorOrWarning); - - // observer did not have any internal errors during run. - Assert.IsFalse(obs.IsUnhealthy); - - await CleanupTestHealthReportsAsync(obs); - - obs.Dispose(); - } - - /// - /// . - /// - /// A representing the result of the asynchronous operation. 
- [TestMethod] - public async Task AppObserver_ObserveAsync_TargetAppType_Successful_Observer_IsHealthy() - { - if (!isSFRuntimePresentOnTestMachine) - { - return; - } - - var startDateTime = DateTime.Now; - ObserverManager.FabricServiceContext = context; - ObserverManager.TelemetryEnabled = false; - ObserverManager.EtwEnabled = false; - - var obs = new AppObserver(fabricClient, context) - { - MonitorDuration = TimeSpan.FromSeconds(5), + MonitorDuration = TimeSpan.FromSeconds(1), ConfigPackagePath = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "AppObserver.config.json"), ReplicaOrInstanceList = new List(), }; @@ -341,7 +301,6 @@ public async Task AppObserver_ObserveAsync_TargetAppType_Successful_Observer_IsH obs.Dispose(); } - [TestMethod] public async Task ClusterObserver_ObserveAsync_Successful_Observer_IsHealthy() { diff --git a/FabricObserverTests/PackageRoot/Config/AppObserver.config.json b/FabricObserverTests/PackageRoot/Config/AppObserver.config.json index d1db115a..15ed9e75 100644 --- a/FabricObserverTests/PackageRoot/Config/AppObserver.config.json +++ b/FabricObserverTests/PackageRoot/Config/AppObserver.config.json @@ -1,10 +1,9 @@ [ { "targetApp": "*", - "cpuWarningLimitPercent": 60, - "networkWarningActivePorts": 1800, - "networkWarningEphemeralPorts": 1400, - "warningOpenFileHandles": 5000 + "cpuWarningLimitPercent": 90, + "networkWarningActivePorts": 8000, + "networkWarningEphemeralPorts": 5000 }, { "targetAppType": "MyAppType", @@ -15,5 +14,15 @@ "targetApp": "fabric:/MyHardWorkingApp42", "cpuWarningLimitPercent": 90, "memoryWarningLimitPercent": 60 + }, + { + "targetApp": "MalformedValueApp0", + "cpuWarningLimitPercent": 50, + "memoryWarningLimitPercent": 40 + }, + { + "targetApp": "Malformed Value App1", + "cpuWarningLimitPercent": 70, + "memoryWarningLimitPercent": 20 } ] \ No newline at end of file diff --git a/FabricObserverTests/PackageRoot/Config/NetworkObserver.config.json 
b/FabricObserverTests/PackageRoot/Config/NetworkObserver.config.json index 67fabea6..87412ed1 100644 --- a/FabricObserverTests/PackageRoot/Config/NetworkObserver.config.json +++ b/FabricObserverTests/PackageRoot/Config/NetworkObserver.config.json @@ -3,16 +3,24 @@ "targetApp": "fabric:/TestApp", "endpoints": [ { - "hostname": "www.facebook.com", - "port": 443 + "hostname": "https://myazuresrvice42.westus2.cloudapp.azure.com", + "port": 443, + "protocol": "http" }, { - "hostname": "www.google.com", - "port": 443 - }, + "hostname": "somesqlservername.database.windows.net", + "port": 1433, + "protocol": "tcp" + } + ] + }, + { + "targetApp": "fabric:/TestApp1", + "endpoints": [ { - "hostname": "www.microsoft.com", - "port": 443 + "hostname": "somesqlservername.database.windows.net", + "port": 1433, + "protocol": "tcp" } ] } From 5781a29fbddfa377a2260e6813c444ce22bee67c Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Thu, 15 Apr 2021 15:26:59 -0700 Subject: [PATCH 15/20] FO 3.1.9 nuspec releaseNotes. --- FabricObserver.nuspec.template | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/FabricObserver.nuspec.template b/FabricObserver.nuspec.template index 1739ce8f..13be11e9 100644 --- a/FabricObserver.nuspec.template +++ b/FabricObserver.nuspec.template @@ -4,10 +4,13 @@ %PACKAGE_ID% 3.1.9 - - Fixed EventSource logger bug. + - Fixed minor EventSource logger bug. - Added retry logic and logging to Windows port monitoring code. - Added result set paging support in AppObserver's DeployedApplication query logic. + - Added retry logic to AppObserver's FabricClient calls. - Updated ApplicationInsights telemetry provider impl. + - Updated CsvLogger file management logic. + - Removed counter warm up calls in Windows Provider impls. Moved to callers. - Code maintenance. 
Microsoft From 253b1507a355f7a364add5163bba38c0be4a43e4 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Thu, 15 Apr 2021 15:28:09 -0700 Subject: [PATCH 16/20] CO 2.1.7 nuspec update: releaseNotes --- ClusterObserver.nuspec.template | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ClusterObserver.nuspec.template b/ClusterObserver.nuspec.template index 5c0d98c7..9e789f12 100644 --- a/ClusterObserver.nuspec.template +++ b/ClusterObserver.nuspec.template @@ -4,9 +4,9 @@ %PACKAGE_ID% 2.1.7 - - Fixed bug in Application health query processor. + - Fixed important bug in Application health query processor. - ApplicationInsights TelemetryProvider impl update. - - Improvements in service close code. + - Improvements in service close cleanup code. Microsoft MIT From 1c66e0bc645d7ef4915268c3f2f82ef149a50e52 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Thu, 15 Apr 2021 16:28:12 -0700 Subject: [PATCH 17/20] Formatting --- FabricObserver/FabricObserver.cs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/FabricObserver/FabricObserver.cs b/FabricObserver/FabricObserver.cs index b15edd49..9352c900 100644 --- a/FabricObserver/FabricObserver.cs +++ b/FabricObserver/FabricObserver.cs @@ -96,10 +96,7 @@ private void LoadObserversFromPlugins(ServiceCollection services) foreach (string pluginDll in pluginDlls) { - PluginLoader loader = PluginLoader.CreateFromAssemblyFile( - pluginDll, - sharedTypes); - + PluginLoader loader = PluginLoader.CreateFromAssemblyFile(pluginDll, sharedTypes); pluginLoaders.Add(loader); } @@ -107,8 +104,7 @@ private void LoadObserversFromPlugins(ServiceCollection services) { Assembly pluginAssembly = pluginLoader.LoadDefaultAssembly(); - FabricObserverStartupAttribute[] startupAttributes = - pluginAssembly.GetCustomAttributes().ToArray(); + FabricObserverStartupAttribute[] startupAttributes = pluginAssembly.GetCustomAttributes().ToArray(); for (int i = 0; i < startupAttributes.Length; ++i) { From 
14a37c14f58cfa3010404943c4378d6d85984b69 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Thu, 15 Apr 2021 16:29:48 -0700 Subject: [PATCH 18/20] Default value increased. --- .../ApplicationPackageRoot/ApplicationManifest.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml index 8af6d042..bb644e16 100644 --- a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml +++ b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml @@ -116,7 +116,7 @@ - + From 4c7d930a446e730a59d4a7754b0b38f7ecd0583a Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Thu, 15 Apr 2021 16:33:25 -0700 Subject: [PATCH 19/20] Sample obs Updated to 3.1.9 --- SampleObserverPlugin/ReadMe.md | 4 ++-- SampleObserverPlugin/SampleObserverPlugin.csproj | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/SampleObserverPlugin/ReadMe.md b/SampleObserverPlugin/ReadMe.md index 42f70d17..180b44e1 100644 --- a/SampleObserverPlugin/ReadMe.md +++ b/SampleObserverPlugin/ReadMe.md @@ -126,7 +126,7 @@ You can deploy using the contents of your build out directory - just remove the * Create new instance of FO, which will contain your observer plugin ```Powershell $path = "[sourcedir]\MyObserverPlugin\bin\release\netstandard2.0\[target os platform, e.g., win-x64 or linux-x64]" -Copy-ServiceFabricApplicationPackage -ApplicationPackagePath $path -CompressPackage -ApplicationPackagePathInImageStore FabricObserverV316 -TimeoutSec 1800 +Copy-ServiceFabricApplicationPackage -ApplicationPackagePath $path -CompressPackage -ApplicationPackagePathInImageStore FabricObserverV319 -TimeoutSec 1800 Register-ServiceFabricApplicationType -ApplicationPathInImageStore FabricObserverV316 -New-ServiceFabricApplication -ApplicationName fabric:/FabricObserver -ApplicationTypeName FabricObserverType -ApplicationTypeVersion 3.1.6 
+New-ServiceFabricApplication -ApplicationName fabric:/FabricObserver -ApplicationTypeName FabricObserverType -ApplicationTypeVersion 3.1.9 ``` diff --git a/SampleObserverPlugin/SampleObserverPlugin.csproj b/SampleObserverPlugin/SampleObserverPlugin.csproj index c8b68837..edb0b677 100644 --- a/SampleObserverPlugin/SampleObserverPlugin.csproj +++ b/SampleObserverPlugin/SampleObserverPlugin.csproj @@ -10,6 +10,6 @@ - + From 6d59fb63182843423fddcae9c6f1e261f60e5649 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Thu, 15 Apr 2021 16:39:35 -0700 Subject: [PATCH 20/20] nuspec update (FO) --- FabricObserver.nuspec.template | 1 + 1 file changed, 1 insertion(+) diff --git a/FabricObserver.nuspec.template b/FabricObserver.nuspec.template index 13be11e9..0cbfd072 100644 --- a/FabricObserver.nuspec.template +++ b/FabricObserver.nuspec.template @@ -8,6 +8,7 @@ - Added retry logic and logging to Windows port monitoring code. - Added result set paging support in AppObserver's DeployedApplication query logic. - Added retry logic to AppObserver's FabricClient calls. + - Added best effort code to automatically fix malformed targetApp values supplied in AppObserver configuration. - Updated ApplicationInsights telemetry provider impl. - Updated CsvLogger file management logic. - Removed counter warm up calls in Windows Provider impls. Moved to callers.