diff --git a/FabricObserver/Observers/ContainerObserver.cs b/FabricObserver/Observers/ContainerObserver.cs index 7ea220b3..680ebf37 100644 --- a/FabricObserver/Observers/ContainerObserver.cs +++ b/FabricObserver/Observers/ContainerObserver.cs @@ -17,11 +17,13 @@ using FabricObserver.Observers.MachineInfoModel; using System.Fabric.Description; using System.Fabric.Health; +using System.ComponentModel; namespace FabricObserver.Observers { public class ContainerObserver : ObserverBase { + private const int MaxProcessExitWaitTimeMS = 60000; private List> allCpuDataPercentage; private List> allMemDataMB; @@ -445,22 +447,41 @@ fed0da6f7bad sf-243-723d5795-01c7-477f-950e-45a400000000_2cc293c0-929c-5c40-bc output.Add(l); } - p.WaitForExit(); + if (!p.WaitForExit(MaxProcessExitWaitTimeMS)) + { + try + { + p?.Kill(); + } + catch (Exception e) when (e is InvalidOperationException || e is NotSupportedException || e is Win32Exception) + { + + } + + return; + } int exitStatus = p.ExitCode; stdOutput.Close(); + // Was there an error running docker stats? if (exitStatus != 0) { - // there was an error associated with the non-zero exit code. Log it and throw. string msg = $"docker stats --no-stream exited with {exitStatus}: {error}"; + + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + { + msg += " NOTE: You must run FabricObserver as System user or Admin user on Windows " + + "in order for ContainerObserver to function correctly on Windows."; + } + ObserverLogger.LogWarning(msg); var healthReport = new Utilities.HealthReport { AppName = new Uri($"fabric:/{ObserverConstants.FabricObserverName}"), EmitLogEvent = EnableVerboseLogging, - HealthMessage = msg + " " + "NOTE: You must run FabricObserver as System user or Admin user in order for ContainerObserver to function correctly.", + HealthMessage = $"{msg}", HealthReportTimeToLive = GetHealthReportTimeToLive(), Property = "docker_stats_failure", ReportType = HealthReportType.Application, diff --git a/FabricObserver/Observers/ObserverManager.cs b/FabricObserver/Observers/ObserverManager.cs index fccb2427..bf6adc03 100644 --- a/FabricObserver/Observers/ObserverManager.cs +++ b/FabricObserver/Observers/ObserverManager.cs @@ -173,7 +173,7 @@ public ObserverManager(IServiceProvider serviceProvider, FabricClient fabricClie linkedSFRuntimeObserverTokenSource = CancellationTokenSource.CreateLinkedTokenSource(cts.Token, this.token); FabricClientInstance = fabricClient; FabricServiceContext = serviceProvider.GetRequiredService(); - nodeName = FabricServiceContext?.NodeContext.NodeName; + nodeName = FabricServiceContext.NodeContext.NodeName; FabricServiceContext.CodePackageActivationContext.ConfigurationPackageModifiedEvent += CodePackageActivationContext_ConfigurationPackageModifiedEvent; // Observer Logger setup. @@ -940,7 +940,7 @@ private async Task RunObserversAsync() var exceptionBuilder = new StringBuilder(); bool allExecuted = true; - for (int i = 0; i < observers.Count(); ++i) + for (int i = 0; i < observers.Count; ++i) { var observer = observers[i]; @@ -963,7 +963,6 @@ private async Task RunObserversAsync() } Logger.LogInfo($"Starting {observer.ObserverName}"); - IsObserverRunning = true; // Synchronous call. @@ -973,7 +972,7 @@ private async Task RunObserversAsync() // Currently, this observer will not run again for the lifetime of this FO service instance. if (!isCompleted && !(TaskCancelled || shutdownSignaled)) { - string observerHealthWarning = $"{observer.ObserverName} has exceeded its specified Maximum run time of {ObserverExecutionTimeout.TotalSeconds} seconds. " + + string observerHealthWarning = $"{observer.ObserverName} on node {nodeName} has exceeded its specified Maximum run time of {ObserverExecutionTimeout.TotalSeconds} seconds. " + $"This means something is wrong with {observer.ObserverName}. It will not be run again. Please look into it."; Logger.LogError(observerHealthWarning); diff --git a/FabricObserver/PackageRoot/Config/ContainerObserver.config.json b/FabricObserver/PackageRoot/Config/ContainerObserver.config.json index 13a5b4e0..05ec1439 100644 --- a/FabricObserver/PackageRoot/Config/ContainerObserver.config.json +++ b/FabricObserver/PackageRoot/Config/ContainerObserver.config.json @@ -1,7 +1,7 @@ [ { "targetApp": "*", - "cpuWarningLimitPercent": 30, - "memoryWarningLimitMB": 500 + "cpuWarningLimitPercent": 50, + "memoryWarningLimitMB": 1048 } ] \ No newline at end of file diff --git a/FabricObserver/PackageRoot/Config/Settings.xml b/FabricObserver/PackageRoot/Config/Settings.xml index beae8af4..0ac1f50e 100644 --- a/FabricObserver/PackageRoot/Config/Settings.xml +++ b/FabricObserver/PackageRoot/Config/Settings.xml @@ -14,8 +14,8 @@ your deployment and configuration - like if you watch all services of a all applications and there are hundreds of them. You can do this easily by setting EnableVerboseLogging to true on AppObserver, for example, and it will log how long it takes to run each time it runs to completion. Any observer that monitors stuff supports run duration logging. Use this these values to determine what constitutes too long and that an observer needs to be fixed. - No Observer should ever take 2 hours to run, so the below setting is an extreme maximum. Vary the number to meet your actual needs. --> - + No Observer should ever take 1 hour to run, for example, so the default setting of 3600 seconds is an extreme maximum. Vary the number to meet your actual needs. --> +