diff --git a/Build-SFPkgs.ps1 b/Build-SFPkgs.ps1 index 677e2c8f..40a9658a 100644 --- a/Build-SFPkgs.ps1 +++ b/Build-SFPkgs.ps1 @@ -23,11 +23,11 @@ function Build-SFPkg { try { Push-Location $scriptPath - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.SelfContained.3.1.15" "$scriptPath\bin\release\FabricObserver\linux-x64\self-contained\FabricObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.FrameworkDependent.3.1.15" "$scriptPath\bin\release\FabricObserver\linux-x64\framework-dependent\FabricObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.SelfContained.3.1.16" "$scriptPath\bin\release\FabricObserver\linux-x64\self-contained\FabricObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.FrameworkDependent.3.1.16" "$scriptPath\bin\release\FabricObserver\linux-x64\framework-dependent\FabricObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.SelfContained.3.1.15" "$scriptPath\bin\release\FabricObserver\win-x64\self-contained\FabricObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.FrameworkDependent.3.1.15" "$scriptPath\bin\release\FabricObserver\win-x64\framework-dependent\FabricObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.SelfContained.3.1.16" "$scriptPath\bin\release\FabricObserver\win-x64\self-contained\FabricObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.FrameworkDependent.3.1.16" "$scriptPath\bin\release\FabricObserver\win-x64\framework-dependent\FabricObserverType" } finally { Pop-Location diff --git a/ClusterObserver/ClusterObserver.cs b/ClusterObserver/ClusterObserver.cs index e05d4275..bde01b26 100644 --- a/ClusterObserver/ClusterObserver.cs +++ b/ClusterObserver/ClusterObserver.cs @@ -251,11 +251,12 @@ private async Task ReportClusterHealthAsync(CancellationToken token) { } - break; + case HealthEvaluationKind.Application: case 
HealthEvaluationKind.Applications: case HealthEvaluationKind.SystemApplication: + try { await ProcessApplicationHealthAsync(clusterHealth.ApplicationHealthStates, token).ConfigureAwait(true); @@ -264,9 +265,10 @@ private async Task ReportClusterHealthAsync(CancellationToken token) { } - break; + default: + try { await ProcessGenericEntityHealthAsync(evaluation, token).ConfigureAwait(true); @@ -275,7 +277,6 @@ private async Task ReportClusterHealthAsync(CancellationToken token) { } - break; } } @@ -633,7 +634,8 @@ private async Task MonitorNodeStatusAsync(CancellationToken token) Description = $"{nodeDictItem.Key} is now Up.", Metric = "NodeStatus", NodeName = nodeDictItem.Key, - Source = ObserverName + Source = ObserverName, + Value = 0 }; await ObserverTelemetryClient.ReportHealthAsync(telemetry, token); @@ -646,13 +648,12 @@ private async Task MonitorNodeStatusAsync(CancellationToken token) ObserverConstants.ClusterObserverETWEventName, new { - HealthScope = "Node", HealthState = "Ok", - HealthEventDescription = $"{nodeDictItem.Key} is now Up.", + Description = $"{nodeDictItem.Key} is now Up.", Metric = "NodeStatus", NodeName = nodeDictItem.Key, - NodeStatus = "Up", - Source = ObserverName + Source = ObserverName, + Value = 0 }); } @@ -706,7 +707,8 @@ private async Task MonitorNodeStatusAsync(CancellationToken token) Description = message, Metric = "NodeStatus", NodeName = kvp.Key, - Source = ObserverName + Source = ObserverName, + Value = 1, }; await ObserverTelemetryClient.ReportHealthAsync(telemetry, token); @@ -719,13 +721,12 @@ private async Task MonitorNodeStatusAsync(CancellationToken token) ObserverConstants.ClusterObserverETWEventName, new { - HealthScope = "Node", HealthState = "Warning", - HealthEventDescription = message, + Description = message, Metric = "NodeStatus", NodeName = kvp.Key, - NodeStatus = $"{kvp.Value.NodeStatus}", - Source = ObserverName + Source = ObserverName, + Value = 1, }); } } diff --git a/ClusterObserver/ClusterObserver.csproj 
b/ClusterObserver/ClusterObserver.csproj index b1c42d98..5ddc5ccc 100644 --- a/ClusterObserver/ClusterObserver.csproj +++ b/ClusterObserver/ClusterObserver.csproj @@ -27,6 +27,6 @@ - + diff --git a/ClusterObserver/Utilities/Telemetry/AppInsightsTelemetry.cs b/ClusterObserver/Utilities/Telemetry/AppInsightsTelemetry.cs index 2e66eec3..ab429cde 100644 --- a/ClusterObserver/Utilities/Telemetry/AppInsightsTelemetry.cs +++ b/ClusterObserver/Utilities/Telemetry/AppInsightsTelemetry.cs @@ -133,7 +133,7 @@ public Task ReportHealthAsync(TelemetryData telemetryData, CancellationToken can { "ErrorCode", telemetryData.Code ?? string.Empty }, { "Description", telemetryData.Description ?? string.Empty }, { "Metric", telemetryData.Metric ?? string.Empty }, - { "Value", telemetryData.Value.ToString() }, + { "Value", telemetryData.Value == 0 ? "Up" : "Down" }, { "Partition", telemetryData.PartitionId }, { "Replica", telemetryData.ReplicaId.ToString() }, { "Source", telemetryData.ObserverName }, diff --git a/ClusterObserver/Utilities/Telemetry/LogAnalyticsTelemetry.cs b/ClusterObserver/Utilities/Telemetry/LogAnalyticsTelemetry.cs index f6728af0..3a683bbf 100644 --- a/ClusterObserver/Utilities/Telemetry/LogAnalyticsTelemetry.cs +++ b/ClusterObserver/Utilities/Telemetry/LogAnalyticsTelemetry.cs @@ -156,10 +156,8 @@ private string GetSignature( string message = $"{method}\n{contentLength}\n{contentType}\nx-ms-date:{date}\n{resource}"; byte[] bytes = Encoding.UTF8.GetBytes(message); - using (var encryptor = new HMACSHA256(Convert.FromBase64String(Key))) - { - return $"SharedKey {WorkspaceId}:{Convert.ToBase64String(encryptor.ComputeHash(bytes))}"; - } + using var encryptor = new HMACSHA256(Convert.FromBase64String(Key)); + return $"SharedKey {WorkspaceId}:{Convert.ToBase64String(encryptor.ComputeHash(bytes))}"; } // These two overloads of ReportHealthAsync are the only function impls that really makes sense for ClusterObserver diff --git a/Documentation/Observers.md 
b/Documentation/Observers.md index ef1e6051..1f6a1f57 100644 --- a/Documentation/Observers.md +++ b/Documentation/Observers.md @@ -20,6 +20,7 @@ Service Fabric Error Health Events can block upgrades and other important Fabric | Observer | Description | | :--- | :--- | | [AppObserver](#appobserver) | Monitors CPU usage, Memory use, and Disk space availability for Service Fabric Application services (processes) and their spawn (child processes). Alerts when user-supplied thresholds are reached. | +| [AzureStorageUploadObserver](#azurestorageuploadobserver) | Runs periodically (do set its RunInterval setting) and will upload dmp files that AppObserver creates when you set dumpProcessOnError to true. It will clean up files after successful upload. | | [CertificateObserver](#certificateobserver) | Monitors the expiration date of the cluster certificate and any other certificates provided by the user. Warns when close to expiration. | | [DiskObserver](#diskobserver) | Monitors, storage disk information like capacity and IO rates. Alerts when user-supplied thresholds are reached. | | [FabricSystemObserver](#fabricsystemobserver) | Monitors CPU usage, Memory use, and Disk space availability for Service Fabric System services (compare to AppObserver) | @@ -45,6 +46,38 @@ For every other observer, it's XML as per usual. Observer that monitors CPU usage, Memory use, and Port use for Service Fabric Application service processes and the child processes they spawn. If a service process creates child processes, then these processes will be monitored and their summed resource usage for some metric you are observing will be applied to the parent process (added) and a threshold breach will be determined based on the sum of children and parent resource usage. This observer will alert (SF Health event) when user-supplied thresholds are reached. **Please note that this observer should not be used to monitor docker container applications. It is not designed for this task. 
Instead, please consider employing [ContainerObserver](https://github.com/GitTorre/ContainerObserver), which is designed specifically for container monitoring**. +***Important: By default, FabricObserver runs as an unprivileged user (NetworkUser on Windows and sfappsuser on Linux). If you want to monitor services that are running as System user (or Admin user) on Windows, you must run FabricObserver as System user.*** + +***For Linux, there is no need to run as root, so do not do that.*** + +You configure FO's user account type in ApplicationManifest.xml (only Windows would need this. FO's build script automatically inserts this setting for Linux target, and for running Setup scripts only (not Code package binaries), to support FO's Linux Capabilities implementation): + +```XML + + + + + + + + + + + + + + +``` + ### A note on child process monitoring AppObserver (FO version >= 3.1.15) will automatically monitor up to 50 process descendants of your primary service process (50 is extreme. You should not design services that own that many descendant processes..). If your services launch child processes, then AppObserver will automatically monitor them for the same metrics and thresholds you supply for the containing Application. @@ -109,6 +142,30 @@ All settings are optional, ***except target OR targetType***, and can be omitted AppObserver also supports non-JSON parameters for configuration unrelated to thresholds. Like all observers these settings are located in ApplicationManifest.xml to support versionless configuration updates via application upgrade. +#### Non-json settings + +```XML + + + + + + + + + + + + + + + + + + +``` + Example AppObserver Output (Warning - Ephemeral Ports Usage): ![alt text](/Documentation/Images/AppObsWarn.png "AppObserver Warning output example.") @@ -120,6 +177,72 @@ as explained above. 
Like FabricSystemObserver, all data is stored in in-memory d This observer also monitors the FabricObserver service itself across CPU/Mem/FileHandles/Ports. +## AzureStorageUploadObserver +Runs periodically (you can set its RunInterval setting, just like any observer) and will upload dmp files of user services that AppObserver creates when you set dumpProcessOnError to true and supply Error thresholds in AppObserver configuration. The files are compressed and uploaded to a specified Azure Storage Account (blob storage only) and blob container name (default is fodumps, but you can configure this). It will delete dmp files from local storage after each successful upload. +For authentication to Azure Storage, Storage Connection String and Account Name/Account Key pair are supported today. Since there is currently only support for Windows process dumps (by AppObserver only), there is no need to run this Observer on Linux (today..). +The dumps created are *not* crash dumps, they are live dumps of a process's memory, handles, threads. The target process will not be killed or blow up in memory size. The offending service will keep on doing what it's doing wrong. +By default, the dmp files are MiniPlus mini dumps, so they will be roughly as large as the target process's private working set and stack. You can set to Mini (similar size) or +Full, which is much larger. You probably do not need to create Full dumps in most cases. + +Note that this feature does not apply to the FabricObserver process, even if specifying a configuration setting to do so. FabricObserver will not dump itself. + +#### Compression + +All dmp files are compressed to zip files before uploading to your storage account over the Internet. By default, the compression level is set to Optimal, which means the files will be compressed to the *smallest size possible*. You can change this in configuration to Fastest or NoCompression. We do not recommend NoCompression. The choice is yours to own. 
+ +**Optimal**: Best compression, uses more CPU for a short duration (this should not be an issue nor a deciding factor). + +**Fastest**: Fastest compression, uses less CPU than Optimal, produces non-optimally compressed files. + +**NoCompression**: Don't compress. This is NOT recommended. You should reduce the size of these files before uploading them to your cloud storage (blob) container. + +A note on resource usage: This feature is intended for the exceptional case - when your app service is truly doing something really wrong (like leaking memory, ports, handles). Make sure that you set your Error thresholds to meaningfully high values. Internally, FabricObserver will only dump a configured number of times in a specified time window per service, per observed metric. The idea +is to not eat your local disk space and use up too much CPU for too long. Please be mindful of how you utilize this **debugging** feature. It is best to enable it in Test and Staging clusters to find the egregious bugs in your service code *before* you ship your services to production clusters. + +#### Encrypting your secrets + +It is very important that you generate an encrypted Connection String or Account Key string in a supported way: Use Service Fabric's Invoke-ServiceFabricEncryptText PowerShell cmdlet with your Cluster thumbprint or cert name/location. +Please see the [related documentation with samples](https://docs.microsoft.com/en-us/powershell/module/servicefabric/invoke-servicefabricencrypttext?view=azureservicefabricps). It is really easy to do! Non-encrypted strings are supported, but we do not recommend using them. The decision is yours to own. + +Also, since FO runs as NetworkUser by default, you will need to supply a SecretsCertificate setting in ApplicationManifest.xml which will enable FO to run as an unprivileged user and access your private key for the cert installed on the local machine. +This section is already present in ApplicationManifest.xml. 
Just add the thumbprint you used to create your encrypted connection string or account key and a friendly name for the cert. +If you do not do this, then you will need to run FO as System in order for decryption of your connection string to work and for blob uploads to succeed. + +***As always, if you want to monitor user services on Windows that are running as System user (or Admin user), you must run FabricObserver as System user on Windows.*** In the FO-as-System-user-on-Windows case, you do not need to set SecretsCertificate. + +SecretsCertificate configuration in ApplicationManifest.xml: + +```XML +... + + + + + +``` + +Example AzureStorageUploadObserver configuration in ApplicationManifest.xml: + +```XML + + + + + + + + + + +``` + + ## CertificateObserver Monitors the expiration date of the cluster certificate and any other certificates provided by the user. diff --git a/Documentation/Using.md b/Documentation/Using.md index bb9729cd..6a2bc0f4 100644 --- a/Documentation/Using.md +++ b/Documentation/Using.md @@ -333,6 +333,23 @@ The configuration below specifies that AppObserver is to monitor and report thre } ] ``` +***Problem:*** I want to dump any user Windows SF service process that is eating too much memory. This is not supported on Linux. + +***Solution:*** AppObserver is your friend. Note, you can specify all app targets using either "*" or "All"(case doesn't matter). +In this case, AppObserver will initiate a mini dump (MiniPlus by default) of an offending process running on Windows. You can configure [AzureStorageUploadObserver](/Documentation/Observers.md#azurestorageuploadobserver) to ship the dmp (compressed to zip file) to a blob in your Azure storage account. +Please see [Observers documentation](/Documentation/Observers.md), specifically App and AzureStorageUpload observer sections for details on this process dump and upload feature. 
+ +```JSON +{ + "targetApp": "*", + "appExcludeList": "fabric:/SomeApp, fabric:/SomeOtherApp", + "cpuWarningLimitPercent": 85, + "memoryErrorLimitMb": 1048, + "dumpProcessOnError": true, + "networkWarningActivePorts": 8000, + "networkWarningEphemeralPorts": 7500 + } +``` > You can learn all about the currently implemented Observers and their supported resource properties [***here***](/Documentation/Observers.md). diff --git a/FabricObserver.Extensibility/FabricObserver.Extensibility.csproj b/FabricObserver.Extensibility/FabricObserver.Extensibility.csproj index 8a48d17f..de67f52a 100644 --- a/FabricObserver.Extensibility/FabricObserver.Extensibility.csproj +++ b/FabricObserver.Extensibility/FabricObserver.Extensibility.csproj @@ -12,16 +12,22 @@ + + + + + + - + - + diff --git a/FabricObserver.Extensibility/Interfaces/ITelemetryProvider.cs b/FabricObserver.Extensibility/Interfaces/ITelemetryProvider.cs index 4da1ef3b..169773af 100644 --- a/FabricObserver.Extensibility/Interfaces/ITelemetryProvider.cs +++ b/FabricObserver.Extensibility/Interfaces/ITelemetryProvider.cs @@ -63,7 +63,6 @@ Task ReportAvailabilityAsync( /// Optional: TraceTelemetry context cloud instance name. /// a Task. 
Task ReportHealthAsync( - HealthScope scope, string propertyName, HealthState state, string unhealthyEvaluations, diff --git a/FabricObserver.Extensibility/ObserverBase.cs b/FabricObserver.Extensibility/ObserverBase.cs index 1b8ccf44..8a454e8b 100644 --- a/FabricObserver.Extensibility/ObserverBase.cs +++ b/FabricObserver.Extensibility/ObserverBase.cs @@ -28,11 +28,36 @@ public abstract class ObserverBase : IObserver { private const int TtlAddMinutes = 5; private const string FabricSystemAppName = "fabric:/System"; - private const int MaxDumps = 5; - private Dictionary serviceDumpCountDictionary; - private string SFLogRoot; - private string SFDumpsPath; private bool disposed; + private Dictionary ServiceDumpCountDictionary; + + public bool EnableProcessDumps + { + get;set; + } + + /* Process dump settings. TODO: Only AppObserver and Windows is supported today. */ + public string DumpsPath + { + get; set; + } + + public int MaxDumps + { + get; set; + } + + public TimeSpan MaxDumpsTimeWindow + { + get; set; + } = TimeSpan.FromHours(4); + + public DumpType DumpType + { + get; set; + } = DumpType.MiniPlus; + + /* End AO procsess dump settings. */ public string ObserverName { @@ -325,10 +350,9 @@ protected ObserverBase(FabricClient fabricClient, StatelessServiceContext statel EnableETWLogging = IsEtwProviderEnabled }; - // Only supported on Windows (dump on error). - if (string.IsNullOrWhiteSpace(SFDumpsPath) && RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + if (string.IsNullOrWhiteSpace(DumpsPath)) { - SetDefaultSFWindowsDumpPath(); + SetDumpPath(); } ConfigurationSettings = new ConfigSettings( @@ -364,35 +388,6 @@ protected ObserverBase(FabricClient fabricClient, StatelessServiceContext statel /// A Task. 
public abstract Task ReportAsync(CancellationToken token); - /// - /// - /// - /// - /// - /// - public void WriteToLogWithLevel(string property, string description, LogLevel level) - { - switch (level) - { - case LogLevel.Information: - ObserverLogger.LogInfo("{0} logged at level {1}: {2}", property, level, description); - break; - - case LogLevel.Warning: - ObserverLogger.LogWarning("{0} logged at level {1}: {2}", property, level, description); - break; - - case LogLevel.Error: - ObserverLogger.LogError("{0} logged at level {1}: {2}", property, level, description); - break; - - default: - return; - } - - Logger.Flush(); - } - /// /// Gets a parameter value from the specified section. /// @@ -448,32 +443,107 @@ public void Dispose() Dispose(true); } - // Windows process dmp creator. + // Windows process dmp creator.\\ + /// /// This function will create Windows process dumps in supplied location if there is enough disk space available. - /// This function runs if you set dumpProcessOnError to true in AppObserver.config.json, for example. + /// Only AppObserver is supported today since it will generate memory dumps for the service processes it monitors when an Error threshold has been breached. + /// In the future, this may be applied to FabricSystemObserver as well, thus this code is located in ObserverBase.. + /// This function runs if you set dumpProcessOnError to true in AppObserver.config.json AND enable process dumps in AppObserver configuration in ApplicationManifest.xml. /// /// Process id of the target process to dump. - /// Optional: The type of dump to generate. Default is DumpType.Full. - /// Optional: The full path to store dump file. Default is %SFLogRoot%\CrashDumps + /// Process name. + /// The name of the metric threshold that was breached, leading to dump. /// true or false if the operation succeeded. 
- private bool DumpServiceProcessWindows(int processId, DumpType dumpType = DumpType.Full, string folderPath = null, string fileName = null) + public bool DumpWindowsServiceProcess(int processId, string procName, string metric) { if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { return false; } - if (string.IsNullOrWhiteSpace(SFDumpsPath) && string.IsNullOrWhiteSpace(folderPath)) + // Must provide a process name.. & do not try and dump yourself.. + if (string.IsNullOrEmpty(procName) || procName == ObserverConstants.FabricObserverName) + { + return false; + } + + if (string.IsNullOrWhiteSpace(DumpsPath)) { return false; } - string path = !string.IsNullOrWhiteSpace(folderPath) ? folderPath : SFDumpsPath; - string processName = !string.IsNullOrWhiteSpace(fileName) ? fileName : string.Empty; + if (!Directory.Exists(DumpsPath)) + { + try + { + Directory.CreateDirectory(DumpsPath); + } + catch (Exception e) when (e is ArgumentException || e is IOException || e is UnauthorizedAccessException) + { + ObserverLogger.LogWarning($"Can't create dump directory for path {DumpsPath}. Will not generate dmp file for {procName}. 
" + + $"Error info:{Environment.NewLine}{e}"); + return false; + } + } + + if (ServiceDumpCountDictionary == null) + { + ServiceDumpCountDictionary = new Dictionary(5); + } + + StringBuilder sb = new StringBuilder(metric); + string metricName = sb.Replace(" ", string.Empty) + .Replace("Total", string.Empty) + .Replace("MB", string.Empty) + .Replace("%", string.Empty) + .Replace("Active", string.Empty) + .Replace("Allocated", string.Empty) + .Replace("Length", string.Empty) + .Replace("Consumption", string.Empty) + .Replace("Time", string.Empty) + .Replace("TCP", string.Empty).ToString(); + sb.Clear(); + string dumpKey = $"{procName}_{metricName}"; + string dumpFileName = $"{dumpKey}_{NodeName}"; + + try + { + if (Directory.Exists(DumpsPath) && Directory.GetFiles(DumpsPath, $"{dumpKey}*.dmp", SearchOption.AllDirectories).Length >= MaxDumps) + { + ObserverLogger.LogWarning($"Reached maximum number({MaxDumps}) of {dumpKey} dmp files stored on local disk. Will not create dmp file. " + + $"If enabled, please make sure that AzureStorageObserver is configured correctly. " + + $"Will attempt to delete old (>= 1 day) local files now."); + + // Clean out old dmp files, if any. Generally, there will only be some dmp files remaining on disk if customer has not configured + // AzureStorageObserver correctly or some error occurred during some stage of the upload process. 
+ ObserverLogger.TryCleanFolder(DumpsPath, $"{dumpKey}*.dmp", TimeSpan.FromDays(1)); + return false; + } + } + catch (Exception e) when (e is ArgumentException || e is IOException || e is UnauthorizedAccessException) + { + + } + + if (!ServiceDumpCountDictionary.ContainsKey(dumpKey)) + { + ServiceDumpCountDictionary.Add(dumpKey, (0, DateTime.UtcNow)); + } + else if (DateTime.UtcNow.Subtract(ServiceDumpCountDictionary[dumpKey].LastDumpDate) >= MaxDumpsTimeWindow) + { + ServiceDumpCountDictionary[dumpKey] = (0, DateTime.UtcNow); + } + else if (ServiceDumpCountDictionary[dumpKey].DumpCount >= MaxDumps) + { + ObserverLogger.LogWarning($"Reached maximum number of process dumps({MaxDumps}) for key {dumpKey} " + + $"within {MaxDumpsTimeWindow.TotalHours} hour period. Will not create dmp file."); + return false; + } + NativeMethods.MINIDUMP_TYPE miniDumpType; - switch (dumpType) + switch (DumpType) { case DumpType.Full: miniDumpType = NativeMethods.MINIDUMP_TYPE.MiniDumpWithFullMemory | @@ -498,49 +568,35 @@ private bool DumpServiceProcessWindows(int processId, DumpType dumpType = DumpTy break; default: - throw new ArgumentOutOfRangeException(nameof(dumpType), dumpType, null); + throw new ArgumentOutOfRangeException(nameof(DumpType), DumpType, null); } + string dumpFilePath = null; + try { using (Process process = Process.GetProcessById(processId)) { - if (processName == string.Empty) + if (dumpFileName == string.Empty) { - processName = process.ProcessName; + dumpFileName = process.ProcessName; } IntPtr processHandle = process.Handle; - processName += $"_{DateTime.Now:ddMMyyyyHHmmss}.dmp"; + dumpFileName += $"_{DateTime.Now:ddMMyyyyHHmmssFFF}.dmp"; // Check disk space availability before writing dump file. 
- string driveName = path.Substring(0, 2); + string driveName = DumpsPath.Substring(0, 2); if (DiskUsage.GetCurrentDiskSpaceUsedPercent(driveName) > 90) { - HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.OriginalString, - ObserverName, - HealthState.Warning, - "Not enough disk space available for dump file creation."); - + ObserverLogger.LogWarning("Not enough disk space available for dump file creation."); return false; } - if (!Directory.Exists(path)) - { - try - { - Directory.CreateDirectory(path); - } - catch (Exception e) when (e is IOException || e is UnauthorizedAccessException) - { - // Can't create directory in SF dumps folder, so dump into top level directory.. - path = SFDumpsPath; - } - } + dumpFilePath = Path.Combine(DumpsPath, dumpFileName); - using (FileStream file = File.Create(Path.Combine(path, processName))) + using (FileStream file = File.Create(dumpFilePath)) { if (!NativeMethods.MiniDumpWriteDump( processHandle, @@ -553,6 +609,11 @@ private bool DumpServiceProcessWindows(int processId, DumpType dumpType = DumpTy { throw new Win32Exception(Marshal.GetLastWin32Error()); } + + if (!string.IsNullOrWhiteSpace(metric)) + { + ServiceDumpCountDictionary[dumpKey] = (ServiceDumpCountDictionary[dumpKey].DumpCount + 1, DateTime.UtcNow); + } } } @@ -567,9 +628,23 @@ e is UnauthorizedAccessException || e is Win32Exception) { ObserverLogger.LogWarning( - $"Failure generating Windows process dump file {processName} with error:{Environment.NewLine}{e}"); - } + $"Failure generating Windows process dump file {dumpFileName} with error:{Environment.NewLine}{e}"); + if (File.Exists(dumpFilePath)) + { + // This means a partial file may have been created (like the process went away during dump capture). Delete it. + try + { + Retry.Do(() => File.Delete(Path.Combine(DumpsPath, dumpFileName)), TimeSpan.FromSeconds(1), Token); + } + catch(AggregateException) + { + // Couldn't delete file. 
+ // Retry.Do throws AggregateException containing list of exceptions caught. In this case, we don't really care.. + } + } + } + return false; } @@ -674,7 +749,7 @@ public void ProcessResourceDataReportHealth( // Telemetry - This is informational, per reading telemetry, healthstate is irrelevant here. If the process has children, then don't emit this raw data since it will already // be contained in the ChildProcessTelemetry data instances and AppObserver will have already emitted it. // Enable this for your observer if you want to send data to ApplicationInsights or LogAnalytics for each resource usage observation it makes per specified metric. - if (IsTelemetryEnabled && replicaOrInstance.ChildProcesses == null) + if (IsTelemetryEnabled && replicaOrInstance?.ChildProcesses == null) { _ = TelemetryClient?.ReportMetricAsync(telemetryData, Token).ConfigureAwait(true); } @@ -683,7 +758,7 @@ public void ProcessResourceDataReportHealth( // be contained in the ChildProcessTelemetry data instances and AppObserver will have already emitted it. // Enable this for your observer if you want to log etw (which can then be read by some agent that will send it to some endpoint) // for each resource usage observation it makes per specified metric. - if (IsEtwEnabled && replicaOrInstance.ChildProcesses == null) + if (IsEtwEnabled && replicaOrInstance?.ChildProcesses == null) { ObserverLogger.LogEtw( ObserverConstants.FabricObserverETWEventName, @@ -696,7 +771,7 @@ public void ProcessResourceDataReportHealth( Value = Math.Round(data.AverageDataValue, 0), PartitionId = replicaOrInstance?.PartitionId.ToString(), ProcessId = procId, - ReplicaId = replicaOrInstance?.ReplicaOrInstanceId, + ReplicaId = replicaOrInstance?.ReplicaOrInstanceId != null ? replicaOrInstance.ReplicaOrInstanceId : 0, ServiceName = serviceName?.OriginalString ?? string.Empty, Source = ObserverConstants.FabricObserverName, SystemServiceProcessName = appName?.OriginalString == FabricSystemAppName ? 
name : string.Empty @@ -762,52 +837,27 @@ public void ProcessResourceDataReportHealth( // part of the base class for future use, like for plugins that manage service processes. if (replicaOrInstance != null && dumpOnError && RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { - if (serviceDumpCountDictionary == null) - { - serviceDumpCountDictionary = new Dictionary(5); - } - - try + if (!string.IsNullOrWhiteSpace(DumpsPath)) { - int pid = (int)replicaOrInstance.HostProcessId; + if (ServiceDumpCountDictionary == null) + { + ServiceDumpCountDictionary = new Dictionary(5); + } - using (var proc = Process.GetProcessById(pid)) + try { - string procName = proc?.ProcessName; - StringBuilder sb = new StringBuilder(data.Property); - string metricName = sb.Replace(" ", string.Empty) - .Replace("Total", string.Empty) - .Replace("MB", string.Empty) - .Replace("%", string.Empty) - .Replace("Active", string.Empty) - .Replace("TCP", string.Empty).ToString(); - sb.Clear(); - string dumpKey = $"{procName}_{metricName}"; - - if (!serviceDumpCountDictionary.ContainsKey(dumpKey)) - { - serviceDumpCountDictionary.Add(dumpKey, (0, DateTime.UtcNow)); - } - else if (DateTime.UtcNow.Subtract(serviceDumpCountDictionary[dumpKey].LastDumpDate) >= TimeSpan.FromDays(1)) - { - serviceDumpCountDictionary[dumpKey] = (0, DateTime.UtcNow); - } + int pid = (int)replicaOrInstance.HostProcessId; - if (serviceDumpCountDictionary[dumpKey].DumpCount < MaxDumps) + using (var proc = Process.GetProcessById(pid)) { - // DumpServiceProcess defaults to a Full dump with process memory, handles and thread data. 
- bool success = DumpServiceProcessWindows(pid, DumpType.Full, Path.Combine(SFDumpsPath, procName), dumpKey); - - if (success) - { - serviceDumpCountDictionary[dumpKey] = (serviceDumpCountDictionary[dumpKey].DumpCount + 1, DateTime.UtcNow); - } + string procName = proc?.ProcessName; + _ = DumpWindowsServiceProcess(pid, procName, data.Property); } } - } - catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is Win32Exception) - { - ObserverLogger.LogInfo($"Unable to generate dmp file:{Environment.NewLine}{e}"); + catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is Win32Exception) + { + ObserverLogger.LogInfo($"Unable to generate dmp file:{Environment.NewLine}{e}"); + } } } } @@ -1139,47 +1189,29 @@ protected virtual void Dispose(bool disposing) } } - private void SetDefaultSFWindowsDumpPath() + private void SetDumpPath() { - // This only needs to be set once. - if (string.IsNullOrWhiteSpace(SFDumpsPath)) + if (ObserverName != ObserverConstants.AppObserverName) { - SFLogRoot = ServiceFabricConfiguration.Instance.FabricLogRoot; - - if (string.IsNullOrWhiteSpace(SFLogRoot)) - { - SFDumpsPath = null; - return; - } + return; } - SFDumpsPath = Path.Combine(SFLogRoot, "ApplicationCrashDumps"); - - if (Directory.Exists(SFDumpsPath)) + // This only needs to be set once. + if (!string.IsNullOrWhiteSpace(DumpsPath) && Directory.Exists(DumpsPath)) { return; } - HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.ToString(), - ObserverName, - HealthState.Warning, - $"Unable to locate dump directory {SFDumpsPath}. 
Trying another one..."); - - SFDumpsPath = Path.Combine(SFLogRoot, "CrashDumps"); - - if (Directory.Exists(SFDumpsPath)) + try + { + DumpsPath = Path.Combine(ObserverLogger.LogFolderBasePath, ObserverName, "MemoryDumps"); + Directory.CreateDirectory(DumpsPath); + } + catch (Exception e) when (e is ArgumentException || e is IOException || e is NotSupportedException || e is UnauthorizedAccessException) { + ObserverLogger.LogWarning($"Unable to create dump directory {DumpsPath}."); return; } - - SFDumpsPath = null; - HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.ToString(), - ObserverName, - HealthState.Warning, - $"Unable to locate dump directory {SFDumpsPath}. Aborting. Will not generate application service dumps."); - return; } private void SetObserverConfiguration() @@ -1250,7 +1282,6 @@ private void SetObserverConfiguration() new CancellationToken()); break; - case TelemetryProviderType.AzureApplicationInsights: @@ -1264,7 +1295,6 @@ private void SetObserverConfiguration() TelemetryClient = new AppInsightsTelemetry(aiKey); break; - default: diff --git a/FabricObserver.Extensibility/Utilities/Logger.cs b/FabricObserver.Extensibility/Utilities/Logger.cs index f8134f97..af1a9f5f 100644 --- a/FabricObserver.Extensibility/Utilities/Logger.cs +++ b/FabricObserver.Extensibility/Utilities/Logger.cs @@ -206,21 +206,16 @@ public bool TryDeleteInstanceLogFile() return false; } - for (var i = 0; i < Retries; i++) + try { - try - { - File.Delete(FilePath); - return true; - } - catch (Exception e) when (e is ArgumentException || e is IOException || e is UnauthorizedAccessException) - { - - } - - Thread.Sleep(1000); + Retry.Do(() => File.Delete(FilePath), TimeSpan.FromSeconds(1), CancellationToken.None); + return true; } + catch (AggregateException) + { + } + return false; } @@ -276,13 +271,6 @@ private void InitializeLoggers() FilePath = file; - // Clean out old log files. 
This is to ensure the supplied policy is enforced if FO is restarted before the MaxArchiveFileLifetimeDays has been reached. - // This is because Logger FileTarget settings are not preserved across FO deployments. - if (MaxArchiveFileLifetimeDays > 0) - { - TryCleanLogFolder(Path.Combine(logFolderBase, FolderName), TimeSpan.FromDays(MaxArchiveFileLifetimeDays)); - } - var targetName = loggerName + "LogFile"; if (LogManager.Configuration == null) @@ -308,25 +296,39 @@ private void InitializeLoggers() }; LogManager.Configuration.AddTarget(loggerName + "LogFile", target); - var ruleInfo = new LoggingRule(loggerName, NLog.LogLevel.Debug, target); - LogManager.Configuration.LoggingRules.Add(ruleInfo); LogManager.ReconfigExistingLoggers(); } TimeSource.Current = new AccurateUtcTimeSource(); OLogger = LogManager.GetLogger(loggerName); + + // Clean out old log files. This is to ensure the supplied policy is enforced if FO is restarted before the MaxArchiveFileLifetimeDays has been reached. + // This is because Logger FileTarget settings are not preserved across FO deployments. 
+ if (MaxArchiveFileLifetimeDays > 0) + { + TryCleanFolder(Path.Combine(logFolderBase, FolderName), "*.log", TimeSpan.FromDays(MaxArchiveFileLifetimeDays)); + } } - private static void TryCleanLogFolder(string folderPath, TimeSpan maxAge) + public void TryCleanFolder(string folderPath, string searchPattern, TimeSpan maxAge) { if (!Directory.Exists(folderPath)) { return; } - string[] files = Directory.GetFiles(folderPath, "*", SearchOption.AllDirectories); + string[] files = new string[] { }; + + try + { + files = Directory.GetFiles(folderPath, searchPattern, SearchOption.AllDirectories); + } + catch (Exception e) when (e is ArgumentException || e is IOException || e is UnauthorizedAccessException) + { + return; + } foreach (string file in files) { @@ -334,12 +336,12 @@ private static void TryCleanLogFolder(string folderPath, TimeSpan maxAge) { if (DateTime.UtcNow.Subtract(File.GetCreationTime(file)) >= maxAge) { - File.Delete(file); + Retry.Do(() => File.Delete(file), TimeSpan.FromSeconds(1), CancellationToken.None); } } - catch (Exception e) when (e is ArgumentException || e is IOException || e is UnauthorizedAccessException) + catch (Exception e) when (e is ArgumentException || e is AggregateException) { - + LogWarning($"Unable to delete file {file}:{Environment.NewLine}{e}"); } } } diff --git a/FabricObserver.Extensibility/Utilities/ObserverConstants.cs b/FabricObserver.Extensibility/Utilities/ObserverConstants.cs index c55ad358..41aa1460 100644 --- a/FabricObserver.Extensibility/Utilities/ObserverConstants.cs +++ b/FabricObserver.Extensibility/Utilities/ObserverConstants.cs @@ -19,6 +19,7 @@ public sealed class ObserverConstants public const string AsyncClusterOperationTimeoutSeconds = "ClusterOperationTimeoutSeconds"; public const string FabricObserverName = "FabricObserver"; public const string FabricObserverETWEventName = "FabricObserverDataEvent"; + public const string ObserverFailureHealthStateLevelParameter = "ObserverFailureHealthStateLevel"; // The name 
of the package that contains this Observer's configuration public const string ObserverConfigurationPackageName = "Config"; @@ -55,8 +56,19 @@ public sealed class ObserverConstants // AppObserver. public const string AppObserverName = "AppObserver"; public const string AppObserverConfigurationSectionName = "AppObserverConfiguration"; - public const string EnableChildProcessMonitoring = "EnableChildProcessMonitoring"; + public const string EnableChildProcessMonitoringParameter = "EnableChildProcessMonitoring"; public const string MaxChildProcTelemetryDataCountParameter = "MaxChildProcTelemetryDataCount"; + public const string EnableProcessDumpsParameter = "EnableProcessDumps"; + public const string DumpTypeParameter = "DumpType"; + public const string MaxDumpsParameter = "MaxDumps"; + public const string MaxDumpsTimeWindowParameter = "MaxDumpsTimeWindow"; + + // AzureStorageObserver + public const string AzureStorageConnectionStringParameter = "AzureStorageConnectionString"; + public const string AzureBlobContainerNameParameter = "BlobContainerName"; + public const string AzureStorageAccountNameParameter = "AzureStorageAccountName"; + public const string AzureStorageAccountKeyParameter = "AzureStorageAccountKey"; + public const string ZipFileCompressionLevelParameter = "ZipFileCompressionLevel"; // Certificate Observer public const string CertificateObserverName = "CertificateObserver"; diff --git a/FabricObserver.Extensibility/Utilities/ObserverHealthReporter.cs b/FabricObserver.Extensibility/Utilities/ObserverHealthReporter.cs index 28cdc642..cdeca5c1 100644 --- a/FabricObserver.Extensibility/Utilities/ObserverHealthReporter.cs +++ b/FabricObserver.Extensibility/Utilities/ObserverHealthReporter.cs @@ -32,39 +32,6 @@ public ObserverHealthReporter(Logger logger, FabricClient fabricClient) this.logger = logger; } - /// - /// Report FabricObserver service health as log event (not to SF Health). - /// - /// Name of the service. - /// Name of the health property. 
- /// Health state (Ok, Error, etc). - /// Description of the health condition. - public void ReportFabricObserverServiceHealth(string serviceName, string propertyName, HealthState healthState, string description) - { - string msg = $"{propertyName} reporting {healthState}: {description}"; - - switch (healthState) - { - case HealthState.Error: - logger.LogError(msg); - break; - - case HealthState.Warning: - logger.LogWarning(msg); - break; - - default: - { - if (logger.EnableVerboseLogging) - { - logger.LogInfo(msg); - } - - break; - } - } - } - /// /// This function generates Service Fabric Health Reports that will show up in SFX. /// @@ -99,10 +66,6 @@ public void ReportHealthToServiceFabric(HealthReport healthReport) if (healthReport.State == HealthState.Error || healthReport.State == HealthState.Warning) { - errWarnPreamble = - $"{healthReport.Observer} detected " + - $"{Enum.GetName(typeof(HealthState), healthReport.State)} threshold breach. "; - // OSObserver does not monitor resources and therefore does not support related usage threshold configuration. if (healthReport.Observer == ObserverConstants.OSObserverName && healthReport.Property == "OSConfiguration") { @@ -149,7 +112,7 @@ public void ReportHealthToServiceFabric(HealthReport healthReport) break; default: - healthReport.Property = $"{healthReport.Observer}_{(!string.IsNullOrWhiteSpace(healthReport.ResourceUsageDataProperty) ? healthReport.ResourceUsageDataProperty : "GenericHealthProperty")}"; + healthReport.Property = $"{healthReport.Observer}_{(!string.IsNullOrWhiteSpace(healthReport.ResourceUsageDataProperty) ? 
healthReport.ResourceUsageDataProperty : "GenericHealth")}"; break; } diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcessInfoProvider.cs index d9665905..76ac5c72 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcessInfoProvider.cs @@ -6,11 +6,14 @@ using System.Collections.Generic; using System.Diagnostics; using System.Fabric; +using System.Linq; namespace FabricObserver.Observers.Utilities { public class LinuxProcessInfoProvider : ProcessInfoProvider { + private const int MaxDescendants = 50; + public override float GetProcessPrivateWorkingSetInMB(int processId) { if (LinuxProcFS.TryParseStatusFile(processId, out ParsedStatus status)) @@ -68,22 +71,80 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService public override List<(string ProcName, int Pid)> GetChildProcessInfo(int processId) { - string pidCmdResult = $"ps -o pid= --ppid {processId}".Bash(); - string procNameCmdResult = $"ps -o comm= --ppid {processId}".Bash(); - List<(string procName, int Pid)> childProcesses = new List<(string procName, int Pid)>(); + if (processId < 1) + { + return null; + } - if (!string.IsNullOrWhiteSpace(pidCmdResult) && !string.IsNullOrWhiteSpace(procNameCmdResult)) + // Get child procs. + List<(string ProcName, int Pid)> childProcesses = TupleGetChildProcessInfo(processId); + + if (childProcesses == null || childProcesses.Count == 0) { - var sPids = pidCmdResult.Trim().Split('\n'); - var sProcNames = procNameCmdResult.Trim().Split('\n'); + return null; + } + + if (childProcesses.Count >= MaxDescendants) + { + return childProcesses.Take(MaxDescendants).ToList(); + } - if (sPids?.Length > 0 && sProcNames.Length > 0) + // Get descendant proc at max depth = 5 and max number of descendants = 50. 
+ for (int i = 0; i < childProcesses.Count; ++i) + { + List<(string ProcName, int Pid)> c1 = TupleGetChildProcessInfo(childProcesses[i].Pid); + + if (c1 != null && c1.Count > 0) { - for (int i = 0; i < sPids.Length; ++i) + childProcesses.AddRange(c1); + + if (childProcesses.Count >= MaxDescendants) { - if (int.TryParse(sPids[i], out int childProcId)) + return childProcesses.Take(MaxDescendants).ToList(); + } + + for (int j = 0; j < c1.Count; ++j) + { + List<(string ProcName, int Pid)> c2 = TupleGetChildProcessInfo(c1[j].Pid); + + if (c2 != null && c2.Count > 0) { - childProcesses.Add((sProcNames[i], childProcId)); + childProcesses.AddRange(c2); + + if (childProcesses.Count >= MaxDescendants) + { + return childProcesses.Take(MaxDescendants).ToList(); + } + + for (int k = 0; k < c2.Count; ++k) + { + List<(string ProcName, int Pid)> c3 = TupleGetChildProcessInfo(c2[k].Pid); + + if (c3 != null && c3.Count > 0) + { + childProcesses.AddRange(c3); + + if (childProcesses.Count >= MaxDescendants) + { + return childProcesses.Take(MaxDescendants).ToList(); + } + + for (int l = 0; l < c3.Count; ++l) + { + List<(string ProcName, int Pid)> c4 = TupleGetChildProcessInfo(c3[l].Pid); + + if (c4 != null && c4.Count > 0) + { + childProcesses.AddRange(c4); + + if (childProcesses.Count >= MaxDescendants) + { + return childProcesses.Take(MaxDescendants).ToList(); + } + } + } + } + } } } } @@ -96,5 +157,33 @@ protected override void Dispose(bool disposing) { // nothing to do here. 
} + + private List<(string ProcName, int Pid)> TupleGetChildProcessInfo(int processId) + { + string pidCmdResult = $"ps -o pid= --ppid {processId}".Bash(); + string procNameCmdResult = $"ps -o comm= --ppid {processId}".Bash(); + List<(string ProcName, int Pid)> childProcesses = null; + + if (!string.IsNullOrWhiteSpace(pidCmdResult) && !string.IsNullOrWhiteSpace(procNameCmdResult)) + { + var sPids = pidCmdResult.Trim().Split(new char[] { '\n' }, System.StringSplitOptions.RemoveEmptyEntries); + var sProcNames = procNameCmdResult.Trim().Split(new char[] { '\n' }, System.StringSplitOptions.RemoveEmptyEntries); + + if (sPids?.Length > 0 && sProcNames?.Length > 0) + { + childProcesses = new List<(string ProcName, int Pid)>(); + + for (int i = 0; i < sPids.Length; ++i) + { + if (int.TryParse(sPids[i], out int childProcId)) + { + childProcesses.Add((sProcNames[i], childProcId)); + } + } + } + } + + return childProcesses; + } } } \ No newline at end of file diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs index 203563ff..99a6d521 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs @@ -123,9 +123,9 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService } // Get child procs. - List<(string procName, int pid)> childProcesses = TupleGetChildProcessInfo(processId); + List<(string ProcName, int Pid)> childProcesses = TupleGetChildProcessInfo(processId); - if (childProcesses == null) + if (childProcesses == null || childProcesses.Count == 0) { return null; } @@ -138,9 +138,9 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService // Get descendant proc at max depth = 5 and max number of descendants = 50. 
for (int i = 0; i < childProcesses.Count; ++i) { - List<(string procName, int pid)> c1 = TupleGetChildProcessInfo(childProcesses[i].pid); + List<(string ProcName, int Pid)> c1 = TupleGetChildProcessInfo(childProcesses[i].Pid); - if (c1?.Count > 0) + if (c1 != null && c1.Count > 0) { childProcesses.AddRange(c1); @@ -151,9 +151,9 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService for (int j = 0; j < c1.Count; ++j) { - List<(string procName, int pid)> c2 = TupleGetChildProcessInfo(c1[j].pid); + List<(string ProcName, int Pid)> c2 = TupleGetChildProcessInfo(c1[j].Pid); - if (c2?.Count > 0) + if (c2 != null && c2.Count > 0) { childProcesses.AddRange(c2); @@ -164,9 +164,9 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService for (int k = 0; k < c2.Count; ++k) { - List<(string procName, int pid)> c3 = TupleGetChildProcessInfo(c2[k].pid); + List<(string ProcName, int Pid)> c3 = TupleGetChildProcessInfo(c2[k].Pid); - if (c3?.Count > 0) + if (c3 != null && c3.Count > 0) { childProcesses.AddRange(c3); @@ -177,9 +177,9 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService for (int l = 0; l < c3.Count; ++l) { - List<(string procName, int pid)> c4 = TupleGetChildProcessInfo(c3[l].pid); + List<(string ProcName, int Pid)> c4 = TupleGetChildProcessInfo(c3[l].Pid); - if (c4?.Count > 0) + if (c4 != null && c4.Count > 0) { childProcesses.AddRange(c4); diff --git a/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs b/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs index 550d607c..f875a3c9 100644 --- a/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs +++ b/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs @@ -115,7 +115,6 @@ public Task ReportAvailabilityAsync( /// Optional: TraceTelemetry context cloud instance name. /// A representing the asynchronous operation. 
public Task ReportHealthAsync( - HealthScope scope, string propertyName, HealthState state, string unhealthyEvaluations, @@ -143,7 +142,11 @@ public Task ReportHealthAsync( healthInfo += $"{Environment.NewLine}{unhealthyEvaluations}"; } - var tt = new TraceTelemetry($"Service Fabric Health report - {Enum.GetName(typeof(HealthScope), scope)}: {Enum.GetName(typeof(HealthState), state)} -> {source}:{propertyName}{healthInfo}", sev); + var tt = new TraceTelemetry( + $"{Enum.GetName(typeof(HealthState), state)} from {source}:{Environment.NewLine}" + + $"{propertyName}{Environment.NewLine}" + + $"{healthInfo}", sev); + tt.Context.Cloud.RoleName = serviceName; tt.Context.Cloud.RoleInstance = instanceName; diff --git a/FabricObserver.Extensibility/Utilities/Telemetry/LogAnalyticsTelemetry.cs b/FabricObserver.Extensibility/Utilities/Telemetry/LogAnalyticsTelemetry.cs index 3c1409e7..91d7710f 100644 --- a/FabricObserver.Extensibility/Utilities/Telemetry/LogAnalyticsTelemetry.cs +++ b/FabricObserver.Extensibility/Utilities/Telemetry/LogAnalyticsTelemetry.cs @@ -66,7 +66,6 @@ public LogAnalyticsTelemetry( } public async Task ReportHealthAsync( - HealthScope scope, string propertyName, HealthState state, string unhealthyEvaluations, @@ -75,19 +74,15 @@ public async Task ReportHealthAsync( string serviceName = null, string instanceName = null) { - var (clusterId, _, clusterType) = + var (clusterId, _, _) = await ClusterIdentificationUtility.TupleGetClusterIdAndTypeAsync(fabricClient, token).ConfigureAwait(true); string jsonPayload = JsonConvert.SerializeObject( new { - id = $"FO_{Guid.NewGuid()}", - datetime = DateTime.UtcNow, clusterId = clusterId ?? string.Empty, - clusterType = clusterType ?? string.Empty, source, property = propertyName, - healthScope = scope.ToString(), healthState = state.ToString(), healthEvaluation = unhealthyEvaluations, serviceName = serviceName ?? 
string.Empty, diff --git a/FabricObserver.nuspec.template b/FabricObserver.nuspec.template index 4f0baada..ebb436c3 100644 --- a/FabricObserver.nuspec.template +++ b/FabricObserver.nuspec.template @@ -2,14 +2,8 @@ %PACKAGE_ID% - 3.1.15 - - This release adds support for process tree monitoring by AppObserver: - Any child process (and descendants at max depth = 4) launched by a service process that is being monitored by AppObserver will also be monitored and its resource usage will be added to the parent's for use in threshold violation checks for an observed (configured) metric. - Added support for new child process monitoring data in ETW, AppInsights and LogAnalytics telemetry provider impls. - Minor bug fix in AppObserver monitor duration logic. - Added lifetime management to DumpOnError feature (max 5 dumps per process per metric for 24 hour period. Then, new cycle) - + 3.1.16 + New observer: AzureStorageUploadObserver. This adds Windows-only (for now) support for uploading dmp files to your specified Azure Storage account (blob). This is a Windows-only feature today, so there is no need to enable the new observer for Linux deployments. Design fix: decreased CPU use by AppObserver and NodeObserver. Code enhancements and bug fixes. See release notes on repo for details. 
Microsoft MIT true diff --git a/FabricObserver/FabricObserver.cs b/FabricObserver/FabricObserver.cs index 0c00d336..b713ca81 100644 --- a/FabricObserver/FabricObserver.cs +++ b/FabricObserver/FabricObserver.cs @@ -8,7 +8,6 @@ using System.IO; using System.Linq; using System.Reflection; -using System.Runtime.Loader; using System.Threading; using System.Threading.Tasks; using FabricObserver.Observers; @@ -57,6 +56,7 @@ protected override async Task RunAsync(CancellationToken cancellationToken) private void ConfigureServices(IServiceCollection services) { _ = services.AddScoped(typeof(ObserverBase), s => new AppObserver(fabricClient, Context)); + _ = services.AddScoped(typeof(ObserverBase), s => new AzureStorageUploadObserver(fabricClient, Context)); _ = services.AddScoped(typeof(ObserverBase), s => new CertificateObserver(fabricClient, Context)); _ = services.AddScoped(typeof(ObserverBase), s => new DiskObserver(fabricClient, Context)); _ = services.AddScoped(typeof(ObserverBase), s => new FabricSystemObserver(fabricClient, Context)); @@ -95,7 +95,7 @@ private void LoadObserversFromPlugins(IServiceCollection services) for (int i = 0; i < pluginDlls.Length; ++i) { string dll = pluginDlls[i]; - PluginLoader loader = PluginLoader.CreateFromAssemblyFile(dll, sharedTypes, a => a.IsUnloadable = true); + PluginLoader loader = PluginLoader.CreateFromAssemblyFile(dll, sharedTypes, a => a.IsUnloadable = false); pluginLoaders[i] = loader; } @@ -128,7 +128,6 @@ private void LoadObserversFromPlugins(IServiceCollection services) } catch (Exception e) when (e is ArgumentException || e is BadImageFormatException || e is IOException) { - pluginLoader?.Dispose(); continue; } } diff --git a/FabricObserver/FabricObserver.csproj b/FabricObserver/FabricObserver.csproj index 0166967c..90b73d83 100644 --- a/FabricObserver/FabricObserver.csproj +++ b/FabricObserver/FabricObserver.csproj @@ -12,52 +12,39 @@ linux-x64;win-x64 - 3.1.15.0 + 3.1.16.0 Copyright © 2020 FabricObserver Service Fabric 
Observer - 3.1.15 + 3.1.16 true true FabricObserver.Program - false - false CA1822;$(NoWarn) true AnyCPU;x64 - - - - - - - + + + - - - - - + - - - - + diff --git a/FabricObserver/Observers/AppObserver.cs b/FabricObserver/Observers/AppObserver.cs index 640776c3..3f6d41d1 100644 --- a/FabricObserver/Observers/AppObserver.cs +++ b/FabricObserver/Observers/AppObserver.cs @@ -74,7 +74,6 @@ public AppObserver(FabricClient fabricClient, StatelessServiceContext context) { configSettings = new ConfigSettings(FabricServiceContext); ConfigPackagePath = configSettings.ConfigPackagePath; - stopwatch = new Stopwatch(); } @@ -92,16 +91,10 @@ public override async Task ObserveAsync(CancellationToken token) if (!initialized) { - HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.OriginalString, - ObserverName, - HealthState.Warning, - "AppObserver was unable to initialize correctly due to misconfiguration. " + - "Please check your AppObserver configuration settings."); - + ObserverLogger.LogWarning("AppObserver was unable to initialize correctly due to misconfiguration. " + + "Please check your AppObserver configuration settings."); stopwatch.Stop(); stopwatch.Reset(); - return; } @@ -148,25 +141,17 @@ public override Task ReportAsync(CancellationToken token) childProcessTelemetryDataList = new List(); } + app = deployedTargetList.Find( + a => (a.TargetApp != null && a.TargetApp == repOrInst.ApplicationName.OriginalString) || + (a.TargetAppType != null && a.TargetAppType == repOrInst.ApplicationTypeName)); try { - app = deployedTargetList.Find( - a => a.TargetApp == repOrInst.ApplicationName.OriginalString || a.TargetAppType == repOrInst.ApplicationTypeName); - using Process p = Process.GetProcessById((int)repOrInst.HostProcessId); - - // If the process is no longer running, then don't report on it. 
- if (p.HasExited) - { - continue; - } - processName = p.ProcessName; processId = p.Id; } catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is Win32Exception) { - ObserverLogger.LogWarning($"Handled Exception in ReportAsync:{Environment.NewLine}{e}"); continue; } @@ -206,7 +191,7 @@ public override Task ReportAsync(CancellationToken token) if (hasChildProcs) { - ProcessChildProcs(ref AllAppCpuData, ref childProcessTelemetryDataList, repOrInst, app, ref parentFrud, token); + ProcessChildProcs(ref AllAppCpuData, ref childProcessTelemetryDataList, repOrInst, ref app, ref parentFrud, token); } // Parent's and aggregated (summed) spawned process data (if any). @@ -217,7 +202,7 @@ public override Task ReportAsync(CancellationToken token) healthReportTimeToLive, HealthReportType.Application, repOrInst, - app.DumpProcessOnError); + app.DumpProcessOnError && EnableProcessDumps); } // Memory MB - Parent process @@ -227,7 +212,7 @@ public override Task ReportAsync(CancellationToken token) if (hasChildProcs) { - ProcessChildProcs(ref AllAppMemDataMb, ref childProcessTelemetryDataList, repOrInst, app, ref parentFrud, token); + ProcessChildProcs(ref AllAppMemDataMb, ref childProcessTelemetryDataList, repOrInst, ref app, ref parentFrud, token); } // Parent's and aggregated (summed) spawned process data (if any). 
@@ -238,7 +223,7 @@ public override Task ReportAsync(CancellationToken token) healthReportTimeToLive, HealthReportType.Application, repOrInst, - app.DumpProcessOnError); + app.DumpProcessOnError && EnableProcessDumps); } // Memory Percent - Parent process @@ -248,7 +233,7 @@ public override Task ReportAsync(CancellationToken token) if (hasChildProcs) { - ProcessChildProcs(ref AllAppMemDataPercent, ref childProcessTelemetryDataList, repOrInst, app, ref parentFrud, token); + ProcessChildProcs(ref AllAppMemDataPercent, ref childProcessTelemetryDataList, repOrInst, ref app, ref parentFrud, token); } // Parent's and aggregated (summed) spawned process data (if any). @@ -259,7 +244,7 @@ public override Task ReportAsync(CancellationToken token) healthReportTimeToLive, HealthReportType.Application, repOrInst, - app.DumpProcessOnError); + app.DumpProcessOnError && EnableProcessDumps); } // TCP Ports - Active - Parent process @@ -269,7 +254,7 @@ public override Task ReportAsync(CancellationToken token) if (hasChildProcs) { - ProcessChildProcs(ref AllAppTotalActivePortsData, ref childProcessTelemetryDataList, repOrInst, app, ref parentFrud, token); + ProcessChildProcs(ref AllAppTotalActivePortsData, ref childProcessTelemetryDataList, repOrInst, ref app, ref parentFrud, token); } // Parent's and aggregated (summed) spawned process data (if any). 
@@ -280,7 +265,7 @@ public override Task ReportAsync(CancellationToken token) healthReportTimeToLive, HealthReportType.Application, repOrInst, - app.DumpProcessOnError); + app.DumpProcessOnError && EnableProcessDumps); } // TCP Ports - Ephemeral (port numbers fall in the dynamic range) - Parent process @@ -290,7 +275,7 @@ public override Task ReportAsync(CancellationToken token) if (hasChildProcs) { - ProcessChildProcs(ref AllAppEphemeralPortsData, ref childProcessTelemetryDataList, repOrInst, app, ref parentFrud, token); + ProcessChildProcs(ref AllAppEphemeralPortsData, ref childProcessTelemetryDataList, repOrInst, ref app, ref parentFrud, token); } // Parent's and aggregated (summed) spawned process data (if any). @@ -301,7 +286,7 @@ public override Task ReportAsync(CancellationToken token) healthReportTimeToLive, HealthReportType.Application, repOrInst, - app.DumpProcessOnError); + app.DumpProcessOnError && EnableProcessDumps); } // Allocated (in use) Handles - Parent process @@ -311,7 +296,7 @@ public override Task ReportAsync(CancellationToken token) if (hasChildProcs) { - ProcessChildProcs(ref AllAppHandlesData, ref childProcessTelemetryDataList, repOrInst, app, ref parentFrud, token); + ProcessChildProcs(ref AllAppHandlesData, ref childProcessTelemetryDataList, repOrInst, ref app, ref parentFrud, token); } // Parent's and aggregated (summed) spawned process data (if any). @@ -322,7 +307,7 @@ public override Task ReportAsync(CancellationToken token) healthReportTimeToLive, HealthReportType.Application, repOrInst, - app.DumpProcessOnError); + app.DumpProcessOnError && EnableProcessDumps); } // Child proc info telemetry. 
@@ -348,12 +333,12 @@ public override Task ReportAsync(CancellationToken token) } private void ProcessChildProcs( - ref List> fruds, - ref List childProcessTelemetryDataList, - ReplicaOrInstanceMonitoringInfo repOrInst, - ApplicationInfo app, - ref FabricResourceUsageData parentFrud, - CancellationToken token) where T : struct + ref List> fruds, + ref List childProcessTelemetryDataList, + ReplicaOrInstanceMonitoringInfo repOrInst, + ref ApplicationInfo app, + ref FabricResourceUsageData parentFrud, + CancellationToken token) where T : struct { token.ThrowIfCancellationRequested(); @@ -361,7 +346,7 @@ private void ProcessChildProcs( { string metric = parentFrud.Property; var parentDataAvg = Math.Round(parentFrud.AverageDataValue, 0); - var (childProcInfo, Sum) = ProcessChildFrudsGetDataSum(ref fruds, repOrInst, app, token); + var (childProcInfo, Sum) = ProcessChildFrudsGetDataSum(ref fruds, repOrInst, ref app, token); double sumAllValues = Sum + parentDataAvg; childProcInfo.Metric = metric; childProcInfo.Value = sumAllValues; @@ -378,7 +363,7 @@ private void ProcessChildProcs( private (ChildProcessTelemetryData childProcInfo, double Sum) ProcessChildFrudsGetDataSum( ref List> fruds, ReplicaOrInstanceMonitoringInfo repOrInst, - ApplicationInfo app, + ref ApplicationInfo app, CancellationToken token) where T : struct { var childProcs = repOrInst.ChildProcesses; @@ -430,6 +415,64 @@ private void ProcessChildProcs( childProcessInfoData.ChildProcessInfo.Add(childProcInfo); } + // Windows process dump support for descendant/child processes \\ + + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && app.DumpProcessOnError && EnableProcessDumps) + { + string prop = frud.Property; + + switch (prop) + { + case ErrorWarningProperty.TotalCpuTime: + if (frud.IsUnhealthy(app.CpuErrorLimitPercent)) + { + DumpWindowsServiceProcess(childPid, childProcName, ErrorWarningProperty.TotalCpuTime); + app.DumpProcessOnError = false; + } + break; + + case 
ErrorWarningProperty.TotalMemoryConsumptionMb: + if (frud.IsUnhealthy(app.MemoryErrorLimitMb)) + { + DumpWindowsServiceProcess(childPid, childProcName, ErrorWarningProperty.TotalMemoryConsumptionMb); + app.DumpProcessOnError = false; + } + break; + + case ErrorWarningProperty.TotalMemoryConsumptionPct: + if (frud.IsUnhealthy(app.MemoryErrorLimitPercent)) + { + DumpWindowsServiceProcess(childPid, childProcName, ErrorWarningProperty.TotalMemoryConsumptionPct); + app.DumpProcessOnError = false; + } + break; + + case ErrorWarningProperty.TotalActivePorts: + if (frud.IsUnhealthy(app.NetworkErrorActivePorts)) + { + DumpWindowsServiceProcess(childPid, childProcName, ErrorWarningProperty.TotalActivePorts); + app.DumpProcessOnError = false; + } + break; + + case ErrorWarningProperty.TotalEphemeralPorts: + if (frud.IsUnhealthy(app.NetworkErrorEphemeralPorts)) + { + DumpWindowsServiceProcess(childPid, childProcName, ErrorWarningProperty.TotalEphemeralPorts); + app.DumpProcessOnError = false; + } + break; + + case ErrorWarningProperty.TotalFileHandles: + if (frud.IsUnhealthy(app.ErrorOpenFileHandles)) + { + DumpWindowsServiceProcess(childPid, childProcName, ErrorWarningProperty.TotalFileHandles); + app.DumpProcessOnError = false; + } + break; + } + } + // Remove child FRUD from ref FRUD. 
fruds.Remove(frud); } @@ -476,11 +519,11 @@ private async Task InitializeAsync() userTargetList = new List(); deployedTargetList = new List(); - /* For descendant proc monitoring */ + /* Child/Descendant proc monitoring config */ if (bool.TryParse( GetSettingParameterValue( ConfigurationSectionName, - ObserverConstants.EnableChildProcessMonitoring), out bool enableDescendantMonitoring)) + ObserverConstants.EnableChildProcessMonitoringParameter), out bool enableDescendantMonitoring)) { EnableChildProcessMonitoring = enableDescendantMonitoring; } @@ -492,7 +535,27 @@ private async Task InitializeAsync() { MaxChildProcTelemetryDataCount = maxChildProcs; } - /* End descendant proc monitoring */ + + /* dumpProcessOnError config */ + if (bool.TryParse(GetSettingParameterValue(ConfigurationSectionName, ObserverConstants.EnableProcessDumpsParameter), out bool enableDumps)) + { + EnableProcessDumps = enableDumps; + } + + if (Enum.TryParse(GetSettingParameterValue(ConfigurationSectionName, ObserverConstants.DumpTypeParameter), out DumpType dumpType)) + { + DumpType = dumpType; + } + + if (int.TryParse(GetSettingParameterValue(ConfigurationSectionName, ObserverConstants.MaxDumpsParameter), out int maxDumps)) + { + MaxDumps = maxDumps; + } + + if (TimeSpan.TryParse(GetSettingParameterValue(ConfigurationSectionName, ObserverConstants.MaxDumpsTimeWindowParameter), out TimeSpan dumpTimeWindow)) + { + MaxDumpsTimeWindow = dumpTimeWindow; + } configSettings.Initialize( FabricServiceContext.CodePackageActivationContext.GetConfigurationPackageObject( @@ -509,17 +572,104 @@ private async Task InitializeAsync() return false; } - await using Stream stream = new FileStream(appObserverConfigFileName, FileMode.Open, FileAccess.Read, FileShare.Read); + bool isJson = JsonHelper.IsJson>(await File.ReadAllTextAsync(appObserverConfigFileName)); - if (stream.Length > 0 && JsonHelper.IsJson>(await File.ReadAllTextAsync(appObserverConfigFileName))) + if (!isJson) { - 
userTargetList.AddRange(JsonHelper.ReadFromJsonStream(stream)); + string message = "AppObserver's JSON configuration file is malformed. Please fix the JSON and redeploy FabricObserver if you want AppObserver to monitor service processes."; + var healthReport = new Utilities.HealthReport + { + AppName = new Uri($"fabric:/{ObserverConstants.FabricObserverName}"), + EmitLogEvent = EnableVerboseLogging, + HealthMessage = message, + HealthReportTimeToLive = GetHealthReportTimeToLive(), + Property = "JsonValidation", + ReportType = HealthReportType.Application, + State = HealthState.Warning, + NodeName = NodeName, + Observer = ObserverConstants.AppObserverName, + }; + + // Generate a Service Fabric Health Report. + HealthReporter.ReportHealthToServiceFabric(healthReport); + + // Send Health Report as Telemetry event (perhaps it signals an Alert from App Insights, for example.). + if (IsTelemetryEnabled) + { + _ = TelemetryClient?.ReportHealthAsync( + "JsonValidation", + HealthState.Warning, + message, + ObserverName, + Token); + } + + // ETW. + if (IsEtwEnabled) + { + ObserverLogger.LogEtw( + ObserverConstants.FabricObserverETWEventName, + new + { + Property = "JsonValidation", + Level = "Warning", + Message = message, + ObserverName + }); + } + + return false; } - // Are any of the config-supplied apps deployed?. + await using Stream stream = new FileStream(appObserverConfigFileName, FileMode.Open, FileAccess.Read, FileShare.Read); + userTargetList.AddRange(JsonHelper.ReadFromJsonStream(stream)); + + // Does the configuration have any objects (targets) defined? 
if (userTargetList.Count == 0) { - ObserverLogger.LogWarning($"Will not observe service resource consumption on node {NodeName} as no configuration parameters have been supplied."); + string message = $"Please add targets to AppObserver's JSON configuration file and redeploy FabricObserver if you want AppObserver to monitor service processes."; + + var healthReport = new Utilities.HealthReport + { + AppName = new Uri($"fabric:/{ObserverConstants.FabricObserverName}"), + EmitLogEvent = EnableVerboseLogging, + HealthMessage = message, + HealthReportTimeToLive = GetHealthReportTimeToLive(), + Property = "Misconfiguration", + ReportType = HealthReportType.Application, + State = HealthState.Warning, + NodeName = NodeName, + Observer = ObserverConstants.AppObserverName, + }; + + // Generate a Service Fabric Health Report. + HealthReporter.ReportHealthToServiceFabric(healthReport); + + // Send Health Report as Telemetry event (perhaps it signals an Alert from App Insights, for example.). + if (IsTelemetryEnabled) + { + _ = TelemetryClient?.ReportHealthAsync( + "Misconfiguration", + HealthState.Warning, + message, + ObserverName, + Token); + } + + // ETW. + if (IsEtwEnabled) + { + ObserverLogger.LogEtw( + ObserverConstants.FabricObserverETWEventName, + new + { + Property = "Misconfiguration", + Level = "Warning", + Message = message, + ObserverName + }); + } + return false; } @@ -580,12 +730,12 @@ private async Task InitializeAsync() } // App filtering: AppExcludeList, AppIncludeList. This is only useful when you are observing All/* applications for a range of thresholds. 
- if (!string.IsNullOrWhiteSpace(application.AppExcludeList) && application.AppExcludeList.Contains(app.ApplicationName.OriginalString)) + if (!string.IsNullOrWhiteSpace(application.AppExcludeList) && application.AppExcludeList.Contains(app.ApplicationName.OriginalString.Replace("fabric:/", string.Empty))) { continue; } - if (!string.IsNullOrWhiteSpace(application.AppIncludeList) && !application.AppIncludeList.Contains(app.ApplicationName.OriginalString)) + if (!string.IsNullOrWhiteSpace(application.AppIncludeList) && !application.AppIncludeList.Contains(app.ApplicationName.OriginalString.Replace("fabric:/", string.Empty))) { continue; } @@ -659,22 +809,23 @@ private async Task InitializeAsync() for (int i = 0; i < userTargetList.Count; ++i) { Token.ThrowIfCancellationRequested(); - - var application = userTargetList[i]; Uri appUri = null; + ApplicationInfo application = userTargetList[i]; if (string.IsNullOrWhiteSpace(application.TargetApp) && string.IsNullOrWhiteSpace(application.TargetAppType)) { - HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.ToString(), - ObserverName, - HealthState.Warning, - $"InitializeAsync() | {application.TargetApp}: Required setting, target, is not set."); + ObserverLogger.LogWarning($"InitializeAsync: Required setting, targetApp or targetAppType, is not set in AppObserver.config.json."); settingsFail++; continue; } - + + // No required settings supplied for deployed application(s). + if (settingsFail == userTargetList.Count) + { + return false; + } + if (!string.IsNullOrWhiteSpace(application.TargetApp)) { try @@ -693,24 +844,13 @@ private async Task InitializeAsync() } catch (Exception e) when (e is ArgumentException || e is UriFormatException) { - HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.ToString(), - ObserverName, - HealthState.Warning, - $"InitializeAsync() | {application.TargetApp}: Invalid TargetApp value. 
" + - $"Value must be a valid Uri string of format \"fabric:/MyApp\" OR just \"MyApp\""); - - settingsFail++; + ObserverLogger.LogWarning($"InitializeAsync: Unexpected TargetApp value {application.TargetApp}. " + + $"Value must be a valid Uri string of format \"fabric:/MyApp\" OR just \"MyApp\""); + continue; } } - // No required settings supplied for deployed application(s). - if (settingsFail == userTargetList.Count) - { - return false; - } - if (!string.IsNullOrWhiteSpace(application.TargetAppType)) { await SetDeployedApplicationReplicaOrInstanceListAsync(null, application.TargetAppType); @@ -721,7 +861,9 @@ private async Task InitializeAsync() } } - for (int i = 0; i < ReplicaOrInstanceList.Count; ++i) + int repCount = ReplicaOrInstanceList.Count; + + for (int i = 0; i < repCount; ++i) { Token.ThrowIfCancellationRequested(); @@ -782,12 +924,84 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) try { - using Process parentProc = Process.GetProcessById(parentPid); - string parentProcName = parentProc.ProcessName; + Process parentProc = null; + + try + { + parentProc = Process.GetProcessById(parentPid); + + // This is strange and can happen during a redeployment. + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && parentProc?.ProcessName == "Idle") + { + continue; + } + + // This will throw Win32Exception if process is running at higher elevation than FO. + // If it is not, then this would mean the process has exited so move on to next process. 
+ if (parentProc.HasExited) + { + continue; + } + } + catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is Win32Exception) + { + if (e is Win32Exception exception && exception.NativeErrorCode == 5 || e.Message.ToLower().Contains("access is denied")) + { + string message = $"Host process {parentProc?.ProcessName} for service {repOrInst?.ServiceName?.OriginalString} is running at a higher user privilege than FabricObserver.{Environment.NewLine}" + + $"Note: you must run FabricObserver as System user if you want to monitor services that run as System or Admin user."; + + var healthReport = new Utilities.HealthReport + { + AppName = new Uri($"fabric:/{ObserverConstants.FabricObserverName}"), + EmitLogEvent = EnableVerboseLogging, + HealthMessage = message, + HealthReportTimeToLive = GetHealthReportTimeToLive(), + Property = $"PermissionViolation({parentProc?.ProcessName})", + ReportType = HealthReportType.Application, + State = HealthState.Warning, + NodeName = NodeName, + Observer = ObserverConstants.AppObserverName, + }; + + // Generate a Service Fabric Health Report. + HealthReporter.ReportHealthToServiceFabric(healthReport); + + // Send Health Report as Telemetry event (perhaps it signals an Alert from App Insights, for example.). + if (IsTelemetryEnabled) + { + _ = TelemetryClient?.ReportHealthAsync( + $"PermissionViolation({parentProc?.ProcessName})", + HealthState.Warning, + message, + ObserverName, + token, + repOrInst?.ServiceName?.OriginalString); + } + + // ETW. + if (IsEtwEnabled) + { + ObserverLogger.LogEtw( + ObserverConstants.FabricObserverETWEventName, + new + { + Property = $"PermissionViolation({parentProc?.ProcessName})", + Level = "Warning", + Message = message, + ObserverName, + ServiceName = repOrInst?.ServiceName?.OriginalString + }); + } + } + + continue; + } + + string parentProcName = parentProc?.ProcessName; // For hosted container apps, the host service is Fabric. 
AppObserver can't monitor these types of services. // Please use ContainerObserver for SF container app service monitoring. - if (parentProcName == "Fabric") + if (parentProcName == null || parentProcName == "Fabric") { continue; } @@ -923,6 +1137,64 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) _ = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(procId); } } + + // Handles/FDs + if (checkHandles) + { + float handles = ProcessInfoProvider.Instance.GetProcessAllocatedHandles(procId, FabricServiceContext); + + if (handles > -1) + { + if (procId == parentPid) + { + AllAppHandlesData.FirstOrDefault(x => x.Id == id).Data.Add(handles); + } + else + { + if (!AllAppHandlesData.Any(x => x.Id == $"{id}:{procName}")) + { + AllAppHandlesData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalFileHandles, $"{id}:{procName}", capacity, UseCircularBuffer)); + } + AllAppHandlesData.FirstOrDefault(x => x.Id == $"{id}:{procName}").Data.Add(handles); + } + } + } + + // Total TCP ports usage + if (checkAllPorts) + { + // Parent process (the service process). + if (procId == parentPid) + { + AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == id).Data.Add(OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(procId, FabricServiceContext)); + } + else + { + // Child procs spawned by the parent service process. 
+ if (!AllAppTotalActivePortsData.Any(x => x.Id == $"{id}:{procName}")) + { + AllAppTotalActivePortsData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalActivePorts, $"{id}:{procName}", capacity, UseCircularBuffer)); + } + AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == $"{id}:{procName}").Data.Add(OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(procId, FabricServiceContext)); + } + } + + // Ephemeral TCP ports usage + if (checkEphemeralPorts) + { + if (procId == parentPid) + { + AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == id).Data.Add(OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(procId, FabricServiceContext)); + } + else + { + if (!AllAppEphemeralPortsData.Any(x => x.Id == $"{id}:{procName}")) + { + AllAppEphemeralPortsData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalEphemeralPorts, $"{id}:{procName}", capacity, UseCircularBuffer)); + } + AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == $"{id}:{procName}").Data.Add(OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(procId, FabricServiceContext)); + } + } // Monitor Duration applies to the code below. timer.Start(); @@ -1008,79 +1280,18 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) } } - if (checkHandles) - { - float handles = ProcessInfoProvider.Instance.GetProcessAllocatedHandles(procId, FabricServiceContext); - - if (handles > -1) - { - if (procId == parentPid) - { - AllAppHandlesData.FirstOrDefault(x => x.Id == id).Data.Add(handles); - } - else - { - if (!AllAppHandlesData.Any(x => x.Id == $"{id}:{procName}")) - { - AllAppHandlesData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalFileHandles, $"{id}:{procName}", capacity, UseCircularBuffer)); - } - AllAppHandlesData.FirstOrDefault(x => x.Id == $"{id}:{procName}").Data.Add(handles); - } - } - } - - // Total TCP ports usage - if (checkAllPorts) - { - // Parent process (the service process). 
- if (procId == parentPid) - { - AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == id).Data.Add(OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(procId, FabricServiceContext)); - } - else - { - // Child procs spawned by the parent service process. - if (!AllAppTotalActivePortsData.Any(x => x.Id == $"{id}:{procName}")) - { - AllAppTotalActivePortsData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalActivePorts, $"{id}:{procName}", capacity, UseCircularBuffer)); - } - AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == $"{id}:{procName}").Data.Add(OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(procId, FabricServiceContext)); - } - } - - // Ephemeral TCP ports usage - if (checkEphemeralPorts) - { - if (procId == parentPid) - { - AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == id).Data.Add(OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(procId, FabricServiceContext)); - } - else - { - if (!AllAppEphemeralPortsData.Any(x => x.Id == $"{id}:{procName}")) - { - AllAppEphemeralPortsData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalEphemeralPorts, $"{id}:{procName}", capacity, UseCircularBuffer)); - } - AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == $"{id}:{procName}").Data.Add(OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(procId, FabricServiceContext)); - } - } - - await Task.Delay(250, Token); + await Task.Delay(250, Token).ConfigureAwait(false); } timer.Stop(); timer.Reset(); + + await Task.Delay(250, Token).ConfigureAwait(false); } } - catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is Win32Exception) - { - ObserverLogger.LogWarning( - $"Handled exception in MonitorDeployedAppsAsync: Process {parentPid} is not running or it's running at a higher privilege than FabricObserver.{Environment.NewLine}" + - $"ServiceName: {repOrInst.ServiceName?.OriginalString ?? 
"unknown"}{Environment.NewLine}Error message: {e.Message}"); - } catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - ObserverLogger.LogWarning($"Unhandled exception in MonitorDeployedAppsAsync:{Environment.NewLine}{e}"); + ObserverLogger.LogError($"Unhandled exception in MonitorDeployedAppsAsync:{Environment.NewLine}{e}"); // Fix the bug.. throw; @@ -1348,6 +1559,8 @@ private void CleanUp() AllAppTotalActivePortsData?.Clear(); AllAppTotalActivePortsData = null; } + + ProcessInfoProvider.Instance?.Dispose(); } private void LogAllAppResourceDataToCsv(string appName) diff --git a/FabricObserver/Observers/AzureStorageUploadObserver.cs b/FabricObserver/Observers/AzureStorageUploadObserver.cs new file mode 100644 index 00000000..1ee52950 --- /dev/null +++ b/FabricObserver/Observers/AzureStorageUploadObserver.cs @@ -0,0 +1,520 @@ +// ------------------------------------------------------------ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License (MIT). See License.txt in the repo root for license information. +// ------------------------------------------------------------ + +using Azure; +using Azure.Storage; +using Azure.Storage.Blobs; +using FabricObserver.Observers.Utilities; +using System; +using System.Diagnostics; +using System.Fabric; +using System.IO; +using System.IO.Compression; +using System.Runtime.InteropServices; +using System.Security; +using System.Threading; +using System.Threading.Tasks; + +namespace FabricObserver.Observers +{ + // AzureStorageObserver is an observer that periodically checks for the existence of dmp files in observer_logs folder. It then uploads them if it has been + // configured to do so - assuming correctly encrypted and specified ConnectionString for an Azure Storage Account, a container name, and other basic settings. 
+ // Since only Windows is supported for dumping service processes today by FO, this observer is not useful for Liunx in this version. + // So, if you are deploying FO to Linux servers, then don't enable this observer (it won't do anything if it is enabled, so no need have it resident in memory). + public class AzureStorageUploadObserver : ObserverBase + { + private readonly Stopwatch stopwatch; + + // Only AppObserver is supported today. No other observers generate dmp files. + private const string AppObserverDumpFolder = "MemoryDumps"; + private string appObsDumpFolderPath; + + private SecureString StorageConnectionString + { + get; set; + } + + private string BlobContainerName + { + get; set; + } + + private AuthenticationType AuthenticationType + { + get; set; + } + + private string StorageAccountName + { + get; set; + } + + private SecureString StorageAccountKey + { + get; set; + } + + private CompressionLevel ZipCompressionLevel + { + get; set; + } = CompressionLevel.Optimal; + + public AzureStorageUploadObserver(FabricClient fabricClient, StatelessServiceContext context) + : base(fabricClient, context) + { + stopwatch = new Stopwatch(); + } + + public override async Task ObserveAsync(CancellationToken token) + { + // Since there is currently only support for Windows process dumps (by AppObserver only), there is no need to run this Observer on Linux (today..). + // The dumps created are *not* crash dumps, they are live dumps of a process's memory, handles, threads, stack.. So, the target process will not be killed. + // By default, the dmp files are MiniPlus, so they will roughly be as large as the process's private working set. You can set to Mini (similar size) or + // Full, much larger. You probably do not need to create Full dumps in most cases. + if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + { + return; + } + + // If set, this observer will only run during the supplied interval. 
+ if (RunInterval > TimeSpan.MinValue && DateTime.Now.Subtract(LastRunDateTime) < RunInterval) + { + return; + } + + if (token.IsCancellationRequested) + { + return; + } + + Token = token; + stopwatch.Start(); + + if (!Initialize()) + { + stopwatch.Stop(); + stopwatch.Reset(); + LastRunDateTime = DateTime.Now; + return; + } + + // In case upload failed, also try and upload any zip files that remained in target local directory. + await ProcessFilesAsync(appObsDumpFolderPath, new[] { "*.zip", "*.dmp" }, token); + await ReportAsync(token); + + CleanUp(); + + // The time it took to run this observer. + stopwatch.Stop(); + RunDuration = stopwatch.Elapsed; + + if (EnableVerboseLogging) + { + ObserverLogger.LogInfo($"Run Duration: {RunDuration}"); + } + + stopwatch.Reset(); + LastRunDateTime = DateTime.Now; + } + + private void CleanUp() + { + if (StorageConnectionString != null) + { + StorageConnectionString.Dispose(); + StorageConnectionString = null; + } + + if (StorageAccountKey != null) + { + StorageAccountKey?.Dispose(); + StorageAccountKey = null; + } + } + + private async Task ProcessFilesAsync(string folderPath, string[] searchPatterns, CancellationToken token) + { + for (int i = 0; i < searchPatterns.Length; ++i) + { + token.ThrowIfCancellationRequested(); + + string searchPattern = searchPatterns[i]; + string[] files = Directory.GetFiles(folderPath, searchPattern, SearchOption.AllDirectories); + + for (int j = 0; j < files.Length; ++j) + { + token.ThrowIfCancellationRequested(); + + string file = files[j]; + + if (searchPattern == "*.dmp") + { + if (!CompressFileForUpload(file)) + { + continue; + } + } + + string zipfile = file.Replace(".dmp", ".zip"); + bool success = await UploadBlobAsync(zipfile, token); + + await Task.Delay(1000, token); + + if (!success) + { + continue; + } + + try + { + Retry.Do(() => File.Delete(zipfile), TimeSpan.FromSeconds(1), token, 3); + } + catch (AggregateException ae) + { + ObserverLogger.LogWarning($"Unable to delete file 
{Path.GetFileName(zipfile)} after successful upload " + + $"to blob container {BlobContainerName}:{Environment.NewLine}{ae}"); + } + } + } + } + + private bool CompressFileForUpload(string file, bool deleteOriginal = true) + { + if (!File.Exists(file)) + { + return false; + } + + if (file.EndsWith(".zip")) + { + return true; + } + + string zipPath = null; + + try + { + zipPath = file.Replace(".dmp", ".zip"); + using FileStream fs = new FileStream(zipPath, FileMode.Create); + using ZipArchive zip = new ZipArchive(fs, ZipArchiveMode.Create); + zip.CreateEntryFromFile(file, Path.GetFileName(file), ZipCompressionLevel); + } + catch (Exception e) when (e is ArgumentException || e is IOException || e is NotSupportedException || e is UnauthorizedAccessException) + { + ObserverLogger.LogWarning($"Unable to compress file for uploading:{Environment.NewLine}{e}"); + return false; + } + + // Delete the original file if compression succeeds. + if (deleteOriginal && File.Exists(zipPath)) + { + try + { + Retry.Do(() => File.Delete(file), TimeSpan.FromSeconds(1), Token, 3); + } + catch (AggregateException ae) + { + ObserverLogger.LogWarning($"Unable to delete original file after successful compression to zip file:{Environment.NewLine}{ae}"); + } + } + + return true; + } + + public override Task ReportAsync(CancellationToken token) + { + // This observer does not report. + return Task.FromResult(0); + } + + private bool Initialize() + { + appObsDumpFolderPath = Path.Combine(ObserverLogger.LogFolderBasePath, ObserverConstants.AppObserverName, AppObserverDumpFolder); + + // Nothing to do here. 
+ if (!Directory.Exists(appObsDumpFolderPath)) + { + return false; + } + + try + { + var files = Directory.GetFiles(appObsDumpFolderPath, "*.dmp", SearchOption.AllDirectories); + + if (files.Length == 0) + { + files = Directory.GetFiles(appObsDumpFolderPath, "*.zip", SearchOption.AllDirectories); + + if (files.Length == 0) + { + return false; + } + } + } + catch (Exception e) when (e is ArgumentException || e is IOException || e is UnauthorizedAccessException) + { + ObserverLogger.LogWarning("Initialize(): Unable to determine existence of dmp files in observer log directories. Aborting.."); + return false; + } + + string connString = GetSettingParameterValue(ConfigurationSectionName, ObserverConstants.AzureStorageConnectionStringParameter); + string accountName = GetSettingParameterValue(ConfigurationSectionName, ObserverConstants.AzureStorageAccountNameParameter); + string accountKey = GetSettingParameterValue(ConfigurationSectionName, ObserverConstants.AzureStorageAccountKeyParameter); + + if (string.IsNullOrWhiteSpace(connString) && string.IsNullOrWhiteSpace(accountName) && string.IsNullOrWhiteSpace(accountKey)) + { + ObserverLogger.LogWarning("Initialize: No authentication information provided. Aborting.."); + return false; + } + + BlobContainerName = GetSettingParameterValue(ConfigurationSectionName, ObserverConstants.AzureBlobContainerNameParameter); + + if (string.IsNullOrWhiteSpace(BlobContainerName)) + { + ObserverLogger.LogWarning("Initialize: No container name provided. Aborting.."); + return false; + } + + // Clean out old dmp files, if any. Generally, there will only be some dmp files remaining on disk if customer has not configured + // uploads correctly or some error during some stage of the upload process. Under normal circumstances, there will be no dmp (or zip) files remaining on + // disk after successful uploads to configured blob storage container. 
+ ObserverLogger.TryCleanFolder(appObsDumpFolderPath, "*.dmp", TimeSpan.FromDays(3)); + + // Compression setting. + string compressionLevel = GetSettingParameterValue(ConfigurationSectionName, ObserverConstants.ZipFileCompressionLevelParameter); + + if (Enum.TryParse(compressionLevel, true, out CompressionLevel compressLevel)) + { + ZipCompressionLevel = compressLevel; + } + + // Decrypt connection string.\\ + + ConfigurationPackage configPackage = FabricServiceContext.CodePackageActivationContext.GetConfigurationPackageObject("Config"); + + if (!string.IsNullOrWhiteSpace(connString)) + { + AuthenticationType = AuthenticationType.ConnectionString; + bool isEncrypted = configPackage.Settings.Sections[ConfigurationSectionName].Parameters[ObserverConstants.AzureStorageConnectionStringParameter].IsEncrypted; + + if (isEncrypted) + { + try + { + StorageConnectionString = configPackage.Settings.Sections[ConfigurationSectionName].Parameters[ObserverConstants.AzureStorageConnectionStringParameter].DecryptValue(); + } + catch (Exception e) + { + ObserverLogger.LogWarning($"Unable to decrypt Azure Storage Connection String:{Environment.NewLine}{e}"); + return false; + } + } + else + { + ObserverLogger.LogWarning("You have not encrypted your Azure Storage ConnectionString. This is not safe. " + + "Please encrypt it using the Invoke-ServiceFabricEncryptText PowerShell cmdlet."); + + // TOTHINK: Don't enable non-encrypted connection string support. Just return false here? + char[] cArr = connString.ToCharArray(); + StorageConnectionString = SecureStringFromCharArray(cArr, 0, cArr.Length); + + // SecureStringFromCharArray returns null if it fails to convert. It logs failure reason to the local log directory of this observer. 
+ if (StorageConnectionString == null) + { + return false; + } + } + } + else + { + if (string.IsNullOrWhiteSpace(accountName) || string.IsNullOrWhiteSpace(accountKey)) + { + ObserverLogger.LogWarning("You have not provided required Azure Storage account information. Aborting..."); + return false; + } + + AuthenticationType = AuthenticationType.SharedKey; + StorageAccountName = accountName; + bool isEncrypted = configPackage.Settings.Sections[ConfigurationSectionName].Parameters[ObserverConstants.AzureStorageAccountKeyParameter].IsEncrypted; + + if (isEncrypted) + { + try + { + StorageAccountKey = configPackage.Settings.Sections[ConfigurationSectionName].Parameters[ObserverConstants.AzureStorageAccountKeyParameter].DecryptValue(); + } + catch (Exception e) + { + ObserverLogger.LogWarning($"Unable to decrypt Azure Storage Account Key:{Environment.NewLine}{e}"); + return false; + } + } + else + { + ObserverLogger.LogWarning("You have not encrypted your Azure Storage Account Key. This is not safe. " + + "Please encrypt it using the Invoke-ServiceFabricEncryptText PowerShell cmdlet."); + + // TOTHINK: Don't enable non-encrypted connection string support. Just return false here? + char[] cArr = accountKey.ToCharArray(); + StorageAccountKey = SecureStringFromCharArray(cArr, 0, cArr.Length); + + // SecureStringFromCharArray returns null if it fails to convert. It logs failure reason to the local log directory of this observer. 
+ if (StorageAccountKey == null) + { + return false; + } + } + } + + return true; + } + + private async Task UploadBlobAsync(string filePath, CancellationToken token) + { + if (string.IsNullOrWhiteSpace(filePath) || !File.Exists(filePath) || token.IsCancellationRequested) + { + return false; + } + + string blobName = Path.GetFileName(filePath); + bool success = false; + BlobContainerClient container = null; + + if (AuthenticationType == AuthenticationType.ConnectionString) + { + char[] arr = SecureStringToCharArray(StorageConnectionString); + + // SecureStringToCharArray returns null if it fails to convert. It logs failure reason to the local log directory of this observer. + if (arr == null) + { + return false; + } + + string s = new string(arr); + + // Create a client that can authenticate with a connection string. + container = new BlobContainerClient(s, BlobContainerName); + } + else + { + string accountName = StorageAccountName; + char[] arr = SecureStringToCharArray(StorageAccountKey); + + // SecureStringToCharArray returns null if it fails to convert. It logs failure reason to the local log directory of this observer. + if (arr == null) + { + return false; + } + + string accountKey = new string(arr); + Uri serviceUri = new Uri($"https://{accountName}.blob.core.windows.net/{BlobContainerName}"); + + // Create a SharedKeyCredential that we can use to authenticate. + StorageSharedKeyCredential credential = new StorageSharedKeyCredential(accountName, accountKey); + + // Create a client that can authenticate with a shared key credential. + container = new BlobContainerClient(serviceUri, credential); + } + + _ = container.CreateIfNotExists(); + token.ThrowIfCancellationRequested(); + BlobClient blob = container.GetBlobClient(blobName); + + // Upload local zip file. 
+ + await blob.UploadAsync(filePath, token).ContinueWith( + (response) => + { + if (response.IsFaulted) + { + ObserverLogger.LogWarning($"Upload of blob {Path.GetFileName(filePath)} " + + $"failed:{Environment.NewLine}{response.Exception.Message}"); + + // Try and delete local duplicate blob file (it already exists in the blob storage account). + if (response.Exception.InnerException is RequestFailedException ex) + { + if (ex.ErrorCode == "BlobAlreadyExists" || ex.HResult == -2146233088) + { + ObserverLogger.LogWarning($"Deleting duplicate blob {blobName} from local disk.."); + try + { + Retry.Do(() => File.Delete(filePath), TimeSpan.FromSeconds(1), token); + ObserverLogger.LogWarning($"Successfully deleted duplicate blob {blobName} from local disk.."); + } + catch (AggregateException ae) + { + ObserverLogger.LogWarning($"Can't delete local duplicate blob {blobName}:{Environment.NewLine}{ae}"); + } + } + } + success = false; + } + else if (response.IsCompletedSuccessfully) + { + ObserverLogger.LogInfo($"Successfully uploaded file {Path.GetFileName(filePath)} " + + $"to blob container {BlobContainerName}."); + success = true; + } + + }, token).ConfigureAwait(true); + + return success; + } + + // SecureString helpers \\ + + private SecureString SecureStringFromCharArray(char[] charArray, int start, int end) + { + SecureString secureString = new SecureString(); + + try + { + for (int i = start; i < end; i++) + { + Token.ThrowIfCancellationRequested(); + + secureString.AppendChar(charArray[i]); + } + } + catch (Exception e) when (!(e is OperationCanceledException)) + { + ObserverLogger.LogWarning($"Unable to create SecureString from supplied char array:{Environment.NewLine}{e}"); + return null; + } + + secureString.MakeReadOnly(); + return secureString; + } + + private char[] SecureStringToCharArray(SecureString secureString) + { + char[] charArray = new char[secureString.Length]; + IntPtr ptr = IntPtr.Zero; + + try + { + ptr = 
SecureStringMarshal.SecureStringToGlobalAllocUnicode(secureString); + Marshal.Copy(ptr, charArray, 0, secureString.Length); + } + catch (Exception e) when (e is ArgumentException || e is NotSupportedException) + { + ObserverLogger.LogWarning($"Can't convert SecureString instance to string:{Environment.NewLine}{e}"); + charArray = null; + } + finally + { + Marshal.ZeroFreeGlobalAllocUnicode(ptr); + } + + return charArray; + } + } + + public enum AuthenticationType + { + ConnectionString, + SharedKey + } +} diff --git a/FabricObserver/Observers/CertificateObserver.cs b/FabricObserver/Observers/CertificateObserver.cs index 0d44aeac..74e020f4 100644 --- a/FabricObserver/Observers/CertificateObserver.cs +++ b/FabricObserver/Observers/CertificateObserver.cs @@ -7,7 +7,6 @@ using System.Runtime.InteropServices; using System.Security; using System.Security.Cryptography.X509Certificates; -using System.ServiceModel; using System.Threading; using System.Threading.Tasks; using System.Xml; @@ -78,7 +77,6 @@ public SecurityConfiguration SecurityConfiguration public override async Task ObserveAsync(CancellationToken token) { // Only run once per specified time in Settings.xml. (default is already set to 1 day for CertificateObserver) - // See Settings.xml, CertificateObserverConfiguration section, RunInterval parameter. 
if (RunInterval > TimeSpan.MinValue && DateTime.Now.Subtract(LastRunDateTime) < RunInterval) { return; @@ -89,6 +87,8 @@ public override async Task ObserveAsync(CancellationToken token) return; } + Token = token; + await Initialize(token).ConfigureAwait(true); ExpiredWarnings = new List(); @@ -135,10 +135,7 @@ public override async Task ObserveAsync(CancellationToken token) } catch (SecurityException e) { - WriteToLogWithLevel( - ObserverName, - $"Can't access {store.Name} due to {e.Message} - {e.StackTrace}", - LogLevel.Warning); + ObserverLogger.LogWarning($"Can't access {store.Name} due to {e.Message} - {e.StackTrace}"); } finally { @@ -395,7 +392,7 @@ private async Task GetSecurityTypes(CancellationToken token) } else { - throw new ActionNotSupportedException("if X509FindTime attribute, value should be FindBySubjectName"); + throw new Exception("if X509FindTime attribute, value should be FindBySubjectName"); } } @@ -411,10 +408,7 @@ private async Task GetSecurityTypes(CancellationToken token) } catch (Exception e) when (!(e is OperationCanceledException)) { - WriteToLogWithLevel( - ObserverName, - $"There was an issue parsing the cluster manifest. Observer cannot run. Error Details:{Environment.NewLine}{e}", - LogLevel.Error); + ObserverLogger.LogError($"There was an issue parsing the cluster manifest. Observer cannot run. 
Error Details:{Environment.NewLine}{e}"); throw; } diff --git a/FabricObserver/Observers/DiskObserver.cs b/FabricObserver/Observers/DiskObserver.cs index ba83152c..d53bf44d 100644 --- a/FabricObserver/Observers/DiskObserver.cs +++ b/FabricObserver/Observers/DiskObserver.cs @@ -186,10 +186,7 @@ public override async Task ObserveAsync(CancellationToken token) } catch (Exception e) when (!(e is OperationCanceledException)) { - WriteToLogWithLevel( - ObserverName, - $"Unhandled exception in ObserveAsync:{Environment.NewLine}{e}", - LogLevel.Error); + ObserverLogger.LogError($"Unhandled exception in ObserveAsync:{Environment.NewLine}{e}"); // Fix the bug.. throw; @@ -291,11 +288,7 @@ in FabricObserver.Extensibility project. */ throw; } - HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.OriginalString, - ObserverName, - HealthState.Warning, - $"Unhandled exception in GetSystemCpuMemoryValuesAsync:{Environment.NewLine}{e}"); + ObserverLogger.LogWarning($"Unhandled exception in GetSystemCpuMemoryValuesAsync:{Environment.NewLine}{e}"); // Fix the bug.. throw; @@ -360,11 +353,7 @@ private void SetErrorWarningThresholds() throw; } - HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.OriginalString, - ObserverName, - HealthState.Warning, - $"Unhandled exception in SetErrorWarningThresholds:{Environment.NewLine}{e}"); + ObserverLogger.LogWarning($"Unhandled exception in SetErrorWarningThresholds:{Environment.NewLine}{e}"); // Fix the bug... throw; } diff --git a/FabricObserver/Observers/FabricSystemObserver.cs b/FabricObserver/Observers/FabricSystemObserver.cs index 919d5d45..49f1cdff 100644 --- a/FabricObserver/Observers/FabricSystemObserver.cs +++ b/FabricObserver/Observers/FabricSystemObserver.cs @@ -810,7 +810,7 @@ private async Task GetProcessInfoAsync(string procName) // This is used for info report. 
TotalAllocatedHandlesAllSystemServices += handles; - + // No need to proceed further if there are no configuration settings for CPU, Memory, Handles thresholds. // Returning here is correct as supplied thresholds apply to all system services. if (CpuErrorUsageThresholdPct <= 0 && CpuWarnUsageThresholdPct <= 0 && MemErrorUsageThresholdMb <= 0 && MemWarnUsageThresholdMb <= 0 @@ -819,6 +819,12 @@ private async Task GetProcessInfoAsync(string procName) return; } + // Handles/FDs + if (AllocatedHandlesError > 0 || AllocatedHandlesWarning > 0) + { + allHandlesData.FirstOrDefault(x => x.Id == dotnetArg).Data.Add(handles); + } + CpuUsage cpuUsage = new CpuUsage(); // Mem @@ -831,13 +837,6 @@ private async Task GetProcessInfoAsync(string procName) } } - // Allocated Handles - if (AllocatedHandlesError > 0 || AllocatedHandlesWarning > 0) - { - float handleCount = ProcessInfoProvider.Instance.GetProcessAllocatedHandles(process.Id, FabricServiceContext); - allHandlesData.FirstOrDefault(x => x.Id == dotnetArg).Data.Add(handleCount); - } - TimeSpan duration = TimeSpan.FromSeconds(1); if (MonitorDuration > TimeSpan.MinValue) @@ -902,6 +901,8 @@ private async Task GetProcessInfoAsync(string procName) timer.Stop(); timer.Reset(); + + await Task.Delay(250, Token).ConfigureAwait(false); } } @@ -1015,6 +1016,8 @@ private void CleanUp() allActiveTcpPortData?.Clear(); allActiveTcpPortData = null; } + + ProcessInfoProvider.Instance?.Dispose(); } } } diff --git a/FabricObserver/Observers/NetworkObserver.cs b/FabricObserver/Observers/NetworkObserver.cs index 011991ef..8eedec89 100644 --- a/FabricObserver/Observers/NetworkObserver.cs +++ b/FabricObserver/Observers/NetworkObserver.cs @@ -69,7 +69,6 @@ public class NetworkObserver : ObserverBase private readonly ConfigSettings configSettings; private HealthState healthState = HealthState.Ok; private bool hasRun; - private CancellationToken cancellationToken; private int tcpConnTestRetried; /// @@ -96,22 +95,19 @@ public override async Task 
ObserveAsync(CancellationToken token) stopwatch.Stop(); stopwatch.Reset(); LastRunDateTime = DateTime.Now; - return; } - cancellationToken = token; - - if (cancellationToken.IsCancellationRequested) + if (token.IsCancellationRequested) { return; } + Token = token; stopwatch.Start(); // Run conn tests. Retry.Do(InternetConnectionStateIsConnected, TimeSpan.FromSeconds(10), token); - await ReportAsync(token).ConfigureAwait(true); // The time it took to run this observer. @@ -336,7 +332,7 @@ private static string GetNetworkInterfaceInfo(CancellationToken token) private async Task InitializeAsync() { - cancellationToken.ThrowIfCancellationRequested(); + Token.ThrowIfCancellationRequested(); // This only needs to be logged once. // This file is used by the ObserverWebApi application. @@ -346,13 +342,9 @@ private async Task InitializeAsync() Console.WriteLine($"logPath: {logPath}"); - if (!ObserverLogger.TryWriteLogFile(logPath, GetNetworkInterfaceInfo(cancellationToken))) + if (!ObserverLogger.TryWriteLogFile(logPath, GetNetworkInterfaceInfo(Token))) { - HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.OriginalString, - ObserverName, - HealthState.Warning, - "Unable to create NetInfo.txt file."); + ObserverLogger.LogWarning("Unable to create NetInfo.txt file."); } } @@ -407,11 +399,11 @@ private void InternetConnectionStateIsConnected() foreach (var config in configList) { - cancellationToken.ThrowIfCancellationRequested(); + Token.ThrowIfCancellationRequested(); foreach (var endpoint in config.Endpoints) { - cancellationToken.ThrowIfCancellationRequested(); + Token.ThrowIfCancellationRequested(); if (string.IsNullOrEmpty(endpoint.HostName)) { @@ -439,7 +431,7 @@ private void InternetConnectionStateIsConnected() // E.g., REST endpoints, etc. 
try { - cancellationToken.ThrowIfCancellationRequested(); + Token.ThrowIfCancellationRequested(); ServicePointManager.SecurityProtocol = SecurityProtocolType.SystemDefault; string prefix = endpoint.Port == 443 ? "https://" : "http://"; @@ -498,11 +490,7 @@ private void InternetConnectionStateIsConnected() } catch (Exception e) when (!(e is OperationCanceledException)) { - HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.OriginalString, - ObserverName, - HealthState.Warning, - e.ToString()); + ObserverLogger.LogWarning(e.ToString()); // Fix the bug.. throw; diff --git a/FabricObserver/Observers/NodeObserver.cs b/FabricObserver/Observers/NodeObserver.cs index b3e7da80..571aa13a 100644 --- a/FabricObserver/Observers/NodeObserver.cs +++ b/FabricObserver/Observers/NodeObserver.cs @@ -373,11 +373,8 @@ public override Task ReportAsync(CancellationToken token) } catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.OriginalString, - ObserverName, - HealthState.Warning, - $"Unhandled exception re-thrown:{Environment.NewLine}{e}"); + ObserverLogger.LogWarning($"Unhandled exception re-thrown:{Environment.NewLine}{e}"); + // Fix the bug.. throw; } @@ -679,6 +676,19 @@ error on these conditions. } } + // Ports. 
+ if (ActivePortsData != null && (ActivePortsErrorThreshold > 0 || ActivePortsWarningThreshold > 0)) + { + int activePortCountTotal = OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(); + ActivePortsData.Data.Add(activePortCountTotal); + } + + if (EphemeralPortsData != null && (EphemeralPortsErrorThreshold > 0 || EphemeralPortsWarningThreshold > 0)) + { + int ephemeralPortCountTotal = OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(); + EphemeralPortsData.Data.Add(ephemeralPortCountTotal); + } + timer.Start(); while (timer.Elapsed <= duration) @@ -703,19 +713,6 @@ error on these conditions. MemDataPercentUsed.Data.Add(OperatingSystemInfoProvider.Instance.TupleGetTotalPhysicalMemorySizeAndPercentInUse().PercentInUse); } - // Ports. - if (ActivePortsData != null && (ActivePortsErrorThreshold > 0 || ActivePortsWarningThreshold > 0)) - { - int activePortCountTotal = OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(); - ActivePortsData.Data.Add(activePortCountTotal); - } - - if (EphemeralPortsData != null && (EphemeralPortsErrorThreshold > 0 || EphemeralPortsWarningThreshold > 0)) - { - int ephemeralPortCountTotal = OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(); - EphemeralPortsData.Data.Add(ephemeralPortCountTotal); - } - await Task.Delay(250, Token).ConfigureAwait(true); } @@ -724,11 +721,7 @@ error on these conditions. } catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.OriginalString, - ObserverName, - HealthState.Warning, - $"Unhandled exception in GetSystemCpuMemoryValuesAsync:{Environment.NewLine}{e}"); + ObserverLogger.LogWarning($"Unhandled exception in GetSystemCpuMemoryValuesAsync:{Environment.NewLine}{e}"); // Fix the bug.. 
throw; } diff --git a/FabricObserver/Observers/OSObserver.cs b/FabricObserver/Observers/OSObserver.cs index 8f5151b8..d9a91d89 100644 --- a/FabricObserver/Observers/OSObserver.cs +++ b/FabricObserver/Observers/OSObserver.cs @@ -200,11 +200,7 @@ public override async Task ReportAsync(CancellationToken token) // This file is used by the web application (log reader.). if (!ObserverLogger.TryWriteLogFile(logPath, $"Last updated on {DateTime.UtcNow:M/d/yyyy HH:mm:ss} UTC
{osReport}")) { - HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.OriginalString, - ObserverName, - HealthState.Warning, - "Unable to create SysInfo.txt file."); + ObserverLogger.LogWarning("Unable to create SysInfo.txt file."); } } @@ -281,11 +277,7 @@ public override async Task ReportAsync(CancellationToken token) } catch (Exception e) when (!(e is OperationCanceledException)) { - HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.OriginalString, - ObserverName, - HealthState.Error, - $"Unhandled exception processing OS information:{Environment.NewLine}{e}"); + ObserverLogger.LogError($"Unhandled exception processing OS information:{Environment.NewLine}{e}"); // Fix the bug.. throw; @@ -663,11 +655,7 @@ await TelemetryClient.ReportMetricAsync( } catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.OriginalString, - ObserverName, - HealthState.Error, - $"Unhandled Exception processing OS information:{Environment.NewLine}{e}"); + ObserverLogger.LogError($"Unhandled Exception processing OS information:{Environment.NewLine}{e}"); } } } diff --git a/FabricObserver/Observers/ObserverManager.cs b/FabricObserver/Observers/ObserverManager.cs index 78b6c03f..c0a9210a 100644 --- a/FabricObserver/Observers/ObserverManager.cs +++ b/FabricObserver/Observers/ObserverManager.cs @@ -9,7 +9,6 @@ using System.Fabric.Health; using System.IO; using System.Linq; -using System.Runtime; using System.Runtime.InteropServices; using System.Text; using System.Threading; @@ -96,6 +95,12 @@ public bool IsObserverRunning private set; } + public HealthState ObserverFailureHealthStateLevel + { + get; + set; + } = HealthState.Warning; + private ObserverHealthReporter HealthReporter { get; @@ -176,34 +181,35 @@ public ObserverManager(IServiceProvider serviceProvider, FabricClient fabricClie 
logFolderBasePath = logFolderBase; } - if (int.TryParse(GetConfigSettingValue(ObserverConstants.MaxArchivedLogFileLifetimeDays), - out int maxArchivedLogFileLifetimeDays)) + if (int.TryParse(GetConfigSettingValue(ObserverConstants.MaxArchivedLogFileLifetimeDays), out int maxArchivedLogFileLifetimeDays)) { MaxArchivedLogFileLifetimeDays = maxArchivedLogFileLifetimeDays; } // this logs error/warning/info messages for ObserverManager. Logger = new Logger("ObserverManager", logFolderBasePath, MaxArchivedLogFileLifetimeDays > 0 ? MaxArchivedLogFileLifetimeDays : 7); - HealthReporter = new ObserverHealthReporter(Logger, FabricClientInstance); - SetPropertiesFromConfigurationParameters(); serviceCollection = serviceProvider.GetServices(); // Populate the Observer list for the sequential run loop. int capacity = serviceCollection.Count(o => o.IsEnabled); + if (capacity > 0) { observers = new List(capacity); observers.AddRange(serviceCollection.Where(o => o.IsEnabled)); } - - // FabricObserver Internal Diagnostic Telemetry (Non-PII). - // Internally, TelemetryEvents determines current Cluster Id as a unique identifier for transmitted events. - if (!FabricObserverInternalTelemetryEnabled) + else { + Logger.LogWarning("There are no observers enabled. Aborting.."); return; } - if (FabricServiceContext == null) + HealthReporter = new ObserverHealthReporter(Logger, FabricClientInstance); + SetPropertiesFromConfigurationParameters(); + + // FabricObserver Internal Diagnostic Telemetry (Non-PII). + // Internally, TelemetryEvents determines current Cluster Id as a unique identifier for transmitted events. 
+ if (!FabricObserverInternalTelemetryEnabled) { return; } @@ -211,8 +217,8 @@ public ObserverManager(IServiceProvider serviceProvider, FabricClient fabricClie string codePkgVersion = FabricServiceContext.CodePackageActivationContext.CodePackageVersion; string serviceManifestVersion = FabricServiceContext.CodePackageActivationContext.GetConfigurationPackageObject("Config").Description.ServiceManifestVersion; string filepath = Path.Combine(Logger.LogFolderBasePath, - $"fo_telemetry_sent_{codePkgVersion.Replace(".", string.Empty)}_" + - $"{serviceManifestVersion.Replace(".", string.Empty)}_{FabricServiceContext.NodeContext.NodeType}.log"); + $"fo_telemetry_sent_{codePkgVersion.Replace(".", string.Empty)}_" + + $"{serviceManifestVersion.Replace(".", string.Empty)}_{FabricServiceContext.NodeContext.NodeType}.log"); #if !DEBUG // If this has already been sent for this activated version (code/config) of node type x @@ -221,18 +227,20 @@ public ObserverManager(IServiceProvider serviceProvider, FabricClient fabricClie return; } #endif - var telemetryEvents = new TelemetryEvents( - FabricClientInstance, - FabricServiceContext, - ServiceEventSource.Current, - this.token); + try + { + var telemetryEvents = new TelemetryEvents(FabricClientInstance, FabricServiceContext, ServiceEventSource.Current, this.token); - if (telemetryEvents.FabricObserverRuntimeNodeEvent(codePkgVersion, - GetFabricObserverInternalConfiguration(), "HealthState.Initialized")) + if (telemetryEvents.FabricObserverRuntimeNodeEvent(codePkgVersion, GetFabricObserverInternalConfiguration(), "HealthState.Initialized")) + { + // Log a file to prevent re-sending this in case of process restart(s). + // This non-PII FO/Cluster info is versioned and should only be sent once per deployment (config or code updates). + _ = Logger.TryWriteLogFile(filepath, GetFabricObserverInternalConfiguration()); + } + } + catch { - // Log a file to prevent re-sending this in case of process restart(s). 
- // This non-PII FO/Cluster info is versioned and should only be sent once per deployment (config or code updates). - _ = Logger.TryWriteLogFile(filepath, GetFabricObserverInternalConfiguration()); + } } @@ -241,7 +249,7 @@ public async Task StartObserversAsync() try { // Nothing to do here. - if (observers?.Count == 0) + if (observers == null || observers.Count == 0) { return; } @@ -254,14 +262,12 @@ public async Task StartObserversAsync() { if (!isConfigurationUpdateInProgress && (shutdownSignaled || token.IsCancellationRequested)) { - await ShutDownAsync().ConfigureAwait(true); + await ShutDownAsync().ConfigureAwait(false); break; } - if (!await RunObserversAsync().ConfigureAwait(true)) - { - continue; - } + _ = await RunObserversAsync().ConfigureAwait(false); + /* Note the below use of GC.Collect is NOT a general recommendation for what to do in your own managed service code or app code. Please don't make that connection. You should generally not have to call GC.Collect from user service code. It just depends on your performance needs. @@ -273,13 +279,10 @@ make that connection. You should generally not have to call GC.Collect from user Out of the box, FO will generally consume less than 100MB of workingset. Most of this (~65-70%) is held in native memory. FO workingset can increase depending upon how many services you monitor, how you write your plugins with respect to memory consumption, etc.. */ + // SOH, sweep-only collection (no compaction). This will clear the early generation objects (short-lived) from memory. This only impacts the FO process. 
GC.Collect(0, GCCollectionMode.Forced, true, false); GC.Collect(1, GCCollectionMode.Forced, true, false); - // Compact LOH - GCSettings.LargeObjectHeapCompactionMode = GCLargeObjectHeapCompactionMode.CompactOnce; - GC.Collect(2, GCCollectionMode.Forced, true, true); - if (ObserverExecutionLoopSleepSeconds > 0) { await Task.Delay(TimeSpan.FromSeconds(ObserverExecutionLoopSleepSeconds), token); @@ -676,6 +679,17 @@ private void SetPropertiesFromConfigurationParameters() // ObserverWebApi. ObserverWebAppDeployed = bool.TryParse(GetConfigSettingValue(ObserverConstants.ObserverWebApiEnabled), out bool obsWeb) && obsWeb && IsObserverWebApiAppInstalled(); + // ObserverFailure HealthState Level + string state = GetConfigSettingValue(ObserverConstants.ObserverFailureHealthStateLevelParameter); + if (string.IsNullOrWhiteSpace(state) || state?.ToLower() == "none") + { + ObserverFailureHealthStateLevel = HealthState.Unknown; + } + else if (Enum.TryParse(state, out HealthState healthState)) + { + ObserverFailureHealthStateLevel = healthState; + } + // (Assuming Diagnostics/Analytics cloud service implemented) Telemetry. if (bool.TryParse(GetConfigSettingValue(ObserverConstants.TelemetryEnabled), out bool telemEnabled)) { @@ -790,7 +804,7 @@ private async Task RunObserversAsync() { if (TaskCancelled || shutdownSignaled) { - return false; + return true; } // Is it healthy? @@ -804,14 +818,14 @@ private async Task RunObserversAsync() IsObserverRunning = true; // Synchronous call. - var isCompleted = observer.ObserveAsync(linkedSFRuntimeObserverTokenSource?.Token ?? token).Wait(observerExecTimeout); + bool isCompleted = observer.ObserveAsync(linkedSFRuntimeObserverTokenSource?.Token ?? token).Wait(observerExecTimeout); // The observer is taking too long (hung?), move on to next observer. // Currently, this observer will not run again for the lifetime of this FO service instance. 
if (!isCompleted && !(TaskCancelled || shutdownSignaled)) { - string observerHealthWarning = $"{observer.ObserverName} has exceeded its specified run time of {observerExecTimeout.TotalSeconds} seconds. " + - $"This means something is wrong with {observer.ObserverName}. It will not be run again. Look into it."; + string observerHealthWarning = $"{observer.ObserverName} has exceeded its specified Maximum run time of {observerExecTimeout.TotalSeconds} seconds. " + + $"This means something is wrong with {observer.ObserverName}. It will not be run again. Please look into it."; Logger.LogError(observerHealthWarning); observer.IsUnhealthy = true; @@ -823,7 +837,7 @@ private async Task RunObserversAsync() { Description = observerHealthWarning, HealthState = "Error", - Metric = $"{observer.ObserverName}_ServiceHealth", + Metric = $"{observer.ObserverName}_HealthState", NodeName = nodeName, ObserverName = ObserverConstants.ObserverManagerName, Source = ObserverConstants.FabricObserverName @@ -836,16 +850,36 @@ private async Task RunObserversAsync() if (EtwEnabled) { Logger.LogEtw( - ObserverConstants.FabricObserverETWEventName, - new - { - Description = observerHealthWarning, - HealthState = "Error", - Metric = $"{observer.ObserverName}_ServiceHealth", - NodeName = nodeName, - ObserverName = ObserverConstants.ObserverManagerName, - Source = ObserverConstants.FabricObserverName - }); + ObserverConstants.FabricObserverETWEventName, + new + { + Description = observerHealthWarning, + HealthState = "Error", + Metric = $"{observer.ObserverName}_HealthState", + NodeName = nodeName, + ObserverName = ObserverConstants.ObserverManagerName, + Source = ObserverConstants.FabricObserverName + }); + } + + // Put FO into Warning or Error (health state is configurable in Settings.xml) + if (ObserverFailureHealthStateLevel != HealthState.Unknown) + { + var healthReport = new HealthReport + { + AppName = new Uri($"fabric:/{ObserverConstants.FabricObserverName}"), + EmitLogEvent = false, + 
HealthMessage = observerHealthWarning, + HealthReportTimeToLive = TimeSpan.MaxValue, + Property = $"{observer.ObserverName}_HealthState", + ReportType = HealthReportType.Application, + State = ObserverFailureHealthStateLevel, + NodeName = this.nodeName, + Observer = ObserverConstants.ObserverManagerName, + }; + + // Generate a Service Fabric Health Report. + HealthReporter.ReportHealthToServiceFabric(healthReport); } continue; @@ -920,12 +954,7 @@ ex.InnerException is OperationCanceledException || } catch (Exception e) { - HealthReporter.ReportFabricObserverServiceHealth( - ObserverConstants.ObserverManagerName, - ApplicationName, - HealthState.Error, - $"Unhandled Exception from {observer.ObserverName}:{Environment.NewLine}{e}"); - + Logger.LogWarning($"Unhandled Exception from {observer.ObserverName}:{Environment.NewLine}{e}"); allExecuted = false; } @@ -938,12 +967,7 @@ ex.InnerException is OperationCanceledException || } else { - HealthReporter.ReportFabricObserverServiceHealth( - ObserverConstants.ObserverManagerName, - ApplicationName, - HealthState.Warning, - exceptionBuilder.ToString()); - + Logger.LogWarning(exceptionBuilder.ToString()); _ = exceptionBuilder.Clear(); } diff --git a/FabricObserver/Observers/SFConfigurationObserver.cs b/FabricObserver/Observers/SFConfigurationObserver.cs index 82a20426..003c2fc2 100644 --- a/FabricObserver/Observers/SFConfigurationObserver.cs +++ b/FabricObserver/Observers/SFConfigurationObserver.cs @@ -67,7 +67,7 @@ public SFConfigurationObserver(FabricClient fabricClient, StatelessServiceContex public override async Task ObserveAsync(CancellationToken token) { - if (!IsObserverWebApiAppDeployed || RunInterval > TimeSpan.MinValue && DateTime.Now.Subtract(LastRunDateTime) < RunInterval) + if (!IsObserverWebApiAppDeployed || (RunInterval > TimeSpan.MinValue && DateTime.Now.Subtract(LastRunDateTime) < RunInterval)) { return; } @@ -77,6 +77,8 @@ public override async Task ObserveAsync(CancellationToken token) return; } + Token 
= token; + try { var config = ServiceFabricConfiguration.Instance; @@ -100,11 +102,7 @@ public override async Task ObserveAsync(CancellationToken token) } catch (Exception e) when (!(e is OperationCanceledException)) { - HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.OriginalString, - ObserverName, - HealthState.Warning, - $"Unhandled Exception in ObserveAsync:{Environment.NewLine}{e}"); + ObserverLogger.LogWarning($"Unhandled Exception in ObserveAsync:{Environment.NewLine}{e}"); // Fix the bug.. throw; @@ -175,11 +173,7 @@ public override async Task ReportAsync(CancellationToken token) // This file is used by the web application (ObserverWebApi). if (!ObserverLogger.TryWriteLogFile(logPath, sb.ToString())) { - HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.OriginalString, - ObserverName, - HealthState.Warning, - "Unable to create SFInfraInfo.txt file."); + ObserverLogger.LogWarning("Unable to create SFInfraInfo.txt file."); } _ = sb.Clear(); diff --git a/FabricObserver/PackageRoot/Config/AppObserver.config.json b/FabricObserver/PackageRoot/Config/AppObserver.config.json index db8fb24f..f751ab78 100644 --- a/FabricObserver/PackageRoot/Config/AppObserver.config.json +++ b/FabricObserver/PackageRoot/Config/AppObserver.config.json @@ -1,7 +1,7 @@ [ { "targetApp": "*", - "appExcludeList": "fabric:/MyApp, fabric:/MyApp42", + "appExcludeList": "fabric:/MyApp42, fabric:/AnotherApp", "cpuWarningLimitPercent": 85, "memoryWarningLimitMb": 1048, "networkWarningActivePorts": 8000, @@ -9,7 +9,7 @@ }, { "targetAppType": "SomeAppType", - "memoryWarningLimitMb": 500, + "memoryWarningLimitPercent": 20, "networkWarningEphemeralPorts": 5000 } ] \ No newline at end of file diff --git a/FabricObserver/PackageRoot/Config/Settings.xml b/FabricObserver/PackageRoot/Config/Settings.xml index 50ada035..b7b4f36f 100644 --- a/FabricObserver/PackageRoot/Config/Settings.xml +++ 
b/FabricObserver/PackageRoot/Config/Settings.xml @@ -88,6 +88,10 @@ + + + - + - + - + + + + + + + + +
@@ -236,10 +251,8 @@ - - @@ -276,22 +289,40 @@
- + + + + + + + + + + + + - - + + \ No newline at end of file diff --git a/FabricObserver/PackageRoot/ServiceManifest._linux.xml b/FabricObserver/PackageRoot/ServiceManifest._linux.xml index 0dae5929..56000619 100644 --- a/FabricObserver/PackageRoot/ServiceManifest._linux.xml +++ b/FabricObserver/PackageRoot/ServiceManifest._linux.xml @@ -1,6 +1,6 @@  @@ -11,7 +11,7 @@ - + setcaps.sh @@ -27,10 +27,10 @@ - + - + \ No newline at end of file diff --git a/FabricObserver/PackageRoot/ServiceManifest.xml b/FabricObserver/PackageRoot/ServiceManifest.xml index 52abdf89..1b9bde6a 100644 --- a/FabricObserver/PackageRoot/ServiceManifest.xml +++ b/FabricObserver/PackageRoot/ServiceManifest.xml @@ -1,6 +1,6 @@  @@ -11,7 +11,7 @@ - + FabricObserver @@ -21,10 +21,10 @@ - + - + \ No newline at end of file diff --git a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml index c35b3467..5c127159 100644 --- a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml +++ b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml @@ -1,5 +1,5 @@  - + + + @@ -27,9 +28,10 @@ - + + - + @@ -37,7 +39,8 @@ - + + @@ -51,11 +54,12 @@ - + + - + @@ -65,7 +69,8 @@ - + + @@ -76,6 +81,15 @@ + + + + + + + + @@ -108,7 +122,6 @@ - @@ -130,12 +143,26 @@ + + + + + + + + + + - + @@ -154,6 +181,10 @@ + + + +
@@ -255,17 +286,36 @@
- + + + + + + + + + +
-
--> +
-
@@ -280,9 +330,17 @@ - + + --> + + +
\ No newline at end of file diff --git a/FabricObserverTests/FabricObserverTests.csproj b/FabricObserverTests/FabricObserverTests.csproj index a76fe764..a58b85e1 100644 --- a/FabricObserverTests/FabricObserverTests.csproj +++ b/FabricObserverTests/FabricObserverTests.csproj @@ -44,10 +44,10 @@ - - - - + + + + all runtime; build; native; contentfiles; analyzers; buildtransitive @@ -56,4 +56,9 @@ + + + PreserveNewest + + diff --git a/FabricObserverTests/ObserverTest.cs b/FabricObserverTests/ObserverTest.cs index b2285489..e08073b4 100644 --- a/FabricObserverTests/ObserverTest.cs +++ b/FabricObserverTests/ObserverTest.cs @@ -260,7 +260,7 @@ public async Task AppObserver_ObserveAsync_Successful_Observer_IsHealthy() return; } - using var client = new FabricClient(FabricClientRole.User); + using var client = new FabricClient(); var startDateTime = DateTime.Now; ObserverManager.FabricServiceContext = context; @@ -289,6 +289,43 @@ public async Task AppObserver_ObserveAsync_Successful_Observer_IsHealthy() await CleanupTestHealthReportsAsync(obs).ConfigureAwait(true); } + [TestMethod] + public async Task AppObserver_ObserveAsync_OldConfigStyle_Successful_Observer_IsHealthy() + { + if (!isSFRuntimePresentOnTestMachine) + { + return; + } + + using var client = new FabricClient(FabricClientRole.User); + var startDateTime = DateTime.Now; + + ObserverManager.FabricServiceContext = context; + ObserverManager.FabricClientInstance = client; + ObserverManager.TelemetryEnabled = false; + ObserverManager.EtwEnabled = false; + + using var obs = new AppObserver(client, context) + { + MonitorDuration = TimeSpan.FromSeconds(1), + ConfigPackagePath = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "AppObserver.config.oldstyle.json"), + ReplicaOrInstanceList = new List() + }; + + await obs.ObserveAsync(token).ConfigureAwait(true); + + // observer ran to completion with no errors. 
+ Assert.IsTrue(obs.LastRunDateTime > startDateTime); + + // observer detected no warning conditions. + Assert.IsFalse(obs.HasActiveFabricErrorOrWarning); + + // observer did not have any internal errors during run. + Assert.IsFalse(obs.IsUnhealthy); + + await CleanupTestHealthReportsAsync(obs).ConfigureAwait(true); + } + [TestMethod] public async Task ClusterObserver_ObserveAsync_Successful_Observer_IsHealthy() { @@ -312,6 +349,11 @@ public async Task ClusterObserver_ObserveAsync_Successful_Observer_IsHealthy() [TestMethod] public async Task Successful_CertificateObserver_Run_Cancellation_Via_ObserverManager() { + if (!isSFRuntimePresentOnTestMachine) + { + return; + } + using var client = new FabricClient(FabricClientRole.User); ObserverManager.FabricServiceContext = context; @@ -408,6 +450,11 @@ public async Task Successful_ClusterObserver_Run_Cancellation_Via_ClusterObserve [TestMethod] public async Task Successful_FabricSystemObserver_Run_Cancellation_Via_ObserverManager() { + if (!isSFRuntimePresentOnTestMachine) + { + return; + } + using var client = new FabricClient(); ObserverManager.FabricServiceContext = context; @@ -439,7 +486,12 @@ public async Task Successful_FabricSystemObserver_Run_Cancellation_Via_ObserverM [TestMethod] public async Task Successful_NetworkObserver_Run_Cancellation_Via_ObserverManager() { - using var client = new FabricClient(FabricClientRole.User); + if (!isSFRuntimePresentOnTestMachine) + { + return; + } + + using var client = new FabricClient(); ObserverManager.FabricServiceContext = context; ObserverManager.FabricClientInstance = client; @@ -466,6 +518,11 @@ public async Task Successful_NetworkObserver_Run_Cancellation_Via_ObserverManage [TestMethod] public async Task Successful_NodeObserver_Run_Cancellation_Via_ObserverManager() { + if (!isSFRuntimePresentOnTestMachine) + { + return; + } + using var client = new FabricClient(); ObserverManager.FabricServiceContext = context; @@ -498,6 +555,11 @@ public async Task 
Successful_NodeObserver_Run_Cancellation_Via_ObserverManager() [TestMethod] public async Task Successful_OSObserver_Run_Cancellation_Via_ObserverManager() { + if (!isSFRuntimePresentOnTestMachine) + { + return; + } + using var client = new FabricClient(FabricClientRole.User); ObserverManager.FabricServiceContext = context; diff --git a/FabricObserverTests/PackageRoot/Config/AppObserver.config.oldstyle.json b/FabricObserverTests/PackageRoot/Config/AppObserver.config.oldstyle.json new file mode 100644 index 00000000..7d9b7732 --- /dev/null +++ b/FabricObserverTests/PackageRoot/Config/AppObserver.config.oldstyle.json @@ -0,0 +1,32 @@ +[ + { + "targetAppType": "FabricObserverType", + "cpuWarningLimitPercent": 30, + "memoryWarningLimitPercent": 20 + }, + { + "targetApp": "fabric:/CpuStress", + "cpuWarningLimitPercent": 90, + "memoryWarningLimitPercent": 60 + }, + { + "targetApp": "Malformed AppName App1", + "cpuWarningLimitPercent": 70, + "memoryWarningLimitPercent": 20 + }, + { + "targetAppType": "ClusterObserverType", + "cpuWarningLimitPercent": 30, + "memoryWarningLimitPercent": 20 + }, + { + "targetApp": "fabric:/BadApp", + "cpuWarningLimitPercent": 60, + "memoryWarningLimitPercent": 30 + }, + { + "targetApp": "Malformed AppName App1", + "cpuWarningLimitPercent": 70, + "memoryWarningLimitPercent": 20 + } +] diff --git a/README.md b/README.md index fcd4f8f3..ee684b50 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# FabricObserver 3.1.15 +# FabricObserver 3.1.16 [**FabricObserver (FO)**](https://github.com/microsoft/service-fabric-observer/releases) is a complete implementation of a generic resource usage watchdog service written as a stateless, singleton Service Fabric .NET Core 3.1 application that 1. Monitors a broad range of machine resources that tend to be very important to all Service Fabric applications, like disk space consumption, CPU use, memory use, endpoint availability, ephemeral TCP port use, and app/cluster certificate health out-of-the-box. 
@@ -159,6 +159,7 @@ Here are the current observers and what they monitor: | Resource | Observer | | --- | --- | | Application (services) resource usage health monitoring across CPU, File Handles, Memory, Ports (TCP) | AppObserver | +| Looks for dmp and zip files in AppObserver's MemoryDumps folder, compresses (if necessary) and uploads them to your specified Azure storage account (blob only, AppObserver only, and still Windows only in this version of FO) | AzureStorageUploadObserver | | Application (user) and cluster certificate health monitoring | CertificateObserver | | Disk (local storage disk health/availability, space usage, IO) | DiskObserver | | SF System Services resource usage health monitoring across CPU, File Handles, Memory, Ports (TCP) | FabricSystemObserver | diff --git a/SampleObserverPlugin/SampleObserverPlugin.csproj b/SampleObserverPlugin/SampleObserverPlugin.csproj index ab6c5083..a9f4e334 100644 --- a/SampleObserverPlugin/SampleObserverPlugin.csproj +++ b/SampleObserverPlugin/SampleObserverPlugin.csproj @@ -13,7 +13,7 @@ - + diff --git a/TelemetryLib/TelemetryLib.csproj b/TelemetryLib/TelemetryLib.csproj index 23eb5339..9fd2856d 100644 --- a/TelemetryLib/TelemetryLib.csproj +++ b/TelemetryLib/TelemetryLib.csproj @@ -14,9 +14,9 @@ AnyCPU;x64 - + - +