From 2e45db7f5d00c81f76a4d3dfc3cf5b54d77eb712 Mon Sep 17 00:00:00 2001 From: Rashmi Chandrashekar Date: Wed, 14 Apr 2021 15:29:37 -0700 Subject: [PATCH 01/11] job first stab --- .../scripts/tomlparser-mdm-metrics-config.rb | 21 +++++++++++ kubernetes/container-azm-ms-agentconfig.yaml | 5 +++ source/plugins/ruby/MdmAlertTemplates.rb | 5 ++- source/plugins/ruby/MdmMetricsGenerator.rb | 36 +++++++++++++++---- source/plugins/ruby/constants.rb | 9 ++--- source/plugins/ruby/podinventory_to_mdm.rb | 3 +- 6 files changed, 64 insertions(+), 15 deletions(-) diff --git a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb index 345c51633..4e9e5000d 100644 --- a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb +++ b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb @@ -13,6 +13,7 @@ @percentageMemoryRssThreshold = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD @percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD +@jobCompletionThresholdMinutes = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES # Use parser to parse the configmap toml file to a ruby structure def parseConfigMap @@ -101,6 +102,25 @@ def populateSettingValuesFromConfigMap(parsedConfig) ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for PV utilization - #{errorStr}, using defaults, please check config map for errors") @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD end + + # Get mdm metrics config settings for job completion + begin + jobCompletion = parsedConfig[:alertable_metrics_configuration_settings][:job_completion_threshold] + if !jobCompletion.nil? 
+ jobCompletionThreshold = jobCompletion[:job_completion_threshold_time_minutes] + jobCompletionThresholdInt = cpuThreshold.to_i + if jobCompletionThresholdInt.kind_of? Integer + @jobCompletionThresholdMinutes = jobCompletionThresholdInt + else + puts "config::Non interger value or value not convertible to integer specified for job completion threshold, using default " + @jobCompletionThresholdMinutes = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES + end + puts "config::Using config map settings for MDM metric configuration settings for job completion" + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for job completion - #{errorStr}, using defaults, please check config map for errors") + @jobCompletionThresholdMinutes = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES + end end end @@ -125,6 +145,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) file.write("export AZMON_ALERT_CONTAINER_MEMORY_RSS_THRESHOLD=#{@percentageMemoryRssThreshold}\n") file.write("export AZMON_ALERT_CONTAINER_MEMORY_WORKING_SET_THRESHOLD=\"#{@percentageMemoryWorkingSetThreshold}\"\n") file.write("export AZMON_ALERT_PV_USAGE_THRESHOLD=#{@percentagePVUsageThreshold}\n") + file.write("export AZMON_ALERT_JOB_COMPLETION_TIME_THRESHOLD=#{@jobCompletionThresholdMinutes}\n") # Close file after writing all MDM setting environment variables file.close puts "****************End MDM Metrics Config Processing********************" diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index e38d9b4ab..543f270c1 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -126,6 +126,11 @@ data: [alertable_metrics_configuration_settings.pv_utilization_thresholds] # Threshold for persistent volume usage bytes, metric will be sent only when persistent volume utilization exceeds or 
becomes equal to the following percentage pv_usage_threshold_percentage = 60.0 + + # Alertable metrics configuration settings for completed jobs count + [alertable_metrics_configuration_settings.job_completion_threshold] + # Threshold for completed job count , metric will be sent only for those jobs which were completed earlier than the following threshold + job_completion_threshold_time_minutes = 360 integrations: |- [integrations.azure_network_policy_manager] collect_basic_metrics = false diff --git a/source/plugins/ruby/MdmAlertTemplates.rb b/source/plugins/ruby/MdmAlertTemplates.rb index ef63cf219..7d167f420 100644 --- a/source/plugins/ruby/MdmAlertTemplates.rb +++ b/source/plugins/ruby/MdmAlertTemplates.rb @@ -28,7 +28,7 @@ class MdmAlertTemplates } }' - Stable_job_metrics_template = ' + Stable_job_metrics_template = ' { "time": "%{timestamp}", "data": { @@ -45,7 +45,7 @@ class MdmAlertTemplates "dimValues": [ "%{controllerNameDimValue}", "%{namespaceDimValue}", - "6" + "%{jobCompletionThreshold}" ], "min": %{containerCountMetricValue}, "max": %{containerCountMetricValue}, @@ -123,7 +123,6 @@ class MdmAlertTemplates } }' - Node_resource_metrics_template = ' { "time": "%{timestamp}", diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index 12d462e44..6d0bd3467 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -96,13 +96,26 @@ def appendPodMetrics(records, metricName, metricHash, batch_time, metricsTemplat podControllerNameDimValue = key_elements[0] podNamespaceDimValue = key_elements[1] - record = metricsTemplate % { - timestamp: batch_time, - metricName: metricName, - controllerNameDimValue: podControllerNameDimValue, - namespaceDimValue: podNamespaceDimValue, - containerCountMetricValue: value, - } + # Special handling for jobs since we need to send the threshold as a dimension as it is configurable + metric_threshold_hash = 
getContainerResourceUtilizationThresholds + if metricName == Constants::MDM_STALE_COMPLETED_JOB_COUNT + record = metricsTemplate % { + timestamp: batch_time, + metricName: metricName, + controllerNameDimValue: podControllerNameDimValue, + namespaceDimValue: podNamespaceDimValue, + containerCountMetricValue: value, + jobCompletionThreshold: metric_threshold_hash[Constants::JOB_COMPLETION_TIME], + } + else + record = metricsTemplate % { + timestamp: batch_time, + metricName: metricName, + controllerNameDimValue: podControllerNameDimValue, + namespaceDimValue: podNamespaceDimValue, + containerCountMetricValue: value, + } + end records.push(Yajl::Parser.parse(StringIO.new(record))) } else @@ -129,9 +142,11 @@ def flushPodMdmMetricTelemetry staleJobHashValues = @stale_job_count_hash.values staleJobMetricCount = staleJobHashValues.inject(0) { |sum, x| sum + x } + metric_threshold_hash = getContainerResourceUtilizationThresholds properties["ContainerRestarts"] = containerRestartMetricCount properties["OomKilledContainers"] = oomKilledContainerMetricCount properties["OldCompletedJobs"] = staleJobMetricCount + properties["JobCompletionThresholdTimeInMinutes"] = metric_threshold_hash[Constants::JOB_COMPLETION_TIME] ApplicationInsightsUtility.sendCustomEvent(Constants::CONTAINER_METRICS_HEART_BEAT_EVENT, properties) ApplicationInsightsUtility.sendCustomEvent(Constants::POD_READY_PERCENTAGE_HEART_BEAT_EVENT, {}) rescue => errorStr @@ -408,6 +423,7 @@ def getContainerResourceUtilizationThresholds metric_threshold_hash[Constants::MEMORY_RSS_BYTES] = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD metric_threshold_hash[Constants::PV_USED_BYTES] = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD + metric_threshold_hash[Constants::JOB_COMPLETION_TIME] = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES cpuThreshold = ENV["AZMON_ALERT_CONTAINER_CPU_THRESHOLD"] if 
!cpuThreshold.nil? && !cpuThreshold.empty? @@ -433,6 +449,12 @@ def getContainerResourceUtilizationThresholds pvUsagePercentageThresholdFloat = (pvUsagePercentageThreshold.to_f).round(2) metric_threshold_hash[Constants::PV_USED_BYTES] = pvUsagePercentageThresholdFloat end + + jobCompletionTimeThreshold = ENV["AZMON_ALERT_JOB_COMPLETION_TIME_THRESHOLD"] + if !jobCompletionTimeThreshold.nil? && !jobCompletionTimeThreshold.empty? + jobCompletionTimeThresholdInt = jobCompletionTimeThreshold.to_i + metric_threshold_hash[Constants::JOB_COMPLETION_TIME] = jobCompletionTimeThresholdInt + end rescue => errorStr @log.info "Error in getContainerResourceUtilizationThresholds: #{errorStr}" ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index cf41900dc..5adb4b5c0 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -65,21 +65,22 @@ class Constants MEMORY_WORKING_SET_BYTES = "memoryWorkingSetBytes" MEMORY_RSS_BYTES = "memoryRssBytes" PV_USED_BYTES = "pvUsedBytes" + JOB_COMPLETION_TIME = "completedJobTimeMinutes" DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD = 95.0 DEFAULT_MDM_MEMORY_RSS_THRESHOLD = 95.0 DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD = 95.0 DEFAULT_MDM_PV_UTILIZATION_THRESHOLD = 60.0 + DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES = 360 CONTROLLER_KIND_JOB = "job" CONTAINER_TERMINATION_REASON_COMPLETED = "completed" CONTAINER_STATE_TERMINATED = "terminated" - STALE_JOB_TIME_IN_MINUTES = 360 TELEGRAF_DISK_METRICS = "container.azm.ms/disk" OMSAGENT_ZERO_FILL = "omsagent" KUBESYSTEM_NAMESPACE_ZERO_FILL = "kube-system" VOLUME_NAME_ZERO_FILL = "-" - PV_TYPES =["awsElasticBlockStore", "azureDisk", "azureFile", "cephfs", "cinder", "csi", "fc", "flexVolume", - "flocker", "gcePersistentDisk", "glusterfs", "hostPath", "iscsi", "local", "nfs", - "photonPersistentDisk", "portworxVolume", "quobyte", "rbd", "scaleIO", "storageos", "vsphereVolume"] + PV_TYPES = 
["awsElasticBlockStore", "azureDisk", "azureFile", "cephfs", "cinder", "csi", "fc", "flexVolume", + "flocker", "gcePersistentDisk", "glusterfs", "hostPath", "iscsi", "local", "nfs", + "photonPersistentDisk", "portworxVolume", "quobyte", "rbd", "scaleIO", "storageos", "vsphereVolume"] #Telemetry constants CONTAINER_METRICS_HEART_BEAT_EVENT = "ContainerMetricsMdmHeartBeatEvent" diff --git a/source/plugins/ruby/podinventory_to_mdm.rb b/source/plugins/ruby/podinventory_to_mdm.rb index 77370e284..d9cb71bd4 100644 --- a/source/plugins/ruby/podinventory_to_mdm.rb +++ b/source/plugins/ruby/podinventory_to_mdm.rb @@ -88,6 +88,7 @@ def initialize() @pod_count_by_phase = {} @pod_uids = {} @process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability + @metric_threshold_hash = MdmMetricsGenerator.getContainerResourceUtilizationThresholds @log.debug "After check_custom_metrics_availability process_incoming_stream #{@process_incoming_stream}" @log.debug { "Starting podinventory_to_mdm plugin" } end @@ -259,7 +260,7 @@ def process_record_for_terminated_job_metric(podControllerNameDimValue, podNames if !containerFinishedTime.nil? && !containerFinishedTime.empty? 
finishedTimeParsed = Time.parse(containerFinishedTime) # Check to see if job was completed 6 hours ago/STALE_JOB_TIME_IN_MINUTES - if ((Time.now - finishedTimeParsed) / 60) > Constants::STALE_JOB_TIME_IN_MINUTES + if ((Time.now - finishedTimeParsed) / 60) > @metric_threshold_hash[Constants::JOB_COMPLETION_TIME] MdmMetricsGenerator.generateStaleJobCountMetrics(podControllerNameDimValue, podNamespaceDimValue) end From 33bf7e042275d149844a972b7a4bd1bb8c95e313 Mon Sep 17 00:00:00 2001 From: Rashmi Chandrashekar Date: Wed, 14 Apr 2021 18:26:10 -0700 Subject: [PATCH 02/11] telemetry changes --- source/plugins/ruby/KubernetesApiClient.rb | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index c5a363741..944561707 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -31,6 +31,7 @@ class KubernetesApiClient @@TokenStr = nil @@NodeMetrics = Hash.new @@WinNodeArray = [] + @@telemetryTimeTracker = DateTime.now.to_time.to_i def initialize end @@ -403,9 +404,12 @@ def getPodUid(podNameSpace, podMetadata) def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) metricItems = [] + timeDifference = (DateTime.now.to_time.to_i - @@telemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 begin clusterId = getClusterId podNameSpace = pod["metadata"]["namespace"] + podName = pod["metadata"]["name"] podUid = getPodUid(podNameSpace, pod["metadata"]) if podUid.nil? 
return metricItems @@ -456,6 +460,20 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle metricProps["Collections"].push(metricCollections) metricItem["DataItems"].push(metricProps) metricItems.push(metricItem) + #Telemetry about omsagent requests and limits + begin + if (podName.downcase.start_with?("omsagent-") && podNameSpace.eql?("kube-system") && containerName.downcase.start_with?("omsagent")) + if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + telemetryProps = {} + telemetryProps["PodName"] = podName + telemetryProps["ContainerName"] = containerName + telemetryProps["Computer"] = nodeName + ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) + end + end + rescue => errorStr + $log.warn("Exception while generating Telemetry from getContainerResourceRequestsAndLimits failed: #{errorStr} for metric #{metricNameToCollect}") + end #No container level limit for the given metric, so default to node level limit else nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect @@ -791,7 +809,7 @@ def getKubeAPIServerUrl def getKubeServicesInventoryRecords(serviceList, batchTime = Time.utc.iso8601) kubeServiceRecords = [] begin - if (!serviceList.nil? && !serviceList.empty? && serviceList.key?("items") && !serviceList["items"].nil? && !serviceList["items"].empty? ) + if (!serviceList.nil? && !serviceList.empty? && serviceList.key?("items") && !serviceList["items"].nil? && !serviceList["items"].empty?) 
servicesCount = serviceList["items"].length @Log.info("KubernetesApiClient::getKubeServicesInventoryRecords : number of services in serviceList #{servicesCount} @ #{Time.now.utc.iso8601}") serviceList["items"].each do |item| From 89287845c15305b13e877b5ae46203743147ad51 Mon Sep 17 00:00:00 2001 From: Rashmi Chandrashekar Date: Thu, 15 Apr 2021 16:26:31 -0700 Subject: [PATCH 03/11] changes --- source/plugins/ruby/KubernetesApiClient.rb | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 944561707..e223dc0b8 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -502,6 +502,10 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle end end end + # reset time outside pod iterator + if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + @@telemetryTimeTracker = DateTime.now.to_time.to_i + end end rescue => error @Log.warn("getcontainerResourceRequestsAndLimits failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") From 6c71cbe199d75258e9b86faa977df069a9b8371a Mon Sep 17 00:00:00 2001 From: Rashmi Chandrashekar Date: Thu, 15 Apr 2021 17:07:44 -0700 Subject: [PATCH 04/11] bug fix --- build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb index 4e9e5000d..5ce5d79d2 100644 --- a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb +++ b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb @@ -108,7 +108,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) jobCompletion = parsedConfig[:alertable_metrics_configuration_settings][:job_completion_threshold] if !jobCompletion.nil? 
jobCompletionThreshold = jobCompletion[:job_completion_threshold_time_minutes] - jobCompletionThresholdInt = cpuThreshold.to_i + jobCompletionThresholdInt = jobCompletionThreshold.to_i if jobCompletionThresholdInt.kind_of? Integer @jobCompletionThresholdMinutes = jobCompletionThresholdInt else From 9d8110d5931c4e6e46044916cd577c09106ead37 Mon Sep 17 00:00:00 2001 From: Rashmi Chandrashekar Date: Thu, 15 Apr 2021 18:43:11 -0700 Subject: [PATCH 05/11] hours fix --- source/plugins/ruby/MdmMetricsGenerator.rb | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index 6d0bd3467..0be48a7aa 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -97,15 +97,17 @@ def appendPodMetrics(records, metricName, metricHash, batch_time, metricsTemplat podNamespaceDimValue = key_elements[1] # Special handling for jobs since we need to send the threshold as a dimension as it is configurable - metric_threshold_hash = getContainerResourceUtilizationThresholds if metricName == Constants::MDM_STALE_COMPLETED_JOB_COUNT + metric_threshold_hash = getContainerResourceUtilizationThresholds + #Converting this to hours since we already have olderThanHours dimension. 
+ jobCompletionThresholdHours = metric_threshold_hash[Constants::JOB_COMPLETION_TIME] / 60 record = metricsTemplate % { timestamp: batch_time, metricName: metricName, controllerNameDimValue: podControllerNameDimValue, namespaceDimValue: podNamespaceDimValue, containerCountMetricValue: value, - jobCompletionThreshold: metric_threshold_hash[Constants::JOB_COMPLETION_TIME], + jobCompletionThreshold: jobCompletionThresholdHours, } else record = metricsTemplate % { From 418efa77dafe4a0862735665224c619ed2ffb710 Mon Sep 17 00:00:00 2001 From: Rashmi Chandrashekar Date: Thu, 15 Apr 2021 18:55:27 -0700 Subject: [PATCH 06/11] bug fix --- source/plugins/ruby/MdmMetricsGenerator.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index 0be48a7aa..591cd5903 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -100,7 +100,7 @@ def appendPodMetrics(records, metricName, metricHash, batch_time, metricsTemplat if metricName == Constants::MDM_STALE_COMPLETED_JOB_COUNT metric_threshold_hash = getContainerResourceUtilizationThresholds #Converting this to hours since we already have olderThanHours dimension. 
- jobCompletionThresholdHours = metric_threshold_hash[Constants::JOB_COMPLETION_TIME] / 60 + jobCompletionThresholdHours = metric_threshold_hash[Constants::JOB_COMPLETION_TIME] / 60.0 record = metricsTemplate % { timestamp: batch_time, metricName: metricName, From f2e828117c5a46efc29e3010a597c936fd1773fc Mon Sep 17 00:00:00 2001 From: Rashmi Chandrashekar Date: Thu, 15 Apr 2021 19:07:31 -0700 Subject: [PATCH 07/11] bug fix --- source/plugins/ruby/KubernetesApiClient.rb | 4 ++-- source/plugins/ruby/in_kube_podinventory.rb | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index e223dc0b8..571fc0987 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -502,8 +502,8 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle end end end - # reset time outside pod iterator - if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + # reset time outside pod iterator and after the last metric call from KubePodInventory + if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) && (metricNametoReturn == "memoryLimitBytes") @@telemetryTimeTracker = DateTime.now.to_time.to_i end end diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 5256eb159..ae4afccf8 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -250,6 +250,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc end #container perf records + # The order of calls below has impact on telemetry in methos getContainerResourceRequestsAndLimits. Make sure to update the check if this order of calls id changed. 
containerMetricDataItems = [] containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "cpu", "cpuRequestNanoCores", batchTime)) containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "memory", "memoryRequestBytes", batchTime)) From 1e9689c9eedd74ce26ab0cc8d68d92003c7dbab6 Mon Sep 17 00:00:00 2001 From: Rashmi Chandrashekar Date: Mon, 19 Apr 2021 17:28:09 -0700 Subject: [PATCH 08/11] fix --- source/plugins/ruby/KubernetesApiClient.rb | 30 ++++++++++++++------- source/plugins/ruby/in_kube_podinventory.rb | 1 - 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 571fc0987..98347d272 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -32,6 +32,7 @@ class KubernetesApiClient @@NodeMetrics = Hash.new @@WinNodeArray = [] @@telemetryTimeTracker = DateTime.now.to_time.to_i + @@resourceLimitsTelemetryHash = {} def initialize end @@ -463,13 +464,26 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle #Telemetry about omsagent requests and limits begin if (podName.downcase.start_with?("omsagent-") && podNameSpace.eql?("kube-system") && containerName.downcase.start_with?("omsagent")) - if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + nodePodContainerKey = [nodeName, podName, containerName, metricNametoReturn].join("~~") + @@resourceLimitsTelemetryHash[nodePodContainerKey] = metricValue + end + if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + @@resourceLimitsTelemetryHash.each { |key, value| + keyElements = key.split("~~") + if keyElements.length != 4 + next + end + + # get dimension values by key telemetryProps = {} - telemetryProps["PodName"] = podName - telemetryProps["ContainerName"] = containerName - 
telemetryProps["Computer"] = nodeName - ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) - end + telemetryProps["Computer"] = keyElements[0] + telemetryProps["PodName"] = keyElements[1] + telemetryProps["ContainerName"] = keyElements[2] + metricNameFromKey = keyElements[3] + ApplicationInsightsUtility.sendMetricTelemetry(metricNameFromKey, value, telemetryProps) + } + @@telemetryTimeTracker = DateTime.now.to_time.to_i + @@resourceLimitsTelemetryHash = {} end rescue => errorStr $log.warn("Exception while generating Telemetry from getContainerResourceRequestsAndLimits failed: #{errorStr} for metric #{metricNameToCollect}") @@ -502,10 +516,6 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle end end end - # reset time outside pod iterator and after the last metric call from KubePodInventory - if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) && (metricNametoReturn == "memoryLimitBytes") - @@telemetryTimeTracker = DateTime.now.to_time.to_i - end end rescue => error @Log.warn("getcontainerResourceRequestsAndLimits failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index ae4afccf8..5256eb159 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -250,7 +250,6 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc end #container perf records - # The order of calls below has impact on telemetry in methos getContainerResourceRequestsAndLimits. Make sure to update the check if this order of calls id changed. 
containerMetricDataItems = [] containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "cpu", "cpuRequestNanoCores", batchTime)) containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "memory", "memoryRequestBytes", batchTime)) From 117e7c9bf15c5242c81505e07b35eb92ac8395c5 Mon Sep 17 00:00:00 2001 From: Rashmi Chandrashekar Date: Tue, 20 Apr 2021 18:04:12 -0700 Subject: [PATCH 09/11] updating fbit settings --- build/linux/installer/conf/td-agent-bit-prom-side-car.conf | 2 +- kubernetes/omsagent.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build/linux/installer/conf/td-agent-bit-prom-side-car.conf b/build/linux/installer/conf/td-agent-bit-prom-side-car.conf index 339e509b0..b37309455 100644 --- a/build/linux/installer/conf/td-agent-bit-prom-side-car.conf +++ b/build/linux/installer/conf/td-agent-bit-prom-side-car.conf @@ -30,7 +30,7 @@ Port 25229 Chunk_Size 1m Buffer_Size 1m - Mem_Buf_Limit 20m + Mem_Buf_Limit 200m [OUTPUT] Name oms diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 206d9a8f0..505388665 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -451,7 +451,7 @@ spec: resources: limits: cpu: 500m - memory: 400Mi + memory: 1Gi requests: cpu: 75m memory: 225Mi From 67aa1eddbe567ab96c494a605387bcc8fae60b98 Mon Sep 17 00:00:00 2001 From: Rashmi Chandrashekar Date: Tue, 20 Apr 2021 19:49:59 -0700 Subject: [PATCH 10/11] adding precision --- source/plugins/ruby/MdmMetricsGenerator.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index f2aa92c14..6641456af 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -111,7 +111,7 @@ def appendPodMetrics(records, metricName, metricHash, batch_time, metricsTemplat if metricName == 
Constants::MDM_STALE_COMPLETED_JOB_COUNT metric_threshold_hash = getContainerResourceUtilizationThresholds #Converting this to hours since we already have olderThanHours dimension. - jobCompletionThresholdHours = metric_threshold_hash[Constants::JOB_COMPLETION_TIME] / 60.0 + jobCompletionThresholdHours = (metric_threshold_hash[Constants::JOB_COMPLETION_TIME] / 60.0).round(2) record = metricsTemplate % { timestamp: batch_time, metricName: metricName, From 515e1a4524edee197549f025624c77cbb88983f2 Mon Sep 17 00:00:00 2001 From: Rashmi Chandrashekar Date: Thu, 22 Apr 2021 11:38:03 -0700 Subject: [PATCH 11/11] increase chunk size to 10m --- build/linux/installer/conf/td-agent-bit-prom-side-car.conf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build/linux/installer/conf/td-agent-bit-prom-side-car.conf b/build/linux/installer/conf/td-agent-bit-prom-side-car.conf index 1db269a79..8a69f7995 100644 --- a/build/linux/installer/conf/td-agent-bit-prom-side-car.conf +++ b/build/linux/installer/conf/td-agent-bit-prom-side-car.conf @@ -29,8 +29,8 @@ Tag oms.container.perf.telegraf.* Listen 0.0.0.0 Port 25229 - Chunk_Size 1m - Buffer_Size 1m + Chunk_Size 10m + Buffer_Size 10m Mem_Buf_Limit 200m [OUTPUT]