From 2e45db7f5d00c81f76a4d3dfc3cf5b54d77eb712 Mon Sep 17 00:00:00 2001 From: Rashmi Chandrashekar Date: Wed, 14 Apr 2021 15:29:37 -0700 Subject: [PATCH 01/11] job first stab --- .../scripts/tomlparser-mdm-metrics-config.rb | 21 +++++++++++ kubernetes/container-azm-ms-agentconfig.yaml | 5 +++ source/plugins/ruby/MdmAlertTemplates.rb | 5 ++- source/plugins/ruby/MdmMetricsGenerator.rb | 36 +++++++++++++++---- source/plugins/ruby/constants.rb | 9 ++--- source/plugins/ruby/podinventory_to_mdm.rb | 3 +- 6 files changed, 64 insertions(+), 15 deletions(-) diff --git a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb index 345c51633..4e9e5000d 100644 --- a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb +++ b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb @@ -13,6 +13,7 @@ @percentageMemoryRssThreshold = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD @percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD +@jobCompletionThresholdMinutes = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES # Use parser to parse the configmap toml file to a ruby structure def parseConfigMap @@ -101,6 +102,25 @@ def populateSettingValuesFromConfigMap(parsedConfig) ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for PV utilization - #{errorStr}, using defaults, please check config map for errors") @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD end + + # Get mdm metrics config settings for job completion + begin + jobCompletion = parsedConfig[:alertable_metrics_configuration_settings][:job_completion_threshold] + if !jobCompletion.nil? 
+ jobCompletionThreshold = jobCompletion[:job_completion_threshold_time_minutes] + jobCompletionThresholdInt = cpuThreshold.to_i + if jobCompletionThresholdInt.kind_of? Integer + @jobCompletionThresholdMinutes = jobCompletionThresholdInt + else + puts "config::Non interger value or value not convertible to integer specified for job completion threshold, using default " + @jobCompletionThresholdMinutes = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES + end + puts "config::Using config map settings for MDM metric configuration settings for job completion" + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for job completion - #{errorStr}, using defaults, please check config map for errors") + @jobCompletionThresholdMinutes = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES + end end end @@ -125,6 +145,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) file.write("export AZMON_ALERT_CONTAINER_MEMORY_RSS_THRESHOLD=#{@percentageMemoryRssThreshold}\n") file.write("export AZMON_ALERT_CONTAINER_MEMORY_WORKING_SET_THRESHOLD=\"#{@percentageMemoryWorkingSetThreshold}\"\n") file.write("export AZMON_ALERT_PV_USAGE_THRESHOLD=#{@percentagePVUsageThreshold}\n") + file.write("export AZMON_ALERT_JOB_COMPLETION_TIME_THRESHOLD=#{@jobCompletionThresholdMinutes}\n") # Close file after writing all MDM setting environment variables file.close puts "****************End MDM Metrics Config Processing********************" diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index e38d9b4ab..543f270c1 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -126,6 +126,11 @@ data: [alertable_metrics_configuration_settings.pv_utilization_thresholds] # Threshold for persistent volume usage bytes, metric will be sent only when persistent volume utilization exceeds or 
becomes equal to the following percentage pv_usage_threshold_percentage = 60.0 + + # Alertable metrics configuration settings for completed jobs count + [alertable_metrics_configuration_settings.job_completion_threshold] + # Threshold for completed job count , metric will be sent only for those jobs which were completed earlier than the following threshold + job_completion_threshold_time_minutes = 360 integrations: |- [integrations.azure_network_policy_manager] collect_basic_metrics = false diff --git a/source/plugins/ruby/MdmAlertTemplates.rb b/source/plugins/ruby/MdmAlertTemplates.rb index ef63cf219..7d167f420 100644 --- a/source/plugins/ruby/MdmAlertTemplates.rb +++ b/source/plugins/ruby/MdmAlertTemplates.rb @@ -28,7 +28,7 @@ class MdmAlertTemplates } }' - Stable_job_metrics_template = ' + Stable_job_metrics_template = ' { "time": "%{timestamp}", "data": { @@ -45,7 +45,7 @@ class MdmAlertTemplates "dimValues": [ "%{controllerNameDimValue}", "%{namespaceDimValue}", - "6" + "%{jobCompletionThreshold}" ], "min": %{containerCountMetricValue}, "max": %{containerCountMetricValue}, @@ -123,7 +123,6 @@ class MdmAlertTemplates } }' - Node_resource_metrics_template = ' { "time": "%{timestamp}", diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index 12d462e44..6d0bd3467 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -96,13 +96,26 @@ def appendPodMetrics(records, metricName, metricHash, batch_time, metricsTemplat podControllerNameDimValue = key_elements[0] podNamespaceDimValue = key_elements[1] - record = metricsTemplate % { - timestamp: batch_time, - metricName: metricName, - controllerNameDimValue: podControllerNameDimValue, - namespaceDimValue: podNamespaceDimValue, - containerCountMetricValue: value, - } + # Special handling for jobs since we need to send the threshold as a dimension as it is configurable + metric_threshold_hash = 
getContainerResourceUtilizationThresholds + if metricName == Constants::MDM_STALE_COMPLETED_JOB_COUNT + record = metricsTemplate % { + timestamp: batch_time, + metricName: metricName, + controllerNameDimValue: podControllerNameDimValue, + namespaceDimValue: podNamespaceDimValue, + containerCountMetricValue: value, + jobCompletionThreshold: metric_threshold_hash[Constants::JOB_COMPLETION_TIME], + } + else + record = metricsTemplate % { + timestamp: batch_time, + metricName: metricName, + controllerNameDimValue: podControllerNameDimValue, + namespaceDimValue: podNamespaceDimValue, + containerCountMetricValue: value, + } + end records.push(Yajl::Parser.parse(StringIO.new(record))) } else @@ -129,9 +142,11 @@ def flushPodMdmMetricTelemetry staleJobHashValues = @stale_job_count_hash.values staleJobMetricCount = staleJobHashValues.inject(0) { |sum, x| sum + x } + metric_threshold_hash = getContainerResourceUtilizationThresholds properties["ContainerRestarts"] = containerRestartMetricCount properties["OomKilledContainers"] = oomKilledContainerMetricCount properties["OldCompletedJobs"] = staleJobMetricCount + properties["JobCompletionThresholdTimeInMinutes"] = metric_threshold_hash[Constants::JOB_COMPLETION_TIME] ApplicationInsightsUtility.sendCustomEvent(Constants::CONTAINER_METRICS_HEART_BEAT_EVENT, properties) ApplicationInsightsUtility.sendCustomEvent(Constants::POD_READY_PERCENTAGE_HEART_BEAT_EVENT, {}) rescue => errorStr @@ -408,6 +423,7 @@ def getContainerResourceUtilizationThresholds metric_threshold_hash[Constants::MEMORY_RSS_BYTES] = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD metric_threshold_hash[Constants::PV_USED_BYTES] = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD + metric_threshold_hash[Constants::JOB_COMPLETION_TIME] = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES cpuThreshold = ENV["AZMON_ALERT_CONTAINER_CPU_THRESHOLD"] if 
!cpuThreshold.nil? && !cpuThreshold.empty? @@ -433,6 +449,12 @@ def getContainerResourceUtilizationThresholds pvUsagePercentageThresholdFloat = (pvUsagePercentageThreshold.to_f).round(2) metric_threshold_hash[Constants::PV_USED_BYTES] = pvUsagePercentageThresholdFloat end + + jobCompletionTimeThreshold = ENV["AZMON_ALERT_JOB_COMPLETION_TIME_THRESHOLD"] + if !jobCompletionTimeThreshold.nil? && !jobCompletionTimeThreshold.empty? + jobCompletionTimeThresholdInt = jobCompletionTimeThreshold.to_i + metric_threshold_hash[Constants::JOB_COMPLETION_TIME] = jobCompletionTimeThresholdInt + end rescue => errorStr @log.info "Error in getContainerResourceUtilizationThresholds: #{errorStr}" ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index cf41900dc..5adb4b5c0 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -65,21 +65,22 @@ class Constants MEMORY_WORKING_SET_BYTES = "memoryWorkingSetBytes" MEMORY_RSS_BYTES = "memoryRssBytes" PV_USED_BYTES = "pvUsedBytes" + JOB_COMPLETION_TIME = "completedJobTimeMinutes" DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD = 95.0 DEFAULT_MDM_MEMORY_RSS_THRESHOLD = 95.0 DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD = 95.0 DEFAULT_MDM_PV_UTILIZATION_THRESHOLD = 60.0 + DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES = 360 CONTROLLER_KIND_JOB = "job" CONTAINER_TERMINATION_REASON_COMPLETED = "completed" CONTAINER_STATE_TERMINATED = "terminated" - STALE_JOB_TIME_IN_MINUTES = 360 TELEGRAF_DISK_METRICS = "container.azm.ms/disk" OMSAGENT_ZERO_FILL = "omsagent" KUBESYSTEM_NAMESPACE_ZERO_FILL = "kube-system" VOLUME_NAME_ZERO_FILL = "-" - PV_TYPES =["awsElasticBlockStore", "azureDisk", "azureFile", "cephfs", "cinder", "csi", "fc", "flexVolume", - "flocker", "gcePersistentDisk", "glusterfs", "hostPath", "iscsi", "local", "nfs", - "photonPersistentDisk", "portworxVolume", "quobyte", "rbd", "scaleIO", "storageos", "vsphereVolume"] + PV_TYPES = 
["awsElasticBlockStore", "azureDisk", "azureFile", "cephfs", "cinder", "csi", "fc", "flexVolume", + "flocker", "gcePersistentDisk", "glusterfs", "hostPath", "iscsi", "local", "nfs", + "photonPersistentDisk", "portworxVolume", "quobyte", "rbd", "scaleIO", "storageos", "vsphereVolume"] #Telemetry constants CONTAINER_METRICS_HEART_BEAT_EVENT = "ContainerMetricsMdmHeartBeatEvent" diff --git a/source/plugins/ruby/podinventory_to_mdm.rb b/source/plugins/ruby/podinventory_to_mdm.rb index 77370e284..d9cb71bd4 100644 --- a/source/plugins/ruby/podinventory_to_mdm.rb +++ b/source/plugins/ruby/podinventory_to_mdm.rb @@ -88,6 +88,7 @@ def initialize() @pod_count_by_phase = {} @pod_uids = {} @process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability + @metric_threshold_hash = MdmMetricsGenerator.getContainerResourceUtilizationThresholds @log.debug "After check_custom_metrics_availability process_incoming_stream #{@process_incoming_stream}" @log.debug { "Starting podinventory_to_mdm plugin" } end @@ -259,7 +260,7 @@ def process_record_for_terminated_job_metric(podControllerNameDimValue, podNames if !containerFinishedTime.nil? && !containerFinishedTime.empty? 
finishedTimeParsed = Time.parse(containerFinishedTime) # Check to see if job was completed 6 hours ago/STALE_JOB_TIME_IN_MINUTES - if ((Time.now - finishedTimeParsed) / 60) > Constants::STALE_JOB_TIME_IN_MINUTES + if ((Time.now - finishedTimeParsed) / 60) > @metric_threshold_hash[Constants::JOB_COMPLETION_TIME] MdmMetricsGenerator.generateStaleJobCountMetrics(podControllerNameDimValue, podNamespaceDimValue) end From 33bf7e042275d149844a972b7a4bd1bb8c95e313 Mon Sep 17 00:00:00 2001 From: Rashmi Chandrashekar Date: Wed, 14 Apr 2021 18:26:10 -0700 Subject: [PATCH 02/11] telemetry changes --- source/plugins/ruby/KubernetesApiClient.rb | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index c5a363741..944561707 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -31,6 +31,7 @@ class KubernetesApiClient @@TokenStr = nil @@NodeMetrics = Hash.new @@WinNodeArray = [] + @@telemetryTimeTracker = DateTime.now.to_time.to_i def initialize end @@ -403,9 +404,12 @@ def getPodUid(podNameSpace, podMetadata) def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) metricItems = [] + timeDifference = (DateTime.now.to_time.to_i - @@telemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 begin clusterId = getClusterId podNameSpace = pod["metadata"]["namespace"] + podName = pod["metadata"]["name"] podUid = getPodUid(podNameSpace, pod["metadata"]) if podUid.nil? 
return metricItems @@ -456,6 +460,20 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle metricProps["Collections"].push(metricCollections) metricItem["DataItems"].push(metricProps) metricItems.push(metricItem) + #Telemetry about omsagent requests and limits + begin + if (podName.downcase.start_with?("omsagent-") && podNameSpace.eql?("kube-system") && containerName.downcase.start_with?("omsagent")) + if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + telemetryProps = {} + telemetryProps["PodName"] = podName + telemetryProps["ContainerName"] = containerName + telemetryProps["Computer"] = nodeName + ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) + end + end + rescue => errorStr + $log.warn("Exception while generating Telemetry from getContainerResourceRequestsAndLimits failed: #{errorStr} for metric #{metricNameToCollect}") + end #No container level limit for the given metric, so default to node level limit else nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect @@ -791,7 +809,7 @@ def getKubeAPIServerUrl def getKubeServicesInventoryRecords(serviceList, batchTime = Time.utc.iso8601) kubeServiceRecords = [] begin - if (!serviceList.nil? && !serviceList.empty? && serviceList.key?("items") && !serviceList["items"].nil? && !serviceList["items"].empty? ) + if (!serviceList.nil? && !serviceList.empty? && serviceList.key?("items") && !serviceList["items"].nil? && !serviceList["items"].empty?) 
servicesCount = serviceList["items"].length @Log.info("KubernetesApiClient::getKubeServicesInventoryRecords : number of services in serviceList #{servicesCount} @ #{Time.now.utc.iso8601}") serviceList["items"].each do |item| From 89287845c15305b13e877b5ae46203743147ad51 Mon Sep 17 00:00:00 2001 From: Rashmi Chandrashekar Date: Thu, 15 Apr 2021 16:26:31 -0700 Subject: [PATCH 03/11] changes --- source/plugins/ruby/KubernetesApiClient.rb | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 944561707..e223dc0b8 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -502,6 +502,10 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle end end end + # reset time outside pod iterator + if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + @@telemetryTimeTracker = DateTime.now.to_time.to_i + end end rescue => error @Log.warn("getcontainerResourceRequestsAndLimits failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") From 6c71cbe199d75258e9b86faa977df069a9b8371a Mon Sep 17 00:00:00 2001 From: Rashmi Chandrashekar Date: Thu, 15 Apr 2021 17:07:44 -0700 Subject: [PATCH 04/11] bug fix --- build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb index 4e9e5000d..5ce5d79d2 100644 --- a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb +++ b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb @@ -108,7 +108,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) jobCompletion = parsedConfig[:alertable_metrics_configuration_settings][:job_completion_threshold] if !jobCompletion.nil? 
jobCompletionThreshold = jobCompletion[:job_completion_threshold_time_minutes] - jobCompletionThresholdInt = cpuThreshold.to_i + jobCompletionThresholdInt = jobCompletionThreshold.to_i if jobCompletionThresholdInt.kind_of? Integer @jobCompletionThresholdMinutes = jobCompletionThresholdInt else From 9d8110d5931c4e6e46044916cd577c09106ead37 Mon Sep 17 00:00:00 2001 From: Rashmi Chandrashekar Date: Thu, 15 Apr 2021 18:43:11 -0700 Subject: [PATCH 05/11] hours fix --- source/plugins/ruby/MdmMetricsGenerator.rb | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index 6d0bd3467..0be48a7aa 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -97,15 +97,17 @@ def appendPodMetrics(records, metricName, metricHash, batch_time, metricsTemplat podNamespaceDimValue = key_elements[1] # Special handling for jobs since we need to send the threshold as a dimension as it is configurable - metric_threshold_hash = getContainerResourceUtilizationThresholds if metricName == Constants::MDM_STALE_COMPLETED_JOB_COUNT + metric_threshold_hash = getContainerResourceUtilizationThresholds + #Converting this to hours since we already have olderThanHours dimension. 
+ jobCompletionThresholdHours = metric_threshold_hash[Constants::JOB_COMPLETION_TIME] / 60 record = metricsTemplate % { timestamp: batch_time, metricName: metricName, controllerNameDimValue: podControllerNameDimValue, namespaceDimValue: podNamespaceDimValue, containerCountMetricValue: value, - jobCompletionThreshold: metric_threshold_hash[Constants::JOB_COMPLETION_TIME], + jobCompletionThreshold: jobCompletionThresholdHours, } else record = metricsTemplate % { From 418efa77dafe4a0862735665224c619ed2ffb710 Mon Sep 17 00:00:00 2001 From: Rashmi Chandrashekar Date: Thu, 15 Apr 2021 18:55:27 -0700 Subject: [PATCH 06/11] bug fix --- source/plugins/ruby/MdmMetricsGenerator.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index 0be48a7aa..591cd5903 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -100,7 +100,7 @@ def appendPodMetrics(records, metricName, metricHash, batch_time, metricsTemplat if metricName == Constants::MDM_STALE_COMPLETED_JOB_COUNT metric_threshold_hash = getContainerResourceUtilizationThresholds #Converting this to hours since we already have olderThanHours dimension. 
- jobCompletionThresholdHours = metric_threshold_hash[Constants::JOB_COMPLETION_TIME] / 60 + jobCompletionThresholdHours = metric_threshold_hash[Constants::JOB_COMPLETION_TIME] / 60.0 record = metricsTemplate % { timestamp: batch_time, metricName: metricName, From f2e828117c5a46efc29e3010a597c936fd1773fc Mon Sep 17 00:00:00 2001 From: Rashmi Chandrashekar Date: Thu, 15 Apr 2021 19:07:31 -0700 Subject: [PATCH 07/11] bug fix --- source/plugins/ruby/KubernetesApiClient.rb | 4 ++-- source/plugins/ruby/in_kube_podinventory.rb | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index e223dc0b8..571fc0987 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -502,8 +502,8 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle end end end - # reset time outside pod iterator - if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + # reset time outside pod iterator and after the last metric call from KubePodInventory + if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) && (metricNametoReturn == "memoryLimitBytes") @@telemetryTimeTracker = DateTime.now.to_time.to_i end end diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 5256eb159..ae4afccf8 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -250,6 +250,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc end #container perf records + # The order of calls below has impact on telemetry in methos getContainerResourceRequestsAndLimits. Make sure to update the check if this order of calls id changed. 
containerMetricDataItems = [] containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "cpu", "cpuRequestNanoCores", batchTime)) containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "memory", "memoryRequestBytes", batchTime)) From 1e9689c9eedd74ce26ab0cc8d68d92003c7dbab6 Mon Sep 17 00:00:00 2001 From: Rashmi Chandrashekar Date: Mon, 19 Apr 2021 17:28:09 -0700 Subject: [PATCH 08/11] fix --- source/plugins/ruby/KubernetesApiClient.rb | 30 ++++++++++++++------- source/plugins/ruby/in_kube_podinventory.rb | 1 - 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 571fc0987..98347d272 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -32,6 +32,7 @@ class KubernetesApiClient @@NodeMetrics = Hash.new @@WinNodeArray = [] @@telemetryTimeTracker = DateTime.now.to_time.to_i + @@resourceLimitsTelemetryHash = {} def initialize end @@ -463,13 +464,26 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle #Telemetry about omsagent requests and limits begin if (podName.downcase.start_with?("omsagent-") && podNameSpace.eql?("kube-system") && containerName.downcase.start_with?("omsagent")) - if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + nodePodContainerKey = [nodeName, podName, containerName, metricNametoReturn].join("~~") + @@resourceLimitsTelemetryHash[nodePodContainerKey] = metricValue + end + if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + @@resourceLimitsTelemetryHash.each { |key, value| + keyElements = key.split("~~") + if keyElements.length != 4 + next + end + + # get dimension values by key telemetryProps = {} - telemetryProps["PodName"] = podName - telemetryProps["ContainerName"] = containerName - 
telemetryProps["Computer"] = nodeName - ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) - end + telemetryProps["Computer"] = keyElements[0] + telemetryProps["PodName"] = keyElements[1] + telemetryProps["ContainerName"] = keyElements[2] + metricNameFromKey = keyElements[3] + ApplicationInsightsUtility.sendMetricTelemetry(metricNameFromKey, value, telemetryProps) + } + @@telemetryTimeTracker = DateTime.now.to_time.to_i + @@resourceLimitsTelemetryHash = {} end rescue => errorStr $log.warn("Exception while generating Telemetry from getContainerResourceRequestsAndLimits failed: #{errorStr} for metric #{metricNameToCollect}") @@ -502,10 +516,6 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle end end end - # reset time outside pod iterator and after the last metric call from KubePodInventory - if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) && (metricNametoReturn == "memoryLimitBytes") - @@telemetryTimeTracker = DateTime.now.to_time.to_i - end end rescue => error @Log.warn("getcontainerResourceRequestsAndLimits failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index ae4afccf8..5256eb159 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -250,7 +250,6 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc end #container perf records - # The order of calls below has impact on telemetry in methos getContainerResourceRequestsAndLimits. Make sure to update the check if this order of calls id changed. 
containerMetricDataItems = [] containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "cpu", "cpuRequestNanoCores", batchTime)) containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "memory", "memoryRequestBytes", batchTime)) From 117e7c9bf15c5242c81505e07b35eb92ac8395c5 Mon Sep 17 00:00:00 2001 From: Rashmi Chandrashekar Date: Tue, 20 Apr 2021 18:04:12 -0700 Subject: [PATCH 09/11] updating fbit settings --- build/linux/installer/conf/td-agent-bit-prom-side-car.conf | 2 +- kubernetes/omsagent.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build/linux/installer/conf/td-agent-bit-prom-side-car.conf b/build/linux/installer/conf/td-agent-bit-prom-side-car.conf index 339e509b0..b37309455 100644 --- a/build/linux/installer/conf/td-agent-bit-prom-side-car.conf +++ b/build/linux/installer/conf/td-agent-bit-prom-side-car.conf @@ -30,7 +30,7 @@ Port 25229 Chunk_Size 1m Buffer_Size 1m - Mem_Buf_Limit 20m + Mem_Buf_Limit 200m [OUTPUT] Name oms diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 206d9a8f0..505388665 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -451,7 +451,7 @@ spec: resources: limits: cpu: 500m - memory: 400Mi + memory: 1Gi requests: cpu: 75m memory: 225Mi From 67aa1eddbe567ab96c494a605387bcc8fae60b98 Mon Sep 17 00:00:00 2001 From: Rashmi Chandrashekar Date: Tue, 20 Apr 2021 19:49:59 -0700 Subject: [PATCH 10/11] adding precision --- source/plugins/ruby/MdmMetricsGenerator.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index f2aa92c14..6641456af 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -111,7 +111,7 @@ def appendPodMetrics(records, metricName, metricHash, batch_time, metricsTemplat if metricName == 
Constants::MDM_STALE_COMPLETED_JOB_COUNT metric_threshold_hash = getContainerResourceUtilizationThresholds #Converting this to hours since we already have olderThanHours dimension. - jobCompletionThresholdHours = metric_threshold_hash[Constants::JOB_COMPLETION_TIME] / 60.0 + jobCompletionThresholdHours = (metric_threshold_hash[Constants::JOB_COMPLETION_TIME] / 60.0).round(2) record = metricsTemplate % { timestamp: batch_time, metricName: metricName, From 515e1a4524edee197549f025624c77cbb88983f2 Mon Sep 17 00:00:00 2001 From: Rashmi Chandrashekar Date: Thu, 22 Apr 2021 11:38:03 -0700 Subject: [PATCH 11/11] increase chunk size to 10m --- build/linux/installer/conf/td-agent-bit-prom-side-car.conf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build/linux/installer/conf/td-agent-bit-prom-side-car.conf b/build/linux/installer/conf/td-agent-bit-prom-side-car.conf index 1db269a79..8a69f7995 100644 --- a/build/linux/installer/conf/td-agent-bit-prom-side-car.conf +++ b/build/linux/installer/conf/td-agent-bit-prom-side-car.conf @@ -29,8 +29,8 @@ Tag oms.container.perf.telegraf.* Listen 0.0.0.0 Port 25229 - Chunk_Size 1m - Buffer_Size 1m + Chunk_Size 10m + Buffer_Size 10m Mem_Buf_Limit 200m [OUTPUT]