Skip to content

Commit

Permalink
MDM exception aggregation (#470)
Browse files Browse the repository at this point in the history
  • Loading branch information
rashmichandrashekar authored Nov 10, 2020
1 parent aff1e13 commit ca18850
Show file tree
Hide file tree
Showing 2 changed files with 104 additions and 59 deletions.
112 changes: 57 additions & 55 deletions source/plugins/ruby/constants.rb
Original file line number Diff line number Diff line change
@@ -1,61 +1,61 @@
# frozen_string_literal: true

class Constants
INSIGHTSMETRICS_TAGS_ORIGIN = "container.azm.ms"
INSIGHTSMETRICS_TAGS_CLUSTERID = "container.azm.ms/clusterId"
INSIGHTSMETRICS_TAGS_CLUSTERNAME = "container.azm.ms/clusterName"
INSIGHTSMETRICS_TAGS_GPU_VENDOR = "gpuVendor"
INSIGHTSMETRICS_TAGS_GPU_NAMESPACE = "container.azm.ms/gpu"
INSIGHTSMETRICS_TAGS_GPU_MODEL = "gpuModel"
INSIGHTSMETRICS_TAGS_GPU_ID = "gpuId"
INSIGHTSMETRICS_TAGS_CONTAINER_NAME = "containerName"
INSIGHTSMETRICS_TAGS_CONTAINER_ID = "containerName"
INSIGHTSMETRICS_TAGS_K8SNAMESPACE = "k8sNamespace"
INSIGHTSMETRICS_TAGS_CONTROLLER_NAME = "controllerName"
INSIGHTSMETRICS_TAGS_CONTROLLER_KIND = "controllerKind"
INSIGHTSMETRICS_TAGS_POD_UID = "podUid"
INSIGTHTSMETRICS_TAGS_PV_NAMESPACE = "container.azm.ms/pv"
INSIGHTSMETRICS_TAGS_PVC_NAME = "pvcName"
INSIGHTSMETRICS_TAGS_PVC_NAMESPACE = "pvcNamespace"
INSIGHTSMETRICS_TAGS_POD_NAME = "podName"
INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES = "pvCapacityBytes"
INSIGHTSMETRICS_TAGS_VOLUME_NAME = "volumeName"
INSIGHTSMETRICS_FLUENT_TAG = "oms.api.InsightsMetrics"
REASON_OOM_KILLED = "oomkilled"
#Kubestate (common)
INSIGHTSMETRICS_TAGS_KUBESTATE_NAMESPACE = "container.azm.ms/kubestate"
INSIGHTSMETRICS_TAGS_KUBE_STATE_CREATIONTIME = "creationTime"
#Kubestate (deployments)
INSIGHTSMETRICS_METRIC_NAME_KUBE_STATE_DEPLOYMENT_STATE = "kube_deployment_status_replicas_ready"
INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_NAME = "deployment"
INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_CREATIONTIME = "creationTime"
INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STRATEGY = "deploymentStrategy"
INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_SPEC_REPLICAS = "spec_replicas"
INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STATUS_REPLICAS_UPDATED = "status_replicas_updated"
INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STATUS_REPLICAS_AVAILABLE = "status_replicas_available"
#Kubestate (HPA)
INSIGHTSMETRICS_METRIC_NAME_KUBE_STATE_HPA_STATE = "kube_hpa_status_current_replicas"
INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_NAME = "hpa"
INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_MAX_REPLICAS = "spec_max_replicas"
INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_MIN_REPLICAS = "spec_min_replicas"
INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_SCALE_TARGET_KIND = "targetKind"
INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_SCALE_TARGET_NAME = "targetName"
INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_STATUS_DESIRED_REPLICAS = "status_desired_replicas"
INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_STATUS_LAST_SCALE_TIME = "lastScaleTime"
# MDM Metric names
MDM_OOM_KILLED_CONTAINER_COUNT = "oomKilledContainerCount"
MDM_CONTAINER_RESTART_COUNT = "restartingContainerCount"
MDM_POD_READY_PERCENTAGE = "podReadyPercentage"
MDM_STALE_COMPLETED_JOB_COUNT = "completedJobsCount"
MDM_DISK_USED_PERCENTAGE = "diskUsedPercentage"
MDM_CONTAINER_CPU_UTILIZATION_METRIC = "cpuExceededPercentage"
MDM_CONTAINER_MEMORY_RSS_UTILIZATION_METRIC = "memoryRssExceededPercentage"
MDM_CONTAINER_MEMORY_WORKING_SET_UTILIZATION_METRIC = "memoryWorkingSetExceededPercentage"
MDM_PV_UTILIZATION_METRIC = "pvUsageExceededPercentage"
MDM_NODE_CPU_USAGE_PERCENTAGE = "cpuUsagePercentage"
MDM_NODE_MEMORY_RSS_PERCENTAGE = "memoryRssPercentage"
MDM_NODE_MEMORY_WORKING_SET_PERCENTAGE = "memoryWorkingSetPercentage"
INSIGHTSMETRICS_TAGS_ORIGIN = "container.azm.ms"
INSIGHTSMETRICS_TAGS_CLUSTERID = "container.azm.ms/clusterId"
INSIGHTSMETRICS_TAGS_CLUSTERNAME = "container.azm.ms/clusterName"
INSIGHTSMETRICS_TAGS_GPU_VENDOR = "gpuVendor"
INSIGHTSMETRICS_TAGS_GPU_NAMESPACE = "container.azm.ms/gpu"
INSIGHTSMETRICS_TAGS_GPU_MODEL = "gpuModel"
INSIGHTSMETRICS_TAGS_GPU_ID = "gpuId"
INSIGHTSMETRICS_TAGS_CONTAINER_NAME = "containerName"
INSIGHTSMETRICS_TAGS_CONTAINER_ID = "containerName"
INSIGHTSMETRICS_TAGS_K8SNAMESPACE = "k8sNamespace"
INSIGHTSMETRICS_TAGS_CONTROLLER_NAME = "controllerName"
INSIGHTSMETRICS_TAGS_CONTROLLER_KIND = "controllerKind"
INSIGHTSMETRICS_TAGS_POD_UID = "podUid"
INSIGTHTSMETRICS_TAGS_PV_NAMESPACE = "container.azm.ms/pv"
INSIGHTSMETRICS_TAGS_PVC_NAME = "pvcName"
INSIGHTSMETRICS_TAGS_PVC_NAMESPACE = "pvcNamespace"
INSIGHTSMETRICS_TAGS_POD_NAME = "podName"
INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES = "pvCapacityBytes"
INSIGHTSMETRICS_TAGS_VOLUME_NAME = "volumeName"
INSIGHTSMETRICS_FLUENT_TAG = "oms.api.InsightsMetrics"
REASON_OOM_KILLED = "oomkilled"
#Kubestate (common)
INSIGHTSMETRICS_TAGS_KUBESTATE_NAMESPACE = "container.azm.ms/kubestate"
INSIGHTSMETRICS_TAGS_KUBE_STATE_CREATIONTIME = "creationTime"
#Kubestate (deployments)
INSIGHTSMETRICS_METRIC_NAME_KUBE_STATE_DEPLOYMENT_STATE = "kube_deployment_status_replicas_ready"
INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_NAME = "deployment"
INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_CREATIONTIME = "creationTime"
INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STRATEGY = "deploymentStrategy"
INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_SPEC_REPLICAS = "spec_replicas"
INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STATUS_REPLICAS_UPDATED = "status_replicas_updated"
INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STATUS_REPLICAS_AVAILABLE = "status_replicas_available"
#Kubestate (HPA)
INSIGHTSMETRICS_METRIC_NAME_KUBE_STATE_HPA_STATE = "kube_hpa_status_current_replicas"
INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_NAME = "hpa"
INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_MAX_REPLICAS = "spec_max_replicas"
INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_MIN_REPLICAS = "spec_min_replicas"
INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_SCALE_TARGET_KIND = "targetKind"
INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_SCALE_TARGET_NAME = "targetName"
INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_STATUS_DESIRED_REPLICAS = "status_desired_replicas"

INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_STATUS_LAST_SCALE_TIME = "lastScaleTime"
# MDM Metric names
MDM_OOM_KILLED_CONTAINER_COUNT = "oomKilledContainerCount"
MDM_CONTAINER_RESTART_COUNT = "restartingContainerCount"
MDM_POD_READY_PERCENTAGE = "podReadyPercentage"
MDM_STALE_COMPLETED_JOB_COUNT = "completedJobsCount"
MDM_DISK_USED_PERCENTAGE = "diskUsedPercentage"
MDM_CONTAINER_CPU_UTILIZATION_METRIC = "cpuExceededPercentage"
MDM_CONTAINER_MEMORY_RSS_UTILIZATION_METRIC = "memoryRssExceededPercentage"
MDM_CONTAINER_MEMORY_WORKING_SET_UTILIZATION_METRIC = "memoryWorkingSetExceededPercentage"
MDM_PV_UTILIZATION_METRIC = "pvUsageExceededPercentage"
MDM_NODE_CPU_USAGE_PERCENTAGE = "cpuUsagePercentage"
MDM_NODE_MEMORY_RSS_PERCENTAGE = "memoryRssPercentage"
MDM_NODE_MEMORY_WORKING_SET_PERCENTAGE = "memoryWorkingSetPercentage"

CONTAINER_TERMINATED_RECENTLY_IN_MINUTES = 5
OBJECT_NAME_K8S_CONTAINER = "K8SContainer"
Expand Down Expand Up @@ -88,6 +88,8 @@ class Constants
KUBE_STATE_TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 15
ZERO_FILL_METRICS_INTERVAL_IN_MINUTES = 30
MDM_TIME_SERIES_FLUSHED_IN_LAST_HOUR = "MdmTimeSeriesFlushedInLastHour"
MDM_EXCEPTION_TELEMETRY_METRIC = "AKSCustomMetricsMdmExceptions"
MDM_EXCEPTIONS_METRIC_FLUSH_INTERVAL = 30

#Pod Statuses
POD_STATUS_TERMINATING = "Terminating"
Expand Down
51 changes: 47 additions & 4 deletions source/plugins/ruby/out_mdm.rb
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@ def initialize
@cluster_identity = nil
@isArcK8sCluster = false
@get_access_token_backoff_expiry = Time.now

@mdm_exceptions_hash = {}
@mdm_exceptions_count = 0
@mdm_exception_telemetry_time_tracker = DateTime.now.to_time.to_i
end

def configure(conf)
Expand Down Expand Up @@ -221,10 +225,49 @@ def format(tag, time, record)
end
end

# Records one occurrence of +error+ in the exception tally kept on this instance.
#
# The error is keyed by its +to_s+ text in @mdm_exceptions_hash (message => count),
# and @mdm_exceptions_count is incremented once per call regardless of message.
# A StandardError raised while tallying is logged and forwarded to
# ApplicationInsightsUtility.sendExceptionTelemetry instead of propagating.
#
# @param error [#to_s] the exception (or any object) whose string form is the tally key
def exception_aggregator(error)
  message = error.to_s
  seen_so_far = @mdm_exceptions_hash[message]
  # First sighting of this message starts at 1; later sightings add to the stored count.
  @mdm_exceptions_hash[message] = seen_so_far.nil? ? 1 : seen_so_far + 1
  # Running total across all messages; flush_mdm_exception_telemetry sends it as the metric value.
  @mdm_exceptions_count += 1
rescue => aggregation_error
  @log.info "Error in MDM exception_aggregator method: #{aggregation_error}"
  ApplicationInsightsUtility.sendExceptionTelemetry(aggregation_error)
end

# Sends the aggregated MDM exception tally as a single metric once the flush
# interval has elapsed, then starts a fresh aggregation window.
#
# Nothing is sent until at least Constants::MDM_EXCEPTIONS_METRIC_FLUSH_INTERVAL
# whole minutes separate the current time from @mdm_exception_telemetry_time_tracker.
# A StandardError raised here is logged and forwarded to
# ApplicationInsightsUtility.sendExceptionTelemetry instead of propagating.
def flush_mdm_exception_telemetry
  now_epoch_seconds = DateTime.now.to_time.to_i
  # Integer division: only whole elapsed minutes count towards the interval.
  elapsed_minutes = (now_epoch_seconds - @mdm_exception_telemetry_time_tracker).abs / 60
  return if elapsed_minutes < Constants::MDM_EXCEPTIONS_METRIC_FLUSH_INTERVAL

  properties = {
    "ExceptionsHashForFlushInterval" => @mdm_exceptions_hash.to_json,
    "FlushInterval" => Constants::MDM_EXCEPTIONS_METRIC_FLUSH_INTERVAL,
  }
  ApplicationInsightsUtility.sendMetricTelemetry(Constants::MDM_EXCEPTION_TELEMETRY_METRIC, @mdm_exceptions_count, properties)
  # Reset happens only after the send, so a send that raises leaves the tally untouched.
  @mdm_exceptions_count = 0
  @mdm_exceptions_hash = {}
  @mdm_exception_telemetry_time_tracker = DateTime.now.to_time.to_i
rescue => flush_error
  @log.info "Error in flush_mdm_exception_telemetry method: #{flush_error}"
  ApplicationInsightsUtility.sendExceptionTelemetry(flush_error)
end

# This method is called every flush interval. Send the buffer chunk to MDM.
# 'chunk' is a buffer chunk that includes multiple formatted records
def write(chunk)
begin
# Adding this before trying to flush out metrics, since adding after can lead to metrics never being sent
flush_mdm_exception_telemetry
if (!@first_post_attempt_made || (Time.now > @last_post_attempt_time + retry_mdm_post_wait_minutes * 60)) && @can_send_data_to_mdm
post_body = []
chunk.msgpack_each { |(tag, record)|
Expand All @@ -247,7 +290,8 @@ def write(chunk)
end
end
rescue Exception => e
ApplicationInsightsUtility.sendExceptionTelemetry(e)
# Adding exceptions to hash to aggregate and send telemetry for all write errors
exception_aggregator(e)
@log.info "Exception when writing to MDM: #{e}"
raise e
end
Expand Down Expand Up @@ -282,7 +326,6 @@ def send_to_mdm(post_body)
else
@log.info "Failed to Post Metrics to MDM : #{e} Response: #{response}"
end
#@log.info "MDM request : #{post_body}"
@log.debug_backtrace(e.backtrace)
if !response.code.empty? && response.code == 403.to_s
@log.info "Response Code #{response.code} Updating @last_post_attempt_time"
Expand All @@ -297,15 +340,15 @@ def send_to_mdm(post_body)
@log.info "HTTPServerException when POSTing Metrics to MDM #{e} Response: #{response}"
raise e
end
# Adding exceptions to hash to aggregate and send telemetry for all 400 error codes
exception_aggregator(e)
rescue Errno::ETIMEDOUT => e
@log.info "Timed out when POSTing Metrics to MDM : #{e} Response: #{response}"
@log.debug_backtrace(e.backtrace)
ApplicationInsightsUtility.sendExceptionTelemetry(e)
raise e
rescue Exception => e
@log.info "Exception POSTing Metrics to MDM : #{e} Response: #{response}"
@log.debug_backtrace(e.backtrace)
ApplicationInsightsUtility.sendExceptionTelemetry(e)
raise e
end
end
Expand Down

0 comments on commit ca18850

Please sign in to comment.