From c7075394ce704193a0198a5e7b93a5b6d7186054 Mon Sep 17 00:00:00 2001
From: Ganga Mahesh Siddem
Date: Fri, 21 May 2021 18:26:24 -0700
Subject: [PATCH] Gangams/aad stage2 full switch to mdsd (#559)

* full switch to mdsd, upgrade to ruby v1 & omsagent removal
* add odsdirect as fallback option
* cleanup
* cleanup
* move customRegion to stage3
* updates related to containerlog route
* make xml eventschema consistent
* add buffer settings
* address HTTPServerException deprecation in ruby 2.6
* update to official mdsd version
* fix log message issue
* fix pr feedback
* get rid of unused code from omscommon
* fix pr feedback
* fix pr feedback
* clean up
* clean up
* fix missing conf
---
 build/common/installer/scripts/tomlparser.rb  |   16 +-
 build/linux/installer/conf/container.conf     |  318 ++---
 build/linux/installer/conf/kube.conf          |  509 +++++---
 build/linux/installer/conf/out_oms.conf       |    5 +-
 .../installer/datafiles/base_container.data   |  297 +++--
 build/linux/installer/datafiles/linux.data    |   18 +-
 .../linux/installer/datafiles/linux_dpkg.data |    2 +-
 .../linux/installer/datafiles/linux_rpm.data  |    2 +-
 .../linux/installer/scripts/livenessprobe.sh  |   18 +-
 .../scripts/tomlparser-mdm-metrics-config.rb  |    2 +-
 .../tomlparser-metric-collection-config.rb    |    2 +-
 kubernetes/linux/envmdsd                      |    2 -
 kubernetes/linux/main.sh                      |  285 ++---
 kubernetes/linux/mdsd.xml                     |  345 +++++-
 kubernetes/linux/setup.sh                     |   52 +-
 source/plugins/go/src/oms.go                  |  386 ++++--
 source/plugins/go/src/telemetry.go            |   17 +
 source/plugins/go/src/utils.go                |  114 +-
 .../ruby/ApplicationInsightsUtility.rb        |   22 +-
 .../plugins/ruby/CAdvisorMetricsAPIClient.rb  |  212 ++--
 source/plugins/ruby/DockerApiClient.rb        |    2 +-
 source/plugins/ruby/KubernetesApiClient.rb    |  100 +-
 source/plugins/ruby/MdmMetricsGenerator.rb    |   16 +-
 source/plugins/ruby/constants.rb              |    2 +-
 source/plugins/ruby/filter_cadvisor2mdm.rb    |  111 +-
 .../ruby/filter_cadvisor_health_container.rb  |   15 +-
 .../ruby/filter_cadvisor_health_node.rb       |   28 +-
 source/plugins/ruby/filter_container.rb       |   59 -
 source/plugins/ruby/filter_docker_log.rb      |  103 --
 .../ruby/filter_health_model_builder.rb       |   43 +-
 source/plugins/ruby/filter_inventory2mdm.rb   |   24 +-
 source/plugins/ruby/filter_telegraf2mdm.rb    |    8 +-
 ...h_container_cpu_memory_record_formatter.rb |    8 +-
 .../ruby/health/health_monitor_utils.rb       |   12 +-
 source/plugins/ruby/in_cadvisor_perf.rb       |   42 +-
 source/plugins/ruby/in_containerinventory.rb  |   29 +-
 source/plugins/ruby/in_kube_events.rb         |   31 +-
 source/plugins/ruby/in_kube_health.rb         |   16 +-
 source/plugins/ruby/in_kube_nodes.rb          |  111 +-
 source/plugins/ruby/in_kube_podinventory.rb   |  116 +-
 source/plugins/ruby/in_kube_pvinventory.rb    |   37 +-
 .../plugins/ruby/in_kubestate_deployments.rb  |   37 +-
 source/plugins/ruby/in_kubestate_hpa.rb       |   33 +-
 source/plugins/ruby/in_win_cadvisor_perf.rb   |   28 +-
 source/plugins/ruby/out_health_forward.rb     | 1074 ++++++++++-------
 source/plugins/ruby/out_mdm.rb                |   85 +-
 source/plugins/ruby/podinventory_to_mdm.rb    |   10 +-
 source/plugins/utils/oms_common.rb            |  143 +++
 source/plugins/utils/omslog.rb                |   50 +
 49 files changed, 2821 insertions(+), 2176 deletions(-)
 delete mode 100644 source/plugins/ruby/filter_container.rb
 delete mode 100644 source/plugins/ruby/filter_docker_log.rb
 create mode 100644 source/plugins/utils/oms_common.rb
 create mode 100644 source/plugins/utils/omslog.rb

diff --git a/build/common/installer/scripts/tomlparser.rb b/build/common/installer/scripts/tomlparser.rb
index a0f3c2f0a..b173ecfe3 100644
--- a/build/common/installer/scripts/tomlparser.rb
+++
b/build/common/installer/scripts/tomlparser.rb @@ -25,8 +25,10 @@ @enrichContainerLogs = false @containerLogSchemaVersion = "" @collectAllKubeEvents = false -@containerLogsRoute = "" - +@containerLogsRoute = "v2" # default for linux +if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0 + @containerLogsRoute = "v1" # default is v1 for windows until windows agent integrates windows ama +end # Use parser to parse the configmap toml file to a ruby structure def parseConfigMap begin @@ -162,8 +164,12 @@ def populateSettingValuesFromConfigMap(parsedConfig) #Get container logs route setting begin if !parsedConfig[:log_collection_settings][:route_container_logs].nil? && !parsedConfig[:log_collection_settings][:route_container_logs][:version].nil? - @containerLogsRoute = parsedConfig[:log_collection_settings][:route_container_logs][:version] - puts "config::Using config map setting for container logs route" + if !parsedConfig[:log_collection_settings][:route_container_logs][:version].empty? + @containerLogsRoute = parsedConfig[:log_collection_settings][:route_container_logs][:version] + puts "config::Using config map setting for container logs route: #{@containerLogsRoute}" + else + puts "config::Ignoring config map settings and using default value since provided container logs route value is empty" + end end rescue => errorStr ConfigParseErrorLogger.logError("Exception while reading config map settings for container logs route - #{errorStr}, using defaults, please check config map for errors") @@ -256,7 +262,7 @@ def get_command_windows(env_variable_name, env_variable_value) file.write(commands) commands = get_command_windows('AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS', @collectAllKubeEvents) file.write(commands) - commands = get_command_windows('AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE', @containerLogsRoute) + commands = get_command_windows('AZMON_CONTAINER_LOGS_ROUTE', @containerLogsRoute) file.write(commands) commands = get_command_windows('AZMON_CONTAINER_LOG_SCHEMA_VERSION', @containerLogSchemaVersion) file.write(commands) diff --git a/build/linux/installer/conf/container.conf b/build/linux/installer/conf/container.conf index 958a85eb6..093c9ef12 100644 --- a/build/linux/installer/conf/container.conf +++ b/build/linux/installer/conf/container.conf @@ -1,141 +1,179 @@ -# Fluentd config file for OMS Docker - container components (non kubeAPI) - -# Forward port 25225 for container logs - - type forward - port 25225 - bind 127.0.0.1 - - -# MDM metrics from telegraf - - @type tcp - tag oms.mdm.container.perf.telegraf.* - bind 0.0.0.0 - port 25228 - format json - - -# Container inventory - - type containerinventory - tag oms.containerinsights.containerinventory - run_interval 60 - log_level debug - - -#cadvisor perf - - type cadvisorperf - tag oms.api.cadvisorperf - run_interval 60 - log_level debug - - - - type filter_cadvisor_health_node - log_level debug - - - - type filter_cadvisor_health_container - log_level debug - - -#custom_metrics_mdm filter plugin - - type filter_cadvisor2mdm - metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes,pvUsedBytes - log_level info - - - - type filter_telegraf2mdm - log_level debug - - - - type out_oms - log_level debug - num_threads 5 - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_containerinventory*.buffer - buffer_queue_full_action drop_oldest_chunk - buffer_chunk_limit 4m - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m - - - - type out_oms - log_level debug - num_threads 5 - 
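A note on the tomlparser.rb hunk above: the container-logs route no longer defaults to an empty string but is chosen by OS. A minimal Ruby sketch of that decision, assuming @os_type is populated from the OS_TYPE environment variable (that wiring is not shown in this hunk):

    # Sketch only; mirrors the route defaulting added in the hunk above.
    @os_type = ENV["OS_TYPE"] # assumption: where @os_type comes from
    @containerLogsRoute = "v2" # Linux default: container logs route through mdsd
    if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0
      # Windows stays on v1 until the Windows agent integrates the Windows AMA
      @containerLogsRoute = "v1"
    end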
buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_cadvisorperf*.buffer - buffer_queue_full_action drop_oldest_chunk - buffer_chunk_limit 4m - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m - - - - - @type health_forward - send_timeout 60s - recover_wait 10s - hard_timeout 60s - heartbeat_type tcp - skip_network_error_at_init true - expire_dns_cache 600s - buffer_queue_full_action drop_oldest_chunk - buffer_type file - buffer_path %STATE_DIR_WS%/out_health_forward*.buffer - buffer_chunk_limit 3m - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m - - - host "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_SERVICE_HOST']}" - port "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_SERVICE_PORT']}" - - - + # Fluentd config file for OMS Docker - container components (non kubeAPI) + + # Forward port 25225 for container logs + # gangams - not used and get ridoff after confirming safe to remove + + @type forward + port 25225 + bind 127.0.0.1 + + + # MDM metrics from telegraf + + @type tcp + tag oms.mdm.container.perf.telegraf.* + bind 0.0.0.0 + port 25228 + format json + + + # Container inventory + + @type containerinventory + tag oneagent.containerInsights.CONTAINER_INVENTORY_BLOB + run_interval 60 + @log_level debug + + + #cadvisor perf + + @type cadvisor_perf + tag oneagent.containerInsights.LINUX_PERF_BLOB + run_interval 60 + @log_level debug + + + + @type cadvisor_health_node + @log_level debug + + + + @type cadvisor_health_container + @log_level debug + + + #custom_metrics_mdm filter plugin + + @type cadvisor2mdm + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes,pvUsedBytes + @log_level info + + + + @type telegraf2mdm + @log_level debug + + + #containerinventory + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + @type file - path %STATE_DIR_WS%/fluent_forward_failed.buffer - - - - - type out_mdm - log_level debug - num_threads 5 - buffer_type file - buffer_path %STATE_DIR_WS%/out_mdm_cdvisorperf*.buffer - buffer_queue_full_action drop_oldest_chunk - buffer_chunk_limit 4m - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m - retry_mdm_post_wait_minutes 30 - - - - type out_oms - log_level debug - num_threads 5 - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_insightsmetrics*.buffer - buffer_queue_full_action drop_oldest_chunk - buffer_chunk_limit 4m - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m - + path /var/opt/microsoft/docker-cimprov/state/containerinventory*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true + + + #cadvisorperf + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + path /var/opt/microsoft/docker-cimprov/state/cadvisorperf*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true + + + + @type health_forward + send_timeout 60s + recover_wait 10s + hard_timeout 60s + transport tcp + ignore_network_errors_at_startup true + expire_dns_cache 600s + + @type file + overflow_action drop_oldest_chunk + path 
/var/opt/microsoft/docker-cimprov/state/out_health_forward*.buffer + chunk_limit_size 3m + flush_interval 20s + retry_max_times 10 + retry_max_interval 5m + retry_wait 5s + + + host "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_SERVICE_HOST']}" + port "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_SERVICE_PORT']}" + + + @type file + path /var/opt/microsoft/docker-cimprov/state/fluent_forward_failed.buffer + + + + + @type mdm + @log_level debug + + @type file + path /var/opt/microsoft/docker-cimprov/state/out_mdm_cdvisorperf*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + retry_mdm_post_wait_minutes 30 + + + #InsightsMetrics + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + path /var/opt/microsoft/docker-cimprov/state/insightsmetrics*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true + diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index fb566c360..a1c8bf928 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -1,7 +1,6 @@ -# Fluentd config file for OMS Docker - cluster components (kubeAPI) #fluent forward plugin - type forward + @type forward port "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_SERVICE_PORT']}" bind 0.0.0.0 chunk_size_limit 4m @@ -9,262 +8,378 @@ #Kubernetes pod inventory - type kubepodinventory - tag oms.containerinsights.KubePodInventory + @type kube_podinventory + tag oneagent.containerInsights.KUBE_POD_INVENTORY_BLOB run_interval 60 - log_level debug + @log_level debug #Kubernetes Persistent Volume inventory - type kubepvinventory - tag oms.containerinsights.KubePVInventory + @type kube_pvinventory + tag oneagent.containerInsights.KUBE_PV_INVENTORY_BLOB run_interval 60 - log_level debug + @log_level debug #Kubernetes events - type kubeevents - tag oms.containerinsights.KubeEvents + @type kube_events + tag oneagent.containerInsights.KUBE_EVENTS_BLOB run_interval 60 - log_level debug - + @log_level debug + #Kubernetes Nodes - type kubenodeinventory - tag oms.containerinsights.KubeNodeInventory + @type kube_nodes + tag oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB run_interval 60 - log_level debug + @log_level debug #Kubernetes health - type kubehealth + @type kube_health tag kubehealth.ReplicaSet run_interval 60 - log_level debug + @log_level debug #cadvisor perf- Windows nodes - type wincadvisorperf - tag oms.api.wincadvisorperf + @type win_cadvisor_perf + tag oneagent.containerInsights.LINUX_PERF_BLOB run_interval 60 - log_level debug + @log_level debug #Kubernetes object state - deployments - - type kubestatedeployments - tag oms.containerinsights.KubeStateDeployments - run_interval 60 - log_level debug - + + @type kubestate_deployments + tag oneagent.containerInsights.INSIGHTS_METRICS_BLOB + run_interval 60 + @log_level debug + - #Kubernetes object state - HPA - - type kubestatehpa - tag oms.containerinsights.KubeStateHpa - run_interval 60 - log_level debug - + #Kubernetes object state - HPA + + @type kubestate_hpa + tag oneagent.containerInsights.INSIGHTS_METRICS_BLOB + run_interval 60 + @log_level debug + - type filter_inventory2mdm - log_level info + @type inventory2mdm + @log_level info 
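The rewritten output blocks above, and the kube.conf blocks that follow, all share one file-buffer recipe: retry_wait 5s, retry_max_interval 5m, retry_max_times 10, overflow_action drop_oldest_chunk. Assuming fluentd's default retry_type (exponential backoff with base 2), the waits those settings produce can be sketched in plain Ruby:

    # Back-of-the-envelope model of the retry schedule implied by the shared
    # buffer settings; assumes fluentd's default exponential backoff, base 2.
    retry_wait, retry_max_interval, retry_max_times = 5, 300, 10
    waits = (0...retry_max_times).map { |n| [retry_wait * 2**n, retry_max_interval].min }
    p waits          # => [5, 10, 20, 40, 80, 160, 300, 300, 300, 300]
    p waits.sum / 60 # => 25, i.e. roughly 25 minutes of retries before a chunk is given up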
#custom_metrics_mdm filter plugin for perf data from windows nodes - type filter_cadvisor2mdm + @type cadvisor2mdm metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pvUsedBytes - log_level info + @log_level info #health model aggregation filter - type filter_health_model_builder + @type health_model_builder - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 4m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_kubepods*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m + #kubepodinventory + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + path /var/opt/microsoft/docker-cimprov/state/kubepod*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 4m - buffer_type file - buffer_path %STATE_DIR_WS%/state/out_oms_kubepv*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m - + #kubepvinventory + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + path /var/opt/microsoft/docker-cimprov/state/kubepv*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true + - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 4m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_kubeevents*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m + #InsightsMetrics + #kubestate + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + path /var/opt/microsoft/docker-cimprov/state/insightsmetrics*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true - - type out_oms - log_level debug - num_threads 2 - buffer_chunk_limit 4m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_kubeservices*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m + #kubeevents + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + path /var/opt/microsoft/docker-cimprov/state/kubeevents*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true + + #kubeservices + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type 
file + path /var/opt/microsoft/docker-cimprov/state/kubeservices*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 2 + + keepalive true + - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 4m - buffer_type file - buffer_path %STATE_DIR_WS%/state/out_oms_kubenodes*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m + #kubenodeinventory + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + path /var/opt/microsoft/docker-cimprov/state/kubenode*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true - - type out_oms - log_level debug - num_threads 3 - buffer_chunk_limit 4m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_containernodeinventory*.buffer - buffer_queue_limit 20 - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m + #containernodeinventory + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + path /var/opt/microsoft/docker-cimprov/state/containernodeinventory*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 3 + + keepalive true - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 4m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_kubeperf*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m + #containerinventory for windows containers + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + path /var/opt/microsoft/docker-cimprov/state/containerinventory*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true + + + #perf + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + path /var/opt/microsoft/docker-cimprov/state/perf*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true - type out_mdm - log_level debug - num_threads 5 - buffer_chunk_limit 4m - buffer_type file - buffer_path %STATE_DIR_WS%/out_mdm_*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m + @type mdm + @log_level debug + + @type file + path /var/opt/microsoft/docker-cimprov/state/out_mdm_*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + 
retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + retry_mdm_post_wait_minutes 30 - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 4m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_api_wincadvisorperf*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m - - - type out_mdm - log_level debug - num_threads 5 - buffer_chunk_limit 4m - buffer_type file - buffer_path %STATE_DIR_WS%/out_mdm_cdvisorperf*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m + @type mdm + @log_level debug + + @type file + path /var/opt/microsoft/docker-cimprov/state/out_mdm_cdvisorperf*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + retry_mdm_post_wait_minutes 30 - - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 4m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_kubehealth*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m + + #kubehealth + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + path /var/opt/microsoft/docker-cimprov/state/kubehealth*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true - - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 4m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_insightsmetrics*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m - \ No newline at end of file diff --git a/build/linux/installer/conf/out_oms.conf b/build/linux/installer/conf/out_oms.conf index 74ba3195e..21dc4c1ed 100644 --- a/build/linux/installer/conf/out_oms.conf +++ b/build/linux/installer/conf/out_oms.conf @@ -1,10 +1,9 @@ -omsadmin_conf_path=/etc/opt/microsoft/omsagent/conf/omsadmin.conf omsproxy_secret_path=/etc/omsagent-secret/PROXY adx_cluster_uri_path=/etc/config/settings/adx/ADXCLUSTERURI adx_client_id_path=/etc/config/settings/adx/ADXCLIENTID adx_tenant_id_path=/etc/config/settings/adx/ADXTENANTID adx_client_secret_path=/etc/config/settings/adx/ADXCLIENTSECRET -cert_file_path=/etc/opt/microsoft/omsagent/certs/oms.crt -key_file_path=/etc/opt/microsoft/omsagent/certs/oms.key +cert_file_path=/etc/mdsd.d/oms/%s/oms.crt +key_file_path=/etc/mdsd.d/oms/%s/oms.key container_host_file_path=/var/opt/microsoft/docker-cimprov/state/containerhostname container_inventory_refresh_interval=60 diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index df8fbc3da..b9f889dba 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -18,89 +18,8 @@ MAINTAINER: 'Microsoft Corporation' /etc/opt/microsoft/docker-cimprov/conf/installinfo.txt; build/linux/installer/conf/installinfo.txt; 644; root; root; conffile -/opt/microsoft/omsagent/plugin/filter_docker_log.rb; 
source/plugins/ruby/filter_docker_log.rb; 644; root; root -/opt/microsoft/omsagent/plugin/filter_container.rb; source/plugins/ruby/filter_container.rb; 644; root; root - -/opt/microsoft/omsagent/plugin/in_kube_podinventory.rb; source/plugins/ruby/in_kube_podinventory.rb; 644; root; root -/opt/microsoft/omsagent/plugin/in_kube_pvinventory.rb; source/plugins/ruby/in_kube_pvinventory.rb; 644; root; root -/opt/microsoft/omsagent/plugin/in_kube_events.rb; source/plugins/ruby/in_kube_events.rb; 644; root; root -/opt/microsoft/omsagent/plugin/KubernetesApiClient.rb; source/plugins/ruby/KubernetesApiClient.rb; 644; root; root - /etc/opt/microsoft/docker-cimprov/container.conf; build/linux/installer/conf/container.conf; 644; root; root -/opt/microsoft/omsagent/plugin/CAdvisorMetricsAPIClient.rb; source/plugins/ruby/CAdvisorMetricsAPIClient.rb; 644; root; root -/opt/microsoft/omsagent/plugin/in_cadvisor_perf.rb; source/plugins/ruby/in_cadvisor_perf.rb; 644; root; root -/opt/microsoft/omsagent/plugin/in_win_cadvisor_perf.rb; source/plugins/ruby/in_win_cadvisor_perf.rb; 644; root; root -/opt/microsoft/omsagent/plugin/in_kube_nodes.rb; source/plugins/ruby/in_kube_nodes.rb; 644; root; root -/opt/microsoft/omsagent/plugin/in_kubestate_deployments.rb; source/plugins/ruby/in_kubestate_deployments.rb; 644; root; root -/opt/microsoft/omsagent/plugin/in_kubestate_hpa.rb; source/plugins/ruby/in_kubestate_hpa.rb; 644; root; root -/opt/microsoft/omsagent/plugin/filter_inventory2mdm.rb; source/plugins/ruby/filter_inventory2mdm.rb; 644; root; root -/opt/microsoft/omsagent/plugin/podinventory_to_mdm.rb; source/plugins/ruby/podinventory_to_mdm.rb; 644; root; root -/opt/microsoft/omsagent/plugin/kubelet_utils.rb; source/plugins/ruby/kubelet_utils.rb; 644; root; root -/opt/microsoft/omsagent/plugin/CustomMetricsUtils.rb; source/plugins/ruby/CustomMetricsUtils.rb; 644; root; root -/opt/microsoft/omsagent/plugin/constants.rb; source/plugins/ruby/constants.rb; 644; root; root -/opt/microsoft/omsagent/plugin/MdmAlertTemplates.rb; source/plugins/ruby/MdmAlertTemplates.rb; 644; root; root -/opt/microsoft/omsagent/plugin/MdmMetricsGenerator.rb; source/plugins/ruby/MdmMetricsGenerator.rb; 644; root; root - - -/opt/microsoft/omsagent/plugin/ApplicationInsightsUtility.rb; source/plugins/ruby/ApplicationInsightsUtility.rb; 644; root; root -/opt/microsoft/omsagent/plugin/ContainerInventoryState.rb; source/plugins/ruby/ContainerInventoryState.rb; 644; root; root -/opt/microsoft/omsagent/plugin/DockerApiClient.rb; source/plugins/ruby/DockerApiClient.rb; 644; root; root -/opt/microsoft/omsagent/plugin/DockerApiRestHelper.rb; source/plugins/ruby/DockerApiRestHelper.rb; 644; root; root -/opt/microsoft/omsagent/plugin/in_containerinventory.rb; source/plugins/ruby/in_containerinventory.rb; 644; root; root -/opt/microsoft/omsagent/plugin/kubernetes_container_inventory.rb; source/plugins/ruby/kubernetes_container_inventory.rb; 644; root; root -/opt/microsoft/omsagent/plugin/proxy_utils.rb; source/plugins/ruby/proxy_utils.rb; 644; root; root - -/opt/microsoft/omsagent/plugin/arc_k8s_cluster_identity.rb; source/plugins/ruby/arc_k8s_cluster_identity.rb; 644; root; root -/opt/microsoft/omsagent/plugin/out_mdm.rb; source/plugins/ruby/out_mdm.rb; 644; root; root -/opt/microsoft/omsagent/plugin/filter_cadvisor2mdm.rb; source/plugins/ruby/filter_cadvisor2mdm.rb; 644; root; root -/opt/microsoft/omsagent/plugin/filter_telegraf2mdm.rb; source/plugins/ruby/filter_telegraf2mdm.rb; 644; root; root - 
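For readers skimming the long base_container.data listings around here: each file entry is a semicolon-separated record of destination path, source path, file mode, owner, and group, optionally followed by a flag such as conffile or sysdir. A hypothetical parser (illustrative only, not part of the repo) makes the field order explicit:

    # Hypothetical helper, not in this repo: parse one file entry from a
    # datafile. Directory entries omit the source field and are not handled.
    DataEntry = Struct.new(:dest, :src, :mode, :owner, :group, :flag)

    def parse_data_entry(line)
      fields = line.split(";").map(&:strip)
      return nil if fields.length < 5
      DataEntry.new(*fields[0, 6])
    end

    e = parse_data_entry("/etc/fluent/plugin/out_mdm.rb; source/plugins/ruby/out_mdm.rb; 644; root; root")
    # e.dest => "/etc/fluent/plugin/out_mdm.rb", e.mode => "644", e.flag => nil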
-/opt/microsoft/omsagent/plugin/lib/application_insights/version.rb; source/plugins/ruby/lib/application_insights/version.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/rack/track_request.rb; source/plugins/ruby/lib/application_insights/rack/track_request.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/unhandled_exception.rb; source/plugins/ruby/lib/application_insights/unhandled_exception.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/telemetry_client.rb; source/plugins/ruby/lib/application_insights/telemetry_client.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/queue_base.rb; source/plugins/ruby/lib/application_insights/channel/queue_base.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/asynchronous_queue.rb; source/plugins/ruby/lib/application_insights/channel/asynchronous_queue.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/synchronous_sender.rb; source/plugins/ruby/lib/application_insights/channel/synchronous_sender.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/data_point_type.rb; source/plugins/ruby/lib/application_insights/channel/contracts/data_point_type.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/data_point.rb; source/plugins/ruby/lib/application_insights/channel/contracts/data_point.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/stack_frame.rb; source/plugins/ruby/lib/application_insights/channel/contracts/stack_frame.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/request_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/request_data.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/session.rb; source/plugins/ruby/lib/application_insights/channel/contracts/session.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/page_view_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/page_view_data.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/remote_dependency_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/remote_dependency_data.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/exception_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/exception_data.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/location.rb; source/plugins/ruby/lib/application_insights/channel/contracts/location.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/operation.rb; source/plugins/ruby/lib/application_insights/channel/contracts/operation.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/data.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/event_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/event_data.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/metric_data.rb; 
source/plugins/ruby/lib/application_insights/channel/contracts/metric_data.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/device.rb; source/plugins/ruby/lib/application_insights/channel/contracts/device.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/message_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/message_data.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/dependency_source_type.rb; source/plugins/ruby/lib/application_insights/channel/contracts/dependency_source_type.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/user.rb; source/plugins/ruby/lib/application_insights/channel/contracts/user.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/severity_level.rb; source/plugins/ruby/lib/application_insights/channel/contracts/severity_level.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/application.rb; source/plugins/ruby/lib/application_insights/channel/contracts/application.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/dependency_kind.rb; source/plugins/ruby/lib/application_insights/channel/contracts/dependency_kind.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/cloud.rb; source/plugins/ruby/lib/application_insights/channel/contracts/cloud.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/envelope.rb; source/plugins/ruby/lib/application_insights/channel/contracts/envelope.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/json_serializable.rb; source/plugins/ruby/lib/application_insights/channel/contracts/json_serializable.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/domain.rb; source/plugins/ruby/lib/application_insights/channel/contracts/domain.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/base.rb; source/plugins/ruby/lib/application_insights/channel/contracts/base.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/reopenings.rb; source/plugins/ruby/lib/application_insights/channel/contracts/reopenings.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/page_view_perf_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/page_view_perf_data.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/internal.rb; source/plugins/ruby/lib/application_insights/channel/contracts/internal.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/availability_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/availability_data.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/exception_details.rb; source/plugins/ruby/lib/application_insights/channel/contracts/exception_details.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/synchronous_queue.rb; source/plugins/ruby/lib/application_insights/channel/synchronous_queue.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/sender_base.rb; 
source/plugins/ruby/lib/application_insights/channel/sender_base.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/telemetry_context.rb; source/plugins/ruby/lib/application_insights/channel/telemetry_context.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/asynchronous_sender.rb; source/plugins/ruby/lib/application_insights/channel/asynchronous_sender.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/telemetry_channel.rb; source/plugins/ruby/lib/application_insights/channel/telemetry_channel.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/event.rb; source/plugins/ruby/lib/application_insights/channel/event.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights.rb; source/plugins/ruby/lib/application_insights.rb; 644; root; root - /opt/tomlrb.rb; source/toml-parser/tomlrb.rb; 644; root; root /opt/tomlrb/generated_parser.rb; source/toml-parser/tomlrb/generated_parser.rb; 644; root; root /opt/tomlrb/handler.rb; source/toml-parser/tomlrb/handler.rb; 644; root; root @@ -126,6 +45,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/tomlparser-mdm-metrics-config.rb; build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb; 755; root; root /opt/tomlparser-metric-collection-config.rb; build/linux/installer/scripts/tomlparser-metric-collection-config.rb; 755; root; root + /opt/tomlparser-agent-config.rb; build/linux/installer/scripts/tomlparser-agent-config.rb; 755; root; root /opt/tomlparser.rb; build/common/installer/scripts/tomlparser.rb; 755; root; root /opt/td-agent-bit-conf-customizer.rb; build/common/installer/scripts/td-agent-bit-conf-customizer.rb; 755; root; root @@ -134,43 +54,127 @@ MAINTAINER: 'Microsoft Corporation' /opt/tomlparser-osm-config.rb; build/linux/installer/scripts/tomlparser-osm-config.rb; 755; root; root -/opt/microsoft/omsagent/plugin/filter_cadvisor_health_container.rb; source/plugins/ruby/filter_cadvisor_health_container.rb; 644; root; root -/opt/microsoft/omsagent/plugin/filter_cadvisor_health_node.rb; source/plugins/ruby/filter_cadvisor_health_node.rb; 644; root; root -/opt/microsoft/omsagent/plugin/filter_health_model_builder.rb; source/plugins/ruby/filter_health_model_builder.rb; 644; root; root -/opt/microsoft/omsagent/plugin/in_kube_health.rb; source/plugins/ruby/in_kube_health.rb; 644; root; root -/opt/microsoft/omsagent/plugin/out_health_forward.rb; source/plugins/ruby/out_health_forward.rb; 644; root; root /etc/opt/microsoft/docker-cimprov/health/healthmonitorconfig.json; build/linux/installer/conf/healthmonitorconfig.json; 644; root; root /etc/opt/microsoft/docker-cimprov/health/health_model_definition.json; build/linux/installer/conf/health_model_definition.json; 644; root; root -/opt/microsoft/omsagent/plugin/health/aggregate_monitor.rb; source/plugins/ruby/health/aggregate_monitor.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/agg_monitor_id_labels.rb; source/plugins/ruby/health/agg_monitor_id_labels.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/aggregate_monitor_state_finalizer.rb; source/plugins/ruby/health/aggregate_monitor_state_finalizer.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/cluster_health_state.rb; source/plugins/ruby/health/cluster_health_state.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_container_cpu_memory_aggregator.rb; source/plugins/ruby/health/health_container_cpu_memory_aggregator.rb; 644; root; root 
-/opt/microsoft/omsagent/plugin/health/health_container_cpu_memory_record_formatter.rb; source/plugins/ruby/health/health_container_cpu_memory_record_formatter.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_hierarchy_builder.rb; source/plugins/ruby/health/health_hierarchy_builder.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_kubernetes_resources.rb; source/plugins/ruby/health/health_kubernetes_resources.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_kube_api_down_handler.rb; source/plugins/ruby/health/health_kube_api_down_handler.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_missing_signal_generator.rb; source/plugins/ruby/health/health_missing_signal_generator.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_model_buffer.rb; source/plugins/ruby/health/health_model_buffer.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_model_builder.rb; source/plugins/ruby/health/health_model_builder.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_model_constants.rb; source/plugins/ruby/health/health_model_constants.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/parent_monitor_provider.rb; source/plugins/ruby/health/parent_monitor_provider.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_model_definition_parser.rb; source/plugins/ruby/health/health_model_definition_parser.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_monitor_helpers.rb; source/plugins/ruby/health/health_monitor_helpers.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_monitor_optimizer.rb; source/plugins/ruby/health/health_monitor_optimizer.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_monitor_provider.rb; source/plugins/ruby/health/health_monitor_provider.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_monitor_record.rb; source/plugins/ruby/health/health_monitor_record.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_monitor_state.rb; source/plugins/ruby/health/health_monitor_state.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_monitor_telemetry.rb; source/plugins/ruby/health/health_monitor_telemetry.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_monitor_utils.rb; source/plugins/ruby/health/health_monitor_utils.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_signal_reducer.rb; source/plugins/ruby/health/health_signal_reducer.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/monitor_factory.rb; source/plugins/ruby/health/monitor_factory.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/monitor_set.rb; source/plugins/ruby/health/monitor_set.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/unit_monitor.rb; source/plugins/ruby/health/unit_monitor.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/version.rb; source/plugins/ruby/lib/application_insights/version.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/rack/track_request.rb; source/plugins/ruby/lib/application_insights/rack/track_request.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/unhandled_exception.rb; source/plugins/ruby/lib/application_insights/unhandled_exception.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/telemetry_client.rb; source/plugins/ruby/lib/application_insights/telemetry_client.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/queue_base.rb; 
source/plugins/ruby/lib/application_insights/channel/queue_base.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/asynchronous_queue.rb; source/plugins/ruby/lib/application_insights/channel/asynchronous_queue.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/synchronous_sender.rb; source/plugins/ruby/lib/application_insights/channel/synchronous_sender.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/data_point_type.rb; source/plugins/ruby/lib/application_insights/channel/contracts/data_point_type.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/data_point.rb; source/plugins/ruby/lib/application_insights/channel/contracts/data_point.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/stack_frame.rb; source/plugins/ruby/lib/application_insights/channel/contracts/stack_frame.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/request_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/request_data.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/session.rb; source/plugins/ruby/lib/application_insights/channel/contracts/session.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/page_view_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/page_view_data.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/remote_dependency_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/remote_dependency_data.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/exception_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/exception_data.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/location.rb; source/plugins/ruby/lib/application_insights/channel/contracts/location.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/operation.rb; source/plugins/ruby/lib/application_insights/channel/contracts/operation.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/data.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/event_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/event_data.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/metric_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/metric_data.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/device.rb; source/plugins/ruby/lib/application_insights/channel/contracts/device.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/message_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/message_data.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/dependency_source_type.rb; source/plugins/ruby/lib/application_insights/channel/contracts/dependency_source_type.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/user.rb; source/plugins/ruby/lib/application_insights/channel/contracts/user.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/severity_level.rb; source/plugins/ruby/lib/application_insights/channel/contracts/severity_level.rb; 644; 
root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/application.rb; source/plugins/ruby/lib/application_insights/channel/contracts/application.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/dependency_kind.rb; source/plugins/ruby/lib/application_insights/channel/contracts/dependency_kind.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/cloud.rb; source/plugins/ruby/lib/application_insights/channel/contracts/cloud.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/envelope.rb; source/plugins/ruby/lib/application_insights/channel/contracts/envelope.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/json_serializable.rb; source/plugins/ruby/lib/application_insights/channel/contracts/json_serializable.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/domain.rb; source/plugins/ruby/lib/application_insights/channel/contracts/domain.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/base.rb; source/plugins/ruby/lib/application_insights/channel/contracts/base.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/reopenings.rb; source/plugins/ruby/lib/application_insights/channel/contracts/reopenings.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/page_view_perf_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/page_view_perf_data.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/internal.rb; source/plugins/ruby/lib/application_insights/channel/contracts/internal.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/availability_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/availability_data.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/exception_details.rb; source/plugins/ruby/lib/application_insights/channel/contracts/exception_details.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/synchronous_queue.rb; source/plugins/ruby/lib/application_insights/channel/synchronous_queue.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/sender_base.rb; source/plugins/ruby/lib/application_insights/channel/sender_base.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/telemetry_context.rb; source/plugins/ruby/lib/application_insights/channel/telemetry_context.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/asynchronous_sender.rb; source/plugins/ruby/lib/application_insights/channel/asynchronous_sender.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/telemetry_channel.rb; source/plugins/ruby/lib/application_insights/channel/telemetry_channel.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/event.rb; source/plugins/ruby/lib/application_insights/channel/event.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights.rb; source/plugins/ruby/lib/application_insights.rb; 644; root; root + +/etc/fluent/plugin/health/aggregate_monitor.rb; source/plugins/ruby/health/aggregate_monitor.rb; 644; root; root +/etc/fluent/plugin/health/agg_monitor_id_labels.rb; source/plugins/ruby/health/agg_monitor_id_labels.rb; 644; root; root +/etc/fluent/plugin/health/aggregate_monitor_state_finalizer.rb; source/plugins/ruby/health/aggregate_monitor_state_finalizer.rb; 
644; root; root +/etc/fluent/plugin/health/cluster_health_state.rb; source/plugins/ruby/health/cluster_health_state.rb; 644; root; root +/etc/fluent/plugin/health/health_container_cpu_memory_aggregator.rb; source/plugins/ruby/health/health_container_cpu_memory_aggregator.rb; 644; root; root +/etc/fluent/plugin/health/health_container_cpu_memory_record_formatter.rb; source/plugins/ruby/health/health_container_cpu_memory_record_formatter.rb; 644; root; root +/etc/fluent/plugin/health/health_hierarchy_builder.rb; source/plugins/ruby/health/health_hierarchy_builder.rb; 644; root; root +/etc/fluent/plugin/health/health_kubernetes_resources.rb; source/plugins/ruby/health/health_kubernetes_resources.rb; 644; root; root +/etc/fluent/plugin/health/health_kube_api_down_handler.rb; source/plugins/ruby/health/health_kube_api_down_handler.rb; 644; root; root +/etc/fluent/plugin/health/health_missing_signal_generator.rb; source/plugins/ruby/health/health_missing_signal_generator.rb; 644; root; root +/etc/fluent/plugin/health/health_model_buffer.rb; source/plugins/ruby/health/health_model_buffer.rb; 644; root; root +/etc/fluent/plugin/health/health_model_builder.rb; source/plugins/ruby/health/health_model_builder.rb; 644; root; root +/etc/fluent/plugin/health/health_model_constants.rb; source/plugins/ruby/health/health_model_constants.rb; 644; root; root +/etc/fluent/plugin/health/parent_monitor_provider.rb; source/plugins/ruby/health/parent_monitor_provider.rb; 644; root; root +/etc/fluent/plugin/health/health_model_definition_parser.rb; source/plugins/ruby/health/health_model_definition_parser.rb; 644; root; root +/etc/fluent/plugin/health/health_monitor_helpers.rb; source/plugins/ruby/health/health_monitor_helpers.rb; 644; root; root +/etc/fluent/plugin/health/health_monitor_optimizer.rb; source/plugins/ruby/health/health_monitor_optimizer.rb; 644; root; root +/etc/fluent/plugin/health/health_monitor_provider.rb; source/plugins/ruby/health/health_monitor_provider.rb; 644; root; root +/etc/fluent/plugin/health/health_monitor_record.rb; source/plugins/ruby/health/health_monitor_record.rb; 644; root; root +/etc/fluent/plugin/health/health_monitor_state.rb; source/plugins/ruby/health/health_monitor_state.rb; 644; root; root +/etc/fluent/plugin/health/health_monitor_telemetry.rb; source/plugins/ruby/health/health_monitor_telemetry.rb; 644; root; root +/etc/fluent/plugin/health/health_monitor_utils.rb; source/plugins/ruby/health/health_monitor_utils.rb; 644; root; root +/etc/fluent/plugin/health/health_signal_reducer.rb; source/plugins/ruby/health/health_signal_reducer.rb; 644; root; root +/etc/fluent/plugin/health/monitor_factory.rb; source/plugins/ruby/health/monitor_factory.rb; 644; root; root +/etc/fluent/plugin/health/monitor_set.rb; source/plugins/ruby/health/monitor_set.rb; 644; root; root +/etc/fluent/plugin/health/unit_monitor.rb; source/plugins/ruby/health/unit_monitor.rb; 644; root; root + +/etc/fluent/plugin/ApplicationInsightsUtility.rb; source/plugins/ruby/ApplicationInsightsUtility.rb; 644; root; root +/etc/fluent/plugin/arc_k8s_cluster_identity.rb; source/plugins/ruby/arc_k8s_cluster_identity.rb; 644; root; root +/etc/fluent/plugin/CAdvisorMetricsAPIClient.rb; source/plugins/ruby/CAdvisorMetricsAPIClient.rb; 644; root; root +/etc/fluent/plugin/constants.rb; source/plugins/ruby/constants.rb; 644; root; root +/etc/fluent/plugin/ContainerInventoryState.rb; source/plugins/ruby/ContainerInventoryState.rb; 644; root; root +/etc/fluent/plugin/CustomMetricsUtils.rb; 
source/plugins/ruby/CustomMetricsUtils.rb; 644; root; root +/etc/fluent/plugin/DockerApiClient.rb; source/plugins/ruby/DockerApiClient.rb; 644; root; root +/etc/fluent/plugin/DockerApiRestHelper.rb; source/plugins/ruby/DockerApiRestHelper.rb; 644; root; root +/etc/fluent/plugin/kubelet_utils.rb; source/plugins/ruby/kubelet_utils.rb; 644; root; root +/etc/fluent/plugin/proxy_utils.rb; source/plugins/ruby/proxy_utils.rb; 644; root; root +/etc/fluent/plugin/kubernetes_container_inventory.rb; source/plugins/ruby/kubernetes_container_inventory.rb; 644; root; root +/etc/fluent/plugin/podinventory_to_mdm.rb; source/plugins/ruby/podinventory_to_mdm.rb; 644; root; root +/etc/fluent/plugin/MdmMetricsGenerator.rb; source/plugins/ruby/MdmMetricsGenerator.rb; 644; root; root +/etc/fluent/plugin/MdmAlertTemplates.rb; source/plugins/ruby/MdmAlertTemplates.rb; 644; root; root + +/etc/fluent/plugin/omslog.rb; source/plugins/utils/omslog.rb; 644; root; root +/etc/fluent/plugin/oms_common.rb; source/plugins/utils/oms_common.rb; 644; root; root + +/etc/fluent/kube.conf; build/linux/installer/conf/kube.conf; 644; root; root +/etc/fluent/container.conf; build/linux/installer/conf/container.conf; 644; root; root + +/etc/fluent/plugin/in_cadvisor_perf.rb; source/plugins/ruby/in_cadvisor_perf.rb; 644; root; root +/etc/fluent/plugin/in_win_cadvisor_perf.rb; source/plugins/ruby/in_win_cadvisor_perf.rb; 644; root; root +/etc/fluent/plugin/in_containerinventory.rb; source/plugins/ruby/in_containerinventory.rb; 644; root; root +/etc/fluent/plugin/in_kube_nodes.rb; source/plugins/ruby/in_kube_nodes.rb; 644; root; root +/etc/fluent/plugin/in_kube_podinventory.rb; source/plugins/ruby/in_kube_podinventory.rb; 644; root; root +/etc/fluent/plugin/KubernetesApiClient.rb; source/plugins/ruby/KubernetesApiClient.rb; 644; root; root +/etc/fluent/plugin/in_kube_events.rb; source/plugins/ruby/in_kube_events.rb; 644; root; root +/etc/fluent/plugin/in_kube_health.rb; source/plugins/ruby/in_kube_health.rb; 644; root; root +/etc/fluent/plugin/in_kube_pvinventory.rb; source/plugins/ruby/in_kube_pvinventory.rb; 644; root; root +/etc/fluent/plugin/in_kubestate_deployments.rb; source/plugins/ruby/in_kubestate_deployments.rb; 644; root; root +/etc/fluent/plugin/in_kubestate_hpa.rb; source/plugins/ruby/in_kubestate_hpa.rb; 644; root; root + +/etc/fluent/plugin/filter_cadvisor_health_container.rb; source/plugins/ruby/filter_cadvisor_health_container.rb; 644; root; root +/etc/fluent/plugin/filter_cadvisor_health_node.rb; source/plugins/ruby/filter_cadvisor_health_node.rb; 644; root; root +/etc/fluent/plugin/filter_cadvisor2mdm.rb; source/plugins/ruby/filter_cadvisor2mdm.rb; 644; root; root +/etc/fluent/plugin/filter_health_model_builder.rb; source/plugins/ruby/filter_health_model_builder.rb; 644; root; root +/etc/fluent/plugin/filter_inventory2mdm.rb; source/plugins/ruby/filter_inventory2mdm.rb; 644; root; root +/etc/fluent/plugin/filter_telegraf2mdm.rb; source/plugins/ruby/filter_telegraf2mdm.rb; 644; root; root + +/etc/fluent/plugin/out_health_forward.rb; source/plugins/ruby/out_health_forward.rb; 644; root; root +/etc/fluent/plugin/out_mdm.rb; source/plugins/ruby/out_mdm.rb; 644; root; root + + %Links -/opt/omi/lib/libcontainer.${{SHLIB_EXT}}; /opt/microsoft/docker-cimprov/lib/libcontainer.${{SHLIB_EXT}}; 644; root; root %Directories /etc; 755; root; root; sysdir @@ -179,27 +183,18 @@ MAINTAINER: 'Microsoft Corporation' /var; 755; root; root; sysdir /var/opt; 755; root; root; sysdir +/opt/fluent; 755; root; root; sysdir + 
/etc/opt/microsoft; 755; root; root; sysdir /etc/opt/microsoft/docker-cimprov; 755; root; root /etc/opt/microsoft/docker-cimprov/conf; 755; root; root /etc/opt/microsoft/docker-cimprov/health; 755; root; root -/etc/opt/omi; 755; root; root; sysdir -/etc/opt/omi/conf; 755; root; root; sysdir -/etc/opt/omi/conf/omiregister; 755; root; root; sysdir -/etc/opt/omi/conf/omiregister/root-cimv2; 755; root; root - /opt/microsoft; 755; root; root; sysdir /opt/microsoft/docker-cimprov; 755; root; root /opt/microsoft/docker-cimprov/bin; 755; root; root /opt/microsoft/docker-cimprov/lib; 755; root; root -/opt/microsoft/omsagent; 755; root; root; sysdir -/opt/microsoft/omsagent/plugin; 755; root; root; sysdir -/opt/microsoft/omsagent/plugin/health; 755; root; root; sysdir - -/opt/omi; 755; root; root; sysdir -/opt/omi/lib; 755; root; root; sysdir /var/opt/microsoft; 755; root; root; sysdir /var/opt/microsoft/docker-cimprov; 755; root; root @@ -213,11 +208,14 @@ MAINTAINER: 'Microsoft Corporation' /opt/td-agent-bit/bin; 755; root; root;sysdir /etc/telegraf; 755; root; root;sysdir -/opt/microsoft/omsagent/plugin/lib; 755; root; root; sysdir -/opt/microsoft/omsagent/plugin/lib/application_insights; 755; root; root; sysdir -/opt/microsoft/omsagent/plugin/lib/application_insights/channel; 755; root; root; sysdir -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts; 755; root; root; sysdir -/opt/microsoft/omsagent/plugin/lib/application_insights/rack; 755; root; root; sysdir +/etc/fluent; 755; root; root; sysdir +/etc/fluent/plugin; 755; root; root; sysdir +/etc/fluent/plugin/health; 755; root; root; sysdir +/etc/fluent/plugin/lib; 755; root; root; sysdir +/etc/fluent/plugin/lib/application_insights; 755; root; root; sysdir +/etc/fluent/plugin/lib/application_insights/channel; 755; root; root; sysdir +/etc/fluent/plugin/lib/application_insights/channel/contracts; 755; root; root; sysdir +/etc/fluent/plugin/lib/application_insights/rack; 755; root; root; sysdir /opt/tomlrb; 755; root; root; sysdir @@ -230,64 +228,61 @@ WriteInstallInfo() { } WriteInstallInfo -#Make omsagent owner for ContainerInventory directory. 
This is needed for ruby plugin to have access -chown omsagent:omsagent /var/opt/microsoft/docker-cimprov/state/ContainerInventory # Get the state file in place with proper permissions touch /var/opt/microsoft/docker-cimprov/state/LastEventQueryTime.txt chmod 644 /var/opt/microsoft/docker-cimprov/state/LastEventQueryTime.txt -chown omsagent:omsagent /var/opt/microsoft/docker-cimprov/state/LastEventQueryTime.txt touch /var/opt/microsoft/docker-cimprov/state/KubeEventQueryState.yaml chmod 644 /var/opt/microsoft/docker-cimprov/state/KubeEventQueryState.yaml -chown omsagent:omsagent /var/opt/microsoft/docker-cimprov/state/KubeEventQueryState.yaml touch /var/opt/microsoft/docker-cimprov/state/KubeLogQueryState.yaml chmod 644 /var/opt/microsoft/docker-cimprov/state/KubeLogQueryState.yaml -chown omsagent:omsagent /var/opt/microsoft/docker-cimprov/state/KubeLogQueryState.yaml + touch /var/opt/microsoft/docker-cimprov/log/kubernetes_client_log.txt chmod 666 /var/opt/microsoft/docker-cimprov/log/kubernetes_client_log.txt -chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/kubernetes_client_log.txt + touch /var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt chmod 666 /var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt -chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt + touch /var/opt/microsoft/docker-cimprov/log/filter_cadvisor2mdm.log chmod 666 /var/opt/microsoft/docker-cimprov/log/filter_cadvisor2mdm.log -chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/filter_cadvisor2mdm.log + touch /var/opt/microsoft/docker-cimprov/log/filter_telegraf2mdm.log chmod 666 /var/opt/microsoft/docker-cimprov/log/filter_telegraf2mdm.log -chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/filter_telegraf2mdm.log + touch /var/opt/microsoft/docker-cimprov/log/filter_inventory2mdm.log chmod 666 /var/opt/microsoft/docker-cimprov/log/filter_inventory2mdm.log -chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/filter_inventory2mdm.log + touch /var/opt/microsoft/docker-cimprov/log/mdm_metrics_generator.log chmod 666 /var/opt/microsoft/docker-cimprov/log/mdm_metrics_generator.log -chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/mdm_metrics_generator.log + touch /var/opt/microsoft/docker-cimprov/log/health_monitors.log chmod 666 /var/opt/microsoft/docker-cimprov/log/health_monitors.log -chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/health_monitors.log + touch /var/opt/microsoft/docker-cimprov/log/filter_health_model_builder.log chmod 666 /var/opt/microsoft/docker-cimprov/log/filter_health_model_builder.log -chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/filter_health_model_builder.log + touch /var/opt/microsoft/docker-cimprov/log/fluent_forward_failed.log chmod 666 /var/opt/microsoft/docker-cimprov/log/fluent_forward_failed.log -chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/fluent_forward_failed.log + touch /var/opt/microsoft/docker-cimprov/log/arc_k8s_cluster_identity.log chmod 666 /var/opt/microsoft/docker-cimprov/log/arc_k8s_cluster_identity.log -chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/arc_k8s_cluster_identity.log -mv /etc/opt/microsoft/docker-cimprov/container.conf /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf -chown omsagent:omsagent /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf + +touch /var/opt/microsoft/docker-cimprov/log/fluentd.log +chmod 666 /var/opt/microsoft/docker-cimprov/log/fluentd.log + 
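With the omsagent and omiusers accounts gone, the postinstall step above now only pre-creates the agent log files world-writable rather than chown-ing them to a dedicated user. Every touch/chmod pair follows the same pattern; the following Ruby sketch of that pattern is an illustration only and is not part of this patch:

require "fileutils"

log_dir = "/var/opt/microsoft/docker-cimprov/log"
logs = %w[kubernetes_client_log.txt kubernetes_perf_log.txt filter_cadvisor2mdm.log
          filter_telegraf2mdm.log filter_inventory2mdm.log mdm_metrics_generator.log
          health_monitors.log filter_health_model_builder.log fluent_forward_failed.log
          arc_k8s_cluster_identity.log fluentd.log]
logs.each do |name|
  path = File.join(log_dir, name)
  FileUtils.touch(path)   # create the file if it does not exist yet
  File.chmod(0666, path)  # world-writable so the non-root fluentd/ruby plugins can log
end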
%Postuninstall_10 # If we're an upgrade, skip all of this cleanup @@ -299,7 +294,6 @@ if ${{PERFORMING_UPGRADE_NOT}}; then rm -f /var/opt/microsoft/docker-cimprov/state/KubeLogQueryState.yaml rm -f /var/opt/microsoft/docker-cimprov/log/kubernetes_client_log.txt rm -f /var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt - rm -f /etc/opt/microsoft/omsagent/conf/omsagent.d/container.conf rmdir /var/opt/microsoft/docker-cimprov/log 2> /dev/null rmdir /var/opt/microsoft/docker-cimprov/state/ContainerInventory 2> /dev/null rmdir /var/opt/microsoft/docker-cimprov/state/ImageInventory 2> /dev/null @@ -308,14 +302,7 @@ if ${{PERFORMING_UPGRADE_NOT}}; then rmdir /etc/opt/microsoft/docker-cimprov/conf 2> /dev/null rmdir /etc/opt/microsoft/docker-cimprov 2> /dev/null rmdir /etc/opt/microsoft 2> /dev/null - rmdir /etc/opt 2> /dev/null - #Remove sudoers file edit - if [ -s /etc/sudoers.d/omsagent ] - then - chmod +w /etc/sudoers.d/omsagent - sed -i '/docker\-provider/,+1 d' /etc/sudoers.d/omsagent - chmod 440 /etc/sudoers.d/omsagent - fi + rmdir /etc/opt 2> /dev/null fi %Preinstall_0 diff --git a/build/linux/installer/datafiles/linux.data b/build/linux/installer/datafiles/linux.data index 604394d80..48af63a73 100644 --- a/build/linux/installer/datafiles/linux.data +++ b/build/linux/installer/datafiles/linux.data @@ -1,16 +1,11 @@ %Variables PF: 'Linux' -OMI_SERVICE: '/opt/omi/bin/service_control' -OMS_SERVICE: '/opt/microsoft/omsagent/bin/service_control' + %Postinstall_2000 -# Reload the OMI server -${{OMI_SERVICE}} reload -${{OMS_SERVICE}} reload -if ${{PERFORMING_UPGRADE_NOT}}; then - /opt/omi/bin/omicli ei root/cimv2 Container_HostInventory -fi + + %Postuninstall_1000 # Calling sequence for RPM pre/post scripts, during upgrade, is as follows: @@ -35,10 +30,5 @@ if ${{PERFORMING_UPGRADE_NOT}}; then fi %Postuninstall_1100 -# If we're called for upgrade, don't do anything -if ${{PERFORMING_UPGRADE_NOT}}; then - # Reload the OMI server - ${{OMI_SERVICE}} reload - ${{OMS_SERVICE}} reload -fi + diff --git a/build/linux/installer/datafiles/linux_dpkg.data b/build/linux/installer/datafiles/linux_dpkg.data index a7821642d..bdf9f2354 100644 --- a/build/linux/installer/datafiles/linux_dpkg.data +++ b/build/linux/installer/datafiles/linux_dpkg.data @@ -3,5 +3,5 @@ PERFORMING_UPGRADE_NOT: '[ "$1" != "upgrade" ]' PACKAGE_TYPE: 'DPKG' %Dependencies -omi (>= 1.0.8.6) + diff --git a/build/linux/installer/datafiles/linux_rpm.data b/build/linux/installer/datafiles/linux_rpm.data index 1b9ba009b..d537b444d 100644 --- a/build/linux/installer/datafiles/linux_rpm.data +++ b/build/linux/installer/datafiles/linux_rpm.data @@ -3,5 +3,5 @@ PERFORMING_UPGRADE_NOT: '[ "$1" -ne 1 ]' PACKAGE_TYPE: 'RPM' %Dependencies -omi >= 1.0.8-6 + diff --git a/build/linux/installer/scripts/livenessprobe.sh b/build/linux/installer/scripts/livenessprobe.sh index 5e1261e7e..252f471e9 100644 --- a/build/linux/installer/scripts/livenessprobe.sh +++ b/build/linux/installer/scripts/livenessprobe.sh @@ -1,19 +1,21 @@ #!/bin/bash -#test to exit non zero value if omsagent is not running -(ps -ef | grep omsagent- | grep -v "grep") +#test to exit non zero value if mdsd is not running +(ps -ef | grep "mdsd" | grep -v "grep") if [ $? 
-ne 0 ] then - echo " omsagent is not running" > /dev/termination-log - exit 1 + echo "mdsd is not running" > /dev/termination-log + exit 1 fi -#optionally test to exit non zero value if oneagent is not running -if [ -e "/opt/AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE_V2" ]; then - (ps -ef | grep "mdsd" | grep -v "grep") + +#optionally test to exit non zero value if fluentd is not running +#fluentd not used in sidecar container +if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then + (ps -ef | grep "fluentd" | grep -v "grep") if [ $? -ne 0 ] then - echo "oneagent is not running" > /dev/termination-log + echo "fluentd is not running" > /dev/termination-log exit 1 fi fi diff --git a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb index 5ce5d79d2..dcf179bf2 100644 --- a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb +++ b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb @@ -3,7 +3,7 @@ require_relative "tomlrb" require_relative "ConfigParseErrorLogger" -require_relative "microsoft/omsagent/plugin/constants" +require_relative "/etc/fluent/plugin/constants" @configMapMountPath = "/etc/config/settings/alertable-metrics-configuration-settings" @configVersion = "" diff --git a/build/linux/installer/scripts/tomlparser-metric-collection-config.rb b/build/linux/installer/scripts/tomlparser-metric-collection-config.rb index 40d87b7f1..cee41312b 100644 --- a/build/linux/installer/scripts/tomlparser-metric-collection-config.rb +++ b/build/linux/installer/scripts/tomlparser-metric-collection-config.rb @@ -3,7 +3,7 @@ require_relative "tomlrb" require_relative "ConfigParseErrorLogger" -require_relative "microsoft/omsagent/plugin/constants" +require_relative "/etc/fluent/plugin/constants" @configMapMountPath = "/etc/config/settings/metric_collection_settings" @configVersion = "" diff --git a/kubernetes/linux/envmdsd b/kubernetes/linux/envmdsd index 3f834bfb8..5a939fc3e 100644 --- a/kubernetes/linux/envmdsd +++ b/kubernetes/linux/envmdsd @@ -2,8 +2,6 @@ export MDSD_ROLE_PREFIX="/var/run/mdsd/default" #export MDSD_OPTIONS="-d -A -r ${MDSD_ROLE_PREFIX}" export MDSD_LOG="/var/opt/microsoft/linuxmonagent/log" export MDSD_SPOOL_DIRECTORY="/var/opt/microsoft/linuxmonagent" -export OMS_CERT_PATH="/etc/opt/microsoft/omsagent/certs/oms.crt" -export OMS_CERT_KEY_PATH="/etc/opt/microsoft/omsagent/certs/oms.key" #export CIWORKSPACE_id="" #export CIWORKSPACE_key="" export MDSD_OPTIONS="-A -c /etc/mdsd.d/mdsd.xml -r ${MDSD_ROLE_PREFIX} -S ${MDSD_SPOOL_DIRECTORY}/eh -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos" diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index c7d939034..b21ed6b96 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -38,41 +38,9 @@ waitforlisteneronTCPport() { fi } -if [ -e "/etc/config/kube.conf" ]; then - cat /etc/config/kube.conf > /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf -elif [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then - echo "setting omsagent conf file for prometheus sidecar" - cat /etc/opt/microsoft/docker-cimprov/prometheus-side-car.conf > /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf - # omsadmin.sh replaces %MONITOR_AGENT_PORT% and %SYSLOG_PORT% in the monitor.conf and syslog.conf with default ports 25324 and 25224. 
- # Since we are running 2 omsagents in the same pod, we need to use a different port for the sidecar, - else we will see the Address already in use - bind(2) for 0.0.0.0:253(2)24 error. - # Look into omsadmin.sh scripts's configure_monitor_agent()/configure_syslog() and find_available_port() methods for more info. - sed -i -e 's/port %MONITOR_AGENT_PORT%/port 25326/g' /etc/opt/microsoft/omsagent/sysconf/omsagent.d/monitor.conf - sed -i -e 's/port %SYSLOG_PORT%/port 25226/g' /etc/opt/microsoft/omsagent/sysconf/omsagent.d/syslog.conf -else - echo "setting omsagent conf file for daemonset" - sed -i -e 's/bind 127.0.0.1/bind 0.0.0.0/g' /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf -fi -sed -i -e 's/bind 127.0.0.1/bind 0.0.0.0/g' /etc/opt/microsoft/omsagent/sysconf/omsagent.d/syslog.conf -sed -i -e 's/^exit 101$/exit 0/g' /usr/sbin/policy-rc.d - -#Using the get_hostname for hostname instead of the host field in syslog messages -sed -i.bak "s/record\[\"Host\"\] = hostname/record\[\"Host\"\] = OMS::Common.get_hostname/" /opt/microsoft/omsagent/plugin/filter_syslog.rb - #using /var/opt/microsoft/docker-cimprov/state instead of /var/opt/microsoft/omsagent/state since the latter gets deleted during onboarding mkdir -p /var/opt/microsoft/docker-cimprov/state -#if [ ! -e "/etc/config/kube.conf" ]; then - # add permissions for omsagent user to access docker.sock - #sudo setfacl -m user:omsagent:rw /var/run/host/docker.sock -#fi - -# add permissions for omsagent user to access azure.json. -sudo setfacl -m user:omsagent:r /etc/kubernetes/host/azure.json - -# add permission for omsagent user to log folder. We also need 'x', else log rotation is failing. TODO: Investigate why. -sudo setfacl -m user:omsagent:rwx /var/opt/microsoft/docker-cimprov/log - #Run inotify as a daemon to track changes to the mounted configmap. inotifywait /etc/config/settings --daemon --recursive --outfile "/opt/inotifyoutput.txt" --event create,delete --format '%e : %T' --timefmt '+%s' @@ -89,7 +57,7 @@ else export customResourceId=$AKS_RESOURCE_ID echo "export customResourceId=$AKS_RESOURCE_ID" >> ~/.bashrc source ~/.bashrc - echo "customResourceId:$customResourceId" + echo "customResourceId:$customResourceId" fi #set agent config schema version @@ -141,7 +109,6 @@ if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "Prometheus fi export PROXY_ENDPOINT="" - # Check for internet connectivity or workspace deletion if [ -e "/etc/omsagent-secret/WSID" ]; then workspaceId=$(cat /etc/omsagent-secret/WSID) @@ -222,6 +189,7 @@ else echo "LA Onboarding:Workspace Id not mounted, skipping the telemetry check" fi + # Set environment variable for if public cloud by checking the workspace domain. if [ -z $domain ]; then ClOUD_ENVIRONMENT="unknown" @@ -233,6 +201,12 @@ fi export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT echo "export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT" >> ~/.bashrc +#consistent naming conventions with windows +export DOMAIN=$domain +echo "export DOMAIN=$DOMAIN" >> ~/.bashrc +export WSID=$workspaceId +echo "export WSID=$WSID" >> ~/.bashrc + # Check if the instrumentation key needs to be fetched from a storage account (as in airgapped clouds) if [ ${#APPLICATIONINSIGHTS_AUTH_URL} -ge 1 ]; then # (check if APPLICATIONINSIGHTS_AUTH_URL has length >=1) for BACKOFF in {1..4}; do @@ -267,7 +241,7 @@ source ~/.bashrc if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then #Parse the configmap to set the right environment variables. 
- /opt/microsoft/omsagent/ruby/bin/ruby tomlparser.rb + /usr/bin/ruby2.6 tomlparser.rb cat config_env_var | while read line; do echo $line >> ~/.bashrc @@ -278,7 +252,7 @@ fi #Parse the configmap to set the right environment variables for agent config. #Note > tomlparser-agent-config.rb has to be parsed first before td-agent-bit-conf-customizer.rb for fbit agent settings if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then - /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-agent-config.rb + /usr/bin/ruby2.6 tomlparser-agent-config.rb cat agent_config_env_var | while read line; do #echo $line @@ -287,7 +261,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then source agent_config_env_var #Parse the configmap to set the right environment variables for network policy manager (npm) integration. - /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-npm-config.rb + /usr/bin/ruby2.6 tomlparser-npm-config.rb cat integration_npm_config_env_var | while read line; do #echo $line @@ -298,11 +272,11 @@ fi #Replace the placeholders in td-agent-bit.conf file for fluentbit with custom/default values in daemonset if [ ! -e "/etc/config/kube.conf" ] && [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then - /opt/microsoft/omsagent/ruby/bin/ruby td-agent-bit-conf-customizer.rb + /usr/bin/ruby2.6 td-agent-bit-conf-customizer.rb fi #Parse the prometheus configmap to create a file with new custom settings. -/opt/microsoft/omsagent/ruby/bin/ruby tomlparser-prom-customconfig.rb +/usr/bin/ruby2.6 tomlparser-prom-customconfig.rb #Setting default environment variables to be used in any case of failure in the above steps if [ ! -e "/etc/config/kube.conf" ]; then @@ -335,7 +309,7 @@ fi #Parse the configmap to set the right environment variables for MDM metrics configuration for Alerting. if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then - /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-mdm-metrics-config.rb + /usr/bin/ruby2.6 tomlparser-mdm-metrics-config.rb cat config_mdm_metrics_env_var | while read line; do echo $line >> ~/.bashrc @@ -343,7 +317,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then source config_mdm_metrics_env_var #Parse the configmap to set the right environment variables for metric collection settings - /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-metric-collection-config.rb + /usr/bin/ruby2.6 tomlparser-metric-collection-config.rb cat config_metric_collection_env_var | while read line; do echo $line >> ~/.bashrc @@ -354,7 +328,7 @@ fi # OSM scraping to be done in replicaset if sidecar car scraping is disabled and always do the scraping from the sidecar (It will always be either one of the two) if [[ ( ( ! 
-e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) ) || ( ( -e "/etc/config/kube.conf" ) && ( "${SIDECAR_SCRAPING_ENABLED}" == "false" ) ) ]]; then - /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-osm-config.rb + /usr/bin/ruby2.6 tomlparser-osm-config.rb if [ -e "integration_osm_config_env_var" ]; then cat integration_osm_config_env_var | while read line; do @@ -432,26 +406,11 @@ export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="kubelet_docker_operations_error if [ "$CONTAINER_RUNTIME" != "docker" ]; then # these metrics are avialble only on k8s versions <1.18 and will get deprecated from 1.18 export KUBELET_RUNTIME_OPERATIONS_METRIC="kubelet_runtime_operations" - export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="kubelet_runtime_operations_errors" -else - #if container run time is docker then add omsagent user to local docker group to get access to docker.sock - # docker.sock only use for the telemetry to get the docker version - DOCKER_SOCKET=/var/run/host/docker.sock - DOCKER_GROUP=docker - REGULAR_USER=omsagent - if [ -S ${DOCKER_SOCKET} ]; then - echo "getting gid for docker.sock" - DOCKER_GID=$(stat -c '%g' ${DOCKER_SOCKET}) - echo "creating a local docker group" - groupadd -for -g ${DOCKER_GID} ${DOCKER_GROUP} - echo "adding omsagent user to local docker group" - usermod -aG ${DOCKER_GROUP} ${REGULAR_USER} - fi + export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="kubelet_runtime_operations_errors" fi echo "set caps for ruby process to read container env from proc" -sudo setcap cap_sys_ptrace,cap_dac_read_search+ep /opt/microsoft/omsagent/ruby/bin/ruby - +sudo setcap cap_sys_ptrace,cap_dac_read_search+ep /usr/bin/ruby2.6 echo "export KUBELET_RUNTIME_OPERATIONS_METRIC="$KUBELET_RUNTIME_OPERATIONS_METRIC >> ~/.bashrc echo "export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="$KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC >> ~/.bashrc @@ -461,171 +420,70 @@ echo $NODE_NAME > /var/opt/microsoft/docker-cimprov/state/containerhostname #check if file was written successfully. cat /var/opt/microsoft/docker-cimprov/state/containerhostname - -#Commenting it for test. 
We do this in the installer now -#Setup sudo permission for containerlogtailfilereader -#chmod +w /etc/sudoers.d/omsagent -#echo "#run containerlogtailfilereader.rb for docker-provider" >> /etc/sudoers.d/omsagent -#echo "omsagent ALL=(ALL) NOPASSWD: /opt/microsoft/omsagent/ruby/bin/ruby /opt/microsoft/omsagent/plugin/containerlogtailfilereader.rb *" >> /etc/sudoers.d/omsagent -#chmod 440 /etc/sudoers.d/omsagent - -#Disable dsc -#/opt/microsoft/omsconfig/Scripts/OMS_MetaConfigHelper.py --disable -rm -f /etc/opt/microsoft/omsagent/conf/omsagent.d/omsconfig.consistencyinvoker.conf - -CIWORKSPACE_id="" -CIWORKSPACE_key="" - -if [ -z $INT ]; then - if [ -a /etc/omsagent-secret/PROXY ]; then - if [ -a /etc/omsagent-secret/DOMAIN ]; then - /opt/microsoft/omsagent/bin/omsadmin.sh -w `cat /etc/omsagent-secret/WSID` -s `cat /etc/omsagent-secret/KEY` -d `cat /etc/omsagent-secret/DOMAIN` -p `cat /etc/omsagent-secret/PROXY` - else - /opt/microsoft/omsagent/bin/omsadmin.sh -w `cat /etc/omsagent-secret/WSID` -s `cat /etc/omsagent-secret/KEY` -p `cat /etc/omsagent-secret/PROXY` - fi - CIWORKSPACE_id="$(cat /etc/omsagent-secret/WSID)" - CIWORKSPACE_key="$(cat /etc/omsagent-secret/KEY)" - elif [ -a /etc/omsagent-secret/DOMAIN ]; then - /opt/microsoft/omsagent/bin/omsadmin.sh -w `cat /etc/omsagent-secret/WSID` -s `cat /etc/omsagent-secret/KEY` -d `cat /etc/omsagent-secret/DOMAIN` - CIWORKSPACE_id="$(cat /etc/omsagent-secret/WSID)" - CIWORKSPACE_key="$(cat /etc/omsagent-secret/KEY)" - elif [ -a /etc/omsagent-secret/WSID ]; then - /opt/microsoft/omsagent/bin/omsadmin.sh -w `cat /etc/omsagent-secret/WSID` -s `cat /etc/omsagent-secret/KEY` - CIWORKSPACE_id="$(cat /etc/omsagent-secret/WSID)" - CIWORKSPACE_key="$(cat /etc/omsagent-secret/KEY)" - elif [ -a /run/secrets/DOMAIN ]; then - /opt/microsoft/omsagent/bin/omsadmin.sh -w `cat /run/secrets/WSID` -s `cat /run/secrets/KEY` -d `cat /run/secrets/DOMAIN` - CIWORKSPACE_id="$(cat /run/secrets/WSID)" - CIWORKSPACE_key="$(cat /run/secrets/KEY)" - elif [ -a /run/secrets/WSID ]; then - /opt/microsoft/omsagent/bin/omsadmin.sh -w `cat /run/secrets/WSID` -s `cat /run/secrets/KEY` - CIWORKSPACE_id="$(cat /run/secrets/WSID)" - CIWORKSPACE_key="$(cat /run/secrets/KEY)" - elif [ -z $DOMAIN ]; then - /opt/microsoft/omsagent/bin/omsadmin.sh -w $WSID -s $KEY - CIWORKSPACE_id="$(cat /etc/omsagent-secret/WSID)" - CIWORKSPACE_key="$(cat /etc/omsagent-secret/KEY)" - else - /opt/microsoft/omsagent/bin/omsadmin.sh -w $WSID -s $KEY -d $DOMAIN - CIWORKSPACE_id="$WSID" - CIWORKSPACE_key="$KEY" - fi -else -#To onboard to INT workspace - workspace-id (WSID-not base64 encoded), workspace-key (KEY-not base64 encoded), Domain(DOMAIN-int2.microsoftatlanta-int.com) -#need to be added to omsagent.yaml. 
- echo WORKSPACE_ID=$WSID > /etc/omsagent-onboard.conf - echo SHARED_KEY=$KEY >> /etc/omsagent-onboard.conf - echo URL_TLD=$DOMAIN >> /etc/omsagent-onboard.conf - /opt/microsoft/omsagent/bin/omsadmin.sh - CIWORKSPACE_id="$WSID" - CIWORKSPACE_key="$KEY" -fi - #start cron daemon for logrotate service cron start +#get docker-provider versions -#check if agent onboarded successfully -/opt/microsoft/omsagent/bin/omsadmin.sh -l - -#get omsagent and docker-provider versions -dpkg -l | grep omsagent | awk '{print $2 " " $3}' dpkg -l | grep docker-cimprov | awk '{print $2 " " $3}' DOCKER_CIMPROV_VERSION=$(dpkg -l | grep docker-cimprov | awk '{print $3}') echo "DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION echo "export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" >> ~/.bashrc +echo "*** activating oneagent in legacy auth mode ***" +CIWORKSPACE_id="$(cat /etc/omsagent-secret/WSID)" +#use the file path as it's more secure than env +CIWORKSPACE_keyFile="/etc/omsagent-secret/KEY" +cat /etc/mdsd.d/envmdsd | while read line; do + echo $line >> ~/.bashrc +done +source /etc/mdsd.d/envmdsd +echo "setting mdsd workspaceid & key for workspace:$CIWORKSPACE_id" +export CIWORKSPACE_id=$CIWORKSPACE_id +echo "export CIWORKSPACE_id=$CIWORKSPACE_id" >> ~/.bashrc +export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile +echo "export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile" >> ~/.bashrc +export OMS_TLD=$domain +echo "export OMS_TLD=$OMS_TLD" >> ~/.bashrc +export MDSD_FLUENT_SOCKET_PORT="29230" +echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >> ~/.bashrc + +#skip imds lookup since not used in legacy auth path +export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH="true" +echo "export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH=$SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH" >> ~/.bashrc -#region check to auto-activate oneagent, to route container logs, -#Intent is to activate one agent routing for all managed clusters with region in the regionllist, unless overridden by configmap -# AZMON_CONTAINER_LOGS_ROUTE will have route (if any) specified in the config map -# AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE will have the final route that we compute & set, based on our region list logic -echo "************start oneagent log routing checks************" -# by default, use configmap route for safer side -AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE=$AZMON_CONTAINER_LOGS_ROUTE - -#trim region list -oneagentregions="$(echo $AZMON_CONTAINERLOGS_ONEAGENT_REGIONS | xargs)" -#lowercase region list -typeset -l oneagentregions=$oneagentregions -echo "oneagent regions: $oneagentregions" -#trim current region -currentregion="$(echo $AKS_REGION | xargs)" -#lowercase current region -typeset -l currentregion=$currentregion -echo "current region: $currentregion" - -#initilze isoneagentregion as false -isoneagentregion=false - -#set isoneagentregion as true if matching region is found -if [ ! -z $oneagentregions ] && [ ! -z $currentregion ]; then - for rgn in $(echo $oneagentregions | sed "s/,/ /g"); do - if [ "$rgn" == "$currentregion" ]; then - isoneagentregion=true - echo "current region is in oneagent regions..." - break - fi - done -else - echo "current region is not in oneagent regions..." -fi source ~/.bashrc -if [ "$isoneagentregion" = true ]; then - #if configmap has a routing for logs, but current region is in the oneagent region list, take the configmap route - if [ ! 
-z $AZMON_CONTAINER_LOGS_ROUTE ]; then - AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE=$AZMON_CONTAINER_LOGS_ROUTE - echo "oneagent region is true for current region:$currentregion and config map logs route is not empty. so using config map logs route as effective route:$AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE" - else #there is no configmap route, so route thru oneagent - AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE="v2" - echo "oneagent region is true for current region:$currentregion and config map logs route is empty. so using oneagent as effective route:$AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE" - fi -else - echo "oneagent region is false for current region:$currentregion" +dpkg -l | grep mdsd | awk '{print $2 " " $3}' + +if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then + echo "starting mdsd with mdsd-port=26130, fluentport=26230 and influxport=26330 in legacy auth mode in sidecar container..." + #use tenant name to avoid unix socket conflict and different ports for port conflict + #roleprefix to use container specific mdsd socket + export TENANT_NAME="${CONTAINER_TYPE}" + echo "export TENANT_NAME=$TENANT_NAME" >> ~/.bashrc + export MDSD_ROLE_PREFIX=/var/run/mdsd-${CONTAINER_TYPE}/default + echo "export MDSD_ROLE_PREFIX=$MDSD_ROLE_PREFIX" >> ~/.bashrc + source ~/.bashrc + mkdir /var/run/mdsd-${CONTAINER_TYPE} + # add -T 0xFFFF for full traces + mdsd -r ${MDSD_ROLE_PREFIX} -p 26130 -f 26230 -i 26330 -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & +else + echo "starting mdsd in legacy auth mode in main container..." + # add -T 0xFFFF for full traces + mdsd -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & fi - -#start oneagent -if [ ! -e "/etc/config/kube.conf" ] && [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then - if [ ! -z $AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE ]; then - echo "container logs configmap route is $AZMON_CONTAINER_LOGS_ROUTE" - echo "container logs effective route is $AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE" - #trim - containerlogsroute="$(echo $AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE | xargs)" - # convert to lowercase - typeset -l containerlogsroute=$containerlogsroute - - echo "setting AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE as :$containerlogsroute" - export AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE=$containerlogsroute - echo "export AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE=$containerlogsroute" >> ~/.bashrc - source ~/.bashrc - - if [ "$containerlogsroute" == "v2" ]; then - echo "activating oneagent..." - echo "configuring mdsd..." - cat /etc/mdsd.d/envmdsd | while read line; do - echo $line >> ~/.bashrc - done - source /etc/mdsd.d/envmdsd - - echo "setting mdsd workspaceid & key for workspace:$CIWORKSPACE_id" - export CIWORKSPACE_id=$CIWORKSPACE_id - echo "export CIWORKSPACE_id=$CIWORKSPACE_id" >> ~/.bashrc - export CIWORKSPACE_key=$CIWORKSPACE_key - echo "export CIWORKSPACE_key=$CIWORKSPACE_key" >> ~/.bashrc - - source ~/.bashrc - - dpkg -l | grep mdsd | awk '{print $2 " " $3}' - - echo "starting mdsd ..." - mdsd -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & - - touch /opt/AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE_V2 - fi - fi -fi -echo "************end oneagent log routing checks************" +# no dependency on fluentd for prometheus side car container +if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then + if [ ! 
-e "/etc/config/kube.conf" ]; then + echo "*** starting fluentd v1 in daemonset" + fluentd -c /etc/fluent/container.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log & + else + echo "*** starting fluentd v1 in replicaset" + fluentd -c /etc/fluent/kube.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log & + fi +fi #If config parsing was successful, a copy of the conf file with replaced custom settings file is created if [ ! -e "/etc/config/kube.conf" ]; then @@ -749,12 +607,9 @@ dpkg -l | grep td-agent-bit | awk '{print $2 " " $3}' #dpkg -l | grep telegraf | awk '{print $2 " " $3}' - - # Write messages from the liveness probe to stdout (so telemetry picks it up) touch /dev/write-to-traces - echo "stopping rsyslog..." service rsyslog stop @@ -762,7 +617,7 @@ echo "getting rsyslog status..." service rsyslog status shutdown() { - /opt/microsoft/omsagent/bin/service_control stop + pkill -f mdsd } trap "shutdown" SIGTERM diff --git a/kubernetes/linux/mdsd.xml b/kubernetes/linux/mdsd.xml index 49d329791..de14240aa 100644 --- a/kubernetes/linux/mdsd.xml +++ b/kubernetes/linux/mdsd.xml @@ -47,6 +47,149 @@ Each column has a name, an augmented JSON source type, and a target MDS type. --> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -68,14 +211,33 @@ + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - ]]> @@ -143,11 +360,95 @@ - - ]]> + + + + + ]]> + + + + + + + + ]]> + + + + + + + + ]]> + + + + + + + + ]]> + + + + + + + + ]]> + + + + + + + + ]]> + + + + + + + + ]]> + + + + + + + + ]]> + + + + + + + + ]]> + + + + + + + ]]> + + + + + + + ]]> + + + diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index f065cc165..3d00e4c57 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -9,37 +9,13 @@ sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \ dpkg-reconfigure --frontend=noninteractive locales && \ update-locale LANG=en_US.UTF-8 -wget https://github.com/Microsoft/OMS-Agent-for-Linux/releases/download/OMSAgent_v1.10.0-1/omsagent-1.10.0-1.universal.x64.sh +#install oneagent - Official bits (05/17/2021) +wget https://github.com/microsoft/Docker-Provider/releases/download/05172021-oneagent/azure-mdsd_1.10.1-build.master.213_x86_64.deb -#create file to disable omi service startup script -touch /etc/.omi_disable_service_control - -chmod 775 $TMPDIR/*.sh - -#Extract omsbundle -$TMPDIR/omsagent-*.universal.x64.sh --extract -mv $TMPDIR/omsbundle* $TMPDIR/omsbundle -#Install omi -/usr/bin/dpkg -i $TMPDIR/omsbundle/110/omi*.deb - -#Install scx -/usr/bin/dpkg -i $TMPDIR/omsbundle/110/scx*.deb -#$TMPDIR/omsbundle/bundles/scx-1.6.*-*.universal.x64.sh --install - -#Install omsagent - -/usr/bin/dpkg -i $TMPDIR/omsbundle/110/omsagent*.deb -#/usr/bin/dpkg -i $TMPDIR/omsbundle/100/omsconfig*.deb - -#install oneagent - Official bits (05/2021) -wget https://github.com/microsoft/Docker-Provider/releases/download/05112021-oneagent/azure-mdsd_1.8.0-build.master.189_x86_64.deb /usr/bin/dpkg -i $TMPDIR/azure-mdsd*.deb cp -f $TMPDIR/mdsd.xml /etc/mdsd.d cp -f $TMPDIR/envmdsd /etc/mdsd.d -#Assign permissions to omsagent user to access docker.sock 
-sudo apt-get install acl - #download inotify tools for watching configmap changes sudo apt-get update sudo apt-get install inotify-tools -y @@ -49,18 +25,7 @@ sudo apt-get install inotify-tools -y sudo apt-get install jq=1.5+dfsg-2 -y #used to setcaps for ruby process to read /proc/env -echo "installing libcap2-bin" sudo apt-get install libcap2-bin -y -#/$TMPDIR/omsbundle/oss-kits/docker-cimprov-1.0.0-*.x86_64.sh --install -#Use downloaded docker-provider instead of the bundled one - -#download and install telegraf -#wget https://dl.influxdata.com/telegraf/releases/telegraf_1.10.1-1_amd64.deb -#sudo dpkg -i telegraf_1.10.1-1_amd64.deb - -#service telegraf stop - -#wget https://github.com/microsoft/Docker-Provider/releases/download/5.0.0.0/telegraf #1.18 pre-release wget https://dl.influxdata.com/telegraf/releases/telegraf-1.18.0_linux_amd64.tar.gz @@ -79,8 +44,17 @@ sudo echo "deb https://packages.fluentbit.io/ubuntu/xenial xenial main" >> /etc/ sudo apt-get update sudo apt-get install td-agent-bit=1.6.8 -y -rm -rf $TMPDIR/omsbundle -rm -f $TMPDIR/omsagent*.sh +# install ruby2.6 +sudo apt-get install software-properties-common -y +sudo apt-add-repository ppa:brightbox/ruby-ng -y +sudo apt-get update +sudo apt-get install ruby2.6 ruby2.6-dev gcc make -y +# fluentd v1 gem +gem install fluentd -v "1.12.2" --no-document +fluentd --setup ./fluent +gem install gyoku iso8601 --no-doc + + rm -f $TMPDIR/docker-cimprov*.sh rm -f $TMPDIR/azure-mdsd*.deb rm -f $TMPDIR/mdsd.xml diff --git a/source/plugins/go/src/oms.go b/source/plugins/go/src/oms.go index d35acad3d..25f364c55 100644 --- a/source/plugins/go/src/oms.go +++ b/source/plugins/go/src/oms.go @@ -92,15 +92,24 @@ const kubeMonAgentConfigEventFlushInterval = 60 //Eventsource name in mdsd const MdsdContainerLogSourceName = "ContainerLogSource" const MdsdContainerLogV2SourceName = "ContainerLogV2Source" +const MdsdKubeMonAgentEventsSourceName = "KubeMonAgentEventsSource" +const MdsdInsightsMetricsSourceName = "InsightsMetricsSource" -//container logs route (v2=flush to oneagent, adx= flush to adx ingestion, anything else flush to ODS[default]) +//container logs route (v2=flush to oneagent, adx= flush to adx ingestion, v1 for ODS Direct) const ContainerLogsV2Route = "v2" const ContainerLogsADXRoute = "adx" +//fallback option v1 route i.e. ODS direct if required in any case +const ContainerLogsV1Route = "v1" + //container logs schema (v2=ContainerLogsV2 table in LA, anything else ContainerLogs table in LA. 
This is applicable only if Container logs route is NOT ADX) const ContainerLogV2SchemaVersion = "v2" + +//env variable for container type +const ContainerTypeEnv = "CONTAINER_TYPE" + var ( // PluginConfiguration the plugins configuration PluginConfiguration map[string]string @@ -108,6 +117,10 @@ var ( HTTPClient http.Client // Client for MDSD msgp Unix socket MdsdMsgpUnixSocketClient net.Conn + // Client for MDSD msgp Unix socket for KubeMon Agent events + MdsdKubeMonMsgpUnixSocketClient net.Conn + // Client for MDSD msgp Unix socket for Insights Metrics + MdsdInsightsMetricsMsgpUnixSocketClient net.Conn // Ingestor for ADX ADXIngestor *ingest.Ingestion // OMSEndpoint ingestion endpoint @@ -116,6 +129,8 @@ var ( Computer string // WorkspaceID log analytics workspace id WorkspaceID string + // LogAnalyticsWorkspaceDomain log analytics workspace domain + LogAnalyticsWorkspaceDomain string // ResourceID for resource-centric log analytics data ResourceID string // Resource-centric flag (will be true if we determine if above RseourceID is non-empty - default is false) @@ -143,7 +158,17 @@ var ( // ADX tenantID AdxTenantID string //ADX client secret - AdxClientSecret string + AdxClientSecret string + // container log or container log v2 tag name for oneagent route + MdsdContainerLogTagName string + // kubemonagent events tag name for oneagent route + MdsdKubeMonAgentEventsTagName string + // InsightsMetrics tag name for oneagent route + MdsdInsightsMetricsTagName string + // flag to check if it's Windows OS + IsWindows bool + // container type + ContainerType string ) var ( @@ -314,6 +339,15 @@ const ( PromScrapingError ) +// DataType to be used as enum per data type socket client creation +type DataType int +const ( + // DataType to be used as enum per data type socket client creation + ContainerLogV2 DataType = iota + KubeMonAgentEvents + InsightsMetrics +) + func createLogger() *log.Logger { var logfile *os.File @@ -532,6 +566,7 @@ func flushKubeMonAgentEventRecords() { start := time.Now() var elapsed time.Duration var laKubeMonAgentEventsRecords []laKubeMonAgentEvents + var msgPackEntries []MsgPackEntry telemetryDimensions := make(map[string]string) telemetryDimensions["ConfigErrorEventCount"] = strconv.Itoa(len(ConfigErrorEvent)) @@ -558,7 +593,25 @@ func flushKubeMonAgentEventRecords() { Message: k, Tags: fmt.Sprintf("%s", tagJson), } - laKubeMonAgentEventsRecords = append(laKubeMonAgentEventsRecords, laKubeMonAgentEventsRecord) + laKubeMonAgentEventsRecords = append(laKubeMonAgentEventsRecords, laKubeMonAgentEventsRecord) + var stringMap map[string]string + jsonBytes, err := json.Marshal(&laKubeMonAgentEventsRecord) + if err != nil { + message := fmt.Sprintf("Error while Marshalling laKubeMonAgentEventsRecord to json bytes: %s", err.Error()) + Log(message) + SendException(message) + } else { + if err := json.Unmarshal(jsonBytes, &stringMap); err != nil { + message := fmt.Sprintf("Error while UnMarshalling json bytes to stringmap: %s", err.Error()) + Log(message) + SendException(message) + } else { + msgPackEntry := MsgPackEntry{ + Record: stringMap, + } + msgPackEntries = append(msgPackEntries, msgPackEntry) + } + } } } @@ -579,7 +632,25 @@ func flushKubeMonAgentEventRecords() { Message: k, Tags: fmt.Sprintf("%s", tagJson), } - laKubeMonAgentEventsRecords = append(laKubeMonAgentEventsRecords, laKubeMonAgentEventsRecord) + laKubeMonAgentEventsRecords = append(laKubeMonAgentEventsRecords, laKubeMonAgentEventsRecord) + var stringMap map[string]string + jsonBytes, err := 
json.Marshal(&laKubeMonAgentEventsRecord) + if err != nil { + message := fmt.Sprintf("Error while Marshalling laKubeMonAgentEventsRecord to json bytes: %s", err.Error()) + Log(message) + SendException(message) + } else { + if err := json.Unmarshal(jsonBytes, &stringMap); err != nil { + message := fmt.Sprintf("Error while UnMarhalling json bytes to stringmap: %s", err.Error()) + Log(message) + SendException(message) + } else { + msgPackEntry := MsgPackEntry{ + Record: stringMap, + } + msgPackEntries = append(msgPackEntries, msgPackEntry) + } + } } } @@ -610,11 +681,63 @@ func flushKubeMonAgentEventRecords() { Message: "No errors", Tags: fmt.Sprintf("%s", tagJson), } - laKubeMonAgentEventsRecords = append(laKubeMonAgentEventsRecords, laKubeMonAgentEventsRecord) + laKubeMonAgentEventsRecords = append(laKubeMonAgentEventsRecords, laKubeMonAgentEventsRecord) + var stringMap map[string]string + jsonBytes, err := json.Marshal(&laKubeMonAgentEventsRecord) + if err != nil { + message := fmt.Sprintf("Error while Marshalling laKubeMonAgentEventsRecord to json bytes: %s", err.Error()) + Log(message) + SendException(message) + } else { + if err := json.Unmarshal(jsonBytes, &stringMap); err != nil { + message := fmt.Sprintf("Error while UnMarshalling json bytes to stringmap: %s", err.Error()) + Log(message) + SendException(message) + } else { + msgPackEntry := MsgPackEntry{ + Record: stringMap, + } + msgPackEntries = append(msgPackEntries, msgPackEntry) + } + } } } - - if len(laKubeMonAgentEventsRecords) > 0 { + if (IsWindows == false && len(msgPackEntries) > 0) { //for linux, mdsd route + Log("Info::mdsd:: using mdsdsource name for KubeMonAgentEvents: %s", MdsdKubeMonAgentEventsTagName) + msgpBytes := convertMsgPackEntriesToMsgpBytes(MdsdKubeMonAgentEventsTagName, msgPackEntries) + if MdsdKubeMonMsgpUnixSocketClient == nil { + Log("Error::mdsd::mdsd connection for KubeMonAgentEvents does not exist. re-connecting ...") + CreateMDSDClient(KubeMonAgentEvents, ContainerType) + if MdsdKubeMonMsgpUnixSocketClient == nil { + Log("Error::mdsd::Unable to create mdsd client for KubeMonAgentEvents. Please check error log.") + ContainerLogTelemetryMutex.Lock() + defer ContainerLogTelemetryMutex.Unlock() + KubeMonEventsMDSDClientCreateErrors += 1 + } + } + if MdsdKubeMonMsgpUnixSocketClient != nil { + deadline := 10 * time.Second + MdsdKubeMonMsgpUnixSocketClient.SetWriteDeadline(time.Now().Add(deadline)) //this is based of clock time, so cannot reuse + bts, er := MdsdKubeMonMsgpUnixSocketClient.Write(msgpBytes) + elapsed = time.Since(start) + if er != nil { + message := fmt.Sprintf("Error::mdsd::Failed to write to kubemonagent mdsd %d records after %s. Will retry ... error : %s", len(msgPackEntries), elapsed, er.Error()) + Log(message) + if MdsdKubeMonMsgpUnixSocketClient != nil { + MdsdKubeMonMsgpUnixSocketClient.Close() + MdsdKubeMonMsgpUnixSocketClient = nil + } + SendException(message) + } else { + numRecords := len(msgPackEntries) + Log("FlushKubeMonAgentEventRecords::Info::Successfully flushed %d records that was %d bytes in %s", numRecords, bts, elapsed) + // Send telemetry to AppInsights resource + SendEvent(KubeMonAgentEventsFlushedEvent, telemetryDimensions) + } + } else { + Log("Error::mdsd::Unable to create mdsd client for KubeMonAgentEvents. 
Please check error log.") + } + } else if len(laKubeMonAgentEventsRecords) > 0 { //for windows, ODS direct kubeMonAgentEventEntry := KubeMonAgentEventBlob{ DataType: KubeMonAgentEventDataType, IPName: IPName, @@ -746,70 +869,144 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int message := fmt.Sprintf("PostTelegrafMetricsToLA::Info:derived %v metrics from %v timeseries", len(laMetrics), len(telegrafRecords)) Log(message) } + + if IsWindows == false { //for linux, mdsd route + var msgPackEntries []MsgPackEntry + var i int + start := time.Now() + var elapsed time.Duration + + for i = 0; i < len(laMetrics); i++ { + var interfaceMap map[string]interface{} + stringMap := make(map[string]string) + jsonBytes, err := json.Marshal(*laMetrics[i]) + if err != nil { + message := fmt.Sprintf("PostTelegrafMetricsToLA::Error:when marshalling json %q", err) + Log(message) + SendException(message) + return output.FLB_OK + } else { + if err := json.Unmarshal(jsonBytes, &interfaceMap); err != nil { + message := fmt.Sprintf("Error while UnMarshalling json bytes to interfaceMap: %s", err.Error()) + Log(message) + SendException(message) + return output.FLB_OK + } else { + for key, value := range interfaceMap { + strKey := fmt.Sprintf("%v", key) + strValue := fmt.Sprintf("%v", value) + stringMap[strKey] = strValue + } + msgPackEntry := MsgPackEntry{ + Record: stringMap, + } + msgPackEntries = append(msgPackEntries, msgPackEntry) + } + } + } + if (len(msgPackEntries) > 0) { + msgpBytes := convertMsgPackEntriesToMsgpBytes(MdsdInsightsMetricsTagName, msgPackEntries) + if MdsdInsightsMetricsMsgpUnixSocketClient == nil { + Log("Error::mdsd::mdsd connection does not exist. re-connecting ...") + CreateMDSDClient(InsightsMetrics, ContainerType) + if MdsdInsightsMetricsMsgpUnixSocketClient == nil { + Log("Error::mdsd::Unable to create mdsd client for insights metrics. Please check error log.") + ContainerLogTelemetryMutex.Lock() + defer ContainerLogTelemetryMutex.Unlock() + InsightsMetricsMDSDClientCreateErrors += 1 + return output.FLB_RETRY + } + } - var metrics []laTelegrafMetric - var i int + deadline := 10 * time.Second + MdsdInsightsMetricsMsgpUnixSocketClient.SetWriteDeadline(time.Now().Add(deadline)) //this is based of clock time, so cannot reuse + bts, er := MdsdInsightsMetricsMsgpUnixSocketClient.Write(msgpBytes) - for i = 0; i < len(laMetrics); i++ { - metrics = append(metrics, *laMetrics[i]) - } + elapsed = time.Since(start) - laTelegrafMetrics := InsightsMetricsBlob{ - DataType: InsightsMetricsDataType, - IPName: IPName, - DataItems: metrics} + if er != nil { + Log("Error::mdsd::Failed to write to mdsd %d records after %s. Will retry ... 
error : %s", len(msgPackEntries), elapsed, er.Error()) + if MdsdInsightsMetricsMsgpUnixSocketClient != nil { + MdsdInsightsMetricsMsgpUnixSocketClient.Close() + MdsdInsightsMetricsMsgpUnixSocketClient = nil + } - jsonBytes, err := json.Marshal(laTelegrafMetrics) + ContainerLogTelemetryMutex.Lock() + defer ContainerLogTelemetryMutex.Unlock() + InsightsMetricsMDSDClientCreateErrors += 1 + return output.FLB_RETRY + } else { + numTelegrafMetricsRecords := len(msgPackEntries) + Log("Success::mdsd::Successfully flushed %d telegraf metrics records that was %d bytes to mdsd in %s ", numTelegrafMetricsRecords, bts, elapsed) + } + } + + } else { // for windows, ODS direct - if err != nil { - message := fmt.Sprintf("PostTelegrafMetricsToLA::Error:when marshalling json %q", err) - Log(message) - SendException(message) - return output.FLB_OK - } + var metrics []laTelegrafMetric + var i int - //Post metrics data to LA - req, _ := http.NewRequest("POST", OMSEndpoint, bytes.NewBuffer(jsonBytes)) + for i = 0; i < len(laMetrics); i++ { + metrics = append(metrics, *laMetrics[i]) + } - //req.URL.Query().Add("api-version","2016-04-01") + laTelegrafMetrics := InsightsMetricsBlob{ + DataType: InsightsMetricsDataType, + IPName: IPName, + DataItems: metrics} - //set headers - req.Header.Set("x-ms-date", time.Now().Format(time.RFC3339)) - req.Header.Set("User-Agent", userAgent) - reqID := uuid.New().String() - req.Header.Set("X-Request-ID", reqID) + jsonBytes, err := json.Marshal(laTelegrafMetrics) - //expensive to do string len for every request, so use a flag - if ResourceCentric == true { - req.Header.Set("x-ms-AzureResourceId", ResourceID) - } + if err != nil { + message := fmt.Sprintf("PostTelegrafMetricsToLA::Error:when marshalling json %q", err) + Log(message) + SendException(message) + return output.FLB_OK + } - start := time.Now() - resp, err := HTTPClient.Do(req) - elapsed := time.Since(start) + //Post metrics data to LA + req, _ := http.NewRequest("POST", OMSEndpoint, bytes.NewBuffer(jsonBytes)) - if err != nil { - message := fmt.Sprintf("PostTelegrafMetricsToLA::Error:(retriable) when sending %v metrics. duration:%v err:%q \n", len(laMetrics), elapsed, err.Error()) - Log(message) - UpdateNumTelegrafMetricsSentTelemetry(0, 1, 0) - return output.FLB_RETRY - } + //req.URL.Query().Add("api-version","2016-04-01") - if resp == nil || resp.StatusCode != 200 { - if resp != nil { - Log("PostTelegrafMetricsToLA::Error:(retriable) RequestID %s Response Status %v Status Code %v", reqID, resp.Status, resp.StatusCode) + //set headers + req.Header.Set("x-ms-date", time.Now().Format(time.RFC3339)) + req.Header.Set("User-Agent", userAgent) + reqID := uuid.New().String() + req.Header.Set("X-Request-ID", reqID) + + //expensive to do string len for every request, so use a flag + if ResourceCentric == true { + req.Header.Set("x-ms-AzureResourceId", ResourceID) } - if resp != nil && resp.StatusCode == 429 { - UpdateNumTelegrafMetricsSentTelemetry(0, 1, 1) + + start := time.Now() + resp, err := HTTPClient.Do(req) + elapsed := time.Since(start) + + if err != nil { + message := fmt.Sprintf("PostTelegrafMetricsToLA::Error:(retriable) when sending %v metrics. 
duration:%v err:%q \n", len(laMetrics), elapsed, err.Error()) + Log(message) + UpdateNumTelegrafMetricsSentTelemetry(0, 1, 0) + return output.FLB_RETRY + } + + if resp == nil || resp.StatusCode != 200 { + if resp != nil { + Log("PostTelegrafMetricsToLA::Error:(retriable) RequestID %s Response Status %v Status Code %v", reqID, resp.Status, resp.StatusCode) + } + if resp != nil && resp.StatusCode == 429 { + UpdateNumTelegrafMetricsSentTelemetry(0, 1, 1) + } + return output.FLB_RETRY } - return output.FLB_RETRY - } - defer resp.Body.Close() + defer resp.Body.Close() - numMetrics := len(laMetrics) - UpdateNumTelegrafMetricsSentTelemetry(numMetrics, 0, 0) - Log("PostTelegrafMetricsToLA::Info:Successfully flushed %v records in %v", numMetrics, elapsed) + numMetrics := len(laMetrics) + UpdateNumTelegrafMetricsSentTelemetry(numMetrics, 0, 0) + Log("PostTelegrafMetricsToLA::Info:Successfully flushed %v records in %v", numMetrics, elapsed) + } return output.FLB_OK } @@ -986,13 +1183,9 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { numContainerLogRecords := 0 if len(msgPackEntries) > 0 && ContainerLogsRouteV2 == true { - //flush to mdsd - mdsdSourceName := MdsdContainerLogSourceName - if (ContainerLogSchemaV2 == true) { - mdsdSourceName = MdsdContainerLogV2SourceName - } + //flush to mdsd fluentForward := MsgPackForward{ - Tag: mdsdSourceName, + Tag: MdsdContainerLogTagName, Entries: msgPackEntries, } @@ -1019,7 +1212,7 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { if MdsdMsgpUnixSocketClient == nil { Log("Error::mdsd::mdsd connection does not exist. re-connecting ...") - CreateMDSDClient() + CreateMDSDClient(ContainerLogV2, ContainerType) if MdsdMsgpUnixSocketClient == nil { Log("Error::mdsd::Unable to create mdsd client. Please check error log.") @@ -1286,21 +1479,31 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { log.Fatalln(message) } - osType := os.Getenv("OS_TYPE") + ContainerType = os.Getenv(ContainerTypeEnv) + Log("Container Type %s", ContainerType) + osType := os.Getenv("OS_TYPE") + IsWindows = false // Linux if strings.Compare(strings.ToLower(osType), "windows") != 0 { Log("Reading configuration for Linux from %s", pluginConfPath) - omsadminConf, err := ReadConfiguration(pluginConfig["omsadmin_conf_path"]) - if err != nil { - message := fmt.Sprintf("Error Reading omsadmin configuration %s\n", err.Error()) + WorkspaceID = os.Getenv("WSID") + if WorkspaceID == "" { + message := fmt.Sprintf("WorkspaceID shouldnt be empty") Log(message) SendException(message) time.Sleep(30 * time.Second) log.Fatalln(message) } - OMSEndpoint = omsadminConf["OMS_ENDPOINT"] - WorkspaceID = omsadminConf["WORKSPACE_ID"] + LogAnalyticsWorkspaceDomain = os.Getenv("DOMAIN") + if LogAnalyticsWorkspaceDomain == "" { + message := fmt.Sprintf("Workspace DOMAIN shouldnt be empty") + Log(message) + SendException(message) + time.Sleep(30 * time.Second) + log.Fatalln(message) + } + OMSEndpoint = "https://" + WorkspaceID + ".ods." 
+ LogAnalyticsWorkspaceDomain + "/OperationalData.svc/PostJsonDataItems" // Populate Computer field containerHostName, err1 := ioutil.ReadFile(pluginConfig["container_host_file_path"]) if err1 != nil { @@ -1329,6 +1532,7 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { } } else { // windows + IsWindows = true Computer = os.Getenv("HOSTNAME") WorkspaceID = os.Getenv("WSID") logAnalyticsDomain := os.Getenv("DOMAIN") @@ -1410,21 +1614,15 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { Log(message) } - PluginConfiguration = pluginConfig - - CreateHTTPClient() + PluginConfiguration = pluginConfig - ContainerLogsRoute := strings.TrimSpace(strings.ToLower(os.Getenv("AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE"))) - Log("AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE:%s", ContainerLogsRoute) + ContainerLogsRoute := strings.TrimSpace(strings.ToLower(os.Getenv("AZMON_CONTAINER_LOGS_ROUTE"))) + Log("AZMON_CONTAINER_LOGS_ROUTE:%s", ContainerLogsRoute) - ContainerLogsRouteV2 = false //default is ODS - ContainerLogsRouteADX = false //default is LA + ContainerLogsRouteV2 = false + ContainerLogsRouteADX = false - if strings.Compare(ContainerLogsRoute, ContainerLogsV2Route) == 0 && strings.Compare(strings.ToLower(osType), "windows") != 0 { - ContainerLogsRouteV2 = true - Log("Routing container logs thru %s route...", ContainerLogsV2Route) - fmt.Fprintf(os.Stdout, "Routing container logs thru %s route... \n", ContainerLogsV2Route) - } else if strings.Compare(ContainerLogsRoute, ContainerLogsADXRoute) == 0 { + if strings.Compare(ContainerLogsRoute, ContainerLogsADXRoute) == 0 { //check if adx clusteruri, clientid & secret are set var err error AdxClusterUri, err = ReadFileContents(PluginConfiguration["adx_cluster_uri_path"]) @@ -1455,14 +1653,30 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { Log("Routing container logs thru %s route...", ContainerLogsADXRoute) fmt.Fprintf(os.Stdout, "Routing container logs thru %s route...\n", ContainerLogsADXRoute) } - } + } else if strings.Compare(strings.ToLower(osType), "windows") != 0 { //for linux, oneagent will be default route + ContainerLogsRouteV2 = true //default is mdsd route + if strings.Compare(ContainerLogsRoute, ContainerLogsV1Route) == 0 { + ContainerLogsRouteV2 = false //fallback option when hiddensetting set + } + Log("Routing container logs thru %s route...", ContainerLogsRoute) + fmt.Fprintf(os.Stdout, "Routing container logs thru %s route... \n", ContainerLogsRoute) + } if ContainerLogsRouteV2 == true { - CreateMDSDClient() + CreateMDSDClient(ContainerLogV2, ContainerType) } else if ContainerLogsRouteADX == true { CreateADXClient() + } else { // v1 or windows + Log("Creating HTTP Client since either OS Platform is Windows or configmap configured with fallback option for ODS direct") + CreateHTTPClient() } + if IsWindows == false { // mdsd linux specific + Log("Creating MDSD clients for KubeMonAgentEvents & InsightsMetrics") + CreateMDSDClient(KubeMonAgentEvents, ContainerType) + CreateMDSDClient(InsightsMetrics, ContainerType) + } + ContainerLogSchemaVersion := strings.TrimSpace(strings.ToLower(os.Getenv("AZMON_CONTAINER_LOG_SCHEMA_VERSION"))) Log("AZMON_CONTAINER_LOG_SCHEMA_VERSION:%s", ContainerLogSchemaVersion) @@ -1491,4 +1705,12 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { Log("Running in replicaset. 
Disabling container enrichment caching & updates \n") } + if ContainerLogSchemaV2 == true { + MdsdContainerLogTagName = MdsdContainerLogV2SourceName + } else { + MdsdContainerLogTagName = MdsdContainerLogSourceName + } + + MdsdInsightsMetricsTagName = MdsdInsightsMetricsSourceName + MdsdKubeMonAgentEventsTagName = MdsdKubeMonAgentEventsSourceName } \ No newline at end of file diff --git a/source/plugins/go/src/telemetry.go b/source/plugins/go/src/telemetry.go index 461fdea96..4750b4624 100644 --- a/source/plugins/go/src/telemetry.go +++ b/source/plugins/go/src/telemetry.go @@ -42,6 +42,10 @@ var ( ContainerLogsSendErrorsToMDSDFromFluent float64 //Tracks the number of mdsd client create errors for containerlogs (uses ContainerLogTelemetryTicker) ContainerLogsMDSDClientCreateErrors float64 + //Tracks the number of mdsd client create errors for insightsmetrics (uses ContainerLogTelemetryTicker) + InsightsMetricsMDSDClientCreateErrors float64 + //Tracks the number of mdsd client create errors for kubemonevents (uses ContainerLogTelemetryTicker) + KubeMonEventsMDSDClientCreateErrors float64 //Tracks the number of write/send errors to ADX for containerlogs (uses ContainerLogTelemetryTicker) ContainerLogsSendErrorsToADXFromFluent float64 //Tracks the number of ADX client create errors for containerlogs (uses ContainerLogTelemetryTicker) @@ -74,6 +78,8 @@ const ( metricNameNumberofSend429ErrorsTelegrafMetrics = "TelegrafMetricsSend429ErrorCount" metricNameErrorCountContainerLogsSendErrorsToMDSDFromFluent = "ContainerLogs2MdsdSendErrorCount" metricNameErrorCountContainerLogsMDSDClientCreateError = "ContainerLogsMdsdClientCreateErrorCount" + metricNameErrorCountInsightsMetricsMDSDClientCreateError = "InsightsMetricsMDSDClientCreateErrorsCount" + metricNameErrorCountKubeMonEventsMDSDClientCreateError = "KubeMonEventsMDSDClientCreateErrorsCount" metricNameErrorCountContainerLogsSendErrorsToADXFromFluent = "ContainerLogs2ADXSendErrorCount" metricNameErrorCountContainerLogsADXClientCreateError = "ContainerLogsADXClientCreateErrorCount" @@ -112,6 +118,8 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { containerLogsMDSDClientCreateErrors := ContainerLogsMDSDClientCreateErrors containerLogsSendErrorsToADXFromFluent := ContainerLogsSendErrorsToADXFromFluent containerLogsADXClientCreateErrors := ContainerLogsADXClientCreateErrors + insightsMetricsMDSDClientCreateErrors := InsightsMetricsMDSDClientCreateErrors + kubeMonEventsMDSDClientCreateErrors := KubeMonEventsMDSDClientCreateErrors osmNamespaceCount := OSMNamespaceCount promMonitorPods := PromMonitorPods promMonitorPodsNamespaceLength := PromMonitorPodsNamespaceLength @@ -132,6 +140,8 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { ContainerLogsMDSDClientCreateErrors = 0.0 ContainerLogsSendErrorsToADXFromFluent = 0.0 ContainerLogsADXClientCreateErrors = 0.0 + InsightsMetricsMDSDClientCreateErrors = 0.0 + KubeMonEventsMDSDClientCreateErrors = 0.0 ContainerLogTelemetryMutex.Unlock() if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "daemonset") == 0 { @@ -186,6 +196,13 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { if containerLogsADXClientCreateErrors > 0.0 { TelemetryClient.Track(appinsights.NewMetricTelemetry(metricNameErrorCountContainerLogsADXClientCreateError, containerLogsADXClientCreateErrors)) } + if insightsMetricsMDSDClientCreateErrors > 0.0 { + 
TelemetryClient.Track(appinsights.NewMetricTelemetry(metricNameErrorCountInsightsMetricsMDSDClientCreateError, insightsMetricsMDSDClientCreateErrors)) + } + if kubeMonEventsMDSDClientCreateErrors > 0.0 { + TelemetryClient.Track(appinsights.NewMetricTelemetry(metricNameErrorCountKubeMonEventsMDSDClientCreateError, kubeMonEventsMDSDClientCreateErrors)) + } + start = time.Now() } } diff --git a/source/plugins/go/src/utils.go b/source/plugins/go/src/utils.go index 61d047e52..3fe5c6d0e 100644 --- a/source/plugins/go/src/utils.go +++ b/source/plugins/go/src/utils.go @@ -12,11 +12,12 @@ import ( "net/url" "os" "strings" - "time" - + "time" + "github.com/Azure/azure-kusto-go/kusto" "github.com/Azure/azure-kusto-go/kusto/ingest" "github.com/Azure/go-autorest/autorest/azure/auth" + "github.com/tinylib/msgp/msgp" ) // ReadConfiguration reads a property file @@ -62,7 +63,13 @@ func ReadConfiguration(filename string) (map[string]string, error) { // CreateHTTPClient used to create the client for sending post requests to OMSEndpoint func CreateHTTPClient() { - cert, err := tls.LoadX509KeyPair(PluginConfiguration["cert_file_path"], PluginConfiguration["key_file_path"]) + certFilePath := PluginConfiguration["cert_file_path"] + keyFilePath := PluginConfiguration["key_file_path"] + if IsWindows == false { + certFilePath = fmt.Sprintf(certFilePath, WorkspaceID) + keyFilePath = fmt.Sprintf(keyFilePath, WorkspaceID) + } + cert, err := tls.LoadX509KeyPair(certFilePath, keyFilePath) if err != nil { message := fmt.Sprintf("Error when loading cert %s", err.Error()) SendException(message) @@ -93,7 +100,7 @@ func CreateHTTPClient() { HTTPClient = http.Client{ Transport: transport, Timeout: 30 * time.Second, - } + } Log("Successfully created HTTP Client") } @@ -110,23 +117,58 @@ func ToString(s interface{}) string { } //mdsdSocketClient to write msgp messages -func CreateMDSDClient() { - if MdsdMsgpUnixSocketClient != nil { - MdsdMsgpUnixSocketClient.Close() - MdsdMsgpUnixSocketClient = nil - } - /*conn, err := fluent.New(fluent.Config{FluentNetwork:"unix", - FluentSocketPath:"/var/run/mdsd/default_fluent.socket", - WriteTimeout: 5 * time.Second, - RequestAck: true}) */ - conn, err := net.DialTimeout("unix", - "/var/run/mdsd/default_fluent.socket", 10*time.Second) - if err != nil { - Log("Error::mdsd::Unable to open MDSD msgp socket connection %s", err.Error()) - //log.Fatalf("Unable to open MDSD msgp socket connection %s", err.Error()) - } else { - Log("Successfully created MDSD msgp socket connection") - MdsdMsgpUnixSocketClient = conn +func CreateMDSDClient(dataType DataType, containerType string) { + mdsdfluentSocket := "/var/run/mdsd/default_fluent.socket" + if containerType != "" && strings.Compare(strings.ToLower(containerType), "prometheussidecar") == 0 { + mdsdfluentSocket = fmt.Sprintf("/var/run/mdsd-%s/default_fluent.socket", containerType) + } + switch dataType { + case ContainerLogV2: + if MdsdMsgpUnixSocketClient != nil { + MdsdMsgpUnixSocketClient.Close() + MdsdMsgpUnixSocketClient = nil + } + /*conn, err := fluent.New(fluent.Config{FluentNetwork:"unix", + FluentSocketPath:"/var/run/mdsd/default_fluent.socket", + WriteTimeout: 5 * time.Second, + RequestAck: true}) */ + conn, err := net.DialTimeout("unix", + mdsdfluentSocket, 10*time.Second) + if err != nil { + Log("Error::mdsd::Unable to open MDSD msgp socket connection for ContainerLogV2 %s", err.Error()) + //log.Fatalf("Unable to open MDSD msgp socket connection %s", err.Error()) + } else { + Log("Successfully created 
MDSD msgp socket connection for ContainerLogV2: %s", mdsdfluentSocket) + MdsdMsgpUnixSocketClient = conn + } + case KubeMonAgentEvents: + if MdsdKubeMonMsgpUnixSocketClient != nil { + MdsdKubeMonMsgpUnixSocketClient.Close() + MdsdKubeMonMsgpUnixSocketClient = nil + } + conn, err := net.DialTimeout("unix", + mdsdfluentSocket, 10*time.Second) + if err != nil { + Log("Error::mdsd::Unable to open MDSD msgp socket connection for KubeMon events %s", err.Error()) + //log.Fatalf("Unable to open MDSD msgp socket connection %s", err.Error()) + } else { + Log("Successfully created MDSD msgp socket connection for KubeMon events:%s", mdsdfluentSocket) + MdsdKubeMonMsgpUnixSocketClient = conn + } + case InsightsMetrics: + if MdsdInsightsMetricsMsgpUnixSocketClient != nil { + MdsdInsightsMetricsMsgpUnixSocketClient.Close() + MdsdInsightsMetricsMsgpUnixSocketClient = nil + } + conn, err := net.DialTimeout("unix", + mdsdfluentSocket, 10*time.Second) + if err != nil { + Log("Error::mdsd::Unable to open MDSD msgp socket connection for insights metrics %s", err.Error()) + //log.Fatalf("Unable to open MDSD msgp socket connection %s", err.Error()) + } else { + Log("Successfully created MDSD msgp socket connection for Insights metrics %s", mdsdfluentSocket) + MdsdInsightsMetricsMsgpUnixSocketClient = conn + } } } @@ -178,3 +220,33 @@ func isValidUrl(uri string) bool { } return true } + +func convertMsgPackEntriesToMsgpBytes(fluentForwardTag string, msgPackEntries []MsgPackEntry) []byte { + var msgpBytes []byte + + fluentForward := MsgPackForward{ + Tag: fluentForwardTag, + Entries: msgPackEntries, + } + //determine the size of msgp message + msgpSize := 1 + msgp.StringPrefixSize + len(fluentForward.Tag) + msgp.ArrayHeaderSize + for i := range fluentForward.Entries { + msgpSize += 1 + msgp.Int64Size + msgp.GuessSize(fluentForward.Entries[i].Record) + } + + //allocate buffer for msgp message + msgpBytes = msgp.Require(nil, msgpSize) + + //construct the stream + msgpBytes = append(msgpBytes, 0x92) + msgpBytes = msgp.AppendString(msgpBytes, fluentForward.Tag) + msgpBytes = msgp.AppendArrayHeader(msgpBytes, uint32(len(fluentForward.Entries))) + batchTime := time.Now().Unix() + for entry := range fluentForward.Entries { + msgpBytes = append(msgpBytes, 0x92) + msgpBytes = msgp.AppendInt64(msgpBytes, batchTime) + msgpBytes = msgp.AppendMapStrStr(msgpBytes, fluentForward.Entries[entry].Record) + } + + return msgpBytes +} diff --git a/source/plugins/ruby/ApplicationInsightsUtility.rb b/source/plugins/ruby/ApplicationInsightsUtility.rb index b118cc646..6ae567337 100644 --- a/source/plugins/ruby/ApplicationInsightsUtility.rb +++ b/source/plugins/ruby/ApplicationInsightsUtility.rb @@ -14,7 +14,6 @@ class ApplicationInsightsUtility @@Exception = "ExceptionEvent" @@AcsClusterType = "ACS" @@AksClusterType = "AKS" - @OmsAdminFilePath = "/etc/opt/microsoft/omsagent/conf/omsadmin.conf" @@EnvAcsResourceName = "ACS_RESOURCE_NAME" @@EnvAksRegion = "AKS_REGION" @@EnvAgentVersion = "AGENT_VERSION" @@ -263,14 +262,11 @@ def sendMetricTelemetry(metricName, metricValue, properties) end def getWorkspaceId() - begin - adminConf = {} - confFile = File.open(@OmsAdminFilePath, "r") - confFile.each_line do |line| - splitStrings = line.split("=") - adminConf[splitStrings[0]] = splitStrings[1] + begin + workspaceId = ENV["WSID"] + if workspaceId.nil? || workspaceId.empty? 
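+ # workspace id now comes from the WSID environment variable instead of omsadmin.conf; warn rather than raise when it is missing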
+ $log.warn("Exception in AppInsightsUtility: getWorkspaceId - WorkspaceID either nil or empty") end - workspaceId = adminConf["WORKSPACE_ID"] return workspaceId rescue => errorStr $log.warn("Exception in AppInsightsUtility: getWorkspaceId - error: #{errorStr}") @@ -278,14 +274,8 @@ def getWorkspaceId() end def getWorkspaceCloud() - begin - adminConf = {} - confFile = File.open(@OmsAdminFilePath, "r") - confFile.each_line do |line| - splitStrings = line.split("=") - adminConf[splitStrings[0]] = splitStrings[1] - end - workspaceDomain = adminConf["URL_TLD"].strip + begin + workspaceDomain = ENV["DOMAIN"] workspaceCloud = "AzureCloud" if workspaceDomain.casecmp("opinsights.azure.com") == 0 workspaceCloud = "AzureCloud" diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 8cb6f603e..f02459aef 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -203,23 +203,25 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met containerName = container["name"] metricValue = container["cpu"][cpuMetricNameToCollect] metricTime = metricPollTime #container["cpu"]["time"] - metricItem = {} - metricItem["DataItems"] = [] - - metricProps = {} - metricProps["Timestamp"] = metricTime - metricProps["Host"] = hostName - metricProps["ObjectName"] = Constants::OBJECT_NAME_K8S_CONTAINER - metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName - - metricProps["Collections"] = [] - metricCollections = {} - metricCollections["CounterName"] = metricNametoReturn - metricCollections["Value"] = metricValue + - metricProps["Collections"].push(metricCollections) - metricItem["DataItems"].push(metricProps) - metricItems.push(metricItem) + metricItem = {} + metricItem["Timestamp"] = metricTime + metricItem["Host"] = hostName + metricItem["ObjectName"] = Constants::OBJECT_NAME_K8S_CONTAINER + metricItem["InstanceName"] = clusterId + "/" + podUid + "/" + containerName + + + metricCollection = {} + metricCollection["CounterName"] = metricNametoReturn + metricCollection["Value"] = metricValue + + metricItem["json_Collections"] = [] + metricCollections = [] + metricCollections.push(metricCollection) + metricItem["json_Collections"] = metricCollections.to_json + metricItems.push(metricItem) + #Telemetry about agent performance begin # we can only do this much now. Ideally would like to use the docker image repository to find our pods/containers @@ -250,11 +252,8 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met telemetryProps["dsPromUrl"] = @dsPromUrlCount end #telemetry about containerlog Routing for daemonset - if File.exist?(Constants::AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE_V2_FILENAME) - telemetryProps["containerLogsRoute"] = "v2" - elsif (!@containerLogsRoute.nil? && !@containerLogsRoute.empty?) - telemetryProps["containerLogsRoute"] = @containerLogsRoute - end + telemetryProps["containerLogsRoute"] = @containerLogsRoute + #telemetry about health model if (!@hmEnabled.nil? && !@hmEnabled.empty?) 
telemetryProps["hmEnabled"] = @hmEnabled @@ -503,18 +502,16 @@ def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, containerName = container["name"] metricValue = container["cpu"][cpuMetricNameToCollect] metricTime = metricPollTime #container["cpu"]["time"] + metricItem = {} - metricItem["DataItems"] = [] - - metricProps = {} - metricProps["Timestamp"] = metricTime - metricProps["Host"] = hostName - metricProps["ObjectName"] = Constants::OBJECT_NAME_K8S_CONTAINER - metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName - - metricProps["Collections"] = [] - metricCollections = {} - metricCollections["CounterName"] = metricNametoReturn + metricItem["Timestamp"] = metricTime + metricItem["Host"] = hostName + metricItem["ObjectName"] = Constants::OBJECT_NAME_K8S_CONTAINER + metricItem["InstanceName"] = clusterId + "/" + podUid + "/" + containerName + + metricItem["json_Collections"] = [] + metricCollection = {} + metricCollection["CounterName"] = metricNametoReturn containerId = podUid + "/" + containerName # Adding the containers to the winContainerIdCache so that it can be used by the cleanup routine @@ -545,9 +542,11 @@ def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, @@winContainerPrevMetricRate[containerId] = metricRateValue end - metricCollections["Value"] = metricValue - metricProps["Collections"].push(metricCollections) - metricItem["DataItems"].push(metricProps) + metricCollection["Value"] = metricValue + + metricCollections = [] + metricCollections.push(metricCollection) + metricItem["json_Collections"] = metricCollections.to_json metricItems.push(metricItem) #Telemetry about agent performance begin @@ -629,22 +628,21 @@ def getContainerMemoryMetricItems(metricJSON, hostName, memoryMetricNameToCollec metricTime = metricPollTime #container["memory"]["time"] metricItem = {} - metricItem["DataItems"] = [] - - metricProps = {} - metricProps["Timestamp"] = metricTime - metricProps["Host"] = hostName - metricProps["ObjectName"] = Constants::OBJECT_NAME_K8S_CONTAINER - metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName + metricItem["Timestamp"] = metricTime + metricItem["Host"] = hostName + metricItem["ObjectName"] = Constants::OBJECT_NAME_K8S_CONTAINER + metricItem["InstanceName"] = clusterId + "/" + podUid + "/" + containerName + + metricCollection = {} + metricCollection["CounterName"] = metricNametoReturn + metricCollection["Value"] = metricValue + + metricItem["json_Collections"] = [] + metricCollections = [] + metricCollections.push(metricCollection) + metricItem["json_Collections"] = metricCollections.to_json + metricItems.push(metricItem) - metricProps["Collections"] = [] - metricCollections = {} - metricCollections["CounterName"] = metricNametoReturn - metricCollections["Value"] = metricValue - - metricProps["Collections"].push(metricCollections) - metricItem["DataItems"].push(metricProps) - metricItems.push(metricItem) #Telemetry about agent performance begin # we can only do this much now. Ideally would like to use the docker image repository to find our pods/containers @@ -687,22 +685,21 @@ def getNodeMetricItem(metricJSON, hostName, metricCategory, metricNameToCollect, if !node[metricCategory].nil? 
metricValue = node[metricCategory][metricNameToCollect] metricTime = metricPollTime #node[metricCategory]["time"] - - metricItem["DataItems"] = [] - - metricProps = {} - metricProps["Timestamp"] = metricTime - metricProps["Host"] = hostName - metricProps["ObjectName"] = Constants::OBJECT_NAME_K8S_NODE - metricProps["InstanceName"] = clusterId + "/" + nodeName - - metricProps["Collections"] = [] - metricCollections = {} - metricCollections["CounterName"] = metricNametoReturn - metricCollections["Value"] = metricValue - - metricProps["Collections"].push(metricCollections) - metricItem["DataItems"].push(metricProps) + + metricItem["Timestamp"] = metricTime + metricItem["Host"] = hostName + metricItem["ObjectName"] = Constants::OBJECT_NAME_K8S_NODE + metricItem["InstanceName"] = clusterId + "/" + nodeName + + + metricCollection = {} + metricCollection["CounterName"] = metricNametoReturn + metricCollection["Value"] = metricValue + + metricItem["json_Collections"] = [] + metricCollections = [] + metricCollections.push(metricCollection) + metricItem["json_Collections"] = metricCollections.to_json end rescue => error @Log.warn("getNodeMetricItem failed: #{error} for metric #{metricNameToCollect}") @@ -805,21 +802,20 @@ def getNodeMetricItemRate(metricJSON, hostName, metricCategory, metricNameToColl end end end - metricItem["DataItems"] = [] - - metricProps = {} - metricProps["Timestamp"] = metricTime - metricProps["Host"] = hostName - metricProps["ObjectName"] = Constants::OBJECT_NAME_K8S_NODE - metricProps["InstanceName"] = clusterId + "/" + nodeName - - metricProps["Collections"] = [] - metricCollections = {} - metricCollections["CounterName"] = metricNametoReturn - metricCollections["Value"] = metricValue - - metricProps["Collections"].push(metricCollections) - metricItem["DataItems"].push(metricProps) + + metricItem["Timestamp"] = metricTime + metricItem["Host"] = hostName + metricItem["ObjectName"] = Constants::OBJECT_NAME_K8S_NODE + metricItem["InstanceName"] = clusterId + "/" + nodeName + + metricCollection = {} + metricCollection["CounterName"] = metricNametoReturn + metricCollection["Value"] = metricValue + + metricItem["json_Collections"] = [] + metricCollections = [] + metricCollections.push(metricCollection) + metricItem["json_Collections"] = metricCollections.to_json end rescue => error @Log.warn("getNodeMetricItemRate failed: #{error} for metric #{metricNameToCollect}") @@ -841,22 +837,22 @@ def getNodeLastRebootTimeMetric(metricJSON, hostName, metricNametoReturn, metric metricValue = node["startTime"] metricTime = metricPollTime #Time.now.utc.iso8601 #2018-01-30T19:36:14Z - metricItem["DataItems"] = [] - - metricProps = {} - metricProps["Timestamp"] = metricTime - metricProps["Host"] = hostName - metricProps["ObjectName"] = Constants::OBJECT_NAME_K8S_NODE - metricProps["InstanceName"] = clusterId + "/" + nodeName + + metricItem["Timestamp"] = metricTime + metricItem["Host"] = hostName + metricItem["ObjectName"] = Constants::OBJECT_NAME_K8S_NODE + metricItem["InstanceName"] = clusterId + "/" + nodeName - metricProps["Collections"] = [] - metricCollections = {} - metricCollections["CounterName"] = metricNametoReturn + + metricCollection = {} + metricCollection["CounterName"] = metricNametoReturn #Read it from /proc/uptime - metricCollections["Value"] = DateTime.parse(metricTime).to_time.to_i - IO.read("/proc/uptime").split[0].to_f + metricCollection["Value"] = DateTime.parse(metricTime).to_time.to_i - IO.read("/proc/uptime").split[0].to_f - 
metricProps["Collections"].push(metricCollections) - metricItem["DataItems"].push(metricProps) + metricItem["json_Collections"] = [] + metricCollections = [] + metricCollections.push(metricCollection) + metricItem["json_Collections"] = metricCollections.to_json rescue => error @Log.warn("getNodeLastRebootTimeMetric failed: #{error} ") @Log.warn metricJSON @@ -880,21 +876,19 @@ def getContainerStartTimeMetricItems(metricJSON, hostName, metricNametoReturn, m metricTime = metricPollTime #currentTime metricItem = {} - metricItem["DataItems"] = [] - - metricProps = {} - metricProps["Timestamp"] = metricTime - metricProps["Host"] = hostName - metricProps["ObjectName"] = Constants::OBJECT_NAME_K8S_CONTAINER - metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName - - metricProps["Collections"] = [] - metricCollections = {} - metricCollections["CounterName"] = metricNametoReturn - metricCollections["Value"] = DateTime.parse(metricValue).to_time.to_i - - metricProps["Collections"].push(metricCollections) - metricItem["DataItems"].push(metricProps) + metricItem["Timestamp"] = metricTime + metricItem["Host"] = hostName + metricItem["ObjectName"] = Constants::OBJECT_NAME_K8S_CONTAINER + metricItem["InstanceName"] = clusterId + "/" + podUid + "/" + containerName + + metricCollection = {} + metricCollection["CounterName"] = metricNametoReturn + metricCollection["Value"] = DateTime.parse(metricValue).to_time.to_i + + metricItem["json_Collections"] = [] + metricCollections = [] + metricCollections.push(metricCollection) + metricItem["json_Collections"] = metricCollections.to_json metricItems.push(metricItem) end end diff --git a/source/plugins/ruby/DockerApiClient.rb b/source/plugins/ruby/DockerApiClient.rb index f2828b357..53dd1f39f 100644 --- a/source/plugins/ruby/DockerApiClient.rb +++ b/source/plugins/ruby/DockerApiClient.rb @@ -29,7 +29,7 @@ def getResponse(request, isMultiJson, isVersion) loop do begin responseChunk = "" - timeout(@@TimeoutInSeconds) do + Timeout.timeout(@@TimeoutInSeconds) do responseChunk = socket.recv(@@ChunkSize) end dockerResponse += responseChunk diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 98347d272..3720bf6dc 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -405,12 +405,9 @@ def getPodUid(podNameSpace, podMetadata) def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) metricItems = [] - timeDifference = (DateTime.now.to_time.to_i - @@telemetryTimeTracker).abs - timeDifferenceInMinutes = timeDifference / 60 begin clusterId = getClusterId podNameSpace = pod["metadata"]["namespace"] - podName = pod["metadata"]["name"] podUid = getPodUid(podNameSpace, pod["metadata"]) if podUid.nil? return metricItems @@ -442,9 +439,6 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle if (!container["resources"].nil? && !container["resources"].empty? && !container["resources"][metricCategory].nil? && !container["resources"][metricCategory][metricNameToCollect].nil?) 
metricValue = getMetricNumericValue(metricNameToCollect, container["resources"][metricCategory][metricNameToCollect]) - metricItem = {} - metricItem["DataItems"] = [] - metricProps = {} metricProps["Timestamp"] = metricTime metricProps["Host"] = nodeName @@ -453,50 +447,22 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle metricProps["ObjectName"] = "K8SContainer" metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName - metricProps["Collections"] = [] - metricCollections = {} - metricCollections["CounterName"] = metricNametoReturn - metricCollections["Value"] = metricValue - - metricProps["Collections"].push(metricCollections) - metricItem["DataItems"].push(metricProps) - metricItems.push(metricItem) - #Telemetry about omsagent requests and limits - begin - if (podName.downcase.start_with?("omsagent-") && podNameSpace.eql?("kube-system") && containerName.downcase.start_with?("omsagent")) - nodePodContainerKey = [nodeName, podName, containerName, metricNametoReturn].join("~~") - @@resourceLimitsTelemetryHash[nodePodContainerKey] = metricValue - end - if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) - @@resourceLimitsTelemetryHash.each { |key, value| - keyElements = key.split("~~") - if keyElements.length != 4 - next - end - - # get dimension values by key - telemetryProps = {} - telemetryProps["Computer"] = keyElements[0] - telemetryProps["PodName"] = keyElements[1] - telemetryProps["ContainerName"] = keyElements[2] - metricNameFromKey = keyElements[3] - ApplicationInsightsUtility.sendMetricTelemetry(metricNameFromKey, value, telemetryProps) - } - @@telemetryTimeTracker = DateTime.now.to_time.to_i - @@resourceLimitsTelemetryHash = {} - end - rescue => errorStr - $log.warn("Exception while generating Telemetry from getContainerResourceRequestsAndLimits failed: #{errorStr} for metric #{metricNameToCollect}") - end + metricCollection = {} + metricCollection["CounterName"] = metricNametoReturn + metricCollection["Value"] = metricValue + + metricProps["json_Collections"] = [] + metricCollections = [] + metricCollections.push(metricCollection) + metricProps["json_Collections"] = metricCollections.to_json + metricItems.push(metricProps) #No container level limit for the given metric, so default to node level limit else nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect if (metricCategory == "limits" && @@NodeMetrics.has_key?(nodeMetricsHashKey)) metricValue = @@NodeMetrics[nodeMetricsHashKey] #@Log.info("Limits not set for container #{clusterId + "/" + podUid + "/" + containerName} using node level limits: #{nodeMetricsHashKey}=#{metricValue} ") - metricItem = {} - metricItem["DataItems"] = [] - + metricProps = {} metricProps["Timestamp"] = metricTime metricProps["Host"] = nodeName @@ -505,14 +471,14 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle metricProps["ObjectName"] = "K8SContainer" metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName - metricProps["Collections"] = [] - metricCollections = {} - metricCollections["CounterName"] = metricNametoReturn - metricCollections["Value"] = metricValue - - metricProps["Collections"].push(metricCollections) - metricItem["DataItems"].push(metricProps) - metricItems.push(metricItem) + metricCollection = {} + metricCollection["CounterName"] = metricNametoReturn + metricCollection["Value"] = metricValue + metricProps["json_Collections"] = [] + metricCollections = [] + 
metricCollections.push(metricCollection) + metricProps["json_Collections"] = metricCollections.to_json + metricItems.push(metricProps) end end end @@ -632,22 +598,22 @@ def parseNodeLimitsFromNodeItem(node, metricCategory, metricNameToCollect, metri # metricCategory can be "capacity" or "allocatable" and metricNameToCollect can be "cpu" or "memory" metricValue = getMetricNumericValue(metricNameToCollect, node["status"][metricCategory][metricNameToCollect]) - metricItem["DataItems"] = [] - metricProps = {} - metricProps["Timestamp"] = metricTime - metricProps["Host"] = node["metadata"]["name"] + metricItem["Timestamp"] = metricTime + metricItem["Host"] = node["metadata"]["name"] # Adding this so that it is not set by base omsagent since it was not set earlier and being set by base omsagent - metricProps["Computer"] = node["metadata"]["name"] - metricProps["ObjectName"] = "K8SNode" - metricProps["InstanceName"] = clusterId + "/" + node["metadata"]["name"] - metricProps["Collections"] = [] - metricCollections = {} - metricCollections["CounterName"] = metricNametoReturn - metricCollections["Value"] = metricValue - - metricProps["Collections"].push(metricCollections) - metricItem["DataItems"].push(metricProps) - + metricItem["Computer"] = node["metadata"]["name"] + metricItem["ObjectName"] = "K8SNode" + metricItem["InstanceName"] = clusterId + "/" + node["metadata"]["name"] + + metricCollection = {} + metricCollection["CounterName"] = metricNametoReturn + metricCollection["Value"] = metricValue + metricCollections = [] + metricCollections.push(metricCollection) + + metricItem["json_Collections"] = [] + metricItem["json_Collections"] = metricCollections.to_json + #push node level metrics to a inmem hash so that we can use it looking up at container level. #Currently if container level cpu & memory limits are not defined we default to node level limits @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index 6641456af..a809087dc 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -525,11 +525,11 @@ def getNodeResourceMetricRecords(record, metric_name, metric_value, percentage_m records = [] begin custommetricrecord = MdmAlertTemplates::Node_resource_metrics_template % { - timestamp: record["DataItems"][0]["Timestamp"], + timestamp: record["Timestamp"], metricName: metric_name, - hostvalue: record["DataItems"][0]["Host"], - objectnamevalue: record["DataItems"][0]["ObjectName"], - instancenamevalue: record["DataItems"][0]["InstanceName"], + hostvalue: record["Host"], + objectnamevalue: record["ObjectName"], + instancenamevalue: record["InstanceName"], metricminvalue: metric_value, metricmaxvalue: metric_value, metricsumvalue: metric_value, @@ -538,11 +538,11 @@ def getNodeResourceMetricRecords(record, metric_name, metric_value, percentage_m if !percentage_metric_value.nil? 
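+ # when a percentage was computed upstream, emit a second record for the derived percentage metric alongside the absolute value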
additional_record = MdmAlertTemplates::Node_resource_metrics_template % { - timestamp: record["DataItems"][0]["Timestamp"], + timestamp: record["Timestamp"], metricName: @@node_metric_name_metric_percentage_name_hash[metric_name], - hostvalue: record["DataItems"][0]["Host"], - objectnamevalue: record["DataItems"][0]["ObjectName"], - instancenamevalue: record["DataItems"][0]["InstanceName"], + hostvalue: record["Host"], + objectnamevalue: record["ObjectName"], + instancenamevalue: record["InstanceName"], metricminvalue: percentage_metric_value, metricmaxvalue: percentage_metric_value, metricsumvalue: percentage_metric_value, diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 906019b95..c037c99f6 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -103,5 +103,5 @@ class Constants #Pod Statuses POD_STATUS_TERMINATING = "Terminating" - AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE_V2_FILENAME = "/opt/AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE_V2" + end diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 659e3000c..62dcf31dc 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -2,7 +2,9 @@ # frozen_string_literal: true -module Fluent +require 'fluent/plugin/filter' + +module Fluent::Plugin require "logger" require "yajl/json_gem" require_relative "oms_common" @@ -12,7 +14,7 @@ module Fluent require_relative "in_kube_nodes" class CAdvisor2MdmFilter < Filter - Fluent::Plugin.register_filter("filter_cadvisor2mdm", self) + Fluent::Plugin.register_filter("cadvisor2mdm", self) config_param :enable_log, :integer, :default => 0 config_param :log_path, :string, :default => "/var/opt/microsoft/docker-cimprov/log/filter_cadvisor2mdm.log" @@ -65,7 +67,7 @@ def start @containerResourceDimensionHash = {} @pvUsageHash = {} @@metric_threshold_hash = MdmMetricsGenerator.getContainerResourceUtilizationThresholds - @NodeCache = Fluent::NodeStatsCache.new() + @NodeCache = Fluent::Plugin::NodeStatsCache.new() end rescue => e @log.info "Error initializing plugin #{e}" @@ -148,16 +150,16 @@ def filter(tag, time, record) begin if @process_incoming_stream - # Check if insights metrics for PV metrics - data_type = record["DataType"] - if data_type == "INSIGHTS_METRICS_BLOB" + # Check if insights metrics for PV metrics + if record["Name"] == Constants::PV_USED_BYTES return filterPVInsightsMetrics(record) end - object_name = record["DataItems"][0]["ObjectName"] - counter_name = record["DataItems"][0]["Collections"][0]["CounterName"] + object_name = record["ObjectName"] + counter_name = JSON.parse(record["json_Collections"])[0]["CounterName"] + percentage_metric_value = 0.0 - metric_value = record["DataItems"][0]["Collections"][0]["Value"] + metric_value = JSON.parse(record["json_Collections"])[0]["Value"] if object_name == Constants::OBJECT_NAME_K8S_NODE && @metrics_to_collect_hash.key?(counter_name.downcase) # Compute and send % CPU and Memory @@ -165,7 +167,7 @@ def filter(tag, time, record) metric_name = Constants::CPU_USAGE_MILLI_CORES metric_value /= 1000000 #cadvisor record is in nanocores. 
Convert to mc if @@controller_type.downcase == "replicaset" - target_node_cpu_capacity_mc = @NodeCache.cpu.get_capacity(record["DataItems"][0]["Host"]) / 1000000 + target_node_cpu_capacity_mc = @NodeCache.cpu.get_capacity(record["Host"]) / 1000000 else target_node_cpu_capacity_mc = @cpu_capacity end @@ -178,7 +180,7 @@ def filter(tag, time, record) if counter_name.start_with?("memory") metric_name = counter_name if @@controller_type.downcase == "replicaset" - target_node_mem_capacity = @NodeCache.mem.get_capacity(record["DataItems"][0]["Host"]) + target_node_mem_capacity = @NodeCache.mem.get_capacity(record["Host"]) else target_node_mem_capacity = @memory_capacity end @@ -187,12 +189,12 @@ def filter(tag, time, record) percentage_metric_value = metric_value * 100 / target_node_mem_capacity end end - @log.info "percentage_metric_value for metric: #{metric_name} for instance: #{record["DataItems"][0]["Host"]} percentage: #{percentage_metric_value}" + @log.info "percentage_metric_value for metric: #{metric_name} for instance: #{record["Host"]} percentage: #{percentage_metric_value}" # do some sanity checking. Do we want this? if percentage_metric_value > 100.0 or percentage_metric_value < 0.0 telemetryProperties = {} - telemetryProperties["Computer"] = record["DataItems"][0]["Host"] + telemetryProperties["Computer"] = record["Host"] telemetryProperties["MetricName"] = metric_name telemetryProperties["MetricPercentageValue"] = percentage_metric_value ApplicationInsightsUtility.sendCustomEvent("ErrorPercentageOutOfBounds", telemetryProperties) @@ -200,7 +202,7 @@ def filter(tag, time, record) return MdmMetricsGenerator.getNodeResourceMetricRecords(record, metric_name, metric_value, percentage_metric_value) elsif object_name == Constants::OBJECT_NAME_K8S_CONTAINER && @metrics_to_collect_hash.key?(counter_name.downcase) - instanceName = record["DataItems"][0]["InstanceName"] + instanceName = record["InstanceName"] metricName = counter_name # Using node cpu capacity in the absence of container cpu capacity since the container will end up using the # node's capacity in this case. 
Converting this to nanocores for computation purposes, since this is in millicores @@ -235,7 +237,7 @@ def filter(tag, time, record) flushMetricTelemetry if percentage_metric_value >= thresholdPercentage setThresholdExceededTelemetry(metricName) - return MdmMetricsGenerator.getContainerResourceUtilMetricRecords(record["DataItems"][0]["Timestamp"], + return MdmMetricsGenerator.getContainerResourceUtilMetricRecords(record["Timestamp"], metricName, percentage_metric_value, @containerResourceDimensionHash[instanceName], @@ -256,39 +258,36 @@ def filter(tag, time, record) end end - def filterPVInsightsMetrics(record) + def filterPVInsightsMetrics(record) begin mdmMetrics = [] - record["DataItems"].each do |dataItem| - - if dataItem["Name"] == Constants::PV_USED_BYTES && @metrics_to_collect_hash.key?(dataItem["Name"].downcase) - metricName = dataItem["Name"] - usage = dataItem["Value"] - capacity = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] - if capacity != 0 - percentage_metric_value = (usage * 100.0) / capacity - end - @log.info "percentage_metric_value for metric: #{metricName} percentage: #{percentage_metric_value}" - @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" - - computer = dataItem["Computer"] - resourceDimensions = dataItem["Tags"] - thresholdPercentage = @@metric_threshold_hash[metricName] - - flushMetricTelemetry - if percentage_metric_value >= thresholdPercentage - setThresholdExceededTelemetry(metricName) - return MdmMetricsGenerator.getPVResourceUtilMetricRecords(dataItem["CollectionTime"], - metricName, - computer, - percentage_metric_value, - resourceDimensions, - thresholdPercentage) - else - return [] - end # end if block for percentage metric > configured threshold % check - end # end if block for dataItem name check - end # end for block of looping through data items + if record["Name"] == Constants::PV_USED_BYTES && @metrics_to_collect_hash.key?(record["Name"].downcase) + metricName = record["Name"] + usage = record["Value"] + capacity = record["Tags"][Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] + if capacity != 0 + percentage_metric_value = (usage * 100.0) / capacity + end + @log.info "percentage_metric_value for metric: #{metricName} percentage: #{percentage_metric_value}" + @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" + + computer = record["Computer"] + resourceDimensions = record["Tags"] + thresholdPercentage = @@metric_threshold_hash[metricName] + + flushMetricTelemetry + if percentage_metric_value >= thresholdPercentage + setThresholdExceededTelemetry(metricName) + return MdmMetricsGenerator.getPVResourceUtilMetricRecords(record["CollectionTime"], + metricName, + computer, + percentage_metric_value, + resourceDimensions, + thresholdPercentage) + else + return [] + end # end if block for percentage metric > configured threshold % check + end # end if block for dataItem name check return [] rescue Exception => e @log.info "Error processing cadvisor insights metrics record Exception: #{e.class} Message: #{e.message}" @@ -316,16 +315,22 @@ def ensure_cpu_memory_capacity_set end if !nodeInventory.nil? cpu_capacity_json = KubernetesApiClient.parseNodeLimits(nodeInventory, "capacity", "cpu", "cpuCapacityNanoCores") - if !cpu_capacity_json.nil? && !cpu_capacity_json[0]["DataItems"][0]["Collections"][0]["Value"].to_s.nil? 
- @cpu_capacity = cpu_capacity_json[0]["DataItems"][0]["Collections"][0]["Value"] - @log.info "CPU Limit #{@cpu_capacity}" + if !cpu_capacity_json.nil? + metricVal = JSON.parse(cpu_capacity_json[0]["json_Collections"])[0]["Value"] + if !metricVal.to_s.nil? + @cpu_capacity = metricVal + @log.info "CPU Limit #{@cpu_capacity}" + end else @log.info "Error getting cpu_capacity" end memory_capacity_json = KubernetesApiClient.parseNodeLimits(nodeInventory, "capacity", "memory", "memoryCapacityBytes") - if !memory_capacity_json.nil? && !memory_capacity_json[0]["DataItems"][0]["Collections"][0]["Value"].to_s.nil? - @memory_capacity = memory_capacity_json[0]["DataItems"][0]["Collections"][0]["Value"] - @log.info "Memory Limit #{@memory_capacity}" + if !memory_capacity_json.nil? + metricVal = JSON.parse(memory_capacity_json[0]["json_Collections"])[0]["Value"] + if !metricVal.to_s.nil? + @memory_capacity = metricVal + @log.info "Memory Limit #{@memory_capacity}" + end else @log.info "Error getting memory_capacity" end @@ -346,7 +351,7 @@ end def filter_stream(tag, es) - new_es = MultiEventStream.new + new_es = Fluent::MultiEventStream.new begin ensure_cpu_memory_capacity_set # Getting container limits hash diff --git a/source/plugins/ruby/filter_cadvisor_health_container.rb b/source/plugins/ruby/filter_cadvisor_health_container.rb index 870fcd6d6..ab64b6e61 100644 --- a/source/plugins/ruby/filter_cadvisor_health_container.rb +++ b/source/plugins/ruby/filter_cadvisor_health_container.rb @@ -1,7 +1,9 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -module Fluent +require 'fluent/plugin/filter' + +module Fluent::Plugin require 'logger' require 'yajl/json_gem' require_relative 'oms_common' @@ -11,7 +13,7 @@ module Fluent class CAdvisor2ContainerHealthFilter < Filter include HealthModel - Fluent::Plugin.register_filter('filter_cadvisor_health_container', self) + Fluent::Plugin.register_filter('cadvisor_health_container', self) config_param :log_path, :string, :default => '/var/opt/microsoft/docker-cimprov/log/health_monitors.log' config_param :metrics_to_collect, :string, :default => 'cpuUsageNanoCores,memoryRssBytes' @@ -58,9 +60,9 @@ def start def filter_stream(tag, es) if !@@cluster_health_model_enabled @log.info "Cluster Health Model disabled in filter_cadvisor_health_container" - return MultiEventStream.new + return Fluent::MultiEventStream.new end - new_es = MultiEventStream.new + new_es = Fluent::MultiEventStream.new records_count = 0 es.each { |time, record| begin @@ -83,8 +85,9 @@ def filter(tag, time, record) if record.key?("MonitorLabels") return record end - object_name = record['DataItems'][0]['ObjectName'] - counter_name = record['DataItems'][0]['Collections'][0]['CounterName'].downcase + + object_name = record['ObjectName'] + counter_name = JSON.parse(record['json_Collections'])[0]['CounterName'].downcase if @metrics_to_collect_hash.key?(counter_name) if object_name == @@object_name_k8s_container return @formatter.get_record_from_cadvisor_record(record) diff --git a/source/plugins/ruby/filter_cadvisor_health_node.rb b/source/plugins/ruby/filter_cadvisor_health_node.rb index 27e5bc255..ddbb871e8 100644 --- a/source/plugins/ruby/filter_cadvisor_health_node.rb +++ b/source/plugins/ruby/filter_cadvisor_health_node.rb @@ -1,7 +1,9 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -module Fluent +require 'fluent/plugin/filter' + +module Fluent::Plugin require 'logger' require 'yajl/json_gem' require_relative 'oms_common' @@ -11,8 +13,8 @@ module Fluent 
class CAdvisor2NodeHealthFilter < Filter include HealthModel - Fluent::Plugin.register_filter('filter_cadvisor_health_node', self) - + Fluent::Plugin.register_filter('cadvisor_health_node', self) + attr_accessor :provider, :resources config_param :metrics_to_collect, :string, :default => 'cpuUsageNanoCores,memoryRssBytes' @@ -75,13 +77,13 @@ def start def filter_stream(tag, es) if !@@cluster_health_model_enabled @log.info "Cluster Health Model disabled in filter_cadvisor_health_node" - return MultiEventStream.new + return Fluent::MultiEventStream.new end begin node_capacity = HealthMonitorUtils.ensure_cpu_memory_capacity_set(@@hm_log, @cpu_capacity, @memory_capacity, @@hostName) @cpu_capacity = node_capacity[0] @memory_capacity = node_capacity[1] - new_es = MultiEventStream.new + new_es = Fluent::MultiEventStream.new records_count = 0 es.each { |time, record| filtered_record = filter(tag, time, record) @@ -95,7 +97,7 @@ def filter_stream(tag, es) rescue => e @log.info "Error in filter_cadvisor_health_node filter_stream #{e.backtrace}" ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) - return MultiEventStream.new + return Fluent::MultiEventStream.new end end @@ -105,10 +107,10 @@ def filter(tag, time, record) return record end - object_name = record['DataItems'][0]['ObjectName'] - counter_name = record['DataItems'][0]['Collections'][0]['CounterName'].downcase + object_name = record['ObjectName'] + counter_name = JSON.parse(record['json_Collections'])[0]['CounterName'].downcase if @metrics_to_collect_hash.key?(counter_name.downcase) - metric_value = record['DataItems'][0]['Collections'][0]['Value'] + metric_value = JSON.parse(record['json_Collections'])[0]['Value'] case object_name when @@object_name_k8s_node case counter_name.downcase @@ -134,14 +136,14 @@ def process_node_cpu_record(record, metric_value) if record.nil? return nil else - instance_name = record['DataItems'][0]['InstanceName'] + instance_name = record['InstanceName'] #@log.info "CPU capacity #{@cpu_capacity}" metric_value /= 1000000 percent = (metric_value.to_f/@cpu_capacity*100).round(2) #@log.debug "Percentage of CPU limit: #{percent}" state = HealthMonitorUtils.compute_percentage_state(percent, @provider.get_config(MonitorId::NODE_CPU_MONITOR_ID)) #@log.debug "Computed State : #{state}" - timestamp = record['DataItems'][0]['Timestamp'] + timestamp = record['Timestamp'] health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"cpuUsageMillicores" => metric_value, "cpuUtilizationPercentage" => percent}} monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@clusterId, @@hostName]) @@ -166,14 +168,14 @@ def process_node_memory_record(record, metric_value) if record.nil? 
return nil else - instance_name = record['DataItems'][0]['InstanceName'] + instance_name = record['InstanceName'] #@log.info "Memory capacity #{@memory_capacity}" percent = (metric_value.to_f/@memory_capacity*100).round(2) #@log.debug "Percentage of Memory limit: #{percent}" state = HealthMonitorUtils.compute_percentage_state(percent, @provider.get_config(MonitorId::NODE_MEMORY_MONITOR_ID)) #@log.debug "Computed State : #{state}" - timestamp = record['DataItems'][0]['Timestamp'] + timestamp = record['Timestamp'] health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"memoryRssBytes" => metric_value.to_f, "memoryUtilizationPercentage" => percent}} #@log.info health_monitor_record diff --git a/source/plugins/ruby/filter_container.rb b/source/plugins/ruby/filter_container.rb deleted file mode 100644 index b72e82dbc..000000000 --- a/source/plugins/ruby/filter_container.rb +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. - -# frozen_string_literal: true - -module Fluent - require 'logger' - - class ContainerFilter < Filter - Fluent::Plugin.register_filter('filter_container', self) - - config_param :enable_log, :integer, :default => 0 - config_param :log_path, :string, :default => '/var/opt/microsoft/omsagent/log/filter_container.log' - - def initialize - super - end - - def configure(conf) - super - @log = nil - - if @enable_log - @log = Logger.new(@log_path, 'weekly') - @log.debug {'Starting filter_container plugin'} - end - end - - def start - super - end - - def shutdown - super - end - - def filter(tag, time, record) - dataType = nil - - record.each do |r| - if dataType == nil - dataType = case r["ClassName"] - when "Container_ImageInventory" then "CONTAINER_IMAGE_INVENTORY_BLOB" - when "Container_ContainerInventory" then "CONTAINER_INVENTORY_BLOB" - when "Container_DaemonEvent" then "CONTAINER_SERVICE_LOG_BLOB" - when "Container_ContainerLog" then "CONTAINER_LOG_BLOB" - end - end - end - - wrapper = { - "DataType"=>dataType, - "IPName"=>"Containers", - "DataItems"=>record - } - - wrapper - end - end -end diff --git a/source/plugins/ruby/filter_docker_log.rb b/source/plugins/ruby/filter_docker_log.rb deleted file mode 100644 index b80f4c204..000000000 --- a/source/plugins/ruby/filter_docker_log.rb +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. - -# frozen_string_literal: true - -module Fluent - require 'logger' - require 'socket' - require 'yajl/json_gem' - - class DockerLogFilter < Filter - Plugin.register_filter('filter_docker_log', self) - - # Set to 1 in config file to enable logging - config_param :enable_log, :integer, :default => 0 - config_param :log_path, :string, :default => '/var/opt/microsoft/omsagent/log/filter_docker_log.txt' - - # This method is called before starting. - def configure(conf) - super - @hostname = Socket.gethostname - # in case get full name, extract up to '.' - dotpos = @hostname.index('.') - if dotpos != nil - @hostname = @hostname[0..dotpos-1] - end - - # Cache the image name and ID of each container so we don't have to inspect each time - @containerCache = Hash.new - - @log = nil - - if @enable_log - @log = Logger.new(@log_path, 'weekly') - @log.debug {'Starting filter_docker_log plugin on ' + @hostname} - end - end - - def filter(tag, time, record) - if @log != nil - @log.debug {'Accepted a log from container ' + record['container_id']} - end - - wrapper = Hash.new - - if record['log'].empty? 
- if @log != nil - @log.debug {'Log from container ' + record['container_id'] + ' had length 0 and will be discarded'} - end - else - # Need to query image information from ID - containerId = record['container_id'] - - unless @containerCache.has_key?(containerId) - if @log != nil - @log.debug {'Container ' + containerId + ' information is not in the cache, inspecting'} - end - - # Value not in cache, use inspect - @containerCache[containerId] = Hash.new - details = '' - - begin - details = JSON.parse(`sudo docker inspect #{containerId}`) - rescue => e - if @log != nil - @log.error {'sudo docker inspect ' + containerId + ' failed'} - end - end - - if details.empty? - # This should not occur - @containerCache[containerId]['Image'] = 'Unknown' - @containerCache[containerId]['ImageName'] = 'Unknown' - - if @log != nil - @log.warn {'The image ID of container ' + containerId + ' could not be determined'} - end - else - @containerCache[containerId]['Image'] = details[0]['Config']['Image'] - @containerCache[containerId]['ImageName'] = details[0]['Config']['Image'] - end - end - - newRecord = @containerCache[containerId] - - # No query is required - newRecord['Id'] = containerId - newRecord['Name'] = record['container_name'][0] == "/" ? record['container_name'][1..-1] : record['container_name'] - newRecord['LogEntrySource'] = record['source'] - newRecord['LogEntry'] = record['log'] - newRecord['Computer'] = @hostname - - wrapper = { - "DataType"=>"CONTAINER_LOG_BLOB", - "IPName"=>"Containers", - "DataItems"=>[newRecord] - } - end - - wrapper - end - end -end diff --git a/source/plugins/ruby/filter_health_model_builder.rb b/source/plugins/ruby/filter_health_model_builder.rb index 36e4801d7..d491f17c2 100644 --- a/source/plugins/ruby/filter_health_model_builder.rb +++ b/source/plugins/ruby/filter_health_model_builder.rb @@ -2,15 +2,17 @@ # frozen_string_literal: true -module Fluent +require 'fluent/plugin/filter' + +module Fluent::Plugin require 'logger' require 'yajl/json_gem' Dir[File.join(__dir__, './health', '*.rb')].each { |file| require file } - + class FilterHealthModelBuilder < Filter include HealthModel - Fluent::Plugin.register_filter('filter_health_model_builder', self) + Fluent::Plugin.register_filter('health_model_builder', self) config_param :enable_log, :integer, :default => 0 config_param :log_path, :string, :default => '/var/opt/microsoft/docker-cimprov/log/filter_health_model_builder.log' @@ -20,7 +22,7 @@ class FilterHealthModelBuilder < Filter attr_reader :buffer, :model_builder, :health_model_definition, :monitor_factory, :state_finalizers, :monitor_set, :model_builder, :hierarchy_builder, :resources, :kube_api_down_handler, :provider, :reducer, :state, :generator, :telemetry - @@rewrite_tag = 'kubehealth.Signals' + @@cluster_id = KubernetesApiClient.getClusterId @@token_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/token" @@cert_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" @@ -29,6 +31,7 @@ class FilterHealthModelBuilder < Filter def initialize begin super + @rewrite_tag = 'oneagent.containerInsights.KUBE_HEALTH_BLOB' @buffer = HealthModel::HealthModelBuffer.new @cluster_health_state = ClusterHealthState.new(@@token_file_path, @@cert_file_path) @health_model_definition = HealthModel::ParentMonitorProvider.new(HealthModel::HealthModelDefinitionParser.new(@model_definition_path).parse_file) @@ -53,6 +56,7 @@ def initialize deserialized_state_info = @cluster_health_state.get_state @state.initialize_state(deserialized_state_info) end + rescue => e 
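+ # report initialization failures to Application Insights so broken health model setup is visible in telemetry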
ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) end @@ -82,11 +86,11 @@ def shutdown def filter_stream(tag, es) if !@@cluster_health_model_enabled @log.info "Cluster Health Model disabled in filter_health_model_builder" - return MultiEventStream.new + return Fluent::MultiEventStream.new end begin - new_es = MultiEventStream.new - time = Time.now + new_es = Fluent::MultiEventStream.new + time = Time.now if tag.start_with?("kubehealth.DaemonSet.Node") node_records = [] @@ -96,7 +100,7 @@ def filter_stream(tag, es) } @buffer.add_to_buffer(node_records) end - return MultiEventStream.new + return Fluent::MultiEventStream.new elsif tag.start_with?("kubehealth.DaemonSet.Container") container_records = [] if !es.nil? @@ -110,7 +114,7 @@ def filter_stream(tag, es) @container_cpu_memory_records = [] #in some clusters, this is null, so initialize it again. end @container_cpu_memory_records.push(*container_records) # push the records for aggregation later - return MultiEventStream.new + return Fluent::MultiEventStream.new elsif tag.start_with?("kubehealth.ReplicaSet") records = [] es.each{|time, record| @@ -218,11 +222,11 @@ def filter_stream(tag, es) @log.info "after optimizing health signals all_monitors.size #{all_monitors.size}" - current_time = Time.now - emit_time = current_time.to_f + # for each key in monitor.keys, # get the state from health_monitor_state # generate the record to send + emit_time = Fluent::Engine.now all_monitors.keys.each{|key| record = @provider.get_record(all_monitors[key], state) if record[HealthMonitorRecordFields::MONITOR_ID] == MonitorId::CLUSTER @@ -241,17 +245,12 @@ def filter_stream(tag, es) @cluster_new_state = new_state end end - end - record_wrapper = { - "DataType" => "KUBE_HEALTH_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [record.each { |k, v| record[k] = v }], - } - new_es.add(emit_time, record_wrapper) + end + new_es.add(emit_time, record) } #emit the stream - router.emit_stream(@@rewrite_tag, new_es) + router.emit_stream(@rewrite_tag, new_es) #initialize monitor_set and model_builder @monitor_set = HealthModel::MonitorSet.new @@ -261,8 +260,8 @@ def filter_stream(tag, es) @cluster_health_state.update_state(@state.to_h) @telemetry.send # return an empty event stream, else the match will throw a NoMethodError - return MultiEventStream.new - elsif tag.start_with?("kubehealth.Signals") + return Fluent::MultiEventStream.new + elsif tag.start_with?(@rewrite_tag) # this filter also acts as a pass through as we are rewriting the tag and emitting to the fluent stream es else @@ -274,6 +273,6 @@ def filter_stream(tag, es) @log.warn "Message: #{e.message} Backtrace: #{e.backtrace}" return nil end - end + end end end diff --git a/source/plugins/ruby/filter_inventory2mdm.rb b/source/plugins/ruby/filter_inventory2mdm.rb index 38ccab885..509ac608e 100644 --- a/source/plugins/ruby/filter_inventory2mdm.rb +++ b/source/plugins/ruby/filter_inventory2mdm.rb @@ -2,14 +2,16 @@ # frozen_string_literal: true -module Fluent +require 'fluent/plugin/filter' + +module Fluent::Plugin require 'logger' require 'yajl/json_gem' require_relative 'oms_common' require_relative 'CustomMetricsUtils' class Inventory2MdmFilter < Filter - Fluent::Plugin.register_filter('filter_inventory2mdm', self) + Fluent::Plugin.register_filter('inventory2mdm', self) config_param :enable_log, :integer, :default => 0 config_param :log_path, :string, :default => '/var/opt/microsoft/docker-cimprov/log/filter_inventory2mdm.log' @@ -115,8 +117,8 @@ def 
process_node_inventory_records(es) es.each{|time,record| begin - timestamp = record['DataItems'][0]['CollectionTime'] - node_status = record['DataItems'][0]['Status'] + timestamp = record['CollectionTime'] + node_status = record['Status'] if node_status.downcase.split(",").include? @@node_status_ready.downcase node_ready_count = node_ready_count+1 else @@ -161,8 +163,8 @@ def process_pod_inventory_records(es) records = [] es.each{|time,record| record_count += 1 - timestamp = record['DataItems'][0]['CollectionTime'] - podUid = record['DataItems'][0]['PodUid'] + timestamp = record['CollectionTime'] + podUid = record['PodUid'] if podUids.key?(podUid) #@log.info "pod with #{podUid} already counted" @@ -170,10 +172,10 @@ def process_pod_inventory_records(es) end podUids[podUid] = true - podPhaseDimValue = record['DataItems'][0]['PodStatus'] - podNamespaceDimValue = record['DataItems'][0]['Namespace'] - podControllerNameDimValue = record['DataItems'][0]['ControllerName'] - podNodeDimValue = record['DataItems'][0]['Computer'] + podPhaseDimValue = record['PodStatus'] + podNamespaceDimValue = record['Namespace'] + podControllerNameDimValue = record['ControllerName'] + podNodeDimValue = record['Computer'] if podControllerNameDimValue.nil? || podControllerNameDimValue.empty? podControllerNameDimValue = 'No Controller' @@ -263,7 +265,7 @@ def process_pod_inventory_records(es) end def filter_stream(tag, es) - new_es = MultiEventStream.new + new_es = Fluent::MultiEventStream.new filtered_records = [] time = DateTime.now begin diff --git a/source/plugins/ruby/filter_telegraf2mdm.rb b/source/plugins/ruby/filter_telegraf2mdm.rb index 88ae428d1..fd71f1682 100644 --- a/source/plugins/ruby/filter_telegraf2mdm.rb +++ b/source/plugins/ruby/filter_telegraf2mdm.rb @@ -2,7 +2,9 @@ # frozen_string_literal: true -module Fluent +require 'fluent/plugin/filter' + +module Fluent::Plugin require "logger" require "yajl/json_gem" require_relative "oms_common" @@ -11,7 +13,7 @@ module Fluent require_relative "constants" class Telegraf2MdmFilter < Filter - Fluent::Plugin.register_filter("filter_telegraf2mdm", self) + Fluent::Plugin.register_filter("telegraf2mdm", self) config_param :enable_log, :integer, :default => 0 config_param :log_path, :string, :default => "/var/opt/microsoft/docker-cimprov/log/filter_telegraf2mdm.log" @@ -64,7 +66,7 @@ def filter(tag, time, record) end def filter_stream(tag, es) - new_es = MultiEventStream.new + new_es = Fluent::MultiEventStream.new begin es.each { |time, record| filtered_records = filter(tag, time, record) diff --git a/source/plugins/ruby/health/health_container_cpu_memory_record_formatter.rb b/source/plugins/ruby/health/health_container_cpu_memory_record_formatter.rb index 12c72a120..ebf3abd7e 100644 --- a/source/plugins/ruby/health/health_container_cpu_memory_record_formatter.rb +++ b/source/plugins/ruby/health/health_container_cpu_memory_record_formatter.rb @@ -17,10 +17,10 @@ def initialize def get_record_from_cadvisor_record(cadvisor_record) begin - instance_name = cadvisor_record['DataItems'][0]['InstanceName'] - counter_name = cadvisor_record['DataItems'][0]['Collections'][0]['CounterName'] - metric_value = cadvisor_record['DataItems'][0]['Collections'][0]['Value'] - timestamp = cadvisor_record['DataItems'][0]['Timestamp'] + instance_name = cadvisor_record['InstanceName'] + counter_name = JSON.parse(cadvisor_record['json_Collections'])[0]['CounterName'] + metric_value = JSON.parse(cadvisor_record['json_Collections'])[0]['Value'] + timestamp = cadvisor_record['Timestamp'] 
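+ # "json_Collections" holds a JSON-encoded array of counters, e.g. '[{"CounterName":"cpuUsageNanoCores","Value":123456789}]', so the counter name/value must be parsed out before the health record is formatted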
health_container_cpu_memory_record = @@health_container_cpu_memory_record_template % { instance_name: instance_name, diff --git a/source/plugins/ruby/health/health_monitor_utils.rb b/source/plugins/ruby/health/health_monitor_utils.rb index c23d8824a..58f2ecc36 100644 --- a/source/plugins/ruby/health/health_monitor_utils.rb +++ b/source/plugins/ruby/health/health_monitor_utils.rb @@ -171,8 +171,9 @@ def get_cluster_cpu_memory_capacity(log, node_inventory: nil) cpu_capacity_json = KubernetesApiClient.parseNodeLimits(node_inventory, "capacity", "cpu", "cpuCapacityNanoCores") if !cpu_capacity_json.nil? cpu_capacity_json.each do |cpu_capacity_node| - if !cpu_capacity_node['DataItems'][0]['Collections'][0]['Value'].to_s.nil? - cluster_cpu_capacity += cpu_capacity_node['DataItems'][0]['Collections'][0]['Value'] + metricVal = JSON.parse(cpu_capacity_node['json_Collections'])[0]['Value'] + if !metricVal.to_s.nil? + cluster_cpu_capacity += metricVal end end else @@ -181,8 +182,9 @@ def get_cluster_cpu_memory_capacity(log, node_inventory: nil) memory_capacity_json = KubernetesApiClient.parseNodeLimits(node_inventory, "capacity", "memory", "memoryCapacityBytes") if !memory_capacity_json.nil? memory_capacity_json.each do |memory_capacity_node| - if !memory_capacity_node['DataItems'][0]['Collections'][0]['Value'].to_s.nil? - cluster_memory_capacity += memory_capacity_node['DataItems'][0]['Collections'][0]['Value'] + metricVal = JSON.parse(memory_capacity_node['json_Collections'])[0]['Value'] + if !metricVal.to_s.nil? + cluster_memory_capacity += metricVal end end else @@ -284,7 +286,7 @@ def build_metrics_hash(metrics_to_collect) def get_health_monitor_config health_monitor_config = {} begin - file = File.open('/opt/microsoft/omsagent/plugin/healthmonitorconfig.json', "r") + file = File.open('/etc/opt/microsoft/docker-cimprov/health/healthmonitorconfig.json', "r") if !file.nil? 
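+ # healthmonitorconfig.json now ships in the docker-cimprov health directory; the old omsagent plugin path is gone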
fileContents = file.read health_monitor_config = JSON.parse(fileContents) diff --git a/source/plugins/ruby/in_cadvisor_perf.rb b/source/plugins/ruby/in_cadvisor_perf.rb index b706ff00a..781042cea 100644 --- a/source/plugins/ruby/in_cadvisor_perf.rb +++ b/source/plugins/ruby/in_cadvisor_perf.rb @@ -1,10 +1,11 @@ #!/usr/local/bin/ruby # frozen_string_literal: true +require 'fluent/plugin/input' -module Fluent +module Fluent::Plugin class CAdvisor_Perf_Input < Input - Plugin.register_input("cadvisorperf", self) + Fluent::Plugin.register_input("cadvisor_perf", self) def initialize super @@ -15,14 +16,15 @@ def initialize require_relative "CAdvisorMetricsAPIClient" require_relative "oms_common" require_relative "omslog" - require_relative "constants" + require_relative "constants" end config_param :run_interval, :time, :default => 60 - config_param :tag, :string, :default => "oms.api.cadvisorperf" + config_param :tag, :string, :default => "oneagent.containerInsights.LINUX_PERF_BLOB" config_param :mdmtag, :string, :default => "mdm.cadvisorperf" config_param :nodehealthtag, :string, :default => "kubehealth.DaemonSet.Node" config_param :containerhealthtag, :string, :default => "kubehealth.DaemonSet.Container" + config_param :insightsmetricstag, :string, :default => "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" def configure(conf) super @@ -30,6 +32,7 @@ def configure(conf) def start if @run_interval + super @finished = false @condition = ConditionVariable.new @mutex = Mutex.new @@ -44,24 +47,23 @@ def shutdown @condition.signal } @thread.join + super # This super must be at the end of shutdown method end end def enumerate() currentTime = Time.now - time = currentTime.to_f + time = Fluent::Engine.now batchTime = currentTime.utc.iso8601 @@istestvar = ENV["ISTEST"] begin - eventStream = MultiEventStream.new - insightsMetricsEventStream = MultiEventStream.new + eventStream = Fluent::MultiEventStream.new + insightsMetricsEventStream = Fluent::MultiEventStream.new metricData = CAdvisorMetricsAPIClient.getMetrics(winNode: nil, metricTime: batchTime ) - metricData.each do |record| - record["DataType"] = "LINUX_PERF_BLOB" - record["IPName"] = "LogManagement" - eventStream.add(time, record) if record - end - + metricData.each do |record| + eventStream.add(time, record) if record + end + router.emit_stream(@tag, eventStream) if eventStream router.emit_stream(@mdmtag, eventStream) if eventStream router.emit_stream(@containerhealthtag, eventStream) if eventStream @@ -75,19 +77,13 @@ def enumerate() #start GPU InsightsMetrics items begin containerGPUusageInsightsMetricsDataItems = [] - containerGPUusageInsightsMetricsDataItems.concat(CAdvisorMetricsAPIClient.getInsightsMetrics(winNode: nil, metricTime: batchTime)) - + containerGPUusageInsightsMetricsDataItems.concat(CAdvisorMetricsAPIClient.getInsightsMetrics(winNode: nil, metricTime: batchTime)) containerGPUusageInsightsMetricsDataItems.each do |insightsMetricsRecord| - wrapper = { - "DataType" => "INSIGHTS_METRICS_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], - } - insightsMetricsEventStream.add(time, wrapper) if wrapper + insightsMetricsEventStream.add(time, insightsMetricsRecord) if insightsMetricsRecord end - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + router.emit_stream(@insightsmetricstag, insightsMetricsEventStream) if insightsMetricsEventStream router.emit_stream(@mdmtag, 
insightsMetricsEventStream) if insightsMetricsEventStream if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) @@ -135,6 +131,6 @@ def run_periodic @mutex.lock end @mutex.unlock - end + end end # CAdvisor_Perf_Input end # module diff --git a/source/plugins/ruby/in_containerinventory.rb b/source/plugins/ruby/in_containerinventory.rb index c1126aa4e..eebf422d6 100644 --- a/source/plugins/ruby/in_containerinventory.rb +++ b/source/plugins/ruby/in_containerinventory.rb @@ -1,9 +1,11 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -module Fluent +require 'fluent/plugin/input' + +module Fluent::Plugin class Container_Inventory_Input < Input - Plugin.register_input("containerinventory", self) + Fluent::Plugin.register_input("containerinventory", self) @@PluginName = "ContainerInventory" @@ -19,7 +21,7 @@ def initialize end config_param :run_interval, :time, :default => 60 - config_param :tag, :string, :default => "oms.containerinsights.containerinventory" + config_param :tag, :string, :default => "oneagent.containerInsights.CONTAINER_INVENTORY_BLOB" def configure(conf) super @@ -27,6 +29,7 @@ def configure(conf) def start if @run_interval + super @finished = false @condition = ConditionVariable.new @mutex = Mutex.new @@ -42,17 +45,18 @@ def shutdown @condition.signal } @thread.join + super # This super must be at the end of shutdown method end end def enumerate - currentTime = Time.now - emitTime = currentTime.to_f + currentTime = Time.now batchTime = currentTime.utc.iso8601 + emitTime = Fluent::Engine.now containerInventory = Array.new - eventStream = MultiEventStream.new + eventStream = Fluent::MultiEventStream.new hostName = "" - $log.info("in_container_inventory::enumerate : Begin processing @ #{Time.now.utc.iso8601}") + $log.info("in_container_inventory::enumerate : Begin processing @ #{Time.now.utc.iso8601}") begin containerRuntimeEnv = ENV["CONTAINER_RUNTIME"] $log.info("in_container_inventory::enumerate : container runtime : #{containerRuntimeEnv}") @@ -89,13 +93,8 @@ def enumerate end end end - containerInventory.each do |record| - wrapper = { - "DataType" => "CONTAINER_INVENTORY_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [record.each { |k, v| record[k] = v }], - } - eventStream.add(emitTime, wrapper) if wrapper + containerInventory.each do |record| + eventStream.add(emitTime, record) if record end router.emit_stream(@tag, eventStream) if eventStream @@istestvar = ENV["ISTEST"] @@ -149,6 +148,6 @@ def run_periodic @mutex.lock end @mutex.unlock - end + end end # Container_Inventory_Input end # module diff --git a/source/plugins/ruby/in_kube_events.rb b/source/plugins/ruby/in_kube_events.rb index f50019a01..6f65dab92 100644 --- a/source/plugins/ruby/in_kube_events.rb +++ b/source/plugins/ruby/in_kube_events.rb @@ -1,9 +1,11 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -module Fluent +require 'fluent/plugin/input' + +module Fluent::Plugin class Kube_Event_Input < Input - Plugin.register_input("kubeevents", self) + Fluent::Plugin.register_input("kube_events", self) @@KubeEventsStateFile = "/var/opt/microsoft/docker-cimprov/state/KubeEventQueryState.yaml" def initialize @@ -29,14 +31,15 @@ def initialize end config_param :run_interval, :time, :default => 60 - config_param :tag, :string, :default => "oms.containerinsights.KubeEvents" + config_param :tag, :string, :default => "oneagent.containerInsights.KUBE_EVENTS_BLOB" def configure(conf) super end - def start + def start if @run_interval + 
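# Before/after sketch of the emit change repeated across these inputs (field
# values illustrative): the omsagent route wrapped each record in a
# DataType/IPName envelope, while the mdsd route adds the bare record and
# lets the "oneagent.containerInsights.*_BLOB" tag select the table.
record = { "InstanceID" => "c1", "State" => "Running" }

legacy_wrapper = {                                   # old shape, now removed
  "DataType"  => "CONTAINER_INVENTORY_BLOB",
  "IPName"    => "ContainerInsights",
  "DataItems" => [record],
}

eventStream = Fluent::MultiEventStream.new
eventStream.add(Fluent::Engine.now, record)          # new shape: record as-is
# router.emit_stream("oneagent.containerInsights.CONTAINER_INVENTORY_BLOB", eventStream)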
super if !ENV["EVENTS_CHUNK_SIZE"].nil? && !ENV["EVENTS_CHUNK_SIZE"].empty? && ENV["EVENTS_CHUNK_SIZE"].to_i > 0 @EVENTS_CHUNK_SIZE = ENV["EVENTS_CHUNK_SIZE"].to_i else @@ -70,6 +73,7 @@ def shutdown @condition.signal } @thread.join + super end end @@ -80,8 +84,8 @@ def enumerate batchTime = currentTime.utc.iso8601 eventQueryState = getEventQueryState newEventQueryState = [] - @eventsCount = 0 - + @eventsCount = 0 + # Initializing continuation token to nil continuationToken = nil $log.info("in_kube_events::enumerate : Getting events from Kube API @ #{Time.now.utc.iso8601}") @@ -127,11 +131,11 @@ def enumerate end # end enumerate def parse_and_emit_records(events, eventQueryState, newEventQueryState, batchTime = Time.utc.iso8601) - currentTime = Time.now - emitTime = currentTime.to_f + currentTime = Time.now + emitTime = Fluent::Engine.now @@istestvar = ENV["ISTEST"] begin - eventStream = MultiEventStream.new + eventStream = Fluent::MultiEventStream.new events["items"].each do |items| record = {} # - Not sure if ingestion has the below mapping for this custom type. Fix it as part of fixed type conversion @@ -162,13 +166,8 @@ def parse_and_emit_records(events, eventQueryState, newEventQueryState, batchTim record["Count"] = items["count"] record["Computer"] = nodeName record["ClusterName"] = KubernetesApiClient.getClusterName - record["ClusterId"] = KubernetesApiClient.getClusterId - wrapper = { - "DataType" => "KUBE_EVENTS_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [record.each { |k, v| record[k] = v }], - } - eventStream.add(emitTime, wrapper) if wrapper + record["ClusterId"] = KubernetesApiClient.getClusterId + eventStream.add(emitTime, record) if record @eventsCount += 1 end router.emit_stream(@tag, eventStream) if eventStream diff --git a/source/plugins/ruby/in_kube_health.rb b/source/plugins/ruby/in_kube_health.rb index 874be26f6..db981c53e 100644 --- a/source/plugins/ruby/in_kube_health.rb +++ b/source/plugins/ruby/in_kube_health.rb @@ -1,17 +1,19 @@ #!/usr/local/bin/ruby # frozen_string_literal: true +require 'fluent/plugin/input' + require_relative "KubernetesApiClient" require_relative "oms_common" require_relative "omslog" require_relative "ApplicationInsightsUtility" -module Fluent +module Fluent::Plugin Dir[File.join(__dir__, "./health", "*.rb")].each { |file| require file } class KubeHealthInput < Input include HealthModel - Plugin.register_input("kubehealth", self) + Fluent::Plugin.register_input("kube_health", self) config_param :health_monitor_config_path, :default => "/etc/opt/microsoft/docker-cimprov/health/healthmonitorconfig.json" @@ -46,6 +48,7 @@ def configure(conf) def start begin + super if @run_interval @finished = false @condition = ConditionVariable.new @@ -76,20 +79,21 @@ def shutdown @condition.signal } @thread.join + super # This super must be at the end of shutdown method end end def enumerate if !@@cluster_health_model_enabled @@hmlog.info "Cluster Health Model disabled in in_kube_health" - return MultiEventStream.new + return Fluent::MultiEventStream.new end begin - currentTime = Time.now - emitTime = currentTime.to_f + currentTime = Time.now + emitTime = Fluent::Engine.now batchTime = currentTime.utc.iso8601 health_monitor_records = [] - eventStream = MultiEventStream.new + eventStream = Fluent::MultiEventStream.new #HealthMonitorUtils.refresh_kubernetes_api_data(@@hmlog, nil) # we do this so that if the call fails, we get a response code/header etc. 
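# Sketch of the chunked list pattern these inputs share; the helper name and
# signature (KubernetesApiClient.getResourcesAndContinuationToken) are
# assumed from its use in the agent sources, and 4000 is a placeholder
# default. Each page is emitted as soon as it arrives, then the Kube API
# continue token is followed until it is exhausted.
chunk_size = ENV["EVENTS_CHUNK_SIZE"].to_i
chunk_size = 4000 if chunk_size <= 0        # nil/empty/non-numeric fall back

continuation_token = nil
loop do
  uri = "events?limit=#{chunk_size}"
  uri += "&continue=#{continuation_token}" if continuation_token
  continuation_token, events = KubernetesApiClient.getResourcesAndContinuationToken(uri)
  break if events.nil? || events["items"].nil?
  parse_and_emit_records(events)
  break if continuation_token.nil? || continuation_token.empty?
end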
diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 99e804302..ffc11de55 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -1,17 +1,17 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -module Fluent - class Kube_nodeInventory_Input < Input - Plugin.register_input("kubenodeinventory", self) +require 'fluent/plugin/input' - @@ContainerNodeInventoryTag = "oms.containerinsights.ContainerNodeInventory" - @@MDMKubeNodeInventoryTag = "mdm.kubenodeinventory" +module Fluent::Plugin + class Kube_nodeInventory_Input < Input + Fluent::Plugin.register_input("kube_nodes", self) + @@configMapMountPath = "/etc/config/settings/log-data-collection-settings" @@promConfigMountPath = "/etc/config/settings/prometheus-data-collection-settings" @@osmConfigMountPath = "/etc/config/osm-settings/osm-metric-collection-configuration" @@AzStackCloudFileName = "/etc/kubernetes/host/azurestackcloud.json" - @@kubeperfTag = "oms.api.KubePerf" + @@rsPromInterval = ENV["TELEMETRY_RS_PROM_INTERVAL"] @@rsPromFieldPassCount = ENV["TELEMETRY_RS_PROM_FIELDPASS_LENGTH"] @@ -35,7 +35,13 @@ def initialize require_relative "KubernetesApiClient" require_relative "ApplicationInsightsUtility" require_relative "oms_common" - require_relative "omslog" + require_relative "omslog" + + @ContainerNodeInventoryTag = "oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB" + @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" + @MDMKubeNodeInventoryTag = "mdm.kubenodeinventory" + @kubeperfTag = "oneagent.containerInsights.LINUX_PERF_BLOB" + # refer tomlparser-agent-config for the defaults @NODES_CHUNK_SIZE = 0 @NODES_EMIT_STREAM_BATCH_SIZE = 0 @@ -48,14 +54,15 @@ def initialize end config_param :run_interval, :time, :default => 60 - config_param :tag, :string, :default => "oms.containerinsights.KubeNodeInventory" + config_param :tag, :string, :default => "oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB" def configure(conf) super end - def start + def start if @run_interval + super if !ENV["NODES_CHUNK_SIZE"].nil? && !ENV["NODES_CHUNK_SIZE"].empty? 
&& ENV["NODES_CHUNK_SIZE"].to_i > 0 @NODES_CHUNK_SIZE = ENV["NODES_CHUNK_SIZE"].to_i else @@ -90,6 +97,7 @@ def shutdown @condition.signal } @thread.join + super # This super must be at the end of shutdown method end end @@ -101,8 +109,10 @@ def enumerate @nodesAPIE2ELatencyMs = 0 @nodeInventoryE2EProcessingLatencyMs = 0 - nodeInventoryStartTime = (Time.now.to_f * 1000).to_i + nodeInventoryStartTime = (Time.now.to_f * 1000).to_i + nodesAPIChunkStartTime = (Time.now.to_f * 1000).to_i + # Initializing continuation token to nil continuationToken = nil $log.info("in_kube_nodes::enumerate : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") @@ -151,49 +161,38 @@ def enumerate def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) begin - currentTime = Time.now - emitTime = currentTime.to_f + currentTime = Time.now + emitTime = Fluent::Engine.now telemetrySent = false - eventStream = MultiEventStream.new - containerNodeInventoryEventStream = MultiEventStream.new - insightsMetricsEventStream = MultiEventStream.new - kubePerfEventStream = MultiEventStream.new + eventStream = Fluent::MultiEventStream.new + containerNodeInventoryEventStream = Fluent::MultiEventStream.new + insightsMetricsEventStream = Fluent::MultiEventStream.new + kubePerfEventStream = Fluent::MultiEventStream.new @@istestvar = ENV["ISTEST"] #get node inventory nodeInventory["items"].each do |item| # node inventory nodeInventoryRecord = getNodeInventoryRecord(item, batchTime) - wrapper = { - "DataType" => "KUBE_NODE_INVENTORY_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [nodeInventoryRecord.each { |k, v| nodeInventoryRecord[k] = v }], - } - eventStream.add(emitTime, wrapper) if wrapper + eventStream.add(emitTime, nodeInventoryRecord) if nodeInventoryRecord if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && eventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE $log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") router.emit_stream(@tag, eventStream) if eventStream $log.info("in_kube_node::parse_and_emit_records: number of mdm node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@MDMKubeNodeInventoryTag, eventStream) if eventStream - + router.emit_stream(@MDMKubeNodeInventoryTag, eventStream) if eventStream if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end - eventStream = MultiEventStream.new + eventStream = Fluent::MultiEventStream.new end # container node inventory - containerNodeInventoryRecord = getContainerNodeInventoryRecord(item, batchTime) - containerNodeInventoryWrapper = { - "DataType" => "CONTAINER_NODE_INVENTORY_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [containerNodeInventoryRecord.each { |k, v| containerNodeInventoryRecord[k] = v }], - } - containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryWrapper) if containerNodeInventoryWrapper + containerNodeInventoryRecord = getContainerNodeInventoryRecord(item, batchTime) + containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryRecord) if containerNodeInventoryRecord if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && containerNodeInventoryEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE $log.info("in_kube_node::parse_and_emit_records: number of container node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream - containerNodeInventoryEventStream = MultiEventStream.new + router.emit_stream(@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream + containerNodeInventoryEventStream = Fluent::MultiEventStream.new if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("containerNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end @@ -223,7 +222,8 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) nodeMetricRecords.push(nodeMetricRecord) # add data to the cache so filter_cadvisor2mdm.rb can use it if is_windows_node - @NodeCache.cpu.set_capacity(nodeMetricRecord["DataItems"][0]["Host"], nodeMetricRecord["DataItems"][0]["Collections"][0]["Value"]) + metricVal = JSON.parse(nodeMetricRecord["json_Collections"])[0]["Value"] + @NodeCache.cpu.set_capacity(nodeMetricRecord["Host"], metricVal) end end nodeMetricRecord = KubernetesApiClient.parseNodeLimitsFromNodeItem(item, "capacity", "memory", "memoryCapacityBytes", batchTime) @@ -231,18 +231,17 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) nodeMetricRecords.push(nodeMetricRecord) # add data to the cache so filter_cadvisor2mdm.rb can use it if is_windows_node - @NodeCache.mem.set_capacity(nodeMetricRecord["DataItems"][0]["Host"], nodeMetricRecord["DataItems"][0]["Collections"][0]["Value"]) + metricVal = JSON.parse(nodeMetricRecord["json_Collections"])[0]["Value"] + @NodeCache.mem.set_capacity(nodeMetricRecord["Host"], metricVal) end end - nodeMetricRecords.each do |metricRecord| - metricRecord["DataType"] = "LINUX_PERF_BLOB" - metricRecord["IPName"] = "LogManagement" + nodeMetricRecords.each do |metricRecord| kubePerfEventStream.add(emitTime, metricRecord) if metricRecord end if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE $log.info("in_kube_nodes::parse_and_emit_records: number of node perf metric records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream - kubePerfEventStream = MultiEventStream.new + router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + kubePerfEventStream = Fluent::MultiEventStream.new if (!@@istestvar.nil? 
&& !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("kubeNodePerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") end @@ -266,18 +265,13 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) if !insightsMetricsRecord.nil? && !insightsMetricsRecord.empty? nodeGPUInsightsMetricsRecords.push(insightsMetricsRecord) end - nodeGPUInsightsMetricsRecords.each do |insightsMetricsRecord| - wrapper = { - "DataType" => "INSIGHTS_METRICS_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], - } - insightsMetricsEventStream.add(emitTime, wrapper) if wrapper + nodeGPUInsightsMetricsRecords.each do |insightsMetricsRecord| + insightsMetricsEventStream.add(emitTime, insightsMetricsRecord) if insightsMetricsRecord end if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE $log.info("in_kube_nodes::parse_and_emit_records: number of GPU node perf metric records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream - insightsMetricsEventStream = MultiEventStream.new + router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream + insightsMetricsEventStream = Fluent::MultiEventStream.new if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("kubeNodeInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") end @@ -337,15 +331,15 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) $log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@tag, eventStream) if eventStream $log.info("in_kube_node::parse_and_emit_records: number of mdm node inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@MDMKubeNodeInventoryTag, eventStream) if eventStream + router.emit_stream(@MDMKubeNodeInventoryTag, eventStream) if eventStream if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end - eventStream = nil + eventStream = nil end if containerNodeInventoryEventStream.count > 0 $log.info("in_kube_node::parse_and_emit_records: number of container node inventory records emitted #{containerNodeInventoryEventStream.count} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream + router.emit_stream(@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream containerNodeInventoryEventStream = nil if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("containerNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") @@ -354,7 +348,7 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) if kubePerfEventStream.count > 0 $log.info("in_kube_nodes::parse_and_emit_records: number of node perf metric records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream kubePerfEventStream = nil if (!@@istestvar.nil? && !@@istestvar.empty? 
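# Sketch of the batched-emit loop used throughout parse_and_emit_records
# (assumes it runs inside one of these input plugins, so router and @tag
# exist; BATCH_SIZE stands in for @NODES_EMIT_STREAM_BATCH_SIZE): append to a
# MultiEventStream, flush and restart the stream whenever the batch threshold
# is hit, and drain whatever remains at the end.
BATCH_SIZE = 200
stream = Fluent::MultiEventStream.new
records.each do |record|                    # records: whatever was just parsed
  stream.add(Fluent::Engine.now, record) if record
  if BATCH_SIZE > 0 && stream.count >= BATCH_SIZE
    router.emit_stream(@tag, stream)
    stream = Fluent::MultiEventStream.new   # start a fresh batch
  end
end
router.emit_stream(@tag, stream) if stream.count > 0   # final drain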
&& @@istestvar.casecmp("true") == 0) $log.info("kubeNodePerfInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") @@ -362,7 +356,7 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) end if insightsMetricsEventStream.count > 0 $log.info("in_kube_nodes::parse_and_emit_records: number of GPU node perf metric records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream insightsMetricsEventStream = nil if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("kubeNodeInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") @@ -513,10 +507,8 @@ def getNodeTelemetryProps(item) $log.warn "in_kube_nodes::getContainerNodeIngetNodeTelemetryPropsventoryRecord:Failed: #{errorStr}" end return properties - end + end end # Kube_Node_Input - - class NodeStatsCache # inner class for caching implementation (CPU and memory caching is handled the exact same way, so logic to do so is moved to a private inner class) # (to reduce code duplication) @@ -586,6 +578,5 @@ def cpu() def mem() return @@memCache end - end - + end end # module diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 5256eb159..5598602cd 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -1,16 +1,17 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -module Fluent +require 'fluent/plugin/input' + +module Fluent::Plugin require_relative "podinventory_to_mdm" class Kube_PodInventory_Input < Input - Plugin.register_input("kubepodinventory", self) + Fluent::Plugin.register_input("kube_podinventory", self) @@MDMKubePodInventoryTag = "mdm.kubepodinventory" @@hostName = (OMS::Common.get_hostname) - @@kubeperfTag = "oms.api.KubePerf" - @@kubeservicesTag = "oms.containerinsights.KubeServices" + def initialize super @@ -38,19 +39,25 @@ def initialize @winContainerCount = 0 @controllerData = {} @podInventoryE2EProcessingLatencyMs = 0 - @podsAPIE2ELatencyMs = 0 + @podsAPIE2ELatencyMs = 0 + + @kubeperfTag = "oneagent.containerInsights.LINUX_PERF_BLOB" + @kubeservicesTag = "oneagent.containerInsights.KUBE_SERVICES_BLOB" + @containerInventoryTag = "oneagent.containerInsights.CONTAINER_INVENTORY_BLOB" + @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" end config_param :run_interval, :time, :default => 60 - config_param :tag, :string, :default => "oms.containerinsights.KubePodInventory" + config_param :tag, :string, :default => "oneagent.containerInsights.KUBE_POD_INVENTORY_BLOB" def configure(conf) super @inventoryToMdmConvertor = Inventory2MdmConvertor.new() end - def start + def start if @run_interval + super if !ENV["PODS_CHUNK_SIZE"].nil? && !ENV["PODS_CHUNK_SIZE"].empty? && ENV["PODS_CHUNK_SIZE"].to_i > 0 @PODS_CHUNK_SIZE = ENV["PODS_CHUNK_SIZE"].to_i else @@ -58,7 +65,7 @@ def start $log.warn("in_kube_podinventory::start: setting to default value since got PODS_CHUNK_SIZE nil or empty") @PODS_CHUNK_SIZE = 1000 end - $log.info("in_kube_podinventory::start : PODS_CHUNK_SIZE @ #{@PODS_CHUNK_SIZE}") + $log.info("in_kube_podinventory::start: PODS_CHUNK_SIZE @ #{@PODS_CHUNK_SIZE}") if !ENV["PODS_EMIT_STREAM_BATCH_SIZE"].nil? && !ENV["PODS_EMIT_STREAM_BATCH_SIZE"].empty? 
&& ENV["PODS_EMIT_STREAM_BATCH_SIZE"].to_i > 0 @PODS_EMIT_STREAM_BATCH_SIZE = ENV["PODS_EMIT_STREAM_BATCH_SIZE"].to_i @@ -67,8 +74,7 @@ def start $log.warn("in_kube_podinventory::start: setting to default value since got PODS_EMIT_STREAM_BATCH_SIZE nil or empty") @PODS_EMIT_STREAM_BATCH_SIZE = 200 end - $log.info("in_kube_podinventory::start : PODS_EMIT_STREAM_BATCH_SIZE @ #{@PODS_EMIT_STREAM_BATCH_SIZE}") - + $log.info("in_kube_podinventory::start: PODS_EMIT_STREAM_BATCH_SIZE @ #{@PODS_EMIT_STREAM_BATCH_SIZE}") @finished = false @condition = ConditionVariable.new @mutex = Mutex.new @@ -84,6 +90,7 @@ def shutdown @condition.signal } @thread.join + super # This super must be at the end of shutdown method end end @@ -100,7 +107,8 @@ def enumerate(podList = nil) batchTime = currentTime.utc.iso8601 serviceRecords = [] @podInventoryE2EProcessingLatencyMs = 0 - podInventoryStartTime = (Time.now.to_f * 1000).to_i + podInventoryStartTime = (Time.now.to_f * 1000).to_i + # Get services first so that we dont need to make a call for very chunk $log.info("in_kube_podinventory::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}") serviceInfo = KubernetesApiClient.getKubeResourceInfo("services") @@ -189,12 +197,13 @@ def enumerate(podList = nil) end def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime = Time.utc.iso8601) - currentTime = Time.now - emitTime = currentTime.to_f + currentTime = Time.now + emitTime = Fluent::Engine.now #batchTime = currentTime.utc.iso8601 - eventStream = MultiEventStream.new - kubePerfEventStream = MultiEventStream.new - insightsMetricsEventStream = MultiEventStream.new + eventStream = Fluent::MultiEventStream.new + containerInventoryStream = Fluent::MultiEventStream.new + kubePerfEventStream = Fluent::MultiEventStream.new + insightsMetricsEventStream = Fluent::MultiEventStream.new @@istestvar = ENV["ISTEST"] begin #begin block start @@ -205,13 +214,8 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc podInventoryRecords = getPodInventoryRecords(item, serviceRecords, batchTime) podInventoryRecords.each do |record| if !record.nil? - wrapper = { - "DataType" => "KUBE_POD_INVENTORY_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [record.each { |k, v| record[k] = v }], - } - eventStream.add(emitTime, wrapper) if wrapper - @inventoryToMdmConvertor.process_pod_inventory_record(wrapper) + eventStream.add(emitTime, record) if record + @inventoryToMdmConvertor.process_pod_inventory_record(record) end end # Setting this flag to true so that we can send ContainerInventory records for containers @@ -228,13 +232,8 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc # Send container inventory records for containers on windows nodes @winContainerCount += containerInventoryRecords.length containerInventoryRecords.each do |cirecord| - if !cirecord.nil? - ciwrapper = { - "DataType" => "CONTAINER_INVENTORY_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [cirecord.each { |k, v| cirecord[k] = v }], - } - eventStream.add(emitTime, ciwrapper) if ciwrapper + if !cirecord.nil? 
+ containerInventoryStream.add(emitTime, cirecord) if cirecord end end end @@ -246,7 +245,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc $log.info("kubePodInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end router.emit_stream(@tag, eventStream) if eventStream - eventStream = MultiEventStream.new + eventStream = Fluent::MultiEventStream.new end #container perf records @@ -256,19 +255,17 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "cpu", "cpuLimitNanoCores", batchTime)) containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "memory", "memoryLimitBytes", batchTime)) - containerMetricDataItems.each do |record| - record["DataType"] = "LINUX_PERF_BLOB" - record["IPName"] = "LogManagement" + containerMetricDataItems.each do |record| kubePerfEventStream.add(emitTime, record) if record end if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE $log.info("in_kube_podinventory::parse_and_emit_records: number of container perf records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") end - kubePerfEventStream = MultiEventStream.new + kubePerfEventStream = Fluent::MultiEventStream.new end # container GPU records @@ -277,13 +274,8 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "nvidia.com/gpu", "containerGpuLimits", batchTime)) containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "amd.com/gpu", "containerGpuRequests", batchTime)) containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "amd.com/gpu", "containerGpuLimits", batchTime)) - containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| - wrapper = { - "DataType" => "INSIGHTS_METRICS_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], - } - insightsMetricsEventStream.add(emitTime, wrapper) if wrapper + containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| + insightsMetricsEventStream.add(emitTime, insightsMetricsRecord) if insightsMetricsRecord end if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE @@ -291,8 +283,8 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") end - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream - insightsMetricsEventStream = MultiEventStream.new + router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream + insightsMetricsEventStream = Fluent::MultiEventStream.new end end #podInventory block end @@ -305,9 +297,18 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc eventStream = nil end + if containerInventoryStream.count > 0 + $log.info("in_kube_podinventory::parse_and_emit_records: number of windows container inventory records emitted #{containerInventoryStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@containerInventoryTag, containerInventoryStream) if containerInventoryStream + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubeWindowsContainerInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + containerInventoryStream = nil + end + if kubePerfEventStream.count > 0 $log.info("in_kube_podinventory::parse_and_emit_records: number of perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream kubePerfEventStream = nil if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") @@ -316,7 +317,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc if insightsMetricsEventStream.count > 0 $log.info("in_kube_podinventory::parse_and_emit_records: number of insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") end @@ -327,7 +328,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc @log.info "Sending pod inventory mdm records to out_mdm" pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime) @log.info "pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size}" - mdm_pod_inventory_es = MultiEventStream.new + mdm_pod_inventory_es = Fluent::MultiEventStream.new pod_inventory_mdm_records.each { |pod_inventory_mdm_record| mdm_pod_inventory_es.add(batchTime, pod_inventory_mdm_record) if pod_inventory_mdm_record } if pod_inventory_mdm_records @@ -335,22 +336,17 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc end if continuationToken.nil? # sending kube services inventory records - kubeServicesEventStream = MultiEventStream.new + kubeServicesEventStream = Fluent::MultiEventStream.new serviceRecords.each do |kubeServiceRecord| if !kubeServiceRecord.nil? 
# adding before emit to reduce memory foot print kubeServiceRecord["ClusterId"] = KubernetesApiClient.getClusterId - kubeServiceRecord["ClusterName"] = KubernetesApiClient.getClusterName - kubeServicewrapper = { - "DataType" => "KUBE_SERVICES_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [kubeServiceRecord.each { |k, v| kubeServiceRecord[k] = v }], - } - kubeServicesEventStream.add(emitTime, kubeServicewrapper) if kubeServicewrapper + kubeServiceRecord["ClusterName"] = KubernetesApiClient.getClusterName + kubeServicesEventStream.add(emitTime, kubeServiceRecord) if kubeServiceRecord if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubeServicesEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE $log.info("in_kube_podinventory::parse_and_emit_records: number of service records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@kubeservicesTag, kubeServicesEventStream) if kubeServicesEventStream - kubeServicesEventStream = MultiEventStream.new + router.emit_stream(@kubeservicesTag, kubeServicesEventStream) if kubeServicesEventStream + kubeServicesEventStream = Fluent::MultiEventStream.new if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("kubeServicesEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") end @@ -360,7 +356,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc if kubeServicesEventStream.count > 0 $log.info("in_kube_podinventory::parse_and_emit_records : number of service records emitted #{kubeServicesEventStream.count} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@kubeservicesTag, kubeServicesEventStream) if kubeServicesEventStream + router.emit_stream(@kubeservicesTag, kubeServicesEventStream) if kubeServicesEventStream if (!@@istestvar.nil? && !@@istestvar.empty? 
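# Sketch of the enrichment above (service fields illustrative): cluster
# identity is stamped onto each service record only at emit time, which keeps
# the serviceRecords list carried across pod chunks as small as possible.
kube_service_record = { "ServiceName" => "kubernetes", "Namespace" => "default" }
kube_service_record["ClusterId"]   = KubernetesApiClient.getClusterId
kube_service_record["ClusterName"] = KubernetesApiClient.getClusterName
kubeServicesEventStream.add(Fluent::Engine.now, kube_service_record)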
&& @@istestvar.casecmp("true") == 0) $log.info("kubeServicesEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") end @@ -652,6 +648,6 @@ def getServiceNameFromLabels(namespace, labels, serviceRecords) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end return serviceName - end + end end # Kube_Pod_Input end # module diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index 4efe86f61..40eebac8a 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -1,6 +1,11 @@ -module Fluent +#!/usr/local/bin/ruby +# frozen_string_literal: true + +require 'fluent/plugin/input' + +module Fluent::Plugin class Kube_PVInventory_Input < Input - Plugin.register_input("kubepvinventory", self) + Fluent::Plugin.register_input("kube_pvinventory", self) @@hostName = (OMS::Common.get_hostname) @@ -22,14 +27,15 @@ def initialize end config_param :run_interval, :time, :default => 60 - config_param :tag, :string, :default => "oms.containerinsights.KubePVInventory" + config_param :tag, :string, :default => "oneagent.containerInsights.KUBE_PV_INVENTORY_BLOB" def configure(conf) super end - def start + def start if @run_interval + super @finished = false @condition = ConditionVariable.new @mutex = Mutex.new @@ -45,6 +51,7 @@ def shutdown @condition.signal } @thread.join + super end end @@ -54,7 +61,7 @@ def enumerate telemetryFlush = false @pvTypeToCountHash = {} currentTime = Time.now - batchTime = currentTime.utc.iso8601 + batchTime = currentTime.utc.iso8601 continuationToken = nil $log.info("in_kube_pvinventory::enumerate : Getting PVs from Kube API @ #{Time.now.utc.iso8601}") @@ -103,9 +110,9 @@ def enumerate end # end enumerate def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) - currentTime = Time.now - emitTime = currentTime.to_f - eventStream = MultiEventStream.new + currentTime = Time.now + emitTime = Fluent::Engine.now + eventStream = Fluent::MultiEventStream.new @@istestvar = ENV["ISTEST"] begin records = [] @@ -145,13 +152,8 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) end records.each do |record| - if !record.nil? - wrapper = { - "DataType" => "KUBE_PV_INVENTORY_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [record.each { |k, v| record[k] = v }], - } - eventStream.add(emitTime, wrapper) if wrapper + if !record.nil? 
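# Sketch of the PV telemetry accumulator above (types illustrative; the
# actual flush path into telemetry is assumed): while inventory records are
# built, the input also counts PVs per storage type in @pvTypeToCountHash.
@pvTypeToCountHash = {}
["azureDisk", "azureFile", "azureDisk"].each do |pv_type|
  @pvTypeToCountHash[pv_type] = @pvTypeToCountHash.fetch(pv_type, 0) + 1
end
@pvTypeToCountHash   #=> {"azureDisk"=>2, "azureFile"=>1}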
+ eventStream.add(emitTime, record) end end @@ -250,7 +252,6 @@ def run_periodic @mutex.lock end @mutex.unlock - end - + end end # Kube_PVInventory_Input -end # module \ No newline at end of file +end # module diff --git a/source/plugins/ruby/in_kubestate_deployments.rb b/source/plugins/ruby/in_kubestate_deployments.rb index 27e4709a2..182c3ffc1 100644 --- a/source/plugins/ruby/in_kubestate_deployments.rb +++ b/source/plugins/ruby/in_kubestate_deployments.rb @@ -1,9 +1,11 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -module Fluent +require 'fluent/plugin/input' + +module Fluent::Plugin class Kube_Kubestate_Deployments_Input < Input - Plugin.register_input("kubestatedeployments", self) + Fluent::Plugin.register_input("kubestate_deployments", self) @@istestvar = ENV["ISTEST"] # telemetry - To keep telemetry cost reasonable, we keep track of the max deployments over a period of 15m @@deploymentsCount = 0 @@ -36,14 +38,15 @@ def initialize end config_param :run_interval, :time, :default => 60 - config_param :tag, :string, :default => Constants::INSIGHTSMETRICS_FLUENT_TAG + config_param :tag, :string, :default => "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" def configure(conf) super end - def start + def start if @run_interval + super if !ENV["DEPLOYMENTS_CHUNK_SIZE"].nil? && !ENV["DEPLOYMENTS_CHUNK_SIZE"].empty? && ENV["DEPLOYMENTS_CHUNK_SIZE"].to_i > 0 @DEPLOYMENTS_CHUNK_SIZE = ENV["DEPLOYMENTS_CHUNK_SIZE"].to_i else @@ -52,11 +55,11 @@ def start @DEPLOYMENTS_CHUNK_SIZE = 500 end $log.info("in_kubestate_deployments::start : DEPLOYMENTS_CHUNK_SIZE @ #{@DEPLOYMENTS_CHUNK_SIZE}") - + @finished = false @condition = ConditionVariable.new @mutex = Mutex.new - @thread = Thread.new(&method(:run_periodic)) + @thread = Thread.new(&method(:run_periodic)) end end @@ -67,6 +70,7 @@ def shutdown @condition.signal } @thread.join + super # This super must be at the end of shutdown method end end @@ -77,8 +81,8 @@ def enumerate batchTime = currentTime.utc.iso8601 #set the running total for this batch to 0 - @deploymentsRunningTotal = 0 - + @deploymentsRunningTotal = 0 + # Initializing continuation token to nil continuationToken = nil $log.info("in_kubestate_deployments::enumerate : Getting deployments from Kube API @ #{Time.now.utc.iso8601}") @@ -126,7 +130,7 @@ def enumerate def parse_and_emit_records(deployments, batchTime = Time.utc.iso8601) metricItems = [] - insightsMetricsEventStream = MultiEventStream.new + insightsMetricsEventStream = Fluent::MultiEventStream.new begin metricInfo = deployments metricInfo["items"].each do |deployment| @@ -181,17 +185,12 @@ def parse_and_emit_records(deployments, batchTime = Time.utc.iso8601) metricItems.push(metricItem) end - time = Time.now.to_f - metricItems.each do |insightsMetricsRecord| - wrapper = { - "DataType" => "INSIGHTS_METRICS_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], - } - insightsMetricsEventStream.add(time, wrapper) if wrapper + time = Fluent::Engine.now + metricItems.each do |insightsMetricsRecord| + insightsMetricsEventStream.add(time, insightsMetricsRecord) if insightsMetricsRecord end - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + router.emit_stream(@tag, insightsMetricsEventStream) if insightsMetricsEventStream $log.info("successfully emitted #{metricItems.length()} kube_state_deployment metrics") @deploymentsRunningTotal = @deploymentsRunningTotal + metricItems.length() @@ 
-234,6 +233,6 @@ def run_periodic @mutex.lock end @mutex.unlock - end + end end end diff --git a/source/plugins/ruby/in_kubestate_hpa.rb b/source/plugins/ruby/in_kubestate_hpa.rb index afecf8e3b..8f60bfb72 100644 --- a/source/plugins/ruby/in_kubestate_hpa.rb +++ b/source/plugins/ruby/in_kubestate_hpa.rb @@ -1,9 +1,11 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -module Fluent +require 'fluent/plugin/input' + +module Fluent::Plugin class Kube_Kubestate_HPA_Input < Input - Plugin.register_input("kubestatehpa", self) + Fluent::Plugin.register_input("kubestate_hpa", self) @@istestvar = ENV["ISTEST"] def initialize @@ -16,7 +18,7 @@ def initialize require_relative "oms_common" require_relative "omslog" require_relative "ApplicationInsightsUtility" - require_relative "constants" + require_relative "constants" # refer tomlparser-agent-config for defaults # this configurable via configmap @@ -33,14 +35,15 @@ def initialize end config_param :run_interval, :time, :default => 60 - config_param :tag, :string, :default => Constants::INSIGHTSMETRICS_FLUENT_TAG + config_param :tag, :string, :default => "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" def configure(conf) super end - def start + def start if @run_interval + super if !ENV["HPA_CHUNK_SIZE"].nil? && !ENV["HPA_CHUNK_SIZE"].empty? && ENV["HPA_CHUNK_SIZE"].to_i > 0 @HPA_CHUNK_SIZE = ENV["HPA_CHUNK_SIZE"].to_i else @@ -64,6 +67,7 @@ def shutdown @condition.signal } @thread.join + super end end @@ -74,7 +78,7 @@ def enumerate batchTime = currentTime.utc.iso8601 @hpaCount = 0 - + # Initializing continuation token to nil continuationToken = nil $log.info("in_kubestate_hpa::enumerate : Getting HPAs from Kube API @ #{Time.now.utc.iso8601}") @@ -113,7 +117,7 @@ def enumerate def parse_and_emit_records(hpas, batchTime = Time.utc.iso8601) metricItems = [] - insightsMetricsEventStream = MultiEventStream.new + insightsMetricsEventStream = Fluent::MultiEventStream.new begin metricInfo = hpas metricInfo["items"].each do |hpa| @@ -181,17 +185,12 @@ def parse_and_emit_records(hpas, batchTime = Time.utc.iso8601) metricItems.push(metricItem) end - time = Time.now.to_f - metricItems.each do |insightsMetricsRecord| - wrapper = { - "DataType" => "INSIGHTS_METRICS_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], - } - insightsMetricsEventStream.add(time, wrapper) if wrapper + time = Fluent::Engine.now + metricItems.each do |insightsMetricsRecord| + insightsMetricsEventStream.add(time, insightsMetricsRecord) if insightsMetricsRecord end - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + router.emit_stream(@tag, insightsMetricsEventStream) if insightsMetricsEventStream $log.info("successfully emitted #{metricItems.length()} kube_state_hpa metrics") if (!@@istestvar.nil? && !@@istestvar.empty? 
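# Sketch of the telemetry throttle the kubestate inputs describe above (names
# and numbers illustrative; the real flush goes through
# ApplicationInsightsUtility): fold each enumeration's running total into a
# 15-minute maximum so telemetry volume stays flat regardless of poll rate.
TELEMETRY_WINDOW_MINUTES = 15
window_max    = 0
running_total = 0

# on every enumeration:
running_total += 120                        # e.g. deployments seen this pass
window_max = [window_max, running_total].max

# every TELEMETRY_WINDOW_MINUTES: report window_max, then reset both counters
window_max = running_total = 0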
&& @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) $log.info("kubestatehpaInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") @@ -232,6 +231,6 @@ def run_periodic @mutex.lock end @mutex.unlock - end + end end end diff --git a/source/plugins/ruby/in_win_cadvisor_perf.rb b/source/plugins/ruby/in_win_cadvisor_perf.rb index 9c267cf4f..61e823ea6 100644 --- a/source/plugins/ruby/in_win_cadvisor_perf.rb +++ b/source/plugins/ruby/in_win_cadvisor_perf.rb @@ -1,9 +1,11 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -module Fluent +require 'fluent/plugin/input' + +module Fluent::Plugin class Win_CAdvisor_Perf_Input < Input - Plugin.register_input("wincadvisorperf", self) + Fluent::Plugin.register_input("win_cadvisor_perf", self) @@winNodes = [] @@ -18,10 +20,11 @@ def initialize require_relative "oms_common" require_relative "omslog" require_relative "constants" + @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" end config_param :run_interval, :time, :default => 60 - config_param :tag, :string, :default => "oms.api.wincadvisorperf" + config_param :tag, :string, :default => "oneagent.containerInsights.LINUX_PERF_BLOB" config_param :mdmtag, :string, :default => "mdm.cadvisorperf" def configure(conf) @@ -50,11 +53,11 @@ def shutdown end def enumerate() - time = Time.now.to_f + time = Fluent::Engine.now begin timeDifference = (DateTime.now.to_time.to_i - @@winNodeQueryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 - @@istestvar = ENV["ISTEST"] + @@istestvar = ENV["ISTEST"] #Resetting this cache so that it is populated with the current set of containers with every call CAdvisorMetricsAPIClient.resetWinContainerIdCache() @@ -68,12 +71,10 @@ def enumerate() @@winNodeQueryTimeTracker = DateTime.now.to_time.to_i end @@winNodes.each do |winNode| - eventStream = MultiEventStream.new + eventStream = Fluent::MultiEventStream.new metricData = CAdvisorMetricsAPIClient.getMetrics(winNode: winNode, metricTime: Time.now.utc.iso8601) metricData.each do |record| if !record.empty? - record["DataType"] = "LINUX_PERF_BLOB" - record["IPName"] = "LogManagement" eventStream.add(time, record) if record end end @@ -88,18 +89,13 @@ def enumerate() begin containerGPUusageInsightsMetricsDataItems = [] containerGPUusageInsightsMetricsDataItems.concat(CAdvisorMetricsAPIClient.getInsightsMetrics(winNode: winNode, metricTime: Time.now.utc.iso8601)) - insightsMetricsEventStream = MultiEventStream.new + insightsMetricsEventStream = Fluent::MultiEventStream.new containerGPUusageInsightsMetricsDataItems.each do |insightsMetricsRecord| - wrapper = { - "DataType" => "INSIGHTS_METRICS_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], - } - insightsMetricsEventStream.add(time, wrapper) if wrapper + insightsMetricsEventStream.add(time, insightsMetricsRecord) if insightsMetricsRecord end - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream router.emit_stream(@mdmtag, insightsMetricsEventStream) if insightsMetricsEventStream if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) $log.info("winCAdvisorInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") diff --git a/source/plugins/ruby/out_health_forward.rb b/source/plugins/ruby/out_health_forward.rb index 6fcfe368b..59eed97da 100644 --- a/source/plugins/ruby/out_health_forward.rb +++ b/source/plugins/ruby/out_health_forward.rb @@ -15,469 +15,593 @@ # limitations under the License. # -require 'base64' -require 'socket' -require 'fileutils' - -require 'cool.io' - require 'fluent/output' require 'fluent/config/error' +require 'fluent/clock' +require 'fluent/tls' +require 'base64' +require 'forwardable' -module Fluent - class ForwardOutputError < StandardError - end - - class ForwardOutputResponseError < ForwardOutputError - end +require 'fluent/compat/socket_util' +require 'fluent/plugin/out_forward/handshake_protocol' +require 'fluent/plugin/out_forward/load_balancer' +require 'fluent/plugin/out_forward/socket_cache' +require 'fluent/plugin/out_forward/failure_detector' +require 'fluent/plugin/out_forward/error' +require 'fluent/plugin/out_forward/connection_manager' +require 'fluent/plugin/out_forward/ack_handler' - class ForwardOutputConnectionClosedError < ForwardOutputError - end +module Fluent::Plugin + class HealthForwardOutput < Output + Fluent::Plugin.register_output('health_forward', self) - class ForwardOutputACKTimeoutError < ForwardOutputResponseError - end + helpers :socket, :server, :timer, :thread, :compat_parameters, :service_discovery - class HealthForwardOutput < ObjectBufferedOutput - Plugin.register_output('health_forward', self) + LISTEN_PORT = 25227 - def initialize - super - require 'fluent/plugin/socket_util' - @nodes = [] #=> [Node] - end + desc 'The transport protocol.' + config_param :transport, :enum, list: [:tcp, :tls], default: :tcp + # TODO: TLS session cache/tickets desc 'The timeout time when sending event logs.' config_param :send_timeout, :time, default: 60 - desc 'The transport protocol to use for heartbeats.(udp,tcp,none)' - config_param :heartbeat_type, default: :udp do |val| - case val.downcase - when 'tcp' - :tcp - when 'udp' - :udp - when 'none' - :none - else - raise ConfigError, "forward output heartbeat type should be 'tcp', 'udp', or 'none'" - end - end + desc 'The timeout time for socket connect' + config_param :connect_timeout, :time, default: nil + # TODO: add linger_timeout, recv_timeout + + desc 'The protocol to use for heartbeats (default is the same with "transport").' + config_param :heartbeat_type, :enum, list: [:transport, :tcp, :udp, :none], default: :transport desc 'The interval of the heartbeat packer.' config_param :heartbeat_interval, :time, default: 1 desc 'The wait time before accepting a server fault recovery.' config_param :recover_wait, :time, default: 10 desc 'The hard timeout used to detect server failure.' config_param :hard_timeout, :time, default: 60 - desc 'Set TTL to expire DNS cache in seconds.' - config_param :expire_dns_cache, :time, default: nil # 0 means disable cache desc 'The threshold parameter used to detect server faults.' config_param :phi_threshold, :integer, default: 16 desc 'Use the "Phi accrual failure detector" to detect server failure.' config_param :phi_failure_detector, :bool, default: true - # if any options added that requires extended forward api, fix @extend_internal_protocol - desc 'Change the protocol to at-least-once.' 
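# Illustrative sketch (names made up) of the fluentd v1 output shape this
# file is rewritten into below: the plugin declares the helpers it relies on,
# accepts typed <server>-style sections, keys its buffer on "tag", and
# receives chunks whose metadata carries that tag in write().
require "fluent/plugin/output"

module Fluent::Plugin
  class ExampleForwardOutput < Output
    Fluent::Plugin.register_output("example_forward", self)
    helpers :server, :timer, :thread

    config_section :server, param_name: :servers do
      config_param :host, :string
      config_param :port, :integer, default: 25227
      config_param :weight, :integer, default: 60
    end

    config_section :buffer do
      config_set_default :chunk_keys, ["tag"]
    end

    def write(chunk)
      tag = chunk.metadata.tag
      @servers.each { |s| log.debug "would send #{tag} to #{s.host}:#{s.port}" }
    end
  end
end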
config_param :require_ack_response, :bool, default: false # require in_forward to respond with ack - desc 'This option is used when require_ack_response is true.' - config_param :ack_response_timeout, :time, default: 190 # 0 means do not wait for ack responses + + ## The reason of default value of :ack_response_timeout: # Linux default tcp_syn_retries is 5 (in many environment) # 3 + 6 + 12 + 24 + 48 + 96 -> 189 (sec) + desc 'This option is used when require_ack_response is true.' + config_param :ack_response_timeout, :time, default: 190 + + desc 'The interval while reading data from server' + config_param :read_interval_msec, :integer, default: 50 # 50ms + desc 'Reading data size from server' + config_param :read_length, :size, default: 512 # 512bytes + + desc 'Set TTL to expire DNS cache in seconds.' + config_param :expire_dns_cache, :time, default: nil # 0 means disable cache desc 'Enable client-side DNS round robin.' config_param :dns_round_robin, :bool, default: false # heartbeat_type 'udp' is not available for this + desc 'Ignore DNS resolution and errors at startup time.' + config_param :ignore_network_errors_at_startup, :bool, default: false + + desc 'Verify that a connection can be made with one of out_forward nodes at the time of startup.' + config_param :verify_connection_at_startup, :bool, default: false + + desc 'Compress buffered data.' + config_param :compress, :enum, list: [:text, :gzip], default: :text + + desc 'The default version of TLS transport.' + config_param :tls_version, :enum, list: Fluent::TLS::SUPPORTED_VERSIONS, default: Fluent::TLS::DEFAULT_VERSION + desc 'The cipher configuration of TLS transport.' + config_param :tls_ciphers, :string, default: Fluent::TLS::CIPHERS_DEFAULT + desc 'Skip all verification of certificates or not.' + config_param :tls_insecure_mode, :bool, default: false + desc 'Allow self signed certificates or not.' + config_param :tls_allow_self_signed_cert, :bool, default: false + desc 'Verify hostname of servers and certificates or not in TLS transport.' + config_param :tls_verify_hostname, :bool, default: true + desc 'The additional CA certificate path for TLS.' + config_param :tls_ca_cert_path, :array, value_type: :string, default: nil + desc 'The additional certificate path for TLS.' + config_param :tls_cert_path, :array, value_type: :string, default: nil + desc 'The client certificate path for TLS.' + config_param :tls_client_cert_path, :string, default: nil + desc 'The client private key path for TLS.' + config_param :tls_client_private_key_path, :string, default: nil + desc 'The client private key passphrase for TLS.' + config_param :tls_client_private_key_passphrase, :string, default: nil, secret: true + desc 'The certificate thumbprint for searching from Windows system certstore.' + config_param :tls_cert_thumbprint, :string, default: nil, secret: true + desc 'The certificate logical store name on Windows system certstore.' + config_param :tls_cert_logical_store_name, :string, default: nil + desc 'Enable to use certificate enterprise store on Windows system certstore.' + config_param :tls_cert_use_enterprise_store, :bool, default: true + desc "Enable keepalive connection." + config_param :keepalive, :bool, default: false + desc "Expired time of keepalive. 
Default value is nil, which means to keep connection as long as possible" + config_param :keepalive_timeout, :time, default: nil + + config_section :security, required: false, multi: false do + desc 'The hostname' + config_param :self_hostname, :string + desc 'Shared key for authentication' + config_param :shared_key, :string, secret: true + end + + config_section :server, param_name: :servers do + desc "The IP address or host name of the server." + config_param :host, :string + desc "The name of the server. Used for logging and certificate verification in TLS transport (when host is address)." + config_param :name, :string, default: nil + desc "The port number of the host." + config_param :port, :integer, default: LISTEN_PORT + desc "The shared key per server." + config_param :shared_key, :string, default: nil, secret: true + desc "The username for authentication." + config_param :username, :string, default: '' + desc "The password for authentication." + config_param :password, :string, default: '', secret: true + desc "Marks a node as the standby node for an Active-Standby model between Fluentd nodes." + config_param :standby, :bool, default: false + desc "The load balancing weight." + config_param :weight, :integer, default: 60 + end + attr_reader :nodes - config_param :port, :integer, default: DEFAULT_LISTEN_PORT, deprecated: "User host xxx instead." - config_param :host, :string, default: nil, deprecated: "Use port xxx instead." - desc 'Skip network related error, e.g. DNS error, during plugin setup' - config_param :skip_network_error_at_init, :bool, :default => false + config_param :port, :integer, default: LISTEN_PORT, obsoleted: "User <server> section instead." + config_param :host, :string, default: nil, obsoleted: "Use <server> section instead." + config_section :buffer do + config_set_default :chunk_keys, ["tag"] + end - attr_accessor :extend_internal_protocol + attr_reader :read_interval, :recover_sample_size - def configure(conf) + def initialize super - # backward compatibility - if host = conf['host'] - port = conf['port'] - port = port ? port.to_i : DEFAULT_LISTEN_PORT - e = conf.add_element('server') - e['host'] = host - e['port'] = port.to_s - end + @nodes = [] #=> [Node] + @loop = nil + @thread = nil - recover_sample_size = @recover_wait / @heartbeat_interval + @usock = nil + @keep_alive_watcher_interval = 5 # TODO + @suspend_flush = false + end - # add options here if any options addes which uses extended protocol - @extend_internal_protocol = if @require_ack_response - true - else - false - end + def configure(conf) + compat_parameters_convert(conf, :buffer, default_chunk_key: 'tag') - if @dns_round_robin - if @heartbeat_type == :udp - raise ConfigError, "forward output heartbeat type must be 'tcp' or 'none' to use dns_round_robin option" - end - end + super - conf.elements.each {|e| - next if e.name != "server" + unless @chunk_key_tag + raise Fluent::ConfigError, "buffer chunk key must include 'tag' for forward output" + end - host = e['host'] - port = e['port'] - port = port ? port.to_i : DEFAULT_LISTEN_PORT + @read_interval = @read_interval_msec / 1000.0 + @recover_sample_size = @recover_wait / @heartbeat_interval - weight = e['weight'] - weight = weight ? weight.to_i : 60 + if @heartbeat_type == :tcp + log.warn "'heartbeat_type tcp' is deprecated. use 'transport' instead." 
+ @heartbeat_type = :transport + end - standby = !!e['standby'] + if @dns_round_robin && @heartbeat_type == :udp + raise Fluent::ConfigError, "forward output heartbeat type must be 'transport' or 'none' to use dns_round_robin option" + end - name = e['name'] - unless name - name = "#{host}:#{port}" + if @transport == :tls + # socket helper adds CA cert or signed certificate to same cert store internally so unify it in this place. + if @tls_cert_path && !@tls_cert_path.empty? + @tls_ca_cert_path = @tls_cert_path + end + if @tls_ca_cert_path && !@tls_ca_cert_path.empty? + @tls_ca_cert_path.each do |path| + raise Fluent::ConfigError, "specified cert path does not exist:#{path}" unless File.exist?(path) + raise Fluent::ConfigError, "specified cert path is not readable:#{path}" unless File.readable?(path) + end end - failure = FailureDetector.new(@heartbeat_interval, @hard_timeout, Time.now.to_i.to_f) - - node_conf = NodeConfig2.new(name, host, port, weight, standby, failure, - @phi_threshold, recover_sample_size, @expire_dns_cache, @phi_failure_detector, @dns_round_robin, @skip_network_error_at_init) + if @tls_insecure_mode + log.warn "TLS transport is configured in insecure way" + @tls_verify_hostname = false + @tls_allow_self_signed_cert = true + end - if @heartbeat_type == :none - @nodes << NoneHeartbeatNode.new(log, node_conf) + if Fluent.windows? + if (@tls_cert_path || @tls_ca_cert_path) && @tls_cert_logical_store_name + raise Fluent::ConfigError, "specified both cert path and tls_cert_logical_store_name is not permitted" + end else - @nodes << Node.new(log, node_conf) + raise Fluent::ConfigError, "This parameter is for only Windows" if @tls_cert_logical_store_name + raise Fluent::ConfigError, "This parameter is for only Windows" if @tls_cert_thumbprint end - log.info "adding forwarding server '#{name}'", host: host, port: port, weight: weight, plugin_id: plugin_id - } + end + + @ack_handler = @require_ack_response ? AckHandler.new(timeout: @ack_response_timeout, log: @log, read_length: @read_length) : nil + socket_cache = @keepalive ? SocketCache.new(@keepalive_timeout, @log) : nil + @connection_manager = Fluent::Plugin::ForwardOutput::ConnectionManager.new( + log: @log, + secure: !!@security, + connection_factory: method(:create_transfer_socket), + socket_cache: socket_cache, + ) - if @nodes.empty? - raise ConfigError, "forward output plugin requires at least one <server> is required" + configs = [] + + # rewrite for using server as sd_static + conf.elements(name: 'server').each do |s| + s.name = 'service' + end - end - def start - super + unless conf.elements(name: 'service').empty? 
+ # To copy `services` element only + new_elem = Fluent::Config::Element.new('static_service_discovery', {}, {}, conf.elements(name: 'service')) + configs << { type: :static, conf: new_elem } + end - @rand_seed = Random.new.seed - rebuild_weight_array - @rr = 0 + conf.elements(name: 'service_discovery').each_with_index do |c, i| + configs << { type: @service_discovery[i][:@type], conf: c } + end - unless @heartbeat_type == :none - @loop = Coolio::Loop.new + service_discovery_create_manager( + :out_forward_service_discovery_watcher, + configurations: configs, + load_balancer: Fluent::Plugin::ForwardOutput::LoadBalancer.new(log), + custom_build_method: method(:build_node), + ) - if @heartbeat_type == :udp - # assuming all hosts use udp - @usock = SocketUtil.create_udp_socket(@nodes.first.host) - @usock.fcntl(Fcntl::F_SETFL, Fcntl::O_NONBLOCK) - @hb = HeartbeatHandler.new(@usock, method(:on_heartbeat)) - @loop.attach(@hb) + discovery_manager.services.each do |server| + # it's only for test + @nodes << server + unless @heartbeat_type == :none + begin + server.validate_host_resolution! + rescue => e + raise unless @ignore_network_errors_at_startup + log.warn "failed to resolve node name when configured", server: (server.name || server.host), error: e + server.disable! + end end + end - @timer = HeartbeatRequestTimer.new(@heartbeat_interval, method(:on_timer)) - @loop.attach(@timer) + unless @as_secondary + if @compress == :gzip && @buffer.compress == :text + @buffer.compress = :gzip + elsif @compress == :text && @buffer.compress == :gzip + log.info "buffer is compressed. If you also want to save the bandwidth of a network, Add `compress` configuration in " + end + end - @thread = Thread.new(&method(:run)) + if discovery_manager.services.empty? + raise Fluent::ConfigError, "forward output plugin requires at least one node is required. Add or " end - end - def shutdown - @finished = true - if @loop - @loop.watchers.each {|w| w.detach } - @loop.stop + if !@keepalive && @keepalive_timeout + log.warn('The value of keepalive_timeout is ignored. if you want to use keepalive, please add `keepalive true` to your conf.') end - @thread.join if @thread - @usock.close if @usock + + raise Fluent::ConfigError, "ack_response_timeout must be a positive integer" if @ack_response_timeout < 1 end - def run - @loop.run if @loop - rescue - log.error "unexpected error", error: $!.to_s - log.error_backtrace + def multi_workers_ready? + true end - def write_objects(tag, chunk) - return if chunk.empty? + def prefer_delayed_commit + @require_ack_response + end - error = nil + def overwrite_delayed_commit_timeout + # Output#start sets @delayed_commit_timeout by @buffer_config.delayed_commit_timeout + # But it should be overwritten by ack_response_timeout to rollback chunks after timeout + if @delayed_commit_timeout != @ack_response_timeout + log.info "delayed_commit_timeout is overwritten by ack_response_timeout" + @delayed_commit_timeout = @ack_response_timeout + 2 # minimum ack_reader IO.select interval is 1s + end + end - wlen = @weight_array.length - wlen.times do - @rr = (@rr + 1) % wlen - node = @weight_array[@rr] + def start + super - if node.available? 
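An aside on the configure logic above: each static <server> element is renamed to `service` and wrapped in one synthetic `static_service_discovery` element, so static servers and dynamic service discovery feed the same discovery manager. A rough standalone sketch of that normalization (Element is a stand-in Struct and the hostnames are placeholders, not fluentd's real config class or this patch's endpoints):

    # illustrative only; fluentd's Fluent::Config::Element is approximated by a Struct
    Element = Struct.new(:name, :attrs)
    servers = [Element.new('server', 'host' => 'mdsd-a', 'port' => '24224'),
               Element.new('server', 'host' => 'mdsd-b', 'port' => '24224')]
    servers.each { |s| s.name = 'service' }           # rewrite <server> as sd_static services
    configs = [{ type: :static, services: servers }]  # one static discovery configuration
    puts configs.first[:services].map { |s| "#{s.attrs['host']}:#{s.attrs['port']}" }.join(', ')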
-      wlen = @weight_array.length
-      wlen.times do
-        @rr = (@rr + 1) % wlen
-        node = @weight_array[@rr]
+    def start
+      super

-        if node.available?
+      unless @heartbeat_type == :none
+        if @heartbeat_type == :udp
+          @usock = socket_create_udp(discovery_manager.services.first.host, discovery_manager.services.first.port, nonblock: true)
+          server_create_udp(:out_forward_heartbeat_receiver, 0, socket: @usock, max_bytes: @read_length, &method(:on_udp_heatbeat_response_recv))
+        end
+        timer_execute(:out_forward_heartbeat_request, @heartbeat_interval, &method(:on_heartbeat_timer))
+      end
+
+      if @require_ack_response
+        overwrite_delayed_commit_timeout
+        thread_create(:out_forward_receiving_ack, &method(:ack_reader))
+      end
+
+      if @verify_connection_at_startup
+        discovery_manager.services.each do |node|
           begin
-            send_data(node, tag, chunk)
-            return
-          rescue
-            # for load balancing during detecting crashed servers
-            error = $!  # use the latest error
+            node.verify_connection
+          rescue StandardError => e
+            log.fatal "forward's connection setting error: #{e.message}"
+            raise Fluent::UnrecoverableError, e.message
           end
         end
       end

-      if error
-        raise error
-      else
-        raise "no nodes are available"  # TODO message
+      if @keepalive
+        timer_execute(:out_forward_keep_alived_socket_watcher, @keep_alive_watcher_interval, &method(:on_purge_obsolete_socks))
       end
     end

-    private
+    def close
+      if @usock
+        # close socket and ignore errors: this socket will not be used anyway.
+        @usock.close rescue nil
+      end

-    def rebuild_weight_array
-      standby_nodes, regular_nodes = @nodes.partition {|n|
-        n.standby?
-      }
+      super
+    end

-      lost_weight = 0
-      regular_nodes.each {|n|
-        unless n.available?
-          lost_weight += n.weight
-        end
-      }
-      log.debug "rebuilding weight array", lost_weight: lost_weight
+    def stop
+      super
+
+      if @keepalive
+        @connection_manager.stop
       end
+    end
+
+    def before_shutdown
+      super
+      @suspend_flush = true
+    end
+
+    def after_shutdown
+      last_ack if @require_ack_response
+      super
+    end

-      if lost_weight > 0
-        standby_nodes.each {|n|
-          if n.available?
-            regular_nodes << n
-            log.warn "using standby node #{n.host}:#{n.port}", weight: n.weight
-            lost_weight -= n.weight
-            break if lost_weight <= 0
-          end
-        }
+    def try_flush
+      return if @require_ack_response && @suspend_flush
+      super
+    end

-      weight_array = []
-      gcd = regular_nodes.map {|n| n.weight }.inject(0) {|r,w| r.gcd(w) }
-      regular_nodes.each {|n|
-        (n.weight / gcd).times {
-          weight_array << n
-        }
-      }
+    def last_ack
+      overwrite_delayed_commit_timeout
+      ack_check(ack_select_interval)
+    end

-      # for load balancing during detecting crashed servers
-      coe = (regular_nodes.size * 6) / weight_array.size
-      weight_array *= coe if coe > 1
+    def write(chunk)
+      return if chunk.empty?
+      tag = chunk.metadata.tag

-      r = Random.new(@rand_seed)
-      weight_array.sort_by! { r.rand }
+      discovery_manager.select_service { |node| node.send_data(tag, chunk) }
+    end

-      @weight_array = weight_array
-    end

-    # MessagePack FixArray length = 3 (if @extend_internal_protocol)
-    #                             = 2 (else)
-    FORWARD_HEADER     = [0x92].pack('C').freeze
-    FORWARD_HEADER_EXT = [0x93].pack('C').freeze
-    def forward_header
-      if @extend_internal_protocol
-        FORWARD_HEADER_EXT
-      else
-        FORWARD_HEADER
+    def try_write(chunk)
+      log.trace "writing a chunk to destination", chunk_id: dump_unique_id_hex(chunk.unique_id)
+      if chunk.empty?
+        commit_write(chunk.unique_id)
+        return
       end
+      tag = chunk.metadata.tag
+      discovery_manager.select_service { |node| node.send_data(tag, chunk) }
+      last_ack if @require_ack_response && @suspend_flush
     end

-    #FORWARD_TCP_HEARTBEAT_DATA = FORWARD_HEADER + ''.to_msgpack + [].to_msgpack
-    def send_heartbeat_tcp(node)
-      sock = connect(node)
-      begin
-        opt = [1, @send_timeout.to_i].pack('I!I!')  # { int l_onoff; int l_linger; }
-        sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_LINGER, opt)
-        opt = [@send_timeout.to_i, 0].pack('L!L!')  # struct timeval
-        # don't send any data to not cause a compatibility problem
-        #sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_SNDTIMEO, opt)
-        #sock.write FORWARD_TCP_HEARTBEAT_DATA
-        node.heartbeat(true)
-      ensure
-        sock.close
+    def create_transfer_socket(host, port, hostname, &block)
+      case @transport
+      when :tls
+        socket_create_tls(
+          host, port,
+          version: @tls_version,
+          ciphers: @tls_ciphers,
+          insecure: @tls_insecure_mode,
+          verify_fqdn: @tls_verify_hostname,
+          fqdn: hostname,
+          allow_self_signed_cert: @tls_allow_self_signed_cert,
+          cert_paths: @tls_ca_cert_path,
+          cert_path: @tls_client_cert_path,
+          private_key_path: @tls_client_private_key_path,
+          private_key_passphrase: @tls_client_private_key_passphrase,
+          cert_thumbprint: @tls_cert_thumbprint,
+          cert_logical_store_name: @tls_cert_logical_store_name,
+          cert_use_enterprise_store: @tls_cert_use_enterprise_store,
+
+          # Enabling SO_LINGER causes tcp port exhaustion on Windows.
+          # This is because dynamic ports are only 16384 (from 49152 to 65535) and
+          # expiring SO_LINGER enabled ports should wait 4 minutes
+          # where set by TcpTimeDelay. Its default value is 4 minutes.
+          # So, we should disable SO_LINGER on Windows to prevent flood of waiting ports.
+          linger_timeout: Fluent.windows? ? nil : @send_timeout,
+          send_timeout: @send_timeout,
+          recv_timeout: @ack_response_timeout,
+          connect_timeout: @connect_timeout,
+          &block
+        )
+      when :tcp
+        socket_create_tcp(
+          host, port,
+          linger_timeout: @send_timeout,
+          send_timeout: @send_timeout,
+          recv_timeout: @ack_response_timeout,
+          connect_timeout: @connect_timeout,
+          &block
+        )
+      else
+        raise "BUG: unknown transport protocol #{@transport}"
       end
     end
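The linger comment above explains the why; for the how, here is a minimal sketch of setting SO_LINGER on a plain Ruby TCPSocket, which is what the :tcp branch delegates to the socket helper (the endpoint is a placeholder and assumes something is listening; socket_create_tcp itself is a fluentd plugin helper, not shown):

    require 'socket'

    sock = TCPSocket.new('127.0.0.1', 24224)   # placeholder endpoint
    linger = [1, 60].pack('ii')                # struct linger: l_onoff=1, l_linger=60s
    sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_LINGER, linger)
    sock.close                                 # close now lingers up to 60s to flush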
-    def send_data(node, tag, chunk)
-      sock = connect(node)
-      begin
-        opt = [1, @send_timeout.to_i].pack('I!I!')  # { int l_onoff; int l_linger; }
-        sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_LINGER, opt)
-
-        opt = [@send_timeout.to_i, 0].pack('L!L!')  # struct timeval
-        sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_SNDTIMEO, opt)
-
-        # beginArray(2)
-        sock.write forward_header
-
-        # writeRaw(tag)
-        sock.write tag.to_msgpack  # tag
-
-        # beginRaw(size)
-        sz = chunk.size
-        #if sz < 32
-        #  # FixRaw
-        #  sock.write [0xa0 | sz].pack('C')
-        #elsif sz < 65536
-        #  # raw 16
-        #  sock.write [0xda, sz].pack('Cn')
-        #else
-        # raw 32
-        sock.write [0xdb, sz].pack('CN')
-        #end
-
-        # writeRawBody(packed_es)
-        chunk.write_to(sock)
-
-        if @extend_internal_protocol
-          option = {}
-          option['chunk'] = Base64.encode64(chunk.unique_id) if @require_ack_response
-          sock.write option.to_msgpack
-
-          if @require_ack_response && @ack_response_timeout > 0
-            # Waiting for a response here results in a decrease of throughput because a chunk queue is locked.
-            # To avoid a decrease of troughput, it is necessary to prepare a list of chunks that wait for responses
-            # and process them asynchronously.
-            if IO.select([sock], nil, nil, @ack_response_timeout)
-              raw_data = sock.recv(1024)
-
-              # When connection is closed by remote host, socket is ready to read and #recv returns an empty string that means EOF.
-              # If this happens we assume the data wasn't delivered and retry it.
-              if raw_data.empty?
-                @log.warn "node #{node.host}:#{node.port} closed the connection. regard it as unavailable."
-                node.disable!
-                raise ForwardOutputConnectionClosedError, "node #{node.host}:#{node.port} closed connection"
-              else
-                # Serialization type of the response is same as sent data.
-                res = MessagePack.unpack(raw_data)
-
-                if res['ack'] != option['chunk']
-                  # Some errors may have occured when ack and chunk id is different, so send the chunk again.
-                  raise ForwardOutputResponseError, "ack in response and chunk id in sent data are different"
-                end
-              end
-
-            else
-              # IO.select returns nil on timeout.
-              # There are 2 types of cases when no response has been received:
-              # (1) the node does not support sending responses
-              # (2) the node does support sending response but responses have not arrived for some reasons.
-              @log.warn "no response from #{node.host}:#{node.port}. regard it as unavailable."
-              node.disable!
-              raise ForwardOutputACKTimeoutError, "node #{node.host}:#{node.port} does not return ACK"
-            end
-          end
-        end
+    def statistics
+      stats = super
+      services = discovery_manager.services
+      healthy_nodes_count = 0
+      registered_nodes_count = services.size
+      services.each do |s|
+        if s.available?
+          healthy_nodes_count += 1
         end
-
-        node.heartbeat(false)
-        return res  # for test
-      ensure
-        sock.close
       end
+
+      stats.merge(
+        'healthy_nodes_count' => healthy_nodes_count,
+        'registered_nodes_count' => registered_nodes_count,
+      )
     end

-    def connect(node)
-      # TODO unix socket?
-      TCPSocket.new(node.resolved_host, node.port)
+    # MessagePack FixArray length is 3
+    FORWARD_HEADER = [0x93].pack('C').freeze
+    def forward_header
+      FORWARD_HEADER
     end

-    class HeartbeatRequestTimer < Coolio::TimerWatcher
-      def initialize(interval, callback)
-        super(interval, true)
-        @callback = callback
-      end
+    private

-      def on_timer
-        @callback.call
-      rescue
-        # TODO log?
+    def build_node(server)
+      name = server.name || "#{server.host}:#{server.port}"
+      log.info "adding forwarding server '#{name}'", host: server.host, port: server.port, weight: server.weight, plugin_id: plugin_id
+
+      failure = Fluent::Plugin::ForwardOutput::FailureDetector.new(@heartbeat_interval, @hard_timeout, Time.now.to_i.to_f)
+      if @heartbeat_type == :none
+        NoneHeartbeatNode.new(self, server, failure: failure, connection_manager: @connection_manager, ack_handler: @ack_handler)
+      else
+        Node.new(self, server, failure: failure, connection_manager: @connection_manager, ack_handler: @ack_handler)
       end
     end

-    def on_timer
-      return if @finished
-      @nodes.each {|n|
-        if n.tick
-          rebuild_weight_array
-        end
+    def on_heartbeat_timer
+      need_rebuild = false
+      discovery_manager.services.each do |n|
         begin
-          #log.trace "sending heartbeat #{n.host}:#{n.port} on #{@heartbeat_type}"
-          if @heartbeat_type == :tcp
-            send_heartbeat_tcp(n)
-          else
-            @usock.send "\0", 0, Socket.pack_sockaddr_in(n.port, n.resolved_host)
-          end
-        rescue Errno::EAGAIN, Errno::EWOULDBLOCK, Errno::EINTR, Errno::ECONNREFUSED
-          # TODO log
-          log.debug "failed to send heartbeat packet to #{n.host}:#{n.port}", error: $!.to_s
+          log.trace "sending heartbeat", host: n.host, port: n.port, heartbeat_type: @heartbeat_type
+          n.usock = @usock if @usock
+          need_rebuild = n.send_heartbeat || need_rebuild
+        rescue Errno::EAGAIN, Errno::EWOULDBLOCK, Errno::EINTR, Errno::ECONNREFUSED, Errno::ETIMEDOUT => e
+          log.debug "failed to send heartbeat packet", host: n.host, port: n.port, heartbeat_type: @heartbeat_type, error: e
+        rescue => e
+          log.debug "unexpected error happened during heartbeat", host: n.host, port: n.port, heartbeat_type: @heartbeat_type, error: e
         end
-      }
-    end
+        need_rebuild = n.tick || need_rebuild
+      end

-    class HeartbeatHandler < Coolio::IO
-      def initialize(io, callback)
-        super(io)
-        @io = io
-        @callback = callback
+      if need_rebuild
+        discovery_manager.rebalance
       end
     end

-      def on_readable
-        begin
-          msg, addr = @io.recvfrom(1024)
-        rescue Errno::EAGAIN, Errno::EWOULDBLOCK, Errno::EINTR
-          return
-        end
-        host = addr[3]
-        port = addr[1]
-        sockaddr = Socket.pack_sockaddr_in(port, host)
-        @callback.call(sockaddr, msg)
-      rescue
-        # TODO log?
-      end
-    end

-    def on_heartbeat(sockaddr, msg)
-      port, host = Socket.unpack_sockaddr_in(sockaddr)
-      if node = @nodes.find {|n| n.sockaddr == sockaddr }
-        #log.trace "heartbeat from '#{node.name}'", :host=>node.host, :port=>node.port
+    def on_udp_heatbeat_response_recv(data, sock)
+      sockaddr = Socket.pack_sockaddr_in(sock.remote_port, sock.remote_host)
+      if node = discovery_manager.services.find { |n| n.sockaddr == sockaddr }
+        # log.trace "heartbeat arrived", name: node.name, host: node.host, port: node.port
         if node.heartbeat
-          rebuild_weight_array
+          discovery_manager.rebalance
         end
+      else
+        log.warn("Unknown heartbeat response received from #{sock.remote_host}:#{sock.remote_port}. It may be out of service.")
       end
     end

-    NodeConfig2 = Struct.new("NodeConfig2", :name, :host, :port, :weight, :standby, :failure,
-      :phi_threshold, :recover_sample_size, :expire_dns_cache, :phi_failure_detector, :dns_round_robin, :skip_network_error)
+    def on_purge_obsolete_socks
+      @connection_manager.purge_obsolete_socks
+    end
+
+    def ack_select_interval
+      if @delayed_commit_timeout > 3
+        1
+      else
+        @delayed_commit_timeout / 3.0
+      end
+    end
+
+    def ack_reader
+      select_interval = ack_select_interval
+
+      while thread_current_running?
+        ack_check(select_interval)
+      end
+    end
+
+    def ack_check(select_interval)
+      @ack_handler.collect_response(select_interval) do |chunk_id, node, sock, result|
+        @connection_manager.close(sock)
+
+        case result
+        when AckHandler::Result::SUCCESS
+          commit_write(chunk_id)
+        when AckHandler::Result::FAILED
+          node.disable!
+          rollback_write(chunk_id, update_retry: false)
+        when AckHandler::Result::CHUNKID_UNMATCHED
+          rollback_write(chunk_id, update_retry: false)
+        else
+          log.warn("BUG: invalid status #{result} #{chunk_id}")
+
+          if chunk_id
+            rollback_write(chunk_id, update_retry: false)
+          end
+        end
+      end
+    end
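For orientation, the commit/rollback decision in ack_check above reduces to a small state table. A self-contained sketch (the Result constants and handler are stand-ins, not the plugin's real AckHandler types):

    module Result
      SUCCESS = :success
      FAILED = :failed
      CHUNKID_UNMATCHED = :chunkid_unmatched
    end

    def handle_ack(result, chunk_id)
      case result
      when Result::SUCCESS           then "commit #{chunk_id}"                  # chunk delivered
      when Result::FAILED            then "disable node, rollback #{chunk_id}"  # retry elsewhere
      when Result::CHUNKID_UNMATCHED then "rollback #{chunk_id}"                # wrong ack, resend
      else                                "BUG: rollback #{chunk_id} defensively"
      end
    end

    puts handle_ack(Result::SUCCESS, "abc123")  # => commit abc123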
    class Node
-      def initialize(log, conf)
-        @log = log
-        @conf = conf
-        @name = @conf.name
-        @host = @conf.host
-        @port = @conf.port
-        @weight = @conf.weight
-        @failure = @conf.failure
+      extend Forwardable
+      def_delegators :@server, :discovery_id, :host, :port, :name, :weight, :standby
+
+      # @param connection_manager [Fluent::Plugin::ForwardOutput::ConnectionManager]
+      # @param ack_handler [Fluent::Plugin::ForwardOutput::AckHandler]
+      def initialize(sender, server, failure:, connection_manager:, ack_handler:)
+        @sender = sender
+        @log = sender.log
+        @compress = sender.compress
+        @server = server
+
+        @name = server.name
+        @host = server.host
+        @port = server.port
+        @weight = server.weight
+        @standby = server.standby
+        @failure = failure
         @available = true

+        # @hostname is used for certificate verification & TLS SNI
+        host_is_hostname = !(IPAddr.new(@host) rescue false)
+        @hostname = case
+                    when host_is_hostname then @host
+                    when @name then @name
+                    else nil
+                    end
+
+        @usock = nil
+
+        @handshake = Fluent::Plugin::ForwardOutput::HandshakeProtocol.new(
+          log: @log,
+          hostname: sender.security && sender.security.self_hostname,
+          shared_key: server.shared_key || (sender.security && sender.security.shared_key) || '',
+          password: server.password || '',
+          username: server.username || '',
+        )
+
+        @unpacker = Fluent::MessagePackFactory.msgpack_unpacker
+
         @resolved_host = nil
         @resolved_time = 0
-        begin
-          resolved_host  # check dns
-        rescue => e
-          if @conf.skip_network_error
-            log.warn "#{@name} got network error during setup. Resolve host later", :error => e, :error_class => e.class
-          else
-            raise
-          end
-        end
-      end
+        @resolved_once = false
+
+        @connection_manager = connection_manager
+        @ack_handler = ack_handler
+      end
+
+      attr_accessor :usock

-      attr_reader :conf
-      attr_reader :name, :host, :port, :weight
-      attr_reader :sockaddr  # used by on_heartbeat
-      attr_reader :failure, :available  # for test
+      attr_reader :state
+      attr_reader :sockaddr  # used by on_udp_heatbeat_response_recv
+      attr_reader :failure  # for test
+
+      def validate_host_resolution!
+        resolved_host
+      end

       def available?
         @available
@@ -488,41 +612,158 @@ def disable!
       end

       def standby?
-        @conf.standby
+        @standby
+      end
+
+      def verify_connection
+        connect do |sock, ri|
+          ensure_established_connection(sock, ri)
+        end
+      end
+
+      def establish_connection(sock, ri)
+        while ri.state != :established
+          begin
+            # TODO: On Ruby 2.2 or earlier, read_nonblock doesn't work expectedly.
+            # We need rewrite around here using new socket/server plugin helper.
+            buf = sock.read_nonblock(@sender.read_length)
+            if buf.empty?
+              sleep @sender.read_interval
+              next
+            end
+            @unpacker.feed_each(buf) do |data|
+              if @handshake.invoke(sock, ri, data) == :established
+                @log.debug "connection established", host: @host, port: @port
+              end
+            end
+          rescue IO::WaitReadable
+            # If the exception is Errno::EWOULDBLOCK or Errno::EAGAIN, it is extended by IO::WaitReadable.
+            # So IO::WaitReadable can be used to rescue the exceptions for retrying read_nonblock.
+            # https://docs.ruby-lang.org/en/2.3.0/IO.html#method-i-read_nonblock
+            sleep @sender.read_interval unless ri.state == :established
+          rescue SystemCallError => e
+            @log.warn "disconnected by error", host: @host, port: @port, error: e
+            disable!
+            break
+          rescue EOFError
+            @log.warn "disconnected", host: @host, port: @port
+            disable!
+            break
+          rescue HeloError => e
+            @log.warn "received invalid helo message from #{@name}"
+            disable!
+            break
+          rescue PingpongError => e
+            @log.warn "connection refused to #{@name || @host}: #{e.message}"
+            disable!
+            break
+          end
+        end
+      end
+
+      def send_data_actual(sock, tag, chunk)
+        option = { 'size' => chunk.size, 'compressed' => @compress }
+        option['chunk'] = Base64.encode64(chunk.unique_id) if @ack_handler
+
+        # https://github.com/fluent/fluentd/wiki/Forward-Protocol-Specification-v1#packedforward-mode
+        # out_forward always uses str32 type for entries.
+        # str16 can store only 64kbytes, and it should be much smaller than buffer chunk size.
+
+        tag = tag.dup.force_encoding(Encoding::UTF_8)
+
+        sock.write @sender.forward_header                     # array, size=3
+        sock.write tag.to_msgpack                             # 1. tag: String (str)
+        chunk.open(compressed: @compress) do |chunk_io|
+          entries = [0xdb, chunk_io.size].pack('CN')
+          sock.write entries.force_encoding(Encoding::UTF_8)  # 2. entries: String (str32)
+          IO.copy_stream(chunk_io, sock)                      # writeRawBody(packed_es)
+        end
+        sock.write option.to_msgpack                          # 3. option: Hash(map)
+
+        # TODO: use bin32 for non-utf8 content(entries) when old msgpack-ruby (0.5.x or earlier) not supported
+      end
+
+      def send_data(tag, chunk)
+        ack = @ack_handler && @ack_handler.create_ack(chunk.unique_id, self)
+        connect(nil, ack: ack) do |sock, ri|
+          ensure_established_connection(sock, ri)
+          send_data_actual(sock, tag, chunk)
+        end
+
+        heartbeat(false)
+        nil
+      end
+
+      # FORWARD_TCP_HEARTBEAT_DATA = FORWARD_HEADER + ''.to_msgpack + [].to_msgpack
+      #
+      # @return [Boolean] return true if it needs to rebuild nodes
+      def send_heartbeat
+        begin
+          dest_addr = resolved_host
+          @resolved_once = true
+        rescue ::SocketError => e
+          if !@resolved_once && @sender.ignore_network_errors_at_startup
+            @log.warn "failed to resolve node name in heartbeating", server: @name || @host, error: e
+            return false
+          end
+          raise
+        end
+
+        case @sender.heartbeat_type
+        when :transport
+          connect(dest_addr) do |sock, ri|
+            ensure_established_connection(sock, ri)
+
+            ## don't send any data to not cause a compatibility problem
+            # sock.write FORWARD_TCP_HEARTBEAT_DATA
+
+            # successful tcp connection establishment is considered as valid heartbeat.
+            # When a heartbeat succeeds after the node was detached, return true so the weight array is rebuilt.
+            heartbeat(true)
+          end
+        when :udp
+          @usock.send "\0", 0, Socket.pack_sockaddr_in(@port, dest_addr)
+          # the response is received by on_udp_heatbeat_response_recv
+          false
+        when :none  # :none doesn't use this class
+          raise "BUG: heartbeat_type none must not use Node"
+        else
+          raise "BUG: unknown heartbeat_type '#{@sender.heartbeat_type}'"
+        end
+      end
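send_data_actual above is the PackedForward frame writer; the same bytes can be produced standalone, which makes the format easier to see. A sketch assuming the msgpack gem is installed (the tag and the two hand-packed events stand in for a real buffer chunk):

    require 'msgpack'
    require 'stringio'

    tag = 'oms.containerinsights.example'            # hypothetical tag
    entries = [[1621637184, { 'Log' => 'hello' }],
               [1621637185, { 'Log' => 'world' }]].map(&:to_msgpack).join

    frame = StringIO.new
    frame.write [0x93].pack('C')                     # fixarray of size 3 (forward_header)
    frame.write tag.to_msgpack                       # 1. tag
    frame.write [0xdb, entries.bytesize].pack('CN')  # 2. entries, always str32
    frame.write entries
    frame.write({ 'size' => 2, 'compressed' => 'text' }.to_msgpack)  # 3. option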
      def resolved_host
-        case @conf.expire_dns_cache
+        case @sender.expire_dns_cache
         when 0 # cache is disabled
-          return resolve_dns!
+          resolve_dns!
         when nil # persistent cache
-          return @resolved_host ||= resolve_dns!
+          @resolved_host ||= resolve_dns!
         else
-          now = Engine.now
+          now = Fluent::EventTime.now
           rh = @resolved_host
-          if !rh || now - @resolved_time >= @conf.expire_dns_cache
+          if !rh || now - @resolved_time >= @sender.expire_dns_cache
             rh = @resolved_host = resolve_dns!
             @resolved_time = now
           end
-          return rh
+          rh
         end
       end

       def resolve_dns!
         addrinfo_list = Socket.getaddrinfo(@host, @port, nil, Socket::SOCK_STREAM)
-        addrinfo = @conf.dns_round_robin ? addrinfo_list.sample : addrinfo_list.first
-        @sockaddr = Socket.pack_sockaddr_in(addrinfo[1], addrinfo[3])  # used by on_heartbeat
+        addrinfo = @sender.dns_round_robin ? addrinfo_list.sample : addrinfo_list.first
+        @sockaddr = Socket.pack_sockaddr_in(addrinfo[1], addrinfo[3])  # used by on_udp_heatbeat_response_recv
         addrinfo[3]
       end
       private :resolve_dns!

       def tick
         now = Time.now.to_f
-        if !@available
+        unless available?
           if @failure.hard_timeout?(now)
             @failure.clear
           end
@@ -531,41 +772,51 @@ def tick

         if @failure.hard_timeout?(now)
           @log.warn "detached forwarding server '#{@name}'", host: @host, port: @port, hard_timeout: true
-          @available = false
+          disable!
           @resolved_host = nil  # expire cached host
           @failure.clear
           return true
         end

-        if @conf.phi_failure_detector
+        if @sender.phi_failure_detector
           phi = @failure.phi(now)
-          #$log.trace "phi '#{@name}'", :host=>@host, :port=>@port, :phi=>phi
-          if phi > @conf.phi_threshold
-            @log.warn "detached forwarding server '#{@name}'", host: @host, port: @port, phi: phi
-            @available = false
+          if phi > @sender.phi_threshold
+            @log.warn "detached forwarding server '#{@name}'", host: @host, port: @port, phi: phi, phi_threshold: @sender.phi_threshold
+            disable!
             @resolved_host = nil  # expire cached host
             @failure.clear
             return true
           end
         end

-        return false
+        false
       end

       def heartbeat(detect=true)
         now = Time.now.to_f
         @failure.add(now)
-        #@log.trace "heartbeat from '#{@name}'", :host=>@host, :port=>@port, :available=>@available, :sample_size=>@failure.sample_size
-        if detect && !@available && @failure.sample_size > @conf.recover_sample_size
+        if detect && !available? && @failure.sample_size > @sender.recover_sample_size
           @available = true
           @log.warn "recovered forwarding server '#{@name}'", host: @host, port: @port
-          return true
+          true
         else
-          return nil
+          nil
         end
       end

-      def to_msgpack(out = '')
-        [@host, @port, @weight, @available].to_msgpack(out)
+      private
+
+      def ensure_established_connection(sock, request_info)
+        if request_info.state != :established
+          establish_connection(sock, request_info)
+
+          if request_info.state != :established
+            raise ConnectionClosedError, "failed to establish connection with node #{@name}"
+          end
+        end
+      end
+
+      def connect(host = nil, ack: false, &block)
+        @connection_manager.connect(host: host || resolved_host, port: port, hostname: @hostname, ack: ack, &block)
       end
     end

@@ -583,96 +834,5 @@ def heartbeat(detect=true)
         true
       end
     end
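The expire_dns_cache contract implemented by resolved_host above (0 = never cache, nil = cache forever, otherwise a TTL in seconds) can be condensed into a few lines. This sketch is illustrative only and not part of the patch:

    require 'socket'

    def cached_resolved_host(state, host, ttl, now = Time.now.to_f)
      return Socket.getaddrinfo(host, nil)[0][3] if ttl == 0         # cache disabled
      return state[:host] ||= Socket.getaddrinfo(host, nil)[0][3] if ttl.nil?  # persistent
      if state[:host].nil? || now - state[:at].to_f >= ttl           # TTL expired
        state[:host] = Socket.getaddrinfo(host, nil)[0][3]
        state[:at] = now
      end
      state[:host]
    end

    state = {}
    puts cached_resolved_host(state, 'localhost', 300)  # resolves, then caches for 5 minutes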
-
-    class FailureDetector
-      PHI_FACTOR = 1.0 / Math.log(10.0)
-      SAMPLE_SIZE = 1000
-
-      def initialize(heartbeat_interval, hard_timeout, init_last)
-        @heartbeat_interval = heartbeat_interval
-        @last = init_last
-        @hard_timeout = hard_timeout
-
-        # microsec
-        @init_gap = (heartbeat_interval * 1e6).to_i
-        @window = [@init_gap]
-      end
-
-      def hard_timeout?(now)
-        now - @last > @hard_timeout
-      end
-
-      def add(now)
-        if @window.empty?
-          @window << @init_gap
-          @last = now
-        else
-          gap = now - @last
-          @window << (gap * 1e6).to_i
-          @window.shift if @window.length > SAMPLE_SIZE
-          @last = now
-        end
-      end
-
-      def phi(now)
-        size = @window.size
-        return 0.0 if size == 0
-
-        # Calculate weighted moving average
-        mean_usec = 0
-        fact = 0
-        @window.each_with_index {|gap,i|
-          mean_usec += gap * (1+i)
-          fact += (1+i)
-        }
-        mean_usec = mean_usec / fact
-
-        # Normalize arrive intervals into 1sec
-        mean = (mean_usec.to_f / 1e6) - @heartbeat_interval + 1
-
-        # Calculate phi of the phi accrual failure detector
-        t = now - @last - @heartbeat_interval + 1
-        phi = PHI_FACTOR * t / mean
-
-        return phi
-      end
-
-      def sample_size
-        @window.size
-      end
-
-      def clear
-        @window.clear
-        @last = 0
-      end
-    end
-
-    ## TODO
-    #class RPC
-    #  def initialize(this)
-    #    @this = this
-    #  end
-    #
-    #  def list_nodes
-    #    @this.nodes
-    #  end
-    #
-    #  def list_fault_nodes
-    #    list_nodes.select {|n| !n.available? }
-    #  end
-    #
-    #  def list_available_nodes
-    #    list_nodes.select {|n| n.available? }
-    #  end
-    #
-    #  def add_node(name, host, port, weight)
-    #  end
-    #
-    #  def recover_node(host, port)
-    #  end
-    #
-    #  def remove_node(host, port)
-    #  end
-    #end
   end
 end
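The FailureDetector deleted above (the v1 base reuses fluentd's bundled Fluent::Plugin::ForwardOutput::FailureDetector instead) is a simplified phi accrual detector: suspicion grows with the time since the last heartbeat, scaled by the weighted mean of past heartbeat gaps. Its phi computation restated compactly, as an assumed simplification with gaps in microseconds like the original:

    PHI_FACTOR = 1.0 / Math.log(10.0)

    def phi(gaps_usec, last, now, heartbeat_interval)
      # weighted moving average: newer gaps count more
      mean_usec = gaps_usec.each_with_index.sum { |g, i| g * (i + 1) }.fdiv(
        gaps_usec.each_index.sum { |i| i + 1 })
      mean = mean_usec / 1e6 - heartbeat_interval + 1   # normalize intervals to ~1s
      t = now - last - heartbeat_interval + 1           # time since last heartbeat
      PHI_FACTOR * t / mean                             # phi rises as t outgrows mean
    end

    puts phi([1_000_000, 1_100_000], Time.now.to_f - 5, Time.now.to_f, 1)  # ~2, suspicious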
diff --git a/source/plugins/ruby/out_mdm.rb b/source/plugins/ruby/out_mdm.rb
index 6238eb51a..8e80fb753 100644
--- a/source/plugins/ruby/out_mdm.rb
+++ b/source/plugins/ruby/out_mdm.rb
@@ -1,11 +1,12 @@
 #!/usr/local/bin/ruby
 # frozen_string_literal: true

-module Fluent
-  class OutputMDM < BufferedOutput
-    config_param :retry_mdm_post_wait_minutes, :integer
+require 'fluent/plugin/output'

-    Plugin.register_output("out_mdm", self)
+module Fluent::Plugin
+  class OutputMDM < Output
+    config_param :retry_mdm_post_wait_minutes, :integer
+    Fluent::Plugin.register_output("mdm", self)

     def initialize
       super
@@ -57,8 +58,6 @@ def initialize
     end

     def configure(conf)
-      s = conf.add_element("secondary")
-      s["type"] = ChunkErrorHandler::SecondaryName
       super
     end
@@ -204,7 +203,7 @@ def get_access_token
     end

     def write_status_file(success, message)
-      fn = "/var/opt/microsoft/omsagent/log/MDMIngestion.status"
+      fn = "/var/opt/microsoft/docker-cimprov/log/MDMIngestion.status"
       status = '{ "operation": "MDMIngestion", "success": "%s", "message": "%s" }' % [success, message]
       begin
         File.open(fn, "w") { |file| file.write(status) }
@@ -270,6 +269,7 @@ def write(chunk)
       flush_mdm_exception_telemetry
       if (!@first_post_attempt_made || (Time.now > @last_post_attempt_time + retry_mdm_post_wait_minutes * 60)) && @can_send_data_to_mdm
         post_body = []
+        chunk.extend Fluent::ChunkMessagePackEventStreamer
         chunk.msgpack_each { |(tag, record)|
           post_body.push(record.to_json)
         }
@@ -320,7 +320,7 @@ def send_to_mdm(post_body)
         ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMSendSuccessful", {})
         @last_telemetry_sent_time = Time.now
       end
-    rescue Net::HTTPServerException => e
+    rescue Net::HTTPClientException => e # see https://docs.ruby-lang.org/en/2.6.0/NEWS.html about deprecating HTTPServerException and adding HTTPClientException
       if !response.nil? && !response.body.nil? #body will have actual error
         @log.info "Failed to Post Metrics to MDM : #{e} Response.body: #{response.body}"
       else
@@ -334,7 +334,7 @@ def send_to_mdm(post_body)
         # Not raising exception, as that will cause retries to happen
       elsif !response.code.empty? && response.code.start_with?("4")
         # Log 400 errors and continue
-        @log.info "Non-retryable HTTPServerException when POSTing Metrics to MDM #{e} Response: #{response}"
+        @log.info "Non-retryable HTTPClientException when POSTing Metrics to MDM #{e} Response: #{response}"
       else
         # raise if the response code is non-400
         @log.info "HTTPServerException when POSTing Metrics to MDM #{e} Response: #{response}"
@@ -352,72 +352,5 @@ def send_to_mdm(post_body)
         raise e
       end
     end
-
-    private
-
-    class ChunkErrorHandler
-      include Configurable
-      include PluginId
-      include PluginLoggerMixin
-
-      SecondaryName = "__ChunkErrorHandler__"
-
-      Plugin.register_output(SecondaryName, self)
-
-      def initialize
-        @router = nil
-      end
-
-      def secondary_init(primary)
-        @error_handlers = create_error_handlers @router
-      end
-
-      def start
-        # NOP
-      end
-
-      def shutdown
-        # NOP
-      end
-
-      def router=(r)
-        @router = r
-      end
-
-      def write(chunk)
-        chunk.msgpack_each { |(tag, record)|
-          @error_handlers[tag].emit(record)
-        }
-      end
-
-      private
-
-      def create_error_handlers(router)
-        nop_handler = NopErrorHandler.new
-        Hash.new() { |hash, tag|
-          etag = OMS::Common.create_error_tag tag
-          hash[tag] = router.match?(etag) ?
-                      ErrorHandler.new(router, etag) :
-                      nop_handler
-        }
-      end
-
-      class ErrorHandler
-        def initialize(router, etag)
-          @router = router
-          @etag = etag
-        end
-
-        def emit(record)
-          @router.emit(@etag, Fluent::Engine.now, record)
-        end
-      end
-
-      class NopErrorHandler
-        def emit(record)
-          # NOP
-        end
-      end
-    end
 end # class OutputMDM
 end # module Fluent
diff --git a/source/plugins/ruby/podinventory_to_mdm.rb b/source/plugins/ruby/podinventory_to_mdm.rb
index d9cb71bd4..c24a91a87 100644
--- a/source/plugins/ruby/podinventory_to_mdm.rb
+++ b/source/plugins/ruby/podinventory_to_mdm.rb
@@ -279,16 +279,16 @@ def process_pod_inventory_record(record)
     begin
       records = []
-      podUid = record["DataItems"][0]["PodUid"]
+      podUid = record["PodUid"]
       if @pod_uids.key?(podUid)
         return
       end

       @pod_uids[podUid] = true
-      podPhaseDimValue = record["DataItems"][0]["PodStatus"]
-      podNamespaceDimValue = record["DataItems"][0]["Namespace"]
-      podControllerNameDimValue = record["DataItems"][0]["ControllerName"]
-      podNodeDimValue = record["DataItems"][0]["Computer"]
+      podPhaseDimValue = record["PodStatus"]
+      podNamespaceDimValue = record["Namespace"]
+      podControllerNameDimValue = record["ControllerName"]
+      podNodeDimValue = record["Computer"]

       if podControllerNameDimValue.nil? || podControllerNameDimValue.empty?
         podControllerNameDimValue = "No Controller"
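Background for the rescue change in out_mdm.rb above: Ruby 2.6 renamed Net::HTTPServerException to Net::HTTPClientException (it is raised for 4xx client-error responses); the old constant survives only as a deprecated alias, so rescuing the new name is the forward-safe fix. A minimal standalone illustration:

    require 'net/http'

    begin
      res = Net::HTTPNotFound.new('1.1', '404', 'Not Found')
      res.error!                       # raises Net::HTTPClientException for a 4xx
    rescue Net::HTTPClientException => e
      puts "non-retryable 4xx: #{e.message}"
    end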
diff --git a/source/plugins/utils/oms_common.rb b/source/plugins/utils/oms_common.rb
new file mode 100644
index 000000000..c10cb8638
--- /dev/null
+++ b/source/plugins/utils/oms_common.rb
@@ -0,0 +1,143 @@
+module OMS
+
+  MSDockerCImprovHostnameFilePath = '/var/opt/microsoft/docker-cimprov/state/containerhostname'
+  IPV6_REGEX = '\h{4}:\h{4}:\h{4}:\h{4}:\h{4}:\h{4}:\h{4}:\h{4}'
+  IPV4_Approximate_REGEX = '\d+\.\d+\.\d+\.\d+'
+
+  class RetryRequestException < Exception
+    # Throw this exception to tell the fluentd engine to retry and
+    # inform the output plugin that it is indeed retryable
+  end
+
+  class Common
+    require 'socket'
+    require_relative 'omslog'
+
+    @@Hostname = nil
+    @@HostnameFilePath = MSDockerCImprovHostnameFilePath
+
+    class << self
+
+      # Internal methods
+      # (left public for easy testing, though protected may be better later)
+
+      def clean_hostname_string(hnBuffer)
+        return "" if hnBuffer.nil? # So give the rest of the program a string to deal with.
+        hostname_buffer = hnBuffer.strip
+        return hostname_buffer
+      end
+
+      def has_designated_hostnamefile?
+        return false if @@HostnameFilePath.nil?
+        return false unless @@HostnameFilePath =~ /\w/
+        return false unless File.exist?(@@HostnameFilePath)
+        return true
+      end
+
+      def is_dot_separated_string?(hnBuffer)
+        return true if /[^.]+\.[^.]+/ =~ hnBuffer
+        return false
+      end
+
+      def is_hostname_compliant?(hnBuffer)
+        # RFC 2181:
+        # Size limit is 1 to 63 octets, so probably bytesize is appropriate method.
+        return false if hnBuffer.nil?
+        return false if /\./ =~ hnBuffer # Hostname by definition may not contain a dot.
+        return false if /:/ =~ hnBuffer # Hostname by definition may not contain a colon.
+        return false unless 1 <= hnBuffer.bytesize && hnBuffer.bytesize <= 63
+        return true
+      end
+
+      def is_like_ipv4_string?(hnBuffer)
+        return false unless /\A#{IPV4_Approximate_REGEX}\z/ =~ hnBuffer
+        qwa = hnBuffer.split('.')
+        return false unless qwa.length == 4
+        return false if qwa[0].to_i == 0
+        qwa.each do |quadwordstring|
+          bi = quadwordstring.to_i
+          # This may need more detail if 255 octets are sometimes allowed, but I don't think so.
+          return false unless 0 <= bi and bi < 255
+        end
+        return true
+      end
+
+      def is_like_ipv6_string?(hnBuffer)
+        return true if /\A#{IPV6_REGEX}\z/ =~ hnBuffer
+        return false
+      end
+
+      def look_for_socket_class_host_address
+        hostname_buffer = nil
+
+        begin
+          hostname_buffer = Socket.gethostname
+        rescue => error
+          OMS::Log.error_once("Unable to get the Host Name using socket facility: #{error}")
+          return
+        end
+        @@Hostname = clean_hostname_string(hostname_buffer)
+
+        return # Thwart accidental return to force correct use.
+      end
+
+      def look_in_designated_hostnamefile
+        # Issue:
+        #   When omsagent runs inside a container, gethostname returns the hostname of the container (random name)
+        #   not the actual machine hostname.
+        #   One way to solve this problem is to set the container hostname same as machine name, but this is not
+        #   possible when host-machine is a private VM inside a cluster.
+        # Solution:
+        #   Share/mount ‘/etc/hostname’ as '/var/opt/microsoft/omsagent/state/containername' with container and
+        #   omsagent will read hostname from shared file.
+        hostname_buffer = nil
+
+        unless File.readable?(@@HostnameFilePath)
+          OMS::Log.warn_once("File '#{@@HostnameFilePath}' exists but is not readable.")
+          return
+        end
+
+        begin
+          hostname_buffer = File.read(@@HostnameFilePath)
+        rescue => error
+          OMS::Log.warn_once("Unable to read the hostname from #{@@HostnameFilePath}: #{error}")
+        end
+        @@Hostname = clean_hostname_string(hostname_buffer)
+        return # Thwart accidental return to force correct use.
+      end
+
+      def validate_hostname_equivalent(hnBuffer)
+        # RFC 1123 and 2181
+        # Note that for now we are limiting the earlier maximum of 63 for fqdn labels and thus
+        # hostnames UNTIL we are assured azure will allow 255, as specified in RFC 1123, or
+        # we are otherwise instructed.
+        rfcl = "RFCs 1123, 2181 with hostname range of {1,63} octets for non-root item."
+        return if is_hostname_compliant?(hnBuffer)
+        return if is_like_ipv4_string?(hnBuffer)
+        return if is_like_ipv6_string?(hnBuffer)
+        msg = "Hostname '#{hnBuffer}' not compliant (#{rfcl}). Not IP Address Either."
+        OMS::Log.warn_once(msg)
+        raise NameError, msg
+      end
+
+      # End of Internal methods
+
+      def get_hostname(ignoreOldValue = false)
+        if not is_hostname_compliant?(@@Hostname) or ignoreOldValue then
+          look_in_designated_hostnamefile if has_designated_hostnamefile?
+          look_for_socket_class_host_address unless is_hostname_compliant?(@@Hostname)
+        end
+
+        begin
+          validate_hostname_equivalent(@@Hostname)
+        rescue => error
+          OMS::Log.warn_once("Hostname '#{@@Hostname}' found, but did NOT validate as compliant. #{error}. Using anyway.")
+        end
+        return @@Hostname
+      end
+    end # Class methods
+  end # class Common
+end # module OMS
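Usage sketch for the file above, assuming it sits on the load path and that a fluentd-style global $log exists (OMS::Log's procs call it): get_hostname prefers the mounted containerhostname file, falls back to Socket.gethostname, and validates against the RFC 1123/2181 limits, logging problems only once.

    require 'logger'
    $log = Logger.new($stdout)   # stand-in for fluentd's global $log
    require_relative 'oms_common'

    puts "reporting as host: #{OMS::Common.get_hostname}"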
diff --git a/source/plugins/utils/omslog.rb b/source/plugins/utils/omslog.rb
new file mode 100644
index 000000000..b65bf947c
--- /dev/null
+++ b/source/plugins/utils/omslog.rb
@@ -0,0 +1,50 @@
+module OMS
+  class Log
+    require 'set'
+    require 'digest'
+
+    @@error_proc = Proc.new {|message| $log.error message }
+    @@warn_proc = Proc.new {|message| $log.warn message }
+    @@info_proc = Proc.new {|message| $log.info message }
+    @@debug_proc = Proc.new {|message| $log.debug message }
+
+    @@logged_hashes = Set.new
+
+    class << self
+      def error_once(message, tag=nil)
+        log_once(@@error_proc, @@debug_proc, message, tag)
+      end
+
+      def warn_once(message, tag=nil)
+        log_once(@@warn_proc, @@debug_proc, message, tag)
+      end
+
+      def info_once(message, tag=nil)
+        log_once(@@info_proc, @@debug_proc, message, tag)
+      end
+
+      def log_once(first_loglevel_proc, next_loglevel_proc, message, tag=nil)
+        # Will log a message once with the first procedure and subsequently with the second
+        # This allows repeated messages to be ignored by having the second logging function at a lower log level
+        # An optional tag can be used as the message key
+
+        if tag == nil
+          tag = message
+        end
+
+        md5_digest = Digest::MD5.new
+        tag_hash = md5_digest.update(tag).base64digest
+        res = @@logged_hashes.add?(tag_hash)
+
+        if res == nil
+          # The hash was already in the set
+          next_loglevel_proc.call(message)
+        else
+          # First time we see this hash
+          first_loglevel_proc.call(message)
+        end
+      end
+    end # Class methods
+
+  end # Class Log
+end # Module OMS
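Usage sketch for the deduplicating logger above, again assuming a fluentd-style global $log: the first call logs at the requested level, and repeats of the same message (or the same explicit tag) hash to an MD5 key already in @@logged_hashes and are demoted to DEBUG, keeping steady-state agent logs quiet.

    require 'logger'
    $log = Logger.new($stdout)   # stand-in for fluentd's global $log
    require_relative 'omslog'

    3.times { OMS::Log.warn_once("mdsd endpoint not reachable, will retry") }
    # => one WARN line, then two DEBUG lines for the repeats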