Gangams/aad stage2 full switch to mdsd (#559)
* full switch to mdsd, upgrade to ruby v1 & omsagent removal

* add odsdirect as fallback option

* cleanup

* cleanup

* move customRegion to stage3

* updates related to containerlog route

* make xml eventschema consistent

* add buffer settings

* address HTTPServerException deprecation in ruby 2.6 (see the Ruby sketch after this list)

* update to official mdsd version

* fix log message issue

* fix pr feedback

* get rid of unused code from omscommon

* fix pr feedback

* fix pr feedback

* clean up

* clean up

* fix missing conf
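
A note on the HTTPServerException item above: Ruby 2.6 renamed Net::HTTPServerException to Net::HTTPClientException, since the exception is raised for 4xx client errors and the old name was misleading; the old constant still works but emits a deprecation warning. A minimal sketch of the rescue-side change (the URL is purely illustrative):

    require "net/http"

    begin
      res = Net::HTTP.get_response(URI("http://example.com/no-such-page"))
      res.value # raises an exception for any non-2xx response
    rescue Net::HTTPClientException => e # was: rescue Net::HTTPServerException
      puts "request failed: #{e.message}"
    end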
ganga1980 authored May 22, 2021
1 parent 0fa350e commit c707539
Showing 49 changed files with 2,821 additions and 2,176 deletions.
16 changes: 11 additions & 5 deletions build/common/installer/scripts/tomlparser.rb
@@ -25,8 +25,10 @@
 @enrichContainerLogs = false
 @containerLogSchemaVersion = ""
 @collectAllKubeEvents = false
-@containerLogsRoute = ""
-
+@containerLogsRoute = "v2" # default for linux
+if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0
+  @containerLogsRoute = "v1" # default is v1 for windows until windows agent integrates windows ama
+end
 # Use parser to parse the configmap toml file to a ruby structure
 def parseConfigMap
   begin
@@ -162,8 +164,12 @@ def populateSettingValuesFromConfigMap(parsedConfig)
     #Get container logs route setting
     begin
       if !parsedConfig[:log_collection_settings][:route_container_logs].nil? && !parsedConfig[:log_collection_settings][:route_container_logs][:version].nil?
-        @containerLogsRoute = parsedConfig[:log_collection_settings][:route_container_logs][:version]
-        puts "config::Using config map setting for container logs route"
+        if !parsedConfig[:log_collection_settings][:route_container_logs][:version].empty?
+          @containerLogsRoute = parsedConfig[:log_collection_settings][:route_container_logs][:version]
+          puts "config::Using config map setting for container logs route: #{@containerLogsRoute}"
+        else
+          puts "config::Ignoring config map settings and using default value since provided container logs route value is empty"
+        end
       end
     rescue => errorStr
       ConfigParseErrorLogger.logError("Exception while reading config map settings for container logs route - #{errorStr}, using defaults, please check config map for errors")
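
For reference, the nested keys read in this hunk map to a TOML fragment in the agent's configmap. A minimal round-trip sketch, assuming the tomlrb gem (the symbol-keyed lookups above suggest symbolize_keys parsing) and an illustrative "v2" value:

    require "tomlrb" # assumption: the TOML library used by tomlparser.rb

    toml = <<~TOML
      [log_collection_settings.route_container_logs]
        version = "v2"
    TOML

    parsedConfig = Tomlrb.parse(toml, symbolize_keys: true)
    puts parsedConfig[:log_collection_settings][:route_container_logs][:version] # => "v2"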
@@ -256,7 +262,7 @@ def get_command_windows(env_variable_name, env_variable_value)
   file.write(commands)
   commands = get_command_windows('AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS', @collectAllKubeEvents)
   file.write(commands)
-  commands = get_command_windows('AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE', @containerLogsRoute)
+  commands = get_command_windows('AZMON_CONTAINER_LOGS_ROUTE', @containerLogsRoute)
   file.write(commands)
   commands = get_command_windows('AZMON_CONTAINER_LOG_SCHEMA_VERSION', @containerLogSchemaVersion)
   file.write(commands)
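
The get_command_windows helper called in this hunk (defined elsewhere in tomlparser.rb, not shown here) renders each setting as a Windows environment-variable assignment. A hypothetical sketch of that shape, not the repository's exact definition:

    # Hypothetical: emits PowerShell SetEnvironmentVariable commands for one setting.
    def get_command_windows(env_variable_name, env_variable_value)
      "[System.Environment]::SetEnvironmentVariable(\"#{env_variable_name}\", \"#{env_variable_value}\", \"Process\")\n" \
      "[System.Environment]::SetEnvironmentVariable(\"#{env_variable_name}\", \"#{env_variable_value}\", \"Machine\")\n"
    end

    puts get_command_windows("AZMON_CONTAINER_LOGS_ROUTE", "v1")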
318 changes: 178 additions & 140 deletions build/linux/installer/conf/container.conf
@@ -1,141 +1,179 @@
 # Fluentd config file for OMS Docker - container components (non kubeAPI)
-
-# Forward port 25225 for container logs
-<source>
-  type forward
-  port 25225
-  bind 127.0.0.1
-</source>
-
-# MDM metrics from telegraf
-<source>
-  @type tcp
-  tag oms.mdm.container.perf.telegraf.*
-  bind 0.0.0.0
-  port 25228
-  format json
-</source>
-
-# Container inventory
-<source>
-  type containerinventory
-  tag oms.containerinsights.containerinventory
-  run_interval 60
-  log_level debug
-</source>
-
-#cadvisor perf
-<source>
-  type cadvisorperf
-  tag oms.api.cadvisorperf
-  run_interval 60
-  log_level debug
-</source>
-
-<filter kubehealth.DaemonSet.Node**>
-  type filter_cadvisor_health_node
-  log_level debug
-</filter>
-
-<filter kubehealth.DaemonSet.Container**>
-  type filter_cadvisor_health_container
-  log_level debug
-</filter>
-
-#custom_metrics_mdm filter plugin
-<filter mdm.cadvisorperf**>
-  type filter_cadvisor2mdm
-  metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes,pvUsedBytes
-  log_level info
-</filter>
-
-<filter oms.mdm.container.perf.telegraf**>
-  type filter_telegraf2mdm
-  log_level debug
-</filter>
-
-<match oms.containerinsights.containerinventory**>
-  type out_oms
-  log_level debug
-  num_threads 5
-  buffer_type file
-  buffer_path %STATE_DIR_WS%/out_oms_containerinventory*.buffer
-  buffer_queue_full_action drop_oldest_chunk
-  buffer_chunk_limit 4m
-  flush_interval 20s
-  retry_limit 10
-  retry_wait 5s
-  max_retry_wait 5m
-</match>
-
-<match oms.api.cadvisorperf**>
-  type out_oms
-  log_level debug
-  num_threads 5
-  buffer_type file
-  buffer_path %STATE_DIR_WS%/out_oms_cadvisorperf*.buffer
-  buffer_queue_full_action drop_oldest_chunk
-  buffer_chunk_limit 4m
-  flush_interval 20s
-  retry_limit 10
-  retry_wait 5s
-  max_retry_wait 5m
-</match>
-
-<match kubehealth.DaemonSet**>
-  @type health_forward
-  send_timeout 60s
-  recover_wait 10s
-  hard_timeout 60s
-  heartbeat_type tcp
-  skip_network_error_at_init true
-  expire_dns_cache 600s
-  buffer_queue_full_action drop_oldest_chunk
-  buffer_type file
-  buffer_path %STATE_DIR_WS%/out_health_forward*.buffer
-  buffer_chunk_limit 3m
-  flush_interval 20s
-  retry_limit 10
-  retry_wait 5s
-  max_retry_wait 5m
-
-  <server>
-    host "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_SERVICE_HOST']}"
-    port "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_SERVICE_PORT']}"
-  </server>
-
-  <secondary>
-    @type file
-    path %STATE_DIR_WS%/fluent_forward_failed.buffer
-  </secondary>
-</match>
-
-<match mdm.cadvisorperf** oms.mdm.container.perf.telegraf**>
-  type out_mdm
-  log_level debug
-  num_threads 5
-  buffer_type file
-  buffer_path %STATE_DIR_WS%/out_mdm_cdvisorperf*.buffer
-  buffer_queue_full_action drop_oldest_chunk
-  buffer_chunk_limit 4m
-  flush_interval 20s
-  retry_limit 10
-  retry_wait 5s
-  max_retry_wait 5m
-  retry_mdm_post_wait_minutes 30
-</match>
-
-<match oms.api.InsightsMetrics**>
-  type out_oms
-  log_level debug
-  num_threads 5
-  buffer_type file
-  buffer_path %STATE_DIR_WS%/out_oms_insightsmetrics*.buffer
-  buffer_queue_full_action drop_oldest_chunk
-  buffer_chunk_limit 4m
-  flush_interval 20s
-  retry_limit 10
-  retry_wait 5s
-  max_retry_wait 5m
-</match>
+
+# Forward port 25225 for container logs
+# gangams - not used and get ridoff after confirming safe to remove
+<source>
+  @type forward
+  port 25225
+  bind 127.0.0.1
+</source>
+
+# MDM metrics from telegraf
+<source>
+  @type tcp
+  tag oms.mdm.container.perf.telegraf.*
+  bind 0.0.0.0
+  port 25228
+  format json
+</source>
+
+# Container inventory
+<source>
+  @type containerinventory
+  tag oneagent.containerInsights.CONTAINER_INVENTORY_BLOB
+  run_interval 60
+  @log_level debug
+</source>
+
+#cadvisor perf
+<source>
+  @type cadvisor_perf
+  tag oneagent.containerInsights.LINUX_PERF_BLOB
+  run_interval 60
+  @log_level debug
+</source>
+
+<filter kubehealth.DaemonSet.Node**>
+  @type cadvisor_health_node
+  @log_level debug
+</filter>
+
+<filter kubehealth.DaemonSet.Container**>
+  @type cadvisor_health_container
+  @log_level debug
+</filter>
+
+#custom_metrics_mdm filter plugin
+<filter mdm.cadvisorperf**>
+  @type cadvisor2mdm
+  metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes,pvUsedBytes
+  @log_level info
+</filter>
+
+<filter oms.mdm.container.perf.telegraf**>
+  @type telegraf2mdm
+  @log_level debug
+</filter>
+
+#containerinventory
+<match **CONTAINER_INVENTORY_BLOB**>
+  @type forward
+  @log_level debug
+  send_timeout 30
+  connect_timeout 30
+  heartbeat_type none
+  <server>
+    host 0.0.0.0
+    port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}"
+  </server>
+  <buffer>
+    @type file
+    path /var/opt/microsoft/docker-cimprov/state/containerinventory*.buffer
+    overflow_action drop_oldest_chunk
+    chunk_limit_size 4m
+    queue_limit_length 20
+    flush_interval 20s
+    retry_max_times 10
+    retry_wait 5s
+    retry_max_interval 5m
+    flush_thread_count 5
+  </buffer>
+  keepalive true
+</match>
+
+#cadvisorperf
+<match **LINUX_PERF_BLOB**>
+  @type forward
+  @log_level debug
+  send_timeout 30
+  connect_timeout 30
+  heartbeat_type none
+  <server>
+    host 0.0.0.0
+    port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}"
+  </server>
+  <buffer>
+    @type file
+    path /var/opt/microsoft/docker-cimprov/state/cadvisorperf*.buffer
+    overflow_action drop_oldest_chunk
+    chunk_limit_size 4m
+    queue_limit_length 20
+    flush_interval 20s
+    retry_max_times 10
+    retry_wait 5s
+    retry_max_interval 5m
+    flush_thread_count 5
+  </buffer>
+  keepalive true
+</match>
+
+<match kubehealth.DaemonSet**>
+  @type health_forward
+  send_timeout 60s
+  recover_wait 10s
+  hard_timeout 60s
+  transport tcp
+  ignore_network_errors_at_startup true
+  expire_dns_cache 600s
+  <buffer>
+    @type file
+    overflow_action drop_oldest_chunk
+    path /var/opt/microsoft/docker-cimprov/state/out_health_forward*.buffer
+    chunk_limit_size 3m
+    flush_interval 20s
+    retry_max_times 10
+    retry_max_interval 5m
+    retry_wait 5s
+  </buffer>
+  <server>
+    host "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_SERVICE_HOST']}"
+    port "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_SERVICE_PORT']}"
+  </server>
+  <secondary>
+    @type file
+    path /var/opt/microsoft/docker-cimprov/state/fluent_forward_failed.buffer
+  </secondary>
+</match>
+
+<match mdm.cadvisorperf** oms.mdm.container.perf.telegraf**>
+  @type mdm
+  @log_level debug
+  <buffer>
+    @type file
+    path /var/opt/microsoft/docker-cimprov/state/out_mdm_cdvisorperf*.buffer
+    overflow_action drop_oldest_chunk
+    chunk_limit_size 4m
+    flush_interval 20s
+    retry_max_times 10
+    retry_wait 5s
+    retry_max_interval 5m
+    flush_thread_count 5
+  </buffer>
+  retry_mdm_post_wait_minutes 30
+</match>
+
+#InsightsMetrics
+<match **INSIGHTS_METRICS_BLOB**>
+  @type forward
+  @log_level debug
+  send_timeout 30
+  connect_timeout 30
+  heartbeat_type none
+  <server>
+    host 0.0.0.0
+    port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}"
+  </server>
+  <buffer>
+    @type file
+    path /var/opt/microsoft/docker-cimprov/state/insightsmetrics*.buffer
+    overflow_action drop_oldest_chunk
+    chunk_limit_size 4m
+    queue_limit_length 20
+    flush_interval 20s
+    retry_max_times 10
+    retry_wait 5s
+    retry_max_interval 5m
+    flush_thread_count 5
+  </buffer>
+  keepalive true
+</match>