Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

implement iis as pdh check #927

Merged
merged 3 commits into from
Dec 20, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
262 changes: 99 additions & 163 deletions iis/check.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,183 +5,119 @@
'''
Check the performance counters from IIS
'''
# 3p
import pythoncom

# project
from checks import AgentCheck
from checks.winwmi_check import WinWMICheck, WMIMetric
from config import _is_affirmative
try:
from checks.libs.win.pdhbasecheck import PDHBaseCheck
except ImportError:
def PDHBaseCheck(*args, **kwargs):
return

from utils.containers import hash_mutable
from utils.timeout import TimeoutException


class IIS(WinWMICheck):
METRICS = [
('ServiceUptime', 'iis.uptime', 'gauge'),

# Network
('TotalBytesSent','iis.net.bytes_sent', 'rate'),
('TotalBytesReceived', 'iis.net.bytes_rcvd', 'rate'),
('TotalBytesTransferred', 'iis.net.bytes_total', 'rate'),
('CurrentConnections', 'iis.net.num_connections', 'gauge'),
('TotalFilesSent', 'iis.net.files_sent', 'rate'),
('TotalFilesReceived', 'iis.net.files_rcvd', 'rate'),
('TotalConnectionAttemptsAllInstances', 'iis.net.connection_attempts', 'rate'),

# HTTP Methods
('TotalGetRequests', 'iis.httpd_request_method.get', 'rate'),
('TotalPostRequests', 'iis.httpd_request_method.post', 'rate'),
('TotalHeadRequests', 'iis.httpd_request_method.head', 'rate'),
('TotalPutRequests', 'iis.httpd_request_method.put', 'rate'),
('TotalDeleteRequests', 'iis.httpd_request_method.delete', 'rate'),
('TotalOptionsRequests', 'iis.httpd_request_method.options', 'rate'),
('TotalTraceRequests', 'iis.httpd_request_method.trace', 'rate'),

# Errors
('TotalNotFoundErrors', 'iis.errors.not_found', 'rate'),
('TotalLockedErrors', 'iis.errors.locked', 'rate'),

# Users
('TotalAnonymousUsers', 'iis.users.anon', 'rate'),
('TotalNonAnonymousUsers', 'iis.users.nonanon', 'rate'),

# Requests
('TotalCGIRequests', 'iis.requests.cgi', 'rate'),
('TotalISAPIExtensionRequests', 'iis.requests.isapi', 'rate'),
]

DEFAULT_COUNTERS = [
["Web Service", None, "Service Uptime", "iis.uptime", "gauge"],
# Network
["Web Service", None, "Bytes Sent/sec", "iis.net.bytes_sent", "gauge"],
["Web Service", None, "Bytes Received/sec", "iis.net.bytes_rcvd", "gauge"],
["Web Service", None, "Bytes Total/sec", "iis.net.bytes_total", "gauge"],
["Web Service", None, "Current Connections", "iis.net.num_connections", "gauge"],
["Web Service", None, "Files Sent/sec", "iis.net.files_sent", "gauge"],
["Web Service", None, "Files Received/sec", "iis.net.files_rcvd" ,"gauge"],
["Web Service", None, "Total Connection Attempts (all instances)", "iis.net.connection_attempts", "gauge"],

# HTTP Methods
["Web Service", None, "Get Requests/sec", "iis.httpd_request_method.get", "gauge"],
["Web Service", None, "Post Requests/sec", "iis.httpd_request_method.post", "gauge"],
["Web Service", None, "Head Requests/sec", "iis.httpd_request_method.head", "gauge"],
["Web Service", None, "Put Requests/sec", "iis.httpd_request_method.put", "gauge"],
["Web Service", None, "Delete Requests/sec", "iis.httpd_request_method.delete", "gauge"],
["Web Service", None, "Options Requests/sec", "iis.httpd_request_method.options", "gauge"],
["Web Service", None, "Trace Requests/sec", "iis.httpd_request_method.trace", "gauge"],

# Errors
["Web Service", None, "Not Found Errors/sec", "iis.errors.not_found", "gauge"],
["Web Service", None, "Locked Errors/sec", "iis.errors.locked", "gauge"],

# Users
["Web Service", None, "Anonymous Users/sec", "iis.users.anon", "gauge"],
["Web Service", None, "NonAnonymous Users/sec", "iis.users.nonanon", "gauge"],

# Requests
["Web Service", None, "CGI Requests/sec", "iis.requests.cgi", "gauge"],
["Web Service", None, "ISAPI Extension Requests/sec", "iis.requests.isapi", "gauge"],
]
class IIS(PDHBaseCheck):
SERVICE_CHECK = "iis.site_up"

NAMESPACE = "root\\CIMV2"
CLASS = "Win32_PerfFormattedData_W3SVC_WebService"

def __init__(self, name, init_config, agentConfig, instances):
WinWMICheck.__init__(self, name, init_config, agentConfig, instances)
PDHBaseCheck.__init__(self, name, init_config, agentConfig, instances=instances, counter_list=DEFAULT_COUNTERS)

def check(self, instance):
# Connect to the WMI provider
host = instance.get('host', "localhost")
provider = instance.get('provider')
user = instance.get('username', "")
password = instance.get('password', "")
instance_tags = instance.get('tags', [])
sites = instance.get('sites', ['_Total'])
is_2008 = _is_affirmative(instance.get('is_2008', False))

instance_hash = hash_mutable(instance)
instance_key = self._get_instance_key(host, self.NAMESPACE, self.CLASS, instance_hash)
filters = map(lambda x: {"Name": tuple(('=', x))}, sites)

metrics_by_property, properties = self._get_wmi_properties(instance_key, self.METRICS, [])

if is_2008:
for idx, prop in enumerate(properties):
if prop == "TotalBytesTransferred":
properties[idx] = "TotalBytesTransfered"
break

wmi_sampler = self._get_wmi_sampler(
instance_key,
self.CLASS, properties,
filters=filters,
host=host, namespace=self.NAMESPACE, provider=provider,
username=user, password=password
)

# Sample, extract & submit metrics
try:
wmi_sampler.sample()

metrics = self._extract_metrics(wmi_sampler, sites, instance_tags)
except TimeoutException:
self.log.warning(
u"[IIS] WMI query timed out."
u" class={wmi_class} - properties={wmi_properties} -"
u" filters={filters} - tags={instance_tags}".format(
wmi_class=self.CLASS, wmi_properties=properties,
filters=filters, instance_tags=instance_tags
)
)
except pythoncom.com_error as e:
if '0x80041017' in str(e):
self.warning(
u"You may be running IIS6/7 which reports metrics a "
u"little differently. Try enabling the is_2008 flag for this instance."
)
raise e
else:
self._submit_events(wmi_sampler, sites)
self._submit_metrics(metrics, metrics_by_property)

def _extract_metrics(self, wmi_sampler, sites, instance_tags):
"""
Extract and tag metrics from the WMISampler.

Returns: List of WMIMetric
```
[
WMIMetric("freemegabytes", 19742, ["name:_total"]),
WMIMetric("avgdiskbytesperwrite", 1536, ["name:c:"]),
]
```
"""
metrics = []

for wmi_obj in wmi_sampler:
tags = list(instance_tags) if instance_tags else []

# Get site name
sitename = wmi_obj['Name']

# Skip any sites we don't specifically want.
if sitename not in sites:
continue
elif sitename != "_Total":
tags.append("site:{0}".format(self.normalize(sitename)))

# Tag with `tag_queries` parameter
for wmi_property, wmi_value in wmi_obj.iteritems():
# No metric extraction on 'Name' property
if wmi_property == 'name':
continue

sites = instance.get('sites')
if sites is None:
expected_sites = set()
else:
expected_sites = set(sites)
# _Total should always be in the list of expected sites; we always
# report _Total
if "_Total" not in expected_sites:
expected_sites.add("_Total")

self.log.debug("expected sites is %s" % str(expected_sites))
key = hash_mutable(instance)
for inst_name, dd_name, metric_func, counter in self._metrics[key]:
try:
try:
metrics.append(WMIMetric(wmi_property, float(wmi_value), tags))
except ValueError:
self.log.warning(u"When extracting metrics with WMI, found a non digit value"
" for property '{0}'.".format(wmi_property))
continue
except TypeError:
self.log.warning(u"When extracting metrics with WMI, found a missing property"
" '{0}'".format(wmi_property))
vals = counter.get_all_values()
except Exception as e:
self.log.error("Failed to get_all_values %s %s" % (inst_name, dd_name))
continue
return metrics

def _submit_events(self, wmi_sampler, sites):
expected_sites = set(sites)

for wmi_obj in wmi_sampler:
sitename = wmi_obj['Name']
uptime = wmi_obj["ServiceUptime"]
status = AgentCheck.CRITICAL if uptime == 0 else AgentCheck.OK

self.service_check(self.SERVICE_CHECK, status, tags=['site:{0}'.format(self.normalize(sitename))])
expected_sites.remove(sitename)
for sitename, val in vals.iteritems():
tags = []
if key in self._tags:
tags = self._tags[key]

try:
if not counter.is_single_instance():
# Skip any sites we don't specifically want.
if not sites:
tags.append("site:{0}".format(self.normalize(sitename)))
# always report total
elif sitename == "_Total":
tags.append("site:{0}".format(self.normalize(sitename)))
elif sitename not in sites:
continue
else:
tags.append("site:{0}".format(self.normalize(sitename)))
except Exception as e:
self.log.error("Caught exception %s setting tags" % str(e))

try:
metric_func(dd_name, val, tags)
except Exception as e:
self.log.error("metric_func: %s %s %s" % (dd_name, str(val), str(e)))
pass

if dd_name == "iis.uptime":
uptime = int(val)
status = AgentCheck.CRITICAL if uptime == 0 else AgentCheck.OK
self.service_check(self.SERVICE_CHECK, status, tags=['site:{0}'.format(self.normalize(sitename))])
if sitename in expected_sites:
self.log.debug("Removing %s from expected sites" % sitename)
expected_sites.remove(sitename)
else:
self.log.warning("site not in expected_sites %s" % sitename)

except Exception as e:
# don't give up on all of the metrics because one failed
self.log.error("IIS Failed to get metric data for %s %s: %s" % (inst_name, dd_name, str(e)))
pass

for site in expected_sites:
self.service_check(self.SERVICE_CHECK, AgentCheck.CRITICAL,
tags=['site:{0}'.format(self.normalize(site))])

def _submit_metrics(self, wmi_metrics, metrics_by_property):
for m in wmi_metrics:
metric_name = m.name
# Windows 2008 sp2 reports it as TotalbytesTransfered
# instead of TotalBytesTransferred (single r)
if metric_name.lower() == "totalbytestransfered":
metric_name = "totalbytestransferred"
elif m.name not in metrics_by_property:
continue

metric, mtype = metrics_by_property[metric_name]
submittor = getattr(self, mtype)
submittor(metric, m.value, m.tags)
tags=['site:{0}'.format(self.normalize(site))])
31 changes: 22 additions & 9 deletions iis/conf.yaml.example
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,20 @@ init_config:

instances:
# By default, this check will run against a single instance - the current
# machine that the Agent is running on. It will check the WMI performance
# counters for IIS on that machine.
# machine that the Agent is running on. It will check the PDH (Performance
# Data Helper) performance counters for IIS on that machine.
#
# If you want to check other remote machines as well, you can add one
# instance per host. Note: If you also want to check the counters on the
# current machine, you will have to create an instance with empty params.
#
# The optional `provider` parameter allows you to specify a WMI provider
# (defaults to `32` on the 32-bit Datadog Agent, or `64` otherwise). It is used to request
# WMI data from the non-default provider. Available options are: `32` or `64`.
# For more information: https://msdn.microsoft.com/en-us/library/aa393067(v=vs.85).aspx
#
# The `sites` parameter allows you to specify a list of sites you want to
# read metrics from. With sites specified, metrics will be tagged with the
# site name. If you don't define any sites, the check will pull the
# aggregate values across all sites.
# site name. If you don't define any sites, the check will pull all of the
# sites and tag each one with its site name.
#
# PDH provides hundreds of metrics for each service. Additional metrics
# can be specified using the `additional_metrics` configuration.
#
# Here's an example of configuration that would check the current machine
# and a remote machine called MYREMOTESERVER. For the remote host we are
Expand All @@ -31,6 +29,21 @@ instances:
# sites:
# - Default Web Site
#
# The `additional_metrics` parameter is a list of additional counters to
# collect. Each entry in the list is formatted as follows:
# ['<counterset name>', <counter instance name>, '<counter name>', <metric name>, <metric type>]
#
# <counterset name> is the name of the PDH Counter Set (the name of the counter)
# <counter instance name> is the specific counter instance to collect, for example
# "Default Web Site". Specify 'none' for all instances of the counter.
# <counter name> is the individual counter to report
# <metric name> is the name you want to show up in Datadog
# <metric type> is from the standard choices for all agent checks, such as gauge,
# rate, histogram or counter
#
# additional_metrics:
# - ['Web Service', none, 'CGI Requests/sec', iis.httpd_request_method.cgi, gauge]

# - host: MYREMOTESERVER
# username: MYREMOTESERVER\fred
# password: mysecretpassword
Expand Down
19 changes: 12 additions & 7 deletions iis/test_iis.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,24 +66,29 @@ class IISTest(AgentCheckTest):
def test_basic_check(self):
self.run_check_twice({'instances': [MINIMAL_INSTANCE]})

site_tags = ['Default_Web_Site', 'Test_Website_1', 'Total']
for metric in self.IIS_METRICS:
self.assertMetric(metric, tags=[], count=1)
for site_tag in site_tags:
self.assertMetric(metric, tags=["site:{0}".format(site_tag)], count=1)

for site_tag in site_tags:
self.assertServiceCheckOK('iis.site_up',
tags=["site:{0}".format(site_tag)], count=1)

self.assertServiceCheckOK('iis.site_up', tags=["site:{0}".format('Total')], count=1)
self.coverage_report()

def test_check_on_specific_websites(self):
self.run_check_twice({'instances': [INSTANCE]})

site_tags = ['Default_Web_Site', 'Test_Website_1']
site_tags = ['Default_Web_Site', 'Test_Website_1', 'Total']
for metric in self.IIS_METRICS:
for site_tag in site_tags:
self.assertMetric(metric, tags=["site:{0}".format(site_tag)], count=1)

self.assertServiceCheckOK('iis.site_up',
tags=["site:{0}".format('Default_Web_Site')], count=1)
self.assertServiceCheckOK('iis.site_up',
tags=["site:{0}".format('Test_Website_1')], count=1)
for site_tag in site_tags:
self.assertServiceCheckOK('iis.site_up',
tags=["site:{0}".format(site_tag)], count=1)

self.assertServiceCheckCritical('iis.site_up',
tags=["site:{0}".format('Non_Existing_Website')], count=1)

Expand Down
2 changes: 1 addition & 1 deletion pdh_check/check.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,6 @@ def check(self, instance):
tags = self._tags[key]

if not counter.is_single_instance():
tag = "instance=%s" % key
tag = "instance:%s" % key
tags.append(tag)
metric_func(dd_name, val, tags)
4 changes: 2 additions & 2 deletions pdh_check/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@
"maintainer": "[email protected]",
"manifest_version": "0.1.1",
"max_agent_version": "6.0.0",
"min_agent_version": "5.18.0",
"min_agent_version": "5.21.0",
"name": "pdh_check",
"short_description": "Collect and graph any Windows PDH metrics.",
"support": "core",
"supported_os": [
"windows"
],
"version": "1.0.0",
"version": "1.1.0",
"guid": "D09B3410-00A0-4789-ABD7-7740C3FE211F",
"public_title": "Datadog-Pdh Check Integration",
"categories":["os & system"],
Expand Down