From c9a01deff1bc301a7a66a82db3bb2c8a422f3d8d Mon Sep 17 00:00:00 2001
From: Olivier Vielpeau
Date: Fri, 16 Sep 2016 18:04:38 +0200
Subject: [PATCH] [ceph] Improve health metrics (#2852)

* Catch `KeyError` on `osd_pool_stats`

We do it for all the other keys of the raw dict, so let's do it for this
one too (even though I'm not a huge fan of this "try/except everything"
approach).

* Update health metrics for usability

Behavior before this commit: the `ceph.num_near_full_osds` and
`ceph.num_full_osds` metrics would either:

* take the value `0` and be tagged with no `osd` tag if no osd is
  reporting health issues
* take a value representing the usage percentage, tagged by `osd`

This doesn't really make sense given the names of these metrics (`num_*`).
To solve this, replace them with the following:

* `ceph.num_near_full_osds` and `ceph.num_full_osds` report the total
  number of osds that are respectively near full and full. Not tagged by
  osd.
* when some osds report health issues, the check sends a
  `ceph.osd.pct_used` metric which reports the usage percentage, tagged by
  osd. Unfortunately we can't send `0` values on this metric when no osd
  reports health issues, since we can't tag by osd in that case.

This should make these metrics more usable.

Also, use `gauge` since there's no reason to use `count`.
---
 checks.d/ceph.py               | 99 ++++++++++++++++++----------------
 tests/checks/mock/test_ceph.py | 22 ++++----
 2 files changed, 65 insertions(+), 56 deletions(-)

diff --git a/checks.d/ceph.py b/checks.d/ceph.py
index 09347d6352..36e8a91e57 100644
--- a/checks.d/ceph.py
+++ b/checks.d/ceph.py
@@ -83,64 +83,69 @@ def _extract_metrics(self, raw, tags):
             self.log.debug('Error retrieving osdperf metrics')
 
         try:
+            health = {'num_near_full_osds': 0, 'num_full_osds': 0}
             # Health summary will be empty if no bad news
-            if raw['health_detail']['summary'] == []:
-                health = {'num_near_full_osds' : 0, 'num_full_osds' : 0}
-                self._publish(health, self.count, ['num_near_full_osds'], tags)
-                self._publish(health, self.count, ['num_full_osds'], tags)
-            else:
+            if raw['health_detail']['summary'] != []:
                 for osdhealth in raw['health_detail']['detail']:
                     osd, pct = self._osd_pct_used(osdhealth)
                     if osd:
                         local_tags = tags + ['ceph_osd:%s' % osd.replace('.','')]
                         if 'near' in osdhealth:
-                            health = {'num_near_full_osds' : pct}
-                            self._publish(health, self.count, ['num_near_full_osds'], local_tags)
+                            health['num_near_full_osds'] += 1
+                            local_health = {'osd.pct_used': pct}
+                            self._publish(local_health, self.gauge, ['osd.pct_used'], local_tags)
                         else:
-                            health = {'num_full_osds' : pct}
-                            self._publish(health, self.count, ['num_full_osds'], local_tags)
+                            health['num_full_osds'] += 1
+                            local_health = {'osd.pct_used': pct}
+                            self._publish(local_health, self.gauge, ['osd.pct_used'], local_tags)
+
+            self._publish(health, self.gauge, ['num_full_osds'], tags)
+            self._publish(health, self.gauge, ['num_near_full_osds'], tags)
         except KeyError:
             self.log.debug('Error retrieving health metrics')
 
-        for osdinfo in raw['osd_pool_stats']:
-            name = osdinfo.get('pool_name')
-            local_tags = tags + ['ceph_pool:%s' % name]
-            ops = 0
-            try:
-                self._publish(osdinfo, self.gauge, ['client_io_rate', 'read_op_per_sec'], local_tags)
-                ops += osdinfo['client_io_rate']['read_op_per_sec']
-            except KeyError:
-                osdinfo['client_io_rate'].update({'read_op_per_sec' : 0})
-                self._publish(osdinfo, self.gauge, ['client_io_rate', 'read_op_per_sec'], local_tags)
-
-            try:
-                self._publish(osdinfo, self.gauge, ['client_io_rate', 'write_op_per_sec'], local_tags)
-                ops += osdinfo['client_io_rate']['write_op_per_sec']
-            except KeyError:
-                osdinfo['client_io_rate'].update({'write_op_per_sec' : 0})
-                self._publish(osdinfo, self.gauge, ['client_io_rate', 'write_op_per_sec'], local_tags)
-
-            try:
-                osdinfo['client_io_rate']['op_per_sec']
-                self._publish(osdinfo, self.gauge, ['client_io_rate', 'op_per_sec'], local_tags)
-            except KeyError:
-                osdinfo['client_io_rate'].update({'op_per_sec' : ops})
-                self._publish(osdinfo, self.gauge, ['client_io_rate', 'op_per_sec'], local_tags)
-
-            try:
-                osdinfo['client_io_rate']['read_bytes_sec']
-                self._publish(osdinfo, self.gauge, ['client_io_rate', 'read_bytes_sec'], local_tags)
-            except KeyError:
-                osdinfo['client_io_rate'].update({'read_bytes_sec' : 0})
-                self._publish(osdinfo, self.gauge, ['client_io_rate', 'read_bytes_sec'], local_tags)
-
-            try:
-                osdinfo['client_io_rate']['write_bytes_sec']
-                self._publish(osdinfo, self.gauge, ['client_io_rate', 'write_bytes_sec'], local_tags)
-            except KeyError:
-                osdinfo['client_io_rate'].update({'write_bytes_sec' : 0})
-                self._publish(osdinfo, self.gauge, ['client_io_rate', 'write_bytes_sec'], local_tags)
+        try:
+            for osdinfo in raw['osd_pool_stats']:
+                name = osdinfo.get('pool_name')
+                local_tags = tags + ['ceph_pool:%s' % name]
+                ops = 0
+                try:
+                    self._publish(osdinfo, self.gauge, ['client_io_rate', 'read_op_per_sec'], local_tags)
+                    ops += osdinfo['client_io_rate']['read_op_per_sec']
+                except KeyError:
+                    osdinfo['client_io_rate'].update({'read_op_per_sec' : 0})
+                    self._publish(osdinfo, self.gauge, ['client_io_rate', 'read_op_per_sec'], local_tags)
+
+                try:
+                    self._publish(osdinfo, self.gauge, ['client_io_rate', 'write_op_per_sec'], local_tags)
+                    ops += osdinfo['client_io_rate']['write_op_per_sec']
+                except KeyError:
+                    osdinfo['client_io_rate'].update({'write_op_per_sec' : 0})
+                    self._publish(osdinfo, self.gauge, ['client_io_rate', 'write_op_per_sec'], local_tags)
+
+                try:
+                    osdinfo['client_io_rate']['op_per_sec']
+                    self._publish(osdinfo, self.gauge, ['client_io_rate', 'op_per_sec'], local_tags)
+                except KeyError:
+                    osdinfo['client_io_rate'].update({'op_per_sec' : ops})
+                    self._publish(osdinfo, self.gauge, ['client_io_rate', 'op_per_sec'], local_tags)
+
+                try:
+                    osdinfo['client_io_rate']['read_bytes_sec']
+                    self._publish(osdinfo, self.gauge, ['client_io_rate', 'read_bytes_sec'], local_tags)
+                except KeyError:
+                    osdinfo['client_io_rate'].update({'read_bytes_sec' : 0})
+                    self._publish(osdinfo, self.gauge, ['client_io_rate', 'read_bytes_sec'], local_tags)
+
+                try:
+                    osdinfo['client_io_rate']['write_bytes_sec']
+                    self._publish(osdinfo, self.gauge, ['client_io_rate', 'write_bytes_sec'], local_tags)
+                except KeyError:
+                    osdinfo['client_io_rate'].update({'write_bytes_sec' : 0})
+                    self._publish(osdinfo, self.gauge, ['client_io_rate', 'write_bytes_sec'], local_tags)
+        except KeyError:
+            self.log.debug('Error retrieving osd_pool_stats metrics')
 
         try:
             osdstatus = raw['status']['osdmap']['osdmap']
diff --git a/tests/checks/mock/test_ceph.py b/tests/checks/mock/test_ceph.py
index bb4f47a61e..c1b7a78e40 100644
--- a/tests/checks/mock/test_ceph.py
+++ b/tests/checks/mock/test_ceph.py
@@ -59,19 +59,18 @@ def test_osd_status_metrics(self):
         }
         self.run_check_twice(config, mocks=mocks, force_reload=True)
 
-        for osd in ['osd2']:
-            expected_tags = ['ceph_fsid:e0efcf84-e8ed-4916-8ce1-9c70242d390a','ceph_mon_state:leader',
-                             'ceph_osd:%s' % osd]
-
-            for metric in ['ceph.num_full_osds']:
-                self.assertMetric(metric, count=1, tags=expected_tags)
-
-        for osd in ['osd1']:
+        for osd, pct_used in [('osd1', 94), ('osd2', 95)]:
             expected_tags = ['ceph_fsid:e0efcf84-e8ed-4916-8ce1-9c70242d390a','ceph_mon_state:leader',
                              'ceph_osd:%s' % osd]
 
-            for metric in ['ceph.num_near_full_osds']:
-                self.assertMetric(metric, count=1, tags=expected_tags)
+            for metric in ['ceph.osd.pct_used']:
+                self.assertMetric(metric, value=pct_used, count=1, tags=expected_tags)
+
+        self.assertMetric('ceph.num_full_osds', value=1, count=1,
+                          tags=['ceph_fsid:e0efcf84-e8ed-4916-8ce1-9c70242d390a', 'ceph_mon_state:leader'])
+        self.assertMetric('ceph.num_near_full_osds', value=1, count=1,
+                          tags=['ceph_fsid:e0efcf84-e8ed-4916-8ce1-9c70242d390a', 'ceph_mon_state:leader'])
 
         for pool in ['rbd', 'scbench']:
             expected_tags = ['ceph_fsid:e0efcf84-e8ed-4916-8ce1-9c70242d390a','ceph_mon_state:leader',
@@ -93,3 +92,8 @@ def test_osd_status_metrics_non_osd_health(self):
         }
 
         self.run_check_twice(config, mocks=mocks, force_reload=True)
+
+        self.assertMetric('ceph.num_full_osds', value=0, count=1,
+                          tags=['ceph_fsid:7d375c2a-902a-4990-93fd-ce21a296f444', 'ceph_mon_state:leader'])
+        self.assertMetric('ceph.num_near_full_osds', value=0, count=1,
+                          tags=['ceph_fsid:7d375c2a-902a-4990-93fd-ce21a296f444', 'ceph_mon_state:leader'])
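
As a quick illustration of the reporting behavior described in the commit message, here is a
minimal standalone Python sketch. The sample health detail strings, the parse_osd_pct_used
helper and the report stub are assumptions made for the example only; the actual check goes
through self._osd_pct_used and self._publish as shown in the diff above.

# Illustrative sketch (assumed sample data, not the check itself): count
# near-full/full osds and report per-osd usage, as the new metrics do.
import re

def parse_osd_pct_used(detail):
    # Hypothetical helper: "osd.1 is near full at 94%" -> ("osd1", 94.0)
    match = re.match(r'(osd\.\d+) is (?:near )?full at (\d+)%', detail)
    if not match:
        return None, None
    return match.group(1).replace('.', ''), float(match.group(2))

def report(metric, value, tags):
    # Stand-in for the agent's gauge() call
    print('%s value=%s tags=%s' % (metric, value, tags))

def report_health(health_detail, tags):
    num_near_full, num_full = 0, 0
    for detail in health_detail.get('detail', []):
        osd, pct = parse_osd_pct_used(detail)
        if not osd:
            continue
        if 'near' in detail:
            num_near_full += 1
        else:
            num_full += 1
        # usage percentage, tagged by osd, only sent for unhealthy osds
        report('ceph.osd.pct_used', pct, tags + ['ceph_osd:%s' % osd])
    # total counts, always sent, even when the health summary is empty
    report('ceph.num_near_full_osds', num_near_full, tags)
    report('ceph.num_full_osds', num_full, tags)

report_health({'detail': ['osd.1 is near full at 94%', 'osd.2 is full at 95%']},
              ['ceph_mon_state:leader'])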