From c9a01deff1bc301a7a66a82db3bb2c8a422f3d8d Mon Sep 17 00:00:00 2001
From: Olivier Vielpeau
Date: Fri, 16 Sep 2016 18:04:38 +0200
Subject: [PATCH] [ceph] Improve health metrics (#2852)

* Catch `KeyError` on `osd_pool_stats`

We do it for all the other keys of the raw dict, so let's do it for this
one too (even though I'm not a huge fan of this "try/except everything"
approach).

* Update health metrics for usability

Behavior before this commit: the `ceph.num_near_full_osds` and
`ceph.num_full_osds` metrics would either:

* take the value `0` and be tagged with no `osd` tag if no osd is
  reporting health issues
* take a value representing the usage percentage, tagged by `osd`

This doesn't really make sense given the names of these metrics (`num_*`).
To solve this, replace them with the following:

* `ceph.num_near_full_osds` and `ceph.num_full_osds` report the total
  number of osds that are respectively near full and full. Not tagged by
  osd.
* when some osds report health issues, the check sends a
  `ceph.osd.pct_used` metric which reports the usage percentage, tagged by
  osd. Unfortunately we can't send `0` values on this metric when no osd
  reports health issues, since we can't tag by osd in that case.

This should make these metrics more usable.

Also, use `gauge` since there's no reason to use `count`.
---
 checks.d/ceph.py               | 99 ++++++++++++++++++----------------
 tests/checks/mock/test_ceph.py | 22 ++++----
 2 files changed, 65 insertions(+), 56 deletions(-)

diff --git a/checks.d/ceph.py b/checks.d/ceph.py
index 09347d6352..36e8a91e57 100644
--- a/checks.d/ceph.py
+++ b/checks.d/ceph.py
@@ -83,64 +83,69 @@ def _extract_metrics(self, raw, tags):
             self.log.debug('Error retrieving osdperf metrics')
 
         try:
+            health = {'num_near_full_osds': 0, 'num_full_osds': 0}
             # Health summary will be empty if no bad news
-            if raw['health_detail']['summary'] == []:
-                health = {'num_near_full_osds' : 0, 'num_full_osds' : 0}
-                self._publish(health, self.count, ['num_near_full_osds'], tags)
-                self._publish(health, self.count, ['num_full_osds'], tags)
-            else:
+            if raw['health_detail']['summary'] != []:
                 for osdhealth in raw['health_detail']['detail']:
                     osd, pct = self._osd_pct_used(osdhealth)
                     if osd:
                         local_tags = tags + ['ceph_osd:%s' % osd.replace('.','')]
                         if 'near' in osdhealth:
-                            health = {'num_near_full_osds' : pct}
-                            self._publish(health, self.count, ['num_near_full_osds'], local_tags)
+                            health['num_near_full_osds'] += 1
+                            local_health = {'osd.pct_used': pct}
+                            self._publish(local_health, self.gauge, ['osd.pct_used'], local_tags)
                         else:
-                            health = {'num_full_osds' : pct}
-                            self._publish(health, self.count, ['num_full_osds'], local_tags)
+                            health['num_full_osds'] += 1
+                            local_health = {'osd.pct_used': pct}
+                            self._publish(local_health, self.gauge, ['osd.pct_used'], local_tags)
+
+            self._publish(health, self.gauge, ['num_full_osds'], tags)
+            self._publish(health, self.gauge, ['num_near_full_osds'], tags)
         except KeyError:
             self.log.debug('Error retrieving health metrics')
 
-        for osdinfo in raw['osd_pool_stats']:
-            name = osdinfo.get('pool_name')
-            local_tags = tags + ['ceph_pool:%s' % name]
-            ops = 0
-            try:
-                self._publish(osdinfo, self.gauge, ['client_io_rate', 'read_op_per_sec'], local_tags)
-                ops += osdinfo['client_io_rate']['read_op_per_sec']
-            except KeyError:
-                osdinfo['client_io_rate'].update({'read_op_per_sec' : 0})
-                self._publish(osdinfo, self.gauge, ['client_io_rate', 'read_op_per_sec'], local_tags)
-
-            try:
-                self._publish(osdinfo, self.gauge, ['client_io_rate', 'write_op_per_sec'], local_tags)
-                ops += osdinfo['client_io_rate']['write_op_per_sec']
-            except KeyError:
-                osdinfo['client_io_rate'].update({'write_op_per_sec' : 0})
-                self._publish(osdinfo, self.gauge, ['client_io_rate', 'write_op_per_sec'], local_tags)
-
-            try:
-                osdinfo['client_io_rate']['op_per_sec']
-                self._publish(osdinfo, self.gauge, ['client_io_rate', 'op_per_sec'], local_tags)
-            except KeyError:
-                osdinfo['client_io_rate'].update({'op_per_sec' : ops})
-                self._publish(osdinfo, self.gauge, ['client_io_rate', 'op_per_sec'], local_tags)
-
-            try:
-                osdinfo['client_io_rate']['read_bytes_sec']
-                self._publish(osdinfo, self.gauge, ['client_io_rate', 'read_bytes_sec'], local_tags)
-            except KeyError:
-                osdinfo['client_io_rate'].update({'read_bytes_sec' : 0})
-                self._publish(osdinfo, self.gauge, ['client_io_rate', 'read_bytes_sec'], local_tags)
-
-            try:
-                osdinfo['client_io_rate']['write_bytes_sec']
-                self._publish(osdinfo, self.gauge, ['client_io_rate', 'write_bytes_sec'], local_tags)
-            except KeyError:
-                osdinfo['client_io_rate'].update({'write_bytes_sec' : 0})
-                self._publish(osdinfo, self.gauge, ['client_io_rate', 'write_bytes_sec'], local_tags)
+        try:
+            for osdinfo in raw['osd_pool_stats']:
+                name = osdinfo.get('pool_name')
+                local_tags = tags + ['ceph_pool:%s' % name]
+                ops = 0
+                try:
+                    self._publish(osdinfo, self.gauge, ['client_io_rate', 'read_op_per_sec'], local_tags)
+                    ops += osdinfo['client_io_rate']['read_op_per_sec']
+                except KeyError:
+                    osdinfo['client_io_rate'].update({'read_op_per_sec' : 0})
+                    self._publish(osdinfo, self.gauge, ['client_io_rate', 'read_op_per_sec'], local_tags)
+
+                try:
+                    self._publish(osdinfo, self.gauge, ['client_io_rate', 'write_op_per_sec'], local_tags)
+                    ops += osdinfo['client_io_rate']['write_op_per_sec']
+                except KeyError:
+                    osdinfo['client_io_rate'].update({'write_op_per_sec' : 0})
+                    self._publish(osdinfo, self.gauge, ['client_io_rate', 'write_op_per_sec'], local_tags)
+
+                try:
+                    osdinfo['client_io_rate']['op_per_sec']
+                    self._publish(osdinfo, self.gauge, ['client_io_rate', 'op_per_sec'], local_tags)
+                except KeyError:
+                    osdinfo['client_io_rate'].update({'op_per_sec' : ops})
+                    self._publish(osdinfo, self.gauge, ['client_io_rate', 'op_per_sec'], local_tags)
+
+                try:
+                    osdinfo['client_io_rate']['read_bytes_sec']
+                    self._publish(osdinfo, self.gauge, ['client_io_rate', 'read_bytes_sec'], local_tags)
+                except KeyError:
+                    osdinfo['client_io_rate'].update({'read_bytes_sec' : 0})
+                    self._publish(osdinfo, self.gauge, ['client_io_rate', 'read_bytes_sec'], local_tags)
+
+                try:
+                    osdinfo['client_io_rate']['write_bytes_sec']
+                    self._publish(osdinfo, self.gauge, ['client_io_rate', 'write_bytes_sec'], local_tags)
+                except KeyError:
+                    osdinfo['client_io_rate'].update({'write_bytes_sec' : 0})
+                    self._publish(osdinfo, self.gauge, ['client_io_rate', 'write_bytes_sec'], local_tags)
+        except KeyError:
+            self.log.debug('Error retrieving osd_pool_stats metrics')
 
         try:
             osdstatus = raw['status']['osdmap']['osdmap']
diff --git a/tests/checks/mock/test_ceph.py b/tests/checks/mock/test_ceph.py
index bb4f47a61e..c1b7a78e40 100644
--- a/tests/checks/mock/test_ceph.py
+++ b/tests/checks/mock/test_ceph.py
@@ -59,19 +59,18 @@ def test_osd_status_metrics(self):
         }
         self.run_check_twice(config, mocks=mocks, force_reload=True)
 
-        for osd in ['osd2']:
-            expected_tags = ['ceph_fsid:e0efcf84-e8ed-4916-8ce1-9c70242d390a','ceph_mon_state:leader',
-                             'ceph_osd:%s' % osd]
-
-            for metric in ['ceph.num_full_osds']:
-                self.assertMetric(metric, count=1, tags=expected_tags)
-
-        for osd in ['osd1']:
+        for osd, pct_used in [('osd1', 94), ('osd2', 95)]:
             expected_tags = ['ceph_fsid:e0efcf84-e8ed-4916-8ce1-9c70242d390a','ceph_mon_state:leader',
                              'ceph_osd:%s' % osd]
 
-            for metric in ['ceph.num_near_full_osds']:
-                self.assertMetric(metric, count=1, tags=expected_tags)
+            for metric in ['ceph.osd.pct_used']:
+                self.assertMetric(metric, value=pct_used, count=1, tags=expected_tags)
+
+        self.assertMetric('ceph.num_full_osds', value=1, count=1,
+                          tags=['ceph_fsid:e0efcf84-e8ed-4916-8ce1-9c70242d390a', 'ceph_mon_state:leader'])
+        self.assertMetric('ceph.num_near_full_osds', value=1, count=1,
+                          tags=['ceph_fsid:e0efcf84-e8ed-4916-8ce1-9c70242d390a', 'ceph_mon_state:leader'])
 
         for pool in ['rbd', 'scbench']:
             expected_tags = ['ceph_fsid:e0efcf84-e8ed-4916-8ce1-9c70242d390a','ceph_mon_state:leader',
@@ -93,3 +92,8 @@ def test_osd_status_metrics_non_osd_health(self):
         }
 
         self.run_check_twice(config, mocks=mocks, force_reload=True)
+
+        self.assertMetric('ceph.num_full_osds', value=0, count=1,
+                          tags=['ceph_fsid:7d375c2a-902a-4990-93fd-ce21a296f444', 'ceph_mon_state:leader'])
+        self.assertMetric('ceph.num_near_full_osds', value=0, count=1,
+                          tags=['ceph_fsid:7d375c2a-902a-4990-93fd-ce21a296f444', 'ceph_mon_state:leader'])
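
As a quick illustration of the reporting behavior described in the commit message, here is a
minimal standalone Python sketch. The sample health detail strings, the parse_osd_pct_used
helper and the report stub are assumptions made for the example only; the actual check goes
through self._osd_pct_used and self._publish as shown in the diff above.

# Illustrative sketch (assumed sample data, not the check itself): count
# near-full/full osds and report per-osd usage, as the new metrics do.
import re

def parse_osd_pct_used(detail):
    # Hypothetical helper: "osd.1 is near full at 94%" -> ("osd1", 94.0)
    match = re.match(r'(osd\.\d+) is (?:near )?full at (\d+)%', detail)
    if not match:
        return None, None
    return match.group(1).replace('.', ''), float(match.group(2))

def report(metric, value, tags):
    # Stand-in for the agent's gauge() call
    print('%s value=%s tags=%s' % (metric, value, tags))

def report_health(health_detail, tags):
    num_near_full, num_full = 0, 0
    for detail in health_detail.get('detail', []):
        osd, pct = parse_osd_pct_used(detail)
        if not osd:
            continue
        if 'near' in detail:
            num_near_full += 1
        else:
            num_full += 1
        # usage percentage, tagged by osd, only sent for unhealthy osds
        report('ceph.osd.pct_used', pct, tags + ['ceph_osd:%s' % osd])
    # total counts, always sent, even when the health summary is empty
    report('ceph.num_near_full_osds', num_near_full, tags)
    report('ceph.num_full_osds', num_full, tags)

report_health({'detail': ['osd.1 is near full at 94%', 'osd.2 is full at 95%']},
              ['ceph_mon_state:leader'])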