Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support Aerospike 5.3 #8430

Merged
merged 28 commits into from
Feb 1, 2021
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
f57df36
Add version parsing and checking
ChristineTChen Jan 22, 2021
8d60458
Add 5.3
ChristineTChen Jan 28, 2021
254f853
Refactor metric name matching
ChristineTChen Jan 29, 2021
60b4e38
Include ns
ChristineTChen Jan 29, 2021
c238732
Add test
ChristineTChen Jan 29, 2021
d60f373
Get all values
ChristineTChen Jan 29, 2021
554abdf
Set version condition for dc command
ChristineTChen Jan 29, 2021
923fc66
Remove duplicate metric metadata
ChristineTChen Jan 29, 2021
1c19632
Add metric list
ChristineTChen Jan 29, 2021
3812cf4
Add aerospike test with versioning
ChristineTChen Jan 30, 2021
fd814fa
Add ops sec metric assertion
ChristineTChen Jan 30, 2021
63367dc
Fix style
ChristineTChen Jan 30, 2021
eb839e1
Fix style
ChristineTChen Jan 30, 2021
2610143
Fix test version
ChristineTChen Jan 30, 2021
3e1cf26
Fix version mock
ChristineTChen Feb 1, 2021
c322be9
Ensure custom tags are used
ChristineTChen Feb 1, 2021
0d3c04c
Remove version test mock
ChristineTChen Feb 1, 2021
3ff400d
Fix style
ChristineTChen Feb 1, 2021
72d2e1f
Tag new latency metrics by bucket
ChristineTChen Feb 1, 2021
b69c873
Document new and deprecated metrics
ChristineTChen Feb 1, 2021
78a0ffd
Lower log level when version not found
ChristineTChen Feb 1, 2021
98cde01
Update aerospike/datadog_checks/aerospike/aerospike.py
ChristineTChen Feb 1, 2021
d6af601
Update aerospike/datadog_checks/aerospike/aerospike.py
ChristineTChen Feb 1, 2021
7ba1136
Update aerospike/datadog_checks/aerospike/aerospike.py
ChristineTChen Feb 1, 2021
1a24cd2
Update aerospike/datadog_checks/aerospike/aerospike.py
ChristineTChen Feb 1, 2021
777a141
Update links
ChristineTChen Feb 1, 2021
de1765d
Use tuple for version
ChristineTChen Feb 1, 2021
e7a43bc
Use tuple
ChristineTChen Feb 1, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 112 additions & 36 deletions aerospike/datadog_checks/aerospike/aerospike.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@
ENABLED_VALUES = {'true', 'on', 'enable', 'enabled'}
DISABLED_VALUES = {'false', 'off', 'disable', 'disabled'}

# Version thresholds used by `check` to pick which info commands to run.
# They are four-component integer lists so they compare element-wise against
# the list of ints that `collect_version` builds from the dotted `build` string.
V5_1 = [5, 1, 0, 0]
V5_0 = [5, 0, 0, 0]
ChristineTChen marked this conversation as resolved.
Show resolved Hide resolved


def parse_namespace(data, namespace, secondary):
idxs = []
Expand Down Expand Up @@ -141,27 +144,49 @@ def check(self, _):
set_tags.extend(namespace_tags)
self.collect_info('sets/{}/{}'.format(ns, s), SET_METRIC_TYPE, separator=':', tags=set_tags)

# https://www.aerospike.com/docs/reference/info/#dcs
try:
datacenters = self.get_datacenters()
version = self.collect_version()
if version is None:
self.log.debug("Could not determine version, using Aerospike v5.1")
ChristineTChen marked this conversation as resolved.
Show resolved Hide resolved
version = V5_1

for dc in datacenters:
self.collect_datacenter(dc)
if version < V5_1:
# https://www.aerospike.com/docs/reference/info/#throughput
self.collect_throughput(namespaces)
# https://www.aerospike.com/docs/reference/info/#latency
self.collect_latency(namespaces)

except Exception as e:
self.log.debug("There were no datacenters found: %s", e)
if version < V5_0:
# https://www.aerospike.com/docs/reference/info/#dcs
try:
datacenters = self.get_datacenters()

# https://www.aerospike.com/docs/reference/info/#throughput
self.collect_throughput(namespaces)
for dc in datacenters:
self.collect_datacenter(dc)

# https://www.aerospike.com/docs/reference/info/#latency
self.collect_latency(namespaces)
self.collect_version()
except Exception as e:
self.log.debug("There were no datacenters found: %s", e)
else:
# https://www.aerospike.com/docs/reference/info/#latencies
self.collect_latencies(namespaces)

self.service_check(SERVICE_CHECK_UP, self.OK, tags=self._tags)

def collect_version(self):
    """Fetch the server build string, report it as metadata, and parse it.

    The raw value returned by the `build` info command is passed to
    `submit_version_metadata` unchanged. It is then split on '.' and every
    component is converted to an integer.

    Returns:
        A list of ints (e.g. ``[5, 3, 0, 1]``), or ``None`` when no build
        string is available or when a component is not a plain integer.
    """
    # Query the server once; `get_info` may return None when the command fails.
    build_info = self.get_info("build")
    if not build_info:
        self.log.debug("Unable to parse version: %s", "no data returned for `build`")
        return None

    raw_version = build_info[0]
    self.submit_version_metadata(raw_version)

    try:
        version = [int(part) for part in raw_version.split('.')]
    except (AttributeError, ValueError) as e:
        # AttributeError: raw value is not a string; ValueError: non-numeric component.
        self.log.debug("Unable to parse version: %s", str(e))
        return None

    self.log.debug("Found Aerospike version: %s", version)
    return version

# Stores the given version string under the `version` metadata key of this check.
# NOTE(review): the exact gating done by `metadata_entrypoint` is defined on AgentCheck — confirm there.
@AgentCheck.metadata_entrypoint
def submit_version_metadata(self, version):
self.set_metadata('version', version)

def collect_info(self, command, metric_type, separator=';', required_keys=None, tags=None):
Expand Down Expand Up @@ -217,15 +242,18 @@ def get_client(self):
def get_info(self, command, separator=';'):
# See https://www.aerospike.com/docs/reference/info/
# Example output: command\tKEY=VALUE;KEY=VALUE;...
data = self._client.info_node(command, self._host, self._info_policies)
self.log.debug(
"Get info results for command=`%s`, host=`%s`, policies=`%s`: %s",
command,
self._host,
self._info_policies,
data,
)

try:
data = self._client.info_node(command, self._host, self._info_policies)
self.log.debug(
"Get info results for command=`%s`, host=`%s`, policies=`%s`: %s",
command,
self._host,
self._info_policies,
data,
)
except Exception as e:
self.log.warning("Command `%s` was unsuccessful: %s"(command, str(e)))
return
# Get rid of command and whitespace
data = data[len(command) :].strip()

Expand Down Expand Up @@ -259,6 +287,66 @@ def collect_datacenter(self, datacenter):
continue
self.send(DATACENTER_METRIC_TYPE, key, value, datacenter_tags)

def get_metric_name(self, line):
    """Extract the namespace and metric name from one latency output line.

    Args:
        line: a single entry of the info response, e.g. ``{ns}-read:...``.

    Returns:
        ``(namespace, metric_name)`` when the line starts with
        ``{namespace}-metric:``; ``(None, "batch-index")`` for batch-index
        lines; ``(None, None)`` after logging a warning for anything else.
    """
    # match only works at the beginning
    # ':' or ';' are not allowed in namespace-name: https://www.aerospike.com/docs/guide/limitations.html
    ns_metric_name_match = re.match(r'{([^\}:;]+)}-(\w+):', line)
    if ns_metric_name_match:
        namespace, metric_name = ns_metric_name_match.groups()
        return namespace, metric_name

    if line.startswith("batch-index"):
        # https://www.aerospike.com/docs/operations/monitor/latency/#batch-index
        return None, "batch-index"

    self.log.warning("Invalid data. Namespace and/or metric name not found in line: `%s`", line)
    # Since the data come by pair and the order matters it's safer to return right away than submitting
    # possibly wrong metrics.
    return None, None

def collect_latencies(self, namespaces):
    """Collect latency histograms through the `latencies` info command.

    In Aerospike 5.1+, the `latencies` command returns the latency histograms like so:

    histogramName_0:timeUnit,ops/sec,threshOld_0,threshOld_1,...;histogramName_1:...

    For each entry that carries data this submits:
      * ``<metric>_ops_sec`` with the ops/sec value,
      * ``<metric>`` once per threshold, tagged ``bucket:<2**i>``,
      * the legacy ``<metric>_over_{1,8,64}ms`` names for those three buckets.

    Collection stops at the first line whose metric name cannot be determined.

    Args:
        namespaces: not used by this method.
    """
    expected_bucket_count = 17  # thresholds 2**0 .. 2**16
    legacy_buckets = (1, 8, 64)

    # Iterate over every entry, including the last one; a failed command may yield None.
    for line in self.get_info('latencies:') or []:
        if not line:
            continue

        ns, metric_name = self.get_metric_name(line)
        if metric_name is None:
            return

        namespace_tags = ['namespace:{}'.format(ns)] if ns else []
        namespace_tags.extend(self._tags)

        values = re.search(r'\:\w+\,(\d*\.?\d*),([,\d+.\d+]*)', line)
        if not values:
            # Entry without latency data (e.g. `{ns}-write:`).
            continue

        ops_per_sec_val, bucket_vals = values.groups()
        # For backwards compatibility: the ops/sec value in `latencies` is already calculated
        ops_per_sec_name = metric_name + "_" + "ops_sec"
        self.send(NAMESPACE_LATENCY_METRIC_TYPE, ops_per_sec_name, float(ops_per_sec_val), namespace_tags)

        if not bucket_vals:
            continue

        latencies = bucket_vals.split(',')
        if len(latencies) != expected_bucket_count:
            self.log.debug("Got unexpected latency buckets: %s", latencies)
            continue

        for exponent, latency in enumerate(latencies):
            bucket = 2 ** exponent
            tags = namespace_tags + ['bucket:{}'.format(bucket)]
            self.send(NAMESPACE_LATENCY_METRIC_TYPE, metric_name, latency, tags)

            # Also submit old latency names like `aerospike.namespace.latency.read_over_64ms`;
            # the legacy metrics are tagged by bucket:<num> as well.
            if bucket in legacy_buckets:
                latency_name = metric_name + '_over_{}ms'.format(str(bucket))
                self.send(NAMESPACE_LATENCY_METRIC_TYPE, latency_name, latency, tags)

def collect_latency(self, namespaces):
data = self.get_info('latency:')

Expand All @@ -282,20 +370,8 @@ def collect_latency(self, namespaces):
ns_latencies[ns].setdefault("metric_values", []).extend(metric_values)
continue

# match only works at the beginning
# ':' or ';' are not allowed in namespace-name: https://www.aerospike.com/docs/guide/limitations.html
ns_metric_name_match = re.match(r'{([^\}:;]+)}-(\w+):', line)
if ns_metric_name_match:
ns = ns_metric_name_match.groups()[0]
metric_name = ns_metric_name_match.groups()[1]
elif line.startswith("batch-index"):
# https://www.aerospike.com/docs/operations/monitor/latency/#batch-index
ns = None
metric_name = "batch-index"
else:
self.log.warning("Invalid data. Namespace and/or metric name not found in line: `%s`", line)
# Since the data come by pair and the order matters it's safer to return right away than submitting
# possibly wrong metrics.
ns, metric_name = self.get_metric_name(line)
if metric_name is None:
return

# need search because this isn't at the beginning
Expand Down
63 changes: 34 additions & 29 deletions aerospike/metadata.csv
Original file line number Diff line number Diff line change
Expand Up @@ -591,6 +591,10 @@ aerospike.namespace.latency.batch_index_ops_sec,gauge,,transaction,,The batch re
aerospike.namespace.latency.batch_index_over_1ms,gauge,,transaction,,The batch read latency over 1ms.,-1,aerospike,batch read latency 1ms
aerospike.namespace.latency.batch_index_over_64ms,gauge,,transaction,,The batch read latency over 64ms.,-1,aerospike,batch read latency 64ms
aerospike.namespace.latency.batch_index_over_8ms,gauge,,transaction,,The batch read latency over 8ms.,-1,aerospike,batch read latency 8ms
aerospike.namespace.latency.batch_index,gauge,,transaction,,The batch latency tagged by bucket (1 to 2^16).,0,aerospike,
aerospike.namespace.latency.read,gauge,,transaction,,The read latency tagged by bucket (1 to 2^16).,0,aerospike,
aerospike.namespace.latency.write,gauge,,transaction,,The write latency tagged by bucket (1 to 2^16).,0,aerospike,
aerospike.namespace.latency.udf,gauge,,transaction,,The udf latency tagged by bucket (1 to 2^16).,0,aerospike,
aerospike.namespace.migrate_order,gauge,,,,The number between 1 and 10 which determines the order namespaces are to be processed when migrating.,0,aerospike,
aerospike.namespace.geo2dsphere_within.strict,gauge,,,,"An additional sanity check from Aerospike to validate whether the points returned by S2 falls under the user's query region. When set to false, Aerospike does not do this additional check and send the results as it is.",0,aerospike,
aerospike.namespace.geo2dsphere_within.earth_radius_meters,gauge,,,,"Earth's radius in meters, since the workspace here is the complete earth.",0,aerospike,
Expand Down Expand Up @@ -671,38 +675,39 @@ aerospike.system_total_cpu_pct,gauge,,percent,,The percentage of CPU usage by al
aerospike.system_user_cpu_pct,gauge,,percent,,The percentage of CPU usage by processes running in user mode.,0,aerospike,
aerospike.time_since_rebalance,gauge,,second,,"The number of seconds since the last reclustering event, either triggered via the recluster info command or by a cluster disruption (such as a node being add/removed or a network disruption).",0,aerospike,
aerospike.xdr_ship_destination_permanent_error,gauge,,error,,The number of permanent errors from the remote cluster(s) while shipping records.,0,aerospike,
aerospike.datacenter.dc_as_open_conn,gauge,,connection,,The number of open connection to the Aerospike DC.,0,aerospike,
aerospike.datacenter.dc_as_size,gauge,,,,The cluster size of the destination Aerospike DC.,0,aerospike,
aerospike.datacenter.dc_http_good_locations,gauge,,,,The number of URLs that are considered healthy and being used by the change notification system.,0,aerospike,
aerospike.datacenter.dc_http_locations,gauge,,,,The number of URLs configured for the HTTP destination.,0,aerospike,
aerospike.datacenter.dc_ship_attempts,gauge,,record,,"The number of records that have been attempted to be shipped, but could have resulted in either success or error.",0,aerospike,
aerospike.datacenter.dc_ship_bytes,gauge,,byte,,The number of bytes shipped for this DC.,0,aerospike,
aerospike.datacenter.dc_ship_delete_success,gauge,,transaction,,The number of delete transactions that have been successfully shipped.,0,aerospike,
aerospike.datacenter.dc_ship_destination_error,gauge,,error,,The number of errors from the remote cluster(s) while shipping records for this DC.,0,aerospike,
aerospike.datacenter.dc_ship_idle_avg,gauge,,millisecond,,The average number of ms of sleep for each record being shipped.,0,aerospike,
aerospike.datacenter.dc_ship_idle_avg_pct,gauge,,percent,,The representation in percent of total time spent for dc_ship_idle_avg.,0,aerospike,
aerospike.datacenter.dc_ship_inflight_objects,gauge,,record,,The number of records that are inflight (which have been shipped but for which a response from the remote DC has not yet been received).,0,aerospike,
aerospike.datacenter.dc_ship_latency_avg,gauge,,,,The moving average of shipping latency for the specific DC.,0,aerospike,
aerospike.datacenter.dc_ship_source_error,gauge,,error,,The number of client layer errors while shipping records for this DC.,0,aerospike,
aerospike.datacenter.dc_ship_success,gauge,,record,,The number of records that have been successfully shipped.,0,aerospike,
aerospike.datacenter.dc_state,gauge,,,,The state of the DC.,0,aerospike,
aerospike.datacenter.dc_timelag,gauge,,,,The time lag for this specific DC.,0,aerospike,
aerospike.datacenter.dc_deletes_shipped,gauge,,transaction,,The number of delete transactions that have been successfully shipped. Deprecated.,0,aerospike,
aerospike.datacenter.dc_err_ship_client,gauge,,error,,"The number of client layer errors while shipping records for this DC. Errors include timeout, bad network fd, etc.",0,aerospike,
aerospike.datacenter.dc_err_ship_server,gauge,,error,,"The number of errors from the remote cluster(s) while shipping records for this DC. Errors include out-of-space, key-busy, etc.",0,aerospike,
aerospike.datacenter.dc_esmt_bytes_shipped,gauge,,byte,,The number of bytes shipped for this DC. Deprecated.,0,aerospike,
aerospike.datacenter.dc_latency_avg_ship,gauge,,,,The moving average of shipping latency for the specific DC. Deprecated.,0,aerospike,
aerospike.datacenter.dc_open_conn,gauge,,connection,,"The number of open connection to the DC. If the DC accepts pipeline writes, there will be 64 connections per destination node.",0,aerospike,
aerospike.datacenter.dc_recs_inflight,gauge,,record,,The number of records that are inflight (which have been shipped but for which a response from the remote DC has not yet been received). Deprecated.,0,aerospike,
aerospike.datacenter.dc_recs_shipped,gauge,,record,,"The number of records that have been attempted to be shipped, but could have resulted in either success or error. Deprecated.",0,aerospike,
aerospike.datacenter.dc_recs_shipped_ok,gauge,,record,,The number of records that have been successfully shipped. Deprecated.,0,aerospike,
aerospike.datacenter.dc_remote_ship_avg_sleep,gauge,,millisecond,,The average number of ms of sleep for each record being shipped. Deprecated.,0,aerospike,
aerospike.datacenter.dc_size,gauge,,,,The cluster size of the destination DC.,0,aerospike,
aerospike.namespace.tps.read,gauge,,,,The throughput performace of reads,0,aerospike,
aerospike.namespace.tps.write,gauge,,,,The throughput performace of writes,0,aerospike,
aerospike.datacenter.dc_as_open_conn,gauge,,connection,,The number of open connection to the Aerospike DC. [Removed in 5.0.0],0,aerospike,
aerospike.datacenter.dc_as_size,gauge,,,,The cluster size of the destination Aerospike DC. [Removed in 5.0.0],0,aerospike,
aerospike.datacenter.dc_http_good_locations,gauge,,,,The number of URLs that are considered healthy and being used by the change notification system. [Removed in 5.0.0],0,aerospike,
aerospike.datacenter.dc_http_locations,gauge,,,,The number of URLs configured for the HTTP destination. [Removed in 5.0.0],0,aerospike,
aerospike.datacenter.dc_ship_attempts,gauge,,record,,"The number of records that have been attempted to be shipped, but could have resulted in either success or error. [Removed in 5.0.0]",0,aerospike,
aerospike.datacenter.dc_ship_bytes,gauge,,byte,,The number of bytes shipped for this DC. [Removed in 5.0.0],0,aerospike,
aerospike.datacenter.dc_ship_delete_success,gauge,,transaction,,The number of delete transactions that have been successfully shipped. [Removed in 5.0.0],0,aerospike,
aerospike.datacenter.dc_ship_destination_error,gauge,,error,,The number of errors from the remote cluster(s) while shipping records for this DC. [Removed in 5.0.0],0,aerospike,
aerospike.datacenter.dc_ship_idle_avg,gauge,,millisecond,,The average number of ms of sleep for each record being shipped. [Removed in 5.0.0],0,aerospike,
aerospike.datacenter.dc_ship_idle_avg_pct,gauge,,percent,,The representation in percent of total time spent for dc_ship_idle_avg. [Removed in 5.0.0],0,aerospike,
aerospike.datacenter.dc_ship_inflight_objects,gauge,,record,,The number of records that are inflight (which have been shipped but for which a response from the remote DC has not yet been received). [Removed in 5.0.0],0,aerospike,
aerospike.datacenter.dc_ship_latency_avg,gauge,,,,The moving average of shipping latency for the specific DC. [Removed in 5.0.0],0,aerospike,
aerospike.datacenter.dc_ship_source_error,gauge,,error,,The number of client layer errors while shipping records for this DC. [Removed in 5.0.0],0,aerospike,
aerospike.datacenter.dc_ship_success,gauge,,record,,The number of records that have been successfully shipped. [Removed in 5.0.0],0,aerospike,
aerospike.datacenter.dc_state,gauge,,,,The state of the DC. [Removed in 5.0.0],0,aerospike,
aerospike.datacenter.dc_timelag,gauge,,,,The time lag for this specific DC. [Removed in 5.0.0],0,aerospike,
aerospike.datacenter.dc_deletes_shipped,gauge,,transaction,,The number of delete transactions that have been successfully shipped. [Removed in 5.0.0],0,aerospike,
aerospike.datacenter.dc_err_ship_client,gauge,,error,,"The number of client layer errors while shipping records for this DC. Errors include timeout, bad network fd, etc. [Removed in 5.0.0]",0,aerospike,
aerospike.datacenter.dc_err_ship_server,gauge,,error,,"The number of errors from the remote cluster(s) while shipping records for this DC. Errors include out-of-space, key-busy, etc. [Removed in 5.0.0]",0,aerospike,
aerospike.datacenter.dc_esmt_bytes_shipped,gauge,,byte,,The number of bytes shipped for this DC. [Removed in 5.0.0],0,aerospike,
aerospike.datacenter.dc_latency_avg_ship,gauge,,,,The moving average of shipping latency for the specific DC. [Removed in 5.0.0],0,aerospike,
aerospike.datacenter.dc_open_conn,gauge,,connection,,"The number of open connection to the DC. If the DC accepts pipeline writes, there will be 64 connections per destination node. [Removed in 5.0.0]",0,aerospike,
aerospike.datacenter.dc_recs_inflight,gauge,,record,,The number of records that are inflight (which have been shipped but for which a response from the remote DC has not yet been received). [Removed in 5.0.0],0,aerospike,
aerospike.datacenter.dc_recs_shipped,gauge,,record,,"The number of records that have been attempted to be shipped, but could have resulted in either success or error. [Removed in 5.0.0]",0,aerospike,
aerospike.datacenter.dc_recs_shipped_ok,gauge,,record,,The number of records that have been successfully shipped. [Removed in 5.0.0],0,aerospike,
aerospike.datacenter.dc_remote_ship_avg_sleep,gauge,,millisecond,,The average number of ms of sleep for each record being shipped. [Removed in 5.0.0],0,aerospike,
aerospike.datacenter.dc_size,gauge,,,,The cluster size of the destination DC. [Removed in 5.0.0],0,aerospike,
aerospike.namespace.tps.read,gauge,,,,The throughput performance of reads. [Removed in 5.1.0],0,aerospike,
aerospike.namespace.tps.write,gauge,,,,The throughput performance of writes. [Removed in 5.1.0],0,aerospike,
aerospike.set.memory_data_bytes,gauge,,byte,,The memory used by this set for the data part (does not include index part). Value will be 0 if data is not stored in memory. ,0,aerospike,
aerospike.set.objects,gauge,,record,,The total number of objects (master and all replicas) in this set on this node.,0,aerospike,Set Objects
aerospike.set.stop_writes_count,gauge,,,,The total count this set has hit stop_writes,0,aerospike,
aerospike.set.device_data_bytes,gauge,,byte,,The device storage used by data (master and proles) excluding primary index.,0,aerospike,
aerospike.sindex.keys,gauge,,,,The number of secondary keys for this secondary index.,0,aerospike,
aerospike.sindex.entries,gauge,,,,The number of secondary index entries for this secondary index. This is the number of records that have been indexed by this secondary index.,0,aerospike,
aerospike.sindex.ibtr_memory_used,gauge,,byte,,The amount of memory the secondary index is consuming for the keys,0,aerospike,Sindex Index Memory Used
Expand Down
Loading