Rewrite Postgres size query and add postgresql.relation.{tuples,pages,all_visible} + toast_size metrics #14500

Merged 6 commits on Jun 16, 2023
18 changes: 16 additions & 2 deletions postgres/datadog_checks/postgres/postgres.py
@@ -16,7 +16,13 @@
from datadog_checks.postgres import aws
from datadog_checks.postgres.metadata import PostgresMetadata
from datadog_checks.postgres.metrics_cache import PostgresMetricsCache
from datadog_checks.postgres.relationsmanager import INDEX_BLOAT, RELATION_METRICS, TABLE_BLOAT, RelationsManager
from datadog_checks.postgres.relationsmanager import (
DYNAMIC_RELATION_QUERIES,
INDEX_BLOAT,
RELATION_METRICS,
TABLE_BLOAT,
RelationsManager,
)
from datadog_checks.postgres.statement_samples import PostgresStatementSamples
from datadog_checks.postgres.statements import PostgresStatementMetrics

@@ -85,7 +91,7 @@ def __init__(self, name, init_config, instances):
self.statement_metrics = PostgresStatementMetrics(self, self._config, shutdown_callback=self._close_db_pool)
self.statement_samples = PostgresStatementSamples(self, self._config, shutdown_callback=self._close_db_pool)
self.metadata_samples = PostgresMetadata(self, self._config, shutdown_callback=self._close_db_pool)
self._relations_manager = RelationsManager(self._config.relations)
self._relations_manager = RelationsManager(self._config.relations, self._config.max_relations)
self._clean_state()
self.check_initializations.append(lambda: RelationsManager.validate_relations_config(self._config.relations))
self.check_initializations.append(self.set_resolved_hostname_metadata)
@@ -191,6 +197,14 @@ def dynamic_queries(self):
self.log.debug("no dynamic queries defined")
return None

# Dynamic queries for relationsmanager
if self._config.relations:
for query in DYNAMIC_RELATION_QUERIES:
query = copy.copy(query)
formatted_query = self._relations_manager.filter_relation_query(query['query'], 'nspname')
query['query'] = formatted_query
queries.append(query)

self._dynamic_queries = self._new_query_executor(queries)
self._dynamic_queries.compile_queries()
self.log.debug("initialized %s dynamic querie(s)", len(queries))
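Each template in DYNAMIC_RELATION_QUERIES is shallow-copied before its {relations} and {limits} placeholders are filled in, so the shared module-level definition is never mutated across check instances. A minimal sketch of that pattern, with a hypothetical trimmed-down template:

```python
import copy

# Hypothetical template in the shape of a DYNAMIC_RELATION_QUERIES entry
QUERY_TEMPLATE = {
    'name': 'pg_class',
    'query': "SELECT ... WHERE relkind = 'r' AND {relations} {limits}",
    'columns': [],
}

def build_relation_queries(templates, relations_manager):
    queries = []
    for query in templates:
        # Shallow copy: rebinding query['query'] below leaves the shared template intact
        query = copy.copy(query)
        query['query'] = relations_manager.filter_relation_query(query['query'], 'nspname')
        queries.append(query)
    return queries
```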
90 changes: 67 additions & 23 deletions postgres/datadog_checks/postgres/relationsmanager.py
@@ -108,27 +108,66 @@

# The catalog pg_class catalogs tables and most everything else that has columns or is otherwise similar to a table.
# For this integration we are restricting the query to ordinary tables.
SIZE_METRICS = {
'descriptors': [('nspname', 'schema'), ('relname', 'table')],
'metrics': {
'pg_table_size(C.oid) as table_size': ('postgresql.table_size', AgentCheck.gauge),
'pg_indexes_size(C.oid) as index_size': ('postgresql.index_size', AgentCheck.gauge),
'pg_total_relation_size(C.oid) as total_size': ('postgresql.total_size', AgentCheck.gauge),
},
'relation': True,
#
# Sizes: calling pg_relation_size, pg_table_size, pg_indexes_size or pg_total_relation_size
# can be expensive, as the relation needs to be locked and stat syscalls are made under the hood.
#
# We want to limit those calls as much as possible, at the cost of some precision.
# We also want to report the toast size separately from the main table size.
# We can't use pg_total_relation_size, which includes toast, index and table sizes.
# The same goes for pg_table_size, which includes both toast and table size.
#
# We will mainly rely on pg_relation_size, which only gets the size of the main fork.
# To keep postgresql.table_size's old behaviour, which was based on pg_table_size, we
# approximate table_size as (relation_size + toast_size). This ignores the FSM and VM,
# but their sizes are dwarfed by the relation's size, so ignoring them is an acceptable
# trade-off to reduce the number of stat calls.
#
# The previous version filtered on nspname !~ '^pg_toast'. Since the pg_toast namespace
# only contains toast tables and their indexes, that filter was redundant with relkind = 'r'.
QUERY_PG_CLASS = {
'name': 'pg_class',
'query': """
SELECT
N.nspname,
relname,
{metrics_columns}
FROM pg_class C
LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace)
WHERE nspname NOT IN ('pg_catalog', 'information_schema') AND
nspname !~ '^pg_toast' AND
relkind = 'r' AND
{relations}""",
SELECT current_database(),
s.schemaname, s.table, s.partition_of,
s.relpages, s.reltuples, s.relallvisible,
s.relation_size + s.toast_size,
s.relation_size,
s.index_size,
s.toast_size,
s.relation_size + s.index_size + s.toast_size
FROM
(SELECT
N.nspname as schemaname,
relname as table,
I.inhparent::regclass AS partition_of,
C.relpages, C.reltuples, C.relallvisible,
pg_relation_size(C.oid) as relation_size,
CASE WHEN C.relhasindex THEN pg_indexes_size(C.oid) ELSE 0 END as index_size,
CASE WHEN C.reltoastrelid > 0 THEN pg_relation_size(C.reltoastrelid) ELSE 0 END as toast_size
FROM pg_class C
LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace)
LEFT JOIN pg_inherits I ON (I.inhrelid = C.oid)
WHERE NOT (nspname = ANY('{{pg_catalog,information_schema}}')) AND
relkind = 'r' AND
{relations} {limits}) as s""",
'columns': [
{'name': 'db', 'type': 'tag'},
{'name': 'schema', 'type': 'tag'},
{'name': 'table', 'type': 'tag'},
{'name': 'partition_of', 'type': 'tag_not_null'},
{'name': 'postgresql.relation.pages', 'type': 'gauge'},
{'name': 'postgresql.relation.tuples', 'type': 'gauge'},
{'name': 'postgresql.relation.all_visible', 'type': 'gauge'},
{'name': 'postgresql.table_size', 'type': 'gauge'},
{'name': 'postgresql.relation_size', 'type': 'gauge'},
{'name': 'postgresql.index_size', 'type': 'gauge'},
{'name': 'postgresql.toast_size', 'type': 'gauge'},
{'name': 'postgresql.total_size', 'type': 'gauge'},
],
}
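As a sanity check of the arithmetic above, here is a minimal sketch (assuming psycopg2 and a hypothetical helper name) that pulls the three raw sizes for a single table and derives table_size and total_size the same way the query does:

```python
import psycopg2

def relation_sizes(conn, table_oid):
    """Return (relation, toast, index, table, total) sizes in bytes for one table."""
    with conn.cursor() as cur:
        cur.execute(
            """
            SELECT pg_relation_size(C.oid),
                   CASE WHEN C.reltoastrelid > 0 THEN pg_relation_size(C.reltoastrelid) ELSE 0 END,
                   CASE WHEN C.relhasindex THEN pg_indexes_size(C.oid) ELSE 0 END
            FROM pg_class C
            WHERE C.oid = %s
            """,
            (table_oid,),
        )
        relation, toast, index = cur.fetchone()
    table = relation + toast          # postgresql.table_size (FSM and VM ignored)
    total = relation + index + toast  # postgresql.total_size
    return relation, toast, index, table, total
```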


# The pg_statio_all_tables view will contain one row for each table in the current database,
# showing statistics about I/O on that specific table. The pg_statio_user_tables views contain the same information,
# but filtered to only show user tables.
@@ -259,16 +298,18 @@
'relation': True,
}

RELATION_METRICS = [LOCK_METRICS, REL_METRICS, IDX_METRICS, SIZE_METRICS, STATIO_METRICS]
RELATION_METRICS = [LOCK_METRICS, REL_METRICS, IDX_METRICS, STATIO_METRICS]
DYNAMIC_RELATION_QUERIES = [QUERY_PG_CLASS]


class RelationsManager(object):
"""Builds queries to collect metrics about relations"""

def __init__(self, yamlconfig):
# type: (List[Union[str, Dict]]) -> None
def __init__(self, yamlconfig, max_relations):
# type: (List[Union[str, Dict]], int) -> None
self.log = get_check_logger()
self.config = self._build_relations_config(yamlconfig)
self.max_relations = max_relations
self.has_relations = len(self.config) > 0

def filter_relation_query(self, query, schema_field):
@@ -295,8 +336,11 @@ def filter_relation_query(self, query, schema_field):
relations_filter.append(' '.join(relation_filter))

relations_filter = '(' + ' OR '.join(relations_filter) + ')'
self.log.debug("Running query: %s with relations matching: %s", str(query), relations_filter)
return query.format(relations=relations_filter)
limits_filter = 'LIMIT {}'.format(self.max_relations)
self.log.debug(
"Running query: %s with relations matching: %s, limit: %s", str(query), relations_filter, self.max_relations
)
return query.format(relations=relations_filter, limits=limits_filter)

@staticmethod
def validate_relations_config(yamlconfig):
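To make the templating concrete: with a relations config matching test_.* and max_relations set to 300, the two placeholders resolve roughly as follows (a sketch; the exact relations filter depends on the configured relation names, regexes and schemas):

```python
# Hypothetical values for illustration
query_template = "... WHERE relkind = 'r' AND {relations} {limits}) as s"
relations_filter = "(relname ~ 'test_.*')"  # built by filter_relation_query
limits_filter = 'LIMIT 300'                 # from max_relations
print(query_template.format(relations=relations_filter, limits=limits_filter))
# ... WHERE relkind = 'r' AND (relname ~ 'test_.*') LIMIT 300) as s
```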
7 changes: 6 additions & 1 deletion postgres/metadata.csv
@@ -45,8 +45,10 @@
postgresql.analyzed,count,,,,The number of times this table has been manually analyzed.,0,postgres,analyze,
postgresql.autoanalyzed,count,,,,The number of times this table has been analyzed by the autovacuum daemon.,0,postgres,auto analyze,
postgresql.index_rows_read,gauge,,row,second,The number of index entries returned by scans on this index.,0,postgres,idx rows read,
postgresql.table_size,gauge,,byte,,"The total disk space used by the specified table. Includes TOAST, free space map, and visibility map. Excludes indexes.",0,postgres,tbl size,
postgresql.table_size,gauge,,byte,,"The disk space used by the specified table with TOAST data. Free space map and visibility map are not included.",0,postgres,tbl size,
postgresql.relation_size,gauge,,byte,,"The disk space used by the specified table. TOAST data, indexes, free space map and visibility map are not included.",0,postgres,relation size,
postgresql.index_size,gauge,,byte,,The total disk space used by indexes attached to the specified table.,0,postgres,idx size,
postgresql.toast_size,gauge,,byte,,The total disk space used by the toast table attached to the specified table.,0,postgres,toast size,
postgresql.total_size,gauge,,byte,,"The total disk space used by the table, including indexes and TOAST data.",0,postgres,tot size,
postgresql.individual_index_size,gauge,,byte,,The disk space used by a specified index.,0,postgres,idx size,
postgresql.table.count,gauge,,table,,The number of user tables in this database.,0,postgres,tbl count,
@@ -137,3 +139,6 @@
postgresql.pg_stat_statements.dealloc,count,,,,"The number of times pg_stat_statements had to evict least executed queries because pg_stat_statements.max was reached.",-1,postgres,pgss dealloc,
postgresql.control.timeline_id,gauge,,,,"The current timeline id.",0,postgres,control tid,
postgresql.control.checkpoint_delay,gauge,,second,,"The time since the last checkpoint.",0,postgres,control checkpoint,
postgresql.relation.pages,gauge,,,,"Size of a table in pages (1 page == 8KB by default). This is only an estimate used by the planner and is updated by VACUUM or ANALYZE.",0,postgres,relation pages,
postgresql.relation.tuples,gauge,,,,"Number of live rows in the table. This is only an estimate used by the planner and is updated by VACUUM or ANALYZE. If the table has never been vacuumed or analyzed, -1 will be reported.",0,postgres,relation tuples,
postgresql.relation.all_visible,gauge,,,,"Number of pages that are marked as all visible in the table's visibility map. This is only an estimate used by the planner and is updated by VACUUM or ANALYZE.",0,postgres,relation all_visible,
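Since postgresql.relation.pages is reported in pages, an approximate on-disk size for the main fork can be derived by multiplying by the page size (a sketch; assumes the default 8KB block size, which SHOW block_size can confirm):

```python
PAGE_SIZE_BYTES = 8192  # default; clusters built with a different --with-blocksize differ

def estimated_relation_bytes(relpages):
    # relpages is the planner's estimate, refreshed by VACUUM/ANALYZE,
    # so this can lag behind pg_relation_size()
    return relpages * PAGE_SIZE_BYTES

assert estimated_relation_bytes(128) == 1_048_576  # 128 pages ~= 1 MiB
```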
2 changes: 1 addition & 1 deletion postgres/pyproject.toml
@@ -30,7 +30,7 @@ classifiers = [
"Private :: Do Not Upload",
]
dependencies = [
"datadog-checks-base>=25.4.0",
"datadog-checks-base>=32.1.0",
]
dynamic = [
"version",
36 changes: 23 additions & 13 deletions postgres/tests/common.py
@@ -99,9 +99,9 @@
requires_static_version = pytest.mark.skipif(USING_LATEST, reason='Version `latest` is ever-changing, skipping')


def _iterate_metric_name(columns):
for column in columns:
if column['type'] == 'tag':
def _iterate_metric_name(query):
for column in query['columns']:
if column['type'].startswith('tag'):
continue
yield column['name']

@@ -148,8 +148,18 @@ def check_common_metrics(aggregator, expected_tags, count=1):


def check_db_count(aggregator, expected_tags, count=1):
table_count = 5
# We create 2 additional partition tables when partitioning is available
if float(POSTGRES_VERSION) >= 11.0:
table_count = 7
# And PG >= 14 will also report the parent table
if float(POSTGRES_VERSION) >= 14.0:
table_count = 8
aggregator.assert_metric(
'postgresql.table.count', value=5, count=count, tags=expected_tags + ['db:{}'.format(DB_NAME), 'schema:public']
'postgresql.table.count',
value=table_count,
count=count,
tags=expected_tags + ['db:{}'.format(DB_NAME), 'schema:public'],
)
aggregator.assert_metric('postgresql.db.count', value=106, count=1)

@@ -200,7 +210,7 @@ def check_wal_receiver_metrics(aggregator, expected_tags, count=1, connected=1):
'postgresql.wal_receiver.connected', count=count, value=1, tags=expected_tags + ['status:disconnected']
)
return
for metric_name in _iterate_metric_name(QUERY_PG_STAT_WAL_RECEIVER['columns']):
for metric_name in _iterate_metric_name(QUERY_PG_STAT_WAL_RECEIVER):
aggregator.assert_metric(metric_name, count=count, tags=expected_tags)


@@ -227,7 +237,7 @@ def check_logical_replication_slots(aggregator, expected_tags)
def check_replication_slots(aggregator, expected_tags, count=1):
if float(POSTGRES_VERSION) < 10.0:
return
for metric_name in _iterate_metric_name(QUERY_PG_REPLICATION_SLOTS['columns']):
for metric_name in _iterate_metric_name(QUERY_PG_REPLICATION_SLOTS):
if 'slot_type:physical' in expected_tags and metric_name in [
'postgresql.replication_slot.confirmed_flush_delay_bytes',
]:
@@ -247,13 +257,13 @@ def check_replication_delay(aggregator, metrics_cache, expected_tags, count=1):


def check_uptime_metrics(aggregator, expected_tags, count=1):
for column in QUERY_PG_UPTIME['columns']:
aggregator.assert_metric(column['name'], count=count, tags=expected_tags)
for metric_name in _iterate_metric_name(QUERY_PG_UPTIME):
aggregator.assert_metric(metric_name, count=count, tags=expected_tags)


def check_control_metrics(aggregator, expected_tags, count=1):
for column in QUERY_PG_CONTROL_CHECKPOINT['columns']:
aggregator.assert_metric(column['name'], count=count, tags=expected_tags)
for metric_name in _iterate_metric_name(QUERY_PG_CONTROL_CHECKPOINT):
aggregator.assert_metric(metric_name, count=count, tags=expected_tags)


def check_conflict_metrics(aggregator, expected_tags, count=1):
@@ -285,21 +295,21 @@ def check_slru_metrics(aggregator, expected_tags, count=1):


def check_snapshot_txid_metrics(aggregator, expected_tags, count=1):
for metric_name in _iterate_metric_name(SNAPSHOT_TXID_METRICS['columns']):
for metric_name in _iterate_metric_name(SNAPSHOT_TXID_METRICS):
aggregator.assert_metric(metric_name, count=count, tags=expected_tags)


def check_file_wal_metrics(aggregator, expected_tags, count=1):
if float(POSTGRES_VERSION) < 10:
return

for metric_name in _iterate_metric_name(WAL_FILE_METRICS['columns']):
for metric_name in _iterate_metric_name(WAL_FILE_METRICS):
aggregator.assert_metric(metric_name, count=count, tags=expected_tags)


def check_stat_wal_metrics(aggregator, expected_tags, count=1):
if float(POSTGRES_VERSION) < 14.0:
return

for metric_name in _iterate_metric_name(STAT_WAL_METRICS['columns']):
for metric_name in _iterate_metric_name(STAT_WAL_METRICS):
aggregator.assert_metric(metric_name, count=count, tags=expected_tags)
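Passing the whole query dict lets the helper skip every tag-like column, including the new tag_not_null type used for partition_of. A quick illustration against a trimmed-down column list (example data, not from the test suite):

```python
query = {
    'columns': [
        {'name': 'db', 'type': 'tag'},
        {'name': 'partition_of', 'type': 'tag_not_null'},  # skipped: startswith('tag')
        {'name': 'postgresql.relation.pages', 'type': 'gauge'},
    ]
}
assert list(_iterate_metric_name(query)) == ['postgresql.relation.pages']
```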
7 changes: 7 additions & 0 deletions postgres/tests/compose/resources/02_setup.sh
@@ -93,6 +93,13 @@ psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" dogs_nofunc <<-'EOSQL'
DROP FUNCTION datadog.explain_statement(l_query text, out explain JSON)
EOSQL

# Somehow, on old Postgres versions (11 and 12), wal_level is incorrectly set despite
# being present in postgresql.conf. Alter and restart to make sure we have the correct wal_level.
psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" datadog_test <<-'EOSQL'
ALTER SYSTEM SET wal_level = logical;
EOSQL
pg_ctl -D /var/lib/postgresql/data -w restart

psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" datadog_test <<-'EOSQL'
SELECT * FROM pg_create_physical_replication_slot('replication_slot');
SELECT * FROM pg_create_logical_replication_slot('logical_slot', 'test_decoding');
12 changes: 12 additions & 0 deletions postgres/tests/compose/resources/03_load_data.sh
@@ -19,6 +19,18 @@ psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" datadog_test <<-EOSQL
GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO blocking_bob;
EOSQL

if [[ !("$PG_MAJOR" == 9.*) && !("$PG_MAJOR" == 10) ]]; then
psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" datadog_test <<-EOSQL
CREATE TABLE test_part (id SERIAL PRIMARY KEY, filler text) PARTITION BY RANGE (id);
CREATE TABLE test_part1 PARTITION OF test_part FOR VALUES FROM (MINVALUE) TO (500);
CREATE TABLE test_part2 PARTITION OF test_part FOR VALUES FROM (500) TO (MAXVALUE);
CREATE INDEX test_part_id ON test_part(id);
INSERT INTO test_part (filler) SELECT generate_series(1, 2000)::text;
INSERT INTO test_part (filler) SELECT array_to_string(ARRAY(SELECT chr((65 + round(random() * 50)) :: integer) FROM generate_series(1,3000)), '');
VACUUM ANALYZE test_part;
EOSQL
fi
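With this fixture loaded, the pg_class query should tag each partition's metrics with its parent table. A hypothetical assertion in the style of the existing tests (tag values assumed, not taken from the suite):

```python
# Hypothetical: partitions created above should carry partition_of:test_part
aggregator.assert_metric(
    'postgresql.relation_size',
    tags=expected_tags + ['db:datadog_test', 'schema:public', 'table:test_part1', 'partition_of:test_part'],
)
```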

for DBNAME in dogs dogs_noschema dogs_nofunc; do

psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" "$DBNAME" <<-EOSQL