Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Provide a reason for not having an execution plan (MySQL) #9569

Merged
merged 11 commits into from
Jun 29, 2021
87 changes: 68 additions & 19 deletions mysql/datadog_checks/mysql/statement_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import time
from concurrent.futures.thread import ThreadPoolExecutor
from contextlib import closing
from enum import Enum

import pymysql
from cachetools import TTLCache
Expand All @@ -20,7 +21,7 @@
from datadog_checks.base.utils.db.utils import ConstantRateLimiter, default_json_event_encoding, resolve_db_host
from datadog_checks.base.utils.serialization import json

VALID_EXPLAIN_STATEMENTS = frozenset({'select', 'table', 'delete', 'insert', 'replace', 'update'})
SUPPORTED_EXPLAIN_STATEMENTS = frozenset({'select', 'table', 'delete', 'insert', 'replace', 'update', 'with'})

# unless a specific table is configured, we try all of the events_statements tables in descending order of
# preference
Expand Down Expand Up @@ -198,6 +199,27 @@
)


class DBExplainError(Enum):
"""
Denotes the various reasons a query may not have an explain statement.
"""

# database error i.e connection error
database_error = 'database_error'

# this could be the result of a missing EXPLAIN function
invalid_schema = 'invalid_schema'

# some statements cannot be explained i.e AUTOVACUUM
no_plans_possible = 'no_plans_possible'

# unable to collect an exec plan with the strategy 'PROCEDURE' without a default schema
procedure_strategy_requires_default_schema = 'procedure_strategy_requires_default_schema'

# agent may not have access to the default schema
use_schema_error = 'use_schema_error'


class MySQLStatementSamples(object):
"""
Collects statement samples and execution plans.
Expand Down Expand Up @@ -479,16 +501,29 @@ def _collect_plan_for_statement(self, row):
return None
self._explained_statements_cache[query_cache_key] = True

plan = None
plan, explain_errors = None, None
with closing(self._get_db_connection().cursor()) as cursor:
try:
plan = self._explain_statement(cursor, row['sql_text'], row['current_schema'], obfuscated_statement)
plan, explain_errors = self._explain_statement(
cursor, row['sql_text'], row['current_schema'], obfuscated_statement
)
except Exception as e:
self._check.count(
"dd.mysql.statement_samples.error", 1, tags=self._tags + ["error:explain-{}".format(type(e))]
)
self._log.exception("Failed to explain statement: %s", obfuscated_statement)

collection_errors = []
if explain_errors:
for strategy, explain_err_code, explain_err in explain_errors:
collection_errors.append(
{
'strategy': strategy if strategy else None,
'code': explain_err_code.value if explain_err_code else None,
'message': '{}'.format(type(explain_err)) if explain_err else None,
}
)

normalized_plan, obfuscated_plan, plan_signature, plan_cost = None, None, None, None
if plan:
normalized_plan = datadog_agent.obfuscate_sql_exec_plan(plan, normalize=True) if plan else None
Expand All @@ -512,7 +547,12 @@ def _collect_plan_for_statement(self, row):
},
"db": {
"instance": row['current_schema'],
"plan": {"definition": obfuscated_plan, "cost": plan_cost, "signature": plan_signature},
"plan": {
"definition": obfuscated_plan,
"cost": plan_cost,
"signature": plan_signature,
"collection_errors": collection_errors if collection_errors else None,
},
"query_signature": query_signature,
"resource_hash": apm_resource_hash,
"statement": obfuscated_statement,
Expand Down Expand Up @@ -657,21 +697,25 @@ def _explain_statement(self, cursor, statement, schema, obfuscated_statement):
Tries the available methods used to explain a statement for the given schema. If a non-retryable
error occurs (such as a permissions error), then statements executed under the schema will be
disallowed in future attempts.
returns: An execution plan, otherwise it returns a list of tuples containing: (strategy, error code, error)
rtype: Optional[Dict], List[Tuple[Optional[str], Optional[DBExplainError], Optional[Exception]]]
"""
start_time = time.time()
strategy_cache_key = 'explain_strategy:%s' % schema
explain_strategy_error = 'ERROR'
tags = self._tags + ["schema:{}".format(schema)]

self._log.debug('explaining statement. schema=%s, statement="%s"', schema, statement)

if not self._can_explain(obfuscated_statement):
self._log.debug('Skipping statement which cannot be explained: %s', obfuscated_statement)
return None
return None, [(None, DBExplainError.no_plans_possible, None)]

if self._collection_strategy_cache.get(strategy_cache_key) == explain_strategy_error:
self._log.debug('Skipping statement due to cached collection failure: %s', obfuscated_statement)
return None
cached_strategy = self._collection_strategy_cache.get(strategy_cache_key)
if cached_strategy:
_, explain_err_code, _ = cached_strategy[0] # (strategy, explain_err_code, explain_err)
if explain_err_code:
self._log.debug('Skipping statement due to cached collection failure: %s', obfuscated_statement)
return None, None

try:
# If there was a default schema when this query was run, then switch to it before trying to collect
Expand All @@ -682,8 +726,9 @@ def _explain_statement(self, cursor, statement, schema, obfuscated_statement):
except pymysql.err.DatabaseError as e:
if len(e.args) != 2:
raise
explain_errors = [(None, DBExplainError.use_schema_error, e)]
if e.args[0] in PYMYSQL_NON_RETRYABLE_ERRORS:
self._collection_strategy_cache[strategy_cache_key] = explain_strategy_error
self._collection_strategy_cache[strategy_cache_key] = explain_errors
self._check.count(
"dd.mysql.statement_samples.error", 1, tags=tags + ["error:explain-use-schema-{}".format(type(e))]
)
Expand All @@ -694,26 +739,29 @@ def _explain_statement(self, cursor, statement, schema, obfuscated_statement):
schema,
obfuscated_statement,
)
return None
return None, explain_errors

# Use a cached strategy for the schema, if any, or try each strategy to collect plans
strategies = list(self._preferred_explain_strategies)
cached = self._collection_strategy_cache.get(strategy_cache_key)
if cached:
strategies.remove(cached)
strategies.insert(0, cached)
cached_strategy = self._collection_strategy_cache.get(strategy_cache_key)
if cached_strategy:
strategy, _, _ = cached_strategy[0] # (strategy, explain_err_code, explain_err)
strategies.remove(strategy)
strategies.insert(0, strategy)

explain_errors = []
for strategy in strategies:
try:
if not schema and strategy == "PROCEDURE":
explain_errors.append((strategy, DBExplainError.procedure_strategy_requires_default_schema, None))
self._log.debug(
'skipping PROCEDURE strategy as there is no default schema for this statement="%s"',
obfuscated_statement,
)
continue
plan = self._explain_strategies[strategy](cursor, statement, obfuscated_statement)
if plan:
self._collection_strategy_cache[strategy_cache_key] = strategy
self._collection_strategy_cache[strategy_cache_key] = [(strategy, None, None)]
self._log.debug(
'Successfully collected execution plan. strategy=%s, schema=%s, statement="%s"',
strategy,
Expand All @@ -725,13 +773,14 @@ def _explain_statement(self, cursor, statement, schema, obfuscated_statement):
(time.time() - start_time) * 1000,
tags=self._tags + ["strategy:{}".format(strategy)],
)
return plan
return plan, None
except pymysql.err.DatabaseError as e:
if len(e.args) != 2:
raise
# we don't cache failed plan collection failures for specific queries because some queries in a schema
# can fail while others succeed. The failed collection will be cached for the specific query
# so we won't try to explain it again for the cache duration there.
explain_errors.append((strategy, DBExplainError.database_error, e))
self._check.count(
"dd.mysql.statement_samples.error",
1,
Expand All @@ -745,7 +794,7 @@ def _explain_statement(self, cursor, statement, schema, obfuscated_statement):
obfuscated_statement,
)
continue
return None
return None, explain_errors

def _run_explain(self, cursor, statement, obfuscated_statement):
"""
Expand Down Expand Up @@ -773,7 +822,7 @@ def _run_fully_qualified_explain_procedure(self, cursor, statement, obfuscated_s

@staticmethod
def _can_explain(obfuscated_statement):
return obfuscated_statement.split(' ', 1)[0].lower() in VALID_EXPLAIN_STATEMENTS
return obfuscated_statement.split(' ', 1)[0].lower() in SUPPORTED_EXPLAIN_STATEMENTS

@staticmethod
def _parse_execution_plan_cost(execution_plan):
Expand Down
34 changes: 30 additions & 4 deletions mysql/tests/test_mysql.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,11 +406,29 @@ def run_query(q):
)
@pytest.mark.parametrize("explain_strategy", ['PROCEDURE', 'FQ_PROCEDURE', 'STATEMENT', None])
@pytest.mark.parametrize(
"schema,statement",
"schema,statement,expected_collection_errors",
[
(None, 'select name as nam from testdb.users'),
('information_schema', 'select name as nam from testdb.users'),
('testdb', 'select name as nam from users'),
(
None,
'select name as nam from testdb.users',
[{'strategy': 'PROCEDURE', 'code': 'procedure_strategy_requires_default_schema', 'message': None}],
),
(
'information_schema',
'select * from testdb.users',
[{'strategy': 'PROCEDURE', 'code': 'database_error', 'message': "<class 'pymysql.err.OperationalError'>"}],
),
(
'testdb',
'select name as nam from users',
[
{
'strategy': 'FQ_PROCEDURE',
'code': 'database_error',
'message': "<class 'pymysql.err.ProgrammingError'>",
}
],
),
],
)
@pytest.mark.parametrize("aurora_replication_role", [None, "writer", "reader"])
Expand All @@ -422,6 +440,7 @@ def test_statement_samples_collect(
explain_strategy,
schema,
statement,
expected_collection_errors,
aurora_replication_role,
caplog,
):
Expand Down Expand Up @@ -486,6 +505,13 @@ def test_statement_samples_collect(
assert 'query_block' in json.loads(event['db']['plan']['definition']), "invalid json execution plan"
assert set(event['ddtags'].split(',')) == expected_tags

# Validate the events to ensure we've provided an explanation for not providing an exec plan
for event in matching:
if event['db']['plan']['definition'] is None:
assert event['db']['plan']['collection_errors'] == expected_collection_errors
else:
assert event['db']['plan']['collection_errors'] is None

# we avoid closing these in a try/finally block in order to maintain the connections in case we want to
# debug the test with --pdb
mysql_check._statement_samples._close_db_conn()
Expand Down