Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Provide a reason for not having an execution plan (MySQL) #9569

Merged
merged 11 commits into from
Jun 29, 2021
82 changes: 63 additions & 19 deletions mysql/datadog_checks/mysql/statement_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import time
from concurrent.futures.thread import ThreadPoolExecutor
from contextlib import closing
from enum import Enum

import pymysql
from cachetools import TTLCache
Expand All @@ -20,7 +21,7 @@
from datadog_checks.base.utils.db.utils import ConstantRateLimiter, default_json_event_encoding, resolve_db_host
from datadog_checks.base.utils.serialization import json

VALID_EXPLAIN_STATEMENTS = frozenset({'select', 'table', 'delete', 'insert', 'replace', 'update'})
SUPPORTED_EXPLAIN_STATEMENTS = frozenset({'select', 'table', 'delete', 'insert', 'replace', 'update'})
alexbarksdale marked this conversation as resolved.
Show resolved Hide resolved

# unless a specific table is configured, we try all of the events_statements tables in descending order of
# preference
Expand Down Expand Up @@ -198,6 +199,21 @@
)


class DBExplainError(Enum):
"""
Denotes the various reasons a query may not have an explain statement.
"""

# database error i.e connection error
database_error = 'database_error'

# this could be the result of a missing EXPLAIN function
invalid_schema = 'invalid_schema'

# some statements cannot be explained i.e AUTOVACUUM
no_plans_possible = 'no_plans_possible'


class MySQLStatementSamples(object):
"""
Collects statement samples and execution plans.
Expand Down Expand Up @@ -479,16 +495,22 @@ def _collect_plan_for_statement(self, row):
return None
self._explained_statements_cache[query_cache_key] = True

plan = None
plan, explain_err_code, err = None, None, None
alexbarksdale marked this conversation as resolved.
Show resolved Hide resolved
with closing(self._get_db_connection().cursor()) as cursor:
try:
plan = self._explain_statement(cursor, row['sql_text'], row['current_schema'], obfuscated_statement)
plan, explain_err_code, err = self._explain_statement(
cursor, row['sql_text'], row['current_schema'], obfuscated_statement
)
except Exception as e:
self._check.count(
"dd.mysql.statement_samples.error", 1, tags=self._tags + ["error:explain-{}".format(type(e))]
)
self._log.exception("Failed to explain statement: %s", obfuscated_statement)

collection_error = None
if explain_err_code:
collection_error = {'code': explain_err_code.value, 'message': '{}'.format(type(err)) if err else None}

normalized_plan, obfuscated_plan, plan_signature, plan_cost = None, None, None, None
if plan:
normalized_plan = datadog_agent.obfuscate_sql_exec_plan(plan, normalize=True) if plan else None
Expand All @@ -512,7 +534,12 @@ def _collect_plan_for_statement(self, row):
},
"db": {
"instance": row['current_schema'],
"plan": {"definition": obfuscated_plan, "cost": plan_cost, "signature": plan_signature},
"plan": {
"definition": obfuscated_plan,
"cost": plan_cost,
"signature": plan_signature,
"collection_error": collection_error,
},
"query_signature": query_signature,
"resource_hash": apm_resource_hash,
"statement": obfuscated_statement,
Expand Down Expand Up @@ -657,6 +684,8 @@ def _explain_statement(self, cursor, statement, schema, obfuscated_statement):
Tries the available methods used to explain a statement for the given schema. If a non-retryable
error occurs (such as a permissions error), then statements executed under the schema will be
disallowed in future attempts.
returns: An execution plan, otherwise it returns an explain error code and optionally an exception.
rtype: Tuple[Optional[Dict], Optional[DBExplainError], Optional[Exception]]
"""
start_time = time.time()
strategy_cache_key = 'explain_strategy:%s' % schema
Expand All @@ -665,13 +694,17 @@ def _explain_statement(self, cursor, statement, schema, obfuscated_statement):

self._log.debug('explaining statement. schema=%s, statement="%s"', schema, statement)

if not self._can_explain(obfuscated_statement):
ok, db_explain_error = self._can_explain(obfuscated_statement)
if not ok:
self._log.debug('Skipping statement which cannot be explained: %s', obfuscated_statement)
return None
return None, db_explain_error, None

if self._collection_strategy_cache.get(strategy_cache_key) == explain_strategy_error:
self._log.debug('Skipping statement due to cached collection failure: %s', obfuscated_statement)
return None
strategy_cache = self._collection_strategy_cache.get(strategy_cache_key)
alexbarksdale marked this conversation as resolved.
Show resolved Hide resolved
if strategy_cache:
strategy_error, db_explain_error, err = strategy_cache
if strategy_error == explain_strategy_error:
alexbarksdale marked this conversation as resolved.
Show resolved Hide resolved
self._log.debug('Skipping statement due to cached collection failure: %s', obfuscated_statement)
return None, db_explain_error, err

try:
# If there was a default schema when this query was run, then switch to it before trying to collect
Expand All @@ -682,8 +715,13 @@ def _explain_statement(self, cursor, statement, schema, obfuscated_statement):
except pymysql.err.DatabaseError as e:
if len(e.args) != 2:
raise
db_explain_error = DBExplainError.invalid_schema
alexbarksdale marked this conversation as resolved.
Show resolved Hide resolved
if e.args[0] in PYMYSQL_NON_RETRYABLE_ERRORS:
self._collection_strategy_cache[strategy_cache_key] = explain_strategy_error
self._collection_strategy_cache[strategy_cache_key] = (
explain_strategy_error,
db_explain_error,
e,
)
self._check.count(
"dd.mysql.statement_samples.error", 1, tags=tags + ["error:explain-use-schema-{}".format(type(e))]
)
Expand All @@ -694,26 +732,29 @@ def _explain_statement(self, cursor, statement, schema, obfuscated_statement):
schema,
obfuscated_statement,
)
return None
return None, db_explain_error, e

# Use a cached strategy for the schema, if any, or try each strategy to collect plans
strategies = list(self._preferred_explain_strategies)
cached = self._collection_strategy_cache.get(strategy_cache_key)
if cached:
strategies.remove(cached)
strategies.insert(0, cached)
strategy_cache = self._collection_strategy_cache.get(strategy_cache_key)
if strategy_cache:
strategy, _, _ = strategy_cache
strategies.remove(strategy)
strategies.insert(0, strategy)

db_explain_error, err = None, None
for strategy in strategies:
try:
if not schema and strategy == "PROCEDURE":
db_explain_error = DBExplainError.invalid_schema
alexbarksdale marked this conversation as resolved.
Show resolved Hide resolved
self._log.debug(
'skipping PROCEDURE strategy as there is no default schema for this statement="%s"',
obfuscated_statement,
)
continue
plan = self._explain_strategies[strategy](cursor, statement, obfuscated_statement)
if plan:
self._collection_strategy_cache[strategy_cache_key] = strategy
self._collection_strategy_cache[strategy_cache_key] = (strategy, None, None)
self._log.debug(
'Successfully collected execution plan. strategy=%s, schema=%s, statement="%s"',
strategy,
Expand All @@ -725,13 +766,14 @@ def _explain_statement(self, cursor, statement, schema, obfuscated_statement):
(time.time() - start_time) * 1000,
tags=self._tags + ["strategy:{}".format(strategy)],
)
return plan
return plan, None, None
except pymysql.err.DatabaseError as e:
if len(e.args) != 2:
raise
# we don't cache failed plan collection failures for specific queries because some queries in a schema
# can fail while others succeed. The failed collection will be cached for the specific query
# so we won't try to explain it again for the cache duration there.
db_explain_error, err = DBExplainError.database_error, e
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if we should collect all of the errors, by strategy? The problem with this approach is that we'll only get the error for the last strategy, potentially hiding other setup issues.

Perhaps for mysql instead of just one (code, reason) we have a map of strategy -> (code, reason)? Maybe under a different key: collection_error_by_strategy

@justiniso what do you think?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Discussed offline, but for context, there are cases where we wont have a strategy to make a mapping of strategy -> (code, msg) (i.e _can_explain()) so we were thinking, what if we simplify it to where we structure it as[{strategy, code, message}, ...] under collection_errors to be more universal?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Decided to implement this approach, we can revert if necessary.

self._check.count(
"dd.mysql.statement_samples.error",
1,
Expand All @@ -745,7 +787,7 @@ def _explain_statement(self, cursor, statement, schema, obfuscated_statement):
obfuscated_statement,
)
continue
return None
return None, db_explain_error, err

def _run_explain(self, cursor, statement, obfuscated_statement):
"""
Expand Down Expand Up @@ -773,7 +815,9 @@ def _run_fully_qualified_explain_procedure(self, cursor, statement, obfuscated_s

@staticmethod
def _can_explain(obfuscated_statement):
return obfuscated_statement.split(' ', 1)[0].lower() in VALID_EXPLAIN_STATEMENTS
if obfuscated_statement.split(' ', 1)[0].lower() not in SUPPORTED_EXPLAIN_STATEMENTS:
return False, DBExplainError.no_plans_possible
return True, None

@staticmethod
def _parse_execution_plan_cost(execution_plan):
Expand Down
28 changes: 24 additions & 4 deletions mysql/tests/test_mysql.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,11 +406,23 @@ def run_query(q):
)
@pytest.mark.parametrize("explain_strategy", ['PROCEDURE', 'FQ_PROCEDURE', 'STATEMENT', None])
@pytest.mark.parametrize(
"schema,statement",
"schema,statement,expected_collection_error",
[
(None, 'select name as nam from testdb.users'),
('information_schema', 'select name as nam from testdb.users'),
('testdb', 'select name as nam from users'),
(
None,
'select name as nam from testdb.users',
{'code': 'invalid_schema', 'message': None},
),
(
'information_schema',
'select * from testdb.users',
{'code': 'database_error', 'message': "<class 'pymysql.err.OperationalError'>"},
),
(
'testdb',
'select name as nam from users',
{'code': 'database_error', 'message': "<class 'pymysql.err.ProgrammingError'>"},
),
],
)
@pytest.mark.parametrize("aurora_replication_role", [None, "writer", "reader"])
Expand All @@ -422,6 +434,7 @@ def test_statement_samples_collect(
explain_strategy,
schema,
statement,
expected_collection_error,
aurora_replication_role,
caplog,
):
Expand Down Expand Up @@ -486,6 +499,13 @@ def test_statement_samples_collect(
assert 'query_block' in json.loads(event['db']['plan']['definition']), "invalid json execution plan"
assert set(event['ddtags'].split(',')) == expected_tags

# Validate the events to ensure we've provided an explanation for not providing an exec plan
for event in matching:
if event['db']['plan']['definition'] is None:
assert event['db']['plan']['collection_error'] == expected_collection_error
else:
assert event['db']['plan']['collection_error'] is None

# we avoid closing these in a try/finally block in order to maintain the connections in case we want to
# debug the test with --pdb
mysql_check._statement_samples._close_db_conn()
Expand Down