DataDog · alexbarksdale · Jun 29, 2021 · Jun 21, 2021 · Jun 21, 2021 · Jun 22, 2021
@@ -5,6 +5,7 @@
 import time
 from concurrent.futures.thread import ThreadPoolExecutor
 from contextlib import closing
+from enum import Enum
 
 import pymysql
 from cachetools import TTLCache
@@ -20,7 +21,7 @@
 from datadog_checks.base.utils.db.utils import ConstantRateLimiter, default_json_event_encoding, resolve_db_host
 from datadog_checks.base.utils.serialization import json
 
-VALID_EXPLAIN_STATEMENTS = frozenset({'select', 'table', 'delete', 'insert', 'replace', 'update'})
+SUPPORTED_EXPLAIN_STATEMENTS = frozenset({'select', 'table', 'delete', 'insert', 'replace', 'update'})
 
 # unless a specific table is configured, we try all of the events_statements tables in descending order of
 # preference
@@ -198,6 +199,27 @@
 )
 
 
+class DBExplainError(Enum):
+    """
+    Denotes the various reasons a query may not have an explain statement.
+    """
+
+    # database error i.e connection error
+    database_error = 'database_error'
+
+    # this could be the result of a missing EXPLAIN function
+    invalid_schema = 'invalid_schema'
+
+    # some statements cannot be explained i.e AUTOVACUUM
+    no_plans_possible = 'no_plans_possible'
+
+    # unable to collect an exec plan with the strategy 'PROCEDURE' without a default schema
+    procedure_strategy_requires_default_schema = 'procedure_strategy_requires_default_schema'
+
+    # agent may not have access to the default schema
+    use_schema_error = 'use_schema_error'
+
+
 class MySQLStatementSamples(object):
     """
     Collects statement samples and execution plans.
@@ -479,16 +501,29 @@ def _collect_plan_for_statement(self, row):
             return None
         self._explained_statements_cache[query_cache_key] = True
 
-        plan = None
+        plan, explain_errors = None, None
         with closing(self._get_db_connection().cursor()) as cursor:
             try:
-                plan = self._explain_statement(cursor, row['sql_text'], row['current_schema'], obfuscated_statement)
+                plan, explain_errors = self._explain_statement(
+                    cursor, row['sql_text'], row['current_schema'], obfuscated_statement
+                )
             except Exception as e:
                 self._check.count(
                     "dd.mysql.statement_samples.error", 1, tags=self._tags + ["error:explain-{}".format(type(e))]
                 )
                 self._log.exception("Failed to explain statement: %s", obfuscated_statement)
 
+        collection_errors = []
+        if explain_errors:
+            for strategy, explain_err_code, explain_err in explain_errors:
+                collection_errors.append(
+                    {
+                        'strategy': strategy if strategy else None,
+                        'code': explain_err_code.value if explain_err_code else None,
+                        'message': '{}'.format(type(explain_err)) if explain_err else None,
+                    }
+                )
+
         normalized_plan, obfuscated_plan, plan_signature, plan_cost = None, None, None, None
         if plan:
             normalized_plan = datadog_agent.obfuscate_sql_exec_plan(plan, normalize=True) if plan else None
@@ -512,7 +547,12 @@ def _collect_plan_for_statement(self, row):
                 },
                 "db": {
                     "instance": row['current_schema'],
-                    "plan": {"definition": obfuscated_plan, "cost": plan_cost, "signature": plan_signature},
+                    "plan": {
+                        "definition": obfuscated_plan,
+                        "cost": plan_cost,
+                        "signature": plan_signature,
+                        "collection_errors": collection_errors if collection_errors else None,
+                    },
                     "query_signature": query_signature,
                     "resource_hash": apm_resource_hash,
                     "statement": obfuscated_statement,
@@ -657,21 +697,25 @@ def _explain_statement(self, cursor, statement, schema, obfuscated_statement):
         Tries the available methods used to explain a statement for the given schema. If a non-retryable
         error occurs (such as a permissions error), then statements executed under the schema will be
         disallowed in future attempts.
+        returns: An execution plan, otherwise it returns a list of tuples containing: (strategy, error code, error)
+        rtype: Optional[Dict], List[Tuple[Optional[str], Optional[DBExplainError], Optional[Exception]]]
         """
         start_time = time.time()
         strategy_cache_key = 'explain_strategy:%s' % schema
-        explain_strategy_error = 'ERROR'
         tags = self._tags + ["schema:{}".format(schema)]
 
         self._log.debug('explaining statement. schema=%s, statement="%s"', schema, statement)
 
         if not self._can_explain(obfuscated_statement):
             self._log.debug('Skipping statement which cannot be explained: %s', obfuscated_statement)
-            return None
+            return None, [(None, DBExplainError.no_plans_possible, None)]
 
-        if self._collection_strategy_cache.get(strategy_cache_key) == explain_strategy_error:
-            self._log.debug('Skipping statement due to cached collection failure: %s', obfuscated_statement)
-            return None
+        cached_strategy = self._collection_strategy_cache.get(strategy_cache_key)
+        if cached_strategy:
+            _, explain_err_code, _ = cached_strategy[0]  # (strategy, explain_err_code, explain_err)
+            if explain_err_code:
+                self._log.debug('Skipping statement due to cached collection failure: %s', obfuscated_statement)
+                return None, None
 
         try:
             # If there was a default schema when this query was run, then switch to it before trying to collect
@@ -682,8 +726,9 @@ def _explain_statement(self, cursor, statement, schema, obfuscated_statement):
         except pymysql.err.DatabaseError as e:
             if len(e.args) != 2:
                 raise
+            explain_errors = [(None, DBExplainError.use_schema_error, e)]
             if e.args[0] in PYMYSQL_NON_RETRYABLE_ERRORS:
-                self._collection_strategy_cache[strategy_cache_key] = explain_strategy_error
+                self._collection_strategy_cache[strategy_cache_key] = explain_errors
             self._check.count(
                 "dd.mysql.statement_samples.error", 1, tags=tags + ["error:explain-use-schema-{}".format(type(e))]
             )
@@ -694,26 +739,29 @@ def _explain_statement(self, cursor, statement, schema, obfuscated_statement):
                 schema,
                 obfuscated_statement,
             )
-            return None
+            return None, explain_errors
 
         # Use a cached strategy for the schema, if any, or try each strategy to collect plans
         strategies = list(self._preferred_explain_strategies)
-        cached = self._collection_strategy_cache.get(strategy_cache_key)
-        if cached:
-            strategies.remove(cached)
-            strategies.insert(0, cached)
+        cached_strategy = self._collection_strategy_cache.get(strategy_cache_key)
+        if cached_strategy:
+            strategy, _, _ = cached_strategy[0]  # (strategy, explain_err_code, explain_err)
+            strategies.remove(strategy)
+            strategies.insert(0, strategy)
 
+        explain_errors = []
         for strategy in strategies:
             try:
                 if not schema and strategy == "PROCEDURE":
+                    explain_errors.append((strategy, DBExplainError.procedure_strategy_requires_default_schema, None))
                     self._log.debug(
                         'skipping PROCEDURE strategy as there is no default schema for this statement="%s"',
                         obfuscated_statement,
                     )
                     continue
                 plan = self._explain_strategies[strategy](cursor, statement, obfuscated_statement)
                 if plan:
-                    self._collection_strategy_cache[strategy_cache_key] = strategy
+                    self._collection_strategy_cache[strategy_cache_key] = [(strategy, None, None)]
                     self._log.debug(
                         'Successfully collected execution plan. strategy=%s, schema=%s, statement="%s"',
                         strategy,
@@ -725,13 +773,14 @@ def _explain_statement(self, cursor, statement, schema, obfuscated_statement):
                         (time.time() - start_time) * 1000,
                         tags=self._tags + ["strategy:{}".format(strategy)],
                     )
-                    return plan
+                    return plan, None
             except pymysql.err.DatabaseError as e:
                 if len(e.args) != 2:
                     raise
                 # we don't cache failed plan collection failures for specific queries because some queries in a schema
                 # can fail while others succeed. The failed collection will be cached for the specific query
                 # so we won't try to explain it again for the cache duration there.
+                explain_errors.append((strategy, DBExplainError.database_error, e))
                 self._check.count(
                     "dd.mysql.statement_samples.error",
                     1,
@@ -745,7 +794,7 @@ def _explain_statement(self, cursor, statement, schema, obfuscated_statement):
                     obfuscated_statement,
                 )
                 continue
-        return None
+        return None, explain_errors
 
     def _run_explain(self, cursor, statement, obfuscated_statement):
         """
@@ -773,7 +822,7 @@ def _run_fully_qualified_explain_procedure(self, cursor, statement, obfuscated_s
 
     @staticmethod
     def _can_explain(obfuscated_statement):
-        return obfuscated_statement.split(' ', 1)[0].lower() in VALID_EXPLAIN_STATEMENTS
+        return obfuscated_statement.split(' ', 1)[0].lower() in SUPPORTED_EXPLAIN_STATEMENTS
 
     @staticmethod
     def _parse_execution_plan_cost(execution_plan):

@@ -406,11 +406,29 @@ def run_query(q):
 )
 @pytest.mark.parametrize("explain_strategy", ['PROCEDURE', 'FQ_PROCEDURE', 'STATEMENT', None])
 @pytest.mark.parametrize(
-    "schema,statement",
+    "schema,statement,expected_collection_errors",
     [
-        (None, 'select name as nam from testdb.users'),
-        ('information_schema', 'select name as nam from testdb.users'),
-        ('testdb', 'select name as nam from users'),
+        (
+            None,
+            'select name as nam from testdb.users',
+            [{'strategy': 'PROCEDURE', 'code': 'procedure_strategy_requires_default_schema', 'message': None}],
+        ),
+        (
+            'information_schema',
+            'select * from testdb.users',
+            [{'strategy': 'PROCEDURE', 'code': 'database_error', 'message': "<class 'pymysql.err.OperationalError'>"}],
+        ),
+        (
+            'testdb',
+            'select name as nam from users',
+            [
+                {
+                    'strategy': 'FQ_PROCEDURE',
+                    'code': 'database_error',
+                    'message': "<class 'pymysql.err.ProgrammingError'>",
+                }
+            ],
+        ),
     ],
 )
 @pytest.mark.parametrize("aurora_replication_role", [None, "writer", "reader"])
@@ -422,6 +440,7 @@ def test_statement_samples_collect(
     explain_strategy,
     schema,
     statement,
+    expected_collection_errors,
     aurora_replication_role,
     caplog,
 ):
@@ -486,6 +505,13 @@ def test_statement_samples_collect(
         assert 'query_block' in json.loads(event['db']['plan']['definition']), "invalid json execution plan"
         assert set(event['ddtags'].split(',')) == expected_tags
 
+    # Validate the events to ensure we've provided an explanation for not providing an exec plan
+    for event in matching:
+        if event['db']['plan']['definition'] is None:
+            assert event['db']['plan']['collection_errors'] == expected_collection_errors
+        else:
+            assert event['db']['plan']['collection_errors'] is None
+
     # we avoid closing these in a try/finally block in order to maintain the connections in case we want to
     # debug the test with --pdb
     mysql_check._statement_samples._close_db_conn()