DataDog · alexbarksdale · Jun 29, 2021 · Jun 21, 2021 · Jun 21, 2021 · Jun 22, 2021
@@ -5,6 +5,7 @@
 import time
 from concurrent.futures.thread import ThreadPoolExecutor
 from contextlib import closing
+from enum import Enum
 
 import pymysql
 from cachetools import TTLCache
@@ -20,7 +21,7 @@
 from datadog_checks.base.utils.db.utils import ConstantRateLimiter, default_json_event_encoding, resolve_db_host
 from datadog_checks.base.utils.serialization import json
 
-VALID_EXPLAIN_STATEMENTS = frozenset({'select', 'table', 'delete', 'insert', 'replace', 'update'})
+SUPPORTED_EXPLAIN_STATEMENTS = frozenset({'select', 'table', 'delete', 'insert', 'replace', 'update'})
 
 # unless a specific table is configured, we try all of the events_statements tables in descending order of
 # preference
@@ -198,6 +199,21 @@
 )
 
 
+class DBExplainError(Enum):
+    """
+    Denotes the various reasons a query may not have an explain statement.
+    """
+
+    # database error i.e connection error
+    database_error = 'database_error'
+
+    # this could be the result of a missing EXPLAIN function
+    invalid_schema = 'invalid_schema'
+
+    # some statements cannot be explained i.e AUTOVACUUM
+    no_plans_possible = 'no_plans_possible'
+
+
 class MySQLStatementSamples(object):
     """
     Collects statement samples and execution plans.
@@ -479,16 +495,22 @@ def _collect_plan_for_statement(self, row):
             return None
         self._explained_statements_cache[query_cache_key] = True
 
-        plan = None
+        plan, explain_err_code, err = None, None, None
         with closing(self._get_db_connection().cursor()) as cursor:
             try:
-                plan = self._explain_statement(cursor, row['sql_text'], row['current_schema'], obfuscated_statement)
+                plan, explain_err_code, err = self._explain_statement(
+                    cursor, row['sql_text'], row['current_schema'], obfuscated_statement
+                )
             except Exception as e:
                 self._check.count(
                     "dd.mysql.statement_samples.error", 1, tags=self._tags + ["error:explain-{}".format(type(e))]
                 )
                 self._log.exception("Failed to explain statement: %s", obfuscated_statement)
 
+        collection_error = None
+        if explain_err_code:
+            collection_error = {'code': explain_err_code.value, 'message': '{}'.format(type(err)) if err else None}
+
         normalized_plan, obfuscated_plan, plan_signature, plan_cost = None, None, None, None
         if plan:
             normalized_plan = datadog_agent.obfuscate_sql_exec_plan(plan, normalize=True) if plan else None
@@ -512,7 +534,12 @@ def _collect_plan_for_statement(self, row):
                 },
                 "db": {
                     "instance": row['current_schema'],
-                    "plan": {"definition": obfuscated_plan, "cost": plan_cost, "signature": plan_signature},
+                    "plan": {
+                        "definition": obfuscated_plan,
+                        "cost": plan_cost,
+                        "signature": plan_signature,
+                        "collection_error": collection_error,
+                    },
                     "query_signature": query_signature,
                     "resource_hash": apm_resource_hash,
                     "statement": obfuscated_statement,
@@ -657,6 +684,8 @@ def _explain_statement(self, cursor, statement, schema, obfuscated_statement):
         Tries the available methods used to explain a statement for the given schema. If a non-retryable
         error occurs (such as a permissions error), then statements executed under the schema will be
         disallowed in future attempts.
+        returns: An execution plan, otherwise it returns an explain error code and optionally an exception.
+        rtype: Tuple[Optional[Dict], Optional[DBExplainError], Optional[Exception]]
         """
         start_time = time.time()
         strategy_cache_key = 'explain_strategy:%s' % schema
@@ -665,13 +694,17 @@ def _explain_statement(self, cursor, statement, schema, obfuscated_statement):
 
         self._log.debug('explaining statement. schema=%s, statement="%s"', schema, statement)
 
-        if not self._can_explain(obfuscated_statement):
+        ok, db_explain_error = self._can_explain(obfuscated_statement)
+        if not ok:
             self._log.debug('Skipping statement which cannot be explained: %s', obfuscated_statement)
-            return None
+            return None, db_explain_error, None
 
-        if self._collection_strategy_cache.get(strategy_cache_key) == explain_strategy_error:
-            self._log.debug('Skipping statement due to cached collection failure: %s', obfuscated_statement)
-            return None
+        strategy_cache = self._collection_strategy_cache.get(strategy_cache_key)
+        if strategy_cache:
+            strategy_error, db_explain_error, err = strategy_cache
+            if strategy_error == explain_strategy_error:
+                self._log.debug('Skipping statement due to cached collection failure: %s', obfuscated_statement)
+                return None, db_explain_error, err
 
         try:
             # If there was a default schema when this query was run, then switch to it before trying to collect
@@ -682,8 +715,13 @@ def _explain_statement(self, cursor, statement, schema, obfuscated_statement):
         except pymysql.err.DatabaseError as e:
             if len(e.args) != 2:
                 raise
+            db_explain_error = DBExplainError.invalid_schema
             if e.args[0] in PYMYSQL_NON_RETRYABLE_ERRORS:
-                self._collection_strategy_cache[strategy_cache_key] = explain_strategy_error
+                self._collection_strategy_cache[strategy_cache_key] = (
+                    explain_strategy_error,
+                    db_explain_error,
+                    e,
+                )
             self._check.count(
                 "dd.mysql.statement_samples.error", 1, tags=tags + ["error:explain-use-schema-{}".format(type(e))]
             )
@@ -694,26 +732,29 @@ def _explain_statement(self, cursor, statement, schema, obfuscated_statement):
                 schema,
                 obfuscated_statement,
             )
-            return None
+            return None, db_explain_error, e
 
         # Use a cached strategy for the schema, if any, or try each strategy to collect plans
         strategies = list(self._preferred_explain_strategies)
-        cached = self._collection_strategy_cache.get(strategy_cache_key)
-        if cached:
-            strategies.remove(cached)
-            strategies.insert(0, cached)
+        strategy_cache = self._collection_strategy_cache.get(strategy_cache_key)
+        if strategy_cache:
+            strategy, _, _ = strategy_cache
+            strategies.remove(strategy)
+            strategies.insert(0, strategy)
 
+        db_explain_error, err = None, None
         for strategy in strategies:
             try:
                 if not schema and strategy == "PROCEDURE":
+                    db_explain_error = DBExplainError.invalid_schema
                     self._log.debug(
                         'skipping PROCEDURE strategy as there is no default schema for this statement="%s"',
                         obfuscated_statement,
                     )
                     continue
                 plan = self._explain_strategies[strategy](cursor, statement, obfuscated_statement)
                 if plan:
-                    self._collection_strategy_cache[strategy_cache_key] = strategy
+                    self._collection_strategy_cache[strategy_cache_key] = (strategy, None, None)
                     self._log.debug(
                         'Successfully collected execution plan. strategy=%s, schema=%s, statement="%s"',
                         strategy,
@@ -725,13 +766,14 @@ def _explain_statement(self, cursor, statement, schema, obfuscated_statement):
                         (time.time() - start_time) * 1000,
                         tags=self._tags + ["strategy:{}".format(strategy)],
                     )
-                    return plan
+                    return plan, None, None
             except pymysql.err.DatabaseError as e:
                 if len(e.args) != 2:
                     raise
                 # we don't cache failed plan collection failures for specific queries because some queries in a schema
                 # can fail while others succeed. The failed collection will be cached for the specific query
                 # so we won't try to explain it again for the cache duration there.
+                db_explain_error, err = DBExplainError.database_error, e
                 self._check.count(
                     "dd.mysql.statement_samples.error",
                     1,
@@ -745,7 +787,7 @@ def _explain_statement(self, cursor, statement, schema, obfuscated_statement):
                     obfuscated_statement,
                 )
                 continue
-        return None
+        return None, db_explain_error, err
 
     def _run_explain(self, cursor, statement, obfuscated_statement):
         """
@@ -773,7 +815,9 @@ def _run_fully_qualified_explain_procedure(self, cursor, statement, obfuscated_s
 
     @staticmethod
     def _can_explain(obfuscated_statement):
-        return obfuscated_statement.split(' ', 1)[0].lower() in VALID_EXPLAIN_STATEMENTS
+        if obfuscated_statement.split(' ', 1)[0].lower() not in SUPPORTED_EXPLAIN_STATEMENTS:
+            return False, DBExplainError.no_plans_possible
+        return True, None
 
     @staticmethod
     def _parse_execution_plan_cost(execution_plan):

@@ -406,11 +406,23 @@ def run_query(q):
 )
 @pytest.mark.parametrize("explain_strategy", ['PROCEDURE', 'FQ_PROCEDURE', 'STATEMENT', None])
 @pytest.mark.parametrize(
-    "schema,statement",
+    "schema,statement,expected_collection_error",
     [
-        (None, 'select name as nam from testdb.users'),
-        ('information_schema', 'select name as nam from testdb.users'),
-        ('testdb', 'select name as nam from users'),
+        (
+            None,
+            'select name as nam from testdb.users',
+            {'code': 'invalid_schema', 'message': None},
+        ),
+        (
+            'information_schema',
+            'select * from testdb.users',
+            {'code': 'database_error', 'message': "<class 'pymysql.err.OperationalError'>"},
+        ),
+        (
+            'testdb',
+            'select name as nam from users',
+            {'code': 'database_error', 'message': "<class 'pymysql.err.ProgrammingError'>"},
+        ),
     ],
 )
 @pytest.mark.parametrize("aurora_replication_role", [None, "writer", "reader"])
@@ -422,6 +434,7 @@ def test_statement_samples_collect(
     explain_strategy,
     schema,
     statement,
+    expected_collection_error,
     aurora_replication_role,
     caplog,
 ):
@@ -486,6 +499,13 @@ def test_statement_samples_collect(
         assert 'query_block' in json.loads(event['db']['plan']['definition']), "invalid json execution plan"
         assert set(event['ddtags'].split(',')) == expected_tags
 
+    # Validate the events to ensure we've provided an explanation for not providing an exec plan
+    for event in matching:
+        if event['db']['plan']['definition'] is None:
+            assert event['db']['plan']['collection_error'] == expected_collection_error
+        else:
+            assert event['db']['plan']['collection_error'] is None
+
     # we avoid closing these in a try/finally block in order to maintain the connections in case we want to
     # debug the test with --pdb
     mysql_check._statement_samples._close_db_conn()