From 48a8f440e9940e1ed8218ce7b15bc9b338bc725f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Magnus=20Bl=C3=A5udd?=
Date: Tue, 24 Oct 2023 15:24:13 +0200
Subject: [PATCH] Bug#35948153 Problem setting up events due to stale NdbApi
 dictionary cache [#1]

Problem:
A MySQL Server which has been disconnected from schema distribution
fails to set up event operations since the columns of the table can't
be found in the event.

Analysis:
The ndbcluster plugin uses NDB table definitions which are cached by
the NdbApi. These cached objects are reference counted and there can be
multiple versions of the same table in the cache; the intention is that
it should be possible to continue using a table even though it changes
in NDB.

When changing a table in NDB this cache needs to be invalidated, both
on the local MySQL Server and on all other MySQL Servers connected to
the same cluster. Such invalidation is especially important before
installing the table in DD and setting up event subscriptions.

The local MySQL Server cache is invalidated directly when releasing the
reference from the NdbApi after having modified the table. The other
MySQL Servers are primarily invalidated by using schema distribution.
Since schema distribution is event driven the invalidation will happen
promptly, but as with all things in a distributed system there is a
possibility that these events are not handled for some reason. This
means there must be a fallback mechanism which invalidates stale cache
objects.

The reported problem occurs since there is a stale NDB table definition
in the NdbApi cache; it has the same name but different columns than
the current table in NDB. In most cases the NdbApi continues to operate
on a cached NDB table definition, but when setting up events the
"mismatch on version" will be detected inside the NdbApi (due to the
relation between the event and the table). This causes the cache to be
invalidated and the current version to be loaded from NDB. However the
caller is still using the "old" cached table definition and thus the
columns can not be found when it tries to subscribe to them.

Solution:
1) Invalidate the NDB table definition in the schema event handler that
   handles new table created. This covers the case where a table is
   dropped directly in NDB, using for example ndb_drop_table or
   ndb_restore, and then subsequently created using SQL. This scenario
   is covered by the existing metadata_sync test cases, which would be
   detected by 4) before this part of the fix.

2) Invalidate the NDB table definition before table schema
   synchronization installs tables in DD and sets up the event
   subscription. This function handles the case when schema
   distribution is reconnecting to the cluster and a table it knew
   about earlier has changed while the schema distribution event
   handlers have not been active. This scenario is tested by the
   drop_util_table test case.

3) Invalidate the NDB table definition in the schema distribution event
   handler used for drop table and cluster failure. At this point it is
   well known that the table does not exist or that its status is
   unknown. Earlier this invalidation was only performed if there was a
   version mismatch in the event vs. table relation.

4) Detect when the problem occurs by checking, in the function that
   sets up the event subscription, that the NDB table definition has
   not been invalidated (by NdbApi event functions). It is currently
   not possible to handle the problem this far down, but at least it
   can be detected and a fix added to the callers. This detection is
   only done in debug compile.
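The fallback invalidation performed by 1) - 3) follows the pattern
sketched below (illustration only, not part of the change; 'ndb',
'db_name' and 'table_name' are placeholders, while Ndb_table_guard with
its invalidate() and get_table() functions is the helper visible in the
hunks that follow):

  {
    // Throw away any cached (potentially stale) definition of the table
    Ndb_table_guard invalidator_guard(ndb, db_name, table_name);
    invalidator_guard.invalidate();
  }
  // Open the table again, the current definition is now fetched from NDB
  Ndb_table_guard ndbtab_g(ndb, db_name, table_name);
  const NdbDictionary::Table *ndbtab = ndbtab_g.get_table();
  if (ndbtab == nullptr) {
    // The table does not exist in NDB (or could not be opened)
  }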
Change-Id: I4ed6efb9308be0022e99c51eb23ecf583805b1f4
---
 storage/ndb/plugin/ha_ndbcluster_binlog.cc   | 23 +++++++++++---------
 storage/ndb/plugin/ndb_dd_sync.cc            |  6 +++++
 storage/ndb/src/ndbapi/NdbDictionaryImpl.cpp |  4 ++++
 3 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/storage/ndb/plugin/ha_ndbcluster_binlog.cc b/storage/ndb/plugin/ha_ndbcluster_binlog.cc
index 3a24fd8adaff..6b0162274efa 100644
--- a/storage/ndb/plugin/ha_ndbcluster_binlog.cc
+++ b/storage/ndb/plugin/ha_ndbcluster_binlog.cc
@@ -1240,18 +1240,13 @@ static void ndbcluster_binlog_event_operation_teardown(THD *thd, Ndb *is_ndb,
       Ndb_event_data::get_event_data(pOp->getCustomData());
   NDB_SHARE *const share = event_data->share;
 
-  // Invalidate any cached NdbApi table if object version is lower
-  // than what was used when setting up the NdbEventOperation
-  // NOTE! This functionality need to be explained further
   {
-    Thd_ndb *thd_ndb = get_thd_ndb(thd);
-    Ndb *ndb = thd_ndb->ndb;
-    Ndb_table_guard ndbtab_g(ndb, share->db, share->table_name);
-    const NDBTAB *ev_tab = pOp->getTable();
-    const NDBTAB *cache_tab = ndbtab_g.get_table();
-    if (cache_tab && cache_tab->getObjectId() == ev_tab->getObjectId() &&
-        cache_tab->getObjectVersion() <= ev_tab->getObjectVersion())
+    // Since the table has been dropped or the cluster connection lost, the
+    // NdbApi table should be invalidated in the global dictionary cache
+    Ndb_table_guard ndbtab_g(is_ndb, share->db, share->table_name);
+    if (ndbtab_g.get_table()) {
       ndbtab_g.invalidate();
+    }
   }
 
   // Close the table in MySQL Server
@@ -3198,6 +3193,8 @@ class Ndb_schema_event_handler {
     if (schema->node_id == own_nodeid()) return;
 
     write_schema_op_to_binlog(m_thd, schema);
 
+    ndbapi_invalidate_table(schema->db, schema->name);
+    ndb_tdc_close_cached_table(m_thd, schema->db, schema->name);
     if (!create_table_from_engine(schema->db, schema->name,
                                   true, /* force_overwrite */
@@ -5058,6 +5055,12 @@ static int ndbcluster_setup_binlog_for_share(THD *thd, Ndb *ndb,
       return -1;
     }
   }
 
+  // The function that checks if the event exists will silently mark the NDB
+  // table definition as 'Invalid' when the event's table version does not
+  // match the cached NDB table definition's version. This indicates that the
+  // caller has used a stale version of the NDB table definition and is a
+  // problem which has to be fixed by the caller of this function.
+  assert(ndbtab->getObjectStatus() != NdbDictionary::Object::Invalid);
   if (share->have_event_operation()) {
     DBUG_PRINT("info", ("binlogging already setup"));
diff --git a/storage/ndb/plugin/ndb_dd_sync.cc b/storage/ndb/plugin/ndb_dd_sync.cc
index bf0281ed7d90..aa775d2cdaac 100644
--- a/storage/ndb/plugin/ndb_dd_sync.cc
+++ b/storage/ndb/plugin/ndb_dd_sync.cc
@@ -1187,6 +1187,12 @@ bool Ndb_dd_sync::synchronize_table(const char *schema_name,
                                     const char *table_name) const {
   ndb_log_verbose(1, "Synchronizing table '%s.%s'", schema_name, table_name);
 
+  {
+    // Invalidate potentially stale cached table
+    Ndb_table_guard ndbtab_g(m_thd_ndb->ndb, schema_name, table_name);
+    ndbtab_g.invalidate();
+  }
+
   Ndb_table_guard ndbtab_g(m_thd_ndb->ndb, schema_name, table_name);
   const NdbDictionary::Table *ndbtab = ndbtab_g.get_table();
   if (!ndbtab) {
diff --git a/storage/ndb/src/ndbapi/NdbDictionaryImpl.cpp b/storage/ndb/src/ndbapi/NdbDictionaryImpl.cpp
index 0c62ebf5505f..0c9bbb50bcf1 100644
--- a/storage/ndb/src/ndbapi/NdbDictionaryImpl.cpp
+++ b/storage/ndb/src/ndbapi/NdbDictionaryImpl.cpp
@@ -5466,6 +5466,10 @@ NdbEventImpl *NdbDictionaryImpl::getEvent(const char *eventName,
       ((Uint32)tab->m_id != ev->m_table_id) ||
       (table_version_major(tab->m_version) !=
        table_version_major(ev->m_table_version))) {
+    // The table id or version does not match the table in the NdbApi dict
+    // cache. The cached table is invalidated and fetched from NDB again. For
+    // the NdbApi user this has the effect that a different version of the
+    // table is used after calling NdbApi event functions.
     DBUG_PRINT("info", ("mismatch on verison in cache"));
     releaseTableGlobal(*tab, 1);
     tab = fetchGlobalTableImplRef(InitTable(ev->getTableName()));
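
Note (illustration only, not part of the patch): from an NdbApi user's
point of view, the behaviour described in the new getEvent() comment can
be handled by checking the object status after calling NdbApi event
functions and re-fetching the table if it has been marked Invalid. The
table name "t1" and the surrounding setup are placeholders; getDictionary(),
getTable(), getObjectStatus() and invalidateTable() are existing NdbApi
calls.

  NdbDictionary::Dictionary *dict = ndb->getDictionary();
  const NdbDictionary::Table *tab = dict->getTable("t1");
  // ... NdbApi event functions may detect a version mismatch and silently
  // mark the cached definition held here as Invalid ...
  if (tab != nullptr &&
      tab->getObjectStatus() == NdbDictionary::Object::Invalid) {
    // The definition held here is stale: drop it from the cache and fetch
    // the current version from NDB
    dict->invalidateTable("t1");
    tab = dict->getTable("t1");
  }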