From 8446d6aa8dd0531baa48ebade897b37f55de5208 Mon Sep 17 00:00:00 2001 From: Rebecca Taft Date: Wed, 22 Jul 2020 16:30:44 -0500 Subject: [PATCH] stats: fix race condition when refreshing cache This commit fixes a race condition that was introduced in #51616. The race condition existed because it was possible for two refreshes to be triggered in close succession, and there was no mechanism to prevent the first refresh from overwriting the second if it was delayed. We fix the race condition by introducing two boolean flags into each cache entry: refreshing and mustRefreshAgain. If refreshing is true, that indicates that the current statistics for this table are stale, and we are in the process of fetching the updated stats from the database. In the mean time, other callers can use the stale stats and do not need to wait. If a goroutine tries to perform a refresh when a refresh is already in progress, it will see that refreshing=true and will set the mustRefreshAgain flag to true before returning. When the original goroutine that was performing the refresh returns from the database and sees that mustRefreshAgain=true, it will trigger another refresh. Informs #50863 Release note: None --- pkg/sql/stats/stats_cache.go | 47 +++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/pkg/sql/stats/stats_cache.go b/pkg/sql/stats/stats_cache.go index dc10f41de371..b84e80d9b97d 100644 --- a/pkg/sql/stats/stats_cache.go +++ b/pkg/sql/stats/stats_cache.go @@ -65,11 +65,25 @@ type TableStatisticsCache struct { // The cache stores *cacheEntry objects. The fields are protected by the // cache-wide mutex. type cacheEntry struct { - // If true, we are in the process of updating the statistics for this - // table. Other callers can wait on the waitCond until this is false. + // If mustWait is true, we do not have any statistics for this table and we + // are in the process of fetching the stats from the database. Other callers + // can wait on the waitCond until this is false. mustWait bool waitCond sync.Cond + // If refreshing is true, the current statistics for this table are stale, + // and we are in the process of fetching the updated stats from the database. + // In the mean time, other callers can use the stale stats and do not need to + // wait. + // + // If a goroutine tries to perform a refresh when a refresh is already + // in progress, it will see that refreshing=true and will set the + // mustRefreshAgain flag to true before returning. When the original + // goroutine that was performing the refresh returns from the database and + // sees that mustRefreshAgain=true, it will trigger another refresh. + refreshing bool + mustRefreshAgain bool + stats []*TableStatistic // err is populated if the internal query to retrieve stats hit an error. @@ -246,18 +260,33 @@ func (sc *TableStatisticsCache) refreshCacheEntry(ctx context.Context, tableID s if !found || e.err != nil { return } - sc.mu.numInternalQueries++ + + // Don't perform a refresh if a refresh is already in progress, but let that + // goroutine know it needs to refresh again. + if e.refreshing { + e.mustRefreshAgain = true + return + } + e.refreshing = true var stats []*TableStatistic var err error - func() { - sc.mu.Unlock() - defer sc.mu.Lock() - - stats, err = sc.getTableStatsFromDB(ctx, tableID) - }() + for { + func() { + sc.mu.numInternalQueries++ + sc.mu.Unlock() + defer sc.mu.Lock() + + stats, err = sc.getTableStatsFromDB(ctx, tableID) + }() + if !e.mustRefreshAgain { + break + } + e.mustRefreshAgain = false + } e.stats, e.err = stats, err + e.refreshing = false if err != nil { // Don't keep the cache entry around, so that we retry the query.