From aa7fa4fac4270409263817383fc080905fdbefdb Mon Sep 17 00:00:00 2001 From: Michael Erickson Date: Tue, 9 Aug 2022 23:12:13 -0700 Subject: [PATCH] sql/stats: forecast table statistics Add function to forecast table statistics based on observed statistics. These forecasts are based on linear regression models over time. For each set of columns with statistics, we construct a linear regression model over time for each statistic (row count, null count, distinct count, average row size, and histogram). If all models are good fits then we produce a statistics forecast for the set of columns. Assists: #79872 Release note: None --- pkg/jobs/jobspb/wrap.go | 3 + pkg/sql/stats/BUILD.bazel | 3 + pkg/sql/stats/forecast.go | 363 ++++++++++++++++++ pkg/sql/stats/forecast_test.go | 650 +++++++++++++++++++++++++++++++++ pkg/sql/stats/histogram.go | 3 + pkg/sql/stats/quantile.go | 2 + pkg/sql/stats/stats_cache.go | 33 ++ 7 files changed, 1057 insertions(+) create mode 100644 pkg/sql/stats/forecast.go create mode 100644 pkg/sql/stats/forecast_test.go diff --git a/pkg/jobs/jobspb/wrap.go b/pkg/jobs/jobspb/wrap.go index 1a9ffa90b706..77b77be0a96d 100644 --- a/pkg/jobs/jobspb/wrap.go +++ b/pkg/jobs/jobspb/wrap.go @@ -87,6 +87,9 @@ const AutoStatsName = "__auto__" // during import. const ImportStatsName = "__import__" +// ForecastStatsName is the name to use for statistic forecasts. +const ForecastStatsName = "__forecast__" + // AutomaticJobTypes is a list of automatic job types that currently exist. var AutomaticJobTypes = [...]Type{ TypeAutoCreateStats, diff --git a/pkg/sql/stats/BUILD.bazel b/pkg/sql/stats/BUILD.bazel index 875b8ac73203..8804363cedf2 100644 --- a/pkg/sql/stats/BUILD.bazel +++ b/pkg/sql/stats/BUILD.bazel @@ -8,6 +8,7 @@ go_library( srcs = [ "automatic_stats.go", "delete_stats.go", + "forecast.go", "histogram.go", "json.go", "new_stat.go", @@ -58,6 +59,7 @@ go_library( "//pkg/util/timeutil/pgdate", "//pkg/util/tracing", "@com_github_cockroachdb_errors//:errors", + "@com_github_cockroachdb_redact//:redact", ], ) @@ -69,6 +71,7 @@ go_test( "automatic_stats_test.go", "create_stats_job_test.go", "delete_stats_test.go", + "forecast_test.go", "histogram_test.go", "main_test.go", "quantile_test.go", diff --git a/pkg/sql/stats/forecast.go b/pkg/sql/stats/forecast.go new file mode 100644 index 000000000000..e1ed47595bb0 --- /dev/null +++ b/pkg/sql/stats/forecast.go @@ -0,0 +1,363 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package stats + +import ( + "context" + "math" + "time" + + "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" + "github.com/cockroachdb/cockroach/pkg/sql/opt/cat" + "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" + "github.com/cockroachdb/cockroach/pkg/sql/types" + "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/errors" + "github.com/cockroachdb/redact" +) + +// minObservationsForForecast is the minimum number of observed statistics +// required to produce a statistics forecast. Forecasts based on 1 or 2 +// observations will always have R² = 1 (perfect goodness of fit) regardless of +// the accuracy of the forecast. 
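+// (A regression line through one or two points always passes through them
+// exactly, so R² is 1 by construction and says nothing about predictive
+// quality until there are at least three observations.)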
+const minObservationsForForecast = 3 + +// minGoodnessOfFit is the minimum R² (goodness of fit) measurement all +// predictive models in a forecast must have for us to use the forecast. +const minGoodnessOfFit = 0.95 + +// maxForecastDistance is the farthest into the future we can forecast from the +// latest observed statistics. +const maxForecastDistance = time.Hour * 24 * 7 + +// ForecastTableStatistics produces zero or more statistics forecasts based on +// the given observed statistics. The observed statistics must be ordered by +// collection time descending, with the latest collected statistics first. The +// observed statistics may be a mixture of statistics for different sets of +// columns, but should not contain statistics for any old nonexistent columns. +// +// Whether a forecast is produced for a set of columns depends on how well the +// observed statistics for that set of columns fit a linear regression model. +// This means a forecast will not necessarily be produced for every set of +// columns in the table. Any forecasts produced will have the same CreatedAt +// time, which will be up to a week after the latest observed statistics (and +// could be in the past, present, or future relative to the current time). Any +// forecasts produced will not necessarily have the same RowCount or be +// consistent with the other forecasts produced. (For example, DistinctCount in +// the forecast for columns {a, b} could very well end up less than +// DistinctCount in the forecast for column {a}.) +// +// ForecastTableStatistics is deterministic: given the same observations it will +// return the same forecasts. +// +// TODO(michae2): Use nil *eval.Context or custom tree.CompareContext instead of +// taking an evalCtx. +func ForecastTableStatistics( + ctx context.Context, evalCtx tree.CompareContext, observed []*TableStatistic, +) []*TableStatistic { + // Early sanity check. We'll check this again in forecastColumnStatistics. + if len(observed) < minObservationsForForecast { + return nil + } + + // To make forecasts deterministic, we must choose a time to forecast at based + // on only the observed statistics. We choose the time of the latest + // statistics + the average time between automatic stats collections, which + // should be roughly when the next automatic stats collection will occur. To + // avoid wildly futuristic predictions we cap this at maxForecastDistance. + latest := observed[0].CreatedAt + horizon := latest.Add(maxForecastDistance) + at := latest.Add(avgRefreshTime(observed)) + if at.After(horizon) { + at = horizon + } + + // Group observed statistics by column set, and remove statistics with + // inverted histograms. + var forecastCols []string + observedByCols := make(map[string][]*TableStatistic) + for _, stat := range observed { + // We don't have a good way to detect inverted statistics right now, so skip + // all statistics with histograms of type BYTES. This means we cannot + // forecast statistics for normal BYTES columns. + // TODO(michae2): Improve this when issue #50655 is fixed. 
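+		// (Histograms for inverted indexes store byte-encoded index keys, so
+		// their ColumnType is reported as BYTES; that is why this check
+		// catches them, at the cost of also skipping real BYTES columns.)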
+ if stat.HistogramData != nil && stat.HistogramData.ColumnType.Family() == types.BytesFamily { + continue + } + colKey := MakeSortedColStatKey(stat.ColumnIDs) + obs, ok := observedByCols[colKey] + if !ok { + forecastCols = append(forecastCols, colKey) + } + observedByCols[colKey] = append(obs, stat) + } + + forecasts := make([]*TableStatistic, 0, len(forecastCols)) + for _, colKey := range forecastCols { + forecast, err := forecastColumnStatistics( + ctx, evalCtx, observedByCols[colKey], at, minGoodnessOfFit, + ) + if err != nil { + log.VEventf( + ctx, 2, "could not forecast statistics for table %v columns %s: %v", + observed[0].TableID, redact.SafeString(colKey), err, + ) + continue + } + forecasts = append(forecasts, forecast) + } + return forecasts +} + +// forecastColumnStatistics produces a statistics forecast at the given time, +// based on the given observed statistics. The observed statistics must all be +// for the same set of columns, must not contain any inverted histograms, must +// have a single observation per collection time, and must be ordered by +// collection time descending with the latest collected statistics first. The +// given time to forecast at can be in the past, present, or future. +// +// To create a forecast, we construct a linear regression model over time for +// each statistic (row count, null count, distinct count, average row size, and +// histogram). If all models are good fits (i.e. have R² >= minRequiredFit) then +// we use them to predict the value of each statistic at the given time. If any +// model except the histogram model is a poor fit (i.e. has R² < minRequiredFit) +// then we return an error instead of a forecast. If the histogram model is a +// poor fit we adjust the latest observed histogram to match predicted values. +// +// forecastColumnStatistics is deterministic: given the same observations and +// forecast time, it will return the same forecast. +func forecastColumnStatistics( + ctx context.Context, + evalCtx tree.CompareContext, + observed []*TableStatistic, + at time.Time, + minRequiredFit float64, +) (forecast *TableStatistic, err error) { + if len(observed) < minObservationsForForecast { + return nil, errors.New("not enough observations to forecast statistics") + } + + forecastAt := float64(at.Unix()) + tableID := observed[0].TableID + columnIDs := observed[0].ColumnIDs + + // Gather inputs for our regression models. + createdAts := make([]float64, len(observed)) + rowCounts := make([]float64, len(observed)) + nullCounts := make([]float64, len(observed)) + // For distinct counts and avg sizes, we skip over empty table stats and + // only-null stats to avoid skew. + nonEmptyCreatedAts := make([]float64, 0, len(observed)) + distinctCounts := make([]float64, 0, len(observed)) + avgSizes := make([]float64, 0, len(observed)) + for i, stat := range observed { + // Guard against multiple observations with the same collection time, to + // avoid skewing the regression models. 
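+		// Since observed is ordered by CreatedAt descending, any duplicate
+		// collection times must be adjacent, so comparing each observation
+		// with its predecessor is sufficient.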
+ if i > 0 && observed[i].CreatedAt.Equal(observed[i-1].CreatedAt) { + return nil, errors.Newf( + "multiple observations with the same collection time %v", observed[i].CreatedAt, + ) + } + createdAts[i] = float64(stat.CreatedAt.Unix()) + rowCounts[i] = float64(stat.RowCount) + nullCounts[i] = float64(stat.NullCount) + if stat.RowCount-stat.NullCount > 0 { + nonEmptyCreatedAts = append(nonEmptyCreatedAts, float64(stat.CreatedAt.Unix())) + distinctCounts = append(distinctCounts, float64(stat.DistinctCount)) + avgSizes = append(avgSizes, float64(stat.AvgSize)) + } + } + + // predict tries to predict the value of the given statistic at forecast time. + predict := func(name redact.SafeString, y, createdAts []float64) (float64, error) { + yₙ, r2 := float64SimpleLinearRegression(createdAts, y, forecastAt) + log.VEventf( + ctx, 3, "forecast for table %v columns %v predicted %s %v R² %v", + tableID, columnIDs, name, yₙ, r2, + ) + if r2 < minRequiredFit { + return yₙ, errors.Newf( + "predicted %v R² %v below min required R² %v", name, r2, minRequiredFit, + ) + } + // Clamp the predicted value to [0, MaxInt64] and round to nearest integer. + if yₙ < 0 { + return 0, nil + } + if yₙ > math.MaxInt64 { + return math.MaxInt64, nil + } + return math.Round(yₙ), nil + } + + rowCount, err := predict("RowCount", rowCounts, createdAts) + if err != nil { + return nil, err + } + nullCount, err := predict("NullCount", nullCounts, createdAts) + if err != nil { + return nil, err + } + var distinctCount, avgSize float64 + if len(nonEmptyCreatedAts) > 0 { + distinctCount, err = predict("DistinctCount", distinctCounts, nonEmptyCreatedAts) + if err != nil { + return nil, err + } + avgSize, err = predict("AvgSize", avgSizes, nonEmptyCreatedAts) + if err != nil { + return nil, err + } + } + + // Adjust predicted statistics for consistency. + if nullCount > rowCount { + nullCount = rowCount + } + nonNullRowCount := rowCount - nullCount + + minDistinctCount := float64(0) + maxDistinctCount := nonNullRowCount + if nonNullRowCount > 0 { + minDistinctCount++ + } + if nullCount > 0 { + minDistinctCount++ + maxDistinctCount++ + } + if distinctCount < minDistinctCount { + distinctCount = minDistinctCount + } + if distinctCount > maxDistinctCount { + distinctCount = maxDistinctCount + } + nonNullDistinctCount := distinctCount + if nullCount > 0 { + nonNullDistinctCount-- + } + + forecast = &TableStatistic{ + TableStatisticProto: TableStatisticProto{ + TableID: tableID, + StatisticID: 0, // TODO(michae2): Add support for SHOW HISTOGRAM. + Name: jobspb.ForecastStatsName, + ColumnIDs: columnIDs, + CreatedAt: at, + RowCount: uint64(rowCount), + DistinctCount: uint64(distinctCount), + NullCount: uint64(nullCount), + AvgSize: uint64(avgSize), + }, + } + + // Try to predict a histogram if there was one in the latest observed + // stats. If we cannot predict a histogram, we will use the latest observed + // histogram. NOTE: If any of the observed histograms were for inverted + // indexes this will produce an incorrect histogram. + if observed[0].HistogramData != nil { + hist, err := predictHistogram( + ctx, evalCtx, observed, forecastAt, minRequiredFit, nonNullRowCount, + ) + if err != nil { + // If we did not successfully predict a histogram then copy the latest + // histogram so we can adjust it. + log.VEventf( + ctx, 3, "forecast for table %v columns %v: could not predict histogram due to: %v", + tableID, columnIDs, err, + ) + hist.buckets = append([]cat.HistogramBucket{}, observed[0].nonNullHistogram().buckets...) 
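+			// (Appending to an empty slice copies the bucket structs, so the
+			// adjustCounts call below mutates the copy rather than the cached
+			// histogram of the latest observed statistic.)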
+ } + + // Now adjust for consistency. + hist.adjustCounts(evalCtx, nonNullRowCount, nonNullDistinctCount) + + // Finally, convert back to HistogramData. + histData, err := hist.toHistogramData(observed[0].HistogramData.ColumnType) + if err != nil { + return nil, err + } + forecast.HistogramData = &histData + forecast.setHistogramBuckets(hist) + } + + return forecast, nil +} + +// predictHistogram tries to predict the histogram at forecast time. +func predictHistogram( + ctx context.Context, + evalCtx tree.CompareContext, + observed []*TableStatistic, + forecastAt float64, + minRequiredFit float64, + nonNullRowCount float64, +) (histogram, error) { + if observed[0].HistogramData == nil { + return histogram{}, errors.New("latest observed stat missing histogram") + } + + // Empty table case. + if nonNullRowCount < 1 { + return histogram{buckets: make([]cat.HistogramBucket, 0)}, nil + } + + tableID := observed[0].TableID + columnIDs := observed[0].ColumnIDs + colType := observed[0].HistogramData.ColumnType + + // Convert histograms to quantile functions. We don't need every observation + // to have a histogram, but we do need at least minObservationsForForecast + // histograms. + createdAts := make([]float64, 0, len(observed)) + quantiles := make([]quantile, 0, len(observed)) + for _, stat := range observed { + if stat.HistogramData == nil { + continue + } + if !stat.HistogramData.ColumnType.Equivalent(colType) { + continue + } + if !canMakeQuantile(stat.HistogramData.Version, stat.HistogramData.ColumnType) { + continue + } + // Skip empty table stats and only-null stats to avoid skew. + if stat.RowCount-stat.NullCount < 1 { + continue + } + q, err := makeQuantile(stat.nonNullHistogram(), float64(stat.RowCount-stat.NullCount)) + if err != nil { + return histogram{}, err + } + createdAts = append(createdAts, float64(stat.CreatedAt.Unix())) + quantiles = append(quantiles, q) + } + + if len(quantiles) < minObservationsForForecast { + return histogram{}, errors.New("not enough observations to forecast histogram") + } + + // Construct a linear regression model of quantile functions over time, and + // use it to predict a quantile function at the given time. + yₙ, r2 := quantileSimpleLinearRegression(createdAts, quantiles, forecastAt) + yₙ = yₙ.fixMalformed() + log.VEventf( + ctx, 3, "forecast for table %v columns %v predicted quantile %v R² %v", + tableID, columnIDs, yₙ, r2, + ) + if r2 < minRequiredFit { + return histogram{}, errors.Newf( + "predicted histogram R² %v below min required R² %v", r2, minRequiredFit, + ) + } + + // Finally, convert the predicted quantile function back to a histogram. + return yₙ.toHistogram(evalCtx, colType, nonNullRowCount) +} diff --git a/pkg/sql/stats/forecast_test.go b/pkg/sql/stats/forecast_test.go new file mode 100644 index 000000000000..b4a4ad43dc5a --- /dev/null +++ b/pkg/sql/stats/forecast_test.go @@ -0,0 +1,650 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. 
+ +package stats + +import ( + "context" + "reflect" + "strconv" + "testing" + "time" + + "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" + "github.com/cockroachdb/cockroach/pkg/settings/cluster" + "github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb" + "github.com/cockroachdb/cockroach/pkg/sql/sem/catid" + "github.com/cockroachdb/cockroach/pkg/sql/sem/eval" + "github.com/cockroachdb/cockroach/pkg/sql/types" + "github.com/cockroachdb/cockroach/pkg/util/timeutil" +) + +// TestForecastColumnStatistics calls forecastColumnStatistics with various +// observed stats. +func TestForecastColumnStatistics(t *testing.T) { + testCases := []struct { + observed []*testStat + at uint64 + err bool + forecast *testStat + }{ + // Error: Too few observations + { + at: 1, + err: true, + }, + // Error: Too few observations + { + observed: []*testStat{ + {at: 1, row: 1, dist: 1, null: 0, size: 1}, + }, + at: 2, + err: true, + }, + // Error: Too few observations + { + observed: []*testStat{ + {at: 1, row: 1, dist: 1, null: 0, size: 1}, + {at: 2, row: 2, dist: 2, null: 0, size: 1}, + }, + at: 3, + err: true, + }, + // Error: Multiple observations with the same collection time + { + observed: []*testStat{ + {at: 1, row: 1, dist: 1, null: 0, size: 1}, + {at: 2, row: 2, dist: 2, null: 0, size: 1}, + {at: 2, row: 2, dist: 2, null: 0, size: 1}, + }, + at: 3, + err: true, + }, + // Constant empty table + { + observed: []*testStat{ + {at: 1, row: 0, dist: 0, null: 0, size: 0}, + {at: 2, row: 0, dist: 0, null: 0, size: 0}, + {at: 3, row: 0, dist: 0, null: 0, size: 0}, + }, + at: 4, + forecast: &testStat{at: 4, row: 0, dist: 0, null: 0, size: 0}, + }, + // Constant all null rows + { + observed: []*testStat{ + {at: 1, row: 1, dist: 1, null: 1, size: 0}, + {at: 2, row: 1, dist: 1, null: 1, size: 0}, + {at: 3, row: 1, dist: 1, null: 1, size: 0}, + }, + at: 4, + forecast: &testStat{at: 4, row: 1, dist: 1, null: 1, size: 0}, + }, + // Constant all non-null rows + { + observed: []*testStat{ + {at: 1, row: 2, dist: 1, null: 0, size: 1}, + {at: 2, row: 2, dist: 1, null: 0, size: 1}, + {at: 3, row: 2, dist: 1, null: 0, size: 1}, + }, + at: 4, + forecast: &testStat{at: 4, row: 2, dist: 1, null: 0, size: 1}, + }, + // Constant null and non-null rows + { + observed: []*testStat{ + {at: 1, row: 9, dist: 4, null: 5, size: 1}, + {at: 2, row: 9, dist: 4, null: 5, size: 1}, + {at: 3, row: 9, dist: 4, null: 5, size: 1}, + }, + at: 4, + forecast: &testStat{at: 4, row: 9, dist: 4, null: 5, size: 1}, + }, + // Growing number of null rows + { + observed: []*testStat{ + {at: 1, row: 7, dist: 2, null: 2, size: 1}, + {at: 2, row: 7, dist: 2, null: 4, size: 1}, + {at: 3, row: 7, dist: 2, null: 6, size: 1}, + }, + at: 4, + forecast: &testStat{at: 4, row: 7, dist: 1, null: 7, size: 1}, + }, + // Shrinking number of null rows + { + observed: []*testStat{ + {at: 1, row: 6, dist: 2, null: 5, size: 0}, + {at: 2, row: 6, dist: 2, null: 3, size: 0}, + {at: 3, row: 6, dist: 2, null: 1, size: 0}, + }, + at: 4, + forecast: &testStat{at: 4, row: 6, dist: 2, null: 0, size: 0}, + }, + // Growing number of non-null rows + { + observed: []*testStat{ + {at: 1, row: 10, dist: 8, null: 3, size: 2}, + {at: 2, row: 20, dist: 18, null: 3, size: 2}, + {at: 3, row: 30, dist: 28, null: 3, size: 2}, + {at: 5, row: 50, dist: 48, null: 3, size: 2}, + }, + at: 6, + forecast: &testStat{at: 6, row: 60, dist: 58, null: 3, size: 2}, + }, + // Shrinking number of non-null rows + { + observed: []*testStat{ + {at: 1, 
row: 13, dist: 10, null: 4, size: 11}, + {at: 2, row: 12, dist: 9, null: 4, size: 11}, + {at: 5, row: 9, dist: 6, null: 4, size: 11}, + }, + at: 9, + forecast: &testStat{at: 9, row: 5, dist: 2, null: 4, size: 11}, + }, + // Growing number of null and non-null rows + { + observed: []*testStat{ + {at: 1, row: 4, dist: 3, null: 2, size: 5}, + {at: 3, row: 5, dist: 2, null: 4, size: 5}, + {at: 5, row: 6, dist: 1, null: 6, size: 5}, + }, + at: 7, + forecast: &testStat{at: 7, row: 7, dist: 1, null: 7, size: 5}, + }, + // Shrinking number of null and non-null rows + { + observed: []*testStat{ + {at: 5, row: 14, dist: 9, null: 4, size: 1}, + {at: 6, row: 10, dist: 6, null: 3, size: 1}, + {at: 7, row: 6, dist: 3, null: 2, size: 1}, + }, + at: 8, + forecast: &testStat{at: 8, row: 2, dist: 2, null: 1, size: 1}, + }, + // Growing distinct count + { + observed: []*testStat{ + {at: 1, row: 21, dist: 3, null: 2, size: 10}, + {at: 2, row: 22, dist: 6, null: 2, size: 10}, + {at: 5, row: 25, dist: 15, null: 2, size: 10}, + {at: 6, row: 26, dist: 18, null: 2, size: 10}, + {at: 7, row: 27, dist: 21, null: 2, size: 10}, + }, + at: 10, + forecast: &testStat{at: 10, row: 30, dist: 29, null: 2, size: 10}, + }, + // Shrinking distinct count + { + observed: []*testStat{ + {at: 5, row: 25, dist: 15, null: 0, size: 1}, + {at: 6, row: 25, dist: 10, null: 0, size: 1}, + {at: 7, row: 25, dist: 5, null: 0, size: 1}, + }, + at: 11, + forecast: &testStat{at: 11, row: 25, dist: 1, null: 0, size: 1}, + }, + // Growing AvgSize + { + observed: []*testStat{ + {at: 2, row: 9, dist: 3, null: 0, size: 1}, + {at: 4, row: 9, dist: 5, null: 0, size: 11}, + {at: 6, row: 9, dist: 7, null: 0, size: 21}, + }, + at: 7, + forecast: &testStat{at: 7, row: 9, dist: 8, null: 0, size: 26}, + }, + // Shrinking AvgSize + { + observed: []*testStat{ + {at: 2, row: 10, dist: 8, null: 0, size: 30}, + {at: 4, row: 10, dist: 8, null: 0, size: 20}, + {at: 6, row: 10, dist: 8, null: 0, size: 10}, + }, + at: 9, + forecast: &testStat{at: 9, row: 10, dist: 8, null: 0, size: 0}, + }, + // Growing from empty table + { + observed: []*testStat{ + {at: 1, row: 0, dist: 0, null: 0, size: 0}, + {at: 2, row: 1, dist: 1, null: 0, size: 2}, + {at: 3, row: 2, dist: 1, null: 0, size: 2}, + }, + at: 4, + forecast: &testStat{at: 4, row: 3, dist: 1, null: 0, size: 2}, + }, + // Shrinking to empty table + { + observed: []*testStat{ + {at: 1, row: 3, dist: 1, null: 0, size: 2}, + {at: 2, row: 2, dist: 1, null: 0, size: 2}, + {at: 3, row: 1, dist: 1, null: 0, size: 2}, + }, + at: 4, + forecast: &testStat{at: 4, row: 0, dist: 0, null: 0, size: 2}, + }, + // Error: RowCount bad fit + { + observed: []*testStat{ + {at: 1, row: 7, dist: 1, null: 1, size: 1}, + {at: 2, row: 9, dist: 2, null: 2, size: 2}, + {at: 3, row: 5, dist: 3, null: 3, size: 3}, + }, + at: 4, + err: true, + }, + // Error: NullCount bad fit + { + observed: []*testStat{ + {at: 1, row: 2, dist: 1, null: 2, size: 1}, + {at: 2, row: 4, dist: 2, null: 0, size: 2}, + {at: 3, row: 6, dist: 3, null: 3, size: 3}, + }, + at: 4, + err: true, + }, + // Error: DistinctCount bad fit + { + observed: []*testStat{ + {at: 1, row: 2, dist: 1, null: 0, size: 1}, + {at: 2, row: 4, dist: 2, null: 0, size: 2}, + {at: 3, row: 6, dist: 5, null: 0, size: 3}, + }, + at: 4, + err: true, + }, + // Error: AvgSize bad fit + { + observed: []*testStat{ + {at: 1, row: 2, dist: 1, null: 0, size: 1}, + {at: 2, row: 4, dist: 2, null: 0, size: 2}, + {at: 3, row: 6, dist: 5, null: 0, size: 3}, + }, + at: 4, + err: true, + }, + // Skip only-null 
stats for DistinctCount + { + observed: []*testStat{ + {at: 1, row: 2, dist: 1, null: 2, size: 1}, + {at: 2, row: 4, dist: 2, null: 2, size: 2}, + {at: 3, row: 6, dist: 5, null: 2, size: 3}, + }, + at: 4, + forecast: &testStat{at: 4, row: 8, dist: 7, null: 2, size: 4}, + }, + // Skip only-null stats for AvgSize + { + observed: []*testStat{ + {at: 1, row: 2, dist: 1, null: 2, size: 0}, + {at: 2, row: 4, dist: 2, null: 2, size: 7}, + {at: 3, row: 6, dist: 3, null: 2, size: 6}, + }, + at: 4, + forecast: &testStat{at: 4, row: 8, dist: 4, null: 2, size: 5}, + }, + // Histogram, constant empty table + { + observed: []*testStat{ + {at: 1, row: 0, dist: 0, null: 0, size: 0, hist: testHistogram{}}, + {at: 2, row: 0, dist: 0, null: 0, size: 0, hist: testHistogram{}}, + {at: 3, row: 0, dist: 0, null: 0, size: 0, hist: testHistogram{}}, + }, + at: 4, + forecast: &testStat{at: 4, row: 0, dist: 0, null: 0, size: 0, hist: testHistogram{}}, + }, + // Histogram, constant all null rows + { + observed: []*testStat{ + {at: 1, row: 1, dist: 1, null: 1, size: 0, hist: testHistogram{}}, + {at: 2, row: 1, dist: 1, null: 1, size: 0, hist: testHistogram{}}, + {at: 3, row: 1, dist: 1, null: 1, size: 0, hist: testHistogram{}}, + }, + at: 4, + forecast: &testStat{at: 4, row: 1, dist: 1, null: 1, size: 0, hist: testHistogram{}}, + }, + // Histogram, constant all non-null rows + { + observed: []*testStat{ + {at: 1, row: 2, dist: 1, null: 0, size: 1, hist: testHistogram{{2, 0, 0, 99}}}, + {at: 2, row: 2, dist: 1, null: 0, size: 1, hist: testHistogram{{2, 0, 0, 99}}}, + {at: 3, row: 2, dist: 1, null: 0, size: 1, hist: testHistogram{{2, 0, 0, 99}}}, + }, + at: 4, + forecast: &testStat{ + at: 4, row: 2, dist: 1, null: 0, size: 1, hist: testHistogram{{2, 0, 0, 99}}, + }, + }, + // Histogram, constant null and non-null rows + { + observed: []*testStat{ + { + at: 1, row: 9, dist: 4, null: 5, size: 1, + hist: testHistogram{{2, 0, 0, 99}, {1, 1, 1, 119}}, + }, + { + at: 2, row: 9, dist: 4, null: 5, size: 1, + hist: testHistogram{{2, 0, 0, 99}, {1, 1, 1, 119}}, + }, + { + at: 3, row: 9, dist: 4, null: 5, size: 1, + hist: testHistogram{{2, 0, 0, 99}, {1, 1, 1, 119}}, + }, + }, + at: 4, + forecast: &testStat{ + at: 4, row: 9, dist: 4, null: 5, size: 1, + hist: testHistogram{{2, 0, 0, 99}, {1, 1, 1, 119}}, + }, + }, + // Histogram, growing number of null and non-null rows + { + observed: []*testStat{ + { + at: 1, row: 4, dist: 3, null: 2, size: 5, + hist: testHistogram{{1, 0, 0, 9000}, {1, 0, 0, 10000}}, + }, + { + at: 3, row: 5, dist: 2, null: 4, size: 5, + hist: testHistogram{{1, 0, 0, 10000}}, + }, + { + at: 5, row: 6, dist: 1, null: 6, size: 5, + hist: testHistogram{}, + }, + }, + at: 7, + forecast: &testStat{ + at: 7, row: 7, dist: 1, null: 7, size: 5, + hist: testHistogram{}, + }, + }, + // Histogram, shrinking number of null and non-null rows + { + observed: []*testStat{ + { + at: 5, row: 14, dist: 9, null: 4, size: 1, + hist: testHistogram{{0, 0, 0, 30}, {1, 9, 7, 40}}, + }, + { + at: 6, row: 10, dist: 6, null: 3, size: 1, + hist: testHistogram{{0, 0, 0, 30}, {1, 6, 4, 40}}, + }, + { + at: 7, row: 6, dist: 3, null: 2, size: 1, + hist: testHistogram{{0, 0, 0, 30}, {1, 3, 1, 40}}, + }, + }, + at: 8, + forecast: &testStat{ + at: 8, row: 2, dist: 2, null: 1, size: 1, + hist: testHistogram{{1, 0, 0, 40}}, + }, + }, + // Histogram, growing distinct count + { + observed: []*testStat{ + { + at: 1, row: 21, dist: 3, null: 2, size: 10, + hist: testHistogram{{0, 0, 0, 85}, {19, 0, 0, 100}}, + }, + { + at: 2, row: 22, dist: 6, null: 2, 
size: 10, + hist: testHistogram{{0, 0, 0, 85}, {17, 3, 3, 100}}, + }, + { + at: 5, row: 25, dist: 15, null: 2, size: 10, + hist: testHistogram{{0, 0, 0, 85}, {11, 12, 12, 100}}, + }, + { + at: 6, row: 26, dist: 18, null: 2, size: 10, + hist: testHistogram{{0, 0, 0, 85}, {9, 15, 15, 100}}, + }, + { + at: 7, row: 27, dist: 21, null: 2, size: 10, + hist: testHistogram{{0, 0, 0, 85}, {7, 18, 18, 100}}, + }, + }, + at: 10, + forecast: &testStat{ + at: 10, row: 30, dist: 29, null: 2, size: 10, + hist: testHistogram{{0, 0, 0, 85}, {1, 27, 27, 100}}, + }, + }, + // Histogram, shrinking distinct count + { + observed: []*testStat{ + { + at: 5, row: 25, dist: 15, null: 0, size: 1, + hist: testHistogram{{7, 0, 0, 404}, {0, 18, 14, 500}}, + }, + { + at: 6, row: 25, dist: 10, null: 0, size: 1, + hist: testHistogram{{10, 0, 0, 404}, {0, 15, 9, 500}}, + }, + { + at: 7, row: 25, dist: 5, null: 0, size: 1, + hist: testHistogram{{13, 0, 0, 404}, {0, 12, 4, 500}}, + }, + }, + at: 11, + forecast: &testStat{ + at: 11, row: 25, dist: 1, null: 0, size: 1, + hist: testHistogram{{25, 0, 0, 404}}, + }, + }, + // Histogram, growing from empty table + { + observed: []*testStat{ + { + at: 1, row: 0, dist: 0, null: 0, size: 0, + hist: testHistogram{}, + }, + { + at: 2, row: 1, dist: 1, null: 0, size: 2, + hist: testHistogram{{1, 0, 0, -2345}}, + }, + { + at: 3, row: 2, dist: 1, null: 0, size: 2, + hist: testHistogram{{2, 0, 0, -2345}}, + }, + }, + at: 4, + forecast: &testStat{ + at: 4, row: 3, dist: 1, null: 0, size: 2, + hist: testHistogram{{3, 0, 0, -2345}}, + }, + }, + // Histogram, shrinking to empty table + { + observed: []*testStat{ + { + at: 1, row: 3, dist: 1, null: 0, size: 2, + hist: testHistogram{{3, 0, 0, 1700}}, + }, + { + at: 2, row: 2, dist: 1, null: 0, size: 2, + hist: testHistogram{{2, 0, 0, 1700}}, + }, + { + at: 3, row: 1, dist: 1, null: 0, size: 2, + hist: testHistogram{{1, 0, 0, 1700}}, + }, + }, + at: 4, + forecast: &testStat{ + at: 4, row: 0, dist: 0, null: 0, size: 2, + hist: testHistogram{}, + }, + }, + // Histogram, skip only-null stats + { + observed: []*testStat{ + { + at: 1, row: 3, dist: 1, null: 3, size: 2, + hist: testHistogram{}, + }, + { + at: 2, row: 5, dist: 2, null: 3, size: 2, + hist: testHistogram{{1, 0, 0, 200}, {0, 1, 1, 800}}, + }, + { + at: 3, row: 7, dist: 3, null: 3, size: 2, + hist: testHistogram{{2, 0, 0, 200}, {0, 2, 2, 800}}, + }, + { + at: 4, row: 9, dist: 4, null: 3, size: 2, + hist: testHistogram{{3, 0, 0, 200}, {0, 3, 3, 800}}, + }, + }, + at: 5, + forecast: &testStat{ + at: 5, row: 11, dist: 5, null: 3, size: 2, + hist: testHistogram{{4, 0, 0, 200}, {1, 3, 2, 800}}, + }, + }, + // Histogram, constant numbers but changing shape + { + observed: []*testStat{ + { + at: 1, row: 16, dist: 7, null: 0, size: 1, + hist: testHistogram{{1, 0, 0, 14}, {2, 6, 2, 15}, {1, 6, 2, 16}}, + }, + { + at: 2, row: 16, dist: 7, null: 0, size: 1, + hist: testHistogram{{1, 0, 0, 14}, {2, 6, 2, 14.75}, {1, 6, 2, 18}}, + }, + { + at: 3, row: 16, dist: 7, null: 0, size: 1, + hist: testHistogram{{1, 0, 0, 14}, {2, 6, 2, 14.5}, {1, 6, 2, 20}}, + }, + { + at: 4, row: 16, dist: 7, null: 0, size: 1, + hist: testHistogram{{1, 0, 0, 14}, {2, 6, 2, 14.25}, {1, 6, 2, 22}}, + }, + }, + at: 5, + forecast: &testStat{ + at: 5, row: 16, dist: 7, null: 0, size: 1, + hist: testHistogram{{9, 0, 0, 14}, {1, 6, 5, 24}}, + }, + }, + // Histogram, too few observations + { + observed: []*testStat{ + { + at: 1, row: 10, dist: 2, null: 0, size: 1, + }, + { + at: 2, row: 20, dist: 2, null: 0, size: 1, + hist: 
testHistogram{{10, 0, 0, 100}, {10, 0, 0, 200}}, + }, + { + at: 3, row: 30, dist: 2, null: 0, size: 1, + hist: testHistogram{{15, 0, 0, 100}, {15, 0, 0, 300}}, + }, + }, + at: 4, + forecast: &testStat{ + at: 4, row: 40, dist: 2, null: 0, size: 1, + hist: testHistogram{{20, 0, 0, 100}, {20, 0, 0, 300}}, + }, + }, + // Histogram, bad fit + { + observed: []*testStat{ + { + at: 1, row: 10, dist: 2, null: 0, size: 1, + hist: testHistogram{{5, 0, 0, 50}, {5, 0, 0, 100}}, + }, + { + at: 2, row: 20, dist: 2, null: 0, size: 1, + hist: testHistogram{{10, 0, 0, 50}, {10, 0, 0, 200}}, + }, + { + at: 3, row: 30, dist: 2, null: 0, size: 1, + hist: testHistogram{{15, 0, 0, 50}, {15, 0, 0, 301}}, + }, + }, + at: 4, + forecast: &testStat{ + at: 4, row: 40, dist: 2, null: 0, size: 1, + hist: testHistogram{{20, 0, 0, 50}, {20, 0, 0, 301}}, + }, + }, + } + ctx := context.Background() + evalCtx := eval.NewTestingEvalContext(cluster.MakeTestingClusterSettings()) + for i, tc := range testCases { + t.Run(strconv.Itoa(i), func(t *testing.T) { + + // Set up observed TableStatistics in CreatedAt desc order. + observed := make([]*TableStatistic, len(tc.observed)) + for j := range tc.observed { + observed[len(observed)-j-1] = tc.observed[j].toTableStatistic("testStat", i) + } + expected := tc.forecast.toTableStatistic(jobspb.ForecastStatsName, i) + at := testStatTime(tc.at) + + forecast, err := forecastColumnStatistics(ctx, evalCtx, observed, at, 1) + if err != nil { + if !tc.err { + t.Errorf("test case %d unexpected forecastColumnStatistics err: %v", i, err) + } + return + } + if tc.err { + t.Errorf("test case %d expected forecastColumnStatistics err, was:\n%s", i, forecast) + return + } + if !reflect.DeepEqual(forecast, expected) { + t.Errorf("test case %d incorrect forecast\n%s\nexpected\n%s", i, forecast, expected) + } + }) + } +} + +type testStat struct { + at, row, dist, null, size uint64 + hist testHistogram +} + +func (ts *testStat) toTableStatistic(name string, tableID int) *TableStatistic { + if ts == nil { + return nil + } + stat := &TableStatistic{ + TableStatisticProto: TableStatisticProto{ + TableID: catid.DescID(tableID), + StatisticID: 0, + Name: name, + ColumnIDs: []descpb.ColumnID{1}, + CreatedAt: testStatTime(ts.at), + RowCount: ts.row, + DistinctCount: ts.dist, + NullCount: ts.null, + AvgSize: ts.size, + }, + } + if ts.hist != nil { + hist := ts.hist.toHistogram() + histData, err := hist.toHistogramData(types.Float) + if err != nil { + panic(err) + } + stat.HistogramData = &histData + stat.setHistogramBuckets(hist) + } + return stat +} + +func testStatTime(at uint64) time.Time { + return timeutil.Unix(int64(at), 0) +} diff --git a/pkg/sql/stats/histogram.go b/pkg/sql/stats/histogram.go index 34cac9611c81..6a3f87784008 100644 --- a/pkg/sql/stats/histogram.go +++ b/pkg/sql/stats/histogram.go @@ -311,6 +311,9 @@ func (h *histogram) adjustCounts( adjustmentFactorDistinctRange = (distinctCountTotal - distinctCountEq) / distinctCountRange } adjustmentFactorRowCount := rowCountTotal / (rowCountRange + rowCountEq) + // TODO(michae2): Consider moving this section above the sections adjusting + // NumEq and NumRange for distinct counts. This would help the adjustments be + // less surprising in some cases. 
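+	// Scale all buckets by the same factors so the totals match the target
+	// row count and distinct count while preserving the histogram's shape.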
for i := range h.buckets { h.buckets[i].DistinctRange *= adjustmentFactorDistinctRange h.buckets[i].NumRange *= adjustmentFactorRowCount diff --git a/pkg/sql/stats/quantile.go b/pkg/sql/stats/quantile.go index 7990c97d7604..e48130f36109 100644 --- a/pkg/sql/stats/quantile.go +++ b/pkg/sql/stats/quantile.go @@ -283,6 +283,8 @@ func (q quantile) toHistogram( // Steal from NumRange so that NumEq is at least 1, if it wouldn't make // NumRange 0. This makes the histogram look more like something // EquiDepthHistogram would produce. + // TODO(michae2): Consider removing this logic if statistics_builder + // doesn't need it. currentBucket.NumRange -= 1 - numEq numEq = 1 } diff --git a/pkg/sql/stats/stats_cache.go b/pkg/sql/stats/stats_cache.go index 0a8eba36ae7d..b3e7c13502cf 100644 --- a/pkg/sql/stats/stats_cache.go +++ b/pkg/sql/stats/stats_cache.go @@ -12,6 +12,7 @@ package stats import ( "context" + "fmt" "sync" "github.com/cockroachdb/cockroach/pkg/keys" @@ -597,6 +598,8 @@ func DecodeHistogramBuckets(tabStat *TableStatistic) error { // make histograms easier to work with. The length of res.Histogram // is therefore 1 greater than the length of the histogram data // buckets. + // TODO(michae2): Combine this with setHistogramBuckets, especially if we + // need to change both after #6224 is fixed (NULLS LAST in index ordering). tabStat.Histogram = make([]cat.HistogramBucket, len(tabStat.HistogramData.Buckets)+1) tabStat.Histogram[0] = cat.HistogramBucket{ NumEq: float64(tabStat.NullCount), @@ -628,6 +631,36 @@ func DecodeHistogramBuckets(tabStat *TableStatistic) error { return nil } +// setHistogramBuckets shallow-copies the passed histogram into the +// TableStatistic, and prepends a bucket for NULL rows using the +// TableStatistic's null count. The resulting TableStatistic looks the same as +// if DecodeHistogramBuckets had been called. +func (tabStat *TableStatistic) setHistogramBuckets(hist histogram) { + tabStat.Histogram = hist.buckets + if tabStat.NullCount > 0 { + tabStat.Histogram = append([]cat.HistogramBucket{{ + NumEq: float64(tabStat.NullCount), + UpperBound: tree.DNull, + }}, tabStat.Histogram...) + } +} + +// nonNullHistogram returns the TableStatistic histogram with the NULL bucket +// removed. +func (tabStat *TableStatistic) nonNullHistogram() histogram { + if len(tabStat.Histogram) > 0 && tabStat.Histogram[0].UpperBound == tree.DNull { + return histogram{buckets: tabStat.Histogram[1:]} + } + return histogram{buckets: tabStat.Histogram} +} + +// String implements the fmt.Stringer interface. +func (tabStat *TableStatistic) String() string { + return fmt.Sprintf( + "%s histogram:%s", &tabStat.TableStatisticProto, histogram{buckets: tabStat.Histogram}, + ) +} + // getTableStatsFromDB retrieves the statistics in system.table_statistics // for the given table ID. //
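Note: this patch calls float64SimpleLinearRegression and
quantileSimpleLinearRegression but does not define them, so they are assumed
to exist elsewhere in pkg/sql/stats. For readers following the math, here is
a minimal, illustrative sketch (not the CockroachDB implementation; the name
simpleLinearRegression is hypothetical) of what such a helper is assumed to
compute: an ordinary-least-squares fit over (x, y) points, returning the
prediction at xn together with the R² goodness of fit that
forecastColumnStatistics compares against minRequiredFit.

	func simpleLinearRegression(x, y []float64, xn float64) (yn, r2 float64) {
		// Means of the observed times and values.
		n := float64(len(x))
		var sumX, sumY float64
		for i := range x {
			sumX += x[i]
			sumY += y[i]
		}
		meanX, meanY := sumX/n, sumY/n

		// Least-squares slope and intercept:
		// slope = Σ(xᵢ-x̄)(yᵢ-ȳ) / Σ(xᵢ-x̄)², intercept = ȳ - slope·x̄.
		var sxy, sxx float64
		for i := range x {
			sxy += (x[i] - meanX) * (y[i] - meanY)
			sxx += (x[i] - meanX) * (x[i] - meanX)
		}
		slope := sxy / sxx
		intercept := meanY - slope*meanX

		// R² = 1 - SSres/SStot, which lies in [0, 1] for an OLS fit.
		var ssRes, ssTot float64
		for i := range x {
			pred := intercept + slope*x[i]
			ssRes += (y[i] - pred) * (y[i] - pred)
			ssTot += (y[i] - meanY) * (y[i] - meanY)
		}
		if ssTot == 0 {
			// A constant signal is fit exactly. This matches the constant-table
			// test cases above, which pass minRequiredFit = 1 and still expect
			// a forecast rather than an error.
			r2 = 1
		} else {
			r2 = 1 - ssRes/ssTot
		}
		return intercept + slope*xn, r2
	}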