From 0f90a99883bef8f0c674ace3fe5f5319dad35306 Mon Sep 17 00:00:00 2001 From: Dylan Guedes Date: Mon, 18 Apr 2022 10:27:49 -0300 Subject: [PATCH 1/9] Add two metrics to the IndexGateway. - Add a new `query_readiness_duration_seconds` metric, that reports query readiness duration of a tablemanager/index gateway instance. We should use it later to report performance against the ring mode - Add a new `usersToBeQueryReadyForTotal` metric, that reports number of users involved in the query readiness operation. We should use it later to correlate number of users with the query readiness duration. --- pkg/storage/stores/shipper/downloads/metrics.go | 14 ++++++++++++++ .../stores/shipper/downloads/table_manager.go | 11 +++++++++++ 2 files changed, 25 insertions(+) diff --git a/pkg/storage/stores/shipper/downloads/metrics.go b/pkg/storage/stores/shipper/downloads/metrics.go index aff0c93bc5be8..bf02f7d20db76 100644 --- a/pkg/storage/stores/shipper/downloads/metrics.go +++ b/pkg/storage/stores/shipper/downloads/metrics.go @@ -3,6 +3,7 @@ package downloads import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" + "github.com/weaveworks/common/instrument" ) const ( @@ -14,6 +15,8 @@ type metrics struct { queryTimeTableDownloadDurationSeconds *prometheus.CounterVec tablesSyncOperationTotal *prometheus.CounterVec tablesDownloadOperationDurationSeconds prometheus.Gauge + ensureQueryReadinessDurationSeconds prometheus.Histogram + usersToBeQueryReadyForTotal prometheus.Gauge } func newMetrics(r prometheus.Registerer) *metrics { @@ -23,6 +26,17 @@ func newMetrics(r prometheus.Registerer) *metrics { Name: "query_time_table_download_duration_seconds", Help: "Time (in seconds) spent in downloading of files per table at query time", }, []string{"table"}), + ensureQueryReadinessDurationSeconds: promauto.With(r).NewHistogram(prometheus.HistogramOpts{ + Namespace: "loki_boltdb_shipper", + Name: "query_readiness_duration_seconds", + Help: "Time (in seconds) spent making an index gateway ready to be queried", + Buckets: instrument.DefBuckets, + }), + usersToBeQueryReadyForTotal: promauto.With(r).NewGauge(prometheus.GaugeOpts{ + Namespace: "loki_boltdb_shipper", + Name: "users_to_be_query_ready_for_total", + Help: "Total number of users an index gateway instance has to be query ready for.", + }), tablesSyncOperationTotal: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ Namespace: "loki_boltdb_shipper", Name: "tables_sync_operation_total", diff --git a/pkg/storage/stores/shipper/downloads/table_manager.go b/pkg/storage/stores/shipper/downloads/table_manager.go index b552cc272351a..0ac7104b1989c 100644 --- a/pkg/storage/stores/shipper/downloads/table_manager.go +++ b/pkg/storage/stores/shipper/downloads/table_manager.go @@ -241,6 +241,14 @@ func (tm *TableManager) cleanupCache() error { // ensureQueryReadiness compares tables required for being query ready with the tables we already have and downloads the missing ones. func (tm *TableManager) ensureQueryReadiness(ctx context.Context) error { + start := time.Now() + usersToBeQueryReadyLen := 0 + defer func() { + duration := time.Since(start) + tm.metrics.ensureQueryReadinessDurationSeconds.Observe(duration.Seconds()) + tm.metrics.usersToBeQueryReadyForTotal.Set(float64(usersToBeQueryReadyLen)) + }() + activeTableNumber := getActiveTableNumber() // find the largest query readiness number @@ -309,6 +317,9 @@ func (tm *TableManager) ensureQueryReadiness(ctx context.Context) error { return err } + usersToBeQueryReadyLen += len(usersToBeQueryReadyFor) + + level.Debug(util_log.Logger).Log("msg", "instance should be query ready for users", "users", usersToBeQueryReadyFor) if err := table.EnsureQueryReadiness(ctx, usersToBeQueryReadyFor); err != nil { return err } From 63321f931b1888b5fe6f70a426f675d9f01a9424 Mon Sep 17 00:00:00 2001 From: Dylan Guedes Date: Wed, 20 Apr 2022 13:16:25 -0300 Subject: [PATCH 2/9] Remove `usersToBeQueryReadyForTotal`. - It will report all users always for now, so it isn't too helpful the way it is. --- pkg/storage/stores/shipper/downloads/metrics.go | 6 ------ pkg/storage/stores/shipper/downloads/table_manager.go | 4 ---- 2 files changed, 10 deletions(-) diff --git a/pkg/storage/stores/shipper/downloads/metrics.go b/pkg/storage/stores/shipper/downloads/metrics.go index bf02f7d20db76..6b7d0692904ea 100644 --- a/pkg/storage/stores/shipper/downloads/metrics.go +++ b/pkg/storage/stores/shipper/downloads/metrics.go @@ -16,7 +16,6 @@ type metrics struct { tablesSyncOperationTotal *prometheus.CounterVec tablesDownloadOperationDurationSeconds prometheus.Gauge ensureQueryReadinessDurationSeconds prometheus.Histogram - usersToBeQueryReadyForTotal prometheus.Gauge } func newMetrics(r prometheus.Registerer) *metrics { @@ -32,11 +31,6 @@ func newMetrics(r prometheus.Registerer) *metrics { Help: "Time (in seconds) spent making an index gateway ready to be queried", Buckets: instrument.DefBuckets, }), - usersToBeQueryReadyForTotal: promauto.With(r).NewGauge(prometheus.GaugeOpts{ - Namespace: "loki_boltdb_shipper", - Name: "users_to_be_query_ready_for_total", - Help: "Total number of users an index gateway instance has to be query ready for.", - }), tablesSyncOperationTotal: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ Namespace: "loki_boltdb_shipper", Name: "tables_sync_operation_total", diff --git a/pkg/storage/stores/shipper/downloads/table_manager.go b/pkg/storage/stores/shipper/downloads/table_manager.go index 0ac7104b1989c..dc8b6be2c06dc 100644 --- a/pkg/storage/stores/shipper/downloads/table_manager.go +++ b/pkg/storage/stores/shipper/downloads/table_manager.go @@ -242,11 +242,9 @@ func (tm *TableManager) cleanupCache() error { // ensureQueryReadiness compares tables required for being query ready with the tables we already have and downloads the missing ones. func (tm *TableManager) ensureQueryReadiness(ctx context.Context) error { start := time.Now() - usersToBeQueryReadyLen := 0 defer func() { duration := time.Since(start) tm.metrics.ensureQueryReadinessDurationSeconds.Observe(duration.Seconds()) - tm.metrics.usersToBeQueryReadyForTotal.Set(float64(usersToBeQueryReadyLen)) }() activeTableNumber := getActiveTableNumber() @@ -317,8 +315,6 @@ func (tm *TableManager) ensureQueryReadiness(ctx context.Context) error { return err } - usersToBeQueryReadyLen += len(usersToBeQueryReadyFor) - level.Debug(util_log.Logger).Log("msg", "instance should be query ready for users", "users", usersToBeQueryReadyFor) if err := table.EnsureQueryReadiness(ctx, usersToBeQueryReadyFor); err != nil { return err From cbbd4b6829ac41ba6b736505e159761160dd5ecb Mon Sep 17 00:00:00 2001 From: Dylan Guedes Date: Wed, 20 Apr 2022 13:17:56 -0300 Subject: [PATCH 3/9] Rename metric help text to not mislead people. --- pkg/storage/stores/shipper/downloads/metrics.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/storage/stores/shipper/downloads/metrics.go b/pkg/storage/stores/shipper/downloads/metrics.go index 6b7d0692904ea..aba157a9526db 100644 --- a/pkg/storage/stores/shipper/downloads/metrics.go +++ b/pkg/storage/stores/shipper/downloads/metrics.go @@ -28,7 +28,7 @@ func newMetrics(r prometheus.Registerer) *metrics { ensureQueryReadinessDurationSeconds: promauto.With(r).NewHistogram(prometheus.HistogramOpts{ Namespace: "loki_boltdb_shipper", Name: "query_readiness_duration_seconds", - Help: "Time (in seconds) spent making an index gateway ready to be queried", + Help: "Time (in seconds) spent making this instance ready to be queried", Buckets: instrument.DefBuckets, }), tablesSyncOperationTotal: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ From f9c520ab76835d3b4e4ee833e1eb61adfcb86c7f Mon Sep 17 00:00:00 2001 From: Dylan Guedes Date: Wed, 20 Apr 2022 13:19:31 -0300 Subject: [PATCH 4/9] Log queryReadiness duration. --- pkg/storage/stores/shipper/downloads/table_manager.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/storage/stores/shipper/downloads/table_manager.go b/pkg/storage/stores/shipper/downloads/table_manager.go index dc8b6be2c06dc..378a342a7b0a0 100644 --- a/pkg/storage/stores/shipper/downloads/table_manager.go +++ b/pkg/storage/stores/shipper/downloads/table_manager.go @@ -315,7 +315,7 @@ func (tm *TableManager) ensureQueryReadiness(ctx context.Context) error { return err } - level.Debug(util_log.Logger).Log("msg", "instance should be query ready for users", "users", usersToBeQueryReadyFor) + level.Debug(util_log.Logger).Log("msg", "instance should be query ready for users", "users", usersToBeQueryReadyFor, "query_readiness_duration", time.Since(start)) if err := table.EnsureQueryReadiness(ctx, usersToBeQueryReadyFor); err != nil { return err } From b4737dcaac048a813e477b7cbf58d57d133f97a6 Mon Sep 17 00:00:00 2001 From: Dylan Guedes Date: Mon, 25 Apr 2022 10:04:48 -0300 Subject: [PATCH 5/9] Fix where log message and duration and triggered. --- pkg/storage/stores/shipper/downloads/table_manager.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/storage/stores/shipper/downloads/table_manager.go b/pkg/storage/stores/shipper/downloads/table_manager.go index 378a342a7b0a0..765e279a246d0 100644 --- a/pkg/storage/stores/shipper/downloads/table_manager.go +++ b/pkg/storage/stores/shipper/downloads/table_manager.go @@ -243,8 +243,7 @@ func (tm *TableManager) cleanupCache() error { func (tm *TableManager) ensureQueryReadiness(ctx context.Context) error { start := time.Now() defer func() { - duration := time.Since(start) - tm.metrics.ensureQueryReadinessDurationSeconds.Observe(duration.Seconds()) + tm.metrics.ensureQueryReadinessDurationSeconds.Observe(time.Since(start).Seconds()) }() activeTableNumber := getActiveTableNumber() @@ -315,10 +314,11 @@ func (tm *TableManager) ensureQueryReadiness(ctx context.Context) error { return err } - level.Debug(util_log.Logger).Log("msg", "instance should be query ready for users", "users", usersToBeQueryReadyFor, "query_readiness_duration", time.Since(start)) + perTableStart := time.Now() if err := table.EnsureQueryReadiness(ctx, usersToBeQueryReadyFor); err != nil { return err } + level.Debug(util_log.Logger).Log("msg", "instance query ready for users", "users", usersToBeQueryReadyFor, "query_readiness_duration", time.Since(perTableStart), "table", tableName) } return nil From ddd4380e51432f2f1725e5e3acafa7c7e05f11b5 Mon Sep 17 00:00:00 2001 From: Dylan Guedes Date: Tue, 26 Apr 2022 10:19:12 -0300 Subject: [PATCH 6/9] Join users list in a single string. - This is necessary since go-kit doesn't support array type. --- pkg/storage/stores/shipper/downloads/table_manager.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pkg/storage/stores/shipper/downloads/table_manager.go b/pkg/storage/stores/shipper/downloads/table_manager.go index 765e279a246d0..77c68cbe783a0 100644 --- a/pkg/storage/stores/shipper/downloads/table_manager.go +++ b/pkg/storage/stores/shipper/downloads/table_manager.go @@ -7,6 +7,7 @@ import ( "path/filepath" "regexp" "strconv" + "strings" "sync" "time" @@ -318,7 +319,8 @@ func (tm *TableManager) ensureQueryReadiness(ctx context.Context) error { if err := table.EnsureQueryReadiness(ctx, usersToBeQueryReadyFor); err != nil { return err } - level.Debug(util_log.Logger).Log("msg", "instance query ready for users", "users", usersToBeQueryReadyFor, "query_readiness_duration", time.Since(perTableStart), "table", tableName) + joinedUsers := strings.Join(usersToBeQueryReadyFor, ",") + level.Debug(util_log.Logger).Log("msg", "instance query ready for users", "users", joinedUsers, "query_readiness_duration", time.Since(perTableStart), "table", tableName) } return nil From e90a816cfd8af6d3f7f7d700d7cee1d83252adea Mon Sep 17 00:00:00 2001 From: Dylan Guedes Date: Wed, 27 Apr 2022 09:49:44 -0300 Subject: [PATCH 7/9] Tweak queryReadiness log messages. - As suggested by Ed on https://github.com/grafana/loki/pull/5972#discussion_r859734129 and https://github.com/grafana/loki/pull/5972#discussion_r859736072 --- pkg/storage/stores/shipper/downloads/table_manager.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/storage/stores/shipper/downloads/table_manager.go b/pkg/storage/stores/shipper/downloads/table_manager.go index 77c68cbe783a0..06e84c1ec6093 100644 --- a/pkg/storage/stores/shipper/downloads/table_manager.go +++ b/pkg/storage/stores/shipper/downloads/table_manager.go @@ -244,6 +244,7 @@ func (tm *TableManager) cleanupCache() error { func (tm *TableManager) ensureQueryReadiness(ctx context.Context) error { start := time.Now() defer func() { + level.Info(util_log.Logger).Log("msg", "query readiness setup completed", "duration", time.Since(start)) tm.metrics.ensureQueryReadinessDurationSeconds.Observe(time.Since(start).Seconds()) }() @@ -320,7 +321,7 @@ func (tm *TableManager) ensureQueryReadiness(ctx context.Context) error { return err } joinedUsers := strings.Join(usersToBeQueryReadyFor, ",") - level.Debug(util_log.Logger).Log("msg", "instance query ready for users", "users", joinedUsers, "query_readiness_duration", time.Since(perTableStart), "table", tableName) + level.Info(util_log.Logger).Log("msg", "index pre-download for query readiness completed", "users", joinedUsers, "duration", time.Since(perTableStart), "table", tableName) } return nil From bb1ed790130118fd78fc89e0426e3df25c4208f3 Mon Sep 17 00:00:00 2001 From: Dylan Guedes Date: Thu, 28 Apr 2022 09:33:43 -0300 Subject: [PATCH 8/9] Ensure queryReadinessDuration metric. - It is redundant with a recently added log line. --- pkg/storage/stores/shipper/downloads/metrics.go | 8 -------- pkg/storage/stores/shipper/downloads/table_manager.go | 1 - 2 files changed, 9 deletions(-) diff --git a/pkg/storage/stores/shipper/downloads/metrics.go b/pkg/storage/stores/shipper/downloads/metrics.go index aba157a9526db..aff0c93bc5be8 100644 --- a/pkg/storage/stores/shipper/downloads/metrics.go +++ b/pkg/storage/stores/shipper/downloads/metrics.go @@ -3,7 +3,6 @@ package downloads import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" - "github.com/weaveworks/common/instrument" ) const ( @@ -15,7 +14,6 @@ type metrics struct { queryTimeTableDownloadDurationSeconds *prometheus.CounterVec tablesSyncOperationTotal *prometheus.CounterVec tablesDownloadOperationDurationSeconds prometheus.Gauge - ensureQueryReadinessDurationSeconds prometheus.Histogram } func newMetrics(r prometheus.Registerer) *metrics { @@ -25,12 +23,6 @@ func newMetrics(r prometheus.Registerer) *metrics { Name: "query_time_table_download_duration_seconds", Help: "Time (in seconds) spent in downloading of files per table at query time", }, []string{"table"}), - ensureQueryReadinessDurationSeconds: promauto.With(r).NewHistogram(prometheus.HistogramOpts{ - Namespace: "loki_boltdb_shipper", - Name: "query_readiness_duration_seconds", - Help: "Time (in seconds) spent making this instance ready to be queried", - Buckets: instrument.DefBuckets, - }), tablesSyncOperationTotal: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ Namespace: "loki_boltdb_shipper", Name: "tables_sync_operation_total", diff --git a/pkg/storage/stores/shipper/downloads/table_manager.go b/pkg/storage/stores/shipper/downloads/table_manager.go index 06e84c1ec6093..04d61d2784c28 100644 --- a/pkg/storage/stores/shipper/downloads/table_manager.go +++ b/pkg/storage/stores/shipper/downloads/table_manager.go @@ -245,7 +245,6 @@ func (tm *TableManager) ensureQueryReadiness(ctx context.Context) error { start := time.Now() defer func() { level.Info(util_log.Logger).Log("msg", "query readiness setup completed", "duration", time.Since(start)) - tm.metrics.ensureQueryReadinessDurationSeconds.Observe(time.Since(start).Seconds()) }() activeTableNumber := getActiveTableNumber() From b1da77032b1b11304773f66ac5f7ca3c9f1bd90b Mon Sep 17 00:00:00 2001 From: Dylan Guedes Date: Thu, 28 Apr 2022 10:11:30 -0300 Subject: [PATCH 9/9] noop