From 37786625b9b4cc5a734239cf9b1f13e6185dc577 Mon Sep 17 00:00:00 2001 From: bcmmbaga Date: Fri, 23 Aug 2024 15:41:48 +0300 Subject: [PATCH 1/4] Add logging for slow SQL queries in SaveAccount and GetAccount --- management/server/sql_store.go | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/management/server/sql_store.go b/management/server/sql_store.go index 912e31410c6..15bf2c28af9 100644 --- a/management/server/sql_store.go +++ b/management/server/sql_store.go @@ -134,6 +134,12 @@ func (s *SqlStore) AcquireReadLockByUID(ctx context.Context, uniqueID string) (u func (s *SqlStore) SaveAccount(ctx context.Context, account *Account) error { start := time.Now() + defer func() { + elapsed := time.Since(start) + if elapsed > 1*time.Second { + log.WithContext(ctx).Warnf("SaveAccount for account %s exceeded 1s, took: %v", account.Id, elapsed) + } + }() // todo: remove this check after the issue is resolved s.checkAccountDomainBeforeSave(ctx, account.Id, account.Domain) @@ -513,6 +519,13 @@ func (s *SqlStore) GetAllAccounts(ctx context.Context) (all []*Account) { } func (s *SqlStore) GetAccount(ctx context.Context, accountID string) (*Account, error) { + start := time.Now() + defer func() { + elapsed := time.Since(start) + if elapsed > 1*time.Second { + log.WithContext(ctx).Warnf("GetAccount for account %s exceeded 1s, took: %v", accountID, elapsed) + } + }() var account Account result := s.db.Model(&account). From ba3fc262cd36deb83fe3eea406d2f7a1ba0bb304 Mon Sep 17 00:00:00 2001 From: bcmmbaga Date: Fri, 23 Aug 2024 15:59:47 +0300 Subject: [PATCH 2/4] Add resource count log for large accounts --- management/server/sql_store.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/management/server/sql_store.go b/management/server/sql_store.go index 15bf2c28af9..08caa87737c 100644 --- a/management/server/sql_store.go +++ b/management/server/sql_store.go @@ -590,6 +590,16 @@ func (s *SqlStore) GetAccount(ctx context.Context, accountID string) (*Account, } account.NameServerGroupsG = nil + totalObjects := len(account.Policies) + + len(account.RoutesG) + + len(account.SetupKeysG) + + len(account.PeersG) + + len(account.NameServerGroupsG) + + len(account.Policies) + if totalObjects > 5000 { + log.WithContext(ctx).Debugf("account: %s has a total resource count of %d objects, exceeding 5,000", accountID, totalObjects) + } + return &account, nil } From 6bd0947b55e508e2f00b73cc0a680e1e1f5b7558 Mon Sep 17 00:00:00 2001 From: bcmmbaga Date: Fri, 23 Aug 2024 19:15:42 +0300 Subject: [PATCH 3/4] Refactor metrics middleware to simplify counters and histograms --- management/server/http/handler.go | 21 --- .../server/telemetry/http_api_metrics.go | 126 +++++++----------- 2 files changed, 48 insertions(+), 99 deletions(-) diff --git a/management/server/http/handler.go b/management/server/http/handler.go index 3fe26d0ce1c..366efa9b73f 100644 --- a/management/server/http/handler.go +++ b/management/server/http/handler.go @@ -100,27 +100,6 @@ func APIHandler(ctx context.Context, accountManager s.AccountManager, LocationMa api.addPostureCheckEndpoint() api.addLocationsEndpoint() - err := api.Router.Walk(func(route *mux.Route, _ *mux.Router, _ []*mux.Route) error { - methods, err := route.GetMethods() - if err != nil { // we may have wildcard routes from integrations without methods, skip them for now - methods = []string{} - } - for _, method := range methods { - template, err := route.GetPathTemplate() - if err != nil { - return err - } - err = metricsMiddleware.AddHTTPRequestResponseCounter(template, method) - if err != nil { - return err - } - } - return nil - }) - if err != nil { - return nil, err - } - return rootRouter, nil } diff --git a/management/server/telemetry/http_api_metrics.go b/management/server/telemetry/http_api_metrics.go index a80453dca88..357f019c794 100644 --- a/management/server/telemetry/http_api_metrics.go +++ b/management/server/telemetry/http_api_metrics.go @@ -8,6 +8,7 @@ import ( "time" "github.com/google/uuid" + "github.com/gorilla/mux" log "github.com/sirupsen/logrus" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/metric" @@ -54,112 +55,89 @@ func (rw *WrappedResponseWriter) WriteHeader(code int) { // HTTPMiddleware handler used to collect metrics of every request/response coming to the API. // Also adds request tracing (logging). type HTTPMiddleware struct { - meter metric.Meter - ctx context.Context + ctx context.Context // all HTTP requests by endpoint & method - httpRequestCounters map[string]metric.Int64Counter + httpRequestCounter metric.Int64Counter // all HTTP responses by endpoint & method & status code - httpResponseCounters map[string]metric.Int64Counter + httpResponseCounter metric.Int64Counter // all HTTP requests totalHTTPRequestsCounter metric.Int64Counter // all HTTP responses totalHTTPResponseCounter metric.Int64Counter // all HTTP responses by status code - totalHTTPResponseCodeCounters map[int]metric.Int64Counter + totalHTTPResponseCodeCounter metric.Int64Counter // all HTTP requests durations by endpoint and method - httpRequestDurations map[string]metric.Int64Histogram + httpRequestDuration metric.Int64Histogram // all HTTP requests durations totalHTTPRequestDuration metric.Int64Histogram } // NewMetricsMiddleware creates a new HTTPMiddleware func NewMetricsMiddleware(ctx context.Context, meter metric.Meter) (*HTTPMiddleware, error) { - totalHTTPRequestsCounter, err := meter.Int64Counter(fmt.Sprintf("%s_total", httpRequestCounterPrefix), metric.WithUnit("1")) + httpRequestCounter, err := meter.Int64Counter(httpRequestCounterPrefix, metric.WithUnit("1")) if err != nil { return nil, err } - totalHTTPResponseCounter, err := meter.Int64Counter(fmt.Sprintf("%s_total", httpResponseCounterPrefix), metric.WithUnit("1")) + httpResponseCounter, err := meter.Int64Counter(httpResponseCounterPrefix, metric.WithUnit("1")) if err != nil { return nil, err } - totalHTTPRequestDuration, err := meter.Int64Histogram(fmt.Sprintf("%s_total", httpRequestDurationPrefix), metric.WithUnit("milliseconds")) + totalHTTPRequestsCounter, err := meter.Int64Counter(fmt.Sprintf("%s.total", httpRequestCounterPrefix), metric.WithUnit("1")) if err != nil { return nil, err } - return &HTTPMiddleware{ - ctx: ctx, - httpRequestCounters: map[string]metric.Int64Counter{}, - httpResponseCounters: map[string]metric.Int64Counter{}, - httpRequestDurations: map[string]metric.Int64Histogram{}, - totalHTTPResponseCodeCounters: map[int]metric.Int64Counter{}, - meter: meter, - totalHTTPRequestsCounter: totalHTTPRequestsCounter, - totalHTTPResponseCounter: totalHTTPResponseCounter, - totalHTTPRequestDuration: totalHTTPRequestDuration, - }, - nil -} + totalHTTPResponseCounter, err := meter.Int64Counter(fmt.Sprintf("%s.total", httpResponseCounterPrefix), metric.WithUnit("1")) + if err != nil { + return nil, err + } -// AddHTTPRequestResponseCounter adds a new meter for an HTTP defaultEndpoint and Method (GET, POST, etc) -// Creates one request counter and multiple response counters (one per http response status code). -func (m *HTTPMiddleware) AddHTTPRequestResponseCounter(endpoint string, method string) error { - meterKey := getRequestCounterKey(endpoint, method) - httpReqCounter, err := m.meter.Int64Counter(meterKey, metric.WithUnit("1")) + totalHTTPResponseCodeCounter, err := meter.Int64Counter(fmt.Sprintf("%s.code.total", httpResponseCounterPrefix), metric.WithUnit("1")) if err != nil { - return err + return nil, err } - m.httpRequestCounters[meterKey] = httpReqCounter - durationKey := getRequestDurationKey(endpoint, method) - requestDuration, err := m.meter.Int64Histogram(durationKey, metric.WithUnit("milliseconds")) + httpRequestDuration, err := meter.Int64Histogram(httpRequestDurationPrefix, metric.WithUnit("milliseconds")) if err != nil { - return err + return nil, err } - m.httpRequestDurations[durationKey] = requestDuration - - respCodes := []int{200, 204, 400, 401, 403, 404, 500, 502, 503} - for _, code := range respCodes { - meterKey = getResponseCounterKey(endpoint, method, code) - httpRespCounter, err := m.meter.Int64Counter(meterKey, metric.WithUnit("1")) - if err != nil { - return err - } - m.httpResponseCounters[meterKey] = httpRespCounter - meterKey = fmt.Sprintf("%s_%d_total", httpResponseCounterPrefix, code) - totalHTTPResponseCodeCounter, err := m.meter.Int64Counter(meterKey, metric.WithUnit("1")) - if err != nil { - return err - } - m.totalHTTPResponseCodeCounters[code] = totalHTTPResponseCodeCounter + totalHTTPRequestDuration, err := meter.Int64Histogram(fmt.Sprintf("%s.total", httpRequestDurationPrefix), metric.WithUnit("milliseconds")) + if err != nil { + return nil, err } - return nil + return &HTTPMiddleware{ + ctx: ctx, + httpRequestCounter: httpRequestCounter, + httpResponseCounter: httpResponseCounter, + httpRequestDuration: httpRequestDuration, + totalHTTPResponseCodeCounter: totalHTTPResponseCodeCounter, + totalHTTPRequestsCounter: totalHTTPRequestsCounter, + totalHTTPResponseCounter: totalHTTPResponseCounter, + totalHTTPRequestDuration: totalHTTPRequestDuration, + }, + nil } func replaceEndpointChars(endpoint string) string { - endpoint = strings.ReplaceAll(endpoint, "/", "_") endpoint = strings.ReplaceAll(endpoint, "{", "") endpoint = strings.ReplaceAll(endpoint, "}", "") return endpoint } -func getRequestCounterKey(endpoint, method string) string { - endpoint = replaceEndpointChars(endpoint) - return fmt.Sprintf("%s%s_%s", httpRequestCounterPrefix, endpoint, method) -} - -func getRequestDurationKey(endpoint, method string) string { - endpoint = replaceEndpointChars(endpoint) - return fmt.Sprintf("%s%s_%s", httpRequestDurationPrefix, endpoint, method) -} - -func getResponseCounterKey(endpoint, method string, status int) string { - endpoint = replaceEndpointChars(endpoint) - return fmt.Sprintf("%s%s_%s_%d", httpResponseCounterPrefix, endpoint, method, status) +func getEndpointMetricAttr(r *http.Request) string { + var endpoint string + route := mux.CurrentRoute(r) + if route != nil { + pathTmpl, err := route.GetPathTemplate() + if err == nil { + endpoint = replaceEndpointChars(pathTmpl) + } + } + return endpoint } // Handler logs every request and response and adds the, to metrics. @@ -176,11 +154,10 @@ func (m *HTTPMiddleware) Handler(h http.Handler) http.Handler { log.WithContext(ctx).Tracef("HTTP request %v: %v %v", reqID, r.Method, r.URL) - metricKey := getRequestCounterKey(r.URL.Path, r.Method) + endpointAttr := attribute.String("endpoint", getEndpointMetricAttr(r)) + methodAttr := attribute.String("method", r.Method) - if c, ok := m.httpRequestCounters[metricKey]; ok { - c.Add(m.ctx, 1) - } + m.httpRequestCounter.Add(m.ctx, 1, metric.WithAttributes(endpointAttr, methodAttr)) m.totalHTTPRequestsCounter.Add(m.ctx, 1) w := WrapResponseWriter(rw) @@ -193,21 +170,14 @@ func (m *HTTPMiddleware) Handler(h http.Handler) http.Handler { log.WithContext(ctx).Tracef("HTTP response %v: %v %v status %v", reqID, r.Method, r.URL, w.Status()) } - metricKey = getResponseCounterKey(r.URL.Path, r.Method, w.Status()) - if c, ok := m.httpResponseCounters[metricKey]; ok { - c.Add(m.ctx, 1) - } + statusCodeAttr := attribute.Int("code", w.Status()) + m.httpResponseCounter.Add(m.ctx, 1, metric.WithAttributes(endpointAttr, methodAttr, statusCodeAttr)) m.totalHTTPResponseCounter.Add(m.ctx, 1) - if c, ok := m.totalHTTPResponseCodeCounters[w.Status()]; ok { - c.Add(m.ctx, 1) - } + m.totalHTTPResponseCodeCounter.Add(m.ctx, 1, metric.WithAttributes(statusCodeAttr)) - durationKey := getRequestDurationKey(r.URL.Path, r.Method) reqTook := time.Since(reqStart) - if c, ok := m.httpRequestDurations[durationKey]; ok { - c.Record(m.ctx, reqTook.Milliseconds()) - } + m.httpRequestDuration.Record(m.ctx, reqTook.Milliseconds(), metric.WithAttributes(endpointAttr, methodAttr)) log.WithContext(ctx).Debugf("request %s %s took %d ms and finished with status %d", r.Method, r.URL.Path, reqTook.Milliseconds(), w.Status()) if w.Status() == 200 && (r.Method == http.MethodPut || r.Method == http.MethodPost || r.Method == http.MethodDelete) { From 04e6a3df1417ad6b1794b98f130cb12790d7c717 Mon Sep 17 00:00:00 2001 From: bcmmbaga Date: Fri, 23 Aug 2024 19:22:40 +0300 Subject: [PATCH 4/4] Update log levels and remove redundant resource count check --- management/server/account.go | 6 ++++++ management/server/sql_store.go | 14 ++------------ 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/management/server/account.go b/management/server/account.go index 49341a67bf8..7159aa9ac1c 100644 --- a/management/server/account.go +++ b/management/server/account.go @@ -476,6 +476,12 @@ func (a *Account) GetPeerNetworkMap( objectCount := int64(len(peersToConnect) + len(expiredPeers) + len(routesUpdate) + len(firewallRules)) metrics.CountNetworkMapObjects(objectCount) metrics.CountGetPeerNetworkMapDuration(time.Since(start)) + + if objectCount > 5000 { + log.WithContext(ctx).Tracef("account: %s has a total resource count of %d objects, "+ + "peers to connect: %d, expired peers: %d, routes: %d, firewall rules: %d", + a.Id, objectCount, len(peersToConnect), len(expiredPeers), len(routesUpdate), len(firewallRules)) + } } return nm diff --git a/management/server/sql_store.go b/management/server/sql_store.go index 08caa87737c..0fb3d391f4d 100644 --- a/management/server/sql_store.go +++ b/management/server/sql_store.go @@ -137,7 +137,7 @@ func (s *SqlStore) SaveAccount(ctx context.Context, account *Account) error { defer func() { elapsed := time.Since(start) if elapsed > 1*time.Second { - log.WithContext(ctx).Warnf("SaveAccount for account %s exceeded 1s, took: %v", account.Id, elapsed) + log.WithContext(ctx).Tracef("SaveAccount for account %s exceeded 1s, took: %v", account.Id, elapsed) } }() @@ -523,7 +523,7 @@ func (s *SqlStore) GetAccount(ctx context.Context, accountID string) (*Account, defer func() { elapsed := time.Since(start) if elapsed > 1*time.Second { - log.WithContext(ctx).Warnf("GetAccount for account %s exceeded 1s, took: %v", accountID, elapsed) + log.WithContext(ctx).Tracef("GetAccount for account %s exceeded 1s, took: %v", accountID, elapsed) } }() @@ -590,16 +590,6 @@ func (s *SqlStore) GetAccount(ctx context.Context, accountID string) (*Account, } account.NameServerGroupsG = nil - totalObjects := len(account.Policies) + - len(account.RoutesG) + - len(account.SetupKeysG) + - len(account.PeersG) + - len(account.NameServerGroupsG) + - len(account.Policies) - if totalObjects > 5000 { - log.WithContext(ctx).Debugf("account: %s has a total resource count of %d objects, exceeding 5,000", accountID, totalObjects) - } - return &account, nil }