From cdb8b3fd11ef8fea4bc5f752c5192f6ac84d297f Mon Sep 17 00:00:00 2001 From: Yuri Shkuro Date: Sun, 18 Feb 2024 21:26:36 -0500 Subject: [PATCH] Return more detailed errors from ES storage (#5209) ## Which problem is this PR solving? - In many cases when ES operations fail, the only error returned to the UI or written to Jaeger log is `all shards fail`, which is pretty useless for the actual troubleshooting. - Meanwhile, the driver actually returns an error struct that contains a `RootCause` field. ## Description of the changes - Inspect the error and include root cause if present ## How was this change tested? Run all-in-one and `curl 'http://localhost:16686/api/services'` Before: `$ curl 'http://localhost:16686/api/services'` `{"data":null,"total":0,"limit":0,"offset":0,"errors":[{"code":500,"msg":"search services failed: elastic: Error 400 (Bad Request): all shards failed [type=search_phase_execution_exception]"}]}` After: `$ curl 'http://localhost:16686/api/services'` `{"data":null,"total":0,"limit":0,"offset":0,"errors":[{"code":500,"msg":"search services failed: elastic: Error 400 (Bad Request): all shards failed [type=search_phase_execution_exception]: RootCause[Fielddata is disabled on [serviceName] in [jaeger-service-]. Text fields are not optimised for operations that require per-document field data like aggregations and sorting, so these operations are disabled by default. Please use a keyword field instead. Alternatively, set fielddata=true on [serviceName] in order to load field data by uninverting the inverted index. Note that this can use significant memory. [type=illegal_argument_exception]]"}]}` --------- Signed-off-by: Yuri Shkuro --- pkg/es/errors.go | 33 +++++++++++++++++ pkg/es/errors_test.go | 35 +++++++++++++++++++ plugin/storage/es/spanstore/reader.go | 7 ++-- .../storage/es/spanstore/service_operation.go | 4 +-- 4 files changed, 75 insertions(+), 4 deletions(-) create mode 100644 pkg/es/errors.go create mode 100644 pkg/es/errors_test.go diff --git a/pkg/es/errors.go b/pkg/es/errors.go new file mode 100644 index 00000000000..d29c0e157e7 --- /dev/null +++ b/pkg/es/errors.go @@ -0,0 +1,33 @@ +// Copyright (c) 2024 The Jaeger Authors. +// SPDX-License-Identifier: Apache-2.0 + +package es + +import ( + "errors" + "fmt" + + "github.com/olivere/elastic" +) + +// DetailedError creates a more detailed error if the error stack contains elastic.Error. +// This is useful because by default olivere/elastic returns errors that print like this: +// +// elastic: Error 400 (Bad Request): all shards failed [type=search_phase_execution_exception] +// +// This is pretty useless because it masks the underlying root cause. +// DetailedError would instead return an error like this: +// +// : RootCause[... detailed error message ...] +func DetailedError(err error) error { + var esErr *elastic.Error + if errors.As(err, &esErr) { + if esErr.Details != nil && len(esErr.Details.RootCause) > 0 { + rc := esErr.Details.RootCause[0] + if rc != nil { + return fmt.Errorf("%w: RootCause[%s [type=%s]]", err, rc.Reason, rc.Type) + } + } + } + return err +} diff --git a/pkg/es/errors_test.go b/pkg/es/errors_test.go new file mode 100644 index 00000000000..e57ec1403b2 --- /dev/null +++ b/pkg/es/errors_test.go @@ -0,0 +1,35 @@ +// Copyright (c) 2024 The Jaeger Authors. +// SPDX-License-Identifier: Apache-2.0 + +package es + +import ( + "fmt" + "testing" + + "github.com/olivere/elastic" + "github.com/stretchr/testify/require" +) + +func TestDetailedError(t *testing.T) { + require.ErrorContains(t, fmt.Errorf("some err"), "some err", "no panic") + + esErr := &elastic.Error{ + Status: 400, + Details: &elastic.ErrorDetails{ + Type: "type1", + Reason: "useless reason, e.g. all shards failed", + RootCause: []*elastic.ErrorDetails{ + { + Type: "type2", + Reason: "actual reason", + }, + }, + }, + } + require.ErrorContains(t, DetailedError(esErr), "actual reason") + + esErr.Details.RootCause[0] = nil + require.ErrorContains(t, DetailedError(esErr), "useless reason") + require.NotContains(t, DetailedError(esErr).Error(), "actual reason") +} diff --git a/plugin/storage/es/spanstore/reader.go b/plugin/storage/es/spanstore/reader.go index 1fdce799bc0..1285c399529 100644 --- a/plugin/storage/es/spanstore/reader.go +++ b/plugin/storage/es/spanstore/reader.go @@ -246,7 +246,7 @@ func (s *SpanReader) GetTrace(ctx context.Context, traceID model.TraceID) (*mode currentTime := time.Now() traces, err := s.multiRead(ctx, []model.TraceID{traceID}, currentTime.Add(-s.maxSpanAge), currentTime) if err != nil { - return nil, err + return nil, es.DetailedError(err) } if len(traces) == 0 { return nil, spanstore.ErrTraceNotFound @@ -337,7 +337,7 @@ func (s *SpanReader) FindTraces(ctx context.Context, traceQuery *spanstore.Trace uniqueTraceIDs, err := s.FindTraceIDs(ctx, traceQuery) if err != nil { - return nil, err + return nil, es.DetailedError(err) } return s.multiRead(ctx, uniqueTraceIDs, traceQuery.StartTimeMin, traceQuery.StartTimeMax) } @@ -412,6 +412,7 @@ func (s *SpanReader) multiRead(ctx context.Context, traceIDs []model.TraceID, st traceIDs = nil results, err := s.client().MultiSearch().Add(searchRequests...).Index(indices...).Do(ctx) if err != nil { + err = es.DetailedError(err) logErrorToSpan(childSpan, err) return nil, err } @@ -426,6 +427,7 @@ func (s *SpanReader) multiRead(ctx context.Context, traceIDs []model.TraceID, st } spans, err := s.collectSpans(result.Hits.Hits) if err != nil { + err = es.DetailedError(err) logErrorToSpan(childSpan, err) return nil, err } @@ -580,6 +582,7 @@ func (s *SpanReader) findTraceIDs(ctx context.Context, traceQuery *spanstore.Tra searchResult, err := searchService.Do(ctx) if err != nil { + err = es.DetailedError(err) s.logger.Info("es search services failed", zap.Any("traceQuery", traceQuery), zap.Error(err)) return nil, fmt.Errorf("search services failed: %w", err) } diff --git a/plugin/storage/es/spanstore/service_operation.go b/plugin/storage/es/spanstore/service_operation.go index d2303196e6c..a6c320b3f1c 100644 --- a/plugin/storage/es/spanstore/service_operation.go +++ b/plugin/storage/es/spanstore/service_operation.go @@ -87,7 +87,7 @@ func (s *ServiceOperationStorage) getServices(context context.Context, indices [ searchResult, err := searchService.Do(context) if err != nil { - return nil, fmt.Errorf("search services failed: %w", err) + return nil, fmt.Errorf("search services failed: %w", es.DetailedError(err)) } if searchResult.Aggregations == nil { return []string{}, nil @@ -118,7 +118,7 @@ func (s *ServiceOperationStorage) getOperations(context context.Context, indices searchResult, err := searchService.Do(context) if err != nil { - return nil, fmt.Errorf("search operations failed: %w", err) + return nil, fmt.Errorf("search operations failed: %w", es.DetailedError(err)) } if searchResult.Aggregations == nil { return []string{}, nil