Skip to content

Commit

Permalink
Return more detailed errors from ES storage (#5209)
Browse files Browse the repository at this point in the history
## Which problem is this PR solving?
- In many cases when ES operations fail, the only error returned to the
UI or written to Jaeger log is `all shards fail`, which is pretty
useless for the actual troubleshooting.
- Meanwhile, the driver actually returns an error struct that contains a
`RootCause` field.

## Description of the changes
- Inspect the error and include root cause if present

## How was this change tested?
Run all-in-one and `curl 'http://localhost:16686/api/services'`

Before:
`$ curl 'http://localhost:16686/api/services'`

`{"data":null,"total":0,"limit":0,"offset":0,"errors":[{"code":500,"msg":"search
services failed: elastic: Error 400 (Bad Request): all shards failed
[type=search_phase_execution_exception]"}]}`

After:
`$ curl 'http://localhost:16686/api/services'`

`{"data":null,"total":0,"limit":0,"offset":0,"errors":[{"code":500,"msg":"search
services failed: elastic: Error 400 (Bad Request): all shards failed
[type=search_phase_execution_exception]: RootCause[Fielddata is disabled
on [serviceName] in [jaeger-service-]. Text fields are not optimised for
operations that require per-document field data like aggregations and
sorting, so these operations are disabled by default. Please use a
keyword field instead. Alternatively, set fielddata=true on
[serviceName] in order to load field data by uninverting the inverted
index. Note that this can use significant memory.
[type=illegal_argument_exception]]"}]}`

---------

Signed-off-by: Yuri Shkuro <[email protected]>
  • Loading branch information
yurishkuro authored Feb 19, 2024
1 parent 5375e1e commit cdb8b3f
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 4 deletions.
33 changes: 33 additions & 0 deletions pkg/es/errors.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// Copyright (c) 2024 The Jaeger Authors.
// SPDX-License-Identifier: Apache-2.0

package es

import (
"errors"
"fmt"

"github.com/olivere/elastic"
)

// DetailedError creates a more detailed error if the error stack contains elastic.Error.
// This is useful because by default olivere/elastic returns errors that print like this:
//
// elastic: Error 400 (Bad Request): all shards failed [type=search_phase_execution_exception]
//
// This is pretty useless because it masks the underlying root cause.
// DetailedError would instead return an error like this:
//
// <same as above>: RootCause[... detailed error message ...]
func DetailedError(err error) error {
var esErr *elastic.Error
if errors.As(err, &esErr) {
if esErr.Details != nil && len(esErr.Details.RootCause) > 0 {
rc := esErr.Details.RootCause[0]
if rc != nil {
return fmt.Errorf("%w: RootCause[%s [type=%s]]", err, rc.Reason, rc.Type)
}
}
}
return err
}
35 changes: 35 additions & 0 deletions pkg/es/errors_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// Copyright (c) 2024 The Jaeger Authors.
// SPDX-License-Identifier: Apache-2.0

package es

import (
"fmt"
"testing"

"github.com/olivere/elastic"
"github.com/stretchr/testify/require"
)

func TestDetailedError(t *testing.T) {
require.ErrorContains(t, fmt.Errorf("some err"), "some err", "no panic")

esErr := &elastic.Error{
Status: 400,
Details: &elastic.ErrorDetails{
Type: "type1",
Reason: "useless reason, e.g. all shards failed",
RootCause: []*elastic.ErrorDetails{
{
Type: "type2",
Reason: "actual reason",
},
},
},
}
require.ErrorContains(t, DetailedError(esErr), "actual reason")

esErr.Details.RootCause[0] = nil
require.ErrorContains(t, DetailedError(esErr), "useless reason")
require.NotContains(t, DetailedError(esErr).Error(), "actual reason")
}
7 changes: 5 additions & 2 deletions plugin/storage/es/spanstore/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ func (s *SpanReader) GetTrace(ctx context.Context, traceID model.TraceID) (*mode
currentTime := time.Now()
traces, err := s.multiRead(ctx, []model.TraceID{traceID}, currentTime.Add(-s.maxSpanAge), currentTime)
if err != nil {
return nil, err
return nil, es.DetailedError(err)
}
if len(traces) == 0 {
return nil, spanstore.ErrTraceNotFound
Expand Down Expand Up @@ -337,7 +337,7 @@ func (s *SpanReader) FindTraces(ctx context.Context, traceQuery *spanstore.Trace

uniqueTraceIDs, err := s.FindTraceIDs(ctx, traceQuery)
if err != nil {
return nil, err
return nil, es.DetailedError(err)
}
return s.multiRead(ctx, uniqueTraceIDs, traceQuery.StartTimeMin, traceQuery.StartTimeMax)
}
Expand Down Expand Up @@ -412,6 +412,7 @@ func (s *SpanReader) multiRead(ctx context.Context, traceIDs []model.TraceID, st
traceIDs = nil
results, err := s.client().MultiSearch().Add(searchRequests...).Index(indices...).Do(ctx)
if err != nil {
err = es.DetailedError(err)
logErrorToSpan(childSpan, err)
return nil, err
}
Expand All @@ -426,6 +427,7 @@ func (s *SpanReader) multiRead(ctx context.Context, traceIDs []model.TraceID, st
}
spans, err := s.collectSpans(result.Hits.Hits)
if err != nil {
err = es.DetailedError(err)
logErrorToSpan(childSpan, err)
return nil, err
}
Expand Down Expand Up @@ -580,6 +582,7 @@ func (s *SpanReader) findTraceIDs(ctx context.Context, traceQuery *spanstore.Tra

searchResult, err := searchService.Do(ctx)
if err != nil {
err = es.DetailedError(err)
s.logger.Info("es search services failed", zap.Any("traceQuery", traceQuery), zap.Error(err))
return nil, fmt.Errorf("search services failed: %w", err)
}
Expand Down
4 changes: 2 additions & 2 deletions plugin/storage/es/spanstore/service_operation.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ func (s *ServiceOperationStorage) getServices(context context.Context, indices [

searchResult, err := searchService.Do(context)
if err != nil {
return nil, fmt.Errorf("search services failed: %w", err)
return nil, fmt.Errorf("search services failed: %w", es.DetailedError(err))
}
if searchResult.Aggregations == nil {
return []string{}, nil
Expand Down Expand Up @@ -118,7 +118,7 @@ func (s *ServiceOperationStorage) getOperations(context context.Context, indices

searchResult, err := searchService.Do(context)
if err != nil {
return nil, fmt.Errorf("search operations failed: %w", err)
return nil, fmt.Errorf("search operations failed: %w", es.DetailedError(err))
}
if searchResult.Aggregations == nil {
return []string{}, nil
Expand Down

0 comments on commit cdb8b3f

Please sign in to comment.