From c0a17868d8d2eb53801e296b2723f8c7f683ec94 Mon Sep 17 00:00:00 2001 From: Andrew Wilkins Date: Thu, 5 Aug 2021 11:39:21 +0800 Subject: [PATCH] apm: fix deadlock in breakdown metrics calculation Fix a deadlock that can occur when concurrently ending a parent and child span, due to the parent waiting for the child to release the transaction lock, and the child waiting to lock the parent. Locks are now taken in a consistent order. --- CHANGELOG.asciidoc | 2 ++ span.go | 17 +++++++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.asciidoc b/CHANGELOG.asciidoc index 5d46e1c60..25f3af043 100644 --- a/CHANGELOG.asciidoc +++ b/CHANGELOG.asciidoc @@ -23,6 +23,8 @@ endif::[] https://github.com/elastic/apm-agent-go/compare/v1.13.0...master[View commits] +- Fix concurrency bugs in breakdown metrics and module/apmhttp.WithClientTrace {pull}997[#(997)] + [[release-notes-1.x]] === Go Agent version 1.x diff --git a/span.go b/span.go index afdc6aa36..ac7670483 100644 --- a/span.go +++ b/span.go @@ -77,6 +77,12 @@ func (tx *Transaction) StartSpanOptions(name, spanType string, opts SpanOptions) } transactionID := tx.traceContext.Span + // Lock the parent first to avoid deadlocks in breakdown metrics calculation. + if opts.parent != nil { + opts.parent.mu.Lock() + defer opts.parent.mu.Unlock() + } + // Prevent tx from being ended while we're starting a span. tx.mu.RLock() defer tx.mu.RUnlock() @@ -117,8 +123,6 @@ func (tx *Transaction) StartSpanOptions(name, spanType string, opts SpanOptions) if tx.breakdownMetricsEnabled { if span.parent != nil { - span.parent.mu.Lock() - defer span.parent.mu.Unlock() if !span.parent.ended() { span.parent.childrenTimer.childStarted(span.timestamp) } @@ -334,6 +338,13 @@ func (s *Span) ParentID() SpanID { func (s *Span) reportSelfTime() { endTime := s.timestamp.Add(s.Duration) + // If this span has a parent span, lock it before proceeding to + // prevent deadlocking when concurrently ending parent and child. + if s.parent != nil { + s.parent.mu.Lock() + defer s.parent.mu.Unlock() + } + // TODO(axw) try to find a way to not lock the transaction when // ending every span. We already lock them when starting spans. s.tx.mu.RLock() @@ -345,11 +356,9 @@ func (s *Span) reportSelfTime() { s.tx.TransactionData.mu.Lock() defer s.tx.TransactionData.mu.Unlock() if s.parent != nil { - s.parent.mu.Lock() if !s.parent.ended() { s.parent.childrenTimer.childEnded(endTime) } - s.parent.mu.Unlock() } else { s.tx.childrenTimer.childEnded(endTime) }