Skip to content

Commit

Permalink
Fixed deadlock in leader controller gauge callback (#354)
Browse files Browse the repository at this point in the history
At collection time, we have the Otel lock and try to grab the leader
controller read-lock

```
1 @ 0x4482d6 0x4595de 0x4595b5 0x475a45 0x1a6d77b 0x1a6d757 0xa50f9e 0xa2f726 0xa322e7 0xa2b60e 0xa4b702 0x89c9d3 0x479ae1
# labels: {"oxia":"metrics"}
#	0x475a44	sync.runtime_SemacquireMutex+0x24						/usr/local/go/src/runtime/sema.go:77
#	0x1a6d77a	sync.(*RWMutex).RLock+0x5a							/usr/local/go/src/sync/rwmutex.go:71
#	0x1a6d756	oxia/server.NewLeaderController.func1+0x36					/src/oxia/server/leader_controller.go:121
#	0xa50f9d	oxia/common/metrics.NewGauge.func1+0x5d						/src/oxia/common/metrics/gauge.go:58
#	0xa2f725	go.opentelemetry.io/otel/sdk/metric.(*meter).RegisterCallback.func1+0x85	/go/pkg/mod/go.opentelemetry.io/otel/sdk/[email protected]/meter.go:263
#	0xa322e6	go.opentelemetry.io/otel/sdk/metric.(*pipeline).produce+0x326			/go/pkg/mod/go.opentelemetry.io/otel/sdk/[email protected]/pipeline.go:144
#	0xa2b60d	go.opentelemetry.io/otel/sdk/metric.(*manualReader).Collect+0xed		/go/pkg/mod/go.opentelemetry.io/otel/sdk/[email protected]/manual_reader.go:139
#	0xa4b701	go.opentelemetry.io/otel/exporters/prometheus.(*collector).Collect+0x81		/go/pkg/mod/go.opentelemetry.io/otel/exporters/[email protected]/exporter.go:119
#	0x89c9d2	github.com/prometheus/client_golang/prometheus.(*Registry).Gather.func1+0xf2	/go/pkg/mod/github.com/prometheus/[email protected]/prometheus/registry.go:457
```

During `NewTerm` handling, we have the leader controller lock and we try
to unregister the gauge, acquiring the Otel lock

```
7 @ 0x4482d6 0x4595de 0x4595b5 0x475a45 0x490785 0xa31e8d 0xa31e61 0xa48b8f 0xa50b02 0x1a6e155 0x1a6b063 0x996d98 0x985807 0x996c58 0x96e9b0 0x973a6f 0x96c298 0x479ae1
# labels: {"bind":"[::]:6649", "oxia":"internal"}
#	0x475a44	sync.runtime_SemacquireMutex+0x24								/usr/local/go/src/runtime/sema.go:77
#	0x490784	sync.(*Mutex).lockSlow+0x164									/usr/local/go/src/sync/mutex.go:171
#	0xa31e8c	sync.(*Mutex).Lock+0x4c										/usr/local/go/src/sync/mutex.go:90
#	0xa31e60	go.opentelemetry.io/otel/sdk/metric.(*pipeline).addMultiCallback.func1+0x20			/go/pkg/mod/go.opentelemetry.io/otel/sdk/[email protected]/pipeline.go:116
#	0xa48b8e	go.opentelemetry.io/otel/sdk/metric.unregisterFuncs.Unregister+0x6e				/go/pkg/mod/go.opentelemetry.io/otel/sdk/[email protected]/pipeline.go:521
#	0xa50b01	oxia/common/metrics.(*gauge).Unregister+0x21							/src/oxia/common/metrics/gauge.go:36
#	0x1a6e154	oxia/server.(*leaderController).NewTerm+0x234							/src/oxia/server/leader_controller.go:231
#	0x1a6b062	oxia/server.(*internalRpcServer).NewTerm+0x4c2							/src/oxia/server/internal_rpc_server.go:122
#	0x996d97	oxia/proto._OxiaCoordination_NewTerm_Handler.func1+0x77						/src/oxia/proto/replication_grpc.pb.go:207
#	0x985806	github.com/grpc-ecosystem/go-grpc-prometheus.(*ServerMetrics).UnaryServerInterceptor.func1+0x86	/go/pkg/mod/github.com/grpc-ecosystem/[email protected]/server_metrics.go:107
#	0x996c57	oxia/proto._OxiaCoordination_NewTerm_Handler+0x137						/src/oxia/proto/replication_grpc.pb.go:209
#	0x96e9af	google.golang.org/grpc.(*Server).processUnaryRPC+0xdef						/go/pkg/mod/google.golang.org/[email protected]/server.go:1345
#	0x973a6e	google.golang.org/grpc.(*Server).handleStream+0xa2e						/go/pkg/mod/google.golang.org/[email protected]/server.go:1722
#	0x96c297	google.golang.org/grpc.(*Server).serveStreams.func1.2+0x97					/go/pkg/mod/google.golang.org/[email protected]/server.go:966

```
  • Loading branch information
merlimat authored Jun 15, 2023
1 parent 0d6b5ef commit 1614d08
Showing 1 changed file with 6 additions and 8 deletions.
14 changes: 6 additions & 8 deletions server/leader_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,20 +118,18 @@ func NewLeaderController(config Config, namespace string, shardId int64, rpcClie

lc.headOffsetGauge = metrics.NewGauge("oxia_server_leader_head_offset",
"The current head offset", "offset", labels, func() int64 {
lc.RLock()
defer lc.RUnlock()
if lc.quorumAckTracker != nil {
return lc.quorumAckTracker.HeadOffset()
qat := lc.quorumAckTracker
if qat != nil {
return qat.HeadOffset()
}

return -1
})
lc.commitOffsetGauge = metrics.NewGauge("oxia_server_leader_commit_offset",
"The current commit offset", "offset", labels, func() int64 {
lc.RLock()
defer lc.RUnlock()
if lc.quorumAckTracker != nil {
return lc.quorumAckTracker.CommitOffset()
qat := lc.quorumAckTracker
if qat != nil {
return qat.CommitOffset()
}

return -1
Expand Down

0 comments on commit 1614d08

Please sign in to comment.