You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
We sometimes see the Juju raft tests time out because Raft.Shutdown has hung.
I've got a standalone reproduction (not that small, though) that hangs reliably. Digging into it, the InmemTransport only applies its timeout when receiving responses back from the peer, but not when sending them.
package main
import (
	"fmt"
	"io"
	"log"
	"net/http"
	_ "net/http/pprof"
	"os"
	"sync"
	"time"

	"github.com/hashicorp/raft"
)
funcmain() {
gofunc() {
log.Println(http.ListenAndServe("localhost:6060", nil))
}()
syncStderr:=&syncWriter{target: os.Stderr}
r0, t0, err:=newRaft("node-0", syncStderr)
iferr!=nil {
panic(err)
}
f:=r0.BootstrapCluster(raft.Configuration{
Servers: []raft.Server{{
ID: "node-0",
Address: t0.LocalAddr(),
}},
})
iferr:=f.Error(); err!=nil {
panic(err)
}
// Wait for r0 to be leader.select {
caseisLeader:=<-r0.LeaderCh():
if!isLeader {
panic("r0 wasn't leader")
}
case<-time.After(time.Second):
panic("timed out waiting for r0 to be leader")
}
// Make two more and add them to the cluster.r1, t1, err:=newRaft("node-1", syncStderr)
iferr!=nil {
panic(err)
}
r2, t2, err:=newRaft("node-2", syncStderr)
iferr!=nil {
panic(err)
}
connectTransports(t0, t1, t2)
f1:=r0.AddVoter("node-1", t1.LocalAddr(), 0, 0)
f2:=r0.AddVoter("node-2", t2.LocalAddr(), 0, 0)
iferr:=f1.Error(); err!=nil {
panic(err)
}
iferr:=f2.Error(); err!=nil {
panic(err)
}
// Shut the leader down.iferr:=r0.Shutdown().Error(); err!=nil {
panic(err)
}
iferr:=t0.Close(); err!=nil {
panic(err)
}
// Wait until one of the other nodes is leader.varnewLeader*raft.Raftvarnamestring
loop:
for {
select {
casevalue:=<-r1.LeaderCh():
ifvalue {
newLeader=r1name="node-1"break loop
}
casevalue:=<-r2.LeaderCh():
ifvalue {
newLeader=r2name="node-2"break loop
}
case<-time.After(2*time.Second):
panic("timed out waiting for new leader")
}
}
// If this sleep is omitted the hang doesn't happen.time.Sleep(2*time.Second)
// Try to shut the new leader down.fmt.Printf("**** shutting down new leader %s\n", name)
disconnectTransports(t0, t1, t2)
iferr:=newLeader.Shutdown().Error(); err!=nil {
panic(err)
}
fmt.Printf("**** new leader %s shut down successfully\n", name)
}
funcnewRaft(id raft.ServerID, output io.Writer) (*raft.Raft, *raft.InmemTransport, error) {
_, transport:=raft.NewInmemTransport("")
store:=raft.NewInmemStore()
snapshotStore:=raft.NewInmemSnapshotStore()
config:=raft.DefaultConfig()
config.ShutdownOnRemove=falseconfig.LocalID=idconfig.HeartbeatTimeout=100*time.Millisecondconfig.ElectionTimeout=config.HeartbeatTimeoutconfig.LeaderLeaseTimeout=config.HeartbeatTimeoutconfig.Logger=log.New(output, fmt.Sprintf("%s ", id), log.LstdFlags)
iferr:=raft.ValidateConfig(config); err!=nil {
returnnil, nil, err
}
r, err:=raft.NewRaft(config, &nullFSM{}, store, store, snapshotStore, transport)
iferr!=nil {
transport.Close()
returnnil, nil, err
}
returnr, transport, nil
}
// Connect the provided transport bidirectionally.funcconnectTransports(transports...raft.LoopbackTransport) {
for_, t1:=rangetransports {
for_, t2:=rangetransports {
ift1==t2 {
continue
}
t1.Connect(t2.LocalAddr(), t2)
}
}
}
funcdisconnectTransports(transports...raft.LoopbackTransport) {
for_, t1:=rangetransports {
for_, t2:=rangetransports {
ift1==t2 {
continue
}
t1.Disconnect(t2.LocalAddr())
}
}
}
typenullFSMstruct{}
func (m*nullFSM) Apply(log*raft.Log) interface{} {
returnnil
}
func (m*nullFSM) Snapshot() (raft.FSMSnapshot, error) {
return&nullSnapshot{}, nil
}
func (m*nullFSM) Restore(rc io.ReadCloser) error {
rc.Close()
returnnil
}
typenullSnapshotstruct{}
func (s*nullSnapshot) Persist(sink raft.SnapshotSink) error {
sink.Close()
returnnil
}
func (s*nullSnapshot) Release() {}
// syncWriter wraps an io.Writer so that concurrent Write calls are
// serialized through a mutex. It must be used by pointer because it
// contains a sync.Mutex.
type syncWriter struct {
	mu     sync.Mutex
	target io.Writer
}

// Write forwards data to the underlying writer while holding the mutex and
// returns the underlying writer's result unchanged.
func (w *syncWriter) Write(data []byte) (int, error) {
	w.mu.Lock()
	defer w.mu.Unlock()
	return w.target.Write(data)
}
The text was updated successfully, but these errors were encountered:
We sometimes see the Juju raft tests time out because Raft.Shutdown has hung.
I've got a standalone reproduction (not that small, though) that hangs reliably. Digging into it, the InmemTransport only applies its timeout when receiving responses back from the peer, but not when sending them.
The text was updated successfully, but these errors were encountered: