Skip to content

Commit

Permalink
runtime: print a stack trace at "morestack on g0"
Browse files Browse the repository at this point in the history
Error like "morestack on g0" is one of the errors that is very
hard to debug, because often it doesn't print a useful stack trace.
The runtime doesn't directly print a stack trace because it is
a bad stack state to call print. Sometimes the SIGABRT may trigger
a traceback, but sometimes not especially in a cgo binary. Even if
it triggers a traceback it often does not include the stack trace
of the bad stack.

This CL makes it explicitly print a stack trace and throw. The
idea is to have some space as an "emergency" crash stack. When the
stack is in a really bad state, we switch to the crash stack and
do a traceback.

Currently only implemented on AMD64 and ARM64.

TODO: also handle errors like "morestack on gsignal" and bad
systemstack. Also handle other architectures.

Change-Id: Ibfc397202f2bb0737c5cbe99f2763de83301c1c1
Reviewed-on: https://go-review.googlesource.com/c/go/+/419435
LUCI-TryBot-Result: Go LUCI <[email protected]>
Reviewed-by: Michael Pratt <[email protected]>
  • Loading branch information
cherrymui committed Oct 26, 2023
1 parent 29b8039 commit 0262ea1
Show file tree
Hide file tree
Showing 6 changed files with 147 additions and 27 deletions.
8 changes: 8 additions & 0 deletions src/runtime/asm.s
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,11 @@ TEXT ·sigpanic0(SB),NOSPLIT,$0-0
// See map.go comment on the need for this routine.
TEXT ·mapinitnoop<ABIInternal>(SB),NOSPLIT,$0-0
RET

#ifndef GOARCH_amd64
#ifndef GOARCH_arm64
// stub to appease shared build mode.
TEXT ·switchToCrashStack0<ABIInternal>(SB),NOSPLIT,$0-0
UNDEF
#endif
#endif
55 changes: 39 additions & 16 deletions src/runtime/asm_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -537,6 +537,30 @@ bad:
CALL AX
INT $3

// func switchToCrashStack0(fn func())
TEXT runtime·switchToCrashStack0<ABIInternal>(SB), NOSPLIT, $0-8
MOVQ g_m(R14), BX // curm

// set g to gcrash
LEAQ runtime·gcrash(SB), R14 // g = &gcrash
MOVQ BX, g_m(R14) // g.m = curm
MOVQ R14, m_g0(BX) // curm.g0 = g
get_tls(CX)
MOVQ R14, g(CX)

// switch to crashstack
MOVQ (g_stack+stack_hi)(R14), BX
SUBQ $(4*8), BX
MOVQ BX, SP

// call target function
MOVQ AX, DX
MOVQ 0(AX), AX
CALL AX

// should never return
CALL runtime·abort(SB)
UNDEF

/*
* support for morestack
Expand All @@ -551,17 +575,26 @@ bad:
TEXT runtime·morestack(SB),NOSPLIT|NOFRAME,$0-0
// Cannot grow scheduler stack (m->g0).
get_tls(CX)
MOVQ g(CX), BX
MOVQ g_m(BX), BX
MOVQ m_g0(BX), SI
CMPQ g(CX), SI
MOVQ g(CX), DI // DI = g
MOVQ g_m(DI), BX // BX = m

// Set g->sched to context in f.
MOVQ 0(SP), AX // f's PC
MOVQ AX, (g_sched+gobuf_pc)(DI)
LEAQ 8(SP), AX // f's SP
MOVQ AX, (g_sched+gobuf_sp)(DI)
MOVQ BP, (g_sched+gobuf_bp)(DI)
MOVQ DX, (g_sched+gobuf_ctxt)(DI)

MOVQ m_g0(BX), SI // SI = m.g0
CMPQ DI, SI
JNE 3(PC)
CALL runtime·badmorestackg0(SB)
CALL runtime·abort(SB)

// Cannot grow signal stack (m->gsignal).
MOVQ m_gsignal(BX), SI
CMPQ g(CX), SI
CMPQ DI, SI
JNE 3(PC)
CALL runtime·badmorestackgsignal(SB)
CALL runtime·abort(SB)
Expand All @@ -573,17 +606,7 @@ TEXT runtime·morestack(SB),NOSPLIT|NOFRAME,$0-0
MOVQ AX, (m_morebuf+gobuf_pc)(BX)
LEAQ 16(SP), AX // f's caller's SP
MOVQ AX, (m_morebuf+gobuf_sp)(BX)
get_tls(CX)
MOVQ g(CX), SI
MOVQ SI, (m_morebuf+gobuf_g)(BX)

// Set g->sched to context in f.
MOVQ 0(SP), AX // f's PC
MOVQ AX, (g_sched+gobuf_pc)(SI)
LEAQ 8(SP), AX // f's SP
MOVQ AX, (g_sched+gobuf_sp)(SI)
MOVQ BP, (g_sched+gobuf_bp)(SI)
MOVQ DX, (g_sched+gobuf_ctxt)(SI)
MOVQ DI, (m_morebuf+gobuf_g)(BX)

// Call newstack on m->g0's stack.
MOVQ m_g0(BX), BX
Expand Down
43 changes: 34 additions & 9 deletions src/runtime/asm_arm64.s
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,30 @@ noswitch:
SUB $8, RSP, R29 // restore FP
B (R3)

// func switchToCrashStack0(fn func())
TEXT runtime·switchToCrashStack0<ABIInternal>(SB), NOSPLIT, $0-8
MOVD R0, R26 // context register
MOVD g_m(g), R1 // curm

// set g to gcrash
MOVD $runtime·gcrash(SB), g // g = &gcrash
BL runtime·save_g(SB) // clobbers R0
MOVD R1, g_m(g) // g.m = curm
MOVD g, m_g0(R1) // curm.g0 = g

// switch to crashstack
MOVD (g_stack+stack_hi)(g), R1
SUB $(4*8), R1
MOVD R1, RSP

// call target function
MOVD 0(R26), R0
CALL (R0)

// should never return
CALL runtime·abort(SB)
UNDEF

/*
* support for morestack
*/
Expand All @@ -278,6 +302,16 @@ TEXT runtime·morestack(SB),NOSPLIT|NOFRAME,$0-0
// Cannot grow scheduler stack (m->g0).
MOVD g_m(g), R8
MOVD m_g0(R8), R4

// Called from f.
// Set g->sched to context in f
MOVD RSP, R0
MOVD R0, (g_sched+gobuf_sp)(g)
MOVD R29, (g_sched+gobuf_bp)(g)
MOVD LR, (g_sched+gobuf_pc)(g)
MOVD R3, (g_sched+gobuf_lr)(g)
MOVD R26, (g_sched+gobuf_ctxt)(g)

CMP g, R4
BNE 3(PC)
BL runtime·badmorestackg0(SB)
Expand All @@ -290,15 +324,6 @@ TEXT runtime·morestack(SB),NOSPLIT|NOFRAME,$0-0
BL runtime·badmorestackgsignal(SB)
B runtime·abort(SB)

// Called from f.
// Set g->sched to context in f
MOVD RSP, R0
MOVD R0, (g_sched+gobuf_sp)(g)
MOVD R29, (g_sched+gobuf_bp)(g)
MOVD LR, (g_sched+gobuf_pc)(g)
MOVD R3, (g_sched+gobuf_lr)(g)
MOVD R26, (g_sched+gobuf_ctxt)(g)

// Called from f.
// Set m->morebuf to f's callers.
MOVD R3, (m_morebuf+gobuf_pc)(R8) // f's caller's PC
Expand Down
8 changes: 8 additions & 0 deletions src/runtime/crash_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -804,6 +804,14 @@ func TestG0StackOverflow(t *testing.T) {
if n := strings.Count(string(out), "morestack on g0\n"); n != 1 {
t.Fatalf("%s\n(exit status %v)", out, err)
}
if runtime.GOARCH == "amd64" || runtime.GOARCH == "arm64" {
// check for a stack trace
want := "runtime.stackOverflow"
if n := strings.Count(string(out), want); n < 5 {
t.Errorf("output does not contain %q at least 5 times:\n%s", want, out)
}
return // it's not a signal-style traceback
}
// Check that it's a signal-style traceback.
if runtime.GOOS != "windows" {
if want := "PC="; !strings.Contains(string(out), want) {
Expand Down
2 changes: 1 addition & 1 deletion src/runtime/export_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -694,7 +694,7 @@ func G0StackOverflow() {
// The stack bounds for g0 stack is not always precise.
// Use an artificially small stack, to trigger a stack overflow
// without actually run out of the system stack (which may seg fault).
g0.stack.lo = sp - 4096
g0.stack.lo = sp - 4096 - stackSystem
g0.stackguard0 = g0.stack.lo + stackGuard
g0.stackguard1 = g0.stackguard0

Expand Down
58 changes: 57 additions & 1 deletion src/runtime/proc.go
Original file line number Diff line number Diff line change
Expand Up @@ -516,7 +516,20 @@ func badreflectcall() {
//go:nosplit
//go:nowritebarrierrec
func badmorestackg0() {
writeErrStr("fatal: morestack on g0\n")
if !crashStackImplemented {
writeErrStr("fatal: morestack on g0\n")
return
}

g := getg()
switchToCrashStack(func() {
print("runtime: morestack on g0, stack [", hex(g.stack.lo), " ", hex(g.stack.hi), "], sp=", hex(g.sched.sp), ", called from\n")
g.m.traceback = 2 // include pc and sp in stack trace
traceback1(g.sched.pc, g.sched.sp, g.sched.lr, g, 0)
print("\n")

throw("morestack on g0")
})
}

//go:nosplit
Expand All @@ -530,6 +543,49 @@ func badctxt() {
throw("ctxt != 0")
}

// crashstack is a space that can be used as the stack when it is
// crashing on bad stack conditions, e.g. morestack on g0.
// gcrash is the corresponding (fake) g.
var crashstack [16384]byte

var gcrash = g{
stack: stack{uintptr(unsafe.Pointer(&crashstack)), uintptr(unsafe.Pointer(&crashstack)) + unsafe.Sizeof(crashstack)},
stackguard0: uintptr(unsafe.Pointer(&crashstack)) + 1000,
stackguard1: uintptr(unsafe.Pointer(&crashstack)) + 1000,
}

var crashingG atomic.Pointer[g]

// Switch to crashstack and call fn, with special handling of
// concurrent and recursive cases.
//
// Nosplit as it is called in a bad stack condition (we know
// morestack would fail).
//
//go:nosplit
//go:nowritebarrierrec
func switchToCrashStack(fn func()) {
me := getg()
if crashingG.CompareAndSwapNoWB(nil, me) {
switchToCrashStack0(fn) // should never return
abort()
}
if crashingG.Load() == me {
// recursive crashing. too bad.
writeErrStr("fatal: recursive switchToCrashStack\n")
abort()
}
// Another g is crashing. Give it some time, hopefully it will finish traceback.
usleep_no_g(100)
writeErrStr("fatal: concurrent switchToCrashStack\n")
abort()
}

const crashStackImplemented = GOARCH == "amd64" || GOARCH == "arm64"

//go:noescape
func switchToCrashStack0(func()) // in assembly

func lockedOSThread() bool {
gp := getg()
return gp.lockedm != 0 && gp.m.lockedg != 0
Expand Down

0 comments on commit 0262ea1

Please sign in to comment.