Skip to content

Commit dfe781e

Browse files
committed
runtime: fix coro interactions with thread-locked goroutines
This change fixes problems with thread-locked goroutines using newcoro/coroswitch/etc. Currently, the coro paths do not consider thread-locked goroutines at all and can quickly result in broken scheduler state or lost/leaked goroutines. One possible fix to these issues is to fall back on goroutine+channel semantics, but that turns out to be fairly complicated to implement and results in significant performance cliffs. More complex thread-lock state donation tricks also result in some fairly complicated state tracking that doesn't seem worth it given the use-cases of iter.Pull (and even then, there will be performance cliffs). This change implements a much simpler, but more restrictive semantics. In particular, thread-lock state is tied to the coro at the first call to newcoro (i.e. iter.Pull). From then on, the invariant is that if the coro has any thread-lock state *or* a goroutine calling into coroswitch has any thread-lock state, that the full gamut of thread-lock state must remain the same as it was when newcoro was called (the full gamut meaning internal and external lock counts as well as the identity of the thread that was locked to). This semantics allows the common cases to be always fast, but comes with a non-orthogonality caveat. Specifically, when iter.Pull is used in conjunction with thread-locked goroutines, complex cases (passing next between goroutines or passing yield between goroutines) are likely to fail. Simple cases, where any number of iter.Pull iterators are used in a straightforward way (nested, in series, etc.) from the same goroutine, will work and will be guaranteed to be fast regardless of thread-lock state. This is a compromise for the near-term and we may consider lifting the restrictions imposed by this CL in the future. Fixes #65889. Fixes #65946. Change-Id: I3fb5791e36a61f5ded50226a229a79d28739b24e Reviewed-on: https://go-review.googlesource.com/c/go/+/583675 LUCI-TryBot-Result: Go LUCI <[email protected]> Reviewed-by: David Chase <[email protected]> Reviewed-by: Austin Clements <[email protected]>
1 parent 5890b02 commit dfe781e

File tree

6 files changed

+516
-13
lines changed

6 files changed

+516
-13
lines changed

src/runtime/coro.go

Lines changed: 55 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,11 @@ import "unsafe"
2424
type coro struct {
2525
gp guintptr
2626
f func(*coro)
27+
28+
// State for validating thread-lock interactions.
29+
mp *m
30+
lockedExt uint32 // mp's external LockOSThread counter at coro creation time.
31+
lockedInt uint32 // mp's internal lockOSThread counter at coro creation time.
2732
}
2833

2934
//go:linkname newcoro
@@ -37,9 +42,18 @@ func newcoro(f func(*coro)) *coro {
3742
pc := getcallerpc()
3843
gp := getg()
3944
systemstack(func() {
45+
mp := gp.m
4046
start := corostart
4147
startfv := *(**funcval)(unsafe.Pointer(&start))
4248
gp = newproc1(startfv, gp, pc, true, waitReasonCoroutine)
49+
50+
// Scribble down locked thread state if needed and/or donate
51+
// thread-lock state to the new goroutine.
52+
if mp.lockedExt+mp.lockedInt != 0 {
53+
c.mp = mp
54+
c.lockedExt = mp.lockedExt
55+
c.lockedInt = mp.lockedInt
56+
}
4357
})
4458
gp.coroarg = c
4559
c.gp.set(gp)
@@ -90,17 +104,28 @@ func coroswitch(c *coro) {
90104
// It is important not to add more atomic operations or other
91105
// expensive operations to the fast path.
92106
func coroswitch_m(gp *g) {
93-
// TODO(go.dev/issue/65889): Something really nasty will happen if either
94-
// goroutine in this handoff tries to lock itself to an OS thread.
95-
// There's an explicit multiplexing going on here that needs to be
96-
// disabled if either the consumer or the iterator ends up in such
97-
// a state.
98107
c := gp.coroarg
99108
gp.coroarg = nil
100109
exit := gp.coroexit
101110
gp.coroexit = false
102111
mp := gp.m
103112

113+
// Track and validate thread-lock interactions.
114+
//
115+
// The rules with thread-lock interactions are simple. When a coro goroutine is switched to,
116+
// the same thread must be used, and the locked state must match with the thread-lock state of
117+
// the goroutine which called newcoro. Thread-lock state consists of the thread and the number
118+
// of internal (cgo callback, etc.) and external (LockOSThread) thread locks.
119+
locked := gp.lockedm != 0
120+
if c.mp != nil || locked {
121+
if mp != c.mp || mp.lockedInt != c.lockedInt || mp.lockedExt != c.lockedExt {
122+
print("coro: got thread ", unsafe.Pointer(mp), ", want ", unsafe.Pointer(c.mp), "\n")
123+
print("coro: got lock internal ", mp.lockedInt, ", want ", c.lockedInt, "\n")
124+
print("coro: got lock external ", mp.lockedExt, ", want ", c.lockedExt, "\n")
125+
throw("coro: OS thread locking must match locking at coroutine creation")
126+
}
127+
}
128+
104129
// Acquire tracer for writing for the duration of this call.
105130
//
106131
// There's a lot of state manipulation performed with shortcuts
@@ -109,11 +134,18 @@ func coroswitch_m(gp *g) {
109134
// emitting an event for every single transition.
110135
trace := traceAcquire()
111136

137+
if locked {
138+
// Detach the goroutine from the thread; we'll attach to the goroutine we're
139+
// switching to before returning.
140+
gp.lockedm.set(nil)
141+
}
142+
112143
if exit {
113-
// TODO(65889): If we're locked to the current OS thread and
114-
// we exit here while tracing is enabled, we're going to end up
115-
// in a really bad place (traceAcquire also calls acquirem; there's
116-
// no releasem before the thread exits).
144+
// The M might have a non-zero OS thread lock count when we get here, gdestroy
145+
// will avoid destroying the M if the G isn't explicitly locked to it via lockedm,
146+
// which we cleared above. It's fine to gdestroy here also, even when locked to
147+
// the thread, because we'll be switching back to another goroutine anyway, which
148+
// will take back its thread-lock state before returning.
117149
gdestroy(gp)
118150
gp = nil
119151
} else {
@@ -156,6 +188,14 @@ func coroswitch_m(gp *g) {
156188
}
157189
}
158190

191+
// Check if we're switching to ourselves. This case is able to break our
192+
// thread-lock invariants and an unbuffered channel implementation of
193+
// coroswitch would deadlock. It's clear that this case should just not
194+
// work.
195+
if gnext == gp {
196+
throw("coroswitch of a goroutine to itself")
197+
}
198+
159199
// Emit the trace event after getting gnext but before changing curg.
160200
// GoSwitch expects that the current G is running and that we haven't
161201
// switched yet for correct status emission.
@@ -175,6 +215,12 @@ func coroswitch_m(gp *g) {
175215
casgstatus(gnext, _Grunnable, _Grunning)
176216
}
177217

218+
// Donate locked state.
219+
if locked {
220+
mp.lockedg.set(gnext)
221+
gnext.lockedm.set(mp)
222+
}
223+
178224
// Release the trace locker. We've completed all the necessary transitions..
179225
if trace.ok() {
180226
traceRelease(trace)

src/runtime/coro_test.go

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
// Copyright 2024 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
package runtime_test
6+
7+
import (
8+
"runtime"
9+
"strings"
10+
"testing"
11+
)
12+
13+
func TestCoroLockOSThread(t *testing.T) {
14+
for _, test := range []string{
15+
"CoroLockOSThreadIterLock",
16+
"CoroLockOSThreadIterLockYield",
17+
"CoroLockOSThreadLock",
18+
"CoroLockOSThreadLockIterNested",
19+
"CoroLockOSThreadLockIterLock",
20+
"CoroLockOSThreadLockIterLockYield",
21+
"CoroLockOSThreadLockIterYieldNewG",
22+
"CoroLockOSThreadLockAfterPull",
23+
"CoroLockOSThreadStopLocked",
24+
"CoroLockOSThreadStopLockedIterNested",
25+
} {
26+
t.Run(test, func(t *testing.T) {
27+
checkCoroTestProgOutput(t, runTestProg(t, "testprog", test))
28+
})
29+
}
30+
}
31+
32+
func TestCoroCgoCallback(t *testing.T) {
33+
if runtime.GOOS == "windows" {
34+
t.Skip("coro cgo callback tests not supported on Windows")
35+
}
36+
for _, test := range []string{
37+
"CoroCgoIterCallback",
38+
"CoroCgoIterCallbackYield",
39+
"CoroCgoCallback",
40+
"CoroCgoCallbackIterNested",
41+
"CoroCgoCallbackIterCallback",
42+
"CoroCgoCallbackIterCallbackYield",
43+
"CoroCgoCallbackAfterPull",
44+
"CoroCgoStopCallback",
45+
"CoroCgoStopCallbackIterNested",
46+
} {
47+
t.Run(test, func(t *testing.T) {
48+
checkCoroTestProgOutput(t, runTestProg(t, "testprogcgo", test))
49+
})
50+
}
51+
}
52+
53+
func checkCoroTestProgOutput(t *testing.T, output string) {
54+
t.Helper()
55+
56+
c := strings.SplitN(output, "\n", 2)
57+
if len(c) == 1 {
58+
t.Fatalf("expected at least one complete line in the output, got:\n%s", output)
59+
}
60+
expect, ok := strings.CutPrefix(c[0], "expect: ")
61+
if !ok {
62+
t.Fatalf("expected first line of output to start with \"expect: \", got: %q", c[0])
63+
}
64+
rest := c[1]
65+
if expect == "OK" && rest != "OK\n" {
66+
t.Fatalf("expected just 'OK' in the output, got:\n%s", rest)
67+
}
68+
if !strings.Contains(rest, expect) {
69+
t.Fatalf("expected %q in the output, got:\n%s", expect, rest)
70+
}
71+
}

src/runtime/crash_test.go

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,23 @@ func buildTestProg(t *testing.T, binary string, flags ...string) (string, error)
168168
cmd := exec.Command(testenv.GoToolPath(t), append([]string{"build", "-o", exe}, flags...)...)
169169
t.Logf("running %v", cmd)
170170
cmd.Dir = "testdata/" + binary
171-
out, err := testenv.CleanCmdEnv(cmd).CombinedOutput()
171+
cmd = testenv.CleanCmdEnv(cmd)
172+
173+
// Add the rangefunc GOEXPERIMENT unconditionally since some tests depend on it.
174+
// TODO(61405): Remove this once it's enabled by default.
175+
edited := false
176+
for i := range cmd.Env {
177+
e := cmd.Env[i]
178+
if _, vars, ok := strings.Cut(e, "GOEXPERIMENT="); ok {
179+
cmd.Env[i] = "GOEXPERIMENT=" + vars + ",rangefunc"
180+
edited = true
181+
}
182+
}
183+
if !edited {
184+
cmd.Env = append(cmd.Env, "GOEXPERIMENT=rangefunc")
185+
}
186+
187+
out, err := cmd.CombinedOutput()
172188
if err != nil {
173189
target.err = fmt.Errorf("building %s %v: %v\n%s", binary, flags, err, out)
174190
} else {

src/runtime/proc.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4217,9 +4217,9 @@ func gdestroy(gp *g) {
42174217
return
42184218
}
42194219

4220-
if mp.lockedInt != 0 {
4221-
print("invalid m->lockedInt = ", mp.lockedInt, "\n")
4222-
throw("internal lockOSThread error")
4220+
if locked && mp.lockedInt != 0 {
4221+
print("runtime: mp.lockedInt = ", mp.lockedInt, "\n")
4222+
throw("exited a goroutine internally locked to the OS thread")
42234223
}
42244224
gfput(pp, gp)
42254225
if locked {

0 commit comments

Comments
 (0)