
Commit 63b6da3

Peter Zijlstra authored and Ingo Molnar committed
perf: Fix perf_event_exit_task() race
There is a race against perf_event_exit_task() vs event_function_call(), find_get_context(), perf_install_in_context() (iow, everyone).

Since there is no permanent marker on a context that it's dead, it is quite possible that we access (and even modify) a context after it has passed through perf_event_exit_task().

For instance, find_get_context() might find the context still installed, but by the time we get to perf_install_in_context() it might already have passed through perf_event_exit_task() and be considered dead; we will however still add the event to it.

Solve this by marking a ctx dead: set its ctx->task value to -1. It must be !0 so we still know it is a (former) task context.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: David Ahern <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Namhyung Kim <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Stephane Eranian <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Vince Weaver <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
1 parent c97f473 commit 63b6da3
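The fix boils down to a tombstone-pointer pattern: the exit path never leaves a dead context looking alive; it overwrites ctx->task with a sentinel address under ctx->lock, and every later path that takes the same lock first checks for that sentinel and bails out instead of installing or modifying anything. The sentinel is -1 rather than NULL because NULL already means "CPU context", so a dead context must stay recognizable as a (former) task context. Below is a minimal, self-contained userspace sketch of that pattern; the struct ctx, ctx_exit() and ctx_install() names are invented for illustration, and a pthread mutex stands in for the kernel's raw spinlock, so this is not the perf code itself.

#include <pthread.h>
#include <stdio.h>

/* Sentinel: non-NULL, so a dead context is still recognizably a task context. */
#define TASK_TOMBSTONE ((struct task *)-1L)

struct task { int pid; };

struct ctx {
	pthread_mutex_t lock;
	struct task *task;	/* live task, NULL (cpu ctx), or TASK_TOMBSTONE */
	int nr_events;
};

/* Exit path: mark the context dead while holding ctx->lock. */
static void ctx_exit(struct ctx *ctx)
{
	pthread_mutex_lock(&ctx->lock);
	ctx->task = TASK_TOMBSTONE;
	pthread_mutex_unlock(&ctx->lock);
}

/* Install path: re-check for the tombstone under the same lock
 * before touching the context, mirroring the check added to
 * perf_install_in_context() in this commit. */
static int ctx_install(struct ctx *ctx)
{
	int ret = 0;

	pthread_mutex_lock(&ctx->lock);
	if (ctx->task == TASK_TOMBSTONE)
		ret = -1;		/* context already dead, refuse to add */
	else
		ctx->nr_events++;	/* safe: exit cannot slip in, we hold the lock */
	pthread_mutex_unlock(&ctx->lock);

	return ret;
}

int main(void)
{
	struct task t = { .pid = 42 };
	struct ctx c = { .lock = PTHREAD_MUTEX_INITIALIZER, .task = &t };

	printf("install before exit: %d\n", ctx_install(&c));	/* 0  */
	ctx_exit(&c);
	printf("install after exit:  %d\n", ctx_install(&c));	/* -1 */
	return 0;
}

Running the sketch prints 0 for the install attempted before exit and -1 afterwards. The key property, as in the kernel change, is that the tombstone check and the mutation happen under the same lock the exit path held when it planted the tombstone.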

File tree

1 file changed: 85 additions, 66 deletions

kernel/events/core.c

Lines changed: 85 additions & 66 deletions
@@ -148,6 +148,13 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
 	raw_spin_unlock(&cpuctx->ctx.lock);
 }
 
+#define TASK_TOMBSTONE ((void *)-1L)
+
+static bool is_kernel_event(struct perf_event *event)
+{
+	return event->owner == TASK_TOMBSTONE;
+}
+
 /*
  * On task ctx scheduling...
  *
@@ -196,31 +203,21 @@ static int event_function(void *info)
 	struct perf_event_context *ctx = event->ctx;
 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 	struct perf_event_context *task_ctx = cpuctx->task_ctx;
+	int ret = 0;
 
 	WARN_ON_ONCE(!irqs_disabled());
 
+	perf_ctx_lock(cpuctx, task_ctx);
 	/*
 	 * Since we do the IPI call without holding ctx->lock things can have
 	 * changed, double check we hit the task we set out to hit.
-	 *
-	 * If ctx->task == current, we know things must remain valid because
-	 * we have IRQs disabled so we cannot schedule.
 	 */
 	if (ctx->task) {
-		if (ctx->task != current)
-			return -EAGAIN;
-
-		WARN_ON_ONCE(task_ctx != ctx);
-	} else {
-		WARN_ON_ONCE(&cpuctx->ctx != ctx);
-	}
+		if (ctx->task != current) {
+			ret = -EAGAIN;
+			goto unlock;
+		}
 
-	perf_ctx_lock(cpuctx, task_ctx);
-	/*
-	 * Now that we hold locks, double check state. Paranoia pays.
-	 */
-	if (task_ctx) {
-		WARN_ON_ONCE(task_ctx->task != current);
 		/*
 		 * We only use event_function_call() on established contexts,
 		 * and event_function() is only ever called when active (or
@@ -233,12 +230,16 @@ static int event_function(void *info)
 		 * And since we have ctx->is_active, cpuctx->task_ctx must
 		 * match.
 		 */
-		WARN_ON_ONCE(cpuctx->task_ctx != task_ctx);
+		WARN_ON_ONCE(task_ctx != ctx);
+	} else {
+		WARN_ON_ONCE(&cpuctx->ctx != ctx);
 	}
+
 	efs->func(event, cpuctx, ctx, efs->data);
+unlock:
 	perf_ctx_unlock(cpuctx, task_ctx);
 
-	return 0;
+	return ret;
 }
 
 static void event_function_local(struct perf_event *event, event_f func, void *data)
@@ -256,7 +257,7 @@ static void event_function_local(struct perf_event *event, event_f func, void *d
 static void event_function_call(struct perf_event *event, event_f func, void *data)
 {
 	struct perf_event_context *ctx = event->ctx;
-	struct task_struct *task = ctx->task;
+	struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
 	struct event_function_struct efs = {
 		.event = event,
 		.func = func,
@@ -278,30 +279,28 @@ static void event_function_call(struct perf_event *event, event_f func, void *da
 	}
 
 again:
+	if (task == TASK_TOMBSTONE)
+		return;
+
 	if (!task_function_call(task, event_function, &efs))
 		return;
 
 	raw_spin_lock_irq(&ctx->lock);
-	if (ctx->is_active) {
-		/*
-		 * Reload the task pointer, it might have been changed by
-		 * a concurrent perf_event_context_sched_out().
-		 */
-		task = ctx->task;
-		raw_spin_unlock_irq(&ctx->lock);
-		goto again;
+	/*
+	 * Reload the task pointer, it might have been changed by
+	 * a concurrent perf_event_context_sched_out().
+	 */
+	task = ctx->task;
+	if (task != TASK_TOMBSTONE) {
+		if (ctx->is_active) {
+			raw_spin_unlock_irq(&ctx->lock);
+			goto again;
+		}
+		func(event, NULL, ctx, data);
 	}
-	func(event, NULL, ctx, data);
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
-#define EVENT_OWNER_KERNEL ((void *) -1)
-
-static bool is_kernel_event(struct perf_event *event)
-{
-	return event->owner == EVENT_OWNER_KERNEL;
-}
-
 #define PERF_FLAG_ALL	(PERF_FLAG_FD_NO_GROUP |\
 			 PERF_FLAG_FD_OUTPUT  |\
 			 PERF_FLAG_PID_CGROUP |\
@@ -1025,7 +1024,7 @@ static void put_ctx(struct perf_event_context *ctx)
 	if (atomic_dec_and_test(&ctx->refcount)) {
 		if (ctx->parent_ctx)
 			put_ctx(ctx->parent_ctx);
-		if (ctx->task)
+		if (ctx->task && ctx->task != TASK_TOMBSTONE)
 			put_task_struct(ctx->task);
 		call_rcu(&ctx->rcu_head, free_ctx);
 	}
@@ -1186,6 +1185,7 @@ static u64 primary_event_id(struct perf_event *event)
 
 /*
  * Get the perf_event_context for a task and lock it.
+ *
  * This has to cope with with the fact that until it is locked,
  * the context could get moved to another task.
  */
@@ -1226,10 +1226,13 @@ perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
 			goto retry;
 		}
 
-		if (!atomic_inc_not_zero(&ctx->refcount)) {
+		if (ctx->task == TASK_TOMBSTONE ||
+		    !atomic_inc_not_zero(&ctx->refcount)) {
 			raw_spin_unlock(&ctx->lock);
 			ctx = NULL;
 		}
+
+		WARN_ON_ONCE(ctx->task != task);
 	}
 	rcu_read_unlock();
 	if (!ctx)
@@ -2140,23 +2143,27 @@ static int __perf_install_in_context(void *info)
 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 	struct perf_event_context *task_ctx = cpuctx->task_ctx;
 
+	raw_spin_lock(&cpuctx->ctx.lock);
 	if (ctx->task) {
+		raw_spin_lock(&ctx->lock);
 		/*
 		 * If we hit the 'wrong' task, we've since scheduled and
 		 * everything should be sorted, nothing to do!
 		 */
+		task_ctx = ctx;
 		if (ctx->task != current)
-			return 0;
+			goto unlock;
 
 		/*
 		 * If task_ctx is set, it had better be to us.
 		 */
 		WARN_ON_ONCE(cpuctx->task_ctx != ctx && cpuctx->task_ctx);
-		task_ctx = ctx;
+	} else if (task_ctx) {
+		raw_spin_lock(&task_ctx->lock);
 	}
 
-	perf_ctx_lock(cpuctx, task_ctx);
 	ctx_resched(cpuctx, task_ctx);
+unlock:
 	perf_ctx_unlock(cpuctx, task_ctx);
 
 	return 0;
@@ -2188,14 +2195,24 @@ perf_install_in_context(struct perf_event_context *ctx,
 	 * happened and that will have taken care of business.
 	 */
 	raw_spin_lock_irq(&ctx->lock);
+	task = ctx->task;
+	/*
+	 * Worse, we cannot even rely on the ctx actually existing anymore. If
+	 * between find_get_context() and perf_install_in_context() the task
+	 * went through perf_event_exit_task() its dead and we should not be
+	 * adding new events.
+	 */
+	if (task == TASK_TOMBSTONE) {
+		raw_spin_unlock_irq(&ctx->lock);
+		return;
+	}
 	update_context_time(ctx);
 	/*
 	 * Update cgrp time only if current cgrp matches event->cgrp.
 	 * Must be done before calling add_event_to_ctx().
 	 */
 	update_cgrp_time_from_event(event);
 	add_event_to_ctx(event, ctx);
-	task = ctx->task;
 	raw_spin_unlock_irq(&ctx->lock);
 
 	if (task)
@@ -2538,17 +2555,21 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 		raw_spin_lock(&ctx->lock);
 		raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
 		if (context_equiv(ctx, next_ctx)) {
-			/*
-			 * XXX do we need a memory barrier of sorts
-			 * wrt to rcu_dereference() of perf_event_ctxp
-			 */
-			task->perf_event_ctxp[ctxn] = next_ctx;
-			next->perf_event_ctxp[ctxn] = ctx;
-			ctx->task = next;
-			next_ctx->task = task;
+			WRITE_ONCE(ctx->task, next);
+			WRITE_ONCE(next_ctx->task, task);
 
 			swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
 
+			/*
+			 * RCU_INIT_POINTER here is safe because we've not
+			 * modified the ctx and the above modification of
+			 * ctx->task and ctx->task_ctx_data are immaterial
+			 * since those values are always verified under
+			 * ctx->lock which we're now holding.
+			 */
+			RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
+			RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
+
 			do_switch = 0;
 
 			perf_event_sync_stat(ctx, next_ctx);
@@ -8545,7 +8566,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 	}
 
 	/* Mark owner so we could distinguish it from user events. */
-	event->owner = EVENT_OWNER_KERNEL;
+	event->owner = TASK_TOMBSTONE;
 
 	account_event(event);
 
@@ -8725,28 +8746,26 @@ __perf_event_exit_task(struct perf_event *child_event,
 
 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 {
-	struct perf_event *child_event, *next;
 	struct perf_event_context *child_ctx, *clone_ctx = NULL;
+	struct perf_event *child_event, *next;
+	unsigned long flags;
 
-	if (likely(!child->perf_event_ctxp[ctxn]))
+	WARN_ON_ONCE(child != current);
+
+	child_ctx = perf_lock_task_context(child, ctxn, &flags);
+	if (!child_ctx)
 		return;
 
-	local_irq_disable();
-	WARN_ON_ONCE(child != current);
-	/*
-	 * We can't reschedule here because interrupts are disabled,
-	 * and child must be current.
-	 */
-	child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
+	task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
 
 	/*
-	 * Take the context lock here so that if find_get_context is
-	 * reading child->perf_event_ctxp, we wait until it has
-	 * incremented the context's refcount before we do put_ctx below.
+	 * Now that the context is inactive, destroy the task <-> ctx relation
+	 * and mark the context dead.
 	 */
-	raw_spin_lock(&child_ctx->lock);
-	task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
-	child->perf_event_ctxp[ctxn] = NULL;
+	RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
+	put_ctx(child_ctx); /* cannot be last */
+	WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
+	put_task_struct(current); /* cannot be last */
 
 	/*
 	 * If this context is a clone; unclone it so it can't get
@@ -8755,7 +8774,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 	 */
 	clone_ctx = unclone_ctx(child_ctx);
 	update_context_time(child_ctx);
-	raw_spin_unlock_irq(&child_ctx->lock);
+	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
 
 	if (clone_ctx)
 		put_ctx(clone_ctx);
