@@ -148,6 +148,13 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
 	raw_spin_unlock(&cpuctx->ctx.lock);
 }
 
+#define TASK_TOMBSTONE ((void *)-1L)
+
+static bool is_kernel_event(struct perf_event *event)
+{
+	return event->owner == TASK_TOMBSTONE;
+}
+
 /*
  * On task ctx scheduling...
  *
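A note on the sentinel: TASK_TOMBSTONE is an impossible pointer value, distinct from NULL, so ctx->task can encode three states: NULL for a per-CPU context, a real task pointer for a live task context, and the tombstone once the owning task has exited; the same value also replaces EVENT_OWNER_KERNEL for event->owner. A minimal userspace sketch of the idea, with made-up names (struct owner, ctx_state()), not kernel API:

#include <stdio.h>

/* Sentinel: an impossible pointer value, the same trick as TASK_TOMBSTONE. */
#define OWNER_TOMBSTONE ((struct owner *)-1L)

struct owner { const char *name; };

struct ctx {
	struct owner *owner;	/* NULL: per-CPU ctx; tombstone: owner exited */
};

static const char *ctx_state(const struct ctx *c)
{
	if (!c->owner)
		return "cpu context (never had an owning task)";
	if (c->owner == OWNER_TOMBSTONE)
		return "dead: owning task exited";
	return c->owner->name;
}

int main(void)
{
	struct owner task = { "task 1234" };
	struct ctx a = { NULL }, b = { &task }, c = { OWNER_TOMBSTONE };

	printf("%s\n%s\n%s\n", ctx_state(&a), ctx_state(&b), ctx_state(&c));
	return 0;
}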
@@ -196,31 +203,21 @@ static int event_function(void *info)
 	struct perf_event_context *ctx = event->ctx;
 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 	struct perf_event_context *task_ctx = cpuctx->task_ctx;
+	int ret = 0;
 
 	WARN_ON_ONCE(!irqs_disabled());
 
+	perf_ctx_lock(cpuctx, task_ctx);
 	/*
 	 * Since we do the IPI call without holding ctx->lock things can have
 	 * changed, double check we hit the task we set out to hit.
-	 *
-	 * If ctx->task == current, we know things must remain valid because
-	 * we have IRQs disabled so we cannot schedule.
 	 */
 	if (ctx->task) {
-		if (ctx->task != current)
-			return -EAGAIN;
-
-		WARN_ON_ONCE(task_ctx != ctx);
-	} else {
-		WARN_ON_ONCE(&cpuctx->ctx != ctx);
-	}
+		if (ctx->task != current) {
+			ret = -EAGAIN;
+			goto unlock;
+		}
 
-	perf_ctx_lock(cpuctx, task_ctx);
-	/*
-	 * Now that we hold locks, double check state. Paranoia pays.
-	 */
-	if (task_ctx) {
-		WARN_ON_ONCE(task_ctx->task != current);
 		/*
 		 * We only use event_function_call() on established contexts,
 		 * and event_function() is only ever called when active (or
@@ -233,12 +230,16 @@ static int event_function(void *info)
 		 * And since we have ctx->is_active, cpuctx->task_ctx must
 		 * match.
 		 */
-		WARN_ON_ONCE(cpuctx->task_ctx != task_ctx);
+		WARN_ON_ONCE(task_ctx != ctx);
+	} else {
+		WARN_ON_ONCE(&cpuctx->ctx != ctx);
 	}
+
 	efs->func(event, cpuctx, ctx, efs->data);
+unlock:
 	perf_ctx_unlock(cpuctx, task_ctx);
 
-	return 0;
+	return ret;
 }
 
 static void event_function_local(struct perf_event *event, event_f func, void *data)
@@ -256,7 +257,7 @@ static void event_function_local(struct perf_event *event, event_f func, void *d
 static void event_function_call(struct perf_event *event, event_f func, void *data)
 {
 	struct perf_event_context *ctx = event->ctx;
-	struct task_struct *task = ctx->task;
+	struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
 	struct event_function_struct efs = {
 		.event = event,
 		.func = func,
@@ -278,30 +279,28 @@ static void event_function_call(struct perf_event *event, event_f func, void *da
 	}
 
 again:
+	if (task == TASK_TOMBSTONE)
+		return;
+
 	if (!task_function_call(task, event_function, &efs))
 		return;
 
 	raw_spin_lock_irq(&ctx->lock);
-	if (ctx->is_active) {
-		/*
-		 * Reload the task pointer, it might have been changed by
-		 * a concurrent perf_event_context_sched_out().
-		 */
-		task = ctx->task;
-		raw_spin_unlock_irq(&ctx->lock);
-		goto again;
+	/*
+	 * Reload the task pointer, it might have been changed by
+	 * a concurrent perf_event_context_sched_out().
+	 */
+	task = ctx->task;
+	if (task != TASK_TOMBSTONE) {
+		if (ctx->is_active) {
+			raw_spin_unlock_irq(&ctx->lock);
+			goto again;
+		}
+		func(event, NULL, ctx, data);
 	}
-	func(event, NULL, ctx, data);
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
-#define EVENT_OWNER_KERNEL ((void *) -1)
-
-static bool is_kernel_event(struct perf_event *event)
-{
-	return event->owner == EVENT_OWNER_KERNEL;
-}
-
 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
 		       PERF_FLAG_FD_OUTPUT  |\
 		       PERF_FLAG_PID_CGROUP |\
@@ -1025,7 +1024,7 @@ static void put_ctx(struct perf_event_context *ctx)
 	if (atomic_dec_and_test(&ctx->refcount)) {
 		if (ctx->parent_ctx)
 			put_ctx(ctx->parent_ctx);
-		if (ctx->task)
+		if (ctx->task && ctx->task != TASK_TOMBSTONE)
 			put_task_struct(ctx->task);
 		call_rcu(&ctx->rcu_head, free_ctx);
 	}
@@ -1186,6 +1185,7 @@ static u64 primary_event_id(struct perf_event *event)
 
 /*
  * Get the perf_event_context for a task and lock it.
+ *
  * This has to cope with with the fact that until it is locked,
  * the context could get moved to another task.
  */
@@ -1226,10 +1226,13 @@ perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
 			goto retry;
 		}
 
-		if (!atomic_inc_not_zero(&ctx->refcount)) {
+		if (ctx->task == TASK_TOMBSTONE ||
+		    !atomic_inc_not_zero(&ctx->refcount)) {
 			raw_spin_unlock(&ctx->lock);
 			ctx = NULL;
+		} else {
+			WARN_ON_ONCE(ctx->task != task);
 		}
 	}
 	rcu_read_unlock();
 	if (!ctx)
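The reworked check takes both guards under ctx->lock: give up if the context has already been tombstoned, and otherwise take a reference only if the refcount has not already dropped to zero. A rough userspace sketch of that "increment only if nonzero" step using C11 atomics; the ctx_ref type and get_ctx_ref() helper are invented for illustration and are not the kernel's atomic_inc_not_zero():

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical refcounted object; the kernel uses atomic_t helpers instead. */
struct ctx_ref {
	atomic_int refcount;
};

/*
 * Take a reference only if the object is still live (refcount > 0).
 * Returns false when a concurrent "last put" already dropped it to zero,
 * in which case the caller must treat the object as gone.
 */
static bool get_ctx_ref(struct ctx_ref *c)
{
	int old = atomic_load_explicit(&c->refcount, memory_order_relaxed);

	do {
		if (old == 0)
			return false;	/* already dying, do not resurrect */
	} while (!atomic_compare_exchange_weak_explicit(&c->refcount, &old, old + 1,
							memory_order_acquire,
							memory_order_relaxed));
	return true;
}

int main(void)
{
	struct ctx_ref live = { 1 }, dead = { 0 };

	printf("live: %s\n", get_ctx_ref(&live) ? "got ref" : "gone");
	printf("dead: %s\n", get_ctx_ref(&dead) ? "got ref" : "gone");
	return 0;
}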
@@ -2140,23 +2143,27 @@ static int __perf_install_in_context(void *info)
 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 	struct perf_event_context *task_ctx = cpuctx->task_ctx;
 
+	raw_spin_lock(&cpuctx->ctx.lock);
 	if (ctx->task) {
+		raw_spin_lock(&ctx->lock);
 		/*
 		 * If we hit the 'wrong' task, we've since scheduled and
 		 * everything should be sorted, nothing to do!
 		 */
+		task_ctx = ctx;
 		if (ctx->task != current)
-			return 0;
+			goto unlock;
 
 		/*
 		 * If task_ctx is set, it had better be to us.
 		 */
 		WARN_ON_ONCE(cpuctx->task_ctx != ctx && cpuctx->task_ctx);
-		task_ctx = ctx;
+	} else if (task_ctx) {
+		raw_spin_lock(&task_ctx->lock);
 	}
 
-	perf_ctx_lock(cpuctx, task_ctx);
 	ctx_resched(cpuctx, task_ctx);
+unlock:
 	perf_ctx_unlock(cpuctx, task_ctx);
 
 	return 0;
@@ -2188,14 +2195,24 @@ perf_install_in_context(struct perf_event_context *ctx,
 	 * happened and that will have taken care of business.
 	 */
 	raw_spin_lock_irq(&ctx->lock);
+	task = ctx->task;
+	/*
+	 * Worse, we cannot even rely on the ctx actually existing anymore. If
+	 * between find_get_context() and perf_install_in_context() the task
+	 * went through perf_event_exit_task() it's dead and we should not be
+	 * adding new events.
+	 */
+	if (task == TASK_TOMBSTONE) {
+		raw_spin_unlock_irq(&ctx->lock);
+		return;
+	}
 	update_context_time(ctx);
 	/*
 	 * Update cgrp time only if current cgrp matches event->cgrp.
 	 * Must be done before calling add_event_to_ctx().
 	 */
 	update_cgrp_time_from_event(event);
 	add_event_to_ctx(event, ctx);
-	task = ctx->task;
 	raw_spin_unlock_irq(&ctx->lock);
 
 	if (task)
@@ -2538,17 +2555,21 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 	raw_spin_lock(&ctx->lock);
 	raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
 	if (context_equiv(ctx, next_ctx)) {
-		/*
-		 * XXX do we need a memory barrier of sorts
-		 * wrt to rcu_dereference() of perf_event_ctxp
-		 */
-		task->perf_event_ctxp[ctxn] = next_ctx;
-		next->perf_event_ctxp[ctxn] = ctx;
-		ctx->task = next;
-		next_ctx->task = task;
+		WRITE_ONCE(ctx->task, next);
+		WRITE_ONCE(next_ctx->task, task);
 
 		swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
 
+		/*
+		 * RCU_INIT_POINTER here is safe because we've not
+		 * modified the ctx and the above modification of
+		 * ctx->task and ctx->task_ctx_data are immaterial
+		 * since those values are always verified under
+		 * ctx->lock which we're now holding.
+		 */
+		RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
+		RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
+
 		do_switch = 0;
 
 		perf_event_sync_stat(ctx, next_ctx);
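The new comment is the subtle part: rcu_assign_pointer() includes a release barrier so that a reader who sees the new pointer also sees the pointee's initialization, while RCU_INIT_POINTER() is a plain store; that is acceptable here because nothing about the swapped contexts is freshly initialized and readers re-validate ctx->task under ctx->lock anyway. A loose userspace analogy in C11 atomics; publish(), publish_relaxed() and lookup() are invented names, not the kernel macros:

#include <stdatomic.h>

struct ctx {
	int data;
};

static _Atomic(struct ctx *) slot;

/* Analogue of rcu_assign_pointer(): a release store, so a reader that sees
 * the pointer also sees the stores that initialized *c. Needed when c is new. */
static void publish(struct ctx *c)
{
	atomic_store_explicit(&slot, c, memory_order_release);
}

/* Analogue of RCU_INIT_POINTER(): a relaxed store with no ordering of its own.
 * Safe only when the pointee is already visible to readers, or when readers
 * re-validate everything under a lock, as the perf code above does. */
static void publish_relaxed(struct ctx *c)
{
	atomic_store_explicit(&slot, c, memory_order_relaxed);
}

/* Analogue of the reader side (rcu_dereference()), approximated with acquire. */
static struct ctx *lookup(void)
{
	return atomic_load_explicit(&slot, memory_order_acquire);
}

int main(void)
{
	static struct ctx a = { .data = 1 }, b = { .data = 2 };

	publish(&a);		/* a is "new" to readers: use the release form */
	publish_relaxed(&b);	/* b's contents are already established */
	return lookup() == &b ? 0 : 1;
}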
@@ -8545,7 +8566,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 	}
 
 	/* Mark owner so we could distinguish it from user events. */
-	event->owner = EVENT_OWNER_KERNEL;
+	event->owner = TASK_TOMBSTONE;
 
 	account_event(event);
 
@@ -8725,28 +8746,26 @@ __perf_event_exit_task(struct perf_event *child_event,
 
 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 {
-	struct perf_event *child_event, *next;
 	struct perf_event_context *child_ctx, *clone_ctx = NULL;
+	struct perf_event *child_event, *next;
+	unsigned long flags;
 
-	if (likely(!child->perf_event_ctxp[ctxn]))
+	WARN_ON_ONCE(child != current);
+
+	child_ctx = perf_lock_task_context(child, ctxn, &flags);
+	if (!child_ctx)
 		return;
 
-	local_irq_disable();
-	WARN_ON_ONCE(child != current);
-	/*
-	 * We can't reschedule here because interrupts are disabled,
-	 * and child must be current.
-	 */
-	child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
+	task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
 
 	/*
-	 * Take the context lock here so that if find_get_context is
-	 * reading child->perf_event_ctxp, we wait until it has
-	 * incremented the context's refcount before we do put_ctx below.
+	 * Now that the context is inactive, destroy the task <-> ctx relation
+	 * and mark the context dead.
 	 */
-	raw_spin_lock(&child_ctx->lock);
-	task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
-	child->perf_event_ctxp[ctxn] = NULL;
+	RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
+	put_ctx(child_ctx); /* cannot be last */
+	WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
+	put_task_struct(current); /* cannot be last */
 
 	/*
 	 * If this context is a clone; unclone it so it can't get
@@ -8755,7 +8774,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 	 */
 	clone_ctx = unclone_ctx(child_ctx);
 	update_context_time(child_ctx);
-	raw_spin_unlock_irq(&child_ctx->lock);
+	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
 
 	if (clone_ctx)
 		put_ctx(clone_ctx);