@@ -2369,6 +2369,47 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
 }
 EXPORT_SYMBOL_GPL(task_cgroup_path);
 
+/**
+ * cgroup_attach_lock - Lock for ->attach()
+ * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
+ *
+ * cgroup migration sometimes needs to stabilize threadgroups against forks and
+ * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
+ * implementations (e.g. cpuset), also need to disable CPU hotplug.
+ * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can
+ * lead to deadlocks.
+ *
+ * Bringing up a CPU may involve creating and destroying tasks which requires
+ * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
+ * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while
+ * write-locking threadgroup_rwsem, the locking order is reversed and we end up
+ * waiting for an on-going CPU hotplug operation which in turn is waiting for
+ * the threadgroup_rwsem to be released to create new tasks. For more details:
+ *
+ * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
+ *
+ * Resolve the situation by always acquiring cpus_read_lock() before optionally
+ * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
+ * CPU hotplug is disabled on entry.
+ */
+static void cgroup_attach_lock(bool lock_threadgroup)
+{
+	cpus_read_lock();
+	if (lock_threadgroup)
+		percpu_down_write(&cgroup_threadgroup_rwsem);
+}
+
+/**
+ * cgroup_attach_unlock - Undo cgroup_attach_lock()
+ * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
+ */
+static void cgroup_attach_unlock(bool lock_threadgroup)
+{
+	if (lock_threadgroup)
+		percpu_up_write(&cgroup_threadgroup_rwsem);
+	cpus_read_unlock();
+}
+
 /**
  * cgroup_migrate_add_task - add a migration target task to a migration context
  * @task: target task
@@ -2841,8 +2882,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
 }
 
 struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
-					     bool *locked)
-	__acquires(&cgroup_threadgroup_rwsem)
+					     bool *threadgroup_locked)
 {
 	struct task_struct *tsk;
 	pid_t pid;
@@ -2859,12 +2899,8 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
 	 * Therefore, we can skip the global lock.
 	 */
 	lockdep_assert_held(&cgroup_mutex);
-	if (pid || threadgroup) {
-		percpu_down_write(&cgroup_threadgroup_rwsem);
-		*locked = true;
-	} else {
-		*locked = false;
-	}
+	*threadgroup_locked = pid || threadgroup;
+	cgroup_attach_lock(*threadgroup_locked);
 
 	rcu_read_lock();
 	if (pid) {
@@ -2895,26 +2931,23 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
 	goto out_unlock_rcu;
 
 out_unlock_threadgroup:
-	if (*locked) {
-		percpu_up_write(&cgroup_threadgroup_rwsem);
-		*locked = false;
-	}
+	cgroup_attach_unlock(*threadgroup_locked);
+	*threadgroup_locked = false;
 out_unlock_rcu:
 	rcu_read_unlock();
 	return tsk;
 }
 
-void cgroup_procs_write_finish(struct task_struct *task, bool locked)
-	__releases(&cgroup_threadgroup_rwsem)
+void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
 {
 	struct cgroup_subsys *ss;
 	int ssid;
 
 	/* release reference from cgroup_procs_write_start() */
 	put_task_struct(task);
 
-	if (locked)
-		percpu_up_write(&cgroup_threadgroup_rwsem);
+	cgroup_attach_unlock(threadgroup_locked);
+
 	for_each_subsys(ss, ssid)
 		if (ss->post_attach)
 			ss->post_attach();
@@ -3000,8 +3033,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 	 * write-locking can be skipped safely.
 	 */
 	has_tasks = !list_empty(&mgctx.preloaded_src_csets);
-	if (has_tasks)
-		percpu_down_write(&cgroup_threadgroup_rwsem);
+	cgroup_attach_lock(has_tasks);
 
 	/* NULL dst indicates self on default hierarchy */
 	ret = cgroup_migrate_prepare_dst(&mgctx);
@@ -3022,8 +3054,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 	ret = cgroup_migrate_execute(&mgctx);
 out_finish:
 	cgroup_migrate_finish(&mgctx);
-	if (has_tasks)
-		percpu_up_write(&cgroup_threadgroup_rwsem);
+	cgroup_attach_unlock(has_tasks);
 	return ret;
 }
 
@@ -4971,13 +5002,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
 	struct task_struct *task;
 	const struct cred *saved_cred;
 	ssize_t ret;
-	bool locked;
+	bool threadgroup_locked;
 
 	dst_cgrp = cgroup_kn_lock_live(of->kn, false);
 	if (!dst_cgrp)
 		return -ENODEV;
 
-	task = cgroup_procs_write_start(buf, threadgroup, &locked);
+	task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked);
 	ret = PTR_ERR_OR_ZERO(task);
 	if (ret)
 		goto out_unlock;
@@ -5003,7 +5034,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
 	ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
 
 out_finish:
-	cgroup_procs_write_finish(task, locked);
+	cgroup_procs_write_finish(task, threadgroup_locked);
 out_unlock:
 	cgroup_kn_unlock(of->kn);
 
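The helpers above establish a fixed ordering: cpus_read_lock() is always taken before, and released after, cgroup_threadgroup_rwsem, so every ->attach() callback now runs with CPU hotplug already disabled. As a rough sketch of what that permits (not part of this diff; the controller callback name below is hypothetical, while lockdep_assert_cpus_held() and cgroup_taskset_for_each() are existing kernel helpers), an ->attach() implementation can assert that the hotplug lock is held instead of calling cpus_read_lock() itself, which is the acquisition that previously inverted the lock order:

#include <linux/cgroup.h>
#include <linux/cpu.h>

/* Hypothetical controller callback, shown only to illustrate the new rule. */
static void example_css_attach(struct cgroup_taskset *tset)
{
	struct cgroup_subsys_state *css;
	struct task_struct *task;

	/*
	 * cgroup_attach_lock() has already taken cpus_read_lock() on our
	 * behalf, so asserting is enough; acquiring the lock here again is
	 * what used to reverse the ordering against cgroup_threadgroup_rwsem.
	 */
	lockdep_assert_cpus_held();

	cgroup_taskset_for_each(task, css, tset) {
		/* per-task attach work that relies on a stable set of CPUs */
	}
}

Passing a bool into cgroup_attach_lock()/cgroup_attach_unlock() keeps the existing fast paths (a single-thread move, or a csses update with no tasks) that skip write-locking cgroup_threadgroup_rwsem, while making the cpus_read_lock() ordering unconditional.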