Message ID | 20220705123705.764-1-xuewen.yan@unisoc.com
---|---
State | New
Series | sched/schedutil: Fix deadlock between cpuset and cpu hotplug when using schedutil
Hi all,

This deadlock is inevitable, any comment?

On Tue, Jul 5, 2022 at 11:03 PM Xuewen Yan <xuewen.yan@unisoc.com> wrote:
>
> There is a deadlock between the cpu_hotplug_lock sem and the
> cgroup_threadgroup_rwsem, and both the cpu online and cpu offline
> scenarios can cause it:
>
> [1] cpu online:
>
> 1. First, thread-A gets the cgroup_threadgroup_rwsem and waits for the
>    cpu_hotplug_lock:
>
> [ffffffc00f9c3850] __switch_to at ffffffc0081229d4
> [ffffffc00f9c38b0] __schedule at ffffffc009c824f8
> [ffffffc00f9c3910] schedule at ffffffc009c82b50
> [ffffffc00f9c3970] percpu_rwsem_wait at ffffffc00828fbcc
> [ffffffc00f9c39b0] __percpu_down_read at ffffffc008290180 <<< try to get cpu_hotplug_lock
> [ffffffc00f9c39e0] cpus_read_lock at ffffffc0081dade8
> [ffffffc00f9c3a20] cpuset_attach at ffffffc008366ed4
> [ffffffc00f9c3a90] cgroup_migrate_execute at ffffffc00834f7d8
> [ffffffc00f9c3b60] cgroup_attach_task at ffffffc0083539b8
> [ffffffc00f9c3bd0] __cgroup1_procs_write at ffffffc00835f6ac <<< get the cgroup_threadgroup_rwsem
> [ffffffc00f9c3c30] cgroup1_tasks_write at ffffffc00835f018
> [ffffffc00f9c3c60] cgroup_file_write at ffffffc008348b04
> [ffffffc00f9c3c90] kernfs_fop_write_iter at ffffffc0087544d8
> [ffffffc00f9c3d50] vfs_write at ffffffc008607a8c
> [ffffffc00f9c3db0] ksys_write at ffffffc0086076d8
> [ffffffc00f9c3df0] __arm64_sys_write at ffffffc008607640
> [ffffffc00f9c3e10] invoke_syscall at ffffffc00813dccc
> [ffffffc00f9c3e30] el0_svc_common at ffffffc00813dbe4
> [ffffffc00f9c3e70] do_el0_svc at ffffffc00813dac8
> [ffffffc00f9c3e80] el0_svc at ffffffc0098836c8
> [ffffffc00f9c3ea0] el0t_64_sync_handler at ffffffc00988363c
> [ffffffc00f9c3fe0] el0t_64_sync at ffffffc008091e44
>
> 2. Thread-B gets the cpu_hotplug_lock and waits for cpuhp/x:
>
> [ffffffc01245b740] __switch_to at ffffffc0081229d4
> [ffffffc01245b7a0] __schedule at ffffffc009c824f8
> [ffffffc01245b800] schedule at ffffffc009c82b50
> [ffffffc01245b880] schedule_timeout at ffffffc009c8a144
> [ffffffc01245b8e0] wait_for_common at ffffffc009c83dac <<< waiting for cpuhp/x to complete
> [ffffffc01245b940] cpuhp_kick_ap at ffffffc0081dc8ac <<< wake up cpuhp/x
> [ffffffc01245b980] bringup_cpu at ffffffc0081db7f0
> [ffffffc01245ba00] cpuhp_invoke_callback at ffffffc0081dc000
> [ffffffc01245ba60] cpuhp_up_callbacks at ffffffc0081dd9b0
> [ffffffc01245bac0] _cpu_up at ffffffc0081dd844 <<< percpu_down_write(&cpu_hotplug_lock)
> [ffffffc01245bb40] cpu_up at ffffffc0081dd100
> [ffffffc01245bb80] cpu_subsys_online at ffffffc008d8ebe4
> [ffffffc01245bb90] device_online at ffffffc008d77dec
> [ffffffc01245bbd0] online_store at ffffffc008d77bb8
> [ffffffc01245bc30] dev_attr_store at ffffffc008d7c8b8
> [ffffffc01245bc60] sysfs_kf_write at ffffffc008757894
> [ffffffc01245bc90] kernfs_fop_write_iter at ffffffc0087544d8
> [ffffffc01245bd50] vfs_write at ffffffc008607a8c
> [ffffffc01245bdb0] ksys_write at ffffffc0086076d8
> [ffffffc01245bdf0] __arm64_sys_write at ffffffc008607640
> [ffffffc01245be10] invoke_syscall at ffffffc00813dccc
> [ffffffc01245be30] el0_svc_common at ffffffc00813dc14
> [ffffffc01245be70] do_el0_svc at ffffffc00813dac8
> [ffffffc01245be80] el0_svc at ffffffc0098836c8
> [ffffffc01245bea0] el0t_64_sync_handler at ffffffc00988363c
> [ffffffc01245bfe0] el0t_64_sync at ffffffc008091e44
>
> 3. cpuhp/x calls cpufreq_online(), and schedutil needs to create its
>    kthread; as a result it has to wait for kthreadd:
>
> [ffffffc00b653860] __switch_to at ffffffc0081229d4
> [ffffffc00b6538c0] __schedule at ffffffc009c824f8
> [ffffffc00b653920] schedule at ffffffc009c82b50
> [ffffffc00b6539a0] schedule_timeout at ffffffc009c8a144
> [ffffffc00b653a00] wait_for_common at ffffffc009c83dac <<< wait for kthreadd
> [ffffffc00b653ad0] __kthread_create_on_node at ffffffc0082296b8
> [ffffffc00b653b90] kthread_create_on_node at ffffffc008229a58
> [ffffffc00b653bb0] sugov_init at ffffffc001d01414
> [ffffffc00b653c00] cpufreq_init_governor at ffffffc009179a48
> [ffffffc00b653c50] cpufreq_set_policy at ffffffc009179360
> [ffffffc00b653cb0] cpufreq_online at ffffffc00917c3e4
> [ffffffc00b653d10] cpuhp_cpufreq_online at ffffffc00917ee3c
> [ffffffc00b653d70] cpuhp_invoke_callback at ffffffc0081dc000
> [ffffffc00b653dd0] cpuhp_thread_fun at ffffffc0081df47c
> [ffffffc00b653e10] smpboot_thread_fn at ffffffc008234718
> [ffffffc00b653e70] kthread at ffffffc00822ac84
>
> 4. kthreadd is trying to get the cgroup_threadgroup_rwsem...
>
> [ffffffc008073a70] __switch_to at ffffffc0081229d4
> [ffffffc008073ad0] __schedule at ffffffc009c824f8
> [ffffffc008073b30] schedule at ffffffc009c82b50
> [ffffffc008073b90] percpu_rwsem_wait at ffffffc00828fbcc <<< cgroup_threadgroup_rwsem
> [ffffffc008073bd0] __percpu_down_read at ffffffc008290180
> [ffffffc008073c00] cgroup_css_set_fork at ffffffc008357ecc
> [ffffffc008073c60] cgroup_can_fork at ffffffc008357bb4
> [ffffffc008073d00] copy_process at ffffffc0081d18c4
> [ffffffc008073d90] kernel_clone at ffffffc0081d088c
> [ffffffc008073e50] kthreadd at ffffffc00822a8e4
>
> Finally, Thread-A and Thread-B are blocked forever.
>
> [2] cpu offline:
>
> 1. First, thread-A gets the cgroup_threadgroup_rwsem and waits for the
>    cpu_hotplug_lock (same backtrace as step 1 of the online case above).
>
> 2. Thread-B gets the cpu_hotplug_lock and waits for cpuhp/x (same
>    backtrace as step 2 of the online case above).
>
> 3. cpuhp/x calls cpufreq_offline(), and schedutil needs to stop its
>    kthread:
>
> [ffffffc00b683a60] __switch_to at ffffffc0081229d4
> [ffffffc00b683ac0] __schedule at ffffffc009c824f8
> [ffffffc00b683b20] schedule at ffffffc009c82b50
> [ffffffc00b683ba0] schedule_timeout at ffffffc009c8a144
> [ffffffc00b683c00] wait_for_common at ffffffc009c83dac <<< waiting for sugov to complete
> [ffffffc00b683c60] kthread_stop at ffffffc008228128 <<< wakeup sugov
> [ffffffc00b683c90] sugov_exit at ffffffc001d016a8
> [ffffffc00b683cc0] cpufreq_offline at ffffffc00917b4b8
> [ffffffc00b683d10] cpuhp_cpufreq_offline at ffffffc00917ee70
> [ffffffc00b683d70] cpuhp_invoke_callback at ffffffc0081dc000
> [ffffffc00b683dd0] cpuhp_thread_fun at ffffffc0081df47c
> [ffffffc00b683e10] smpboot_thread_fn at ffffffc008234718
> [ffffffc00b683e70] kthread at ffffffc00822ac84
>
> 4. The sugov thread is waiting for the cgroup_threadgroup_rwsem:
>
> [ffffffc01258bc10] __switch_to at ffffffc0081229d4
> [ffffffc01258bc70] __schedule at ffffffc009c824f8
> [ffffffc01258bcd0] schedule at ffffffc009c82b50
> [ffffffc01258bd30] percpu_rwsem_wait at ffffffc00828fbcc <<< wait for cgroup_threadgroup_rwsem
> [ffffffc01258bd70] __percpu_down_read at ffffffc008290180
> [ffffffc01258bdb0] exit_signals at ffffffc0082074c8
> [ffffffc01258be10] do_exit at ffffffc0081e5130
> [ffffffc01258be70] kthread at ffffffc00822ac8c
>
> Finally, Thread-A and Thread-B are blocked forever.
>
> Combining the above two situations: Thread-A and Thread-B are blocked
> because cpuhp/x is blocked. If we can ensure that cpuhp/x never blocks
> here, Thread-A and Thread-B can run normally. So let schedutil neither
> create nor destroy its kthread on hotplug, so that cpuhp/x is not
> blocked.
>
> Init the sugov kthread just once to prevent the above deadlock.
>
> Signed-off-by: Ke Wang <ke.wang@unisoc.com>
> Signed-off-by: Xuewen Yan <xuewen.yan@unisoc.com>
> ---
>  kernel/sched/cpufreq_schedutil.c | 24 +++++++++++++++++++++++-
>  1 file changed, 23 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
> index 285ad51caf0f..42d04a1eac20 100644
> --- a/kernel/sched/cpufreq_schedutil.c
> +++ b/kernel/sched/cpufreq_schedutil.c
> @@ -8,6 +8,8 @@
>
>  #define IOWAIT_BOOST_MIN	(SCHED_CAPACITY_SCALE / 8)
>
> +static LIST_HEAD(sugov_list);
> +
>  struct sugov_tunables {
>  	struct gov_attr_set	attr_set;
>  	unsigned int		rate_limit_us;
> @@ -15,6 +17,7 @@ struct sugov_tunables {
>
>  struct sugov_policy {
>  	struct cpufreq_policy	*policy;
> +	struct list_head	sg_policy_list;
>
>  	struct sugov_tunables	*tunables;
>  	struct list_head	tunables_hook;
> @@ -659,6 +662,7 @@ static int sugov_init(struct cpufreq_policy *policy)
>  	struct sugov_policy *sg_policy;
>  	struct sugov_tunables *tunables;
>  	int ret = 0;
> +	int sugov_init = 0;
>
>  	/* State should be equivalent to EXIT */
>  	if (policy->governor_data)
> @@ -666,6 +670,13 @@ static int sugov_init(struct cpufreq_policy *policy)
>
>  	cpufreq_enable_fast_switch(policy);
>
> +	list_for_each_entry(sg_policy, &sugov_list, sg_policy_list) {
> +		if (sg_policy->policy == policy) {
> +			sugov_init = 1;
> +			goto tunables_lock;
> +		}
> +	}
> +
>  	sg_policy = sugov_policy_alloc(policy);
>  	if (!sg_policy) {
>  		ret = -ENOMEM;
> @@ -676,6 +687,10 @@ static int sugov_init(struct cpufreq_policy *policy)
>  	if (ret)
>  		goto free_sg_policy;
>
> +	list_add(&sg_policy->sg_policy_list, &sugov_list);
> +
> +tunables_lock:
> +
>  	mutex_lock(&global_tunables_lock);
>
>  	if (global_tunables) {
> @@ -717,7 +732,8 @@ static int sugov_init(struct cpufreq_policy *policy)
>  	sugov_clear_global_tunables();
>
>  stop_kthread:
> -	sugov_kthread_stop(sg_policy);
> +	if (!sugov_init)
> +		sugov_kthread_stop(sg_policy);
>  	mutex_unlock(&global_tunables_lock);
>
>  free_sg_policy:
> @@ -745,8 +761,14 @@ static void sugov_exit(struct cpufreq_policy *policy)
>
>  	mutex_unlock(&global_tunables_lock);
>
> +	list_for_each_entry(sg_policy, &sugov_list, sg_policy_list) {
> +		if (sg_policy->policy == policy) {
> +			goto out;
> +		}
> +	}
>  	sugov_kthread_stop(sg_policy);
>  	sugov_policy_free(sg_policy);
> +out:
>  	cpufreq_disable_fast_switch(policy);
>  }
>
> --
> 2.25.1
>

Thanks!
BR
Hi Xuewen,

On 7/11/22 08:21, Xuewen Yan wrote:
> Hi all
>
> This deadlock is inevitable, any comment?

Could you tell me how to reproduce this?
Is there a need for a special cgroup setup?

Regards,
Lukasz
On Mon, Jul 11, 2022 at 3:32 PM Lukasz Luba <lukasz.luba@arm.com> wrote:
>
> Hi Xuewen,
>
> On 7/11/22 08:21, Xuewen Yan wrote:
> > Hi all
> >
> > This deadlock is inevitable, any comment?
>
> Could you tell me how to reproduce this?
> Is there a need for a special cgroup setup?

This deadlock occurs when we run the monkey test on an Android phone
and, at the same time, randomly put a cpu core online or offline.
Indeed, thread-A, which gets the cgroup_threadgroup_rwsem and waits for
the cpu_hotplug_lock, is the thread named "OomAdjuster" in Android.

And I see the cpu_hotplug_lock was added by this patch:
https://lore.kernel.org/all/20220121101210.84926-1-zhangqiao22@huawei.com/

Thanks!

> Regards,
> Lukasz
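[The test setup Xuewen describes boils down to two concurrent loops: one toggling a CPU online/offline, one bouncing a task between two cpuset cgroups. A hypothetical user-space reduction of that scenario might look like the sketch below. It is not the script from this thread; the cgroup-v1 mount point, the group names "a"/"b", and the choice of cpu7 are all assumptions, and both groups need cpuset.cpus/cpuset.mems populated beforehand.]

/* repro.c -- hypothetical stress loop; needs root; build: gcc -o repro repro.c */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return;
	if (write(fd, val, strlen(val)) < 0)
		perror(path);
	close(fd);
}

int main(void)
{
	char pid[16];

	snprintf(pid, sizeof(pid), "%d", getpid());

	if (fork() == 0) {
		/* child: keep taking cpu7 offline and back online */
		for (;;) {
			write_str("/sys/devices/system/cpu/cpu7/online", "0");
			write_str("/sys/devices/system/cpu/cpu7/online", "1");
		}
	}

	/* parent: keep migrating itself between two cpuset cgroups; each
	 * write takes cgroup_threadgroup_rwsem and then, in cpuset_attach(),
	 * cpu_hotplug_lock */
	for (;;) {
		write_str("/sys/fs/cgroup/cpuset/a/tasks", pid);
		write_str("/sys/fs/cgroup/cpuset/b/tasks", pid);
	}
	return 0;
}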
On 11-07-22, 15:43, Xuewen Yan wrote:
> This deadlock occurs when we run the monkey test on an Android phone
> and, at the same time, randomly put a cpu core online or offline.
> Indeed, thread-A, which gets the cgroup_threadgroup_rwsem and waits for
> the cpu_hotplug_lock, is the thread named "OomAdjuster" in Android.

It would be better to reproduce this on a mainline kernel, no Android,
and tell us the steps. We don't want to chase an Android-only issue, just
in case.

Thanks.
On 7/11/22 08:43, Xuewen Yan wrote:
> On Mon, Jul 11, 2022 at 3:32 PM Lukasz Luba <lukasz.luba@arm.com> wrote:
>>
>> Hi Xuewen,
>>
>> On 7/11/22 08:21, Xuewen Yan wrote:
>>> Hi all
>>>
>>> This deadlock is inevitable, any comment?
>>
>> Could you tell me how to reproduce this?
>> Is there a need for a special cgroup setup?
>
> This deadlock occurs when we run the monkey test on an Android phone
> and, at the same time, randomly put a cpu core online or offline.
> Indeed, thread-A, which gets the cgroup_threadgroup_rwsem and waits for
> the cpu_hotplug_lock, is the thread named "OomAdjuster" in Android.

Thanks, let me have a look. We have some hotplug stress tests in our EAS
mainline integration tests which haven't triggered this issue. I'll try
to trigger this on a mainline kernel.

> And I see the cpu_hotplug_lock was added by this patch:
> https://lore.kernel.org/all/20220121101210.84926-1-zhangqiao22@huawei.com/

Thanks for the pointer.
Hi Xuewen

+CC Tejun who might be interested.

On 07/05/22 20:37, Xuewen Yan wrote:
> There is a deadlock between the cpu_hotplug_lock sem and the
> cgroup_threadgroup_rwsem, and both the cpu online and cpu offline
> scenarios can cause it:
>
> [1] cpu online:
>
> 1. First, thread-A gets the cgroup_threadgroup_rwsem and waits for the
>    cpu_hotplug_lock:

[ stack traces snipped ]

> Combining the above two situations: Thread-A and Thread-B are blocked
> because cpuhp/x is blocked. If we can ensure that cpuhp/x never blocks
> here, Thread-A and Thread-B can run normally. So let schedutil neither
> create nor destroy its kthread on hotplug, so that cpuhp/x is not
> blocked.
>
> Init the sugov kthread just once to prevent the above deadlock.
>
> Signed-off-by: Ke Wang <ke.wang@unisoc.com>
> Signed-off-by: Xuewen Yan <xuewen.yan@unisoc.com>
> ---

Have you tried running with PROVE_LOCKDEP enabled? It'll help print a
useful output about the DEADLOCK. But your explanation was good and clear
to me. AFAIU:

  CPU0                                CPU1                     CPU2

  // attach task to a different
  // cpuset cgroup via sysfs
  __acquire(cgroup_threadgroup_rwsem)

                                      // bring CPU2 online
                                      __acquire(cpu_hotplug_lock)
                                      // wait for CPU2 to come online

                                                               // bringup cpu online
                                                               // call cpufreq_online() which
                                                               // tries to create sugov kthread
                                                               __acquire(cpu_hotplug_lock)
                                                               copy_process()
                                                                 cgroup_can_fork()
                                                                   cgroup_css_set_fork()
                                                                     __acquire(cgroup_threadgroup_rwsem)

  // blocks forever                   // blocks forever        // blocks forever

Is this a correct summary of the problem?

The locks are held in reverse order and we end up with a DEADLOCK.

I believe the same happens on offline; it's just the path to hold the
cgroup_threadgroup_rwsem on CPU2 that is different.

This will be a tricky one. Your proposed patch might fix it for this
case, but if there's anything else that creates a kthread when a cpu goes
online/offline then we'll hit the same problem again.

I haven't reviewed your patch to be honest, but I think it's worth seeing
first if there's something that can be done at the 'right level'.

Needs head scratching from my side at least. This is not the first type
of locking issue between hotplug and cpuset :-/

Thanks

--
Qais Yousef
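[The shape Qais draws is a classic ABBA inversion, just spread across tasks. To see the failure mode in isolation, here is a self-contained user-space analogue, with pthread rwlocks standing in for the two kernel percpu rwsems; purely illustrative, nothing below is kernel code.]

/* abba.c -- build: gcc -pthread -o abba abba.c; hangs almost every run */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_rwlock_t hotplug_lock = PTHREAD_RWLOCK_INITIALIZER;      /* cpu_hotplug_lock */
static pthread_rwlock_t threadgroup_rwsem = PTHREAD_RWLOCK_INITIALIZER; /* cgroup_threadgroup_rwsem */

/* Thread-A: cgroup attach path */
static void *attacher(void *unused)
{
	pthread_rwlock_wrlock(&threadgroup_rwsem); /* __cgroup1_procs_write() */
	usleep(10000);                             /* widen the race window */
	pthread_rwlock_rdlock(&hotplug_lock);      /* cpuset_attach() -> cpus_read_lock(): blocks */
	pthread_rwlock_unlock(&hotplug_lock);
	pthread_rwlock_unlock(&threadgroup_rwsem);
	return NULL;
}

/* Thread-B + cpuhp/x + kthreadd collapsed into one thread */
static void *hotplugger(void *unused)
{
	pthread_rwlock_wrlock(&hotplug_lock);      /* _cpu_up() */
	usleep(10000);
	pthread_rwlock_rdlock(&threadgroup_rwsem); /* cgroup_can_fork() in kthreadd: blocks */
	pthread_rwlock_unlock(&threadgroup_rwsem);
	pthread_rwlock_unlock(&hotplug_lock);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, attacher, NULL);
	pthread_create(&b, NULL, hotplugger, NULL);
	pthread_join(a, NULL); /* never returns once both threads hold their first lock */
	pthread_join(b, NULL);
	puts("no deadlock this run");
	return 0;
}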
(cc'ing Waiman)

On Mon, Jul 11, 2022 at 06:46:29PM +0100, Qais Yousef wrote:
> Have you tried running with PROVE_LOCKDEP enabled? It'll help print a
> useful output about the DEADLOCK. But your explanation was good and
> clear to me.

I don't think lockdep would be able to track the CPU1 -> CPU2 dependency
here, unfortunately.

> AFAIU:
>
>   CPU0                                CPU1                     CPU2
>
>   // attach task to a different
>   // cpuset cgroup via sysfs
>   __acquire(cgroup_threadgroup_rwsem)
>
>                                       // bring CPU2 online
>                                       __acquire(cpu_hotplug_lock)
>                                       // wait for CPU2 to come online
>
>                                                                // bringup cpu online
>                                                                // call cpufreq_online() which
>                                                                // tries to create sugov kthread
>                                                                __acquire(cpu_hotplug_lock)
>                                                                copy_process()
>                                                                  cgroup_can_fork()
>                                                                    cgroup_css_set_fork()
>                                                                      __acquire(cgroup_threadgroup_rwsem)
>
>   // blocks forever                   // blocks forever        // blocks forever
>
> Is this a correct summary of the problem?
>
> The locks are held in reverse order and we end up with a DEADLOCK.
>
> I believe the same happens on offline; it's just the path to hold the
> cgroup_threadgroup_rwsem on CPU2 that is different.
>
> This will be a tricky one. Your proposed patch might fix it for this
> case, but if there's anything else that creates a kthread when a cpu
> goes online/offline then we'll hit the same problem again.
>
> I haven't reviewed your patch to be honest, but I think it's worth
> seeing first if there's something that can be done at the 'right level'.
>
> Needs head scratching from my side at least. This is not the first type
> of locking issue between hotplug and cpuset :-/

Well, the only thing I can think of is always grabbing cpus_read_lock()
before grabbing threadgroup_rwsem. Waiman, what do you think?

Thanks.
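[Sketched out, the nesting Tejun proposes pins one global order for every path that stabilizes a threadgroup. Helper names here are placeholders; his actual patch appears later in the thread.]

/* Assumed order: cpu_hotplug_lock (read) is always taken OUTSIDE
 * cgroup_threadgroup_rwsem (write). With hotplug disabled first, no CPU
 * can be brought up or down while the rwsem is write-held, so the fork
 * done on behalf of the hotplug path can never be the thing the rwsem
 * holder ends up waiting on. */
static void attach_lock(void)
{
	cpus_read_lock();
	percpu_down_write(&cgroup_threadgroup_rwsem);
}

static void attach_unlock(void)
{
	percpu_up_write(&cgroup_threadgroup_rwsem);
	cpus_read_unlock();
}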
On Mon, 11 Jul 2022 10:58:28 -1000
Tejun Heo <tj@kernel.org> wrote:

> I don't think lockdep would be able to track the CPU1 -> CPU2 dependency
> here, unfortunately.
>
> > AFAIU:
> >
> >   CPU0                                CPU1
> >
> >   // attach task to a different
> >   // cpuset cgroup via sysfs
> >   __acquire(cgroup_threadgroup_rwsem)
> >
> >                                       // bring CPU2 online
> >                                       __acquire(cpu_hotplug_lock)
> >                                       // wait for CPU2 to come online

Should there be some annotation here that tells lockdep that CPU1 is now
blocked on CPU2?

Then this case would be caught by lockdep.

-- Steve

> >                                       CPU2
> >
> >                                       // bringup cpu online
> >                                       // call cpufreq_online() which
> >                                       // tries to create sugov kthread
> >                                       __acquire(cpu_hotplug_lock)
> >                                       copy_process()
> >                                         cgroup_can_fork()
> >                                           cgroup_css_set_fork()
> >                                             __acquire(cgroup_threadgroup_rwsem)
> >
> >   // blocks forever // blocks forever // blocks forever
[RESEND]

On Tue, Jul 12, 2022 at 5:34 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Mon, 11 Jul 2022 10:58:28 -1000
> Tejun Heo <tj@kernel.org> wrote:
>
> > I don't think lockdep would be able to track the CPU1 -> CPU2
> > dependency here, unfortunately.

[...]

> Should there be some annotation here that tells lockdep that CPU1 is now
> blocked on CPU2?
>
> Then this case would be caught by lockdep.
>
> -- Steve

Indeed, it's caused by threads instead of cpus.

Our SoC contains two cpufreq policies: cpus 0-5 belong to policy0 and
cpus 6-7 belong to policy1. When cpu6/7 goes online:

  Thread-A                                    Thread-B
  cgroup_file_write                           device_online
    cgroup1_tasks_write                       ...
      __cgroup1_procs_write                   _cpu_up
        write(&cgroup_threadgroup_rwsem); <<    cpus_write_lock(); <<
        cgroup_attach_task                    ......
          cgroup_migrate_execute              cpuhp_kick_ap
            cpuset_attach                     // wakeup cpuhp
              cpus_read_lock()                // waiting for cpuhp

  cpuhp/6                                     kthreadd
  cpuhp_thread_fun
    cpuhp_invoke_callback
      cpuhp_cpufreq_online
        cpufreq_online
          sugov_init
            __kthread_create_on_node          copy_process
            // blocked, waiting for kthreadd    cgroup_can_fork
                                                  cgroup_css_set_fork
                                                    __acquires(&cgroup_threadgroup_rwsem)
                                                    // blocked

So its logic is:

  Thread-A ---> Thread-B ---> cpuhp ---> kthreadd ---> Thread-A

When a cpu goes offline, the sugov thread is stopped; while exiting it
waits on the cgroup_threadgroup_rwsem, so kthread_stop() never completes.
Its logic is:

  Thread-A ---> Thread-B ---> cpuhp ---> sugov ---> Thread-A

As Qais said:
> if there's anything else that creates a kthread when a cpu goes
> online/offline then we'll hit the same problem again.

Indeed, only the cpuhp thread creating/destroying a kthread causes this
case.

I have put the test script in the mail, and I have tested without the
monkey test; the deadlock still occurs.

Thanks!
xuewen.yan
On 07/11/22 10:58, Tejun Heo wrote:
> (cc'ing Waiman)
>
> On Mon, Jul 11, 2022 at 06:46:29PM +0100, Qais Yousef wrote:
> > Have you tried running with PROVE_LOCKDEP enabled? It'll help print a
> > useful output about the DEADLOCK. But your explanation was good and
> > clear to me.
>
> I don't think lockdep would be able to track the CPU1 -> CPU2 dependency
> here, unfortunately.

I see. It seems I got it wrong: CPU1 and CPU2 run from the same context,
if I'm understanding Xuewen's response correctly this time, that is.

[...]

> Well, the only thing I can think of is always grabbing cpus_read_lock()
> before grabbing threadgroup_rwsem. Waiman, what do you think?

Are there many subsystems besides cpuset that need the cpus_read_lock()?
A quick grep tells me it's the only one.

Can't we instead use cpus_read_trylock() in cpuset_can_attach() so that
we either hold the lock successfully before we go ahead and call
cpuset_attach(), or bail out and cancel the whole attach operation, which
should release the threadgroup_rwsem?

Sounds simple in theory, but as always the devil is in the details, which
I don't have all inside my head. AFAICS cgroup_migrate_execute() will
call ss->can_attach() for each subsystem, and it's the only caller of
can_attach(). Not sure if it'd be safe to do cpus_read_unlock() at the
end of cpuset_attach(), or if we need to introduce an additional
'cpuset_attach_finish()' callback where we'd release the lock,
i.e., something like the below untested-and-for-demo-purposes-only patch:

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 71a418858a5e..86c7c6aa0f12 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2226,6 +2226,12 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
 	struct task_struct *task;
 	int ret;
 
+	ret = cpus_read_trylock();
+	if (!ret) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+
 	/* used later by cpuset_attach() */
 	cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
 	cs = css_cs(css);
@@ -2289,7 +2295,8 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 	cgroup_taskset_first(tset, &css);
 	cs = css_cs(css);
 
-	cpus_read_lock();
+	// Requires cpus_read_lock() to be held already
+	// releases cpus_read_lock() when done
 	percpu_down_write(&cpuset_rwsem);
 
 	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);

Cheers

--
Qais Yousef
On Tue, Jul 12, 2022 at 01:57:02PM +0100, Qais Yousef wrote:
> Are there many subsystems besides cpuset that need the cpus_read_lock()?
> A quick grep tells me it's the only one.
>
> Can't we instead use cpus_read_trylock() in cpuset_can_attach() so that
> we either hold the lock successfully before we go ahead and call
> cpuset_attach(), or bail out and cancel the whole attach operation,
> which should release the threadgroup_rwsem?

But now we're failing user-initiated operations randomly. I have a hard
time seeing that as an acceptable solution. The only thing we can do,
I think, is establishing a locking order between the two locks by either
nesting threadgroup_rwsem under cpus_read_lock or disallowing thread
creation during hotplug operations.

Thanks.
On 07/12/22 06:13, Tejun Heo wrote:
> On Tue, Jul 12, 2022 at 01:57:02PM +0100, Qais Yousef wrote:
> > Are there many subsystems besides cpuset that need the
> > cpus_read_lock()? A quick grep tells me it's the only one.
> >
> > Can't we instead use cpus_read_trylock() in cpuset_can_attach() so
> > that we either hold the lock successfully before we go ahead and call
> > cpuset_attach(), or bail out and cancel the whole attach operation,
> > which should release the threadgroup_rwsem?
>
> But now we're failing user-initiated operations randomly. I have a hard
> time

True. That might appear more random than necessary. It looked neat, and
I thought that since hotplug operations aren't that common and users must
be prepared for failures for other reasons, it might be okay.

> seeing that as an acceptable solution. The only thing we can do, I
> think, is establishing a locking order between the two locks by either
> nesting

That might be enough if no other paths can exist which would hold them in
reverse order again. It would be more robust to either hold them both or
wait until we can. Then potential ordering problems can't happen again,
because of this path at least.

> threadgroup_rwsem under cpus_read_lock or disallowing thread creation
> during hotplug operations.

I think that's what Xuewen tried to do in the proposed patch. But it
fixes it for a specific user. If we go with that, we'll need nuts and
bolts to help warn when other users do the same.

Thanks

--
Qais Yousef
On 7/11/22 16:58, Tejun Heo wrote:
> (cc'ing Waiman)
>
> On Mon, Jul 11, 2022 at 06:46:29PM +0100, Qais Yousef wrote:
>> Have you tried running with PROVE_LOCKDEP enabled? It'll help print a
>> useful output about the DEADLOCK. But your explanation was good and
>> clear to me.
> I don't think lockdep would be able to track the CPU1 -> CPU2 dependency
> here, unfortunately.

That is the case AFAIK. Lockdep only tracks individually the locks taken
by each task.

[...]

> Well, the only thing I can think of is always grabbing cpus_read_lock()
> before grabbing threadgroup_rwsem. Waiman, what do you think?

That is a possible solution, as cpus_read_lock() is rather lightweight.
It is a good practice to acquire it first.

Cheers,
Longman
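[On the "rather lightweight" point: the reader side of a percpu rwsem is normally just a per-cpu counter increment. Condensed from include/linux/percpu-rwsem.h, simplified, with the lockdep annotations and comments dropped:]

static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
{
	might_sleep();

	preempt_disable();
	if (likely(rcu_sync_is_idle(&sem->rss)))
		this_cpu_inc(*sem->read_count);	/* fast path: no writer around */
	else
		__percpu_down_read(sem, false);	/* slow path: writer active or pending */
	preempt_enable();
}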
On Tue, Jul 12, 2022 at 10:49:57PM -0400, Waiman Long wrote:
> > Well, the only thing I can think of is always grabbing cpus_read_lock()
> > before grabbing threadgroup_rwsem. Waiman, what do you think?
>
> That is a possible solution, as cpus_read_lock() is rather lightweight.
> It is a good practice to acquire it first.

On a similar note, I think we probably should re-enable percpu operations
on threadgroup_rwsem too by default and allow users who are affected by
slower write-path operations to opt in. After the new CLONE_INTO_CGROUP,
which doesn't need the rwsem, we have far fewer write lockers after all.

Thanks.
On 7/11/22 17:11, Steven Rostedt wrote:
> On Mon, 11 Jul 2022 10:58:28 -1000
> Tejun Heo <tj@kernel.org> wrote:
>
>> I don't think lockdep would be able to track the CPU1 -> CPU2
>> dependency here, unfortunately.

[...]

> Should there be some annotation here that tells lockdep that CPU1 is now
> blocked on CPU2?
>
> Then this case would be caught by lockdep.

Actually, the dependency can probably be coded by calling
lockdep_acquire_cpus_lock() in cpu2 (task 2) before acquiring the
cgroup_threadgroup_rwsem, to indicate the dependency between CPU1 and
CPU2 (task 1 and task 2). lockdep_release_cpus_lock() can then be called
after releasing the rwsem.

Cheers,
Longman
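[A rough sketch of where Waiman's annotation might sit; the placement and surrounding context are assumptions, not a tested patch. lockdep_acquire_cpus_lock()/lockdep_release_cpus_lock() are the existing helpers the workqueue code already uses for the same kind of borrowed-context dependency.]

	/* In the task forking on behalf of the hotplug path (kthreadd in the
	 * traces above): record a virtual cpu_hotplug_lock acquisition first,
	 * so lockdep learns the cross-task chain
	 * cpu_hotplug_lock -> cgroup_threadgroup_rwsem and can flag the
	 * reverse order taken by the cpuset attach path. */
	lockdep_acquire_cpus_lock();
	percpu_down_read(&cgroup_threadgroup_rwsem);	/* cgroup_css_set_fork() */
	/* ... fork work ... */
	percpu_up_read(&cgroup_threadgroup_rwsem);
	lockdep_release_cpus_lock();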
Dear all

On Wed, Jul 13, 2022 at 11:20 AM Tejun Heo <tj@kernel.org> wrote:
>
> On Tue, Jul 12, 2022 at 10:49:57PM -0400, Waiman Long wrote:
> > > Well, the only thing I can think of is always grabbing
> > > cpus_read_lock() before grabbing threadgroup_rwsem. Waiman, what do
> > > you think?
> >
> > That is a possible solution, as cpus_read_lock() is rather
> > lightweight. It is a good practice to acquire it first.
>
> On a similar note, I think we probably should re-enable percpu
> operations on threadgroup_rwsem too by default and allow users who are
> affected by slower write-path operations to opt in. After the new
> CLONE_INTO_CGROUP, which doesn't need the rwsem, we have far fewer write
> lockers after all.

Is there any patch for me to try? I would be very grateful.

Thanks!

---
xw.yan
Hello,

On Wed, Jul 20, 2022 at 03:45:27PM +0800, Xuewen Yan wrote:
> Dear all
>
> On Wed, Jul 13, 2022 at 11:20 AM Tejun Heo <tj@kernel.org> wrote:
> >
> > On a similar note, I think we probably should re-enable percpu
> > operations on threadgroup_rwsem too by default and allow users who are
> > affected by slower write-path operations to opt in. After the new
> > CLONE_INTO_CGROUP, which doesn't need the rwsem, we have far fewer
> > write lockers after all.
>
> Is there any patch for me to try? I would be very grateful.

Can you please see whether the following patch fixes the problem?

Thanks.

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 13c8e91d78620..7caefc8af9127 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2345,6 +2345,38 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
 }
 EXPORT_SYMBOL_GPL(task_cgroup_path);
 
+/**
+ * lock_threadgroup - Stabilize threadgroups
+ *
+ * cgroup migration operations need the threadgroups stabilized against forks
+ * and exits, which can be achieved by write-locking cgroup_threadgroup_rwsem.
+ *
+ * Note that write-locking threadgroup_rwsem is nested inside CPU hotplug
+ * disable. This is because cpuset needs CPU hotplug disabled during ->attach()
+ * and bringing up a CPU may involve creating new tasks which can require
+ * read-locking threadgroup_rwsem. If we call cpuset's ->attach() with
+ * threadgroup_rwsem write-locked with hotplug enabled, we can deadlock by
+ * cpuset waiting for an on-going CPU hotplug operation which in turn is waiting
+ * for the threadgroup_rwsem to be released to create new tasks. See the
+ * following for more details:
+ *
+ * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
+ */
+static void lock_threadgroup(void)
+{
+	cpus_read_lock();
+	percpu_down_write(&cgroup_threadgroup_rwsem);
+}
+
+/**
+ * unlock_threadgroup - Undo lock_threadgroup
+ */
+static void unlock_threadgroup(void)
+{
+	percpu_up_write(&cgroup_threadgroup_rwsem);
+	cpus_read_unlock();
+}
+
 /**
  * cgroup_migrate_add_task - add a migration target task to a migration context
  * @task: target task
@@ -2822,7 +2854,6 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
 
 struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
 					     bool *locked)
-	__acquires(&cgroup_threadgroup_rwsem)
 {
 	struct task_struct *tsk;
 	pid_t pid;
@@ -2840,7 +2871,7 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
 	 */
 	lockdep_assert_held(&cgroup_mutex);
 	if (pid || threadgroup) {
-		percpu_down_write(&cgroup_threadgroup_rwsem);
+		lock_threadgroup();
 		*locked = true;
 	} else {
 		*locked = false;
@@ -2876,7 +2907,7 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
 
 out_unlock_threadgroup:
 	if (*locked) {
-		percpu_up_write(&cgroup_threadgroup_rwsem);
+		unlock_threadgroup();
 		*locked = false;
 	}
 out_unlock_rcu:
@@ -2885,7 +2916,6 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
 }
 
 void cgroup_procs_write_finish(struct task_struct *task, bool locked)
-	__releases(&cgroup_threadgroup_rwsem)
 {
 	struct cgroup_subsys *ss;
 	int ssid;
@@ -2894,7 +2924,8 @@ void cgroup_procs_write_finish(struct task_struct *task, bool locked)
 	put_task_struct(task);
 
 	if (locked)
-		percpu_up_write(&cgroup_threadgroup_rwsem);
+		unlock_threadgroup();
+
 	for_each_subsys(ss, ssid)
 		if (ss->post_attach)
 			ss->post_attach();
@@ -2953,7 +2984,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 
 	lockdep_assert_held(&cgroup_mutex);
 
-	percpu_down_write(&cgroup_threadgroup_rwsem);
+	lock_threadgroup();
 
 	/* look up all csses currently attached to @cgrp's subtree */
 	spin_lock_irq(&css_set_lock);
@@ -2984,7 +3015,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 		ret = cgroup_migrate_execute(&mgctx);
 out_finish:
 	cgroup_migrate_finish(&mgctx);
-	percpu_up_write(&cgroup_threadgroup_rwsem);
+	unlock_threadgroup();
 
 	return ret;
 }
On Tue, Jul 12, 2022 at 06:13:46AM -1000, Tejun Heo wrote:

> But now we're failing user-initiated operations randomly. I have a hard
> time seeing that as an acceptable solution.

There (sadly) is precedent for that; grep for "cpu_hotplug_disable".
Hello,

On Thu, Jul 28, 2022 at 12:01:35AM +0200, Peter Zijlstra wrote:
> On Tue, Jul 12, 2022 at 06:13:46AM -1000, Tejun Heo wrote:
>
> > But now we're failing user-initiated operations randomly. I have a
> > hard time seeing that as an acceptable solution.
>
> There (sadly) is precedent for that; grep for "cpu_hotplug_disable".

Yeah, there's another one in the cpu cgroup controller attach path. We
should get better at not doing these. Low-frequency arbitrary failures
are really difficult to handle reliably downstream.

Thanks.
Hi Tejun

On Sat, Jul 30, 2022 at 3:59 AM Tejun Heo <tj@kernel.org> wrote:
>
> On Fri, Jul 29, 2022 at 09:39:49AM +0100, Qais Yousef wrote:
> > I *think* it's because we haven't removed cpus_read_lock() from
> > cpuset_attach(). So we end up holding the lock twice in the same path.
> > Since we hold it unconditionally now, we should remove cpuset's
> > dependency on cpus_read_lock(), I believe.
>
> Ah, yeah, that's because a pending write locker makes future read
> lockers wait, so even if we're holding the read lock, if we try to
> read-lock again, we end up waiting. I'll make the cpus_read_lock()
> unconditional in cgroup core and drop it from cpuset's attach operation.

I reverted the following patch, which added the cpus_read_lock() in
cpuset's attach path, and have tested for a while; the deadlock has not
reproduced:
https://lore.kernel.org/all/20220121101210.84926-1-zhangqiao22@huawei.com/

But I do not know the risks of reverting that patch.

Thanks!
BR
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 285ad51caf0f..42d04a1eac20 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -8,6 +8,8 @@
 
 #define IOWAIT_BOOST_MIN	(SCHED_CAPACITY_SCALE / 8)
 
+static LIST_HEAD(sugov_list);
+
 struct sugov_tunables {
 	struct gov_attr_set	attr_set;
 	unsigned int		rate_limit_us;
@@ -15,6 +17,7 @@ struct sugov_tunables {
 
 struct sugov_policy {
 	struct cpufreq_policy	*policy;
+	struct list_head	sg_policy_list;
 
 	struct sugov_tunables	*tunables;
 	struct list_head	tunables_hook;
@@ -659,6 +662,7 @@ static int sugov_init(struct cpufreq_policy *policy)
 	struct sugov_policy *sg_policy;
 	struct sugov_tunables *tunables;
 	int ret = 0;
+	int sugov_init = 0;
 
 	/* State should be equivalent to EXIT */
 	if (policy->governor_data)
@@ -666,6 +670,13 @@ static int sugov_init(struct cpufreq_policy *policy)
 
 	cpufreq_enable_fast_switch(policy);
 
+	list_for_each_entry(sg_policy, &sugov_list, sg_policy_list) {
+		if (sg_policy->policy == policy) {
+			sugov_init = 1;
+			goto tunables_lock;
+		}
+	}
+
 	sg_policy = sugov_policy_alloc(policy);
 	if (!sg_policy) {
 		ret = -ENOMEM;
@@ -676,6 +687,10 @@ static int sugov_init(struct cpufreq_policy *policy)
 	if (ret)
 		goto free_sg_policy;
 
+	list_add(&sg_policy->sg_policy_list, &sugov_list);
+
+tunables_lock:
+
 	mutex_lock(&global_tunables_lock);
 
 	if (global_tunables) {
@@ -717,7 +732,8 @@ static int sugov_init(struct cpufreq_policy *policy)
 	sugov_clear_global_tunables();
 
 stop_kthread:
-	sugov_kthread_stop(sg_policy);
+	if (!sugov_init)
+		sugov_kthread_stop(sg_policy);
 	mutex_unlock(&global_tunables_lock);
 
 free_sg_policy:
@@ -745,8 +761,14 @@ static void sugov_exit(struct cpufreq_policy *policy)
 
 	mutex_unlock(&global_tunables_lock);
 
+	list_for_each_entry(sg_policy, &sugov_list, sg_policy_list) {
+		if (sg_policy->policy == policy) {
+			goto out;
+		}
+	}
 	sugov_kthread_stop(sg_policy);
 	sugov_policy_free(sg_policy);
+out:
 	cpufreq_disable_fast_switch(policy);
 }