diff mbox series

[V7,5/7] cpufreq: Register notifiers with the PM QoS framework

Message ID 2c7a751a58adb4ce6f345dab9714b924504009b6.1562583394.git.viresh.kumar@linaro.org
State Accepted
Commit 67d874c3b2c69d65274fa5ce44ba939788d5729d
Headers show
Series None | expand

Commit Message

Viresh Kumar July 8, 2019, 10:57 a.m. UTC
This registers the notifiers for min/max frequency constraints with the
PM QoS framework. The constraints are also taken into consideration in
cpufreq_set_policy().

This also relocates cpufreq_policy_put_kobj() as it is required to be
called from cpufreq_policy_alloc() now.

refresh_frequency_limits() is updated to avoid calling
cpufreq_set_policy() for inactive policies and handle_update() is
updated to have proper locking in place.

No constraints are added until now though.

Reviewed-by: Matthias Kaehlcke <mka@chromium.org>

Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

---
V6->V7:
- All callers of refresh_frequency_limits(), except handle_update(),
  take the policy->rwsem and result in deadlock as
  refresh_frequency_limits() also takes the same lock again. Fix that
  by taking the rwsem from handle_update() instead.

@Rafael: Sending it before Pavel has verified it as I would be offline
later, in case you want to apply this today itself.

 drivers/cpufreq/cpufreq.c | 135 +++++++++++++++++++++++++++++---------
 include/linux/cpufreq.h   |   3 +
 2 files changed, 108 insertions(+), 30 deletions(-)

-- 
2.21.0.rc0.269.g1a574e7a288b

Comments

Viresh Kumar Sept. 23, 2019, 1:56 p.m. UTC | #1
On 22-09-19, 23:12, Dmitry Osipenko wrote:
> This patch causes use-after-free on a cpufreq driver module reload. Please take a look, thanks in advance.

> 

> 

> [   87.952369] ==================================================================

> [   87.953259] BUG: KASAN: use-after-free in notifier_chain_register+0x4f/0x9c

> [   87.954031] Read of size 4 at addr e6abbd0c by task modprobe/243

> 

> [   87.954901] CPU: 1 PID: 243 Comm: modprobe Tainted: G        W

> 5.3.0-next-20190920-00185-gf61698eab956-dirty #2408

> [   87.956077] Hardware name: NVIDIA Tegra SoC (Flattened Device Tree)

> [   87.956807] [<c0110aad>] (unwind_backtrace) from [<c010bb71>] (show_stack+0x11/0x14)

> [   87.957709] [<c010bb71>] (show_stack) from [<c0d37b25>] (dump_stack+0x89/0x98)

> [   87.958616] [<c0d37b25>] (dump_stack) from [<c02937e1>]

> (print_address_description.constprop.0+0x3d/0x340)

> [   87.959785] [<c02937e1>] (print_address_description.constprop.0) from [<c0293c6b>]

> (__kasan_report+0xe3/0x12c)

> [   87.960907] [<c0293c6b>] (__kasan_report) from [<c014988f>] (notifier_chain_register+0x4f/0x9c)

> [   87.962001] [<c014988f>] (notifier_chain_register) from [<c01499b5>]

> (blocking_notifier_chain_register+0x29/0x3c)

> [   87.963180] [<c01499b5>] (blocking_notifier_chain_register) from [<c06f7ee9>]

> (dev_pm_qos_add_notifier+0x79/0xf8)

> [   87.964339] [<c06f7ee9>] (dev_pm_qos_add_notifier) from [<c092927d>] (cpufreq_online+0x5e1/0x8a4)


Hi Dmitry,

Unfortunately I am traveling right now and can't test this stuff, though I may
have found the root cause here. Can you please test the below diff for me ?

diff --git a/drivers/base/power/qos.c b/drivers/base/power/qos.c
index 6c90fd7e2ff8..9ac244ee05fe 100644
--- a/drivers/base/power/qos.c
+++ b/drivers/base/power/qos.c
@@ -328,6 +328,8 @@ void dev_pm_qos_constraints_destroy(struct device *dev)
        spin_unlock_irq(&dev->power.lock);
 
        kfree(qos->resume_latency.notifiers);
+       kfree(qos->min_frequency.notifiers);
+       kfree(qos->max_frequency.notifiers);
        kfree(qos);
 
  out:

-- 
viresh
Dmitry Osipenko Sept. 23, 2019, 5:27 p.m. UTC | #2
23.09.2019 16:56, Viresh Kumar пишет:
> On 22-09-19, 23:12, Dmitry Osipenko wrote:

>> This patch causes use-after-free on a cpufreq driver module reload. Please take a look, thanks in advance.

>>

>>

>> [   87.952369] ==================================================================

>> [   87.953259] BUG: KASAN: use-after-free in notifier_chain_register+0x4f/0x9c

>> [   87.954031] Read of size 4 at addr e6abbd0c by task modprobe/243

>>

>> [   87.954901] CPU: 1 PID: 243 Comm: modprobe Tainted: G        W

>> 5.3.0-next-20190920-00185-gf61698eab956-dirty #2408

>> [   87.956077] Hardware name: NVIDIA Tegra SoC (Flattened Device Tree)

>> [   87.956807] [<c0110aad>] (unwind_backtrace) from [<c010bb71>] (show_stack+0x11/0x14)

>> [   87.957709] [<c010bb71>] (show_stack) from [<c0d37b25>] (dump_stack+0x89/0x98)

>> [   87.958616] [<c0d37b25>] (dump_stack) from [<c02937e1>]

>> (print_address_description.constprop.0+0x3d/0x340)

>> [   87.959785] [<c02937e1>] (print_address_description.constprop.0) from [<c0293c6b>]

>> (__kasan_report+0xe3/0x12c)

>> [   87.960907] [<c0293c6b>] (__kasan_report) from [<c014988f>] (notifier_chain_register+0x4f/0x9c)

>> [   87.962001] [<c014988f>] (notifier_chain_register) from [<c01499b5>]

>> (blocking_notifier_chain_register+0x29/0x3c)

>> [   87.963180] [<c01499b5>] (blocking_notifier_chain_register) from [<c06f7ee9>]

>> (dev_pm_qos_add_notifier+0x79/0xf8)

>> [   87.964339] [<c06f7ee9>] (dev_pm_qos_add_notifier) from [<c092927d>] (cpufreq_online+0x5e1/0x8a4)

> 

> Hi Dmitry,

> 

> Unfortunately I am traveling right now and can't test this stuff, though I may

> have found the root cause here. Can you please test the below diff for me ?

> 

> diff --git a/drivers/base/power/qos.c b/drivers/base/power/qos.c

> index 6c90fd7e2ff8..9ac244ee05fe 100644

> --- a/drivers/base/power/qos.c

> +++ b/drivers/base/power/qos.c

> @@ -328,6 +328,8 @@ void dev_pm_qos_constraints_destroy(struct device *dev)

>         spin_unlock_irq(&dev->power.lock);

>  

>         kfree(qos->resume_latency.notifiers);

> +       kfree(qos->min_frequency.notifiers);

> +       kfree(qos->max_frequency.notifiers);

>         kfree(qos);

>  

>   out:

> 


Doesn't help. The use-after-free bugs are usually caused by a missing
NULL assignment after kfree(), like in this snippet:

	..
	if (!a)
		a = kmalloc();
	..
	kfree(a);
	// a = NULL    <-- missing!
	..

I briefly looked through the code and don't see anything obviously
wrong. The bug isn't critical since unlikely that somebody reloads
cpufreq module for a non-development purposes, so it's not a big deal
and can wait. Please take your time.

I also want to point out that kernel crashes after second module reload,
hence the KASAN report should be valid.
Viresh Kumar Oct. 14, 2019, 9:42 a.m. UTC | #3
On 22-09-19, 23:12, Dmitry Osipenko wrote:
> This patch causes use-after-free on a cpufreq driver module reload. Please take a look, thanks in advance.

> 

> 

> [   87.952369] ==================================================================

> [   87.953259] BUG: KASAN: use-after-free in notifier_chain_register+0x4f/0x9c

> [   87.954031] Read of size 4 at addr e6abbd0c by task modprobe/243


Hi Dmitry,

I tried to reproduce it on my ubuntu on ARM64 setup and I couldn't hit
these issues on v5.4-rc1 with Kasan built in.

I then enabled Kasan (tried both inline and outline instrumentation)
but I couldn't get past the issues with module insertion. It fails
like this for me:

root@linaro-developer:~/work# insmod cpufreq-dt.ko 
[   72.985974] cpufreq_dt: Unknown symbol __asan_report_load1_noabort (err -2)
[   72.993164] cpufreq_dt: Unknown symbol __asan_report_load4_noabort (err -2)
[   73.000307] cpufreq_dt: Unknown symbol __asan_report_load8_noabort (err -2)
[   73.007451] cpufreq_dt: Unknown symbol __asan_report_store1_noabort (err -2)
[   73.014643] cpufreq_dt: Unknown symbol __asan_register_globals (err -2)
[   73.021409] cpufreq_dt: Unknown symbol __asan_unregister_globals (err -2)
[   73.028349] cpufreq_dt: Unknown symbol __asan_report_store8_noabort (err -2)
[   73.035543] cpufreq_dt: Unknown symbol __asan_report_store4_noabort (err -2)
insmod: ERROR: could not insert module cpufreq-dt.ko: Unknown symbol in module

I tried to search for these errors but couldn't find why I am getting
these and why the symbols are missing here. Can you suggest something
here ?

-- 
viresh
Dmitry Osipenko Oct. 14, 2019, 1:01 p.m. UTC | #4
14.10.2019 12:42, Viresh Kumar пишет:
> On 22-09-19, 23:12, Dmitry Osipenko wrote:

>> This patch causes use-after-free on a cpufreq driver module reload. Please take a look, thanks in advance.

>>

>>

>> [   87.952369] ==================================================================

>> [   87.953259] BUG: KASAN: use-after-free in notifier_chain_register+0x4f/0x9c

>> [   87.954031] Read of size 4 at addr e6abbd0c by task modprobe/243

> 

> Hi Dmitry,

> 

> I tried to reproduce it on my ubuntu on ARM64 setup and I couldn't hit

> these issues on v5.4-rc1 with Kasan built in.

> 

> I then enabled Kasan (tried both inline and outline instrumentation)

> but I couldn't get past the issues with module insertion. It fails

> like this for me:

> 

> root@linaro-developer:~/work# insmod cpufreq-dt.ko 

> [   72.985974] cpufreq_dt: Unknown symbol __asan_report_load1_noabort (err -2)

> [   72.993164] cpufreq_dt: Unknown symbol __asan_report_load4_noabort (err -2)

> [   73.000307] cpufreq_dt: Unknown symbol __asan_report_load8_noabort (err -2)

> [   73.007451] cpufreq_dt: Unknown symbol __asan_report_store1_noabort (err -2)

> [   73.014643] cpufreq_dt: Unknown symbol __asan_register_globals (err -2)

> [   73.021409] cpufreq_dt: Unknown symbol __asan_unregister_globals (err -2)

> [   73.028349] cpufreq_dt: Unknown symbol __asan_report_store8_noabort (err -2)

> [   73.035543] cpufreq_dt: Unknown symbol __asan_report_store4_noabort (err -2)

> insmod: ERROR: could not insert module cpufreq-dt.ko: Unknown symbol in module

> 

> I tried to search for these errors but couldn't find why I am getting

> these and why the symbols are missing here. Can you suggest something

> here ?

> 


Sorry, I don't know what's wrong with ARM64. There is no KASAN on ARM32 in upstream yet, I'm using
the WIP patches [1].

[1] https://lkml.org/lkml/2019/6/17/1562

BTW, I moved tegra20-cpufreq to use cpufreq-dt recently and the problem presents with the cpufreq-dt:

# rmmod cpufreq_dt
# modprobe cpufreq_dt

[   31.259483] ==================================================================
[   31.260321] BUG: KASAN: use-after-free in notifier_chain_register+0x2b/0x7c
[   31.261026] Read of size 4 at addr cc30250c by task modprobe/218

[   31.262067] CPU: 1 PID: 218 Comm: modprobe Tainted: G        W
5.4.0-rc2-next-20191011-00194-g02f44e30b215-dirty #2645
[   31.263347] Hardware name: NVIDIA Tegra SoC (Flattened Device Tree)
[   31.264154] [<c011116d>] (unwind_backtrace) from [<c010bb05>] (show_stack+0x11/0x14)
[   31.264960] [<c010bb05>] (show_stack) from [<c0d749ad>] (dump_stack+0x89/0x98)
[   31.265804] [<c0d749ad>] (dump_stack) from [<c02c72dd>]
(print_address_description.constprop.0+0x3d/0x340)
[   31.266830] [<c02c72dd>] (print_address_description.constprop.0) from [<c02c7767>]
(__kasan_report+0xe3/0x12c)
[   31.267865] [<c02c7767>] (__kasan_report) from [<c014eabb>] (notifier_chain_register+0x2b/0x7c)
[   31.268755] [<c014eabb>] (notifier_chain_register) from [<c014eb89>]
(blocking_notifier_chain_register+0x29/0x3c)
[   31.269842] [<c014eb89>] (blocking_notifier_chain_register) from [<c072cc11>]
(dev_pm_qos_add_notifier+0x79/0xf8)
[   31.270948] [<c072cc11>] (dev_pm_qos_add_notifier) from [<c095ec71>] (cpufreq_online+0x5e1/0x8a4)
[   31.271922] [<c095ec71>] (cpufreq_online) from [<c095efbd>] (cpufreq_add_dev+0x79/0x80)
[   31.272889] [<c095efbd>] (cpufreq_add_dev) from [<c0720213>] (subsys_interface_register+0xc3/0x100)
[   31.273894] [<c0720213>] (subsys_interface_register) from [<c095d83f>]
(cpufreq_register_driver+0x13b/0x1ec)
[   31.274912] [<c095d83f>] (cpufreq_register_driver) from [<bf800475>] (dt_cpufreq_probe+0x89/0xe0
[cpufreq_dt])
[   31.275924] [<bf800475>] (dt_cpufreq_probe [cpufreq_dt]) from [<c0723e31>]
(platform_drv_probe+0x49/0x88)
[   31.276889] [<c0723e31>] (platform_drv_probe) from [<c0721ad9>] (really_probe+0x109/0x378)
[   31.277715] [<c0721ad9>] (really_probe) from [<c0721e93>] (driver_probe_device+0x57/0x15c)
[   31.278537] [<c0721e93>] (driver_probe_device) from [<c0722145>] (device_driver_attach+0x61/0x64)
[   31.279425] [<c0722145>] (device_driver_attach) from [<c0722191>] (__driver_attach+0x49/0xa0)
[   31.280273] [<c0722191>] (__driver_attach) from [<c071fe6d>] (bus_for_each_dev+0x69/0x94)
[   31.281087] [<c071fe6d>] (bus_for_each_dev) from [<c0720f71>] (bus_add_driver+0x179/0x1e8)
[   31.281909] [<c0720f71>] (bus_add_driver) from [<c0722cf7>] (driver_register+0x8f/0x130)
[   31.282734] [<c0722cf7>] (driver_register) from [<bf805017>] (dt_cpufreq_platdrv_init+0x17/0x1000
[cpufreq_dt])
[   31.283761] [<bf805017>] (dt_cpufreq_platdrv_init [cpufreq_dt]) from [<c0102f69>]
(do_one_initcall+0x4d/0x280)
[   31.284759] [<c0102f69>] (do_one_initcall) from [<c01c70a9>] (do_init_module+0xb9/0x28c)
[   31.285561] [<c01c70a9>] (do_init_module) from [<c01c9ba9>] (load_module+0x2895/0x2c04)
[   31.286347] [<c01c9ba9>] (load_module) from [<c01ca0d7>] (sys_finit_module+0x7b/0x8c)
[   31.287117] [<c01ca0d7>] (sys_finit_module) from [<c0101001>] (ret_fast_syscall+0x1/0x26)
[   31.287901] Exception stack(0xcabb3fa8 to 0xcabb3ff0)
[   31.288406] 3fa0:                   0003f348 00000001 00000003 0002b744 00000000 b6b31e74
[   31.289200] 3fc0: 0003f348 00000001 94ccfd00 0000017b 0003f3f0 00000000 0003f348 00040010
[   31.290029] 3fe0: b6b31df8 b6b31de8 00022534 aec752f0

[   31.290698] Allocated by task 181:
[   31.291065]  __kasan_kmalloc.constprop.0+0x7b/0x84
[   31.291565]  cpufreq_online+0x55f/0x8a4
[   31.291959]  cpufreq_add_dev+0x79/0x80
[   31.292351]  subsys_interface_register+0xc3/0x100
[   31.292830]  cpufreq_register_driver+0x13b/0x1ec
[   31.293335]  dt_cpufreq_probe+0x89/0xe0 [cpufreq_dt]
[   31.293832]  platform_drv_probe+0x49/0x88
[   31.294245]  really_probe+0x109/0x378
[   31.294623]  driver_probe_device+0x57/0x15c
[   31.295048]  device_driver_attach+0x61/0x64
[   31.295472]  __driver_attach+0x49/0xa0
[   31.295854]  bus_for_each_dev+0x69/0x94
[   31.296244]  bus_add_driver+0x179/0x1e8
[   31.296636]  driver_register+0x8f/0x130
[   31.297047]  dt_cpufreq_platdrv_init+0x17/0x1000 [cpufreq_dt]
[   31.297616]  do_one_initcall+0x4d/0x280
[   31.298013]  do_init_module+0xb9/0x28c
[   31.298397]  load_module+0x2895/0x2c04
[   31.298780]  sys_finit_module+0x7b/0x8c
[   31.299167]  ret_fast_syscall+0x1/0x26
[   31.299548]  0xb6c2ac60

[   31.299967] Freed by task 214:
[   31.300288]  __kasan_slab_free+0xb7/0xe0
[   31.300686]  kfree+0x71/0x1f4
[   31.301001]  subsys_interface_unregister+0xad/0xf0
[   31.338959]  cpufreq_unregister_driver+0x2f/0x7c
[   31.377102]  dt_cpufreq_remove+0x15/0x18 [cpufreq_dt]
[   31.414885]  platform_drv_remove+0x27/0x34
[   31.452644]  device_release_driver_internal+0xdf/0x1a8
[   31.490404]  driver_detach+0x85/0xf8
[   31.527682]  bus_remove_driver+0x53/0xb0
[   31.564827]  dt_cpufreq_platdrv_exit+0x9/0xb28 [cpufreq_dt]
[   31.601736]  sys_delete_module+0x117/0x1a4
[   31.638575]  ret_fast_syscall+0x1/0x26
[   31.675041]  0xb6cafff4

[   31.746517] The buggy address belongs to the object at cc302400
                which belongs to the cache kmalloc-512 of size 512
[   31.817855] The buggy address is located 268 bytes inside of
                512-byte region [cc302400, cc302600)
[   31.888496] The buggy address belongs to the page:
[   31.923247] page:d291a000 refcount:1 mapcount:0 mapping:ce001a00 index:0x0 compound_mapcount: 0
[   31.958247] flags: 0x10200(slab|head)
[   31.992944] raw: 00010200 00000100 00000122 ce001a00 00000000 00100010 ffffffff 00000001
[   32.027763] page dumped because: kasan: bad access detected

[   32.095965] Memory state around the buggy address:
[   32.129904]  cc302400: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[   32.163593]  cc302480: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[   32.196538] >cc302500: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[   32.229052]               ^
[   32.260939]  cc302580: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[   32.292881]  cc302600: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[   32.324296] ==================================================================
[   32.355594] Disabling lock debugging due to kernel taint
[   32.462151] ------------[ cut here ]------------
[   32.492881] WARNING: CPU: 1 PID: 218 at lib/refcount.c:156 dev_pm_opp_of_add_table+0x59/0x128
[   32.523741] refcount_t: increment on 0; use-after-free.
[   32.554329] Modules linked in: cpufreq_dt(+) tegra30_devfreq [last unloaded: cpufreq_dt]
[   32.585233] CPU: 1 PID: 218 Comm: modprobe Tainted: G    B   W
5.4.0-rc2-next-20191011-00194-g02f44e30b215-dirty #2645
[   32.646692] Hardware name: NVIDIA Tegra SoC (Flattened Device Tree)
[   32.677493] [<c011116d>] (unwind_backtrace) from [<c010bb05>] (show_stack+0x11/0x14)
[   32.708460] [<c010bb05>] (show_stack) from [<c0d749ad>] (dump_stack+0x89/0x98)
[   32.739392] [<c0d749ad>] (dump_stack) from [<c0127713>] (__warn+0x10f/0x110)
[   32.770049] [<c0127713>] (__warn) from [<c0127a09>] (warn_slowpath_fmt+0x61/0x78)
[   32.800656] [<c0127a09>] (warn_slowpath_fmt) from [<c095afc5>] (dev_pm_opp_of_add_table+0x59/0x128)
[   32.860732] [<c095afc5>] (dev_pm_opp_of_add_table) from [<c095b0c5>]
(dev_pm_opp_of_cpumask_add_table+0x31/0x88)
[   32.921247] [<c095b0c5>] (dev_pm_opp_of_cpumask_add_table) from [<bf800245>]
(cpufreq_init+0xd9/0x280 [cpufreq_dt])
[   32.982732] [<bf800245>] (cpufreq_init [cpufreq_dt]) from [<c095ea0f>] (cpufreq_online+0x37f/0x8a4)
[   33.045107] [<c095ea0f>] (cpufreq_online) from [<c095efbd>] (cpufreq_add_dev+0x79/0x80)
[   33.077037] [<c095efbd>] (cpufreq_add_dev) from [<c0720213>] (subsys_interface_register+0xc3/0x100)
[   33.140128] [<c0720213>] (subsys_interface_register) from [<c095d83f>]
(cpufreq_register_driver+0x13b/0x1ec)
[   33.204911] [<c095d83f>] (cpufreq_register_driver) from [<bf800475>] (dt_cpufreq_probe+0x89/0xe0
[cpufreq_dt])
[   33.271766] [<bf800475>] (dt_cpufreq_probe [cpufreq_dt]) from [<c0723e31>]
(platform_drv_probe+0x49/0x88)
[   33.340156] [<c0723e31>] (platform_drv_probe) from [<c0721ad9>] (really_probe+0x109/0x378)
[   33.375275] [<c0721ad9>] (really_probe) from [<c0721e93>] (driver_probe_device+0x57/0x15c)
[   33.410559] [<c0721e93>] (driver_probe_device) from [<c0722145>] (device_driver_attach+0x61/0x64)
[   33.446244] [<c0722145>] (device_driver_attach) from [<c0722191>] (__driver_attach+0x49/0xa0)
[   33.482238] [<c0722191>] (__driver_attach) from [<c071fe6d>] (bus_for_each_dev+0x69/0x94)
[   33.518513] [<c071fe6d>] (bus_for_each_dev) from [<c0720f71>] (bus_add_driver+0x179/0x1e8)
[   33.555099] [<c0720f71>] (bus_add_driver) from [<c0722cf7>] (driver_register+0x8f/0x130)
[   33.592015] [<c0722cf7>] (driver_register) from [<bf805017>] (dt_cpufreq_platdrv_init+0x17/0x1000
[cpufreq_dt])
[   33.666547] [<bf805017>] (dt_cpufreq_platdrv_init [cpufreq_dt]) from [<c0102f69>]
(do_one_initcall+0x4d/0x280)
[   33.742553] [<c0102f69>] (do_one_initcall) from [<c01c70a9>] (do_init_module+0xb9/0x28c)
[   33.781507] [<c01c70a9>] (do_init_module) from [<c01c9ba9>] (load_module+0x2895/0x2c04)
[   33.820735] [<c01c9ba9>] (load_module) from [<c01ca0d7>] (sys_finit_module+0x7b/0x8c)
[   33.860308] [<c01ca0d7>] (sys_finit_module) from [<c0101001>] (ret_fast_syscall+0x1/0x26)
[   33.900121] Exception stack(0xcabb3fa8 to 0xcabb3ff0)
[   33.940062] 3fa0:                   0003f348 00000001 00000003 0002b744 00000000 b6b31e74
[   33.980876] 3fc0: 0003f348 00000001 94ccfd00 0000017b 0003f3f0 00000000 0003f348 00040010
[   34.021838] 3fe0: b6b31df8 b6b31de8 00022534 aec752f0
[   34.062931] ---[ end trace f68728a0d3053b54 ]---
Dmitry Osipenko Oct. 15, 2019, 1:45 p.m. UTC | #5
15.10.2019 14:46, Viresh Kumar пишет:
> On 22-09-19, 23:12, Dmitry Osipenko wrote:

>> Hello Viresh,

>>

>> This patch causes use-after-free on a cpufreq driver module reload. Please take a look, thanks in advance.

>>

>>

>> [   87.952369] ==================================================================

>> [   87.953259] BUG: KASAN: use-after-free in notifier_chain_register+0x4f/0x9c

>> [   87.954031] Read of size 4 at addr e6abbd0c by task modprobe/243

>>

>> [   87.954901] CPU: 1 PID: 243 Comm: modprobe Tainted: G        W

>> 5.3.0-next-20190920-00185-gf61698eab956-dirty #2408

>> [   87.956077] Hardware name: NVIDIA Tegra SoC (Flattened Device Tree)

>> [   87.956807] [<c0110aad>] (unwind_backtrace) from [<c010bb71>] (show_stack+0x11/0x14)

>> [   87.957709] [<c010bb71>] (show_stack) from [<c0d37b25>] (dump_stack+0x89/0x98)

>> [   87.958616] [<c0d37b25>] (dump_stack) from [<c02937e1>]

>> (print_address_description.constprop.0+0x3d/0x340)

>> [   87.959785] [<c02937e1>] (print_address_description.constprop.0) from [<c0293c6b>]

>> (__kasan_report+0xe3/0x12c)

>> [   87.960907] [<c0293c6b>] (__kasan_report) from [<c014988f>] (notifier_chain_register+0x4f/0x9c)

>> [   87.962001] [<c014988f>] (notifier_chain_register) from [<c01499b5>]

>> (blocking_notifier_chain_register+0x29/0x3c)

>> [   87.963180] [<c01499b5>] (blocking_notifier_chain_register) from [<c06f7ee9>]

>> (dev_pm_qos_add_notifier+0x79/0xf8)

>> [   87.964339] [<c06f7ee9>] (dev_pm_qos_add_notifier) from [<c092927d>] (cpufreq_online+0x5e1/0x8a4)

>> [   87.965351] [<c092927d>] (cpufreq_online) from [<c09295c9>] (cpufreq_add_dev+0x79/0x80)

>> [   87.966247] [<c09295c9>] (cpufreq_add_dev) from [<c06eb9d3>] (subsys_interface_register+0xc3/0x100)

>> [   87.967297] [<c06eb9d3>] (subsys_interface_register) from [<c0926e53>]

>> (cpufreq_register_driver+0x13b/0x1ec)

>> [   87.968476] [<c0926e53>] (cpufreq_register_driver) from [<bf800435>]

>> (tegra20_cpufreq_probe+0x165/0x1a8 [tegra20_cpufreq])

> 

> Hi Dmitry,

> 

> Thanks for the bug report and I was finally able to reproduce it at my end and

> this was quite an interesting debugging exercise :)

> 

> When a cpufreq driver gets registered, we register with the subsys interface and

> it calls cpufreq_add_dev() for each CPU, starting from CPU0. And so the QoS

> notifiers get added to the first CPU of the policy, i.e. CPU0 in common cases.

> 

> When the cpufreq driver gets unregistered, we unregister with the subsys

> interface and it calls cpufreq_remove_dev() for each CPU, starting from CPU0

> (should have been in reverse order I feel). We remove the QoS notifier only when

> cpufreq_remove_dev() gets called for the last CPU of the policy, lets call it

> CPUx. Now this has a different notifier list as compared to CPU0.

> 

> In short, we are adding the cpufreq notifiers to CPU0 and removing them from

> CPUx. When we try to add it again by inserting the module for second time, we

> find a node in the notifier list which is already freed but still in the list as

> we removed it from CPUx's list (which doesn't do anything as the node wasn't

> there in the first place).

> 

> @Rafael: How do you see we solve this problem ? Here are the options I could

> think of:

> 

> - Update subsys layer to reverse the order of devices while unregistering (this

>   will fix the current problem, but we will still have corner cases hanging

>   around, like if the CPU0 is hotplugged out, etc).

> 

> - Update QoS framework with the knowledge of related CPUs, this has been pending

>   until now from my side. And this is the thing we really need to do. Eventually

>   we shall have only a single notifier list for all CPUs of a policy, at least

>   for MIN/MAX frequencies.

> 

> - ??

> 


Viresh, thank you very much! Looking forward to a fix :)
Rafael J. Wysocki Oct. 15, 2019, 3:53 p.m. UTC | #6
On Tue, Oct 15, 2019 at 1:46 PM Viresh Kumar <viresh.kumar@linaro.org> wrote:
>

> On 22-09-19, 23:12, Dmitry Osipenko wrote:

> > Hello Viresh,

> >

> > This patch causes use-after-free on a cpufreq driver module reload. Please take a look, thanks in advance.

> >

> >

> > [   87.952369] ==================================================================

> > [   87.953259] BUG: KASAN: use-after-free in notifier_chain_register+0x4f/0x9c

> > [   87.954031] Read of size 4 at addr e6abbd0c by task modprobe/243

> >

> > [   87.954901] CPU: 1 PID: 243 Comm: modprobe Tainted: G        W

> > 5.3.0-next-20190920-00185-gf61698eab956-dirty #2408

> > [   87.956077] Hardware name: NVIDIA Tegra SoC (Flattened Device Tree)

> > [   87.956807] [<c0110aad>] (unwind_backtrace) from [<c010bb71>] (show_stack+0x11/0x14)

> > [   87.957709] [<c010bb71>] (show_stack) from [<c0d37b25>] (dump_stack+0x89/0x98)

> > [   87.958616] [<c0d37b25>] (dump_stack) from [<c02937e1>]

> > (print_address_description.constprop.0+0x3d/0x340)

> > [   87.959785] [<c02937e1>] (print_address_description.constprop.0) from [<c0293c6b>]

> > (__kasan_report+0xe3/0x12c)

> > [   87.960907] [<c0293c6b>] (__kasan_report) from [<c014988f>] (notifier_chain_register+0x4f/0x9c)

> > [   87.962001] [<c014988f>] (notifier_chain_register) from [<c01499b5>]

> > (blocking_notifier_chain_register+0x29/0x3c)

> > [   87.963180] [<c01499b5>] (blocking_notifier_chain_register) from [<c06f7ee9>]

> > (dev_pm_qos_add_notifier+0x79/0xf8)

> > [   87.964339] [<c06f7ee9>] (dev_pm_qos_add_notifier) from [<c092927d>] (cpufreq_online+0x5e1/0x8a4)

> > [   87.965351] [<c092927d>] (cpufreq_online) from [<c09295c9>] (cpufreq_add_dev+0x79/0x80)

> > [   87.966247] [<c09295c9>] (cpufreq_add_dev) from [<c06eb9d3>] (subsys_interface_register+0xc3/0x100)

> > [   87.967297] [<c06eb9d3>] (subsys_interface_register) from [<c0926e53>]

> > (cpufreq_register_driver+0x13b/0x1ec)

> > [   87.968476] [<c0926e53>] (cpufreq_register_driver) from [<bf800435>]

> > (tegra20_cpufreq_probe+0x165/0x1a8 [tegra20_cpufreq])

>

> Hi Dmitry,

>

> Thanks for the bug report and I was finally able to reproduce it at my end and

> this was quite an interesting debugging exercise :)

>

> When a cpufreq driver gets registered, we register with the subsys interface and

> it calls cpufreq_add_dev() for each CPU, starting from CPU0. And so the QoS

> notifiers get added to the first CPU of the policy, i.e. CPU0 in common cases.

>

> When the cpufreq driver gets unregistered, we unregister with the subsys

> interface and it calls cpufreq_remove_dev() for each CPU, starting from CPU0

> (should have been in reverse order I feel). We remove the QoS notifier only when

> cpufreq_remove_dev() gets called for the last CPU of the policy, lets call it

> CPUx. Now this has a different notifier list as compared to CPU0.


The same problem will appear if the original policy CPU goes offline, won't it?

> In short, we are adding the cpufreq notifiers to CPU0 and removing them from

> CPUx. When we try to add it again by inserting the module for second time, we

> find a node in the notifier list which is already freed but still in the list as

> we removed it from CPUx's list (which doesn't do anything as the node wasn't

> there in the first place).

>

> @Rafael: How do you see we solve this problem ? Here are the options I could

> think of:

>

> - Update subsys layer to reverse the order of devices while unregistering (this

>   will fix the current problem, but we will still have corner cases hanging

>   around, like if the CPU0 is hotplugged out, etc).


This isn't sufficient for the offline case.

> - Update QoS framework with the knowledge of related CPUs, this has been pending

>   until now from my side. And this is the thing we really need to do. Eventually

>   we shall have only a single notifier list for all CPUs of a policy, at least

>   for MIN/MAX frequencies.


- Move the PM QoS requests and notifiers to the new policy CPU on all
changes of that.  That is, when cpufreq_offline() nominates the new
"leader", all of the QoS stuff for the policy needs to go to this one.

Of course, the reverse order of unregistration in the subsys layer
would help to avoid some useless churn related to that.
Rafael J. Wysocki Oct. 15, 2019, 9:50 p.m. UTC | #7
On Tue, Oct 15, 2019 at 5:53 PM Rafael J. Wysocki <rafael@kernel.org> wrote:
>

> On Tue, Oct 15, 2019 at 1:46 PM Viresh Kumar <viresh.kumar@linaro.org> wrote:

> >

> > On 22-09-19, 23:12, Dmitry Osipenko wrote:

> > > Hello Viresh,

> > >

> > > This patch causes use-after-free on a cpufreq driver module reload. Please take a look, thanks in advance.

> > >

> > >

> > > [   87.952369] ==================================================================

> > > [   87.953259] BUG: KASAN: use-after-free in notifier_chain_register+0x4f/0x9c

> > > [   87.954031] Read of size 4 at addr e6abbd0c by task modprobe/243

> > >

> > > [   87.954901] CPU: 1 PID: 243 Comm: modprobe Tainted: G        W

> > > 5.3.0-next-20190920-00185-gf61698eab956-dirty #2408

> > > [   87.956077] Hardware name: NVIDIA Tegra SoC (Flattened Device Tree)

> > > [   87.956807] [<c0110aad>] (unwind_backtrace) from [<c010bb71>] (show_stack+0x11/0x14)

> > > [   87.957709] [<c010bb71>] (show_stack) from [<c0d37b25>] (dump_stack+0x89/0x98)

> > > [   87.958616] [<c0d37b25>] (dump_stack) from [<c02937e1>]

> > > (print_address_description.constprop.0+0x3d/0x340)

> > > [   87.959785] [<c02937e1>] (print_address_description.constprop.0) from [<c0293c6b>]

> > > (__kasan_report+0xe3/0x12c)

> > > [   87.960907] [<c0293c6b>] (__kasan_report) from [<c014988f>] (notifier_chain_register+0x4f/0x9c)

> > > [   87.962001] [<c014988f>] (notifier_chain_register) from [<c01499b5>]

> > > (blocking_notifier_chain_register+0x29/0x3c)

> > > [   87.963180] [<c01499b5>] (blocking_notifier_chain_register) from [<c06f7ee9>]

> > > (dev_pm_qos_add_notifier+0x79/0xf8)

> > > [   87.964339] [<c06f7ee9>] (dev_pm_qos_add_notifier) from [<c092927d>] (cpufreq_online+0x5e1/0x8a4)

> > > [   87.965351] [<c092927d>] (cpufreq_online) from [<c09295c9>] (cpufreq_add_dev+0x79/0x80)

> > > [   87.966247] [<c09295c9>] (cpufreq_add_dev) from [<c06eb9d3>] (subsys_interface_register+0xc3/0x100)

> > > [   87.967297] [<c06eb9d3>] (subsys_interface_register) from [<c0926e53>]

> > > (cpufreq_register_driver+0x13b/0x1ec)

> > > [   87.968476] [<c0926e53>] (cpufreq_register_driver) from [<bf800435>]

> > > (tegra20_cpufreq_probe+0x165/0x1a8 [tegra20_cpufreq])

> >

> > Hi Dmitry,

> >

> > Thanks for the bug report and I was finally able to reproduce it at my end and

> > this was quite an interesting debugging exercise :)

> >

> > When a cpufreq driver gets registered, we register with the subsys interface and

> > it calls cpufreq_add_dev() for each CPU, starting from CPU0. And so the QoS

> > notifiers get added to the first CPU of the policy, i.e. CPU0 in common cases.

> >

> > When the cpufreq driver gets unregistered, we unregister with the subsys

> > interface and it calls cpufreq_remove_dev() for each CPU, starting from CPU0

> > (should have been in reverse order I feel). We remove the QoS notifier only when

> > cpufreq_remove_dev() gets called for the last CPU of the policy, lets call it

> > CPUx. Now this has a different notifier list as compared to CPU0.

>

> The same problem will appear if the original policy CPU goes offline, won't it?

>

> > In short, we are adding the cpufreq notifiers to CPU0 and removing them from

> > CPUx. When we try to add it again by inserting the module for second time, we

> > find a node in the notifier list which is already freed but still in the list as

> > we removed it from CPUx's list (which doesn't do anything as the node wasn't

> > there in the first place).

> >

> > @Rafael: How do you see we solve this problem ? Here are the options I could

> > think of:

> >

> > - Update subsys layer to reverse the order of devices while unregistering (this

> >   will fix the current problem, but we will still have corner cases hanging

> >   around, like if the CPU0 is hotplugged out, etc).

>

> This isn't sufficient for the offline case.

>

> > - Update QoS framework with the knowledge of related CPUs, this has been pending

> >   until now from my side. And this is the thing we really need to do. Eventually

> >   we shall have only a single notifier list for all CPUs of a policy, at least

> >   for MIN/MAX frequencies.

>

> - Move the PM QoS requests and notifiers to the new policy CPU on all

> changes of that.  That is, when cpufreq_offline() nominates the new

> "leader", all of the QoS stuff for the policy needs to go to this one.


Alas, that still will not work, because things like
acpi_processor_ppc_init() only work accidentally for one-CPU policies.
Generally, adding such a PM QoS request to a non-policy CPU simply has
no effect until it becomes a policy CPU which may be never.

It looks like using device PM QoS for cpufreq is a mistake in general
and what is needed is a struct pm_qos_constraints member in struct
cpufreq_policy and something like

struct freq_pm_qos_request {
        enum freq_pm_qos_req_type type; /* min or max */
        struct plist_node pnode;
        struct cpufreq_policy *policy;
};

Then, pm_qos_update_target() can be used for adding, updating and
removing requests.
Viresh Kumar Oct. 16, 2019, 8:27 a.m. UTC | #8
On 15-10-19, 23:50, Rafael J. Wysocki wrote:
> On Tue, Oct 15, 2019 at 5:53 PM Rafael J. Wysocki <rafael@kernel.org> wrote:


> > > - Update QoS framework with the knowledge of related CPUs, this has been pending

> > >   until now from my side. And this is the thing we really need to do. Eventually

> > >   we shall have only a single notifier list for all CPUs of a policy, at least

> > >   for MIN/MAX frequencies.

> >

> > - Move the PM QoS requests and notifiers to the new policy CPU on all

> > changes of that.  That is, when cpufreq_offline() nominates the new

> > "leader", all of the QoS stuff for the policy needs to go to this one.

> 

> Alas, that still will not work, because things like

> acpi_processor_ppc_init() only work accidentally for one-CPU policies.


I am not sure what problem you see here ? Can you please explain a bit more.

> Generally, adding such a PM QoS request to a non-policy CPU simply has

> no effect until it becomes a policy CPU which may be never.


I was thinking maybe we can read the constraints for all CPUs in the
policy->cpus mask in cpufreq_set_policy() and so this part of the problem will
just go away. The only part that would be left is to remove the QoS constraints
properly.

> It looks like using device PM QoS for cpufreq is a mistake in general

> and what is needed is a struct pm_qos_constraints member in struct

> cpufreq_policy and something like

> 

> struct freq_pm_qos_request {

>         enum freq_pm_qos_req_type type; /* min or max */

>         struct plist_node pnode;

>         struct cpufreq_policy *policy;

> };

> 

> Then, pm_qos_update_target() can be used for adding, updating and

> removing requests.


-- 
viresh
Rafael J. Wysocki Oct. 16, 2019, 8:43 a.m. UTC | #9
On Wed, Oct 16, 2019 at 10:27 AM Viresh Kumar <viresh.kumar@linaro.org> wrote:
>

> On 15-10-19, 23:50, Rafael J. Wysocki wrote:

> > On Tue, Oct 15, 2019 at 5:53 PM Rafael J. Wysocki <rafael@kernel.org> wrote:

>

> > > > - Update QoS framework with the knowledge of related CPUs, this has been pending

> > > >   until now from my side. And this is the thing we really need to do. Eventually

> > > >   we shall have only a single notifier list for all CPUs of a policy, at least

> > > >   for MIN/MAX frequencies.

> > >

> > > - Move the PM QoS requests and notifiers to the new policy CPU on all

> > > changes of that.  That is, when cpufreq_offline() nominates the new

> > > "leader", all of the QoS stuff for the policy needs to go to this one.

> >

> > Alas, that still will not work, because things like

> > acpi_processor_ppc_init() only work accidentally for one-CPU policies.

>

> I am not sure what problem you see here ? Can you please explain a bit more.


Never mind, sorry.  This is called for policy->cpu too.

> > Generally, adding such a PM QoS request to a non-policy CPU simply has

> > no effect until it becomes a policy CPU which may be never.

>

> I was thinking maybe we can read the constraints for all CPUs in the

> policy->cpus mask in cpufreq_set_policy() and so this part of the problem will

> just go away. The only part that would be left is to remove the QoS constraints

> properly.


That would be on the complicated side IMO.

> > It looks like using device PM QoS for cpufreq is a mistake in general

> > and what is needed is a struct pm_qos_constraints member in struct

> > cpufreq_policy and something like

> >

> > struct freq_pm_qos_request {

> >         enum freq_pm_qos_req_type type; /* min or max */

> >         struct plist_node pnode;

> >         struct cpufreq_policy *policy;

> > };

> >

> > Then, pm_qos_update_target() can be used for adding, updating and

> > removing requests.


I have patches implementing this idea, more or less, almost ready,
stay tuned. :-)
diff mbox series

Patch

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index ceb57af15ca0..b96ef6db1bfe 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -26,6 +26,7 @@ 
 #include <linux/kernel_stat.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
+#include <linux/pm_qos.h>
 #include <linux/slab.h>
 #include <linux/suspend.h>
 #include <linux/syscore_ops.h>
@@ -999,7 +1000,7 @@  static void add_cpu_dev_symlink(struct cpufreq_policy *policy, unsigned int cpu)
 {
 	struct device *dev = get_cpu_device(cpu);
 
-	if (!dev)
+	if (unlikely(!dev))
 		return;
 
 	if (cpumask_test_and_set_cpu(cpu, policy->real_cpus))
@@ -1117,14 +1118,16 @@  static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy, unsigned int cp
 
 static void refresh_frequency_limits(struct cpufreq_policy *policy)
 {
-	struct cpufreq_policy new_policy = *policy;
-
-	pr_debug("updating policy for CPU %u\n", policy->cpu);
+	struct cpufreq_policy new_policy;
 
-	new_policy.min = policy->user_policy.min;
-	new_policy.max = policy->user_policy.max;
+	if (!policy_is_inactive(policy)) {
+		new_policy = *policy;
+		pr_debug("updating policy for CPU %u\n", policy->cpu);
 
-	cpufreq_set_policy(policy, &new_policy);
+		new_policy.min = policy->user_policy.min;
+		new_policy.max = policy->user_policy.max;
+		cpufreq_set_policy(policy, &new_policy);
+	}
 }
 
 static void handle_update(struct work_struct *work)
@@ -1133,14 +1136,60 @@  static void handle_update(struct work_struct *work)
 		container_of(work, struct cpufreq_policy, update);
 
 	pr_debug("handle_update for cpu %u called\n", policy->cpu);
+	down_write(&policy->rwsem);
 	refresh_frequency_limits(policy);
+	up_write(&policy->rwsem);
+}
+
+static int cpufreq_notifier_min(struct notifier_block *nb, unsigned long freq,
+				void *data)
+{
+	struct cpufreq_policy *policy = container_of(nb, struct cpufreq_policy, nb_min);
+
+	schedule_work(&policy->update);
+	return 0;
+}
+
+static int cpufreq_notifier_max(struct notifier_block *nb, unsigned long freq,
+				void *data)
+{
+	struct cpufreq_policy *policy = container_of(nb, struct cpufreq_policy, nb_max);
+
+	schedule_work(&policy->update);
+	return 0;
+}
+
+static void cpufreq_policy_put_kobj(struct cpufreq_policy *policy)
+{
+	struct kobject *kobj;
+	struct completion *cmp;
+
+	down_write(&policy->rwsem);
+	cpufreq_stats_free_table(policy);
+	kobj = &policy->kobj;
+	cmp = &policy->kobj_unregister;
+	up_write(&policy->rwsem);
+	kobject_put(kobj);
+
+	/*
+	 * We need to make sure that the underlying kobj is
+	 * actually not referenced anymore by anybody before we
+	 * proceed with unloading.
+	 */
+	pr_debug("waiting for dropping of refcount\n");
+	wait_for_completion(cmp);
+	pr_debug("wait complete\n");
 }
 
 static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)
 {
 	struct cpufreq_policy *policy;
+	struct device *dev = get_cpu_device(cpu);
 	int ret;
 
+	if (!dev)
+		return NULL;
+
 	policy = kzalloc(sizeof(*policy), GFP_KERNEL);
 	if (!policy)
 		return NULL;
@@ -1157,7 +1206,7 @@  static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)
 	ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq,
 				   cpufreq_global_kobject, "policy%u", cpu);
 	if (ret) {
-		pr_err("%s: failed to init policy->kobj: %d\n", __func__, ret);
+		dev_err(dev, "%s: failed to init policy->kobj: %d\n", __func__, ret);
 		/*
 		 * The entire policy object will be freed below, but the extra
 		 * memory allocated for the kobject name needs to be freed by
@@ -1167,6 +1216,25 @@  static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)
 		goto err_free_real_cpus;
 	}
 
+	policy->nb_min.notifier_call = cpufreq_notifier_min;
+	policy->nb_max.notifier_call = cpufreq_notifier_max;
+
+	ret = dev_pm_qos_add_notifier(dev, &policy->nb_min,
+				      DEV_PM_QOS_MIN_FREQUENCY);
+	if (ret) {
+		dev_err(dev, "Failed to register MIN QoS notifier: %d (%*pbl)\n",
+			ret, cpumask_pr_args(policy->cpus));
+		goto err_kobj_remove;
+	}
+
+	ret = dev_pm_qos_add_notifier(dev, &policy->nb_max,
+				      DEV_PM_QOS_MAX_FREQUENCY);
+	if (ret) {
+		dev_err(dev, "Failed to register MAX QoS notifier: %d (%*pbl)\n",
+			ret, cpumask_pr_args(policy->cpus));
+		goto err_min_qos_notifier;
+	}
+
 	INIT_LIST_HEAD(&policy->policy_list);
 	init_rwsem(&policy->rwsem);
 	spin_lock_init(&policy->transition_lock);
@@ -1177,6 +1245,11 @@  static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)
 	policy->cpu = cpu;
 	return policy;
 
+err_min_qos_notifier:
+	dev_pm_qos_remove_notifier(dev, &policy->nb_min,
+				   DEV_PM_QOS_MIN_FREQUENCY);
+err_kobj_remove:
+	cpufreq_policy_put_kobj(policy);
 err_free_real_cpus:
 	free_cpumask_var(policy->real_cpus);
 err_free_rcpumask:
@@ -1189,30 +1262,9 @@  static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)
 	return NULL;
 }
 
-static void cpufreq_policy_put_kobj(struct cpufreq_policy *policy)
-{
-	struct kobject *kobj;
-	struct completion *cmp;
-
-	down_write(&policy->rwsem);
-	cpufreq_stats_free_table(policy);
-	kobj = &policy->kobj;
-	cmp = &policy->kobj_unregister;
-	up_write(&policy->rwsem);
-	kobject_put(kobj);
-
-	/*
-	 * We need to make sure that the underlying kobj is
-	 * actually not referenced anymore by anybody before we
-	 * proceed with unloading.
-	 */
-	pr_debug("waiting for dropping of refcount\n");
-	wait_for_completion(cmp);
-	pr_debug("wait complete\n");
-}
-
 static void cpufreq_policy_free(struct cpufreq_policy *policy)
 {
+	struct device *dev = get_cpu_device(policy->cpu);
 	unsigned long flags;
 	int cpu;
 
@@ -1224,6 +1276,11 @@  static void cpufreq_policy_free(struct cpufreq_policy *policy)
 		per_cpu(cpufreq_cpu_data, cpu) = NULL;
 	write_unlock_irqrestore(&cpufreq_driver_lock, flags);
 
+	dev_pm_qos_remove_notifier(dev, &policy->nb_max,
+				   DEV_PM_QOS_MAX_FREQUENCY);
+	dev_pm_qos_remove_notifier(dev, &policy->nb_min,
+				   DEV_PM_QOS_MIN_FREQUENCY);
+
 	cpufreq_policy_put_kobj(policy);
 	free_cpumask_var(policy->real_cpus);
 	free_cpumask_var(policy->related_cpus);
@@ -2283,6 +2340,8 @@  int cpufreq_set_policy(struct cpufreq_policy *policy,
 		       struct cpufreq_policy *new_policy)
 {
 	struct cpufreq_governor *old_gov;
+	struct device *cpu_dev = get_cpu_device(policy->cpu);
+	unsigned long min, max;
 	int ret;
 
 	pr_debug("setting new policy for CPU %u: %u - %u kHz\n",
@@ -2297,11 +2356,27 @@  int cpufreq_set_policy(struct cpufreq_policy *policy,
 	if (new_policy->min > new_policy->max)
 		return -EINVAL;
 
+	/*
+	 * PM QoS framework collects all the requests from users and provide us
+	 * the final aggregated value here.
+	 */
+	min = dev_pm_qos_read_value(cpu_dev, DEV_PM_QOS_MIN_FREQUENCY);
+	max = dev_pm_qos_read_value(cpu_dev, DEV_PM_QOS_MAX_FREQUENCY);
+
+	if (min > new_policy->min)
+		new_policy->min = min;
+	if (max < new_policy->max)
+		new_policy->max = max;
+
 	/* verify the cpu speed can be set within this limit */
 	ret = cpufreq_driver->verify(new_policy);
 	if (ret)
 		return ret;
 
+	/*
+	 * The notifier-chain shall be removed once all the users of
+	 * CPUFREQ_ADJUST are moved to use the QoS framework.
+	 */
 	/* adjust if necessary - all reasons */
 	blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
 			CPUFREQ_ADJUST, new_policy);
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index a1467aa7f58b..95425941f46d 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -147,6 +147,9 @@  struct cpufreq_policy {
 
 	/* Pointer to the cooling device if used for thermal mitigation */
 	struct thermal_cooling_device *cdev;
+
+	struct notifier_block nb_min;
+	struct notifier_block nb_max;
 };
 
 struct cpufreq_freqs {