[V6,5/7] cpufreq: Register notifiers with the PM QoS framework

Message ID 5ad2624194baa2f53acc1f1e627eb7684c577a19.1562210705.git.viresh.kumar@linaro.org
State New
Headers show
Series
  • [V6,1/7] PM / QOS: Pass request type to dev_pm_qos_{add|remove}_notifier()
Related show

Commit Message

Viresh Kumar July 4, 2019, 7:36 a.m.
This registers the notifiers for min/max frequency constraints with the
PM QoS framework. The constraints are also taken into consideration in
cpufreq_set_policy().

This also relocates cpufreq_policy_put_kobj() as it is required to be
called from cpufreq_policy_alloc() now.

refresh_frequency_limits() is updated to have proper locking in place and
avoid calling cpufreq_set_policy() for inactive policies.

No constraints are added until now though.

Reviewed-by: Matthias Kaehlcke <mka@chromium.org>

Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

---
 drivers/cpufreq/cpufreq.c | 135 ++++++++++++++++++++++++++++++--------
 include/linux/cpufreq.h   |   3 +
 2 files changed, 109 insertions(+), 29 deletions(-)

-- 
2.21.0.rc0.269.g1a574e7a288b

Comments

Dmitry Osipenko Sept. 22, 2019, 8:12 p.m. | #1
08.07.2019 13:57, Viresh Kumar пишет:
> This registers the notifiers for min/max frequency constraints with the

> PM QoS framework. The constraints are also taken into consideration in

> cpufreq_set_policy().

> 

> This also relocates cpufreq_policy_put_kobj() as it is required to be

> called from cpufreq_policy_alloc() now.

> 

> refresh_frequency_limits() is updated to avoid calling

> cpufreq_set_policy() for inactive policies and handle_update() is

> updated to have proper locking in place.

> 

> No constraints are added until now though.

> 

> Reviewed-by: Matthias Kaehlcke <mka@chromium.org>

> Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>

> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>

> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

> ---

> V6->V7:

> - All callers of refresh_frequency_limits(), except handle_update(),

>   take the policy->rwsem and result in deadlock as

>   refresh_frequency_limits() also takes the same lock again. Fix that

>   by taking the rwsem from handle_update() instead.

> 

> @Rafael: Sending it before Pavel has verified it as I would be offline

> later, in case you want to apply this today itself.

> 

>  drivers/cpufreq/cpufreq.c | 135 +++++++++++++++++++++++++++++---------

>  include/linux/cpufreq.h   |   3 +

>  2 files changed, 108 insertions(+), 30 deletions(-)

> 

> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c

> index ceb57af15ca0..b96ef6db1bfe 100644

> --- a/drivers/cpufreq/cpufreq.c

> +++ b/drivers/cpufreq/cpufreq.c

> @@ -26,6 +26,7 @@

>  #include <linux/kernel_stat.h>

>  #include <linux/module.h>

>  #include <linux/mutex.h>

> +#include <linux/pm_qos.h>

>  #include <linux/slab.h>

>  #include <linux/suspend.h>

>  #include <linux/syscore_ops.h>

> @@ -999,7 +1000,7 @@ static void add_cpu_dev_symlink(struct cpufreq_policy *policy, unsigned int cpu)

>  {

>  	struct device *dev = get_cpu_device(cpu);

>  

> -	if (!dev)

> +	if (unlikely(!dev))

>  		return;

>  

>  	if (cpumask_test_and_set_cpu(cpu, policy->real_cpus))

> @@ -1117,14 +1118,16 @@ static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy, unsigned int cp

>  

>  static void refresh_frequency_limits(struct cpufreq_policy *policy)

>  {

> -	struct cpufreq_policy new_policy = *policy;

> -

> -	pr_debug("updating policy for CPU %u\n", policy->cpu);

> +	struct cpufreq_policy new_policy;

>  

> -	new_policy.min = policy->user_policy.min;

> -	new_policy.max = policy->user_policy.max;

> +	if (!policy_is_inactive(policy)) {

> +		new_policy = *policy;

> +		pr_debug("updating policy for CPU %u\n", policy->cpu);

>  

> -	cpufreq_set_policy(policy, &new_policy);

> +		new_policy.min = policy->user_policy.min;

> +		new_policy.max = policy->user_policy.max;

> +		cpufreq_set_policy(policy, &new_policy);

> +	}

>  }

>  

>  static void handle_update(struct work_struct *work)

> @@ -1133,14 +1136,60 @@ static void handle_update(struct work_struct *work)

>  		container_of(work, struct cpufreq_policy, update);

>  

>  	pr_debug("handle_update for cpu %u called\n", policy->cpu);

> +	down_write(&policy->rwsem);

>  	refresh_frequency_limits(policy);

> +	up_write(&policy->rwsem);

> +}

> +

> +static int cpufreq_notifier_min(struct notifier_block *nb, unsigned long freq,

> +				void *data)

> +{

> +	struct cpufreq_policy *policy = container_of(nb, struct cpufreq_policy, nb_min);

> +

> +	schedule_work(&policy->update);

> +	return 0;

> +}

> +

> +static int cpufreq_notifier_max(struct notifier_block *nb, unsigned long freq,

> +				void *data)

> +{

> +	struct cpufreq_policy *policy = container_of(nb, struct cpufreq_policy, nb_max);

> +

> +	schedule_work(&policy->update);

> +	return 0;

> +}

> +

> +static void cpufreq_policy_put_kobj(struct cpufreq_policy *policy)

> +{

> +	struct kobject *kobj;

> +	struct completion *cmp;

> +

> +	down_write(&policy->rwsem);

> +	cpufreq_stats_free_table(policy);

> +	kobj = &policy->kobj;

> +	cmp = &policy->kobj_unregister;

> +	up_write(&policy->rwsem);

> +	kobject_put(kobj);

> +

> +	/*

> +	 * We need to make sure that the underlying kobj is

> +	 * actually not referenced anymore by anybody before we

> +	 * proceed with unloading.

> +	 */

> +	pr_debug("waiting for dropping of refcount\n");

> +	wait_for_completion(cmp);

> +	pr_debug("wait complete\n");

>  }

>  

>  static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)

>  {

>  	struct cpufreq_policy *policy;

> +	struct device *dev = get_cpu_device(cpu);

>  	int ret;

>  

> +	if (!dev)

> +		return NULL;

> +

>  	policy = kzalloc(sizeof(*policy), GFP_KERNEL);

>  	if (!policy)

>  		return NULL;

> @@ -1157,7 +1206,7 @@ static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)

>  	ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq,

>  				   cpufreq_global_kobject, "policy%u", cpu);

>  	if (ret) {

> -		pr_err("%s: failed to init policy->kobj: %d\n", __func__, ret);

> +		dev_err(dev, "%s: failed to init policy->kobj: %d\n", __func__, ret);

>  		/*

>  		 * The entire policy object will be freed below, but the extra

>  		 * memory allocated for the kobject name needs to be freed by

> @@ -1167,6 +1216,25 @@ static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)

>  		goto err_free_real_cpus;

>  	}

>  

> +	policy->nb_min.notifier_call = cpufreq_notifier_min;

> +	policy->nb_max.notifier_call = cpufreq_notifier_max;

> +

> +	ret = dev_pm_qos_add_notifier(dev, &policy->nb_min,

> +				      DEV_PM_QOS_MIN_FREQUENCY);

> +	if (ret) {

> +		dev_err(dev, "Failed to register MIN QoS notifier: %d (%*pbl)\n",

> +			ret, cpumask_pr_args(policy->cpus));

> +		goto err_kobj_remove;

> +	}

> +

> +	ret = dev_pm_qos_add_notifier(dev, &policy->nb_max,

> +				      DEV_PM_QOS_MAX_FREQUENCY);

> +	if (ret) {

> +		dev_err(dev, "Failed to register MAX QoS notifier: %d (%*pbl)\n",

> +			ret, cpumask_pr_args(policy->cpus));

> +		goto err_min_qos_notifier;

> +	}

> +

>  	INIT_LIST_HEAD(&policy->policy_list);

>  	init_rwsem(&policy->rwsem);

>  	spin_lock_init(&policy->transition_lock);

> @@ -1177,6 +1245,11 @@ static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)

>  	policy->cpu = cpu;

>  	return policy;

>  

> +err_min_qos_notifier:

> +	dev_pm_qos_remove_notifier(dev, &policy->nb_min,

> +				   DEV_PM_QOS_MIN_FREQUENCY);

> +err_kobj_remove:

> +	cpufreq_policy_put_kobj(policy);

>  err_free_real_cpus:

>  	free_cpumask_var(policy->real_cpus);

>  err_free_rcpumask:

> @@ -1189,30 +1262,9 @@ static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)

>  	return NULL;

>  }

>  

> -static void cpufreq_policy_put_kobj(struct cpufreq_policy *policy)

> -{

> -	struct kobject *kobj;

> -	struct completion *cmp;

> -

> -	down_write(&policy->rwsem);

> -	cpufreq_stats_free_table(policy);

> -	kobj = &policy->kobj;

> -	cmp = &policy->kobj_unregister;

> -	up_write(&policy->rwsem);

> -	kobject_put(kobj);

> -

> -	/*

> -	 * We need to make sure that the underlying kobj is

> -	 * actually not referenced anymore by anybody before we

> -	 * proceed with unloading.

> -	 */

> -	pr_debug("waiting for dropping of refcount\n");

> -	wait_for_completion(cmp);

> -	pr_debug("wait complete\n");

> -}

> -

>  static void cpufreq_policy_free(struct cpufreq_policy *policy)

>  {

> +	struct device *dev = get_cpu_device(policy->cpu);

>  	unsigned long flags;

>  	int cpu;

>  

> @@ -1224,6 +1276,11 @@ static void cpufreq_policy_free(struct cpufreq_policy *policy)

>  		per_cpu(cpufreq_cpu_data, cpu) = NULL;

>  	write_unlock_irqrestore(&cpufreq_driver_lock, flags);

>  

> +	dev_pm_qos_remove_notifier(dev, &policy->nb_max,

> +				   DEV_PM_QOS_MAX_FREQUENCY);

> +	dev_pm_qos_remove_notifier(dev, &policy->nb_min,

> +				   DEV_PM_QOS_MIN_FREQUENCY);

> +

>  	cpufreq_policy_put_kobj(policy);

>  	free_cpumask_var(policy->real_cpus);

>  	free_cpumask_var(policy->related_cpus);

> @@ -2283,6 +2340,8 @@ int cpufreq_set_policy(struct cpufreq_policy *policy,

>  		       struct cpufreq_policy *new_policy)

>  {

>  	struct cpufreq_governor *old_gov;

> +	struct device *cpu_dev = get_cpu_device(policy->cpu);

> +	unsigned long min, max;

>  	int ret;

>  

>  	pr_debug("setting new policy for CPU %u: %u - %u kHz\n",

> @@ -2297,11 +2356,27 @@ int cpufreq_set_policy(struct cpufreq_policy *policy,

>  	if (new_policy->min > new_policy->max)

>  		return -EINVAL;

>  

> +	/*

> +	 * PM QoS framework collects all the requests from users and provide us

> +	 * the final aggregated value here.

> +	 */

> +	min = dev_pm_qos_read_value(cpu_dev, DEV_PM_QOS_MIN_FREQUENCY);

> +	max = dev_pm_qos_read_value(cpu_dev, DEV_PM_QOS_MAX_FREQUENCY);

> +

> +	if (min > new_policy->min)

> +		new_policy->min = min;

> +	if (max < new_policy->max)

> +		new_policy->max = max;

> +

>  	/* verify the cpu speed can be set within this limit */

>  	ret = cpufreq_driver->verify(new_policy);

>  	if (ret)

>  		return ret;

>  

> +	/*

> +	 * The notifier-chain shall be removed once all the users of

> +	 * CPUFREQ_ADJUST are moved to use the QoS framework.

> +	 */

>  	/* adjust if necessary - all reasons */

>  	blocking_notifier_call_chain(&cpufreq_policy_notifier_list,

>  			CPUFREQ_ADJUST, new_policy);

> diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h

> index a1467aa7f58b..95425941f46d 100644

> --- a/include/linux/cpufreq.h

> +++ b/include/linux/cpufreq.h

> @@ -147,6 +147,9 @@ struct cpufreq_policy {

>  

>  	/* Pointer to the cooling device if used for thermal mitigation */

>  	struct thermal_cooling_device *cdev;

> +

> +	struct notifier_block nb_min;

> +	struct notifier_block nb_max;

>  };

>  

>  struct cpufreq_freqs {

> 


Hello Viresh,

This patch causes use-after-free on a cpufreq driver module reload. Please take a look, thanks in advance.


[   87.952369] ==================================================================
[   87.953259] BUG: KASAN: use-after-free in notifier_chain_register+0x4f/0x9c
[   87.954031] Read of size 4 at addr e6abbd0c by task modprobe/243

[   87.954901] CPU: 1 PID: 243 Comm: modprobe Tainted: G        W
5.3.0-next-20190920-00185-gf61698eab956-dirty #2408
[   87.956077] Hardware name: NVIDIA Tegra SoC (Flattened Device Tree)
[   87.956807] [<c0110aad>] (unwind_backtrace) from [<c010bb71>] (show_stack+0x11/0x14)
[   87.957709] [<c010bb71>] (show_stack) from [<c0d37b25>] (dump_stack+0x89/0x98)
[   87.958616] [<c0d37b25>] (dump_stack) from [<c02937e1>]
(print_address_description.constprop.0+0x3d/0x340)
[   87.959785] [<c02937e1>] (print_address_description.constprop.0) from [<c0293c6b>]
(__kasan_report+0xe3/0x12c)
[   87.960907] [<c0293c6b>] (__kasan_report) from [<c014988f>] (notifier_chain_register+0x4f/0x9c)
[   87.962001] [<c014988f>] (notifier_chain_register) from [<c01499b5>]
(blocking_notifier_chain_register+0x29/0x3c)
[   87.963180] [<c01499b5>] (blocking_notifier_chain_register) from [<c06f7ee9>]
(dev_pm_qos_add_notifier+0x79/0xf8)
[   87.964339] [<c06f7ee9>] (dev_pm_qos_add_notifier) from [<c092927d>] (cpufreq_online+0x5e1/0x8a4)
[   87.965351] [<c092927d>] (cpufreq_online) from [<c09295c9>] (cpufreq_add_dev+0x79/0x80)
[   87.966247] [<c09295c9>] (cpufreq_add_dev) from [<c06eb9d3>] (subsys_interface_register+0xc3/0x100)
[   87.967297] [<c06eb9d3>] (subsys_interface_register) from [<c0926e53>]
(cpufreq_register_driver+0x13b/0x1ec)
[   87.968476] [<c0926e53>] (cpufreq_register_driver) from [<bf800435>]
(tegra20_cpufreq_probe+0x165/0x1a8 [tegra20_cpufreq])
[   87.969711] [<bf800435>] (tegra20_cpufreq_probe [tegra20_cpufreq]) from [<c06ef7a5>]
(platform_drv_probe+0x49/0x88)
[   87.970854] [<c06ef7a5>] (platform_drv_probe) from [<c06ed299>] (really_probe+0x109/0x374)
[   87.971771] [<c06ed299>] (really_probe) from [<c06ed64f>] (driver_probe_device+0x57/0x158)
[   87.972685] [<c06ed64f>] (driver_probe_device) from [<c06ed8fd>] (device_driver_attach+0x61/0x64)
[   87.973658] [<c06ed8fd>] (device_driver_attach) from [<c06ed949>] (__driver_attach+0x49/0xa0)
[   87.974599] [<c06ed949>] (__driver_attach) from [<c06eb62d>] (bus_for_each_dev+0x69/0x94)
[   87.975510] [<c06eb62d>] (bus_for_each_dev) from [<c06ec731>] (bus_add_driver+0x179/0x1e8)
[   87.976427] [<c06ec731>] (bus_add_driver) from [<c06ee4ab>] (driver_register+0x8f/0x130)
[   87.977345] [<c06ee4ab>] (driver_register) from [<bf805017>]
(tegra20_cpufreq_driver_init+0x17/0x1000 [tegra20_cpufreq])
[   87.978553] [<bf805017>] (tegra20_cpufreq_driver_init [tegra20_cpufreq]) from [<c0102f35>]
(do_one_initcall+0x4d/0x280)
[   87.979729] [<c0102f35>] (do_one_initcall) from [<c01c8675>] (do_init_module+0xb9/0x288)
[   87.980618] [<c01c8675>] (do_init_module) from [<c01cb105>] (load_module+0x2829/0x2b90)
[   87.981496] [<c01cb105>] (load_module) from [<c01cb62b>] (sys_finit_module+0x7b/0x8c)
[   87.982358] [<c01cb62b>] (sys_finit_module) from [<c0101001>] (ret_fast_syscall+0x1/0x26)
[   87.983240] Exception stack(0xe4babfa8 to 0xe4babff0)
[   87.983809] bfa0:                   0003f210 00000001 00000003 0002b744 00000000 b6611e64
[   87.984703] bfc0: 0003f210 00000001 29a92d00 0000017b 0003f2a8 00000000 0003f210 00040008
[   87.985588] bfe0: b6611de8 b6611dd8 00022534 aec402e0

[   87.986340] Allocated by task 182:
[   87.986756]  cpufreq_online+0x55f/0x8a4
[   87.987204]  cpufreq_add_dev+0x79/0x80
[   87.987641]  subsys_interface_register+0xc3/0x100
[   87.988178]  cpufreq_register_driver+0x13b/0x1ec
[   87.988719]  tegra20_cpufreq_probe+0x165/0x1a8 [tegra20_cpufreq]
[   87.989389]  platform_drv_probe+0x49/0x88
[   87.989849]  really_probe+0x109/0x374
[   88.021245]  driver_probe_device+0x57/0x158
[   88.052890]  device_driver_attach+0x61/0x64
[   88.084012]  __driver_attach+0x49/0xa0
[   88.114533]  bus_for_each_dev+0x69/0x94
[   88.144898]  bus_add_driver+0x179/0x1e8
[   88.174840]  driver_register+0x8f/0x130
[   88.204398]  tegra20_cpufreq_driver_init+0x17/0x1000 [tegra20_cpufreq]
[   88.234165]  do_one_initcall+0x4d/0x280
[   88.263754]  do_init_module+0xb9/0x288
[   88.292773]  load_module+0x2829/0x2b90
[   88.321106]  sys_finit_module+0x7b/0x8c
[   88.349098]  ret_fast_syscall+0x1/0x26
[   88.376732]  0xb6802c60

[   88.430292] Freed by task 239:
[   88.456279]  __kasan_slab_free+0x9f/0xc4
[   88.481802]  kfree+0x71/0x1f4
[   88.506805]  subsys_interface_unregister+0xad/0xf0
[   88.531524]  cpufreq_unregister_driver+0x2f/0x7c
[   88.555909]  tegra20_cpufreq_remove+0x19/0x48 [tegra20_cpufreq]
[   88.580273]  platform_drv_remove+0x27/0x34
[   88.604156]  device_release_driver_internal+0xdf/0x1a4
[   88.627659]  driver_detach+0x85/0xf8
[   88.650458]  bus_remove_driver+0x53/0xb0
[   88.672706]  tegra20_cpufreq_driver_exit+0x9/0xb88 [tegra20_cpufreq]
[   88.695100]  sys_delete_module+0x14d/0x1a0
[   88.717147]  ret_fast_syscall+0x1/0x26
[   88.738799]  0xb6d35ff4

[   88.780543] The buggy address belongs to the object at e6abbc00
                which belongs to the cache kmalloc-512 of size 512
[   88.821845] The buggy address is located 268 bytes inside of
                512-byte region [e6abbc00, e6abbe00)
[   88.861188] The buggy address belongs to the page:
[   88.880380] page:edcbc700 refcount:1 mapcount:0 mapping:e7001a00 index:0x0 compound_mapcount: 0
[   88.900167] flags: 0x10200(slab|head)
[   88.919599] raw: 00010200 edcafb00 00030003 e7001a00 00000000 00100010 ffffffff 00000001
[   88.939423] page dumped because: kasan: bad access detected

[   88.978080] Memory state around the buggy address:
[   88.997237]  e6abbc00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[   89.016710]  e6abbc80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[   89.036051] >e6abbd00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[   89.055192]               ^
[   89.074260]  e6abbd80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[   89.093832]  e6abbe00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[   89.113300] ==================================================================
Viresh Kumar Oct. 15, 2019, 11:46 a.m. | #2
On 22-09-19, 23:12, Dmitry Osipenko wrote:
> Hello Viresh,

> 

> This patch causes use-after-free on a cpufreq driver module reload. Please take a look, thanks in advance.

> 

> 

> [   87.952369] ==================================================================

> [   87.953259] BUG: KASAN: use-after-free in notifier_chain_register+0x4f/0x9c

> [   87.954031] Read of size 4 at addr e6abbd0c by task modprobe/243

> 

> [   87.954901] CPU: 1 PID: 243 Comm: modprobe Tainted: G        W

> 5.3.0-next-20190920-00185-gf61698eab956-dirty #2408

> [   87.956077] Hardware name: NVIDIA Tegra SoC (Flattened Device Tree)

> [   87.956807] [<c0110aad>] (unwind_backtrace) from [<c010bb71>] (show_stack+0x11/0x14)

> [   87.957709] [<c010bb71>] (show_stack) from [<c0d37b25>] (dump_stack+0x89/0x98)

> [   87.958616] [<c0d37b25>] (dump_stack) from [<c02937e1>]

> (print_address_description.constprop.0+0x3d/0x340)

> [   87.959785] [<c02937e1>] (print_address_description.constprop.0) from [<c0293c6b>]

> (__kasan_report+0xe3/0x12c)

> [   87.960907] [<c0293c6b>] (__kasan_report) from [<c014988f>] (notifier_chain_register+0x4f/0x9c)

> [   87.962001] [<c014988f>] (notifier_chain_register) from [<c01499b5>]

> (blocking_notifier_chain_register+0x29/0x3c)

> [   87.963180] [<c01499b5>] (blocking_notifier_chain_register) from [<c06f7ee9>]

> (dev_pm_qos_add_notifier+0x79/0xf8)

> [   87.964339] [<c06f7ee9>] (dev_pm_qos_add_notifier) from [<c092927d>] (cpufreq_online+0x5e1/0x8a4)

> [   87.965351] [<c092927d>] (cpufreq_online) from [<c09295c9>] (cpufreq_add_dev+0x79/0x80)

> [   87.966247] [<c09295c9>] (cpufreq_add_dev) from [<c06eb9d3>] (subsys_interface_register+0xc3/0x100)

> [   87.967297] [<c06eb9d3>] (subsys_interface_register) from [<c0926e53>]

> (cpufreq_register_driver+0x13b/0x1ec)

> [   87.968476] [<c0926e53>] (cpufreq_register_driver) from [<bf800435>]

> (tegra20_cpufreq_probe+0x165/0x1a8 [tegra20_cpufreq])


Hi Dmitry,

Thanks for the bug report and I was finally able to reproduce it at my end and
this was quite an interesting debugging exercise :)

When a cpufreq driver gets registered, we register with the subsys interface and
it calls cpufreq_add_dev() for each CPU, starting from CPU0. And so the QoS
notifiers get added to the first CPU of the policy, i.e. CPU0 in common cases.

When the cpufreq driver gets unregistered, we unregister with the subsys
interface and it calls cpufreq_remove_dev() for each CPU, starting from CPU0
(should have been in reverse order I feel). We remove the QoS notifier only when
cpufreq_remove_dev() gets called for the last CPU of the policy, lets call it
CPUx. Now this has a different notifier list as compared to CPU0.

In short, we are adding the cpufreq notifiers to CPU0 and removing them from
CPUx. When we try to add it again by inserting the module for second time, we
find a node in the notifier list which is already freed but still in the list as
we removed it from CPUx's list (which doesn't do anything as the node wasn't
there in the first place).

@Rafael: How do you see we solve this problem ? Here are the options I could
think of:

- Update subsys layer to reverse the order of devices while unregistering (this
  will fix the current problem, but we will still have corner cases hanging
  around, like if the CPU0 is hotplugged out, etc).

- Update QoS framework with the knowledge of related CPUs, this has been pending
  until now from my side. And this is the thing we really need to do. Eventually
  we shall have only a single notifier list for all CPUs of a policy, at least
  for MIN/MAX frequencies.

- ??

-- 
viresh

Patch

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index ceb57af15ca0..81117e4d43cc 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -26,6 +26,7 @@ 
 #include <linux/kernel_stat.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
+#include <linux/pm_qos.h>
 #include <linux/slab.h>
 #include <linux/suspend.h>
 #include <linux/syscore_ops.h>
@@ -999,7 +1000,7 @@  static void add_cpu_dev_symlink(struct cpufreq_policy *policy, unsigned int cpu)
 {
 	struct device *dev = get_cpu_device(cpu);
 
-	if (!dev)
+	if (unlikely(!dev))
 		return;
 
 	if (cpumask_test_and_set_cpu(cpu, policy->real_cpus))
@@ -1117,14 +1118,20 @@  static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy, unsigned int cp
 
 static void refresh_frequency_limits(struct cpufreq_policy *policy)
 {
-	struct cpufreq_policy new_policy = *policy;
+	struct cpufreq_policy new_policy;
 
-	pr_debug("updating policy for CPU %u\n", policy->cpu);
+	down_write(&policy->rwsem);
+
+	if (!policy_is_inactive(policy)) {
+		new_policy = *policy;
+		pr_debug("updating policy for CPU %u\n", policy->cpu);
 
-	new_policy.min = policy->user_policy.min;
-	new_policy.max = policy->user_policy.max;
+		new_policy.min = policy->user_policy.min;
+		new_policy.max = policy->user_policy.max;
+		cpufreq_set_policy(policy, &new_policy);
+	}
 
-	cpufreq_set_policy(policy, &new_policy);
+	up_write(&policy->rwsem);
 }
 
 static void handle_update(struct work_struct *work)
@@ -1136,11 +1143,55 @@  static void handle_update(struct work_struct *work)
 	refresh_frequency_limits(policy);
 }
 
+static int cpufreq_notifier_min(struct notifier_block *nb, unsigned long freq,
+				void *data)
+{
+	struct cpufreq_policy *policy = container_of(nb, struct cpufreq_policy, nb_min);
+
+	schedule_work(&policy->update);
+	return 0;
+}
+
+static int cpufreq_notifier_max(struct notifier_block *nb, unsigned long freq,
+				void *data)
+{
+	struct cpufreq_policy *policy = container_of(nb, struct cpufreq_policy, nb_max);
+
+	schedule_work(&policy->update);
+	return 0;
+}
+
+static void cpufreq_policy_put_kobj(struct cpufreq_policy *policy)
+{
+	struct kobject *kobj;
+	struct completion *cmp;
+
+	down_write(&policy->rwsem);
+	cpufreq_stats_free_table(policy);
+	kobj = &policy->kobj;
+	cmp = &policy->kobj_unregister;
+	up_write(&policy->rwsem);
+	kobject_put(kobj);
+
+	/*
+	 * We need to make sure that the underlying kobj is
+	 * actually not referenced anymore by anybody before we
+	 * proceed with unloading.
+	 */
+	pr_debug("waiting for dropping of refcount\n");
+	wait_for_completion(cmp);
+	pr_debug("wait complete\n");
+}
+
 static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)
 {
 	struct cpufreq_policy *policy;
+	struct device *dev = get_cpu_device(cpu);
 	int ret;
 
+	if (!dev)
+		return NULL;
+
 	policy = kzalloc(sizeof(*policy), GFP_KERNEL);
 	if (!policy)
 		return NULL;
@@ -1157,7 +1208,7 @@  static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)
 	ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq,
 				   cpufreq_global_kobject, "policy%u", cpu);
 	if (ret) {
-		pr_err("%s: failed to init policy->kobj: %d\n", __func__, ret);
+		dev_err(dev, "%s: failed to init policy->kobj: %d\n", __func__, ret);
 		/*
 		 * The entire policy object will be freed below, but the extra
 		 * memory allocated for the kobject name needs to be freed by
@@ -1167,6 +1218,25 @@  static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)
 		goto err_free_real_cpus;
 	}
 
+	policy->nb_min.notifier_call = cpufreq_notifier_min;
+	policy->nb_max.notifier_call = cpufreq_notifier_max;
+
+	ret = dev_pm_qos_add_notifier(dev, &policy->nb_min,
+				      DEV_PM_QOS_MIN_FREQUENCY);
+	if (ret) {
+		dev_err(dev, "Failed to register MIN QoS notifier: %d (%*pbl)\n",
+			ret, cpumask_pr_args(policy->cpus));
+		goto err_kobj_remove;
+	}
+
+	ret = dev_pm_qos_add_notifier(dev, &policy->nb_max,
+				      DEV_PM_QOS_MAX_FREQUENCY);
+	if (ret) {
+		dev_err(dev, "Failed to register MAX QoS notifier: %d (%*pbl)\n",
+			ret, cpumask_pr_args(policy->cpus));
+		goto err_min_qos_notifier;
+	}
+
 	INIT_LIST_HEAD(&policy->policy_list);
 	init_rwsem(&policy->rwsem);
 	spin_lock_init(&policy->transition_lock);
@@ -1177,6 +1247,11 @@  static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)
 	policy->cpu = cpu;
 	return policy;
 
+err_min_qos_notifier:
+	dev_pm_qos_remove_notifier(dev, &policy->nb_min,
+				   DEV_PM_QOS_MIN_FREQUENCY);
+err_kobj_remove:
+	cpufreq_policy_put_kobj(policy);
 err_free_real_cpus:
 	free_cpumask_var(policy->real_cpus);
 err_free_rcpumask:
@@ -1189,30 +1264,9 @@  static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)
 	return NULL;
 }
 
-static void cpufreq_policy_put_kobj(struct cpufreq_policy *policy)
-{
-	struct kobject *kobj;
-	struct completion *cmp;
-
-	down_write(&policy->rwsem);
-	cpufreq_stats_free_table(policy);
-	kobj = &policy->kobj;
-	cmp = &policy->kobj_unregister;
-	up_write(&policy->rwsem);
-	kobject_put(kobj);
-
-	/*
-	 * We need to make sure that the underlying kobj is
-	 * actually not referenced anymore by anybody before we
-	 * proceed with unloading.
-	 */
-	pr_debug("waiting for dropping of refcount\n");
-	wait_for_completion(cmp);
-	pr_debug("wait complete\n");
-}
-
 static void cpufreq_policy_free(struct cpufreq_policy *policy)
 {
+	struct device *dev = get_cpu_device(policy->cpu);
 	unsigned long flags;
 	int cpu;
 
@@ -1224,6 +1278,11 @@  static void cpufreq_policy_free(struct cpufreq_policy *policy)
 		per_cpu(cpufreq_cpu_data, cpu) = NULL;
 	write_unlock_irqrestore(&cpufreq_driver_lock, flags);
 
+	dev_pm_qos_remove_notifier(dev, &policy->nb_max,
+				   DEV_PM_QOS_MAX_FREQUENCY);
+	dev_pm_qos_remove_notifier(dev, &policy->nb_min,
+				   DEV_PM_QOS_MIN_FREQUENCY);
+
 	cpufreq_policy_put_kobj(policy);
 	free_cpumask_var(policy->real_cpus);
 	free_cpumask_var(policy->related_cpus);
@@ -2283,6 +2342,8 @@  int cpufreq_set_policy(struct cpufreq_policy *policy,
 		       struct cpufreq_policy *new_policy)
 {
 	struct cpufreq_governor *old_gov;
+	struct device *cpu_dev = get_cpu_device(policy->cpu);
+	unsigned long min, max;
 	int ret;
 
 	pr_debug("setting new policy for CPU %u: %u - %u kHz\n",
@@ -2297,11 +2358,27 @@  int cpufreq_set_policy(struct cpufreq_policy *policy,
 	if (new_policy->min > new_policy->max)
 		return -EINVAL;
 
+	/*
+	 * PM QoS framework collects all the requests from users and provide us
+	 * the final aggregated value here.
+	 */
+	min = dev_pm_qos_read_value(cpu_dev, DEV_PM_QOS_MIN_FREQUENCY);
+	max = dev_pm_qos_read_value(cpu_dev, DEV_PM_QOS_MAX_FREQUENCY);
+
+	if (min > new_policy->min)
+		new_policy->min = min;
+	if (max < new_policy->max)
+		new_policy->max = max;
+
 	/* verify the cpu speed can be set within this limit */
 	ret = cpufreq_driver->verify(new_policy);
 	if (ret)
 		return ret;
 
+	/*
+	 * The notifier-chain shall be removed once all the users of
+	 * CPUFREQ_ADJUST are moved to use the QoS framework.
+	 */
 	/* adjust if necessary - all reasons */
 	blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
 			CPUFREQ_ADJUST, new_policy);
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index a1467aa7f58b..95425941f46d 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -147,6 +147,9 @@  struct cpufreq_policy {
 
 	/* Pointer to the cooling device if used for thermal mitigation */
 	struct thermal_cooling_device *cdev;
+
+	struct notifier_block nb_min;
+	struct notifier_block nb_max;
 };
 
 struct cpufreq_freqs {