diff mbox series

cpufreq: fix race on cpufreq online

Message ID 20220420191541.99528-1-schspa@gmail.com
State New
Headers show
Series cpufreq: fix race on cpufreq online | expand

Commit Message

Schspa Shi April 20, 2022, 7:15 p.m. UTC
When cpufreq online failed, policy->cpus are not empty while
cpufreq sysfs file available, we may access some data freed.

Take policy->clk as an example:

static int cpufreq_online(unsigned int cpu)
{
  ...
  // policy->cpus != 0 at this time
  down_write(&policy->rwsem);
  ret = cpufreq_add_dev_interface(policy);
  up_write(&policy->rwsem);

  return 0;

out_destroy_policy:
	for_each_cpu(j, policy->real_cpus)
		remove_cpu_dev_symlink(policy, get_cpu_device(j));
    up_write(&policy->rwsem);
...
out_exit_policy:
  if (cpufreq_driver->exit)
    cpufreq_driver->exit(policy);
      clk_put(policy->clk);
      // policy->clk is a wild pointer
...
                                    ^
                                    |
                            Another process access
                            __cpufreq_get
                              cpufreq_verify_current_freq
                                cpufreq_generic_get
                                  // acces wild pointer of policy->clk;
                                    |
                                    |
out_offline_policy:                 |
  cpufreq_policy_free(policy);      |
    // deleted here, and will wait for no body reference
    cpufreq_policy_put_kobj(policy);
}

Signed-off-by: Schspa Shi <schspa@gmail.com>
---
 drivers/cpufreq/cpufreq.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

Comments

Viresh Kumar May 9, 2022, 3:57 a.m. UTC | #1
I had to dig the old patch first before starting to review what your
next one does.

On 21-04-22, 03:15, Schspa Shi wrote:
> When cpufreq online failed, policy->cpus are not empty while
> cpufreq sysfs file available, we may access some data freed.
> 
> Take policy->clk as an example:
> 
> static int cpufreq_online(unsigned int cpu)
> {
>   ...
>   // policy->cpus != 0 at this time
>   down_write(&policy->rwsem);
>   ret = cpufreq_add_dev_interface(policy);

Please keep some code to help understand where we went from here. I am
sure you meant that we will error out in this case, but you removed
the relevant code.

>   up_write(&policy->rwsem);
> 
>   return 0;
> 
> out_destroy_policy:
> 	for_each_cpu(j, policy->real_cpus)
> 		remove_cpu_dev_symlink(policy, get_cpu_device(j));
>     up_write(&policy->rwsem);
> ...
> out_exit_policy:
>   if (cpufreq_driver->exit)
>     cpufreq_driver->exit(policy);
>       clk_put(policy->clk);
>       // policy->clk is a wild pointer
> ...
>                                     ^
>                                     |
>                             Another process access
>                             __cpufreq_get
>                               cpufreq_verify_current_freq
>                                 cpufreq_generic_get
>                                   // acces wild pointer of policy->clk;
>                                     |
>                                     |
> out_offline_policy:                 |
>   cpufreq_policy_free(policy);      |
>     // deleted here, and will wait for no body reference
>     cpufreq_policy_put_kobj(policy);
> }
> 
> Signed-off-by: Schspa Shi <schspa@gmail.com>
> ---
>  drivers/cpufreq/cpufreq.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> index 80f535cc8a75..0d58b0f8f3af 100644
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c
> @@ -1533,8 +1533,6 @@ static int cpufreq_online(unsigned int cpu)
>  	for_each_cpu(j, policy->real_cpus)
>  		remove_cpu_dev_symlink(policy, get_cpu_device(j));
>  
> -	up_write(&policy->rwsem);
> -
>  out_offline_policy:
>  	if (cpufreq_driver->offline)
>  		cpufreq_driver->offline(policy);
> @@ -1543,6 +1541,9 @@ static int cpufreq_online(unsigned int cpu)
>  	if (cpufreq_driver->exit)
>  		cpufreq_driver->exit(policy);
>  
> +	cpumask_clear(policy->cpus);
> +	up_write(&policy->rwsem);

This is simply buggy as now an error out to out_offline_policy or
out_exit_policy will try to release a semaphore which was never taken
in the first place. This works fine only if we failed late, i.e. via
out_destroy_policy.

The very first thing we need to do now is revert this patch. Lemme
send a patch for that and you can send a fresh fix over that once you
have a stable fix.
Schspa Shi May 9, 2022, 3:06 p.m. UTC | #2
Viresh Kumar <viresh.kumar@linaro.org> writes:

> I had to dig the old patch first before starting to review what your
> next one does.
>
> On 21-04-22, 03:15, Schspa Shi wrote:
>> When cpufreq online failed, policy->cpus are not empty while
>> cpufreq sysfs file available, we may access some data freed.
>>
>> Take policy->clk as an example:
>>
>> static int cpufreq_online(unsigned int cpu)
>> {
>>   ...
>>   // policy->cpus != 0 at this time
>>   down_write(&policy->rwsem);
>>   ret = cpufreq_add_dev_interface(policy);
>
> Please keep some code to help understand where we went from here. I am
> sure you meant that we will error out in this case, but you removed
> the relevant code.
>

Yes, I will add this to the next version of patch.

>>   up_write(&policy->rwsem);
>>
>>   return 0;
>>
>> out_destroy_policy:
>>      for_each_cpu(j, policy->real_cpus)
>>              remove_cpu_dev_symlink(policy, get_cpu_device(j));
>>     up_write(&policy->rwsem);
>> ...
>> out_exit_policy:
>>   if (cpufreq_driver->exit)
>>     cpufreq_driver->exit(policy);
>>       clk_put(policy->clk);
>>       // policy->clk is a wild pointer
>> ...
>>                                     ^
>>                                     |
>>                             Another process access
>>                             __cpufreq_get
>>                               cpufreq_verify_current_freq
>>                                 cpufreq_generic_get
>>                                   // acces wild pointer of policy->clk;
>>                                     |
>>                                     |
>> out_offline_policy:                 |
>>   cpufreq_policy_free(policy);      |
>>     // deleted here, and will wait for no body reference
>>     cpufreq_policy_put_kobj(policy);
>> }
>>
>> Signed-off-by: Schspa Shi <schspa@gmail.com>
>> ---
>>  drivers/cpufreq/cpufreq.c | 5 +++--
>>  1 file changed, 3 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
>> index 80f535cc8a75..0d58b0f8f3af 100644
>> --- a/drivers/cpufreq/cpufreq.c
>> +++ b/drivers/cpufreq/cpufreq.c
>> @@ -1533,8 +1533,6 @@ static int cpufreq_online(unsigned int cpu)
>>      for_each_cpu(j, policy->real_cpus)
>>              remove_cpu_dev_symlink(policy, get_cpu_device(j));
>>
>> -    up_write(&policy->rwsem);
>> -
>>  out_offline_policy:
>>      if (cpufreq_driver->offline)
>>              cpufreq_driver->offline(policy);
>> @@ -1543,6 +1541,9 @@ static int cpufreq_online(unsigned int cpu)
>>      if (cpufreq_driver->exit)
>>              cpufreq_driver->exit(policy);
>>
>> +    cpumask_clear(policy->cpus);
>> +    up_write(&policy->rwsem);
>
> This is simply buggy as now an error out to out_offline_policy or
> out_exit_policy will try to release a semaphore which was never taken
> in the first place. This works fine only if we failed late, i.e. via
> out_destroy_policy.
>

I am very sorry for this oversight.

To fix this issue, there is no need to move cpufreq_driver->exit(policy)
and cpufreq_driver->offline(policy) to inside of &policy->rwsem.
I made this change because they are inside of &policy->rwsem write lock
at cpufreq_offline. I think we should keep offline & exit call inside of
policy->rwsem for better symmetry.

static int cpufreq_offline(unsigned int cpu)
{
        ...
        down_write(&policy->rwsem);
    ...
        /*
         * Perform the ->offline() during light-weight tear-down, as
         * that allows fast recovery when the CPU comes back.
         */
        if (cpufreq_driver->offline) {
                cpufreq_driver->offline(policy);
        } else if (cpufreq_driver->exit) {
                cpufreq_driver->exit(policy);
                policy->freq_table = NULL;
        }

unlock:
        up_write(&policy->rwsem);
        return 0;
}

> The very first thing we need to do now is revert this patch. Lemme
> send a patch for that and you can send a fresh fix over that once you
> have a stable fix.

For the next version of the stable fix, I'd be willing to keep exit and
offline calls inside of policy->rwsem. But it's OK for me to keep offline
& exit calls outside of policy->rwsem.

---
BRs


Schspa Shi
Viresh Kumar May 10, 2022, 3:52 a.m. UTC | #3
On 09-05-22, 23:06, Schspa Shi wrote:
> I am very sorry for this oversight.

No issues, I am partly to blame for not reviewing it as well.

> To fix this issue, there is no need to move cpufreq_driver->exit(policy)
> and cpufreq_driver->offline(policy) to inside of &policy->rwsem.
> I made this change because they are inside of &policy->rwsem write lock
> at cpufreq_offline. I think we should keep offline & exit call inside of
> policy->rwsem for better symmetry.
> 
> static int cpufreq_offline(unsigned int cpu)
> {
>         ...
>         down_write(&policy->rwsem);
>     ...
>         /*
>          * Perform the ->offline() during light-weight tear-down, as
>          * that allows fast recovery when the CPU comes back.
>          */
>         if (cpufreq_driver->offline) {
>                 cpufreq_driver->offline(policy);
>         } else if (cpufreq_driver->exit) {
>                 cpufreq_driver->exit(policy);
>                 policy->freq_table = NULL;
>         }
> 
> unlock:
>         up_write(&policy->rwsem);
>         return 0;
> }
> 
> > The very first thing we need to do now is revert this patch. Lemme
> > send a patch for that and you can send a fresh fix over that once you
> > have a stable fix.
> 
> For the next version of the stable fix, I'd be willing to keep exit and
> offline calls inside of policy->rwsem. But it's OK for me to keep offline
> & exit calls outside of policy->rwsem.

Just send a patch with whatever you think is the right fix and lets
take it from there.
diff mbox series

Patch

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 80f535cc8a75..0d58b0f8f3af 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1533,8 +1533,6 @@  static int cpufreq_online(unsigned int cpu)
 	for_each_cpu(j, policy->real_cpus)
 		remove_cpu_dev_symlink(policy, get_cpu_device(j));
 
-	up_write(&policy->rwsem);
-
 out_offline_policy:
 	if (cpufreq_driver->offline)
 		cpufreq_driver->offline(policy);
@@ -1543,6 +1541,9 @@  static int cpufreq_online(unsigned int cpu)
 	if (cpufreq_driver->exit)
 		cpufreq_driver->exit(policy);
 
+	cpumask_clear(policy->cpus);
+	up_write(&policy->rwsem);
+
 out_free_policy:
 	cpufreq_policy_free(policy);
 	return ret;