diff mbox

[v6,3/3] ARM: Check if a CPU has gone offline

Message ID 1397762146-8337-4-git-send-email-ashwin.chaugule@linaro.org
State New
Headers show

Commit Message

Ashwin Chaugule April 17, 2014, 7:15 p.m. UTC
PSCIv0.2 adds a new function called AFFINITY_INFO, which
can be used to query if a specified CPU has actually gone
offline. Calling this function via cpu_kill ensures that
a CPU has quiesced after a call to cpu_die.

Signed-off-by: Ashwin Chaugule <ashwin.chaugule@linaro.org>
Reviewed-by: Rob Herring <robh@kernel.org>
---
 arch/arm/kernel/psci_smp.c | 21 +++++++++++++++++++++
 include/uapi/linux/psci.h  |  5 +++++
 2 files changed, 26 insertions(+)

Comments

Mark Rutland April 17, 2014, 7:50 p.m. UTC | #1
On Thu, Apr 17, 2014 at 08:15:46PM +0100, Ashwin Chaugule wrote:
> PSCIv0.2 adds a new function called AFFINITY_INFO, which
> can be used to query if a specified CPU has actually gone
> offline. Calling this function via cpu_kill ensures that
> a CPU has quiesced after a call to cpu_die.
> 
> Signed-off-by: Ashwin Chaugule <ashwin.chaugule@linaro.org>
> Reviewed-by: Rob Herring <robh@kernel.org>
> ---
>  arch/arm/kernel/psci_smp.c | 21 +++++++++++++++++++++
>  include/uapi/linux/psci.h  |  5 +++++
>  2 files changed, 26 insertions(+)
> 
> diff --git a/arch/arm/kernel/psci_smp.c b/arch/arm/kernel/psci_smp.c
> index 570a48c..c6f1420 100644
> --- a/arch/arm/kernel/psci_smp.c
> +++ b/arch/arm/kernel/psci_smp.c
> @@ -16,6 +16,7 @@
>  #include <linux/init.h>
>  #include <linux/smp.h>
>  #include <linux/of.h>
> +#include <uapi/linux/psci.h>
>  
>  #include <asm/psci.h>
>  #include <asm/smp_plat.h>
> @@ -66,6 +67,25 @@ void __ref psci_cpu_die(unsigned int cpu)
>         /* We should never return */
>         panic("psci: cpu %d failed to shutdown\n", cpu);
>  }
> +
> +int __ref psci_cpu_kill(unsigned int cpu)
> +{
> +	int err;
> +
> +	if (!psci_ops.affinity_info)
> +		return 1;
> +
> +	err = psci_ops.affinity_info(cpu_logical_map(cpu), 0);
> +
> +	if (err != PSCI_AFFINITY_INFO_RET_OFF) {
> +		pr_err("psci: Cannot kill CPU:%d, psci ret val: %d\n",
> +				cpu, err);
> +		/* Make platform_cpu_kill() fail. */
> +		return 0;
> +	}

We can race with the dying CPU here -- if we call AFFINITY_INFO before
the dying cpu is sufficiently far through its CPU_OFF call it won't
register as OFF. 

Could we poll here instead (with a reasonable limit on the number of
iterations)? That would enable us to not spuriously declare a CPU to be
dead when it happened to take slightly longer than we expect to turn
off.

Otherwise, this looks good. Thanks for implementing this :)

Cheers,
Mark.
Ashwin Chaugule April 18, 2014, 3:21 p.m. UTC | #2
Hi Mark,


On 17 April 2014 15:50, Mark Rutland <mark.rutland@arm.com> wrote:
> On Thu, Apr 17, 2014 at 08:15:46PM +0100, Ashwin Chaugule wrote:
>> PSCIv0.2 adds a new function called AFFINITY_INFO, which
>> can be used to query if a specified CPU has actually gone
>> offline. Calling this function via cpu_kill ensures that
>> a CPU has quiesced after a call to cpu_die.
>>
>> Signed-off-by: Ashwin Chaugule <ashwin.chaugule@linaro.org>
>> Reviewed-by: Rob Herring <robh@kernel.org>
>> ---
>>  arch/arm/kernel/psci_smp.c | 21 +++++++++++++++++++++
>>  include/uapi/linux/psci.h  |  5 +++++
>>  2 files changed, 26 insertions(+)
>>
>> diff --git a/arch/arm/kernel/psci_smp.c b/arch/arm/kernel/psci_smp.c
>> index 570a48c..c6f1420 100644
>> --- a/arch/arm/kernel/psci_smp.c
>> +++ b/arch/arm/kernel/psci_smp.c
>> @@ -16,6 +16,7 @@
>>  #include <linux/init.h>
>>  #include <linux/smp.h>
>>  #include <linux/of.h>
>> +#include <uapi/linux/psci.h>
>>
>>  #include <asm/psci.h>
>>  #include <asm/smp_plat.h>
>> @@ -66,6 +67,25 @@ void __ref psci_cpu_die(unsigned int cpu)
>>         /* We should never return */
>>         panic("psci: cpu %d failed to shutdown\n", cpu);
>>  }
>> +
>> +int __ref psci_cpu_kill(unsigned int cpu)
>> +{
>> +     int err;
>> +
>> +     if (!psci_ops.affinity_info)
>> +             return 1;
>> +
>> +     err = psci_ops.affinity_info(cpu_logical_map(cpu), 0);
>> +
>> +     if (err != PSCI_AFFINITY_INFO_RET_OFF) {
>> +             pr_err("psci: Cannot kill CPU:%d, psci ret val: %d\n",
>> +                             cpu, err);
>> +             /* Make platform_cpu_kill() fail. */
>> +             return 0;
>> +     }
>
> We can race with the dying CPU here -- if we call AFFINITY_INFO before
> the dying cpu is sufficiently far through its CPU_OFF call it won't
> register as OFF.
>
> Could we poll here instead (with a reasonable limit on the number of
> iterations)? That would enable us to not spuriously declare a CPU to be
> dead when it happened to take slightly longer than we expect to turn
> off.

True. How about something like this?

 int __ref psci_cpu_kill(unsigned int cpu)
 {
-       int err;
+       int err, retries;

        if (!psci_ops.affinity_info)
                return 1;
-
+       /*
+        * cpu_kill could race with cpu_die and we can
+        * potentially end up declaring this cpu undead
+        * while it is dying. So retry a couple of times.
+        */
+retry:
        err = psci_ops.affinity_info(cpu_logical_map(cpu), 0);

        if (err != PSCI_AFFINITY_INFO_RET_OFF) {
+               if (++retries < 3) {
+                       pr_info("Retrying check for CPU kill: %d\n", retries);
+                       goto retry;
+               }
                pr_err("psci: Cannot kill CPU:%d, psci ret val: %d\n",
                                cpu, err);
                /* Make platform_cpu_kill() fail. */



Cheers,
Ashwin
Ashwin Chaugule April 24, 2014, 2:33 p.m. UTC | #3
On 18 April 2014 11:21, Ashwin Chaugule <ashwin.chaugule@linaro.org> wrote:
> Hi Mark,
>
>
> On 17 April 2014 15:50, Mark Rutland <mark.rutland@arm.com> wrote:
>> On Thu, Apr 17, 2014 at 08:15:46PM +0100, Ashwin Chaugule wrote:
>>> PSCIv0.2 adds a new function called AFFINITY_INFO, which
>>> can be used to query if a specified CPU has actually gone
>>> offline. Calling this function via cpu_kill ensures that
>>> a CPU has quiesced after a call to cpu_die.
>>>
>>> Signed-off-by: Ashwin Chaugule <ashwin.chaugule@linaro.org>
>>> Reviewed-by: Rob Herring <robh@kernel.org>
>>> ---
>>>  arch/arm/kernel/psci_smp.c | 21 +++++++++++++++++++++
>>>  include/uapi/linux/psci.h  |  5 +++++
>>>  2 files changed, 26 insertions(+)
>>>
>>> diff --git a/arch/arm/kernel/psci_smp.c b/arch/arm/kernel/psci_smp.c
>>> index 570a48c..c6f1420 100644
>>> --- a/arch/arm/kernel/psci_smp.c
>>> +++ b/arch/arm/kernel/psci_smp.c
>>> @@ -16,6 +16,7 @@
>>>  #include <linux/init.h>
>>>  #include <linux/smp.h>
>>>  #include <linux/of.h>
>>> +#include <uapi/linux/psci.h>
>>>
>>>  #include <asm/psci.h>
>>>  #include <asm/smp_plat.h>
>>> @@ -66,6 +67,25 @@ void __ref psci_cpu_die(unsigned int cpu)
>>>         /* We should never return */
>>>         panic("psci: cpu %d failed to shutdown\n", cpu);
>>>  }
>>> +
>>> +int __ref psci_cpu_kill(unsigned int cpu)
>>> +{
>>> +     int err;
>>> +
>>> +     if (!psci_ops.affinity_info)
>>> +             return 1;
>>> +
>>> +     err = psci_ops.affinity_info(cpu_logical_map(cpu), 0);
>>> +
>>> +     if (err != PSCI_AFFINITY_INFO_RET_OFF) {
>>> +             pr_err("psci: Cannot kill CPU:%d, psci ret val: %d\n",
>>> +                             cpu, err);
>>> +             /* Make platform_cpu_kill() fail. */
>>> +             return 0;
>>> +     }
>>
>> We can race with the dying CPU here -- if we call AFFINITY_INFO before
>> the dying cpu is sufficiently far through its CPU_OFF call it won't
>> register as OFF.
>>
>> Could we poll here instead (with a reasonable limit on the number of
>> iterations)? That would enable us to not spuriously declare a CPU to be
>> dead when it happened to take slightly longer than we expect to turn
>> off.
>
> True. How about something like this?
>
>  int __ref psci_cpu_kill(unsigned int cpu)
>  {
> -       int err;
> +       int err, retries;
>
>         if (!psci_ops.affinity_info)
>                 return 1;
> -
> +       /*
> +        * cpu_kill could race with cpu_die and we can
> +        * potentially end up declaring this cpu undead
> +        * while it is dying. So retry a couple of times.
> +        */
> +retry:
>         err = psci_ops.affinity_info(cpu_logical_map(cpu), 0);
>
>         if (err != PSCI_AFFINITY_INFO_RET_OFF) {
> +               if (++retries < 3) {
> +                       pr_info("Retrying check for CPU kill: %d\n", retries);
> +                       goto retry;
> +               }
>                 pr_err("psci: Cannot kill CPU:%d, psci ret val: %d\n",
>                                 cpu, err);
>                 /* Make platform_cpu_kill() fail. */
>
>
>


Hi Rob, I've already got your Reviewed-by on this patch without this
"retry" thing. Are you okay with this as well? I can then roll it up
in one patch.

Cheers,
Ashwin
Rob Herring April 24, 2014, 3:11 p.m. UTC | #4
On Thu, Apr 24, 2014 at 9:33 AM, Ashwin Chaugule
<ashwin.chaugule@linaro.org> wrote:
> On 18 April 2014 11:21, Ashwin Chaugule <ashwin.chaugule@linaro.org> wrote:
>> Hi Mark,
>>
>>
>> On 17 April 2014 15:50, Mark Rutland <mark.rutland@arm.com> wrote:
>>> On Thu, Apr 17, 2014 at 08:15:46PM +0100, Ashwin Chaugule wrote:
>>>> PSCIv0.2 adds a new function called AFFINITY_INFO, which
>>>> can be used to query if a specified CPU has actually gone
>>>> offline. Calling this function via cpu_kill ensures that
>>>> a CPU has quiesced after a call to cpu_die.

[...]

>>> We can race with the dying CPU here -- if we call AFFINITY_INFO before
>>> the dying cpu is sufficiently far through its CPU_OFF call it won't
>>> register as OFF.
>>>
>>> Could we poll here instead (with a reasonable limit on the number of
>>> iterations)? That would enable us to not spuriously declare a CPU to be
>>> dead when it happened to take slightly longer than we expect to turn
>>> off.
>>
>> True. How about something like this?
>>
>>  int __ref psci_cpu_kill(unsigned int cpu)
>>  {
>> -       int err;
>> +       int err, retries;
>>
>>         if (!psci_ops.affinity_info)
>>                 return 1;
>> -
>> +       /*
>> +        * cpu_kill could race with cpu_die and we can
>> +        * potentially end up declaring this cpu undead
>> +        * while it is dying. So retry a couple of times.
>> +        */
>> +retry:
>>         err = psci_ops.affinity_info(cpu_logical_map(cpu), 0);
>>
>>         if (err != PSCI_AFFINITY_INFO_RET_OFF) {
>> +               if (++retries < 3) {
>> +                       pr_info("Retrying check for CPU kill: %d\n", retries);
>> +                       goto retry;
>> +               }
>>                 pr_err("psci: Cannot kill CPU:%d, psci ret val: %d\n",
>>                                 cpu, err);
>>                 /* Make platform_cpu_kill() fail. */
>>
>>
>>
>
>
> Hi Rob, I've already got your Reviewed-by on this patch without this
> "retry" thing. Are you okay with this as well? I can then roll it up
> in one patch.

Yes. My only comment is I would perhaps add a sleep (or delay if this
context cannot sleep) on the retry. I'm not sure what I reasonable
time would be, but at least then you are waiting a defined amount of
time versus how long it takes this code to execute.

Rob
diff mbox

Patch

diff --git a/arch/arm/kernel/psci_smp.c b/arch/arm/kernel/psci_smp.c
index 570a48c..c6f1420 100644
--- a/arch/arm/kernel/psci_smp.c
+++ b/arch/arm/kernel/psci_smp.c
@@ -16,6 +16,7 @@ 
 #include <linux/init.h>
 #include <linux/smp.h>
 #include <linux/of.h>
+#include <uapi/linux/psci.h>
 
 #include <asm/psci.h>
 #include <asm/smp_plat.h>
@@ -66,6 +67,25 @@  void __ref psci_cpu_die(unsigned int cpu)
        /* We should never return */
        panic("psci: cpu %d failed to shutdown\n", cpu);
 }
+
+int __ref psci_cpu_kill(unsigned int cpu)
+{
+	int err;
+
+	if (!psci_ops.affinity_info)
+		return 1;
+
+	err = psci_ops.affinity_info(cpu_logical_map(cpu), 0);
+
+	if (err != PSCI_AFFINITY_INFO_RET_OFF) {
+		pr_err("psci: Cannot kill CPU:%d, psci ret val: %d\n",
+				cpu, err);
+		/* Make platform_cpu_kill() fail. */
+		return 0;
+	}
+	return 1;
+}
+
 #endif
 
 bool __init psci_smp_available(void)
@@ -78,5 +98,6 @@  struct smp_operations __initdata psci_smp_ops = {
 	.smp_boot_secondary	= psci_boot_secondary,
 #ifdef CONFIG_HOTPLUG_CPU
 	.cpu_die		= psci_cpu_die,
+	.cpu_kill		= psci_cpu_kill,
 #endif
 };
diff --git a/include/uapi/linux/psci.h b/include/uapi/linux/psci.h
index a4136c3..857209b 100644
--- a/include/uapi/linux/psci.h
+++ b/include/uapi/linux/psci.h
@@ -74,4 +74,9 @@ 
 #define PSCI_RET_NOT_PRESENT			-7
 #define PSCI_RET_DISABLED			-8
 
+/* Return values from the PSCI_ID_AFFINITY_INFO Function. */
+#define PSCI_AFFINITY_INFO_RET_ON			0
+#define PSCI_AFFINITY_INFO_RET_OFF			1
+#define PSCI_AFFINITY_INFO_RET_ON_PENDING	2
+
 #endif /* _UAPI_LINUX_PSCI_H */