diff mbox series

[v3,3/4] powerpc/watchdog: introduce a NMI watchdog's factor

Message ID 20220627135347.32624-4-ldufour@linux.ibm.com
State Superseded
Headers show
Series Extending NMI watchdog during LPM | expand

Commit Message

Laurent Dufour June 27, 2022, 1:53 p.m. UTC
Introduce a factor which would apply to the NMI watchdog timeout.

This factor is a percentage added to the watchdog_tresh value. The value is
set under the watchdog_mutex protection and lockup_detector_reconfigure()
is called to recompute wd_panic_timeout_tb.

Once the factor is set, it remains until it is set back to 0, which means
no impact.

Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
---
 arch/powerpc/include/asm/nmi.h |  2 ++
 arch/powerpc/kernel/watchdog.c | 21 ++++++++++++++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

Comments

Nicholas Piggin July 12, 2022, 1:42 a.m. UTC | #1
Excerpts from Laurent Dufour's message of June 27, 2022 11:53 pm:
> Introduce a factor which would apply to the NMI watchdog timeout.
> 
> This factor is a percentage added to the watchdog_tresh value. The value is
> set under the watchdog_mutex protection and lockup_detector_reconfigure()
> is called to recompute wd_panic_timeout_tb.
> 
> Once the factor is set, it remains until it is set back to 0, which means
> no impact.

Looks okay. We could worry about making it more generic or nicer if
another user came along.

Could you make the naming a bit more self documenting? 
watchdog_nmi_set_timeout_pct(), maybe? Does the wd really care
that it is for LPM in particular?

Variables and parameters could have a _pct suffix too.

Otherwise

Reviewed-by: Nicholas Piggin <npiggin@gmail.com>

> 
> Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
> ---
>  arch/powerpc/include/asm/nmi.h |  2 ++
>  arch/powerpc/kernel/watchdog.c | 21 ++++++++++++++++++++-
>  2 files changed, 22 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/include/asm/nmi.h b/arch/powerpc/include/asm/nmi.h
> index ea0e487f87b1..7d6a8d9b0543 100644
> --- a/arch/powerpc/include/asm/nmi.h
> +++ b/arch/powerpc/include/asm/nmi.h
> @@ -5,8 +5,10 @@
>  #ifdef CONFIG_PPC_WATCHDOG
>  extern void arch_touch_nmi_watchdog(void);
>  long soft_nmi_interrupt(struct pt_regs *regs);
> +void watchdog_nmi_set_lpm_factor(u64 factor);
>  #else
>  static inline void arch_touch_nmi_watchdog(void) {}
> +static inline void watchdog_nmi_set_lpm_factor(u64 factor) {}
>  #endif
>  
>  #ifdef CONFIG_NMI_IPI
> diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
> index 7d28b9553654..80851b228f71 100644
> --- a/arch/powerpc/kernel/watchdog.c
> +++ b/arch/powerpc/kernel/watchdog.c
> @@ -91,6 +91,10 @@ static cpumask_t wd_smp_cpus_pending;
>  static cpumask_t wd_smp_cpus_stuck;
>  static u64 wd_smp_last_reset_tb;
>  
> +#ifdef CONFIG_PPC_PSERIES
> +static u64 wd_factor;
> +#endif
> +
>  /*
>   * Try to take the exclusive watchdog action / NMI IPI / printing lock.
>   * wd_smp_lock must be held. If this fails, we should return and wait
> @@ -527,7 +531,13 @@ static int stop_watchdog_on_cpu(unsigned int cpu)
>  
>  static void watchdog_calc_timeouts(void)
>  {
> -	wd_panic_timeout_tb = watchdog_thresh * ppc_tb_freq;
> +	u64 threshold = watchdog_thresh;
> +
> +#ifdef CONFIG_PPC_PSERIES
> +	threshold += (READ_ONCE(wd_factor) * threshold) / 100;
> +#endif
> +
> +	wd_panic_timeout_tb = threshold * ppc_tb_freq;
>  
>  	/* Have the SMP detector trigger a bit later */
>  	wd_smp_panic_timeout_tb = wd_panic_timeout_tb * 3 / 2;
> @@ -570,3 +580,12 @@ int __init watchdog_nmi_probe(void)
>  	}
>  	return 0;
>  }
> +
> +#ifdef CONFIG_PPC_PSERIES
> +void watchdog_nmi_set_lpm_factor(u64 factor)
> +{
> +	pr_info("Set the NMI watchdog factor to %llu%%\n", factor);
> +	WRITE_ONCE(wd_factor, factor);
> +	lockup_detector_reconfigure();
> +}
> +#endif
> -- 
> 2.36.1
> 
>
Laurent Dufour July 12, 2022, 9:51 a.m. UTC | #2
Le 12/07/2022 à 03:42, Nicholas Piggin a écrit :
> Excerpts from Laurent Dufour's message of June 27, 2022 11:53 pm:
>> Introduce a factor which would apply to the NMI watchdog timeout.
>>
>> This factor is a percentage added to the watchdog_tresh value. The value is
>> set under the watchdog_mutex protection and lockup_detector_reconfigure()
>> is called to recompute wd_panic_timeout_tb.
>>
>> Once the factor is set, it remains until it is set back to 0, which means
>> no impact.
> 
> Looks okay. We could worry about making it more generic or nicer if
> another user came along.
> 
> Could you make the naming a bit more self documenting? 
> watchdog_nmi_set_timeout_pct(), maybe? Does the wd really care
> that it is for LPM in particular?

You're right, the name should not mention lpm.
For my information, what does "pct" stand for?

> 
> Variables and parameters could have a _pct suffix too.

I'll do that.

> 
> Otherwise
> 
> Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
> 
>>
>> Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
>> ---
>>  arch/powerpc/include/asm/nmi.h |  2 ++
>>  arch/powerpc/kernel/watchdog.c | 21 ++++++++++++++++++++-
>>  2 files changed, 22 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/powerpc/include/asm/nmi.h b/arch/powerpc/include/asm/nmi.h
>> index ea0e487f87b1..7d6a8d9b0543 100644
>> --- a/arch/powerpc/include/asm/nmi.h
>> +++ b/arch/powerpc/include/asm/nmi.h
>> @@ -5,8 +5,10 @@
>>  #ifdef CONFIG_PPC_WATCHDOG
>>  extern void arch_touch_nmi_watchdog(void);
>>  long soft_nmi_interrupt(struct pt_regs *regs);
>> +void watchdog_nmi_set_lpm_factor(u64 factor);
>>  #else
>>  static inline void arch_touch_nmi_watchdog(void) {}
>> +static inline void watchdog_nmi_set_lpm_factor(u64 factor) {}
>>  #endif
>>  
>>  #ifdef CONFIG_NMI_IPI
>> diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
>> index 7d28b9553654..80851b228f71 100644
>> --- a/arch/powerpc/kernel/watchdog.c
>> +++ b/arch/powerpc/kernel/watchdog.c
>> @@ -91,6 +91,10 @@ static cpumask_t wd_smp_cpus_pending;
>>  static cpumask_t wd_smp_cpus_stuck;
>>  static u64 wd_smp_last_reset_tb;
>>  
>> +#ifdef CONFIG_PPC_PSERIES
>> +static u64 wd_factor;
>> +#endif
>> +
>>  /*
>>   * Try to take the exclusive watchdog action / NMI IPI / printing lock.
>>   * wd_smp_lock must be held. If this fails, we should return and wait
>> @@ -527,7 +531,13 @@ static int stop_watchdog_on_cpu(unsigned int cpu)
>>  
>>  static void watchdog_calc_timeouts(void)
>>  {
>> -	wd_panic_timeout_tb = watchdog_thresh * ppc_tb_freq;
>> +	u64 threshold = watchdog_thresh;
>> +
>> +#ifdef CONFIG_PPC_PSERIES
>> +	threshold += (READ_ONCE(wd_factor) * threshold) / 100;
>> +#endif
>> +
>> +	wd_panic_timeout_tb = threshold * ppc_tb_freq;
>>  
>>  	/* Have the SMP detector trigger a bit later */
>>  	wd_smp_panic_timeout_tb = wd_panic_timeout_tb * 3 / 2;
>> @@ -570,3 +580,12 @@ int __init watchdog_nmi_probe(void)
>>  	}
>>  	return 0;
>>  }
>> +
>> +#ifdef CONFIG_PPC_PSERIES
>> +void watchdog_nmi_set_lpm_factor(u64 factor)
>> +{
>> +	pr_info("Set the NMI watchdog factor to %llu%%\n", factor);
>> +	WRITE_ONCE(wd_factor, factor);
>> +	lockup_detector_reconfigure();
>> +}
>> +#endif
>> -- 
>> 2.36.1
>>
>>
diff mbox series

Patch

diff --git a/arch/powerpc/include/asm/nmi.h b/arch/powerpc/include/asm/nmi.h
index ea0e487f87b1..7d6a8d9b0543 100644
--- a/arch/powerpc/include/asm/nmi.h
+++ b/arch/powerpc/include/asm/nmi.h
@@ -5,8 +5,10 @@ 
 #ifdef CONFIG_PPC_WATCHDOG
 extern void arch_touch_nmi_watchdog(void);
 long soft_nmi_interrupt(struct pt_regs *regs);
+void watchdog_nmi_set_lpm_factor(u64 factor);
 #else
 static inline void arch_touch_nmi_watchdog(void) {}
+static inline void watchdog_nmi_set_lpm_factor(u64 factor) {}
 #endif
 
 #ifdef CONFIG_NMI_IPI
diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index 7d28b9553654..80851b228f71 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -91,6 +91,10 @@  static cpumask_t wd_smp_cpus_pending;
 static cpumask_t wd_smp_cpus_stuck;
 static u64 wd_smp_last_reset_tb;
 
+#ifdef CONFIG_PPC_PSERIES
+static u64 wd_factor;
+#endif
+
 /*
  * Try to take the exclusive watchdog action / NMI IPI / printing lock.
  * wd_smp_lock must be held. If this fails, we should return and wait
@@ -527,7 +531,13 @@  static int stop_watchdog_on_cpu(unsigned int cpu)
 
 static void watchdog_calc_timeouts(void)
 {
-	wd_panic_timeout_tb = watchdog_thresh * ppc_tb_freq;
+	u64 threshold = watchdog_thresh;
+
+#ifdef CONFIG_PPC_PSERIES
+	threshold += (READ_ONCE(wd_factor) * threshold) / 100;
+#endif
+
+	wd_panic_timeout_tb = threshold * ppc_tb_freq;
 
 	/* Have the SMP detector trigger a bit later */
 	wd_smp_panic_timeout_tb = wd_panic_timeout_tb * 3 / 2;
@@ -570,3 +580,12 @@  int __init watchdog_nmi_probe(void)
 	}
 	return 0;
 }
+
+#ifdef CONFIG_PPC_PSERIES
+void watchdog_nmi_set_lpm_factor(u64 factor)
+{
+	pr_info("Set the NMI watchdog factor to %llu%%\n", factor);
+	WRITE_ONCE(wd_factor, factor);
+	lockup_detector_reconfigure();
+}
+#endif