Message ID | 20220627135347.32624-4-ldufour@linux.ibm.com |
---|---|
State | Superseded |
Headers | show |
Series | Extending NMI watchdog during LPM | expand |
Excerpts from Laurent Dufour's message of June 27, 2022 11:53 pm: > Introduce a factor which would apply to the NMI watchdog timeout. > > This factor is a percentage added to the watchdog_tresh value. The value is > set under the watchdog_mutex protection and lockup_detector_reconfigure() > is called to recompute wd_panic_timeout_tb. > > Once the factor is set, it remains until it is set back to 0, which means > no impact. Looks okay. We could worry about making it more generic or nicer if another user came along. Could you make the naming a bit more self documenting? watchdog_nmi_set_timeout_pct(), maybe? Does the wd really care that it is for LPM in particular? Variables and parameters could have a _pct suffix too. Otherwise Reviewed-by: Nicholas Piggin <npiggin@gmail.com> > > Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com> > --- > arch/powerpc/include/asm/nmi.h | 2 ++ > arch/powerpc/kernel/watchdog.c | 21 ++++++++++++++++++++- > 2 files changed, 22 insertions(+), 1 deletion(-) > > diff --git a/arch/powerpc/include/asm/nmi.h b/arch/powerpc/include/asm/nmi.h > index ea0e487f87b1..7d6a8d9b0543 100644 > --- a/arch/powerpc/include/asm/nmi.h > +++ b/arch/powerpc/include/asm/nmi.h > @@ -5,8 +5,10 @@ > #ifdef CONFIG_PPC_WATCHDOG > extern void arch_touch_nmi_watchdog(void); > long soft_nmi_interrupt(struct pt_regs *regs); > +void watchdog_nmi_set_lpm_factor(u64 factor); > #else > static inline void arch_touch_nmi_watchdog(void) {} > +static inline void watchdog_nmi_set_lpm_factor(u64 factor) {} > #endif > > #ifdef CONFIG_NMI_IPI > diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c > index 7d28b9553654..80851b228f71 100644 > --- a/arch/powerpc/kernel/watchdog.c > +++ b/arch/powerpc/kernel/watchdog.c > @@ -91,6 +91,10 @@ static cpumask_t wd_smp_cpus_pending; > static cpumask_t wd_smp_cpus_stuck; > static u64 wd_smp_last_reset_tb; > > +#ifdef CONFIG_PPC_PSERIES > +static u64 wd_factor; > +#endif > + > /* > * Try to take the exclusive watchdog action / NMI IPI / printing lock. > * wd_smp_lock must be held. If this fails, we should return and wait > @@ -527,7 +531,13 @@ static int stop_watchdog_on_cpu(unsigned int cpu) > > static void watchdog_calc_timeouts(void) > { > - wd_panic_timeout_tb = watchdog_thresh * ppc_tb_freq; > + u64 threshold = watchdog_thresh; > + > +#ifdef CONFIG_PPC_PSERIES > + threshold += (READ_ONCE(wd_factor) * threshold) / 100; > +#endif > + > + wd_panic_timeout_tb = threshold * ppc_tb_freq; > > /* Have the SMP detector trigger a bit later */ > wd_smp_panic_timeout_tb = wd_panic_timeout_tb * 3 / 2; > @@ -570,3 +580,12 @@ int __init watchdog_nmi_probe(void) > } > return 0; > } > + > +#ifdef CONFIG_PPC_PSERIES > +void watchdog_nmi_set_lpm_factor(u64 factor) > +{ > + pr_info("Set the NMI watchdog factor to %llu%%\n", factor); > + WRITE_ONCE(wd_factor, factor); > + lockup_detector_reconfigure(); > +} > +#endif > -- > 2.36.1 > >
Le 12/07/2022 à 03:42, Nicholas Piggin a écrit : > Excerpts from Laurent Dufour's message of June 27, 2022 11:53 pm: >> Introduce a factor which would apply to the NMI watchdog timeout. >> >> This factor is a percentage added to the watchdog_tresh value. The value is >> set under the watchdog_mutex protection and lockup_detector_reconfigure() >> is called to recompute wd_panic_timeout_tb. >> >> Once the factor is set, it remains until it is set back to 0, which means >> no impact. > > Looks okay. We could worry about making it more generic or nicer if > another user came along. > > Could you make the naming a bit more self documenting? > watchdog_nmi_set_timeout_pct(), maybe? Does the wd really care > that it is for LPM in particular? You're right, the name should not mention lpm. For my information, what does "pct" stand for? > > Variables and parameters could have a _pct suffix too. I'll do that. > > Otherwise > > Reviewed-by: Nicholas Piggin <npiggin@gmail.com> > >> >> Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com> >> --- >> arch/powerpc/include/asm/nmi.h | 2 ++ >> arch/powerpc/kernel/watchdog.c | 21 ++++++++++++++++++++- >> 2 files changed, 22 insertions(+), 1 deletion(-) >> >> diff --git a/arch/powerpc/include/asm/nmi.h b/arch/powerpc/include/asm/nmi.h >> index ea0e487f87b1..7d6a8d9b0543 100644 >> --- a/arch/powerpc/include/asm/nmi.h >> +++ b/arch/powerpc/include/asm/nmi.h >> @@ -5,8 +5,10 @@ >> #ifdef CONFIG_PPC_WATCHDOG >> extern void arch_touch_nmi_watchdog(void); >> long soft_nmi_interrupt(struct pt_regs *regs); >> +void watchdog_nmi_set_lpm_factor(u64 factor); >> #else >> static inline void arch_touch_nmi_watchdog(void) {} >> +static inline void watchdog_nmi_set_lpm_factor(u64 factor) {} >> #endif >> >> #ifdef CONFIG_NMI_IPI >> diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c >> index 7d28b9553654..80851b228f71 100644 >> --- a/arch/powerpc/kernel/watchdog.c >> +++ b/arch/powerpc/kernel/watchdog.c >> @@ -91,6 +91,10 @@ static cpumask_t wd_smp_cpus_pending; >> static cpumask_t wd_smp_cpus_stuck; >> static u64 wd_smp_last_reset_tb; >> >> +#ifdef CONFIG_PPC_PSERIES >> +static u64 wd_factor; >> +#endif >> + >> /* >> * Try to take the exclusive watchdog action / NMI IPI / printing lock. >> * wd_smp_lock must be held. If this fails, we should return and wait >> @@ -527,7 +531,13 @@ static int stop_watchdog_on_cpu(unsigned int cpu) >> >> static void watchdog_calc_timeouts(void) >> { >> - wd_panic_timeout_tb = watchdog_thresh * ppc_tb_freq; >> + u64 threshold = watchdog_thresh; >> + >> +#ifdef CONFIG_PPC_PSERIES >> + threshold += (READ_ONCE(wd_factor) * threshold) / 100; >> +#endif >> + >> + wd_panic_timeout_tb = threshold * ppc_tb_freq; >> >> /* Have the SMP detector trigger a bit later */ >> wd_smp_panic_timeout_tb = wd_panic_timeout_tb * 3 / 2; >> @@ -570,3 +580,12 @@ int __init watchdog_nmi_probe(void) >> } >> return 0; >> } >> + >> +#ifdef CONFIG_PPC_PSERIES >> +void watchdog_nmi_set_lpm_factor(u64 factor) >> +{ >> + pr_info("Set the NMI watchdog factor to %llu%%\n", factor); >> + WRITE_ONCE(wd_factor, factor); >> + lockup_detector_reconfigure(); >> +} >> +#endif >> -- >> 2.36.1 >> >>
diff --git a/arch/powerpc/include/asm/nmi.h b/arch/powerpc/include/asm/nmi.h index ea0e487f87b1..7d6a8d9b0543 100644 --- a/arch/powerpc/include/asm/nmi.h +++ b/arch/powerpc/include/asm/nmi.h @@ -5,8 +5,10 @@ #ifdef CONFIG_PPC_WATCHDOG extern void arch_touch_nmi_watchdog(void); long soft_nmi_interrupt(struct pt_regs *regs); +void watchdog_nmi_set_lpm_factor(u64 factor); #else static inline void arch_touch_nmi_watchdog(void) {} +static inline void watchdog_nmi_set_lpm_factor(u64 factor) {} #endif #ifdef CONFIG_NMI_IPI diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 7d28b9553654..80851b228f71 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -91,6 +91,10 @@ static cpumask_t wd_smp_cpus_pending; static cpumask_t wd_smp_cpus_stuck; static u64 wd_smp_last_reset_tb; +#ifdef CONFIG_PPC_PSERIES +static u64 wd_factor; +#endif + /* * Try to take the exclusive watchdog action / NMI IPI / printing lock. * wd_smp_lock must be held. If this fails, we should return and wait @@ -527,7 +531,13 @@ static int stop_watchdog_on_cpu(unsigned int cpu) static void watchdog_calc_timeouts(void) { - wd_panic_timeout_tb = watchdog_thresh * ppc_tb_freq; + u64 threshold = watchdog_thresh; + +#ifdef CONFIG_PPC_PSERIES + threshold += (READ_ONCE(wd_factor) * threshold) / 100; +#endif + + wd_panic_timeout_tb = threshold * ppc_tb_freq; /* Have the SMP detector trigger a bit later */ wd_smp_panic_timeout_tb = wd_panic_timeout_tb * 3 / 2; @@ -570,3 +580,12 @@ int __init watchdog_nmi_probe(void) } return 0; } + +#ifdef CONFIG_PPC_PSERIES +void watchdog_nmi_set_lpm_factor(u64 factor) +{ + pr_info("Set the NMI watchdog factor to %llu%%\n", factor); + WRITE_ONCE(wd_factor, factor); + lockup_detector_reconfigure(); +} +#endif
Introduce a factor which would apply to the NMI watchdog timeout. This factor is a percentage added to the watchdog_tresh value. The value is set under the watchdog_mutex protection and lockup_detector_reconfigure() is called to recompute wd_panic_timeout_tb. Once the factor is set, it remains until it is set back to 0, which means no impact. Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com> --- arch/powerpc/include/asm/nmi.h | 2 ++ arch/powerpc/kernel/watchdog.c | 21 ++++++++++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-)