diff mbox series

[v2] watchdog/mm: Allow dumping memory info in pretimeout

Message ID 20230608-pretimeout-oom-v2-1-581f0ad0e4f3@axis.com
State New
Headers show
Series [v2] watchdog/mm: Allow dumping memory info in pretimeout | expand

Commit Message

Vincent Whitchurch June 12, 2023, 7:26 a.m. UTC
On my (embedded) systems, the most common cause of hitting the watchdog
(pre)timeout is due to thrashing.  Diagnosing these problems is hard
without knowing the memory state at the point of the watchdog hit.  In
order to make this information available, add a module parameter to the
watchdog pretimeout panic governor to ask it to dump memory info and the
OOM task list (using a new helper in the OOM code) before triggering the
panic.

Signed-off-by: Vincent Whitchurch <vincent.whitchurch@axis.com>
---
Changes in v2:
- Add missing static to fix warning reported by kernel test robot.
- Export __show_mem to fix error reported by kernel test robot.
- Link to v1: https://lore.kernel.org/r/20230608-pretimeout-oom-v1-1-542cc91062d7@axis.com
---
 drivers/watchdog/pretimeout_panic.c | 15 +++++++++++
 include/linux/oom.h                 |  5 ++++
 include/linux/sched/task.h          |  5 ++++
 lib/show_mem.c                      |  1 +
 mm/oom_kill.c                       | 54 ++++++++++++++++++++++++++++++++++++-
 5 files changed, 79 insertions(+), 1 deletion(-)


---
base-commit: 9561de3a55bed6bdd44a12820ba81ec416e705a7
change-id: 20230608-pretimeout-oom-99148438a1df

Best regards,

Comments

Guenter Roeck June 12, 2023, 2:53 p.m. UTC | #1
On 6/12/23 00:26, Vincent Whitchurch wrote:
> On my (embedded) systems, the most common cause of hitting the watchdog
> (pre)timeout is due to thrashing.  Diagnosing these problems is hard
> without knowing the memory state at the point of the watchdog hit.  In
> order to make this information available, add a module parameter to the
> watchdog pretimeout panic governor to ask it to dump memory info and the
> OOM task list (using a new helper in the OOM code) before triggering the
> panic.
> 

Personally I don't think this is the right way of approaching this problem.
First, the userspace task controlling the watchdog should run as realtime
task, forced to be in memory, and not be affected by thrashing.
Second, the problem should be observable well before the watchdog fires.
Last but not least, I don't think it is appropriate to intertwine watchdog
code with oom handling code as suggested here.

Guenter

> Signed-off-by: Vincent Whitchurch <vincent.whitchurch@axis.com>
> ---
> Changes in v2:
> - Add missing static to fix warning reported by kernel test robot.
> - Export __show_mem to fix error reported by kernel test robot.
> - Link to v1: https://lore.kernel.org/r/20230608-pretimeout-oom-v1-1-542cc91062d7@axis.com
> ---
>   drivers/watchdog/pretimeout_panic.c | 15 +++++++++++
>   include/linux/oom.h                 |  5 ++++
>   include/linux/sched/task.h          |  5 ++++
>   lib/show_mem.c                      |  1 +
>   mm/oom_kill.c                       | 54 ++++++++++++++++++++++++++++++++++++-
>   5 files changed, 79 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/watchdog/pretimeout_panic.c b/drivers/watchdog/pretimeout_panic.c
> index 2cc3c41d2be5b..52d686fa541c7 100644
> --- a/drivers/watchdog/pretimeout_panic.c
> +++ b/drivers/watchdog/pretimeout_panic.c
> @@ -5,10 +5,15 @@
>   
>   #include <linux/kernel.h>
>   #include <linux/module.h>
> +#include <linux/mm.h>
> +#include <linux/oom.h>
>   #include <linux/watchdog.h>
>   
>   #include "watchdog_pretimeout.h"
>   
> +static unsigned long dump_min_rss_bytes;
> +module_param(dump_min_rss_bytes, ulong, 0644);
> +
>   /**
>    * pretimeout_panic - Panic on watchdog pretimeout event
>    * @wdd - watchdog_device
> @@ -17,6 +22,16 @@
>    */
>   static void pretimeout_panic(struct watchdog_device *wdd)
>   {
> +	/*
> +	 * Since the root cause is not certain to be low memory, only print
> +	 * tasks with RSS above a configurable limit, to avoid losing
> +	 * potentially more important messages from the log.
> +	 */
> +	if (dump_min_rss_bytes) {
> +		show_mem(SHOW_MEM_FILTER_NODES, NULL);
> +		oom_dump_tasks(DIV_ROUND_UP(dump_min_rss_bytes, PAGE_SIZE));
> +	}
> +
>   	panic("watchdog pretimeout event\n");
>   }
>   
> diff --git a/include/linux/oom.h b/include/linux/oom.h
> index 7d0c9c48a0c54..1451fe2c38d78 100644
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -52,6 +52,9 @@ struct oom_control {
>   
>   	/* Used to print the constraint info. */
>   	enum oom_constraint constraint;
> +
> +	bool dump_trylock;
> +	unsigned long dump_min_rss_pages;
>   };
>   
>   extern struct mutex oom_lock;
> @@ -102,6 +105,8 @@ long oom_badness(struct task_struct *p,
>   
>   extern bool out_of_memory(struct oom_control *oc);
>   
> +extern void oom_dump_tasks(unsigned long min_rss_pages);
> +
>   extern void exit_oom_victim(void);
>   
>   extern int register_oom_notifier(struct notifier_block *nb);
> diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
> index e0f5ac90a228b..e8a68b2a3e829 100644
> --- a/include/linux/sched/task.h
> +++ b/include/linux/sched/task.h
> @@ -183,6 +183,11 @@ static inline void task_lock(struct task_struct *p)
>   	spin_lock(&p->alloc_lock);
>   }
>   
> +static inline int task_trylock(struct task_struct *p)
> +{
> +	return spin_trylock(&p->alloc_lock);
> +}
> +
>   static inline void task_unlock(struct task_struct *p)
>   {
>   	spin_unlock(&p->alloc_lock);
> diff --git a/lib/show_mem.c b/lib/show_mem.c
> index 1485c87be9354..cf90d1c5182b7 100644
> --- a/lib/show_mem.c
> +++ b/lib/show_mem.c
> @@ -35,3 +35,4 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
>   	printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
>   #endif
>   }
> +EXPORT_SYMBOL_GPL(__show_mem);
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index 044e1eed720ee..0fad1c6d3c90c 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -149,6 +149,30 @@ struct task_struct *find_lock_task_mm(struct task_struct *p)
>   	return t;
>   }
>   
> +/*
> + * Identical to the above, except that we avoid tasks which we can't lock, to
> + * avoid deadlocks when called from an interrupt handler.
> + */
> +static struct task_struct *find_trylock_task_mm(struct task_struct *p)
> +{
> +	struct task_struct *t;
> +
> +	rcu_read_lock();
> +
> +	for_each_thread(p, t) {
> +		if (!task_trylock(t))
> +			continue;
> +		if (likely(t->mm))
> +			goto found;
> +		task_unlock(t);
> +	}
> +	t = NULL;
> +found:
> +	rcu_read_unlock();
> +
> +	return t;
> +}
> +
>   /*
>    * order == -1 means the oom kill is required by sysrq, otherwise only
>    * for display purposes.
> @@ -390,15 +414,26 @@ static int dump_task(struct task_struct *p, void *arg)
>   	if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
>   		return 0;
>   
> -	task = find_lock_task_mm(p);
> +	task = oc->dump_trylock ? find_trylock_task_mm(p) :
> +				  find_lock_task_mm(p);
>   	if (!task) {
>   		/*
>   		 * All of p's threads have already detached their mm's. There's
>   		 * no need to report them; they can't be oom killed anyway.
> +		 *
> +		 * Or we got here from an interrupt and the task lock is
> +		 * locked, in which case we're forced to ignore this task to
> +		 * avoid deadlocks.
>   		 */
>   		return 0;
>   	}
>   
> +	if (oc->dump_min_rss_pages &&
> +	    get_mm_rss(task->mm) < oc->dump_min_rss_pages) {
> +		task_unlock(task);
> +		return 0;
> +	}
> +
>   	pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
>   		task->pid, from_kuid(&init_user_ns, task_uid(task)),
>   		task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
> @@ -437,6 +472,23 @@ static void dump_tasks(struct oom_control *oc)
>   	}
>   }
>   
> +void oom_dump_tasks(unsigned long min_rss_pages)
> +{
> +	const gfp_t gfp_mask = GFP_KERNEL;
> +	struct oom_control oc = {
> +		.zonelist = node_zonelist(first_memory_node, gfp_mask),
> +		.nodemask = NULL,
> +		.memcg = NULL,
> +		.gfp_mask = gfp_mask,
> +		.order = -1,
> +		.dump_min_rss_pages = min_rss_pages,
> +		.dump_trylock = in_interrupt(),
> +	};
> +
> +	dump_tasks(&oc);
> +}
> +EXPORT_SYMBOL_GPL(oom_dump_tasks);
> +
>   static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
>   {
>   	/* one line summary of the oom killer context. */
> 
> ---
> base-commit: 9561de3a55bed6bdd44a12820ba81ec416e705a7
> change-id: 20230608-pretimeout-oom-99148438a1df
> 
> Best regards,
Vincent Whitchurch June 14, 2023, 7:42 a.m. UTC | #2
On Mon, 2023-06-12 at 07:53 -0700, Guenter Roeck wrote:
> On 6/12/23 00:26, Vincent Whitchurch wrote:
> > On my (embedded) systems, the most common cause of hitting the watchdog
> > (pre)timeout is due to thrashing.  Diagnosing these problems is hard
> > without knowing the memory state at the point of the watchdog hit.  In
> > order to make this information available, add a module parameter to the
> > watchdog pretimeout panic governor to ask it to dump memory info and the
> > OOM task list (using a new helper in the OOM code) before triggering the
> > panic.
> 
> Personally I don't think this is the right way of approaching this problem.
> First, the userspace task controlling the watchdog should run as realtime
> task, forced to be in memory, and not be affected by thrashing.

That may not be appropriate in all cases since you may want the watchdog
to hit when the system as a whole really is unusable.

> Second, the problem should be observable well before the watchdog fires.

Yes, there are ways to try to detect it earlier (e.g. PSI) and attempt
recovery, even if the kernel's OOM killer itself is very slow to react.

But if those attempts fail for whatever reason and we actually do end up
hitting the watchdog, something like this patch provides information
which is invaluable for diagnosing the problem.

> Last but not least, I don't think it is appropriate to intertwine
> watchdog code with oom handling code as suggested here.

The show_mem() function is in lib/ so that's outside of the OOM
handling.  The oom_dump_tasks() function could perhaps be refactored and
moved to a neutral location so that we would avoid the intertwining.
diff mbox series

Patch

diff --git a/drivers/watchdog/pretimeout_panic.c b/drivers/watchdog/pretimeout_panic.c
index 2cc3c41d2be5b..52d686fa541c7 100644
--- a/drivers/watchdog/pretimeout_panic.c
+++ b/drivers/watchdog/pretimeout_panic.c
@@ -5,10 +5,15 @@ 
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/oom.h>
 #include <linux/watchdog.h>
 
 #include "watchdog_pretimeout.h"
 
+static unsigned long dump_min_rss_bytes;
+module_param(dump_min_rss_bytes, ulong, 0644);
+
 /**
  * pretimeout_panic - Panic on watchdog pretimeout event
  * @wdd - watchdog_device
@@ -17,6 +22,16 @@ 
  */
 static void pretimeout_panic(struct watchdog_device *wdd)
 {
+	/*
+	 * Since the root cause is not certain to be low memory, only print
+	 * tasks with RSS above a configurable limit, to avoid losing
+	 * potentially more important messages from the log.
+	 */
+	if (dump_min_rss_bytes) {
+		show_mem(SHOW_MEM_FILTER_NODES, NULL);
+		oom_dump_tasks(DIV_ROUND_UP(dump_min_rss_bytes, PAGE_SIZE));
+	}
+
 	panic("watchdog pretimeout event\n");
 }
 
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 7d0c9c48a0c54..1451fe2c38d78 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -52,6 +52,9 @@  struct oom_control {
 
 	/* Used to print the constraint info. */
 	enum oom_constraint constraint;
+
+	bool dump_trylock;
+	unsigned long dump_min_rss_pages;
 };
 
 extern struct mutex oom_lock;
@@ -102,6 +105,8 @@  long oom_badness(struct task_struct *p,
 
 extern bool out_of_memory(struct oom_control *oc);
 
+extern void oom_dump_tasks(unsigned long min_rss_pages);
+
 extern void exit_oom_victim(void);
 
 extern int register_oom_notifier(struct notifier_block *nb);
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index e0f5ac90a228b..e8a68b2a3e829 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -183,6 +183,11 @@  static inline void task_lock(struct task_struct *p)
 	spin_lock(&p->alloc_lock);
 }
 
+static inline int task_trylock(struct task_struct *p)
+{
+	return spin_trylock(&p->alloc_lock);
+}
+
 static inline void task_unlock(struct task_struct *p)
 {
 	spin_unlock(&p->alloc_lock);
diff --git a/lib/show_mem.c b/lib/show_mem.c
index 1485c87be9354..cf90d1c5182b7 100644
--- a/lib/show_mem.c
+++ b/lib/show_mem.c
@@ -35,3 +35,4 @@  void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
 	printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
 #endif
 }
+EXPORT_SYMBOL_GPL(__show_mem);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 044e1eed720ee..0fad1c6d3c90c 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -149,6 +149,30 @@  struct task_struct *find_lock_task_mm(struct task_struct *p)
 	return t;
 }
 
+/*
+ * Identical to the above, except that we avoid tasks which we can't lock, to
+ * avoid deadlocks when called from an interrupt handler.
+ */
+static struct task_struct *find_trylock_task_mm(struct task_struct *p)
+{
+	struct task_struct *t;
+
+	rcu_read_lock();
+
+	for_each_thread(p, t) {
+		if (!task_trylock(t))
+			continue;
+		if (likely(t->mm))
+			goto found;
+		task_unlock(t);
+	}
+	t = NULL;
+found:
+	rcu_read_unlock();
+
+	return t;
+}
+
 /*
  * order == -1 means the oom kill is required by sysrq, otherwise only
  * for display purposes.
@@ -390,15 +414,26 @@  static int dump_task(struct task_struct *p, void *arg)
 	if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
 		return 0;
 
-	task = find_lock_task_mm(p);
+	task = oc->dump_trylock ? find_trylock_task_mm(p) :
+				  find_lock_task_mm(p);
 	if (!task) {
 		/*
 		 * All of p's threads have already detached their mm's. There's
 		 * no need to report them; they can't be oom killed anyway.
+		 *
+		 * Or we got here from an interrupt and the task lock is
+		 * locked, in which case we're forced to ignore this task to
+		 * avoid deadlocks.
 		 */
 		return 0;
 	}
 
+	if (oc->dump_min_rss_pages &&
+	    get_mm_rss(task->mm) < oc->dump_min_rss_pages) {
+		task_unlock(task);
+		return 0;
+	}
+
 	pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
 		task->pid, from_kuid(&init_user_ns, task_uid(task)),
 		task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
@@ -437,6 +472,23 @@  static void dump_tasks(struct oom_control *oc)
 	}
 }
 
+void oom_dump_tasks(unsigned long min_rss_pages)
+{
+	const gfp_t gfp_mask = GFP_KERNEL;
+	struct oom_control oc = {
+		.zonelist = node_zonelist(first_memory_node, gfp_mask),
+		.nodemask = NULL,
+		.memcg = NULL,
+		.gfp_mask = gfp_mask,
+		.order = -1,
+		.dump_min_rss_pages = min_rss_pages,
+		.dump_trylock = in_interrupt(),
+	};
+
+	dump_tasks(&oc);
+}
+EXPORT_SYMBOL_GPL(oom_dump_tasks);
+
 static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
 {
 	/* one line summary of the oom killer context. */