[RESEND,v2,2/2] mm: delete unused MMF_OOM_VICTIM flag

Message ID 20220531223100.510392-2-surenb@google.com
State New
Series [RESEND,v2,1/2] mm: drop oom code from exit_mmap

Commit Message

Suren Baghdasaryan May 31, 2022, 10:31 p.m. UTC
With the last usage of MMF_OOM_VICTIM in exit_mmap gone, this flag is
now unused and can be removed.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
---
 include/linux/oom.h            | 9 ---------
 include/linux/sched/coredump.h | 7 +++----
 mm/oom_kill.c                  | 4 +---
 3 files changed, 4 insertions(+), 16 deletions(-)
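
For context, the last reader of the flag was the OOM special case in
exit_mmap() that patch 1/2 of this series removed. Roughly, that logic
looked like the sketch below (paraphrased from pre-5.19 sources; the
wrapper function is invented here for illustration):

/*
 * Paraphrased sketch of the exit_mmap() OOM handling dropped by patch
 * 1/2 -- not the verbatim removed hunk.  With it gone, nothing tests
 * MMF_OOM_VICTIM any more, which is what makes this deletion safe.
 */
static void exit_mmap_oom_case(struct mm_struct *mm)
{
	if (unlikely(mm_is_oom_victim(mm))) {
		/* Reap the address space inline on the exit path... */
		(void)__oom_reap_task_mm(mm);
		/* ...and tell the oom_reaper thread to skip this mm. */
		set_bit(MMF_OOM_SKIP, &mm->flags);
	}
}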

Comments

Andrew Morton Aug. 22, 2022, 10:21 p.m. UTC | #1
On Tue, 31 May 2022 15:31:00 -0700 Suren Baghdasaryan <surenb@google.com> wrote:

> With the last usage of MMF_OOM_VICTIM in exit_mmap gone, this flag is
> now unused and can be removed.
> 
> ...
>
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -77,15 +77,6 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk)
>  	return tsk->signal->oom_mm;
>  }
>  
> -/*
> - * Use this helper if tsk->mm != mm and the victim mm needs a special
> - * handling. This is guaranteed to stay true after once set.
> - */
> -static inline bool mm_is_oom_victim(struct mm_struct *mm)
> -{
> -	return test_bit(MMF_OOM_VICTIM, &mm->flags);
> -}
> -

The patch "mm: multi-gen LRU: support page table walks" from the MGLRU
series
(https://lkml.kernel.org/r/20220815071332.627393-9-yuzhao@google.com)
adds two calls to mm_is_oom_victim(), so my build broke.

I assume the fix is simply

--- a/mm/vmscan.c~mm-delete-unused-mmf_oom_victim-flag-fix
+++ a/mm/vmscan.c
@@ -3429,9 +3429,6 @@ static bool should_skip_mm(struct mm_str
 	if (size < MIN_LRU_BATCH)
 		return true;
 
-	if (mm_is_oom_victim(mm))
-		return true;
-
 	return !mmget_not_zero(mm);
 }
 
@@ -4127,9 +4124,6 @@ restart:
 
 		walk_pmd_range(&val, addr, next, args);
 
-		if (mm_is_oom_victim(args->mm))
-			return 1;
-
 		/* a racy check to curtail the waiting time */
 		if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
 			return 1;
_

Please confirm?
Yu Zhao Aug. 22, 2022, 10:33 p.m. UTC | #2
On Mon, Aug 22, 2022 at 4:21 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Tue, 31 May 2022 15:31:00 -0700 Suren Baghdasaryan <surenb@google.com> wrote:
>
> > With the last usage of MMF_OOM_VICTIM in exit_mmap gone, this flag is
> > now unused and can be removed.
> >
> > ...
> >
> > --- a/include/linux/oom.h
> > +++ b/include/linux/oom.h
> > @@ -77,15 +77,6 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk)
> >       return tsk->signal->oom_mm;
> >  }
> >
> > -/*
> > - * Use this helper if tsk->mm != mm and the victim mm needs a special
> > - * handling. This is guaranteed to stay true after once set.
> > - */
> > -static inline bool mm_is_oom_victim(struct mm_struct *mm)
> > -{
> > -     return test_bit(MMF_OOM_VICTIM, &mm->flags);
> > -}
> > -
>
> The patch "mm: multi-gen LRU: support page table walks" from the MGLRU
> series
> (https://lkml.kernel.org/r/20220815071332.627393-9-yuzhao@google.com)
> adds two calls to mm_is_oom_victim(), so my build broke.
>
> I assume the fix is simply
>
> --- a/mm/vmscan.c~mm-delete-unused-mmf_oom_victim-flag-fix
> +++ a/mm/vmscan.c
> @@ -3429,9 +3429,6 @@ static bool should_skip_mm(struct mm_str
>         if (size < MIN_LRU_BATCH)
>                 return true;
>
> -       if (mm_is_oom_victim(mm))
> -               return true;
> -
>         return !mmget_not_zero(mm);
>  }
>
> @@ -4127,9 +4124,6 @@ restart:
>
>                 walk_pmd_range(&val, addr, next, args);
>
> -               if (mm_is_oom_victim(args->mm))
> -                       return 1;
> -
>                 /* a racy check to curtail the waiting time */
>                 if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
>                         return 1;
> _
>
> Please confirm?

LGTM.  The deleted checks are not about correctness.

I've queued

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3402,7 +3402,7 @@ static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
        if (size < MIN_LRU_BATCH)
                return true;

-       if (mm_is_oom_victim(mm))
+       if (test_bit(MMF_OOM_REAP_QUEUED, &mm->flags))
                return true;

        return !mmget_not_zero(mm);
@@ -4109,7 +4109,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,

                walk_pmd_range(&val, addr, next, args);

-               if (mm_is_oom_victim(args->mm))
+               if (test_bit(MMF_OOM_REAP_QUEUED, &args->mm->flags))
                        return 1;

                /* a racy check to curtail the waiting time */
Andrew Morton Aug. 22, 2022, 10:48 p.m. UTC | #3
On Mon, 22 Aug 2022 16:33:51 -0600 Yu Zhao <yuzhao@google.com> wrote:

> > --- a/mm/vmscan.c~mm-delete-unused-mmf_oom_victim-flag-fix
> > +++ a/mm/vmscan.c
> > @@ -3429,9 +3429,6 @@ static bool should_skip_mm(struct mm_str
> >         if (size < MIN_LRU_BATCH)
> >                 return true;
> >
> > -       if (mm_is_oom_victim(mm))
> > -               return true;
> > -
> >         return !mmget_not_zero(mm);
> >  }
> >
> > @@ -4127,9 +4124,6 @@ restart:
> >
> >                 walk_pmd_range(&val, addr, next, args);
> >
> > -               if (mm_is_oom_victim(args->mm))
> > -                       return 1;
> > -
> >                 /* a racy check to curtail the waiting time */
> >                 if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
> >                         return 1;
> > _
> >
> > Please confirm?
> 
> LGTM.  The deleted checks are not about correctness.

OK, for now.

> I've queued
> 
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -3402,7 +3402,7 @@ static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
>         if (size < MIN_LRU_BATCH)
>                 return true;
> 
> -       if (mm_is_oom_victim(mm))
> +       if (test_bit(MMF_OOM_REAP_QUEUED, &mm->flags))
>                 return true;
> 
>         return !mmget_not_zero(mm);
> @@ -4109,7 +4109,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
> 
>                 walk_pmd_range(&val, addr, next, args);
> 
> -               if (mm_is_oom_victim(args->mm))
> +               if (test_bit(MMF_OOM_REAP_QUEUED, &args->mm->flags))
>                         return 1;
> 
>                 /* a racy check to curtail the waiting time */

Oh.  Why?  What does this change do?
Yu Zhao Aug. 22, 2022, 10:59 p.m. UTC | #4
On Mon, Aug 22, 2022 at 4:48 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Mon, 22 Aug 2022 16:33:51 -0600 Yu Zhao <yuzhao@google.com> wrote:
>
> > > --- a/mm/vmscan.c~mm-delete-unused-mmf_oom_victim-flag-fix
> > > +++ a/mm/vmscan.c
> > > @@ -3429,9 +3429,6 @@ static bool should_skip_mm(struct mm_str
> > >         if (size < MIN_LRU_BATCH)
> > >                 return true;
> > >
> > > -       if (mm_is_oom_victim(mm))
> > > -               return true;
> > > -
> > >         return !mmget_not_zero(mm);
> > >  }
> > >
> > > @@ -4127,9 +4124,6 @@ restart:
> > >
> > >                 walk_pmd_range(&val, addr, next, args);
> > >
> > > -               if (mm_is_oom_victim(args->mm))
> > > -                       return 1;
> > > -
> > >                 /* a racy check to curtail the waiting time */
> > >                 if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
> > >                         return 1;
> > > _
> > >
> > > Please confirm?
> >
> > LGTM.  The deleted checks are not about correctness.
>
> OK, for now.
>
> > I've queued
> >
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -3402,7 +3402,7 @@ static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
> >         if (size < MIN_LRU_BATCH)
> >                 return true;
> >
> > -       if (mm_is_oom_victim(mm))
> > +       if (test_bit(MMF_OOM_REAP_QUEUED, &mm->flags))
> >                 return true;
> >
> >         return !mmget_not_zero(mm);
> > @@ -4109,7 +4109,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
> >
> >                 walk_pmd_range(&val, addr, next, args);
> >
> > -               if (mm_is_oom_victim(args->mm))
> > +               if (test_bit(MMF_OOM_REAP_QUEUED, &args->mm->flags))
> >                         return 1;
> >
> >                 /* a racy check to curtail the waiting time */
>
> Oh.  Why?  What does this change do?

The MMF_OOM_REAP_QUEUED flag is similar to the deleted MMF_OOM_VICTIM
flag, but it's set at a later stage during an OOM kill.

When either is set, the OOM reaper is probably already freeing the
memory of this mm_struct, or at least it's going to. So there is no
need to dwell on it in the reclaim path, hence not about correctness.
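
For orientation, a minimal sketch of the distinction (the helper name is
hypothetical; the flag-setting sites are paraphrased from 5.19-era
mm/oom_kill.c, not quoted from it):

#include <linux/mm_types.h>		/* struct mm_struct */
#include <linux/sched/coredump.h>	/* MMF_OOM_REAP_QUEUED */

/*
 * Hypothetical helper mirroring the replacement check.  During an OOM
 * kill, mark_oom_victim() set MMF_OOM_VICTIM as soon as the victim was
 * selected (the bit this patch deletes); MMF_OOM_REAP_QUEUED is set
 * later, in queue_oom_reaper(), once the mm is handed to the reaper
 * thread.
 */
static inline bool mm_oom_reap_queued(struct mm_struct *mm)
{
	return test_bit(MMF_OOM_REAP_QUEUED, &mm->flags);
}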
Andrew Morton Aug. 22, 2022, 11:16 p.m. UTC | #5
On Mon, 22 Aug 2022 16:59:29 -0600 Yu Zhao <yuzhao@google.com> wrote:

> > > @@ -4109,7 +4109,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
> > >
> > >                 walk_pmd_range(&val, addr, next, args);
> > >
> > > -               if (mm_is_oom_victim(args->mm))
> > > +               if (test_bit(MMF_OOM_REAP_QUEUED, &args->mm->flags))
> > >                         return 1;
> > >
> > >                 /* a racy check to curtail the waiting time */
> >
> > Oh.  Why?  What does this change do?
> 
> The MMF_OOM_REAP_QUEUED flag is similar to the deleted MMF_OOM_VICTIM
> flag, but it's set at a later stage during an OOM kill.
> 
> When either is set, the OOM reaper is probably already freeing the
> memory of this mm_struct, or at least it's going to. So there is no
> need to dwell on it in the reclaim path, hence not about correctness.

Thanks.  That sounds worthy of some code comments?
Yu Zhao Aug. 22, 2022, 11:20 p.m. UTC | #6
On Mon, Aug 22, 2022 at 5:16 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Mon, 22 Aug 2022 16:59:29 -0600 Yu Zhao <yuzhao@google.com> wrote:
>
> > > > @@ -4109,7 +4109,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
> > > >
> > > >                 walk_pmd_range(&val, addr, next, args);
> > > >
> > > > -               if (mm_is_oom_victim(args->mm))
> > > > +               if (test_bit(MMF_OOM_REAP_QUEUED, &args->mm->flags))
> > > >                         return 1;
> > > >
> > > >                 /* a racy check to curtail the waiting time */
> > >
> > > Oh.  Why?  What does this change do?
> >
> > The MMF_OOM_REAP_QUEUED flag is similar to the deleted MMF_OOM_VICTIM
> > flag, but it's set at a later stage during an OOM kill.
> >
> > When either is set, the OOM reaper is probably already freeing the
> > memory of this mm_struct, or at least it's going to. So there is no
> > need to dwell on it in the reclaim path, hence not about correctness.
>
> Thanks.  That sounds worthy of some code comments?

Will do. Thanks.
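
Presumably something along these lines (a sketch of the requested
comment based on Yu's explanation above, not the wording that was
eventually merged):

	/*
	 * An mm queued for OOM reaping is already being freed by the
	 * oom_reaper, or soon will be, so walking it is likely wasted
	 * effort.  Skipping it is an optimization, not a correctness
	 * requirement.
	 */
	if (test_bit(MMF_OOM_REAP_QUEUED, &mm->flags))
		return true;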
Michal Hocko Aug. 23, 2022, 8:36 a.m. UTC | #7
On Mon 22-08-22 17:20:17, Yu Zhao wrote:
> On Mon, Aug 22, 2022 at 5:16 PM Andrew Morton <akpm@linux-foundation.org> wrote:
> >
> > On Mon, 22 Aug 2022 16:59:29 -0600 Yu Zhao <yuzhao@google.com> wrote:
> >
> > > > > @@ -4109,7 +4109,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
> > > > >
> > > > >                 walk_pmd_range(&val, addr, next, args);
> > > > >
> > > > > -               if (mm_is_oom_victim(args->mm))
> > > > > +               if (test_bit(MMF_OOM_REAP_QUEUED, &args->mm->flags))
> > > > >                         return 1;
> > > > >
> > > > >                 /* a racy check to curtail the waiting time */
> > > >
> > > > Oh.  Why?  What does this change do?
> > >
> > > The MMF_OOM_REAP_QUEUED flag is similar to the deleted MMF_OOM_VICTIM
> > > flag, but it's set at a later stage during an OOM kill.
> > >
> > > When either is set, the OOM reaper is probably already freeing the
> > > memory of this mm_struct, or at least it's going to. So there is no
> > > need to dwell on it in the reclaim path, hence not about correctness.
> >
> > Thanks.  That sounds worthy of some code comments?
> 
> Will do. Thanks.

I would rather not see this abuse. You cannot really make any
assumptions about oom_reaper and how quickly it is going to free the
memory. If this is really worth it (and I have to say I doubt it) then
it should be a separate patch with numbers justifying it.
Michal Hocko Aug. 29, 2022, 10:45 a.m. UTC | #8
On Mon 29-08-22 12:40:05, Michal Hocko wrote:
> On Sun 28-08-22 13:50:09, Yu Zhao wrote:
> > On Tue, Aug 23, 2022 at 2:36 AM Michal Hocko <mhocko@suse.com> wrote:
> [...]
> > > You cannot really make any
> > > assumptions about oom_reaper and how quickly it is going to free the
> > > memory.
> > 
> > Agreed. But here we are talking about heuristics, not dependencies on
> > certain behaviors. Assume we are playing a guessing game: given
> > multiple mm_structs available for reclaim, would the oom-killed ones
> > be more profitable on average? I'd say no, because I assume it's more
> > likely than not that the oom reaper is already doing, or is about to
> > do, its work. Note that the assumption is about likelihood, hence
> > arguably valid.
> 
> Well, my main counter argument would be that we do not really want to
> carve a last-resort mechanism (which the oom reaper is) into any
> heuristic, because any future changes to that mechanism will be much
> harder to justify and make. There is a maintenance cost that should be
> considered. While you might be right that this change would be
> beneficial, there is no actual proof of that. Historically we've had
> several examples of such behavior that was really hard to change
> later on because the effect would be really hard to evaluate.

Forgot to mention a recent change as a clear example of one that would
have a higher burden to evaluate. e4a38402c36e
("oom_kill.c: futex: delay the OOM reaper to allow time for proper futex
cleanup") has changed the wake-up logic to be triggered after a timeout.
This means that the task will be sitting there on the queue without any
actual reclaim done on it. The timeout itself can be changed in the
future and I would really hate to argue that changing it from $FOO to
$FOO + epsilon breaks a very subtle dependency somewhere deep in the
reclaim path. From the oom reaper POV any timeout is reasonable because
this is the _last_ resort to resolve an OOM stall/deadlock when the
victim cannot exit on its own for whatever reason. This is a considerably
different objective from "we want to optimize which tasks to scan to
reclaim efficiently".

See my point?
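
For reference, the delayed wake-up Michal refers to has roughly this
shape (paraphrased from e4a38402c36e; names and the exact delay may
differ from what is in any given tree):

#define OOM_REAPER_DELAY	(2*HZ)	/* grace period for futex cleanup */

/*
 * Paraphrased sketch: the victim is queued behind a timer, so
 * MMF_OOM_REAP_QUEUED can be set well before any memory is actually
 * reaped -- which is why a reclaim heuristic keyed on it is hard to
 * reason about.
 */
static void queue_oom_reaper(struct task_struct *tsk)
{
	/* mm is already queued? */
	if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
		return;

	get_task_struct(tsk);
	timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0);
	tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY;
	add_timer(&tsk->oom_reaper_timer);
}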

Patch

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 6cdde62b078b..7d0c9c48a0c5 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -77,15 +77,6 @@  static inline bool tsk_is_oom_victim(struct task_struct * tsk)
 	return tsk->signal->oom_mm;
 }
 
-/*
- * Use this helper if tsk->mm != mm and the victim mm needs a special
- * handling. This is guaranteed to stay true after once set.
- */
-static inline bool mm_is_oom_victim(struct mm_struct *mm)
-{
-	return test_bit(MMF_OOM_VICTIM, &mm->flags);
-}
-
 /*
  * Checks whether a page fault on the given mm is still reliable.
  * This is no longer true if the oom reaper started to reap the
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 4d0a5be28b70..8270ad7ae14c 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -71,9 +71,8 @@  static inline int get_dumpable(struct mm_struct *mm)
 #define MMF_UNSTABLE		22	/* mm is unstable for copy_from_user */
 #define MMF_HUGE_ZERO_PAGE	23      /* mm has ever used the global huge zero page */
 #define MMF_DISABLE_THP		24	/* disable THP for all VMAs */
-#define MMF_OOM_VICTIM		25	/* mm is the oom victim */
-#define MMF_OOM_REAP_QUEUED	26	/* mm was queued for oom_reaper */
-#define MMF_MULTIPROCESS	27	/* mm is shared between processes */
+#define MMF_OOM_REAP_QUEUED	25	/* mm was queued for oom_reaper */
+#define MMF_MULTIPROCESS	26	/* mm is shared between processes */
 /*
  * MMF_HAS_PINNED: Whether this mm has pinned any pages.  This can be either
  * replaced in the future by mm.pinned_vm when it becomes stable, or grow into
@@ -81,7 +80,7 @@  static inline int get_dumpable(struct mm_struct *mm)
  * pinned pages were unpinned later on, we'll still keep this bit set for the
  * lifecycle of this mm, just for simplicity.
  */
-#define MMF_HAS_PINNED		28	/* FOLL_PIN has run, never cleared */
+#define MMF_HAS_PINNED		27	/* FOLL_PIN has run, never cleared */
 #define MMF_DISABLE_THP_MASK	(1 << MMF_DISABLE_THP)
 
 #define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 98dca2b42357..c6c76c313b39 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -764,10 +764,8 @@  static void mark_oom_victim(struct task_struct *tsk)
 		return;
 
 	/* oom_mm is bound to the signal struct life time. */
-	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
+	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
 		mmgrab(tsk->signal->oom_mm);
-		set_bit(MMF_OOM_VICTIM, &mm->flags);
-	}
 
 	/*
 	 * Make sure that the task is woken up from uninterruptible sleep