Message ID: 20220531223100.510392-2-surenb@google.com
State: New
Series: [RESEND,v2,1/2] mm: drop oom code from exit_mmap
On Tue, 31 May 2022 15:31:00 -0700 Suren Baghdasaryan <surenb@google.com> wrote:

> With the last usage of MMF_OOM_VICTIM in exit_mmap gone, this flag is
> now unused and can be removed.
>
> ...
>
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -77,15 +77,6 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk)
>  	return tsk->signal->oom_mm;
>  }
>
> -/*
> - * Use this helper if tsk->mm != mm and the victim mm needs a special
> - * handling. This is guaranteed to stay true after once set.
> - */
> -static inline bool mm_is_oom_victim(struct mm_struct *mm)
> -{
> -	return test_bit(MMF_OOM_VICTIM, &mm->flags);
> -}
> -

The patch "mm: multi-gen LRU: support page table walks" from the MGLRU
series (https://lkml.kernel.org/r/20220815071332.627393-9-yuzhao@google.com)
adds two calls to mm_is_oom_victim(), so my build broke.

I assume the fix is simply

--- a/mm/vmscan.c~mm-delete-unused-mmf_oom_victim-flag-fix
+++ a/mm/vmscan.c
@@ -3429,9 +3429,6 @@ static bool should_skip_mm(struct mm_str
 	if (size < MIN_LRU_BATCH)
 		return true;

-	if (mm_is_oom_victim(mm))
-		return true;
-
 	return !mmget_not_zero(mm);
 }

@@ -4127,9 +4124,6 @@ restart:

 		walk_pmd_range(&val, addr, next, args);

-		if (mm_is_oom_victim(args->mm))
-			return 1;
-
 		/* a racy check to curtail the waiting time */
 		if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
 			return 1;
_

Please confirm?
On Mon, Aug 22, 2022 at 4:21 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Tue, 31 May 2022 15:31:00 -0700 Suren Baghdasaryan <surenb@google.com> wrote:
>
> > With the last usage of MMF_OOM_VICTIM in exit_mmap gone, this flag is
> > now unused and can be removed.
> >
> > ...
> >
> > --- a/include/linux/oom.h
> > +++ b/include/linux/oom.h
> > @@ -77,15 +77,6 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk)
> >  	return tsk->signal->oom_mm;
> >  }
> >
> > -/*
> > - * Use this helper if tsk->mm != mm and the victim mm needs a special
> > - * handling. This is guaranteed to stay true after once set.
> > - */
> > -static inline bool mm_is_oom_victim(struct mm_struct *mm)
> > -{
> > -	return test_bit(MMF_OOM_VICTIM, &mm->flags);
> > -}
> > -
>
> The patch "mm: multi-gen LRU: support page table walks" from the MGLRU
> series (https://lkml.kernel.org/r/20220815071332.627393-9-yuzhao@google.com)
> adds two calls to mm_is_oom_victim(), so my build broke.
>
> I assume the fix is simply
>
> --- a/mm/vmscan.c~mm-delete-unused-mmf_oom_victim-flag-fix
> +++ a/mm/vmscan.c
> @@ -3429,9 +3429,6 @@ static bool should_skip_mm(struct mm_str
>  	if (size < MIN_LRU_BATCH)
>  		return true;
>
> -	if (mm_is_oom_victim(mm))
> -		return true;
> -
>  	return !mmget_not_zero(mm);
>  }
>
> @@ -4127,9 +4124,6 @@ restart:
>
>  		walk_pmd_range(&val, addr, next, args);
>
> -		if (mm_is_oom_victim(args->mm))
> -			return 1;
> -
>  		/* a racy check to curtail the waiting time */
>  		if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
>  			return 1;
> _
>
> Please confirm?

LGTM. The deleted checks are not about correctness.

I've queued

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3402,7 +3402,7 @@ static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
 	if (size < MIN_LRU_BATCH)
 		return true;

-	if (mm_is_oom_victim(mm))
+	if (test_bit(MMF_OOM_REAP_QUEUED, &mm->flags))
 		return true;

 	return !mmget_not_zero(mm);
@@ -4109,7 +4109,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,

 		walk_pmd_range(&val, addr, next, args);

-		if (mm_is_oom_victim(args->mm))
+		if (test_bit(MMF_OOM_REAP_QUEUED, &args->mm->flags))
 			return 1;

 		/* a racy check to curtail the waiting time */
On Mon, 22 Aug 2022 16:33:51 -0600 Yu Zhao <yuzhao@google.com> wrote:

> > --- a/mm/vmscan.c~mm-delete-unused-mmf_oom_victim-flag-fix
> > +++ a/mm/vmscan.c
> > @@ -3429,9 +3429,6 @@ static bool should_skip_mm(struct mm_str
> >  	if (size < MIN_LRU_BATCH)
> >  		return true;
> >
> > -	if (mm_is_oom_victim(mm))
> > -		return true;
> > -
> >  	return !mmget_not_zero(mm);
> >  }
> >
> > @@ -4127,9 +4124,6 @@ restart:
> >
> >  		walk_pmd_range(&val, addr, next, args);
> >
> > -	if (mm_is_oom_victim(args->mm))
> > -		return 1;
> > -
> >  		/* a racy check to curtail the waiting time */
> >  		if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
> >  			return 1;
> > _
> >
> > Please confirm?
>
> LGTM. The deleted checks are not about correctness.

OK, for now.

> I've queued
>
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -3402,7 +3402,7 @@ static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
>  	if (size < MIN_LRU_BATCH)
>  		return true;
>
> -	if (mm_is_oom_victim(mm))
> +	if (test_bit(MMF_OOM_REAP_QUEUED, &mm->flags))
>  		return true;
>
>  	return !mmget_not_zero(mm);
> @@ -4109,7 +4109,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
>
>  		walk_pmd_range(&val, addr, next, args);
>
> -		if (mm_is_oom_victim(args->mm))
> +		if (test_bit(MMF_OOM_REAP_QUEUED, &args->mm->flags))
>  			return 1;
>
>  		/* a racy check to curtail the waiting time */

Oh. Why? What does this change do?
On Mon, Aug 22, 2022 at 4:48 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Mon, 22 Aug 2022 16:33:51 -0600 Yu Zhao <yuzhao@google.com> wrote:
>
> > > --- a/mm/vmscan.c~mm-delete-unused-mmf_oom_victim-flag-fix
> > > +++ a/mm/vmscan.c
> > > @@ -3429,9 +3429,6 @@ static bool should_skip_mm(struct mm_str
> > >  	if (size < MIN_LRU_BATCH)
> > >  		return true;
> > >
> > > -	if (mm_is_oom_victim(mm))
> > > -		return true;
> > > -
> > >  	return !mmget_not_zero(mm);
> > >  }
> > >
> > > @@ -4127,9 +4124,6 @@ restart:
> > >
> > >  		walk_pmd_range(&val, addr, next, args);
> > >
> > > -		if (mm_is_oom_victim(args->mm))
> > > -			return 1;
> > > -
> > >  		/* a racy check to curtail the waiting time */
> > >  		if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
> > >  			return 1;
> > > _
> > >
> > > Please confirm?
> >
> > LGTM. The deleted checks are not about correctness.
>
> OK, for now.
>
> > I've queued
> >
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -3402,7 +3402,7 @@ static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
> >  	if (size < MIN_LRU_BATCH)
> >  		return true;
> >
> > -	if (mm_is_oom_victim(mm))
> > +	if (test_bit(MMF_OOM_REAP_QUEUED, &mm->flags))
> >  		return true;
> >
> >  	return !mmget_not_zero(mm);
> > @@ -4109,7 +4109,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
> >
> >  		walk_pmd_range(&val, addr, next, args);
> >
> > -		if (mm_is_oom_victim(args->mm))
> > +		if (test_bit(MMF_OOM_REAP_QUEUED, &args->mm->flags))
> >  			return 1;
> >
> >  		/* a racy check to curtail the waiting time */
>
> Oh. Why? What does this change do?

The MMF_OOM_REAP_QUEUED flag is similar to the deleted MMF_OOM_VICTIM
flag, but it's set at a later stage during an OOM kill.

When either is set, the OOM reaper is probably already freeing the
memory of this mm_struct, or at least it's going to. So there is no
need to dwell on it in the reclaim path, hence not about correctness.
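To make the timing difference concrete: MMF_OOM_VICTIM was set in
mark_oom_victim(), i.e. the moment the OOM killer commits to a victim,
whereas MMF_OOM_REAP_QUEUED is only set once the mm is actually handed
off to the oom_reaper in queue_oom_reaper(). A condensed sketch of the
relevant lines in mm/oom_kill.c around this time (elisions marked; the
mark_oom_victim() hunk matches the full patch at the end of this thread,
exact code varies by tree):

static void mark_oom_victim(struct task_struct *tsk)
{
	struct mm_struct *mm = tsk->mm;
	/* ... */
	/* oom_mm is bound to the signal struct life time. */
	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
		mmgrab(tsk->signal->oom_mm);
		set_bit(MMF_OOM_VICTIM, &mm->flags);	/* at selection time */
	}
	/* ... */
}

static void queue_oom_reaper(struct task_struct *tsk)
{
	/* mm is already queued? -- set only when handed off to the reaper */
	if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
		return;
	/* ... arm a timer that later wakes the oom_reaper kthread ... */
}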
On Mon, 22 Aug 2022 16:59:29 -0600 Yu Zhao <yuzhao@google.com> wrote:

> > > @@ -4109,7 +4109,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
> > >
> > >  		walk_pmd_range(&val, addr, next, args);
> > >
> > > -		if (mm_is_oom_victim(args->mm))
> > > +		if (test_bit(MMF_OOM_REAP_QUEUED, &args->mm->flags))
> > >  			return 1;
> > >
> > >  		/* a racy check to curtail the waiting time */
> >
> > Oh. Why? What does this change do?
>
> The MMF_OOM_REAP_QUEUED flag is similar to the deleted MMF_OOM_VICTIM
> flag, but it's set at a later stage during an OOM kill.
>
> When either is set, the OOM reaper is probably already freeing the
> memory of this mm_struct, or at least it's going to. So there is no
> need to dwell on it in the reclaim path, hence not about correctness.

Thanks. That sounds worthy of some code comments?
On Mon, Aug 22, 2022 at 5:16 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Mon, 22 Aug 2022 16:59:29 -0600 Yu Zhao <yuzhao@google.com> wrote:
>
> > > > @@ -4109,7 +4109,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
> > > >
> > > >  		walk_pmd_range(&val, addr, next, args);
> > > >
> > > > -		if (mm_is_oom_victim(args->mm))
> > > > +		if (test_bit(MMF_OOM_REAP_QUEUED, &args->mm->flags))
> > > >  			return 1;
> > > >
> > > >  		/* a racy check to curtail the waiting time */
> > >
> > > Oh. Why? What does this change do?
> >
> > The MMF_OOM_REAP_QUEUED flag is similar to the deleted MMF_OOM_VICTIM
> > flag, but it's set at a later stage during an OOM kill.
> >
> > When either is set, the OOM reaper is probably already freeing the
> > memory of this mm_struct, or at least it's going to. So there is no
> > need to dwell on it in the reclaim path, hence not about correctness.
>
> Thanks. That sounds worthy of some code comments?

Will do. Thanks.
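Such a comment might read something like the following in
should_skip_mm() (hypothetical wording, sketched here for illustration,
not the comment that eventually landed):

	/*
	 * An mm with MMF_OOM_REAP_QUEUED set is already on the oom_reaper's
	 * list, so its memory is probably being freed, or about to be.
	 * Skipping it is an optimization, not needed for correctness.
	 */
	if (test_bit(MMF_OOM_REAP_QUEUED, &mm->flags))
		return true;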
On Mon 22-08-22 17:20:17, Yu Zhao wrote:
> On Mon, Aug 22, 2022 at 5:16 PM Andrew Morton <akpm@linux-foundation.org> wrote:
> >
> > On Mon, 22 Aug 2022 16:59:29 -0600 Yu Zhao <yuzhao@google.com> wrote:
> >
> > > > > @@ -4109,7 +4109,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
> > > > >
> > > > >  		walk_pmd_range(&val, addr, next, args);
> > > > >
> > > > > -		if (mm_is_oom_victim(args->mm))
> > > > > +		if (test_bit(MMF_OOM_REAP_QUEUED, &args->mm->flags))
> > > > >  			return 1;
> > > > >
> > > > >  		/* a racy check to curtail the waiting time */
> > > >
> > > > Oh. Why? What does this change do?
> > >
> > > The MMF_OOM_REAP_QUEUED flag is similar to the deleted MMF_OOM_VICTIM
> > > flag, but it's set at a later stage during an OOM kill.
> > >
> > > When either is set, the OOM reaper is probably already freeing the
> > > memory of this mm_struct, or at least it's going to. So there is no
> > > need to dwell on it in the reclaim path, hence not about correctness.
> >
> > Thanks. That sounds worthy of some code comments?
>
> Will do. Thanks.

I would rather not see this abuse. You cannot really make any
assumptions about oom_reaper and how quickly it is going to free the
memory. If this is really worth it (and I have to say I doubt it) then
it should be a separate patch with numbers justifying it.
On Mon 29-08-22 12:40:05, Michal Hocko wrote:
> On Sun 28-08-22 13:50:09, Yu Zhao wrote:
> > On Tue, Aug 23, 2022 at 2:36 AM Michal Hocko <mhocko@suse.com> wrote:
> [...]
> > > You cannot really make any
> > > assumptions about oom_reaper and how quickly it is going to free the
> > > memory.
> >
> > Agreed. But here we are talking about heuristics, not dependencies on
> > certain behaviors. Assume we are playing a guessing game: there are
> > multiple mm_structs available for reclaim, would the oom-killed ones
> > be more profitable on average? I'd say no, because I assume it's more
> > likely than unlikely that the oom reaper is doing/to do its work. Note
> > that the assumption is about likelihood, hence arguably valid.
>
> Well, my main counter argument would be that we do not really want to
> carve a last-resort mechanism (which the oom reaper is) into any
> heuristic, because any future changes to that mechanism will be much
> harder to justify and make. There is a maintenance cost that should be
> considered. While you might be right that this change would be
> beneficial, there is no actual proof of that. Historically we've had
> several examples of such behavior which were really hard to change
> later on because the effect would be really hard to evaluate.

Forgot to mention the recent change as a clear example of a change which
would have a higher burden to evaluate. e4a38402c36e ("oom_kill.c:
futex: delay the OOM reaper to allow time for proper futex cleanup")
changed the wake-up logic to be triggered after a timeout. This means
that the task will be sitting there on the queue without any actual
reclaim done on it. The timeout itself can be changed in the future, and
I would really hate to argue that changing it from $FOO to $FOO +
epsilon breaks a very subtle dependency somewhere deep in the reclaim
path.

From the oom reaper POV any timeout is reasonable, because this is the
_last_ resort to resolve an OOM stall/deadlock when the victim cannot
exit on its own for whatever reason. This is a considerably different
objective from "we want to optimize which tasks to scan to reclaim
efficiently". See my point?
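For reference, the delayed wake-up introduced by e4a38402c36e looks
roughly like this in mm/oom_kill.c as of that commit (condensed; the
wake-up is deferred via a timer instead of being issued immediately):

#define OOM_REAPER_DELAY (2*HZ)
static void queue_oom_reaper(struct task_struct *tsk)
{
	/* mm is already queued? */
	if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
		return;

	get_task_struct(tsk);
	/* wake_oom_reaper() fires only after the delay expires */
	timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0);
	tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY;
	add_timer(&tsk->oom_reaper_timer);
}

In other words, MMF_OOM_REAP_QUEUED can be set a full OOM_REAPER_DELAY
before the reaper thread even wakes, and that delay is an implementation
detail free to change later, which is the subtle dependency being
objected to above.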
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 6cdde62b078b..7d0c9c48a0c5 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -77,15 +77,6 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk)
 	return tsk->signal->oom_mm;
 }

-/*
- * Use this helper if tsk->mm != mm and the victim mm needs a special
- * handling. This is guaranteed to stay true after once set.
- */
-static inline bool mm_is_oom_victim(struct mm_struct *mm)
-{
-	return test_bit(MMF_OOM_VICTIM, &mm->flags);
-}
-
 /*
  * Checks whether a page fault on the given mm is still reliable.
  * This is no longer true if the oom reaper started to reap the
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 4d0a5be28b70..8270ad7ae14c 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -71,9 +71,8 @@ static inline int get_dumpable(struct mm_struct *mm)
 #define MMF_UNSTABLE		22	/* mm is unstable for copy_from_user */
 #define MMF_HUGE_ZERO_PAGE	23	/* mm has ever used the global huge zero page */
 #define MMF_DISABLE_THP		24	/* disable THP for all VMAs */
-#define MMF_OOM_VICTIM		25	/* mm is the oom victim */
-#define MMF_OOM_REAP_QUEUED	26	/* mm was queued for oom_reaper */
-#define MMF_MULTIPROCESS	27	/* mm is shared between processes */
+#define MMF_OOM_REAP_QUEUED	25	/* mm was queued for oom_reaper */
+#define MMF_MULTIPROCESS	26	/* mm is shared between processes */
 /*
  * MMF_HAS_PINNED: Whether this mm has pinned any pages. This can be either
  * replaced in the future by mm.pinned_vm when it becomes stable, or grow into
@@ -81,7 +80,7 @@ static inline int get_dumpable(struct mm_struct *mm)
  * pinned pages were unpinned later on, we'll still keep this bit set for the
  * lifecycle of this mm, just for simplicity.
  */
-#define MMF_HAS_PINNED		28	/* FOLL_PIN has run, never cleared */
+#define MMF_HAS_PINNED		27	/* FOLL_PIN has run, never cleared */

 #define MMF_DISABLE_THP_MASK	(1 << MMF_DISABLE_THP)

 #define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 98dca2b42357..c6c76c313b39 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -764,10 +764,8 @@ static void mark_oom_victim(struct task_struct *tsk)
 		return;

 	/* oom_mm is bound to the signal struct life time. */
-	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
+	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
 		mmgrab(tsk->signal->oom_mm);
-		set_bit(MMF_OOM_VICTIM, &mm->flags);
-	}

 	/*
 	 * Make sure that the task is woken up from uninterruptible sleep