Message ID: 20241213095407.271357-1-gmonaco@redhat.com
Series: sched: Move task_mm_cid_work to mm delayed work
On Fri, 2024-12-13 at 10:54 +0100, Gabriele Monaco wrote:
> OVERHEAD COMPARISON
> 
> [..]
> 
> I will post another email with the scripts used to retrieve the data and
> more details about the runtime distribution.

This message contains the performance results produced by my scripts,
which are attached. The tracing is done via bpftrace while a simple bash
script is spawning and killing the loads.
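For illustration only (the attached scripts remain the reference), one of
the loads being spawned and killed could be as simple as a process whose
threads burn CPU in bursts; the thread count, loop length and sleep below
are arbitrary placeholders, not values taken from the attached scripts.

/* Hypothetical load: a few threads burn CPU in bursts until the driving
 * bash script kills the process. Illustrative only, not the attachment. */
#include <pthread.h>
#include <unistd.h>

#define NTHREADS 8

static void *burn(void *arg)
{
	volatile unsigned long x = 0;

	(void)arg;
	for (unsigned long i = 0; i < 100000000UL; i++)
		x += i;
	return NULL;
}

int main(void)
{
	pthread_t tids[NTHREADS];
	int i;

	for (;;) {
		for (i = 0; i < NTHREADS; i++)
			pthread_create(&tids[i], NULL, burn, NULL);
		for (i = 0; i < NTHREADS; i++)
			pthread_join(tids[i], NULL);
		usleep(100 * 1000);	/* idle gap between bursts */
	}
	return 0;
}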
On 2024-12-13 04:54, Gabriele Monaco wrote:
> Currently, the task_mm_cid_work function is called in a task work
> triggered by a scheduler tick. This can delay the execution of the task
> for the entire duration of the function, negatively affecting the
> response of real time tasks.
> 
> This patch runs the task_mm_cid_work in a new delayed work connected to
> the mm_struct rather than in the task context before returning to
> userspace.
> 
> This delayed work is initialised while allocating the mm and disabled
> before freeing it, its execution is no longer triggered by scheduler
> ticks but run periodically based on the defined MM_CID_SCAN_DELAY.
> 
> The main advantage of this change is that the function can be offloaded
> to a different CPU and even preempted by RT tasks.
> 
> Moreover, this new behaviour could be more predictable in some
> situations since the delayed work is always scheduled with the same
> periodicity for each mm.

This last paragraph could be clarified. AFAIR, the problem with the
preexisting approach based on the scheduler tick is with a mm consisting
of a set of periodic threads, where none happen to run while the
scheduler tick is running. This would skip mm_cid compaction. So it's
not a bug per se, because the mm_cid allocation will just be slightly
less compact than it should be in that case.

The underlying question here is whether eventual convergence of mm_cid
towards 0 when the number of threads or the allowed CPU mask are
reduced in a mm should be guaranteed or only best effort.

If best effort, then this corner-case is not worthy of a "Fix" tag.
Otherwise, we should identify which commit it fixes and introduce a
"Fix" tag.

Thanks,

Mathieu

> 
> Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
> ---
>  include/linux/mm_types.h | 11 +++++++++
>  include/linux/sched.h    |  1 -
>  kernel/sched/core.c      | 51 ++++++----------------------------------
>  kernel/sched/sched.h     |  7 ------
>  4 files changed, 18 insertions(+), 52 deletions(-)
> 
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 7361a8f3ab68..92acb827fee4 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -856,6 +856,7 @@ struct mm_struct {
>  		 * mm nr_cpus_allowed updates.
>  		 */
>  		raw_spinlock_t cpus_allowed_lock;
> +		struct delayed_work mm_cid_work;
>  #endif
>  #ifdef CONFIG_MMU
>  		atomic_long_t pgtables_bytes;	/* size of all page tables */
> @@ -1144,11 +1145,16 @@ static inline void vma_iter_init(struct vma_iterator *vmi,
>  
>  #ifdef CONFIG_SCHED_MM_CID
>  
> +#define SCHED_MM_CID_PERIOD_NS	(100ULL * 1000000)	/* 100ms */
> +#define MM_CID_SCAN_DELAY	100			/* 100ms */
> +
>  enum mm_cid_state {
>  	MM_CID_UNSET = -1U,		/* Unset state has lazy_put flag set. */
>  	MM_CID_LAZY_PUT = (1U << 31),
>  };
>  
> +extern void task_mm_cid_work(struct work_struct *work);
> +
>  static inline bool mm_cid_is_unset(int cid)
>  {
>  	return cid == MM_CID_UNSET;
> @@ -1221,12 +1227,17 @@ static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *
>  	if (!mm->pcpu_cid)
>  		return -ENOMEM;
>  	mm_init_cid(mm, p);
> +	INIT_DELAYED_WORK(&mm->mm_cid_work, task_mm_cid_work);
> +	mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY);
> +	schedule_delayed_work(&mm->mm_cid_work,
> +			      msecs_to_jiffies(MM_CID_SCAN_DELAY));
>  	return 0;
>  }
>  #define mm_alloc_cid(...)	alloc_hooks(mm_alloc_cid_noprof(__VA_ARGS__))
>  
>  static inline void mm_destroy_cid(struct mm_struct *mm)
>  {
> +	disable_delayed_work_sync(&mm->mm_cid_work);
>  	free_percpu(mm->pcpu_cid);
>  	mm->pcpu_cid = NULL;
>  }
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index d380bffee2ef..5d141c310917 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1374,7 +1374,6 @@ struct task_struct {
>  	int				last_mm_cid;	/* Most recent cid in mm */
>  	int				migrate_from_cpu;
>  	int				mm_cid_active;	/* Whether cid bitmap is active */
> -	struct callback_head		cid_work;
>  #endif
>  
>  	struct tlbflush_unmap_batch	tlb_ubc;
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index c6d8232ad9ee..e3b27b73301c 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4516,7 +4516,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
>  	p->wake_entry.u_flags = CSD_TYPE_TTWU;
>  	p->migration_pending = NULL;
>  #endif
> -	init_sched_mm_cid(p);
>  }
>  
>  DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
> @@ -5654,7 +5653,6 @@ void sched_tick(void)
>  		resched_latency = cpu_resched_latency(rq);
>  	calc_global_load_tick(rq);
>  	sched_core_tick(rq);
> -	task_tick_mm_cid(rq, donor);
>  	scx_tick(rq);
>  
>  	rq_unlock(rq, &rf);
> @@ -10520,22 +10518,14 @@ static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
>  	sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
>  }
>  
> -static void task_mm_cid_work(struct callback_head *work)
> +void task_mm_cid_work(struct work_struct *work)
>  {
>  	unsigned long now = jiffies, old_scan, next_scan;
> -	struct task_struct *t = current;
>  	struct cpumask *cidmask;
> -	struct mm_struct *mm;
> +	struct delayed_work *delayed_work = container_of(work, struct delayed_work, work);
> +	struct mm_struct *mm = container_of(delayed_work, struct mm_struct, mm_cid_work);
>  	int weight, cpu;
>  
> -	SCHED_WARN_ON(t != container_of(work, struct task_struct, cid_work));
> -
> -	work->next = work;	/* Prevent double-add */
> -	if (t->flags & PF_EXITING)
> -		return;
> -	mm = t->mm;
> -	if (!mm)
> -		return;
>  	old_scan = READ_ONCE(mm->mm_cid_next_scan);
>  	next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
>  	if (!old_scan) {
> @@ -10548,9 +10538,9 @@ static void task_mm_cid_work(struct callback_head *work)
>  			old_scan = next_scan;
>  	}
>  	if (time_before(now, old_scan))
> -		return;
> +		goto out;
>  	if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan))
> -		return;
> +		goto out;
>  	cidmask = mm_cidmask(mm);
>  	/* Clear cids that were not recently used. */
>  	for_each_possible_cpu(cpu)
> @@ -10562,35 +10552,8 @@ static void task_mm_cid_work(struct callback_head *work)
>  	 */
>  	for_each_possible_cpu(cpu)
>  		sched_mm_cid_remote_clear_weight(mm, cpu, weight);
> -}
> -
> -void init_sched_mm_cid(struct task_struct *t)
> -{
> -	struct mm_struct *mm = t->mm;
> -	int mm_users = 0;
> -
> -	if (mm) {
> -		mm_users = atomic_read(&mm->mm_users);
> -		if (mm_users == 1)
> -			mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY);
> -	}
> -	t->cid_work.next = &t->cid_work;	/* Protect against double add */
> -	init_task_work(&t->cid_work, task_mm_cid_work);
> -}
> -
> -void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
> -{
> -	struct callback_head *work = &curr->cid_work;
> -	unsigned long now = jiffies;
> -
> -	if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) ||
> -	    work->next != work)
> -		return;
> -	if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
> -		return;
> -
> -	/* No page allocation under rq lock */
> -	task_work_add(curr, work, TWA_RESUME | TWAF_NO_ALLOC);
> +out:
> +	schedule_delayed_work(delayed_work, msecs_to_jiffies(MM_CID_SCAN_DELAY));
>  }
>  
>  void sched_mm_cid_exit_signals(struct task_struct *t)
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 76f5f53a645f..21be461ff913 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -3581,16 +3581,11 @@ extern void sched_dynamic_update(int mode);
>  
>  #ifdef CONFIG_SCHED_MM_CID
>  
> -#define SCHED_MM_CID_PERIOD_NS	(100ULL * 1000000)	/* 100ms */
> -#define MM_CID_SCAN_DELAY	100			/* 100ms */
> -
>  extern raw_spinlock_t cid_lock;
>  extern int use_cid_lock;
>  
>  extern void sched_mm_cid_migrate_from(struct task_struct *t);
>  extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t);
> -extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr);
> -extern void init_sched_mm_cid(struct task_struct *t);
>  
>  static inline void __mm_cid_put(struct mm_struct *mm, int cid)
>  {
> @@ -3839,8 +3834,6 @@ static inline void switch_mm_cid(struct rq *rq,
>  static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { }
>  static inline void sched_mm_cid_migrate_from(struct task_struct *t) { }
>  static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { }
> -static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
> -static inline void init_sched_mm_cid(struct task_struct *t) { }
>  #endif /* !CONFIG_SCHED_MM_CID */
>  
>  extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
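For readers skimming the quoted diff, the per-mm lifecycle the patch
introduces condenses to the following (taken from the hunks above; kernel
context, not standalone code):

/* At mm allocation (mm_alloc_cid_noprof): arm the periodic scan. */
INIT_DELAYED_WORK(&mm->mm_cid_work, task_mm_cid_work);
mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY);
schedule_delayed_work(&mm->mm_cid_work, msecs_to_jiffies(MM_CID_SCAN_DELAY));

/* The work recovers its mm via container_of and re-arms itself
 * unconditionally, so each mm is scanned with a fixed periodicity. */
void task_mm_cid_work(struct work_struct *work)
{
	struct delayed_work *delayed_work = container_of(work, struct delayed_work, work);
	struct mm_struct *mm = container_of(delayed_work, struct mm_struct, mm_cid_work);

	/* ... mm_cid compaction as in the hunks above ... */
	schedule_delayed_work(delayed_work, msecs_to_jiffies(MM_CID_SCAN_DELAY));
}

/* At mm teardown (mm_destroy_cid): stop the work before freeing. */
disable_delayed_work_sync(&mm->mm_cid_work);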