[RFC,v2,7/8] sched/{fair,tune}: track RUNNABLE tasks impact on per CPU boost value

Message ID	20161027174108.31139-8-patrick.bellasi@arm.com
State	New
Headers	Delivered-To: patch@linaro.org
	Received-SPF: pass (google.com: best guess record for domain of linux-kernel-owner@vger.kernel.org designates 209.132.180.67 as permitted sender) client-ip=209.132.180.67;
	From: Patrick Bellasi <patrick.bellasi@arm.com>
	To: linux-kernel@vger.kernel.org
	Cc: Ingo Molnar <mingo@kernel.org>, Peter Zijlstra <peterz@infradead.org>,
	    Vincent Guittot <vincent.guittot@linaro.org>, Steve Muckle <steve.muckle@linaro.org>,
	    Leo Yan <leo.yan@linaro.org>, Viresh Kumar <viresh.kumar@linaro.org>,
	    "Rafael J . Wysocki" <rjw@rjwysocki.net>, Todd Kjos <tkjos@google.com>,
	    Srinath Sridharan <srinathsr@google.com>, Andres Oportus <andresoportus@google.com>,
	    Juri Lelli <juri.lelli@arm.com>, Morten Rasmussen <morten.rasmussen@arm.com>,
	    Dietmar Eggemann <dietmar.eggemann@arm.com>, Chris Redpath <chris.redpath@arm.com>,
	    Robin Randhawa <robin.randhawa@arm.com>, Patrick Bellasi <patrick.bellasi@arm.com>
	Subject: [RFC v2 7/8] sched/{fair, tune}: track RUNNABLE tasks impact on per CPU boost value
	Date: Thu, 27 Oct 2016 18:41:07 +0100
	Message-Id: <20161027174108.31139-8-patrick.bellasi@arm.com>
	In-Reply-To: <20161027174108.31139-1-patrick.bellasi@arm.com>
	References: <20161027174108.31139-1-patrick.bellasi@arm.com>
	Sender: linux-kernel-owner@vger.kernel.org
	Precedence: bulk

diff --git a/kernel/exit.c b/kernel/exit.c index 9d68c45..541e4e1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -55,6 +55,8 @@ #include <linux/shm.h> #include <linux/kcov.h> +#include "sched/tune.h" + #include <asm/uaccess.h> #include <asm/unistd.h> #include <asm/pgtable.h> @@ -775,6 +777,9 @@ void __noreturn do_exit(long code) } exit_signals(tsk); /* sets PF_EXITING */ + + schedtune_exit_task(tsk); + /* * Ensure that all new tsk->pi_lock acquisitions must observe * PF_EXITING. Serializes against futex.c:attach_to_pi_owner(). diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 313a815..f56953b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4570,6 +4570,25 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); } + /* + * Update SchedTune accouting. + * + * We do it before updating the CPU capacity to ensure the + * boost value of the current task is accounted for in the + * selection of the OPP. + * + * We do it also in the case where we enqueue a trottled task; + * we could argue that a throttled task should not boost a CPU, + * however: + * a) properly implementing CPU boosting considering throttled + * tasks will increase a lot the complexity of the solution + * b) it's not easy to quantify the benefits introduced by + * such a more complex solution. + * Thus, for the time being we go for the simple solution and boost + * also for throttled RQs. + */ + schedtune_enqueue_task(p, cpu_of(rq)); + if (!se) add_nr_running(rq, 1); @@ -4629,6 +4648,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); } + /* + * Update SchedTune accouting + * + * We do it before updating the CPU capacity to ensure the + * boost value of the current task is accounted for in the + * selection of the OPP. 
+ */ + schedtune_dequeue_task(p, cpu_of(rq)); + if (!se) sub_nr_running(rq, 1); diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 6a51a4d..965a3e1 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -7,6 +7,7 @@ #include <linux/cgroup.h> #include <linux/err.h> #include <linux/percpu.h> +#include <linux/rcupdate.h> #include <linux/slab.h> #include "sched.h" @@ -16,6 +17,8 @@ unsigned int sysctl_sched_cfs_boost __read_mostly; #ifdef CONFIG_CGROUP_SCHED_TUNE +static bool schedtune_initialized; + /* * CFS Scheduler Tunables for Task Groups. */ @@ -99,6 +102,8 @@ struct boost_groups { /* Count of RUNNABLE tasks on that boost group */ unsigned int tasks; } group[boostgroups_max]; + /* CPU's boost group locking */ + raw_spinlock_t lock; }; /* Boost groups affecting each CPU in the system */ @@ -171,6 +176,213 @@ int schedtune_cpu_boost(int cpu) return bg->boost_max; } +#define ENQUEUE_TASK 1 +#define DEQUEUE_TASK -1 + +static inline void +schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) +{ + struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu); + int tasks = bg->group[idx].tasks + task_count; + + /* Update boosted tasks count while avoiding to make it negative */ + bg->group[idx].tasks = max(0, tasks); + + /* Boost group activation or deactivation on that RQ */ + if (tasks == 1 || tasks == 0) + schedtune_cpu_update(cpu); +} + +/* + * NOTE: This function must be called while holding the lock on the CPU RQ + */ +void schedtune_enqueue_task(struct task_struct *p, int cpu) +{ + struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu); + unsigned long irq_flags; + struct schedtune *st; + int idx; + + lockdep_assert_held(&cpu_rq(cpu)->lock); + + if (!unlikely(schedtune_initialized)) + return; + + /* + * When a task is marked PF_EXITING by do_exit() it's going to be + * dequeued and enqueued multiple times in the exit path. 
+ * Thus we avoid any further update, since we do not want to change + * CPU boosting while the task is exiting. + */ + if (p->flags & PF_EXITING) + return; + + /* + * Boost group accouting is protected by a per-cpu lock and requires + * interrupt to be disabled to avoid race conditions for example on + * do_exit()::cgroup_exit() and task migration. + */ + raw_spin_lock_irqsave(&bg->lock, irq_flags); + rcu_read_lock(); + + st = task_schedtune(p); + idx = st->idx; + + schedtune_tasks_update(p, cpu, idx, ENQUEUE_TASK); + + rcu_read_unlock(); + raw_spin_unlock_irqrestore(&bg->lock, irq_flags); +} + +static int schedtune_can_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *dst_css; + struct rq_flags rq_flags; + struct task_struct *task; + struct boost_groups *bg; + unsigned int cpu; + struct rq *rq; + int src_bg; /* Source boost group index */ + int dst_bg; /* Destination boost group index */ + int tasks; + + if (!unlikely(schedtune_initialized)) + return 0; + + cgroup_taskset_for_each(task, dst_css, tset) { + + /* + * Lock the CPU's RQ the task is enqueued to avoid race + * conditions with migration code while the task is being + * accounted + */ + rq = task_rq_lock(task, &rq_flags); + + if (!task->on_rq) { + task_rq_unlock(rq, task, &rq_flags); + continue; + } + + /* + * Boost group accouting is protected by a per-cpu lock and + * requires interrupt to be disabled to avoid race conditions + * on tasks migrations. + */ + cpu = cpu_of(rq); + bg = &per_cpu(cpu_boost_groups, cpu); + raw_spin_lock(&bg->lock); + + dst_bg = css_st(dst_css)->idx; + src_bg = task_schedtune(task)->idx; + + /* + * Current task is not changing boostgroup, which can + * happen when the new hierarchy is in use. + */ + if (unlikely(dst_bg == src_bg)) { + raw_spin_unlock(&bg->lock); + task_rq_unlock(rq, task, &rq_flags); + continue; + } + + /* + * This is the case of a RUNNABLE task which is switching its + * current boost group. 
+ */ + + /* Move task from src to dst boost group */ + tasks = bg->group[src_bg].tasks - 1; + bg->group[src_bg].tasks = max(0, tasks); + bg->group[dst_bg].tasks += 1; + + raw_spin_unlock(&bg->lock); + task_rq_unlock(rq, task, &rq_flags); + + /* Update CPU boost group */ + if (bg->group[src_bg].tasks == 0 || + bg->group[dst_bg].tasks == 1) + schedtune_cpu_update(task_cpu(task)); + } + + return 0; +} + +static void schedtune_cancel_attach(struct cgroup_taskset *tset) +{ + /* + * This can happen only if SchedTune controller is mounted with + * other hierarchies and one of them fails. Since usually SchedTune is + * mounted on its own hierarchy, for the time being we do not implement + * a proper rollback mechanism. + */ + WARN(1, "SchedTune cancel attach not implemented"); +} + +/* + * NOTE: This function must be called while holding the lock on the CPU RQ + */ +void schedtune_dequeue_task(struct task_struct *p, int cpu) +{ + struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu); + unsigned long irq_flags; + struct schedtune *st; + int idx; + + lockdep_assert_held(&cpu_rq(cpu)->lock); + + if (!unlikely(schedtune_initialized)) + return; + + /* + * When a task is marked PF_EXITING by do_exit() it's going to be + * dequeued and enqueued multiple times in the exit path. + * Thus we avoid any further update, since we do not want to change + * CPU boosting while the task is exiting. + * The last dequeue is already enforce by the do_exit() code path + * via schedtune_exit_task(). + */ + if (p->flags & PF_EXITING) + return; + + /* + * Boost group accouting is protected by a per-cpu lock and requires + * interrupt to be disabled to avoid race conditions on... 
+ */ + raw_spin_lock_irqsave(&bg->lock, irq_flags); + rcu_read_lock(); + + st = task_schedtune(p); + idx = st->idx; + + schedtune_tasks_update(p, cpu, idx, DEQUEUE_TASK); + + rcu_read_unlock(); + raw_spin_unlock_irqrestore(&bg->lock, irq_flags); +} + +void schedtune_exit_task(struct task_struct *tsk) +{ + struct rq_flags rq_flags; + struct schedtune *st; + unsigned int cpu; + struct rq *rq; + int idx; + + if (!unlikely(schedtune_initialized)) + return; + + rq = task_rq_lock(tsk, &rq_flags); + rcu_read_lock(); + + cpu = cpu_of(rq); + st = task_schedtune(tsk); + idx = st->idx; + schedtune_tasks_update(tsk, cpu, idx, DEQUEUE_TASK); + + rcu_read_unlock(); + task_rq_unlock(rq, tsk, &rq_flags); +} + static u64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -288,6 +500,8 @@ schedtune_css_free(struct cgroup_subsys_state *css) struct cgroup_subsys schedtune_cgrp_subsys = { .css_alloc = schedtune_css_alloc, .css_free = schedtune_css_free, + .can_attach = schedtune_can_attach, + .cancel_attach = schedtune_cancel_attach, .legacy_cftypes = files, .early_init = 1, }; @@ -306,6 +520,8 @@ schedtune_init_cgroups(void) pr_info("schedtune: configured to support %d boost groups\n", boostgroups_max); + + schedtune_initialized = true; } #endif /* CONFIG_CGROUP_SCHED_TUNE */ diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index e936b91..ae7dccf 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -14,14 +14,27 @@ extern struct reciprocal_value schedtune_spc_rdiv; int schedtune_cpu_boost(int cpu); +void schedtune_exit_task(struct task_struct *tsk); + +void schedtune_enqueue_task(struct task_struct *p, int cpu); +void schedtune_dequeue_task(struct task_struct *p, int cpu); + #else /* CONFIG_CGROUP_SCHED_TUNE */ #define schedtune_cpu_boost(cpu) get_sysctl_sched_cfs_boost() +#define schedtune_enqueue_task(task, cpu) do { } while (0) +#define schedtune_dequeue_task(task, cpu) do { } while (0) +#define schedtune_exit_task(task) do { } while (0) + #endif /* 
CONFIG_CGROUP_SCHED_TUNE */ #else /* CONFIG_SCHED_TUNE */ #define schedtune_cpu_boost(cpu) 0 +#define schedtune_enqueue_task(task, cpu) do { } while (0) +#define schedtune_dequeue_task(task, cpu) do { } while (0) +#define schedtune_exit_task(task) do { } while (0) + #endif /* CONFIG_SCHED_TUNE */

[RFC,v2,7/8] sched/{fair,tune}: track RUNNABLE tasks impact on per CPU boost value

Commit Message

Patch