[v3,10/24] sched/fair: Use IPCC scores to select a busiest runqueue

Message ID 20230207051105.11575-11-ricardo.neri-calderon@linux.intel.com
State New
Series sched: Introduce classes of tasks for load balance

Commit Message

Ricardo Neri Feb. 7, 2023, 5:10 a.m. UTC
For two runqueues of equal priority and equal number of running tasks,
select the one whose current task would have the highest IPC class score
if placed on the destination CPU.
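
For example (scores invented for illustration): if the current task of
runqueue A would score 80 on the destination CPU versus 60 on its own CPU
(a delta of +20), while the current task of runqueue B would score 70
versus 65 (a delta of +5), runqueue A is picked as busiest, since its
current task gains more from running on the destination CPU.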

For now, use IPCC scores only for scheduling domains with the
SD_ASYM_PACKING flag.

Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Len Brown <len.brown@intel.com>
Cc: Lukasz Luba <lukasz.luba@arm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-pm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
---
Changes since v2:
 * Only use IPCC scores to break ties if the sched domain uses
   asym_packing. (Ionela)
 * Handle errors of arch_get_ipcc_score(). (Ionela)

Changes since v1:
 * Fixed a bug when selecting a busiest runqueue: when comparing two
   runqueues with equal nr_running, we must compute the IPCC score delta
   of both.
 * Renamed local variables to improve the layout of the code block.
   (PeterZ)
 * Used the new interface names.
---
 kernel/sched/fair.c | 64 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

Comments

Vincent Guittot March 31, 2023, 12:23 p.m. UTC | #1
On Thu, 30 Mar 2023 at 04:03, Ricardo Neri
<ricardo.neri-calderon@linux.intel.com> wrote:
>
> On Tue, Mar 28, 2023 at 12:03:58PM +0200, Vincent Guittot wrote:
> > On Tue, 7 Feb 2023 at 06:01, Ricardo Neri
> > <ricardo.neri-calderon@linux.intel.com> wrote:
> > >
> > > For two runqueues of equal priority and equal number of running of tasks,
> > > select the one whose current task would have the highest IPC class score
> > > if placed on the destination CPU.
> >
> > You failed to explain why it make sense to compare current task score
> > whereas we will most probably not pull this task at the end
>
> Thank you for your feedback Vincent! Please kindly refer to my reply to
> your feedback in patch 7.
>
> > >
> > > For now, use IPCC scores only for scheduling domains with the
> > > SD_ASYM_PACKING flag.
> > >
> > > Cc: Ben Segall <bsegall@google.com>
> > > Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
> > > Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
> > > Cc: Ionela Voinescu <ionela.voinescu@arm.com>
> > > Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
> > > Cc: Len Brown <len.brown@intel.com>
> > > Cc: Lukasz Luba <lukasz.luba@arm.com>
> > > Cc: Mel Gorman <mgorman@suse.de>
> > > Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> > > Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
> > > Cc: Steven Rostedt <rostedt@goodmis.org>
> > > Cc: Tim C. Chen <tim.c.chen@intel.com>
> > > Cc: Valentin Schneider <vschneid@redhat.com>
> > > Cc: x86@kernel.org
> > > Cc: linux-pm@vger.kernel.org
> > > Cc: linux-kernel@vger.kernel.org
> > > Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
> > > ---
> > > Changes since v2:
> > >  * Only use IPCC scores to break ties if the sched domain uses
> > >    asym_packing. (Ionela)
> > >  * Handle errors of arch_get_ipcc_score(). (Ionela)
> > >
> > > Changes since v1:
> > >  * Fixed a bug when selecting a busiest runqueue: when comparing two
> > >    runqueues with equal nr_running, we must compute the IPCC score delta
> > >    of both.
> > >  * Renamed local variables to improve the layout of the code block.
> > >    (PeterZ)
> > >  * Used the new interface names.
> > > ---
> > >  kernel/sched/fair.c | 64 +++++++++++++++++++++++++++++++++++++++++++++
> > >  1 file changed, 64 insertions(+)
> > >
> > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > index 72d88270b320..d3c22dc145f7 100644
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -9399,6 +9399,37 @@ static bool sched_asym_ipcc_pick(struct sched_group *a,
> > >         return sched_asym_ipcc_prefer(a_stats, b_stats);
> > >  }
> > >
> > > +/**
> > > + * ipcc_score_delta - Get the IPCC score delta wrt the load balance's dst_cpu
> > > + * @p:         A task
> > > + * @env:       Load balancing environment
> > > + *
> > > + * Returns: The IPCC score delta that @p would get if placed in the destination
> > > + * CPU of @env. LONG_MIN to indicate that the delta should not be used.
> > > + */
> > > +static long ipcc_score_delta(struct task_struct *p, struct lb_env *env)
> > > +{
> > > +       unsigned long score_src, score_dst;
> > > +       unsigned short ipcc = p->ipcc;
> > > +
> > > +       if (!sched_ipcc_enabled())
> > > +               return LONG_MIN;
> > > +
> > > +       /* Only asym_packing uses IPCC scores at the moment. */
> > > +       if (!(env->sd->flags & SD_ASYM_PACKING))
> > > +               return LONG_MIN;
> > > +
> > > +       score_dst = arch_get_ipcc_score(ipcc, env->dst_cpu);
> > > +       if (IS_ERR_VALUE(score_dst))
> > > +               return LONG_MIN;
> > > +
> > > +       score_src = arch_get_ipcc_score(ipcc, task_cpu(p));
> > > +       if (IS_ERR_VALUE(score_src))
> > > +               return LONG_MIN;
> > > +
> > > +       return score_dst - score_src;
> > > +}
> > > +
> > >  #else /* CONFIG_IPC_CLASSES */
> > >  static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs,
> > >                                     struct rq *rq)
> > > @@ -9429,6 +9460,11 @@ static bool sched_asym_ipcc_pick(struct sched_group *a,
> > >         return false;
> > >  }
> > >
> > > +static long ipcc_score_delta(struct task_struct *p, struct lb_env *env)
> > > +{
> > > +       return LONG_MIN;
> > > +}
> > > +
> > >  #endif /* CONFIG_IPC_CLASSES */
> > >
> > >  /**
> > > @@ -10589,6 +10625,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
> > >  {
> > >         struct rq *busiest = NULL, *rq;
> > >         unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
> > > +       long busiest_ipcc_delta = LONG_MIN;
> > >         unsigned int busiest_nr = 0;
> > >         int i;
> > >
> > > @@ -10705,8 +10742,35 @@ static struct rq *find_busiest_queue(struct lb_env *env,
> > >
> > >                 case migrate_task:
> > >                         if (busiest_nr < nr_running) {
> > > +                               struct task_struct *curr;
> > > +
> > >                                 busiest_nr = nr_running;
> > >                                 busiest = rq;
> > > +
> > > +                               /*
> > > +                                * Remember the IPCC score delta of busiest::curr.
> > > +                                * We may need it to break a tie with other queues
> > > +                                * with equal nr_running.
> > > +                                */
> > > +                               curr = rcu_dereference(busiest->curr);
> > > +                               busiest_ipcc_delta = ipcc_score_delta(curr, env);
> >
> > Hmm, i don't like this at all
> >
> > Also, curr is the least probable task to be pulled which means that
> > all this his useless
>
> but when doing asym_packing balancing nr_running = 1, need_active_balance()
> returns true and we will pull the current task, no? This is also true for
> fully_busy groups with one task per CPU. These are the only two cases that
> currently use IPCC scores.

Hmm, for sure it's not true for fully_busy, and I don't see anything
about asym_packing mandating that nr_running = 1.

You should have a look at misfit task, which seems to better fit your
situation where you have one task that doesn't fit its CPU, instead of
adding such a condition.
>
> If there are more than one tasks in the runqueue, the group will be
> classified as overloaded and we will not use IPCC scores nor active
> balance.
>
> Thanks and BR,
> Ricardo
Ricardo Neri April 17, 2023, 11:01 p.m. UTC | #2
On Fri, Mar 31, 2023 at 02:23:58PM +0200, Vincent Guittot wrote:
> On Thu, 30 Mar 2023 at 04:03, Ricardo Neri
> <ricardo.neri-calderon@linux.intel.com> wrote:
> >
> > On Tue, Mar 28, 2023 at 12:03:58PM +0200, Vincent Guittot wrote:
> > > On Tue, 7 Feb 2023 at 06:01, Ricardo Neri
> > > <ricardo.neri-calderon@linux.intel.com> wrote:
> > > >
> > > > For two runqueues of equal priority and equal number of running of tasks,
> > > > select the one whose current task would have the highest IPC class score
> > > > if placed on the destination CPU.
> > >
> > > You failed to explain why it make sense to compare current task score
> > > whereas we will most probably not pull this task at the end
> >
> > Thank you for your feedback Vincent! Please kindly refer to my reply to
> > your feedback in patch 7.
> >
> > > >
> > > > For now, use IPCC scores only for scheduling domains with the
> > > > SD_ASYM_PACKING flag.
> > > >
> > > > Cc: Ben Segall <bsegall@google.com>
> > > > Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
> > > > Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
> > > > Cc: Ionela Voinescu <ionela.voinescu@arm.com>
> > > > Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
> > > > Cc: Len Brown <len.brown@intel.com>
> > > > Cc: Lukasz Luba <lukasz.luba@arm.com>
> > > > Cc: Mel Gorman <mgorman@suse.de>
> > > > Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> > > > Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
> > > > Cc: Steven Rostedt <rostedt@goodmis.org>
> > > > Cc: Tim C. Chen <tim.c.chen@intel.com>
> > > > Cc: Valentin Schneider <vschneid@redhat.com>
> > > > Cc: x86@kernel.org
> > > > Cc: linux-pm@vger.kernel.org
> > > > Cc: linux-kernel@vger.kernel.org
> > > > Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
> > > > ---
> > > > Changes since v2:
> > > >  * Only use IPCC scores to break ties if the sched domain uses
> > > >    asym_packing. (Ionela)
> > > >  * Handle errors of arch_get_ipcc_score(). (Ionela)
> > > >
> > > > Changes since v1:
> > > >  * Fixed a bug when selecting a busiest runqueue: when comparing two
> > > >    runqueues with equal nr_running, we must compute the IPCC score delta
> > > >    of both.
> > > >  * Renamed local variables to improve the layout of the code block.
> > > >    (PeterZ)
> > > >  * Used the new interface names.
> > > > ---
> > > >  kernel/sched/fair.c | 64 +++++++++++++++++++++++++++++++++++++++++++++
> > > >  1 file changed, 64 insertions(+)
> > > >
> > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > > index 72d88270b320..d3c22dc145f7 100644
> > > > --- a/kernel/sched/fair.c
> > > > +++ b/kernel/sched/fair.c
> > > > @@ -9399,6 +9399,37 @@ static bool sched_asym_ipcc_pick(struct sched_group *a,
> > > >         return sched_asym_ipcc_prefer(a_stats, b_stats);
> > > >  }
> > > >
> > > > +/**
> > > > + * ipcc_score_delta - Get the IPCC score delta wrt the load balance's dst_cpu
> > > > + * @p:         A task
> > > > + * @env:       Load balancing environment
> > > > + *
> > > > + * Returns: The IPCC score delta that @p would get if placed in the destination
> > > > + * CPU of @env. LONG_MIN to indicate that the delta should not be used.
> > > > + */
> > > > +static long ipcc_score_delta(struct task_struct *p, struct lb_env *env)
> > > > +{
> > > > +       unsigned long score_src, score_dst;
> > > > +       unsigned short ipcc = p->ipcc;
> > > > +
> > > > +       if (!sched_ipcc_enabled())
> > > > +               return LONG_MIN;
> > > > +
> > > > +       /* Only asym_packing uses IPCC scores at the moment. */
> > > > +       if (!(env->sd->flags & SD_ASYM_PACKING))
> > > > +               return LONG_MIN;
> > > > +
> > > > +       score_dst = arch_get_ipcc_score(ipcc, env->dst_cpu);
> > > > +       if (IS_ERR_VALUE(score_dst))
> > > > +               return LONG_MIN;
> > > > +
> > > > +       score_src = arch_get_ipcc_score(ipcc, task_cpu(p));
> > > > +       if (IS_ERR_VALUE(score_src))
> > > > +               return LONG_MIN;
> > > > +
> > > > +       return score_dst - score_src;
> > > > +}
> > > > +
> > > >  #else /* CONFIG_IPC_CLASSES */
> > > >  static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs,
> > > >                                     struct rq *rq)
> > > > @@ -9429,6 +9460,11 @@ static bool sched_asym_ipcc_pick(struct sched_group *a,
> > > >         return false;
> > > >  }
> > > >
> > > > +static long ipcc_score_delta(struct task_struct *p, struct lb_env *env)
> > > > +{
> > > > +       return LONG_MIN;
> > > > +}
> > > > +
> > > >  #endif /* CONFIG_IPC_CLASSES */
> > > >
> > > >  /**
> > > > @@ -10589,6 +10625,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
> > > >  {
> > > >         struct rq *busiest = NULL, *rq;
> > > >         unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
> > > > +       long busiest_ipcc_delta = LONG_MIN;
> > > >         unsigned int busiest_nr = 0;
> > > >         int i;
> > > >
> > > > @@ -10705,8 +10742,35 @@ static struct rq *find_busiest_queue(struct lb_env *env,
> > > >
> > > >                 case migrate_task:
> > > >                         if (busiest_nr < nr_running) {
> > > > +                               struct task_struct *curr;
> > > > +
> > > >                                 busiest_nr = nr_running;
> > > >                                 busiest = rq;
> > > > +
> > > > +                               /*
> > > > +                                * Remember the IPCC score delta of busiest::curr.
> > > > +                                * We may need it to break a tie with other queues
> > > > +                                * with equal nr_running.
> > > > +                                */
> > > > +                               curr = rcu_dereference(busiest->curr);
> > > > +                               busiest_ipcc_delta = ipcc_score_delta(curr, env);
> > >
> > > Hmm, i don't like this at all
> > >
> > > Also, curr is the least probable task to be pulled which means that
> > > all this his useless
> >
> > but when doing asym_packing balancing nr_running = 1, need_active_balance()
> > returns true and we will pull the current task, no? This is also true for
> > fully_busy groups with one task per CPU. These are the only two cases that
> > currently use IPCC scores.
> 
> Hmm, for sure it's not true for fully_busy, and I don't see anything
> about asym_packing mandating that nr_running = 1.

I meant to say that if nr_running = 1, we would pull the running task
because there is nothing else to pull.

Just as when identifying the busiest group, we can instead break the tie
using the task at the back of the runqueue. We will start with this task
when migrating tasks.
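
A minimal sketch of that idea, for illustration only (this is not part of
the posted series, the helper name is made up, and it glosses over locking:
find_busiest_queue() holds only the RCU read lock, not the rq lock):

/*
 * Illustrative sketch: score the task at the tail of rq->cfs_tasks,
 * i.e. the first candidate detach_tasks() would try to pull, instead
 * of rq->curr.
 */
static long ipcc_score_delta_tail(struct rq *rq, struct lb_env *env)
{
	struct task_struct *p;

	if (list_empty(&rq->cfs_tasks))
		return LONG_MIN;

	p = list_last_entry(&rq->cfs_tasks, struct task_struct, se.group_node);

	return ipcc_score_delta(p, env);
}

The migrate_task case in find_busiest_queue() would then compute and
compare this delta instead of the delta of rq->curr.
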

> 
> You should have a look at misfit task, which seems to better fit your
> situation where you have one task that doesn't fit its CPU, instead of
> adding such a condition.

Thank you for the suggestion! I did take a look. When dealing with misfit
tasks, we identify one task that is too big for a small CPU. The
destination CPU is not needed when updating the misfit status.

On the other hand, identifying an “IPCC-misfit” task (similar to
update_misfit_status()) would require knowing the destination CPU, of which
there may be more than one type. We could compute the IPCC-misfit status
for all prospective destination CPU types, but that may introduce a
non-trivial overhead and would be too complex for a tie breaker, IMO.
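
To make the overhead concrete, here is a hypothetical sketch of such a check
(the helper is invented for illustration and is not in the series; it loops
over CPUs rather than CPU types for simplicity):

/*
 * Hypothetical "IPCC misfit" check: one arch_get_ipcc_score() call per
 * prospective destination CPU every time the misfit status is updated.
 */
static bool task_is_ipcc_misfit(struct task_struct *p, int src_cpu)
{
	unsigned long src_score = arch_get_ipcc_score(p->ipcc, src_cpu);
	int cpu;

	if (IS_ERR_VALUE(src_score))
		return false;

	for_each_online_cpu(cpu) {
		unsigned long score = arch_get_ipcc_score(p->ipcc, cpu);

		if (!IS_ERR_VALUE(score) && score > src_score)
			return true;
	}

	return false;
}

Even reduced to one lookup per CPU type, this would have to run wherever the
misfit status is updated (e.g. from the scheduler tick), which is the
overhead referred to above.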

Unlike migrate_misfit, we would not take immediate action but use IPCC
scores to improve the selection of the busiest runqueue.

Patch

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 72d88270b320..d3c22dc145f7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9399,6 +9399,37 @@  static bool sched_asym_ipcc_pick(struct sched_group *a,
 	return sched_asym_ipcc_prefer(a_stats, b_stats);
 }
 
+/**
+ * ipcc_score_delta - Get the IPCC score delta wrt the load balance's dst_cpu
+ * @p:		A task
+ * @env:	Load balancing environment
+ *
+ * Returns: The IPCC score delta that @p would get if placed in the destination
+ * CPU of @env. LONG_MIN to indicate that the delta should not be used.
+ */
+static long ipcc_score_delta(struct task_struct *p, struct lb_env *env)
+{
+	unsigned long score_src, score_dst;
+	unsigned short ipcc = p->ipcc;
+
+	if (!sched_ipcc_enabled())
+		return LONG_MIN;
+
+	/* Only asym_packing uses IPCC scores at the moment. */
+	if (!(env->sd->flags & SD_ASYM_PACKING))
+		return LONG_MIN;
+
+	score_dst = arch_get_ipcc_score(ipcc, env->dst_cpu);
+	if (IS_ERR_VALUE(score_dst))
+		return LONG_MIN;
+
+	score_src = arch_get_ipcc_score(ipcc, task_cpu(p));
+	if (IS_ERR_VALUE(score_src))
+		return LONG_MIN;
+
+	return score_dst - score_src;
+}
+
 #else /* CONFIG_IPC_CLASSES */
 static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs,
 				    struct rq *rq)
@@ -9429,6 +9460,11 @@  static bool sched_asym_ipcc_pick(struct sched_group *a,
 	return false;
 }
 
+static long ipcc_score_delta(struct task_struct *p, struct lb_env *env)
+{
+	return LONG_MIN;
+}
+
 #endif /* CONFIG_IPC_CLASSES */
 
 /**
@@ -10589,6 +10625,7 @@  static struct rq *find_busiest_queue(struct lb_env *env,
 {
 	struct rq *busiest = NULL, *rq;
 	unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
+	long busiest_ipcc_delta = LONG_MIN;
 	unsigned int busiest_nr = 0;
 	int i;
 
@@ -10705,8 +10742,35 @@  static struct rq *find_busiest_queue(struct lb_env *env,
 
 		case migrate_task:
 			if (busiest_nr < nr_running) {
+				struct task_struct *curr;
+
 				busiest_nr = nr_running;
 				busiest = rq;
+
+				/*
+				 * Remember the IPCC score delta of busiest::curr.
+				 * We may need it to break a tie with other queues
+				 * with equal nr_running.
+				 */
+				curr = rcu_dereference(busiest->curr);
+				busiest_ipcc_delta = ipcc_score_delta(curr, env);
+			/*
+			 * If rq and busiest have the same number of running
+			 * tasks and IPC classes are supported, pick rq if doing
+			 * so would give rq::curr a bigger IPC boost on dst_cpu.
+			 */
+			} else if (busiest_nr == nr_running) {
+				struct task_struct *curr;
+				long delta;
+
+				curr = rcu_dereference(rq->curr);
+				delta = ipcc_score_delta(curr, env);
+
+				if (busiest_ipcc_delta < delta) {
+					busiest_ipcc_delta = delta;
+					busiest_nr = nr_running;
+					busiest = rq;
+				}
 			}
 			break;