
[v2,1/3] sched/fair: Call newidle_balance() from balance_callback on PREEMPT_RT

Message ID 20210428232821.2506201-2-swood@redhat.com
State New
Series newidle_balance() PREEMPT_RT latency mitigations

Commit Message

Crystal Wood April 28, 2021, 11:28 p.m. UTC
This is required in order to be able to enable interrupts in the next
patch.  This is limited to PREEMPT_RT to avoid adding potentially
measurable overhead to the non-RT case (requiring a double switch when
pulling a task onto a newly idle cpu).

update_misfit_status() is factored out for the PREEMPT_RT case, to ensure
that the misfit status is kept consistent before dropping the lock.

Signed-off-by: Scott Wood <swood@redhat.com>
---
v2: Use a balance callback, and limit to PREEMPT_RT

 kernel/sched/fair.c | 34 ++++++++++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 4 deletions(-)
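For reference, the balance-callback mechanism this patch builds on: queue_balance_callback() in kernel/sched/sched.h chains a callback_head onto rq->balance_callback while the rq lock is held, and the scheduler core runs the queued callbacks once it has finished switching to the next task. A simplified sketch, paraphrased from the kernel of this era (~v5.12) rather than quoted from it:

/*
 * Simplified sketch (paraphrased, not the exact upstream code) of the
 * balance-callback mechanism used by this patch.
 */
static inline void
queue_balance_callback(struct rq *rq, struct callback_head *head,
		       void (*func)(struct rq *rq))
{
	lockdep_assert_held(&rq->lock);

	/* Already queued on this CPU; nothing to do. */
	if (unlikely(head->next))
		return;

	head->func = (void (*)(struct callback_head *))func;
	head->next = rq->balance_callback;
	rq->balance_callback = head;
}

/*
 * Run and clear the queued callbacks; roughly what __balance_callbacks()
 * in kernel/sched/core.c does once __schedule() has picked the next task,
 * still under the rq lock.
 */
static void run_balance_callbacks(struct rq *rq)
{
	struct callback_head *head = rq->balance_callback;

	rq->balance_callback = NULL;

	while (head) {
		void (*func)(struct rq *rq) = (void (*)(struct rq *))head->func;
		struct callback_head *next = head->next;

		head->next = NULL;
		head = next;

		func(rq);	/* e.g. newidle_balance_cb() from this patch */
	}
}

Because the callbacks only run after the CPU has already switched to the idle task, pulling a task from a callback costs an extra context switch on the newly idle CPU; that is the "double switch" the commit message cites as the reason for limiting this to PREEMPT_RT.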

Comments

Vincent Guittot May 5, 2021, 12:13 p.m. UTC | #1
On Thu, 29 Apr 2021 at 01:28, Scott Wood <swood@redhat.com> wrote:
>
> This is required in order to be able to enable interrupts in the next
> patch.  This is limited to PREEMPT_RT to avoid adding potentially
> measurable overhead to the non-RT case (requiring a double switch when
> pulling a task onto a newly idle cpu).

IIUC, only newidle_balance() is a problem, not the idle load balance
that runs from softirq. In that case, why not skip newidle_balance()
entirely on PREEMPT_RT and kick an idle load balance instead, since you
are switching to the idle thread context anyway?

Valentin Schneider May 7, 2021, 3:19 p.m. UTC | #2
On 05/05/21 14:13, Vincent Guittot wrote:
> On Thu, 29 Apr 2021 at 01:28, Scott Wood <swood@redhat.com> wrote:
>>
>> This is required in order to be able to enable interrupts in the next
>> patch.  This is limited to PREEMPT_RT to avoid adding potentially
>> measurable overhead to the non-RT case (requiring a double switch when
>> pulling a task onto a newly idle cpu).
>
> IIUC, only newidle_balance() is a problem, not the idle load balance
> that runs from softirq. In that case, why not skip newidle_balance()
> entirely on PREEMPT_RT and kick an idle load balance instead, since you
> are switching to the idle thread context anyway?
>

So if I follow you, that would be along the lines of having PREEMPT_RT turn
newidle_balance() into:

        rq->idle_balance = CPU_IDLE;
        rq->next_balance = jiffies;
        trigger_load_balance(rq);

which I'm thinking isn't too crazy.
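
For context on the snippet above: trigger_load_balance() is the hook normally called from the scheduler tick, and setting rq->next_balance = jiffies makes its time check pass immediately. Roughly (paraphrased from kernel/sched/fair.c of this era, not quoted verbatim):

void trigger_load_balance(struct rq *rq)
{
	/* No rebalancing while attached to the NULL domain. */
	if (unlikely(on_null_domain(rq)))
		return;

	/* With rq->next_balance = jiffies this fires right away. */
	if (time_after_eq(jiffies, rq->next_balance))
		raise_softirq(SCHED_SOFTIRQ);

	nohz_balancer_kick(rq);
}

On PREEMPT_RT, SCHED_SOFTIRQ work runs in thread context rather than on the interrupt return path, so the balancing would happen outside the rq-lock, interrupts-off region this series is trying to shorten, which appears to be the point of Vincent's suggestion.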

Patch

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 794c2cb945f8..ff369c38a5b5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5660,6 +5660,9 @@  static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 #ifdef CONFIG_SMP
 
+static const bool newidle_balance_in_callback = IS_ENABLED(CONFIG_PREEMPT_RT);
+static DEFINE_PER_CPU(struct callback_head, rebalance_head);
+
 /* Working cpumask for: load_balance, load_balance_newidle. */
 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
 DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
@@ -10549,7 +10552,7 @@  static inline void nohz_newidle_balance(struct rq *this_rq) { }
  *     0 - failed, no new tasks
  *   > 0 - success, new (fair) tasks present
  */
-static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
+static int do_newidle_balance(struct rq *this_rq, struct rq_flags *rf)
 {
 	unsigned long next_balance = jiffies + HZ;
 	int this_cpu = this_rq->cpu;
@@ -10557,7 +10560,9 @@  static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
 	int pulled_task = 0;
 	u64 curr_cost = 0;
 
-	update_misfit_status(NULL, this_rq);
+	if (!newidle_balance_in_callback)
+		update_misfit_status(NULL, this_rq);
+
 	/*
 	 * We must set idle_stamp _before_ calling idle_balance(), such that we
 	 * measure the duration of idle_balance() as idle time.
@@ -10576,7 +10581,8 @@  static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
 	 * further scheduler activity on it and we're being very careful to
 	 * re-start the picking loop.
 	 */
-	rq_unpin_lock(this_rq, rf);
+	if (!newidle_balance_in_callback)
+		rq_unpin_lock(this_rq, rf);
 
 	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
 	    !READ_ONCE(this_rq->rd->overload)) {
@@ -10655,11 +10661,31 @@  static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
 	if (pulled_task)
 		this_rq->idle_stamp = 0;
 
-	rq_repin_lock(this_rq, rf);
+	if (!newidle_balance_in_callback)
+		rq_repin_lock(this_rq, rf);
 
 	return pulled_task;
 }
 
+static void newidle_balance_cb(struct rq *this_rq)
+{
+	update_rq_clock(this_rq);
+	do_newidle_balance(this_rq, NULL);
+}
+
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
+{
+	if (newidle_balance_in_callback) {
+		update_misfit_status(NULL, this_rq);
+		queue_balance_callback(this_rq,
+				       &per_cpu(rebalance_head, this_rq->cpu),
+				       newidle_balance_cb);
+		return 0;
+	}
+
+	return do_newidle_balance(this_rq, rf);
+}
+
 /*
  * run_rebalance_domains is triggered when needed from the scheduler tick.
  * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
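
One design note on the patch above: IS_ENABLED(CONFIG_PREEMPT_RT) (include/linux/kconfig.h) expands to a compile-time 0 or 1, so newidle_balance_in_callback is a constant and the new "if (!newidle_balance_in_callback)" checks should be folded away on non-RT builds. Illustration only (not code from the patch), showing what the wrapper reduces to with CONFIG_PREEMPT_RT unset:

static const bool newidle_balance_in_callback = 0;	/* IS_ENABLED() -> 0 */

static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
{
	/*
	 * The callback branch is dead code here, so after constant folding
	 * the non-RT path is a direct call, just as before the patch.
	 */
	return do_newidle_balance(this_rq, rf);
}

This is consistent with the commit message's claim that the added overhead is confined to PREEMPT_RT.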