@@ -26,6 +26,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
static struct task_struct *
pick_next_task_idle(struct rq *rq, struct task_struct *prev)
{
+ idle_enter_rt(rq);
+
put_prev_task(rq, prev);
schedstat_inc(rq, sched_goidle);
@@ -47,6 +49,7 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
+ idle_exit_rt(rq); /* clear this cpu's CPUPRI_IDLE state, see idle_exit_rt() */
idle_exit_fair(rq);
rq_last_tick_reset(rq);
}
@@ -992,6 +992,27 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq)
#if defined CONFIG_SMP
+/* Entering idle: move this cpu from CPUPRI_NORMAL to CPUPRI_IDLE in cpupri. */
+void idle_enter_rt(struct rq *this_rq)
+{
+	struct cpupri *cp = &this_rq->rd->cpupri;
+
+	/* Any other state is unexpected: warn once, keep cpupri, don't BUG(). */
+	if (!WARN_ON_ONCE(cp->cpu_to_pri[this_rq->cpu] != CPUPRI_NORMAL))
+		cpupri_set(cp, this_rq->cpu, MAX_PRIO);
+}
+
+/* Leaving idle: put this cpu back to CPUPRI_NORMAL if still CPUPRI_IDLE. */
+void idle_exit_rt(struct rq *this_rq)
+{
+	struct cpupri *pri_map = &this_rq->rd->cpupri;
+
+	/* RT tasks may have been queued meanwhile; keep what they recorded. */
+	if (pri_map->cpu_to_pri[this_rq->cpu] != CPUPRI_IDLE)
+		return;
+	cpupri_set(pri_map, this_rq->cpu, MAX_RT_PRIO);
+}
+
static void
inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
@@ -1162,11 +1162,17 @@ extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void trigger_load_balance(struct rq *rq);
+extern void idle_enter_rt(struct rq *this_rq);
+extern void idle_exit_rt(struct rq *this_rq);
+
extern void idle_enter_fair(struct rq *this_rq);
extern void idle_exit_fair(struct rq *this_rq);
#else
+static inline void idle_enter_rt(struct rq *rq) { } /* no-op stub for the #else branch */
+static inline void idle_exit_rt(struct rq *rq) { } /* no-op stub for the #else branch */
+
static inline void idle_enter_fair(struct rq *rq) { }
static inline void idle_exit_fair(struct rq *rq) { }
When a runqueue runs out of RT tasks, it may either still have non-RT tasks or have no tasks at all (idle). Currently, RT balancing treats the two cases identically and only manipulates cpupri.pri_to_cpu[CPUPRI_NORMAL], which may cause problems. For instance, on a 4-cpu system, non-RT task1 is running on cpu0, RT task2 is running on cpu3, and cpu1/cpu2 are both idle. When RT task3 (usually CPU-intensive) is then woken up or created on cpu3, it will be placed on cpu0 (see find_lowest_rq()), starving task1 until CFS load balancing moves task1 to another cpu, or, even worse, task1 keeps starving if it is bound to cpu0. It would therefore be more reasonable to place task3 on cpu1 or cpu2, which are idle (even though doing so may break an energy-saving idle state). This patch tackles the problem by updating pri_to_cpu[CPUPRI_IDLE] of cpupri as the cpu enters and exits the idle task, so that when pushing or selecting RT tasks through find_lowest_rq(), it will try to pick an idle cpu as the target. Signed-off-by: pang.xunlei <pang.xunlei@linaro.org> --- kernel/sched/idle_task.c | 3 +++ kernel/sched/rt.c | 21 +++++++++++++++++++++ kernel/sched/sched.h | 6 ++++++ 3 files changed, 30 insertions(+)