@@ -445,6 +445,16 @@ struct util_est {
#define UTIL_AVG_UNCHANGED 0x80000000
} __attribute__((__aligned__(sizeof(u64))));
+/*
+ * For sched_setattr_nocheck() (kernel) only
+ *
+ * Allow vCPU threads to use UTIL_GUEST to give the scheduler more accurate
+ * utilization info. This is useful when guest kernels have some way of
+ * tracking their own runqueues' utilization.
+ *
+ */
+#define SCHED_FLAG_UTIL_GUEST 0x20000000
+
/*
* The load/runnable/util_avg accumulates an infinite geometric series
* (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
@@ -499,6 +509,7 @@ struct sched_avg {
unsigned long load_avg;
unsigned long runnable_avg;
unsigned long util_avg;
+ unsigned long util_guest;
struct util_est util_est;
} ____cacheline_aligned;
@@ -2024,6 +2024,16 @@ static inline void uclamp_post_fork(struct task_struct *p) { }
static inline void init_uclamp(void) { }
#endif /* CONFIG_UCLAMP_TASK */
+/*
+ * Record the guest-provided utilization hint carried in @attr on task @p.
+ *
+ * Does nothing unless SCHED_FLAG_UTIL_GUEST is set in @attr->sched_flags.
+ * When it is set, @attr->sched_util_min is stored in p->se.avg.util_guest,
+ * capped at SCHED_CAPACITY_SCALE.
+ */
+static void __setscheduler_task_util(struct task_struct *p,
+				     const struct sched_attr *attr)
+{
+	unsigned long util;
+
+	if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_GUEST)))
+		return;
+
+	/*
+	 * Nothing in this helper bounds sched_util_min, and the stored value
+	 * is later combined with util_avg via max(). Cap it so an oversized
+	 * hint cannot report more than SCHED_CAPACITY_SCALE.
+	 * NOTE(review): assumes callers do not rely on values above the
+	 * capacity scale -- confirm against the sched_setattr_nocheck() users.
+	 */
+	util = min_t(unsigned long, attr->sched_util_min, SCHED_CAPACITY_SCALE);
+
+	/* util_guest is read with READ_ONCE(); mark the store to match. */
+	WRITE_ONCE(p->se.avg.util_guest, util);
+}
+
bool sched_task_on_rq(struct task_struct *p)
{
return task_on_rq_queued(p);
@@ -7561,7 +7571,7 @@ static int __sched_setscheduler(struct task_struct *p,
return -EINVAL;
}
- if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
+ if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV | SCHED_FLAG_UTIL_GUEST))
return -EINVAL;
/*
@@ -7583,6 +7593,9 @@ static int __sched_setscheduler(struct task_struct *p,
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return -EINVAL;
+ if (attr->sched_flags & SCHED_FLAG_UTIL_GUEST)
+ return -EINVAL;
+
retval = security_task_setscheduler(p);
if (retval)
return retval;
@@ -7629,6 +7642,8 @@ static int __sched_setscheduler(struct task_struct *p,
goto change;
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
goto change;
+ if (attr->sched_flags & SCHED_FLAG_UTIL_GUEST)
+ goto change;
p->sched_reset_on_fork = reset_on_fork;
retval = 0;
@@ -7718,6 +7733,7 @@ static int __sched_setscheduler(struct task_struct *p,
__setscheduler_prio(p, newprio);
}
__setscheduler_uclamp(p, attr);
+ __setscheduler_task_util(p, attr);
if (queued) {
/*
@@ -4276,14 +4276,16 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
static inline unsigned long task_util(struct task_struct *p)
{
- return READ_ONCE(p->se.avg.util_avg);
+	/* Report whichever is larger: the tracked util_avg or the guest hint. */
+	unsigned long tracked_util = READ_ONCE(p->se.avg.util_avg);
+	unsigned long guest_util = READ_ONCE(p->se.avg.util_guest);
+
+	return max(tracked_util, guest_util);
}
static inline unsigned long _task_util_est(struct task_struct *p)
{
struct util_est ue = READ_ONCE(p->se.avg.util_est);
- return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
+	/* Estimated utilization, with the UTIL_AVG_UNCHANGED flag bit masked off. */
+	unsigned long estimated = max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
+
+	/* The guest hint acts as a floor on the estimate. */
+	return max_t(unsigned long, READ_ONCE(p->se.avg.util_guest), estimated);
}
static inline unsigned long task_util_est(struct task_struct *p)
@@ -6242,6 +6244,15 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
*/
util_est_enqueue(&rq->cfs, p);
+ /*
+ * The normal code path for host thread enqueue doesn't take guest
+ * task migrations into account when updating cpufreq util, so
+ * always trigger a cpufreq update when a vCPU thread has a
+ * non-zero util_guest value.
+ */
+ if (READ_ONCE(p->se.avg.util_guest))
+ cpufreq_update_util(rq, 0);
+
/*
* If in_iowait is set, the code below may not trigger any cpufreq
* utilization updates, so do it here explicitly with the IOWAIT flag