@@ -102,6 +102,15 @@ config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE
Be aware that not all cpufreq drivers support the conservative
governor. If unsure have a look at the help section of the
driver. Fallback governor will be the performance governor.
+
+config CPU_FREQ_DEFAULT_GOV_CFS
+ bool "cfs"
+ select CPU_FREQ_GOV_CFS
+ select CPU_FREQ_GOV_PERFORMANCE
+ help
+	  Use the CPUfreq governor 'cfs' as default. This governor scales
+	  cpu frequency from the scheduler as per-entity load tracking
+	  statistics are updated. Fallback governor will be the
+	  performance governor.
endchoice
config CPU_FREQ_GOV_PERFORMANCE
@@ -183,6 +192,21 @@ config CPU_FREQ_GOV_CONSERVATIVE
If in doubt, say N.
+config CPU_FREQ_GOV_CFS
+ tristate "'cfs' cpufreq governor"
+ depends on CPU_FREQ
+ select CPU_FREQ_GOV_COMMON
+ help
+ 'cfs' - this governor scales cpu frequency from the
+ scheduler as a function of cpu capacity utilization. It does
+ not evaluate utilization on a periodic basis (as ondemand
+ does) but instead is invoked from the completely fair
+ scheduler when updating per-entity load tracking statistics.
+ Latency to respond to changes in load is improved over polling
+ governors due to its event-driven design.
+
+ If in doubt, say N.
+
comment "CPU frequency scaling drivers"
config CPUFREQ_DT
@@ -485,6 +485,9 @@ extern struct cpufreq_governor cpufreq_gov_ondemand;
#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE)
extern struct cpufreq_governor cpufreq_gov_conservative;
#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_conservative)
+#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CFS)
+extern struct cpufreq_governor cpufreq_cfs;
+#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_cfs)
#endif
/*********************************************************************
@@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+obj-$(CONFIG_CPU_FREQ_GOV_CFS) += cpufreq_cfs.o
new file mode 100644
@@ -0,0 +1,311 @@
+/*
+ * Copyright (C) 2015 Michael Turquette <mturquette@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/cpufreq.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/percpu.h>
+#include <linux/irq_work.h>
+
+#include "sched.h"
+
+#define MARGIN_PCT 125 /* taken from imbalance_pct = 125 */
+#define THROTTLE_NSEC 50000000 /* 50ms default */
+
+/**
+ * gov_data - per-policy data internal to the governor
+ * @throttle: next throttling period expiry. Derived from throttle_nsec
+ * @throttle_nsec: throttle period length in nanoseconds
+ * @task: worker thread for dvfs transition that may block/sleep
+ * @irq_work: callback used to wake up worker thread
+ *
+ * struct gov_data is the per-policy cpufreq_cfs-specific data structure. A
+ * per-policy instance of it is created when the cpufreq_cfs governor receives
+ * the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data
+ * member of struct cpufreq_policy.
+ *
+ * Readers of this data must call down_read(policy->rwsem). Writers must
+ * call down_write(policy->rwsem).
+ */
+struct gov_data {
+ ktime_t throttle;
+ unsigned int throttle_nsec;
+ struct task_struct *task;
+ struct irq_work irq_work;
+ struct cpufreq_policy *policy;
+};
+
+/**
+ * cpufreq_cfs_select_freq - pick the next frequency for a cpu
+ * @policy: the cpufreq policy whose frequency may be changed
+ *
+ * cpufreq_cfs_select_freq selects a frequency based on pelt load statistics
+ * tracked by cfs. First it finds the most utilized cpu in the policy and then
+ * maps that utilization value onto a cpu frequency and returns it.
+ *
+ * Additionally, cpufreq_cfs_select_freq adds a margin to the cpu utilization value
+ * before converting it to a frequency. The margin is derived from MARGIN_PCT,
+ * which itself is inspired by imbalance_pct in cfs. This is needed to
+ * proactively increase frequency in the case of increasing load.
+ *
+ * This approach attempts to maintain headroom of 25% unutilized cpu capacity.
+ * A traditional way of doing this is to take 75% of the current capacity and
+ * check if current utilization crosses that threshold. The only problem with
+ * that approach is determining the next cpu frequency target if that threshold
+ * is crossed.
+ *
+ * Instead of using the 75% threshold, cpufreq_cfs_select_freq adds a 25%
+ * utilization margin to the utilization and converts that to a frequency. This
+ * removes conditional logic around checking thresholds and better supports
+ * drivers that use non-discretized frequency ranges (i.e. no pre-defined
+ * frequency tables or operating points).
+ *
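+ * For example (illustrative numbers only): with a max_usage of 800 out of
+ * a capacity_orig_of() of 1024, the margin yields 800 * 125 / 100 = 1000,
+ * and the returned target is 1000 * policy->max / 1024, i.e. roughly 98%
+ * of the policy's maximum frequency.
+ *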
+ * Returns frequency selected.
+ */
+static unsigned long cpufreq_cfs_select_freq(struct cpufreq_policy *policy)
+{
+ int cpu = 0;
+ struct gov_data *gd;
+ unsigned int max_usage = 0, usage = 0;
+
+ if (!policy->governor_data)
+ return 0;
+
+ gd = policy->governor_data;
+
+ /*
+ * get_cpu_usage is called without locking the runqueues. This is the
+ * same behavior used by find_busiest_cpu in load_balance. We are
+ * willing to accept occasionally stale data here in exchange for
+ * lockless behavior.
+ */
+ for_each_cpu(cpu, policy->cpus) {
+ usage = get_cpu_usage(cpu);
+ if (usage > max_usage)
+ max_usage = usage;
+ }
+
+ /* add margin to max_usage based on imbalance_pct */
+ max_usage = max_usage * MARGIN_PCT / 100;
+
+ cpu = cpumask_first(policy->cpus);
+
+ /* freq is current utilization + 25% */
+ return max_usage * policy->max / capacity_orig_of(cpu);
+}
+
+/*
+ * We pass in struct cpufreq_policy. This is safe because changing out the
+ * policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP),
+ * which tears down all of the data structures and __cpufreq_governor(policy,
+ * CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the
+ * new policy pointer.
+ */
+static int cpufreq_cfs_thread(void *data)
+{
+ struct sched_param param;
+ struct cpufreq_policy *policy;
+ struct gov_data *gd;
+ unsigned long freq;
+ int ret;
+
+ policy = (struct cpufreq_policy *) data;
+ if (!policy) {
+ pr_warn("%s: missing policy\n", __func__);
+ do_exit(-EINVAL);
+ }
+
+ gd = policy->governor_data;
+ if (!gd) {
+ pr_warn("%s: missing governor data\n", __func__);
+ do_exit(-EINVAL);
+ }
+
+ param.sched_priority = 50;
+ ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, &param);
+ if (ret) {
+ pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
+ do_exit(-EINVAL);
+ } else {
+ pr_debug("%s: kthread (%d) set to SCHED_FIFO\n",
+ __func__, gd->task->pid);
+ }
+
+ ret = set_cpus_allowed_ptr(gd->task, policy->related_cpus);
+ if (ret) {
+ pr_warn("%s: failed to set allowed ptr\n", __func__);
+ do_exit(-EINVAL);
+ }
+
+ /*
+ * main loop of the per-policy kthread: sleep until woken by the irq_work
+ * handler (or kthread_stop), then select and program a new frequency
+ */
+ do {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ if (kthread_should_stop())
+ break;
+
+ /* avoid race with cpufreq_cfs_stop */
+ if (!down_write_trylock(&policy->rwsem))
+ continue;
+
+ freq = cpufreq_cfs_select_freq(policy);
+
+ ret = __cpufreq_driver_target(policy, freq,
+ CPUFREQ_RELATION_L);
+ if (ret)
+ pr_debug("%s: __cpufreq_driver_target returned %d\n",
+ __func__, ret);
+
+ gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec);
+ up_write(&policy->rwsem);
+ } while (!kthread_should_stop());
+
+ do_exit(0);
+}
+
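+/*
+ * This handler runs in hard irq context (queued via irq_work_queue_on), so
+ * it must not block; it only wakes the per-policy kthread, which performs
+ * the potentially sleeping dvfs transition.
+ */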
+static void cpufreq_cfs_irq_work(struct irq_work *irq_work)
+{
+ struct gov_data *gd;
+
+ gd = container_of(irq_work, struct gov_data, irq_work);
+ if (!gd) {
+ return;
+ }
+
+ wake_up_process(gd->task);
+}
+
+/**
+ * cpufreq_cfs_update_cpu - interface to scheduler for changing capacity values
+ * @cpu: cpu whose capacity utilization has recently changed
+ *
+ * cpufreq_cfs_update_cpu is an interface exposed to the scheduler so that the
+ * scheduler may inform the governor of updates to capacity utilization and
+ * make changes to cpu frequency. Currently this interface is designed around
+ * PELT values in CFS. It can be expanded to other scheduling classes in the
+ * future if needed.
+ *
+ * cpufreq_cfs_update_cpu raises an IPI. The irq_work handler for that IPI wakes up
+ * the thread that does the actual work, cpufreq_cfs_thread.
+ */
+void cpufreq_cfs_update_cpu(int cpu)
+{
+ struct cpufreq_policy *policy;
+ struct gov_data *gd;
+
+ /* XXX put policy pointer in per-cpu data? */
+ policy = cpufreq_cpu_get(cpu);
+ if (IS_ERR_OR_NULL(policy)) {
+ return;
+ }
+
+ if (!policy->governor_data) {
+ goto out;
+ }
+
+ gd = policy->governor_data;
+
+ /* bail early if we are throttled */
+ if (ktime_before(ktime_get(), gd->throttle)) {
+ goto out;
+ }
+
+ irq_work_queue_on(&gd->irq_work, cpu);
+
+out:
+ cpufreq_cpu_put(policy);
+ return;
+}
+
+static void cpufreq_cfs_start(struct cpufreq_policy *policy)
+{
+ struct gov_data *gd;
+
+ /* prepare per-policy private data */
+ gd = kzalloc(sizeof(*gd), GFP_KERNEL);
+ if (!gd) {
+ pr_debug("%s: failed to allocate private data\n", __func__);
+ return;
+ }
+
+ /*
+ * Don't ask for freq changes at a higher rate than what
+ * the driver advertises as transition latency.
+ */
+ gd->throttle_nsec = policy->cpuinfo.transition_latency ?
+ policy->cpuinfo.transition_latency :
+ THROTTLE_NSEC;
+ pr_debug("%s: throttle threshold = %u [ns]\n",
+ __func__, gd->throttle_nsec);
+
+ /* init per-policy kthread */
+ gd->task = kthread_run(cpufreq_cfs_thread, policy, "kcpufreq_cfs_task");
+ if (IS_ERR_OR_NULL(gd->task)) {
+ pr_err("%s: failed to create kcpufreq_cfs_task thread\n", __func__);
+ kfree(gd);
+ return;
+ }
+
+ init_irq_work(&gd->irq_work, cpufreq_cfs_irq_work);
+ policy->governor_data = gd;
+ gd->policy = policy;
+}
+
+static void cpufreq_cfs_stop(struct cpufreq_policy *policy)
+{
+ struct gov_data *gd;
+
+ gd = policy->governor_data;
+
+ /* flush any pending irq_work before stopping the kthread and freeing gd */
+ irq_work_sync(&gd->irq_work);
+ kthread_stop(gd->task);
+
+ policy->governor_data = NULL;
+
+ /* FIXME replace with devm counterparts? */
+ kfree(gd);
+}
+
+static int cpufreq_cfs_setup(struct cpufreq_policy *policy, unsigned int event)
+{
+ switch (event) {
+ case CPUFREQ_GOV_START:
+ /* Start managing the frequency */
+ cpufreq_cfs_start(policy);
+ return 0;
+
+ case CPUFREQ_GOV_STOP:
+ cpufreq_cfs_stop(policy);
+ return 0;
+
+ case CPUFREQ_GOV_LIMITS: /* unused */
+ case CPUFREQ_GOV_POLICY_INIT: /* unused */
+ case CPUFREQ_GOV_POLICY_EXIT: /* unused */
+ break;
+ }
+ return 0;
+}
+
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_CFS
+static
+#endif
+struct cpufreq_governor cpufreq_cfs = {
+ .name = "cfs",
+ .governor = cpufreq_cfs_setup,
+ .owner = THIS_MODULE,
+};
+
+static int __init cpufreq_cfs_init(void)
+{
+ return cpufreq_register_governor(&cpufreq_cfs);
+}
+
+static void __exit cpufreq_cfs_exit(void)
+{
+ cpufreq_unregister_governor(&cpufreq_cfs);
+}
+
+/* Try to make this the default governor */
+fs_initcall(cpufreq_cfs_init);
+
+MODULE_LICENSE("GPL");
@@ -4257,6 +4257,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_rq_runnable_avg(rq, rq->nr_running);
add_nr_running(rq, 1);
}
+
+ if (sched_energy_freq())
+ cpufreq_cfs_update_cpu(cpu_of(rq));
+
hrtick_update(rq);
}
@@ -4318,6 +4322,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
sub_nr_running(rq, 1);
update_rq_runnable_avg(rq, 1);
}
+
+ if (sched_energy_freq())
+ cpufreq_cfs_update_cpu(cpu_of(rq));
+
hrtick_update(rq);
}
@@ -7789,6 +7797,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
task_tick_numa(rq, curr);
update_rq_runnable_avg(rq, 1);
+
+ if (sched_energy_freq())
+ cpufreq_cfs_update_cpu(cpu_of(rq));
}
/*
@@ -1429,6 +1429,12 @@ static inline int get_cpu_usage(int cpu)
return (usage * capacity) >> SCHED_LOAD_SHIFT;
}
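+/*
+ * cpufreq_cfs_update_cpu() is called from the fair class hooks added to
+ * enqueue_task_fair(), dequeue_task_fair() and task_tick_fair() whenever
+ * per-entity load tracking statistics are updated; the static inline stub
+ * compiles to a no-op when CONFIG_CPU_FREQ_GOV_CFS is not set.
+ */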
+#ifdef CONFIG_CPU_FREQ_GOV_CFS
+void cpufreq_cfs_update_cpu(int cpu);
+#else
+static inline void cpufreq_cfs_update_cpu(int cpu) {}
+#endif
+
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));