
[RFC,4/4] cpuset: Add cpusets.quiesce option

Message ID e643acf71086db239254d44c77d4966b2a4a01e5.1395322529.git.viresh.kumar@linaro.org
State: New

Commit Message

Viresh Kumar March 20, 2014, 1:49 p.m. UTC
For networking applications, platforms need to provide one CPU for each
user-space data-plane thread. These CPUs should not be interrupted by the
kernel at all unless user space has issued some syscall. Currently,
background kernel activities like timers/hrtimers/watchdogs/etc. run on
almost every CPU, and these need to be migrated to other CPUs.

To achieve that, this patch adds another option to cpusets: 'quiesce'.
Writing '1' to this file migrates unbound/unpinned timers/workqueues away
from the CPUs of the cpuset in question. Writing '0' has no effect, and the
file can't be read from user space, as no state is maintained here.

Currently, only timers are migrated; other kernel infrastructure will
follow later.
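
For example, with the cpuset controller mounted at /sys/fs/cgroup/cpuset and
a cpuset named 'dplane' (both hypothetical, not created by this patch), user
space could trigger the migration like this:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* path assumes a pre-existing cpuset named 'dplane' */
	int fd = open("/sys/fs/cgroup/cpuset/dplane/cpuset.quiesce", O_WRONLY);

	if (fd < 0) {
		perror("open cpuset.quiesce");
		return 1;
	}

	/* '1' migrates unbound timers away; '0' is a no-op */
	if (write(fd, "1", 1) != 1)
		perror("write");

	close(fd);
	return 0;
}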

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 kernel/cpuset.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

Comments

Zefan Li March 27, 2014, 2:47 a.m. UTC | #1
On 2014/3/20 21:49, Viresh Kumar wrote:
> For networking applications, platforms need to provide one CPU for each
> user-space data-plane thread. These CPUs should not be interrupted by the
> kernel at all unless user space has issued some syscall. Currently,
> background kernel activities like timers/hrtimers/watchdogs/etc. run on
> almost every CPU, and these need to be migrated to other CPUs.
> 
> To achieve that, this patch adds another option to cpusets: 'quiesce'.
> Writing '1' to this file migrates unbound/unpinned timers/workqueues away
> from the CPUs of the cpuset in question. Writing '0' has no effect, and the
> file can't be read from user space, as no state is maintained here.
> 

This doesn't look like a complete solution, because newly created
timers/workqueues can still run on those CPUs. It seems the proposal being
discussed is to support setting CPU affinity for workqueues through sysfs.
If so, we can migrate workqueues when the affinity is set, so do we still
need this cpuset.quiesce?
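
In-kernel, that could look something like the following sketch, built on the
existing unbound-workqueue attrs interface (steer_wq_away() is a made-up
helper name, not an existing function):

#include <linux/cpumask.h>
#include <linux/workqueue.h>

/* Sketch: re-home an unbound workqueue so it avoids the quiesced CPUs */
static int steer_wq_away(struct workqueue_struct *wq,
			 const struct cpumask *quiesced)
{
	struct workqueue_attrs *attrs;
	int ret;

	attrs = alloc_workqueue_attrs(GFP_KERNEL);
	if (!attrs)
		return -ENOMEM;

	/* allow every possible CPU except the quiesced ones */
	cpumask_andnot(attrs->cpumask, cpu_possible_mask, quiesced);

	/* migrates the workqueue's unbound workers onto the new mask */
	ret = apply_workqueue_attrs(wq, attrs);
	free_workqueue_attrs(attrs);
	return ret;
}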

> Currently, only timers are migrated; other kernel infrastructure will
> follow later.
> 
> Suggested-by: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>

Viresh Kumar March 27, 2014, 4:29 a.m. UTC | #2
On 27 March 2014 08:17, Li Zefan <lizefan@huawei.com> wrote:
> This doesn't look like a complete solution, because newly created
> timers/workqueues can still run on those CPUs.

The initial idea was to disable load balancing between CPUs and then do
this, so that new timers and workqueues from other CPUs would never get
queued on these CPUs.

But I think we can just modify get_nohz_timer_target() to ensure this
for timers.
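
Something like this sketch, loosely modelled on the current code in
kernel/sched/core.c (cpu_quiesced_mask is hypothetical and would have to be
maintained by the quiesce code):

/*
 * Sketch only: pick a busy CPU for an unpinned timer while skipping
 * quiesced CPUs. cpu_quiesced_mask is not an existing kernel symbol.
 */
static int get_nohz_timer_target(void)
{
	int i, cpu = smp_processor_id();
	struct sched_domain *sd;

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		for_each_cpu(i, sched_domain_span(sd)) {
			/* never hand unpinned timers to a quiesced CPU */
			if (cpumask_test_cpu(i, cpu_quiesced_mask))
				continue;
			if (!idle_cpu(i)) {
				cpu = i;
				goto unlock;
			}
		}
	}
unlock:
	rcu_read_unlock();
	return cpu;
}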

> It seems the proposal being discussed is to support setting CPU affinity
> for workqueues through sysfs. If so, we can migrate workqueues when the
> affinity is set, so do we still need this cpuset.quiesce?

That was another thread covering just workqueues, but this one is about
migrating everything else as well, and probably more additions beyond
timers/hrtimers/workqueues in the future. So for us it is still required :)

Patch

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3d54c41..1b79ae6 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -43,10 +43,12 @@ 
 #include <linux/pagemap.h>
 #include <linux/proc_fs.h>
 #include <linux/rcupdate.h>
+#include <linux/tick.h>
 #include <linux/sched.h>
 #include <linux/seq_file.h>
 #include <linux/security.h>
 #include <linux/slab.h>
+#include <linux/smp.h>
 #include <linux/spinlock.h>
 #include <linux/stat.h>
 #include <linux/string.h>
@@ -150,6 +152,7 @@  typedef enum {
 	CS_SCHED_LOAD_BALANCE,
 	CS_SPREAD_PAGE,
 	CS_SPREAD_SLAB,
+	CS_QUIESCE,
 } cpuset_flagbits_t;
 
 /* convenient tests for these bits */
@@ -1208,6 +1211,44 @@  static int update_relax_domain_level(struct cpuset *cs, s64 val)
 	return 0;
 }
 
+void timer_quiesce_cpu(void *cpu);
+
+/**
+ * quiesce_cpuset - Move unbound timers/workqueues away from cpuset.cpus
+ * @cs: cpuset to be quiesced
+ *
+ * For isolating a core with cpusets, all unbound timers/workqueues must be
+ * moved away from the isolated core. For simplicity, currently we migrate these to
+ * the first online CPU which is not part of tick_nohz_full_mask.
+ *
+ * Currently we are only migrating timers away.
+ */
+void quiesce_cpuset(struct cpuset *cs)
+{
+	int from_cpu, to_cpu;
+	cpumask_t cpumask;
+
+	cpumask_andnot(&cpumask, cpu_online_mask, cs->cpus_allowed);
+
+#ifdef CONFIG_NO_HZ_FULL
+	cpumask_andnot(&cpumask, &cpumask, tick_nohz_full_mask);
+#endif
+
+	if (cpumask_empty(&cpumask)) {
+		pr_err("%s: Couldn't find a CPU to migrate to\n", __func__);
+		return;
+	}
+
+	to_cpu = cpumask_first(&cpumask);
+
+	for_each_cpu(from_cpu, cs->cpus_allowed) {
+		pr_debug("%s: Migrating from CPU:%d to CPU:%d\n", __func__,
+				from_cpu, to_cpu);
+		smp_call_function_single(to_cpu, timer_quiesce_cpu,
+				(void *)from_cpu, true);
+	}
+}
+
 /**
  * update_tasks_flags - update the spread flags of tasks in the cpuset.
  * @cs: the cpuset in which each task's spread flags needs to be changed
@@ -1244,6 +1285,11 @@  static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	int spread_flag_changed;
 	int err;
 
+	if (bit == CS_QUIESCE && turning_on) {
+		quiesce_cpuset(cs);
+		return 0;
+	}
+
 	trialcs = alloc_trial_cpuset(cs);
 	if (!trialcs)
 		return -ENOMEM;
@@ -1526,6 +1572,7 @@  typedef enum {
 	FILE_MEMORY_PRESSURE,
 	FILE_SPREAD_PAGE,
 	FILE_SPREAD_SLAB,
+	FILE_CPU_QUIESCE,
 } cpuset_filetype_t;
 
 static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
@@ -1569,6 +1616,9 @@  static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
 	case FILE_SPREAD_SLAB:
 		retval = update_flag(CS_SPREAD_SLAB, cs, val);
 		break;
+	case FILE_CPU_QUIESCE:
+		retval = update_flag(CS_QUIESCE, cs, val);
+		break;
 	default:
 		retval = -EINVAL;
 		break;
@@ -1837,6 +1887,12 @@  static struct cftype files[] = {
 		.private = FILE_MEMORY_PRESSURE_ENABLED,
 	},
 
+	{
+		.name = "quiesce",
+		.write_u64 = cpuset_write_u64,
+		.private = FILE_CPU_QUIESCE,
+	},
+
 	{ }	/* terminate */
 };