@@ -5250,6 +5250,12 @@
neutralize any effect of /proc/sys/kernel/sysrq.
Useful for debugging.
+ task_isolation_debug [KNL]
+ In kernels built with CONFIG_TASK_ISOLATION, this
+ setting generates a console backtrace to accompany
+ the diagnostic printed whenever a task running with
+ task isolation is interrupted.
+
tcpmhash_entries= [KNL,NET]
Set the number of tcp_metrics_hash slots.
Default value is 8192 or 16384 depending on total
@@ -284,6 +284,26 @@ static ssize_t print_cpus_isolated(struct device *dev,
}
static DEVICE_ATTR(isolated, 0444, print_cpus_isolated, NULL);
+#ifdef CONFIG_TASK_ISOLATION
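+/*
+ * Report the set of CPUs currently running isolated tasks, in cpulist
+ * format (e.g. "1-3,5"), via /sys/devices/system/cpu/isolation_running.
+ */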
+static ssize_t isolation_running_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ int n;
+ cpumask_var_t isolation_running;
+
+ if (!zalloc_cpumask_var(&isolation_running, GFP_KERNEL))
+ return -ENOMEM;
+
+ task_isolation_cpumask(isolation_running);
+ n = sprintf(buf, "%*pbl\n", cpumask_pr_args(isolation_running));
+
+ free_cpumask_var(isolation_running);
+
+ return n;
+}
+static DEVICE_ATTR_RO(isolation_running);
+#endif
+
#ifdef CONFIG_NO_HZ_FULL
static ssize_t print_cpus_nohz_full(struct device *dev,
struct device_attribute *attr, char *buf)
@@ -471,6 +491,9 @@ static struct attribute *cpu_root_attrs[] = {
#ifdef CONFIG_NO_HZ_FULL
&dev_attr_nohz_full.attr,
#endif
+#ifdef CONFIG_TASK_ISOLATION
+ &dev_attr_isolation_running.attr,
+#endif
#ifdef CONFIG_GENERIC_CPU_AUTOPROBE
&dev_attr_modalias.attr,
#endif
@@ -529,6 +529,10 @@ extern void __init hrtimers_init(void);
/* Show pending timers: */
extern void sysrq_timer_list_show(void);
+#ifdef CONFIG_TASK_ISOLATION
+extern void kick_hrtimer(void);
+#endif
+
int hrtimers_prepare_cpu(unsigned int cpu);
#ifdef CONFIG_HOTPLUG_CPU
int hrtimers_dead_cpu(unsigned int cpu);
new file mode 100644
@@ -0,0 +1,326 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Task isolation support
+ *
+ * Authors:
+ * Chris Metcalf <cmetcalf@mellanox.com>
+ * Alex Belits <abelits@marvell.com>
+ * Yuri Norov <ynorov@marvell.com>
+ */
+#ifndef _LINUX_ISOLATION_H
+#define _LINUX_ISOLATION_H
+
+#include <stdarg.h>
+#include <linux/errno.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/irqflags.h>
+#include <linux/prctl.h>
+#include <linux/types.h>
+
+struct task_struct;
+
+#ifdef CONFIG_TASK_ISOLATION
+
+/*
+ * Logging
+ */
+int task_isolation_message(int cpu, int level, bool supp, const char *fmt, ...);
+
+#define pr_task_isol_emerg(cpu, fmt, ...) \
+ task_isolation_message(cpu, LOGLEVEL_EMERG, false, fmt, ##__VA_ARGS__)
+#define pr_task_isol_alert(cpu, fmt, ...) \
+ task_isolation_message(cpu, LOGLEVEL_ALERT, false, fmt, ##__VA_ARGS__)
+#define pr_task_isol_crit(cpu, fmt, ...) \
+ task_isolation_message(cpu, LOGLEVEL_CRIT, false, fmt, ##__VA_ARGS__)
+#define pr_task_isol_err(cpu, fmt, ...) \
+ task_isolation_message(cpu, LOGLEVEL_ERR, false, fmt, ##__VA_ARGS__)
+#define pr_task_isol_warn(cpu, fmt, ...) \
+ task_isolation_message(cpu, LOGLEVEL_WARNING, false, fmt, ##__VA_ARGS__)
+#define pr_task_isol_notice(cpu, fmt, ...) \
+ task_isolation_message(cpu, LOGLEVEL_NOTICE, false, fmt, ##__VA_ARGS__)
+#define pr_task_isol_info(cpu, fmt, ...) \
+ task_isolation_message(cpu, LOGLEVEL_INFO, false, fmt, ##__VA_ARGS__)
+#define pr_task_isol_debug(cpu, fmt, ...) \
+ task_isolation_message(cpu, LOGLEVEL_DEBUG, false, fmt, ##__VA_ARGS__)
+
+#define pr_task_isol_emerg_supp(cpu, fmt, ...) \
+ task_isolation_message(cpu, LOGLEVEL_EMERG, true, fmt, ##__VA_ARGS__)
+#define pr_task_isol_alert_supp(cpu, fmt, ...) \
+ task_isolation_message(cpu, LOGLEVEL_ALERT, true, fmt, ##__VA_ARGS__)
+#define pr_task_isol_crit_supp(cpu, fmt, ...) \
+ task_isolation_message(cpu, LOGLEVEL_CRIT, true, fmt, ##__VA_ARGS__)
+#define pr_task_isol_err_supp(cpu, fmt, ...) \
+ task_isolation_message(cpu, LOGLEVEL_ERR, true, fmt, ##__VA_ARGS__)
+#define pr_task_isol_warn_supp(cpu, fmt, ...) \
+ task_isolation_message(cpu, LOGLEVEL_WARNING, true, fmt, ##__VA_ARGS__)
+#define pr_task_isol_notice_supp(cpu, fmt, ...) \
+ task_isolation_message(cpu, LOGLEVEL_NOTICE, true, fmt, ##__VA_ARGS__)
+#define pr_task_isol_info_supp(cpu, fmt, ...) \
+ task_isolation_message(cpu, LOGLEVEL_INFO, true, fmt, ##__VA_ARGS__)
+#define pr_task_isol_debug_supp(cpu, fmt, ...) \
+ task_isolation_message(cpu, LOGLEVEL_DEBUG, true, fmt, ##__VA_ARGS__)
+
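+/*
+ * Per-cpu low-level isolation state bits kept in ll_isol_flags:
+ * TASK_ISOLATION         - this CPU is currently running an isolated task
+ * TASK_ISOLATION_BROKEN  - isolation was broken by a kernel entry
+ * TASK_ISOLATION_REQUEST - isolation was requested and will take effect
+ *                          on the next return to userspace
+ */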
+#define BIT_LL_TASK_ISOLATION (0)
+#define BIT_LL_TASK_ISOLATION_BROKEN (1)
+#define BIT_LL_TASK_ISOLATION_REQUEST (2)
+#define FLAG_LL_TASK_ISOLATION (1 << BIT_LL_TASK_ISOLATION)
+#define FLAG_LL_TASK_ISOLATION_BROKEN (1 << BIT_LL_TASK_ISOLATION_BROKEN)
+#define FLAG_LL_TASK_ISOLATION_REQUEST (1 << BIT_LL_TASK_ISOLATION_REQUEST)
+
+DECLARE_PER_CPU(unsigned long, ll_isol_flags);
+extern cpumask_var_t task_isolation_map;
+
+/**
+ * task_isolation_request() - prctl hook to request task isolation
+ * @flags: Flags from <linux/prctl.h> PR_TASK_ISOLATION_xxx.
+ *
+ * This is called from the generic prctl() code for PR_TASK_ISOLATION.
+ *
+ * Return: 0 on success, otherwise a negative errno.
+ */
+extern int task_isolation_request(unsigned int flags);
+
+/**
+ * task_isolation_kernel_enter() - clear low-level task isolation flag
+ *
+ * This should be called immediately after entering the kernel. It
+ * must be inline, and must be safe to run in the "stale" state left
+ * behind by isolated userspace, before the synchronization that is
+ * required for the CPU to safely enter the rest of the kernel.
+ */
+static __always_inline void task_isolation_kernel_enter(void)
+{
+ unsigned long flags;
+
+ /*
+ * This function runs on a CPU that was running an isolated task.
+ *
+ * We don't want this CPU running code from the rest of the kernel
+ * until other CPUs know that it is no longer isolated. While the
+ * CPU is running an isolated task, anything that causes an
+ * interrupt on this CPU must end up calling this function (either
+ * directly, or via fast_task_isolation_cpu_cleanup() or
+ * stop_isolation() calling it) before touching the rest of the
+ * kernel. If any interrupt, including the scheduling timer,
+ * arrives, it will still end up here early after entering the
+ * kernel. From this point interrupts are disabled until all CPUs
+ * can see that this CPU is no longer running an isolated task.
+ *
+ * See also fast_task_isolation_cpu_cleanup().
+ */
+ if ((this_cpu_read(ll_isol_flags) & FLAG_LL_TASK_ISOLATION) == 0)
+ return;
+
+ raw_local_irq_save(flags);
+
+ /* Change low-level flags to indicate broken isolation */
+ this_cpu_write(ll_isol_flags, FLAG_LL_TASK_ISOLATION_BROKEN);
+
+ /*
+ * If something happened that requires a barrier that would
+ * otherwise be called from remote CPUs by CPU kick procedure,
+ * this barrier runs instead of it. After this barrier, CPU
+ * kick procedure would see the updated ll_isol_flags, so it
+ * will run its own IPI to trigger a barrier.
+ */
+ smp_mb();
+ /*
+ * Synchronize instructions -- this CPU was not kicked while
+ * in isolated mode, so it might require synchronization.
+ * There might be an IPI if kick procedure happened and
+ * ll_isol_flags was already updated while it assembled a CPU
+ * mask. However if this did not happen, synchronize everything
+ * here.
+ */
+ instr_sync();
+ raw_local_irq_restore(flags);
+}
+
+/**
+ * task_isolation_exit_to_user_mode() - set low-level task isolation flag
+ * if task isolation is requested
+ *
+ * This should be called immediately before exiting the kernel. It must
+ * be inline; the state of the CPU may become "stale" between setting
+ * the flag and returning to userspace.
+ */
+static __always_inline void task_isolation_exit_to_user_mode(void)
+{
+ unsigned long flags;
+
+ /* Check if this task is entering isolation */
+ if ((this_cpu_read(ll_isol_flags) & FLAG_LL_TASK_ISOLATION_REQUEST)
+ == 0)
+ return;
+ raw_local_irq_save(flags);
+
+ /* Set low-level flags */
+ this_cpu_write(ll_isol_flags, FLAG_LL_TASK_ISOLATION);
+ /*
+ * After this barrier the rest of the system stops using IPIs
+ * to synchronize this CPU state. Since only exit to userspace
+ * follows, this is safe. Synchronization will happen again in
+ * task_isolation_enter() when this CPU will enter kernel.
+ */
+ smp_mb();
+ /*
+ * From this point this is recognized as isolated by
+ * other CPUs
+ */
+ raw_local_irq_restore(flags);
+}
+
+extern void task_isolation_cpu_cleanup(void);
+
+/**
+ * task_isolation_start() - attempt to actually start task isolation
+ *
+ * This function should be invoked as the last thing prior to returning to
+ * user space if TIF_TASK_ISOLATION is set in the thread_info flags. It
+ * will attempt to quiesce the core and enter task-isolation mode. If it
+ * fails, it will reset the system call return value to an error code that
+ * indicates the failure mode.
+ */
+extern void task_isolation_start(void);
+
+/**
+ * is_isolation_cpu() - check if CPU is intended for running isolated tasks.
+ * @cpu: CPU to check.
+ */
+static inline bool is_isolation_cpu(int cpu)
+{
+ return task_isolation_map != NULL &&
+ cpumask_test_cpu(cpu, task_isolation_map);
+}
+
+/**
+ * task_isolation_on_cpu() - check if the cpu is running an isolated task
+ * @cpu: CPU to check.
+ */
+static inline int task_isolation_on_cpu(int cpu)
+{
+ return test_bit(BIT_LL_TASK_ISOLATION, &per_cpu(ll_isol_flags, cpu));
+}
+
+/**
+ * task_isolation_cpumask() - set CPUs currently running isolated tasks
+ * @mask: Mask to modify.
+ */
+extern void task_isolation_cpumask(struct cpumask *mask);
+
+/**
+ * task_isolation_clear_cpumask() - clear CPUs currently running isolated tasks
+ * @mask: Mask to modify.
+ */
+extern void task_isolation_clear_cpumask(struct cpumask *mask);
+
+/**
+ * task_isolation_syscall() - report a syscall from an isolated task
+ * @nr: The syscall number.
+ *
+ * This routine should be invoked at syscall entry if TIF_TASK_ISOLATION is
+ * set in the thread_info flags. It checks for valid syscalls,
+ * specifically prctl() with PR_TASK_ISOLATION, exit(), and exit_group().
+ * For any other syscall it will raise a signal and return failure.
+ *
+ * Return: 0 for acceptable syscalls, -1 for all others.
+ */
+extern int task_isolation_syscall(int nr);
+
+/**
+ * task_isolation_before_pending_work_check() - check for isolation breaking
+ *
+ * This routine is called from the code responsible for exiting to user mode,
+ * before the point when thread flags are checked for pending work.
+ * It must be called whenever the current task is isolated; the
+ * TIF_TASK_ISOLATION flag is what ensures this call is made.
+ */
+void task_isolation_before_pending_work_check(void);
+
+/**
+ * _task_isolation_interrupt() - report an interrupt of an isolated task
+ * @fmt: A format string describing the interrupt
+ * @...: Format arguments, if any.
+ *
+ * This routine should be invoked at any exception or IRQ if
+ * TIF_TASK_ISOLATION is set in the thread_info flags. It is not necessary
+ * to invoke it if the exception will generate a signal anyway (e.g. a bad
+ * page fault), and in that case it is preferable not to invoke it but just
+ * rely on the standard Linux signal. The task_isolation_interrupt() macro
+ * wraps the TIF_TASK_ISOLATION flag test to simplify the caller code.
+ */
+extern void _task_isolation_interrupt(const char *fmt, ...);
+#define task_isolation_interrupt(fmt, ...) \
+ do { \
+ if (current_thread_info()->flags & _TIF_TASK_ISOLATION) \
+ _task_isolation_interrupt(fmt, ## __VA_ARGS__); \
+ } while (0)
+
+/**
+ * task_isolation_remote() - report a remote interrupt of an isolated task
+ * @cpu: The remote cpu that is about to be interrupted.
+ * @fmt: A format string describing the interrupt
+ * @...: Format arguments, if any.
+ *
+ * This routine should be invoked any time a remote IPI or other type of
+ * interrupt is being delivered to another cpu. The function will check to
+ * see if the target core is running a task-isolation task, and generate a
+ * diagnostic on the console if so; in addition, we tag the task so it
+ * doesn't generate another diagnostic when the interrupt actually arrives.
+ * Generating a diagnostic remotely yields a clearer indication of what
+ * happened than reporting only when the remote core is interrupted.
+ */
+extern void task_isolation_remote(int cpu, const char *fmt, ...);
+
+/**
+ * task_isolation_remote_cpumask() - report interruption of multiple cpus
+ * @mask: The set of remote cpus that are about to be interrupted.
+ * @fmt: A format string describing the interrupt
+ * @...: Format arguments, if any.
+ *
+ * This is the cpumask variant of task_isolation_remote(). We
+ * generate a single-line diagnostic message even if multiple remote
+ * task-isolation cpus are being interrupted.
+ */
+extern void task_isolation_remote_cpumask(const struct cpumask *mask,
+ const char *fmt, ...);
+
+/**
+ * _task_isolation_signal() - disable task isolation when signal is pending
+ * @task: The task for which to disable isolation.
+ *
+ * This function generates a diagnostic and disables task isolation;
+ * it should be called if TIF_TASK_ISOLATION is set when notifying a
+ * task of a pending signal. The task_isolation_interrupt() function
+ * normally generates a diagnostic for events that just interrupt a
+ * task without generating a signal; here we need to hook the paths
+ * that correspond to interrupts that do generate a signal. The macro
+ * task_isolation_signal() wraps the TIF_TASK_ISOLATION flag test to
+ * simplify the caller code.
+ */
+extern void _task_isolation_signal(struct task_struct *task);
+#define task_isolation_signal(task) \
+ do { \
+ if (task_thread_info(task)->flags & _TIF_TASK_ISOLATION) \
+ _task_isolation_signal(task); \
+ } while (0)
+
+#else /* !CONFIG_TASK_ISOLATION */
+static inline int task_isolation_request(unsigned int flags) { return -EINVAL; }
+static inline void task_isolation_kernel_enter(void) {}
+static inline void task_isolation_exit_to_user_mode(void) {}
+static inline void task_isolation_start(void) { }
+static inline bool is_isolation_cpu(int cpu) { return false; }
+static inline int task_isolation_on_cpu(int cpu) { return 0; }
+static inline void task_isolation_cpumask(struct cpumask *mask) { }
+static inline void task_isolation_clear_cpumask(struct cpumask *mask) { }
+static inline void task_isolation_cpu_cleanup(void) { }
+static inline int task_isolation_syscall(int nr) { return 0; }
+static inline void task_isolation_before_pending_work_check(void) { }
+static inline void task_isolation_signal(struct task_struct *task) { }
+#endif
+
+#endif /* _LINUX_ISOLATION_H */
@@ -1316,6 +1316,11 @@ struct task_struct {
unsigned long prev_lowest_stack;
#endif
+#ifdef CONFIG_TASK_ISOLATION
+ unsigned short task_isolation_flags; /* prctl */
+ unsigned short task_isolation_state;
+#endif
+
#ifdef CONFIG_X86_MCE
void __user *mce_vaddr;
__u64 mce_kflags;
@@ -268,6 +268,9 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal,
extern void tick_nohz_full_kick_cpu(int cpu);
extern void __tick_nohz_task_switch(void);
extern void __init tick_nohz_full_setup(cpumask_var_t cpumask);
+#ifdef CONFIG_TASK_ISOLATION
+extern int try_stop_full_tick(void);
+#endif
#else
static inline bool tick_nohz_full_enabled(void) { return false; }
static inline bool tick_nohz_full_cpu(int cpu) { return false; }
@@ -247,4 +247,10 @@ struct prctl_mm_map {
#define PR_SET_IO_FLUSHER 57
#define PR_GET_IO_FLUSHER 58
+/* Enable task_isolation mode for TASK_ISOLATION kernels. */
+#define PR_TASK_ISOLATION 48
+# define PR_TASK_ISOLATION_ENABLE (1 << 0)
+# define PR_TASK_ISOLATION_SET_SIG(sig) (((sig) & 0x7f) << 8)
+# define PR_TASK_ISOLATION_GET_SIG(bits) (((bits) >> 8) & 0x7f)
+
#endif /* _LINUX_PRCTL_H */
@@ -648,6 +648,33 @@ config CPU_ISOLATION
source "kernel/rcu/Kconfig"
+config HAVE_ARCH_TASK_ISOLATION
+ bool
+
+config TASK_ISOLATION
+ bool "Provide hard CPU isolation from the kernel on demand"
+ depends on NO_HZ_FULL && HAVE_ARCH_TASK_ISOLATION
+ help
+ Allow userspace processes that place themselves on cores with
+ nohz_full and isolcpus enabled, and run prctl(PR_TASK_ISOLATION),
+ to "isolate" themselves from the kernel. Prior to returning to
+ userspace, isolated tasks will arrange that no future kernel
+ activity will interrupt the task while the task is running in
+ userspace. Attempting to re-enter the kernel while in this mode
+ will cause the task to be terminated with a signal; you must
+ explicitly use prctl() to disable task isolation before resuming
+ normal use of the kernel.
+
+ This "hard" isolation from the kernel is required for tasks that
+ run hard real-time workloads in userspace, such as a high-speed
+ network driver in userspace. Without this option, but with
+ NO_HZ_FULL enabled, the kernel makes only a best-effort, "soft"
+ attempt to shield a single userspace process from interrupts, and
+ makes no guarantees.
+
+ You should say "N" unless you are intending to run a
+ high-performance userspace driver or similar task.
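+
+ For example, a task might request isolation with something like
+ the following sketch (SIGUSR1 here is just an example signal):
+
+   prctl(PR_TASK_ISOLATION,
+         PR_TASK_ISOLATION_ENABLE | PR_TASK_ISOLATION_SET_SIG(SIGUSR1),
+         0, 0, 0);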
+
config BUILD_BIN2C
bool
default n
@@ -133,6 +133,8 @@ KCOV_INSTRUMENT_stackleak.o := n
obj-$(CONFIG_SCF_TORTURE_TEST) += scftorture.o
+obj-$(CONFIG_TASK_ISOLATION) += isolation.o
+
$(obj)/configs.o: $(obj)/config_data.gz
targets += config_data.gz
new file mode 100644
@@ -0,0 +1,714 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Implementation of task isolation.
+ *
+ * Authors:
+ * Chris Metcalf <cmetcalf@mellanox.com>
+ * Alex Belits <abelits@marvell.com>
+ * Yuri Norov <ynorov@marvell.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/vmstat.h>
+#include <linux/sched.h>
+#include <linux/isolation.h>
+#include <linux/syscalls.h>
+#include <linux/smp.h>
+#include <linux/tick.h>
+#include <asm/unistd.h>
+#include <asm/syscall.h>
+#include <linux/hrtimer.h>
+
+/*
+ * These values are stored in task_isolation_state.
+ * Note that STATE_NORMAL + TIF_TASK_ISOLATION means we are still
+ * returning from sys_prctl() to userspace.
+ */
+enum {
+ STATE_NORMAL = 0, /* Not isolated */
+ STATE_ISOLATED = 1 /* In userspace, isolated */
+};
+
+/*
+ * Low-level isolation flags.
+ * These flags are used by the low-level isolation set/clear/check
+ * routines. They should be set last before returning to userspace and
+ * cleared first upon kernel entry, and synchronized so that isolation
+ * breaking can be detected before touching potentially unsynchronized
+ * parts of the kernel. An isolated task does not receive synchronization
+ * events of any kind, so at the time of its first entry into the kernel
+ * it might not be ready to run most of the kernel code. To synchronize
+ * properly, the kernel entry code must also re-enable synchronization
+ * events at the same time. This presents a problem, because more kernel
+ * code has to run to determine the cause of the isolation breaking,
+ * signals may have to be generated, etc. So some flag clearing and
+ * synchronization has to happen in the "low-level" entry code, while the
+ * processing of the isolation breaking happens in "high-level" code. The
+ * low-level isolation flags are set in that low-level code, possibly
+ * long before the cause of the isolation breaking is known.
+ * Symmetrically, entering isolation disables synchronization events
+ * before returning to userspace, but only after all potentially volatile
+ * code has finished.
+ */
+DEFINE_PER_CPU(unsigned long, ll_isol_flags);
+
+/*
+ * Description of the last two tasks that ran isolated on a given CPU.
+ * This is intended only for messages about isolation breaking. We
+ * don't want any references to the actual task while accessing this from
+ * the CPU that caused the isolation breaking -- we know nothing about the
+ * timing and don't want to use locking or RCU.
+ */
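+/*
+ * curr_index_wr is incremented before a new record is written and
+ * curr_index is incremented after the record is complete, so readers can
+ * detect a concurrent update and retry (see task_isolation_message()).
+ */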
+struct isol_task_desc {
+ atomic_t curr_index;
+ atomic_t curr_index_wr;
+ bool warned[2];
+ pid_t pid[2];
+ pid_t tgid[2];
+ char comm[2][TASK_COMM_LEN];
+};
+static DEFINE_PER_CPU(struct isol_task_desc, isol_task_descs);
+
+/*
+ * Counter of isolation exit procedures (from the request to the start of
+ * cleanup) currently being attempted on a CPU. The counter is normally
+ * incremented from the CPU that caused the isolation breaking, but it is
+ * decremented from the cleanup procedure, which is delegated to the CPU
+ * that is exiting isolation, not the CPU that caused the breaking.
+ *
+ * If incrementing this counter at the start of an isolation exit
+ * procedure results in a value greater than 1, an isolation exit is
+ * already in progress and its cleanup has not started yet. In that case
+ * the counter must be decremented back and the exit already in progress
+ * allowed to complete. Otherwise, a new isolation exit procedure is
+ * started.
+ */
+DEFINE_PER_CPU(atomic_t, isol_exit_counter);
+
+/*
+ * Descriptor for isolation-breaking SMP calls
+ */
+DEFINE_PER_CPU(call_single_data_t, isol_break_csd);
+
+cpumask_var_t task_isolation_map;
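+/*
+ * CPUs with a delayed ("slow") isolation cleanup pending, protected by
+ * task_isolation_cleanup_lock. Bits are set in stop_isolation() and
+ * cleared in task_isolation_before_pending_work_check().
+ */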
+cpumask_var_t task_isolation_cleanup_map;
+static DEFINE_SPINLOCK(task_isolation_cleanup_lock);
+
+/* We can run on cpus that are isolated from the scheduler and are nohz_full. */
+static int __init task_isolation_init(void)
+{
+ alloc_bootmem_cpumask_var(&task_isolation_cleanup_map);
+ if (alloc_cpumask_var(&task_isolation_map, GFP_KERNEL)) {
+ /*
+ * At this point task isolation should match
+ * nohz_full. This may change in the future.
+ */
+ cpumask_copy(task_isolation_map, tick_nohz_full_mask);
+ }
+ return 0;
+}
+core_initcall(task_isolation_init)
+
+/* Enable stack backtraces of any interrupts of task_isolation cores. */
+static bool task_isolation_debug;
+static int __init task_isolation_debug_func(char *str)
+{
+ task_isolation_debug = true;
+ return 1;
+}
+__setup("task_isolation_debug", task_isolation_debug_func);
+
+/*
+ * Record name, pid and group pid of the task entering isolation on
+ * the current CPU.
+ */
+static void record_curr_isolated_task(void)
+{
+ int ind;
+ int cpu = smp_processor_id();
+ struct isol_task_desc *desc = &per_cpu(isol_task_descs, cpu);
+ struct task_struct *task = current;
+
+ /* Finish everything before recording current task */
+ smp_mb();
+ ind = atomic_inc_return(&desc->curr_index_wr) & 1;
+ desc->comm[ind][sizeof(task->comm) - 1] = '\0';
+ memcpy(desc->comm[ind], task->comm, sizeof(task->comm) - 1);
+ desc->pid[ind] = task->pid;
+ desc->tgid[ind] = task->tgid;
+ desc->warned[ind] = false;
+ /* Write everything, to be seen by other CPUs */
+ smp_mb();
+ atomic_inc(&desc->curr_index);
+ /* Everyone will see the new record from this point */
+ smp_mb();
+}
+
+/*
+ * Print message prefixed with the description of the current (or
+ * last) isolated task on a given CPU. Intended for isolation breaking
+ * messages that include the target task for the user's convenience.
+ *
+ * Messages produced with this function may contain obsolete task
+ * information if isolated tasks managed to exit, restart and re-enter
+ * isolation multiple times, or if multiple tasks tried to enter
+ * isolation on the same CPU at once. In those unusual cases the message
+ * still contains a valid description of the cause of the isolation
+ * breaking and the target CPU number, just not necessarily the correct
+ * description of which task ended up losing isolation.
+ */
+int task_isolation_message(int cpu, int level, bool supp, const char *fmt, ...)
+{
+ struct isol_task_desc *desc;
+ struct task_struct *task;
+ va_list args;
+ char buf_prefix[TASK_COMM_LEN + 20 + 3 * 20];
+ char buf[200];
+ int curr_cpu, ind_counter, ind_counter_old, ind;
+
+ curr_cpu = get_cpu();
+ /* Barrier to synchronize with recording isolated task information */
+ smp_rmb();
+ desc = &per_cpu(isol_task_descs, cpu);
+ ind_counter = atomic_read(&desc->curr_index);
+
+ if (curr_cpu == cpu) {
+ /*
+ * Message is for the current CPU so current
+ * task_struct should be used instead of cached
+ * information.
+ *
+ * Like in other diagnostic messages, if issued from
+ * interrupt context, current will be the interrupted
+ * task. Unlike other diagnostic messages, this is
+ * always relevant because the message is about
+ * interrupting a task.
+ */
+ ind = ind_counter & 1;
+ if (supp && desc->warned[ind]) {
+ /*
+ * If supp is true, skip the message if the
+ * same task was already mentioned in a message
+ * that originated on a remote CPU, and it has
+ * not re-entered the isolated state since then
+ * (warned is true). Only local messages that
+ * follow remote messages, likely about the same
+ * isolation breaking event, are skipped to
+ * avoid duplication. That is, if a remote cause
+ * is immediately followed by a local one before
+ * isolation is broken, the local cause is
+ * omitted from the messages.
+ */
+ put_cpu();
+ return 0;
+ }
+ task = current;
+ snprintf(buf_prefix, sizeof(buf_prefix),
+ "isolation %s/%d/%d (cpu %d)",
+ task->comm, task->tgid, task->pid, cpu);
+ put_cpu();
+ } else {
+ /*
+ * Message is for remote CPU, use cached information.
+ */
+ put_cpu();
+ /*
+ * Make sure the index remained unchanged while the data was
+ * copied. If it changed, the copied data may be inconsistent,
+ * because two updates in a row could have overwritten the data
+ * while it was being read.
+ */
+ do {
+ /* Make sure we are reading up to date values */
+ smp_mb();
+ ind = ind_counter & 1;
+ snprintf(buf_prefix, sizeof(buf_prefix),
+ "isolation %s/%d/%d (cpu %d)",
+ desc->comm[ind], desc->tgid[ind],
+ desc->pid[ind], cpu);
+ desc->warned[ind] = true;
+ ind_counter_old = ind_counter;
+ /* Record the warned flag, then re-read descriptor */
+ smp_mb();
+ ind_counter = atomic_read(&desc->curr_index);
+ /*
+ * If the counter changed, something was updated, so
+ * repeat everything to get the current data
+ */
+ } while (ind_counter != ind_counter_old);
+ }
+
+ va_start(args, fmt);
+ vsnprintf(buf, sizeof(buf), fmt, args);
+ va_end(args);
+
+ switch (level) {
+ case LOGLEVEL_EMERG:
+ pr_emerg("%s: %s", buf_prefix, buf);
+ break;
+ case LOGLEVEL_ALERT:
+ pr_alert("%s: %s", buf_prefix, buf);
+ break;
+ case LOGLEVEL_CRIT:
+ pr_crit("%s: %s", buf_prefix, buf);
+ break;
+ case LOGLEVEL_ERR:
+ pr_err("%s: %s", buf_prefix, buf);
+ break;
+ case LOGLEVEL_WARNING:
+ pr_warn("%s: %s", buf_prefix, buf);
+ break;
+ case LOGLEVEL_NOTICE:
+ pr_notice("%s: %s", buf_prefix, buf);
+ break;
+ case LOGLEVEL_INFO:
+ pr_info("%s: %s", buf_prefix, buf);
+ break;
+ case LOGLEVEL_DEBUG:
+ pr_debug("%s: %s", buf_prefix, buf);
+ break;
+ default:
+ /* No message without a valid level */
+ return 0;
+ }
+ return 1;
+}
+
+/*
+ * Dump stack if need be. This can be helpful even from the final exit
+ * to usermode code since stack traces sometimes carry information about
+ * what put you into the kernel, e.g. an interrupt number encoded in
+ * the initial entry stack frame that is still visible at exit time.
+ */
+static void debug_dump_stack(void)
+{
+ if (task_isolation_debug)
+ dump_stack();
+}
+
+/*
+ * Set the flags word but don't try to actually start task isolation yet.
+ * We will start it when entering user space in task_isolation_start().
+ */
+int task_isolation_request(unsigned int flags)
+{
+ struct task_struct *task = current;
+
+ /*
+ * The task isolation flags should always be cleared just by
+ * virtue of having entered the kernel.
+ */
+ WARN_ON_ONCE(test_tsk_thread_flag(task, TIF_TASK_ISOLATION));
+ WARN_ON_ONCE(task->task_isolation_flags != 0);
+ WARN_ON_ONCE(task->task_isolation_state != STATE_NORMAL);
+
+ task->task_isolation_flags = flags;
+ if (!(task->task_isolation_flags & PR_TASK_ISOLATION_ENABLE))
+ return 0;
+
+ /* We are trying to enable task isolation. */
+ set_tsk_thread_flag(task, TIF_TASK_ISOLATION);
+
+ /*
+ * Shut down the vmstat worker so we're not interrupted later.
+ * We have to try to do this here (with interrupts enabled) since
+ * we are canceling delayed work and will call flush_work()
+ * (which enables interrupts) and possibly schedule().
+ */
+ quiet_vmstat_sync();
+
+ /* We return 0 here but we may change that in task_isolation_start(). */
+ return 0;
+}
+
+/*
+ * Perform actions that should be done immediately on exit from isolation.
+ */
+static void fast_task_isolation_cpu_cleanup(void *info)
+{
+ unsigned long flags;
+
+ /*
+ * This function runs on a CPU that was running an isolated task.
+ * It is called either directly, when isolation breaking is being
+ * processed locally, or via an IPI from another CPU that intends
+ * to break isolation on this CPU.
+ *
+ * We don't want this CPU running code from the rest of the kernel
+ * until other CPUs know that it is no longer isolated. Any entry
+ * into the kernel calls task_isolation_kernel_enter() before
+ * calling this, so that has already been done by setting the
+ * per-cpu flags and synchronizing in that function.
+ *
+ * For development purposes it still makes sense to check that this
+ * was done, because some entry points may have been left
+ * unguarded. That would clearly be a bug, since it would mean that
+ * regular kernel code is running with no synchronization.
+ */
+ local_irq_save(flags);
+ atomic_dec(&per_cpu(isol_exit_counter, smp_processor_id()));
+ /* Barrier to sync with requesting a task isolation breaking */
+ smp_mb__after_atomic();
+ /*
+ * From this point on, breaking isolation will be treated as a
+ * separate isolation-breaking event; however, interrupts won't
+ * arrive until local_irq_restore().
+ */
+
+ /*
+ * Check for the above mentioned entry without a call to
+ * task_isolation_kernel_enter()
+ */
+ if ((this_cpu_read(ll_isol_flags) & FLAG_LL_TASK_ISOLATION)) {
+ /*
+ * If it did happen, call the function here, to
+ * prevent further problems from running in
+ * un-synchronized state
+ */
+ task_isolation_kernel_enter();
+ /* Report the problem */
+ pr_task_isol_emerg(smp_processor_id(),
+ "Isolation breaking was not detected on kernel entry\n");
+ }
+ /*
+ * This task is no longer isolated (and if by any chance this
+ * is the wrong task, it's already not isolated)
+ */
+ current->task_isolation_flags = 0;
+ clear_tsk_thread_flag(current, TIF_TASK_ISOLATION);
+
+ /* Run the rest of cleanup later */
+ set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
+
+ local_irq_restore(flags);
+}
+
+/* Disable task isolation for the specified task. */
+static void stop_isolation(struct task_struct *p)
+{
+ int cpu, this_cpu;
+ unsigned long flags;
+
+ this_cpu = get_cpu();
+ cpu = task_cpu(p);
+ if (atomic_inc_return(&per_cpu(isol_exit_counter, cpu)) > 1) {
+ /* Already exiting isolation */
+ atomic_dec(&per_cpu(isol_exit_counter, cpu));
+ put_cpu();
+ return;
+ }
+
+ if (p == current) {
+ p->task_isolation_state = STATE_NORMAL;
+ fast_task_isolation_cpu_cleanup(NULL);
+ task_isolation_cpu_cleanup();
+ put_cpu();
+ } else {
+ /*
+ * Schedule the "slow" cleanup. This relies on
+ * TIF_NOTIFY_RESUME being set.
+ */
+ spin_lock_irqsave(&task_isolation_cleanup_lock, flags);
+ cpumask_set_cpu(cpu, task_isolation_cleanup_map);
+ spin_unlock_irqrestore(&task_isolation_cleanup_lock, flags);
+ /*
+ * Setting the flags is delegated to the CPU where the
+ * isolated task is running; isol_exit_counter will be
+ * decremented from there as well.
+ */
+ per_cpu(isol_break_csd, cpu).func =
+ fast_task_isolation_cpu_cleanup;
+ per_cpu(isol_break_csd, cpu).info = NULL;
+ per_cpu(isol_break_csd, cpu).flags = 0;
+ smp_call_function_single_async(cpu,
+ &per_cpu(isol_break_csd, cpu));
+ put_cpu();
+ }
+}
+
+/*
+ * This code runs with interrupts disabled just before the return to
+ * userspace, after a prctl() has requested enabling task isolation.
+ * We take whatever steps are needed to avoid being interrupted later:
+ * drain the lru pages, stop the scheduler tick, etc. More
+ * functionality may be added here later to avoid other types of
+ * interrupts from other kernel subsystems. This, however, may still not
+ * have the intended result, so the rest of the system takes into account
+ * the possibility of receiving an interrupt and isolation breaking later.
+ *
+ * If we can't enable task isolation, we update the syscall return
+ * value with an appropriate error.
+ *
+ * This, however, does not enable isolation yet as far as the low-level
+ * flags are concerned, so if interrupts are enabled it is still
+ * possible for the task to be interrupted. The call to
+ * task_isolation_exit_to_user_mode() finally enables task isolation
+ * after this function has set FLAG_LL_TASK_ISOLATION_REQUEST.
+ */
+void task_isolation_start(void)
+{
+ int error;
+ unsigned long flags;
+
+ /*
+ * We should only be called in STATE_NORMAL (isolation
+ * disabled), on our way out of the kernel from the prctl()
+ * that turned it on. If we are exiting from the kernel in
+ * another state, it means we made it back into the kernel
+ * without disabling task isolation, and we should investigate
+ * how (and in any case disable task isolation at this
+ * point). We are clearly not on the path back from the
+ * prctl() so we don't touch the syscall return value.
+ */
+ if (WARN_ON_ONCE(current->task_isolation_state != STATE_NORMAL)) {
+ stop_isolation(current);
+ /* Report the problem */
+ pr_task_isol_emerg(smp_processor_id(),
+ "Isolation start requested while not in the normal state\n");
+ return;
+ }
+
+ /*
+ * Must be affinitized to a single core with task isolation possible.
+ * In principle this could be remotely modified between the prctl()
+ * and the return to userspace, so we have to check it here.
+ */
+ if (current->nr_cpus_allowed != 1 ||
+ !is_isolation_cpu(smp_processor_id())) {
+ error = -EINVAL;
+ goto error;
+ }
+
+ /* If the vmstat delayed work is not canceled, we have to try again. */
+ if (!vmstat_idle()) {
+ error = -EAGAIN;
+ goto error;
+ }
+
+ /* Try to stop the dynamic tick. */
+ error = try_stop_full_tick();
+ if (error)
+ goto error;
+
+ /* Drain the pagevecs to avoid unnecessary IPI flushes later. */
+ lru_add_drain();
+
+ local_irq_save(flags);
+
+ /* Record isolated task IDs and name */
+ record_curr_isolated_task();
+
+ current->task_isolation_state = STATE_ISOLATED;
+ this_cpu_write(ll_isol_flags, FLAG_LL_TASK_ISOLATION_REQUEST);
+ /* Barrier to synchronize with reading of flags */
+ smp_mb();
+ local_irq_restore(flags);
+ return;
+
+error:
+ stop_isolation(current);
+ syscall_set_return_value(current, current_pt_regs(), error, 0);
+}
+
+/* Stop task isolation on the remote task and send it a signal. */
+static void send_isolation_signal(struct task_struct *task)
+{
+ int flags = task->task_isolation_flags;
+ kernel_siginfo_t info = {
+ .si_signo = PR_TASK_ISOLATION_GET_SIG(flags) ?: SIGKILL,
+ };
+
+ if ((flags & PR_TASK_ISOLATION_ENABLE) == 0)
+ return;
+
+ stop_isolation(task);
+ send_sig_info(info.si_signo, &info, task);
+}
+
+/* Only a few syscalls are valid once we are in task isolation mode. */
+static bool is_acceptable_syscall(int syscall)
+{
+ /* No need to incur an isolation signal if we are just exiting. */
+ if (syscall == __NR_exit || syscall == __NR_exit_group)
+ return true;
+
+ /* Check to see if it's the prctl for isolation. */
+ if (syscall == __NR_prctl) {
+ unsigned long arg[SYSCALL_MAX_ARGS];
+
+ syscall_get_arguments(current, current_pt_regs(), arg);
+ if (arg[0] == PR_TASK_ISOLATION)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * This routine is called from syscall entry, prevents most syscalls
+ * from executing, and if needed raises a signal to notify the process.
+ *
+ * Note that we have to stop isolation before we even print a message
+ * here, since otherwise we might end up reporting an interrupt due to
+ * kicking the printk handling code, rather than reporting the true
+ * cause of the interrupt.
+ *
+ * The message is not suppressed by previous remotely triggered
+ * messages.
+ */
+int task_isolation_syscall(int syscall)
+{
+ struct task_struct *task = current;
+
+ /*
+ * Check if by any chance syscall is being processed from
+ * isolated state without a call to
+ * task_isolation_kernel_enter() happening on entry
+ */
+ if ((this_cpu_read(ll_isol_flags) & FLAG_LL_TASK_ISOLATION)) {
+ /*
+ * If it did happen, call the function here, to
+ * prevent further problems from running in
+ * un-synchronized state
+ */
+ task_isolation_kernel_enter();
+ /* Report the problem */
+ pr_task_isol_emerg(smp_processor_id(),
+ "Isolation breaking was not detected on syscall\n");
+ }
+ /*
+ * Clear low-level isolation flags to avoid triggering
+ * a signal on return to userspace
+ */
+ this_cpu_write(ll_isol_flags, 0);
+
+ if (is_acceptable_syscall(syscall)) {
+ stop_isolation(task);
+ return 0;
+ }
+
+ send_isolation_signal(task);
+
+ pr_task_isol_warn(smp_processor_id(),
+ "task_isolation lost due to syscall %d\n",
+ syscall);
+ debug_dump_stack();
+
+ syscall_set_return_value(task, current_pt_regs(), -ERESTARTNOINTR, -1);
+ return -1;
+}
+
+/*
+ * This routine is called from the code responsible for exiting to user mode,
+ * before the point when thread flags are checked for pending work.
+ * It must be called whenever the current task is isolated; the
+ * TIF_TASK_ISOLATION flag is what ensures this call is made.
+ */
+void task_isolation_before_pending_work_check(void)
+{
+ int cpu;
+ unsigned long flags;
+
+ /* Handle isolation breaking */
+ if ((current->task_isolation_state != STATE_NORMAL)
+ && ((this_cpu_read(ll_isol_flags) & FLAG_LL_TASK_ISOLATION_BROKEN)
+ != 0)) {
+ /*
+ * Clear low-level isolation flags to avoid triggering
+ * a signal again
+ */
+ this_cpu_write(ll_isol_flags, 0);
+ /* Send signal to notify about isolation breaking */
+ send_isolation_signal(current);
+ /* Produce generic message about lost isolation */
+ pr_task_isol_warn(smp_processor_id(), "task_isolation lost\n");
+ debug_dump_stack();
+ }
+
+ /*
+ * If this CPU is in the map of CPUs with cleanup pending,
+ * remove it from the map and call cleanup
+ */
+ spin_lock_irqsave(&task_isolation_cleanup_lock, flags);
+
+ cpu = smp_processor_id();
+
+ if (cpumask_test_cpu(cpu, task_isolation_cleanup_map)) {
+ cpumask_clear_cpu(cpu, task_isolation_cleanup_map);
+ spin_unlock_irqrestore(&task_isolation_cleanup_lock, flags);
+ task_isolation_cpu_cleanup();
+ } else {
+ spin_unlock_irqrestore(&task_isolation_cleanup_lock, flags);
+ }
+}
+
+/*
+ * Called before we wake up a task that has a signal to process.
+ * Needs to be done to handle interrupts that trigger signals, which
+ * we don't catch with task_isolation_interrupt() hooks.
+ *
+ * This message is also suppressed if there was already a remotely
+ * caused message about the same isolation breaking event.
+ */
+void _task_isolation_signal(struct task_struct *task)
+{
+ struct isol_task_desc *desc;
+ int ind, cpu;
+ bool do_warn = (task->task_isolation_state == STATE_ISOLATED);
+
+ cpu = task_cpu(task);
+ desc = &per_cpu(isol_task_descs, cpu);
+ ind = atomic_read(&desc->curr_index) & 1;
+ if (desc->warned[ind])
+ do_warn = false;
+
+ stop_isolation(task);
+
+ if (do_warn) {
+ pr_warn("isolation: %s/%d/%d (cpu %d): task_isolation lost due to signal\n",
+ task->comm, task->tgid, task->pid, cpu);
+ debug_dump_stack();
+ }
+}
+
+/*
+ * Set CPUs currently running isolated tasks in CPU mask.
+ */
+void task_isolation_cpumask(struct cpumask *mask)
+{
+ int cpu;
+
+ if (task_isolation_map == NULL)
+ return;
+
+ /* Barrier to synchronize with writing task isolation flags */
+ smp_rmb();
+ for_each_cpu(cpu, task_isolation_map)
+ if (task_isolation_on_cpu(cpu))
+ cpumask_set_cpu(cpu, mask);
+}
+
+/*
+ * Clear CPUs currently running isolated tasks in CPU mask.
+ */
+void task_isolation_clear_cpumask(struct cpumask *mask)
+{
+ int cpu;
+
+ if (task_isolation_map == NULL)
+ return;
+
+ /* Barrier to synchronize with writing task isolation flags */
+ smp_rmb();
+ for_each_cpu(cpu, task_isolation_map)
+ if (task_isolation_on_cpu(cpu))
+ cpumask_clear_cpu(cpu, mask);
+}
+
+/*
+ * Cleanup procedure, run on the CPU that is leaving isolation.
+ * The call to this procedure may be delayed.
+ */
+void task_isolation_cpu_cleanup(void)
+{
+ kick_hrtimer();
+}
@@ -46,6 +46,7 @@
#include <linux/livepatch.h>
#include <linux/cgroup.h>
#include <linux/audit.h>
+#include <linux/isolation.h>
#define CREATE_TRACE_POINTS
#include <trace/events/signal.h>
@@ -759,6 +760,7 @@ static int dequeue_synchronous_signal(kernel_siginfo_t *info)
*/
void signal_wake_up_state(struct task_struct *t, unsigned int state)
{
+ task_isolation_signal(t);
set_tsk_thread_flag(t, TIF_SIGPENDING);
/*
* TASK_WAKEKILL also means wake it up in the stopped/traced/killable
@@ -42,6 +42,7 @@
#include <linux/syscore_ops.h>
#include <linux/version.h>
#include <linux/ctype.h>
+#include <linux/isolation.h>
#include <linux/compat.h>
#include <linux/syscalls.h>
@@ -2530,6 +2531,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
break;
+ case PR_TASK_ISOLATION:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = task_isolation_request(arg2);
+ break;
default:
error = -EINVAL;
break;
@@ -30,6 +30,7 @@
#include <linux/syscalls.h>
#include <linux/interrupt.h>
#include <linux/tick.h>
+#include <linux/isolation.h>
#include <linux/err.h>
#include <linux/debugobjects.h>
#include <linux/sched/signal.h>
@@ -720,6 +721,19 @@ static void retrigger_next_event(void *arg)
raw_spin_unlock(&base->lock);
}
+#ifdef CONFIG_TASK_ISOLATION
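+/*
+ * Reprogram the local clock event device on a CPU that is leaving
+ * isolation, since clock_was_set() skips isolated CPUs.
+ */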
+void kick_hrtimer(void)
+{
+ unsigned long flags;
+
+ preempt_disable();
+ local_irq_save(flags);
+ retrigger_next_event(NULL);
+ local_irq_restore(flags);
+ preempt_enable();
+}
+#endif
+
/*
* Switch to high resolution mode
*/
@@ -867,8 +881,21 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
void clock_was_set(void)
{
#ifdef CONFIG_HIGH_RES_TIMERS
+#ifdef CONFIG_TASK_ISOLATION
+ struct cpumask mask;
+
+ cpumask_clear(&mask);
+ task_isolation_cpumask(&mask);
+ cpumask_complement(&mask, &mask);
+ /*
+ * Retrigger the CPU local events everywhere except CPUs
+ * running isolated tasks.
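+ * Isolated CPUs will reprogram their clock event device via
+ * kick_hrtimer() when they leave isolation.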
+ */
+ on_each_cpu_mask(&mask, retrigger_next_event, NULL, 1);
+#else
/* Retrigger the CPU local events everywhere */
on_each_cpu(retrigger_next_event, NULL, 1);
+#endif
#endif
timerfd_clock_was_set();
}
@@ -896,6 +896,24 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
#endif
}
+#ifdef CONFIG_TASK_ISOLATION
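+/*
+ * Single attempt to stop the tick on this CPU, returning an error code
+ * instead of deferring the work, so that task_isolation_start() can
+ * report failure to userspace.
+ */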
+int try_stop_full_tick(void)
+{
+ int cpu = smp_processor_id();
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+ /* For an unstable clock, we should return a permanent error code. */
+ if (atomic_read(&tick_dep_mask) & TICK_DEP_MASK_CLOCK_UNSTABLE)
+ return -EINVAL;
+
+ if (!can_stop_full_tick(cpu, ts))
+ return -EAGAIN;
+
+ tick_nohz_stop_sched_tick(ts, cpu);
+ return 0;
+}
+#endif
+
static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
{
/*