On most small systems where user space is tightly controlled, the realtime
scheduling class can often be dispensed with to reduce the kernel footprint.
Let's make it configurable.

The code that makes explicit assumptions about actual RT mutexes (i.e. where
the compatibility wrappers don't make sense) has to be made conditional on
CONFIG_SCHED_RT. This is also done here.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
---
 include/linux/init_task.h      | 15 +++++++++++----
 include/linux/rtmutex.h        |  2 +-
 include/linux/sched.h          |  2 ++
 include/linux/sched/rt.h       | 10 ++++++++--
 init/Kconfig                   | 14 +++++++++++---
 kernel/locking/Makefile        |  3 +++
 kernel/locking/locktorture.c   |  4 ++--
 kernel/sched/Makefile          |  4 ++--
 kernel/sched/core.c            | 33 +++++++++++++++++++++++++------
 kernel/sched/deadline.c        |  4 ++++
 kernel/sched/debug.c           |  2 ++
 kernel/sched/sched.h           | 33 ++++++++++++++++++++++++---------
 kernel/sched/stop_task.c       |  4 +++-
 kernel/sysctl.c                |  4 +++-
 kernel/time/posix-cpu-timers.c |  8 +++++---
 lib/Kconfig.debug              |  2 +-
 16 files changed, 109 insertions(+), 35 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -225,6 +225,16 @@ extern struct cred init_cred;
#define INIT_TASK_SECURITY
#endif
+#ifdef CONFIG_SCHED_RT
+#define INIT_TASK_RT(tsk) \
+ .rt = { \
+ .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \
+ .time_slice = RR_TIMESLICE, \
+ },
+#else
+#define INIT_TASK_RT(tsk)
+#endif
+
/*
* INIT_TASK is used to set up the first task table, touch at
* your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -250,10 +260,7 @@ extern struct cred init_cred;
.se = { \
.group_node = LIST_HEAD_INIT(tsk.se.group_node), \
}, \
- .rt = { \
- .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \
- .time_slice = RR_TIMESLICE, \
- }, \
+ INIT_TASK_RT(tsk) \
.tasks = LIST_HEAD_INIT(tsk.tasks), \
INIT_PUSHABLE_TASKS(tsk) \
INIT_CGROUP_SCHED(tsk) \
diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -12,7 +12,7 @@
#ifndef __LINUX_RT_MUTEX_H
#define __LINUX_RT_MUTEX_H
-#if 1 /* will become def CONFIG_SCHED_RT later */
+#ifdef CONFIG_SCHED_RT
#include <linux/linkage.h>
#include <linux/rbtree.h>
diff --git a/include/linux/sched.h b/include/linux/sched.h
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -518,7 +518,9 @@ struct task_struct {
const struct sched_class *sched_class;
struct sched_entity se;
+#ifdef CONFIG_SCHED_RT
struct sched_rt_entity rt;
+#endif
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
#endif
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -7,7 +7,7 @@ struct task_struct;
static inline int rt_prio(int prio)
{
- if (unlikely(prio < MAX_RT_PRIO))
+ if (IS_ENABLED(CONFIG_SCHED_RT) && unlikely(prio < MAX_RT_PRIO))
return 1;
return 0;
}
@@ -17,7 +17,7 @@ static inline int rt_task(struct task_struct *p)
return rt_prio(p->prio);
}
-#ifdef CONFIG_RT_MUTEXES
+#if defined(CONFIG_RT_MUTEXES) && defined(CONFIG_SCHED_RT)
/*
* Must hold either p->pi_lock or task_rq(p)->lock.
*/
@@ -52,4 +52,10 @@ extern void normalize_rt_tasks(void);
*/
#define RR_TIMESLICE (100 * HZ / 1000)
+#ifdef CONFIG_SCHED_RT
+#define rt_timeout(tsk) (tsk)->rt.timeout
+#else
+#define rt_timeout(tsk) 0
+#endif
+
#endif /* _LINUX_SCHED_RT_H */
diff --git a/init/Kconfig b/init/Kconfig
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -687,7 +687,7 @@ config TREE_RCU_TRACE
config RCU_BOOST
bool "Enable RCU priority boosting"
- depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
+ depends on SCHED_RT && RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
default n
help
This option boosts the priority of preempted RCU readers that
@@ -1090,7 +1090,7 @@ config CFS_BANDWIDTH
config RT_GROUP_SCHED
bool "Group scheduling for SCHED_RR/FIFO"
- depends on CGROUP_SCHED
+ depends on CGROUP_SCHED && SCHED_RT
default n
help
This feature lets you explicitly allocate real CPU bandwidth
@@ -1303,6 +1303,14 @@ config SCHED_AUTOGROUP
desktop applications. Task group autogeneration is currently based
upon task session.
+config SCHED_RT
+ bool "Real Time Task Scheduling" if EXPERT
+ default y
+ help
+ This adds the sched_rt scheduling class to the kernel, providing
+ support for the SCHED_FIFO and SCHED_RR policies. You might want
+ to disable this to reduce the kernel size. If unsure, say Y.
+
config SCHED_DL
bool "Deadline Task Scheduling" if EXPERT
default y
@@ -1640,7 +1648,7 @@ config FUTEX
config FUTEX_PI
bool
- depends on FUTEX && RT_MUTEXES
+ depends on FUTEX && RT_MUTEXES && SCHED_RT
default y
config HAVE_FUTEX_CMPXCHG
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -20,8 +20,11 @@ obj-$(CONFIG_SMP) += spinlock.o
obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
+# Compatibility wrappers in rtmutex.h are used when CONFIG_SCHED_RT=n
+ifeq ($(CONFIG_SCHED_RT),y)
obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
+endif
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -446,7 +446,7 @@ static struct lock_torture_ops ww_mutex_lock_ops = {
.name = "ww_mutex_lock"
};
-#ifdef CONFIG_RT_MUTEXES
+#if defined(CONFIG_RT_MUTEXES) && defined(CONFIG_SCHED_RT)
static DEFINE_RT_MUTEX(torture_rtmutex);
static int torture_rtmutex_lock(void) __acquires(torture_rtmutex)
@@ -872,7 +872,7 @@ static int __init lock_torture_init(void)
&rw_lock_ops, &rw_lock_irq_ops,
&mutex_lock_ops,
&ww_mutex_lock_ops,
-#ifdef CONFIG_RT_MUTEXES
+#if defined(CONFIG_RT_MUTEXES) && defined(CONFIG_SCHED_RT)
&rtmutex_lock_ops,
#endif
&rwsem_lock_ops,
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,8 +16,8 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
obj-y += core.o loadavg.o clock.o cputime.o
-obj-y += wait.o swait.o completion.o idle.o
-obj-y += idle_task.o fair.o rt.o
+obj-y += wait.o swait.o completion.o idle.o idle_task.o fair.o
+obj-$(CONFIG_SCHED_RT) += rt.o
obj-$(CONFIG_SCHED_DL) += deadline.o $(if $(CONFIG_SMP),cpudeadline.o)
obj-$(CONFIG_SMP) += cpupri.o topology.o stop_task.o
obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -642,8 +642,8 @@ bool sched_can_stop_tick(struct rq *rq)
* If there are more than one RR tasks, we need the tick to effect the
* actual RR behaviour.
*/
- if (rq->rt.rr_nr_running) {
- if (rq->rt.rr_nr_running == 1)
+ if (rt_rr_nr_running(rq)) {
+ if (rt_rr_nr_running(rq) == 1)
return true;
else
return false;
@@ -653,7 +653,7 @@ bool sched_can_stop_tick(struct rq *rq)
* If there's no RR tasks, but FIFO tasks, we can skip the tick, no
* forced preemption between FIFO tasks.
*/
- fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
+ fifo_nr_running = rt_rt_nr_running(rq) - rt_rr_nr_running(rq);
if (fifo_nr_running)
return true;
@@ -1584,7 +1584,7 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
* Reset it back to a normal scheduling class so that
* it can die in pieces.
*/
- old_stop->sched_class = &rt_sched_class;
+ old_stop->sched_class = stop_sched_class.next;
}
}
@@ -2180,11 +2180,13 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
__dl_clear_params(p);
#endif
+#ifdef CONFIG_SCHED_RT
INIT_LIST_HEAD(&p->rt.run_list);
p->rt.timeout = 0;
p->rt.time_slice = sched_rr_timeslice;
p->rt.on_rq = 0;
p->rt.on_list = 0;
+#endif
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -3595,7 +3597,7 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
}
EXPORT_SYMBOL(default_wake_function);
-#ifdef CONFIG_RT_MUTEXES
+#if defined(CONFIG_RT_MUTEXES) && defined(CONFIG_SCHED_RT)
static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
{
@@ -3994,6 +3996,23 @@ static int __sched_setscheduler(struct task_struct *p,
/* May grab non-irq protected spin_locks: */
BUG_ON(in_interrupt());
+
+ /*
+ * When the RT scheduling class is disabled, let's make sure kernel threads
+ * wanting RT still get the lowest nice value to give them the highest
+ * available priority rather than simply returning an error. Obviously
+ * we can't test rt_policy() here as it is always false in that case.
+ */
+ if (!IS_ENABLED(CONFIG_SCHED_RT) && !user &&
+ (policy == SCHED_FIFO || policy == SCHED_RR)) {
+ static const struct sched_attr k_attr = {
+ .sched_policy = SCHED_NORMAL,
+ .sched_nice = MIN_NICE,
+ };
+ attr = &k_attr;
+ policy = SCHED_NORMAL;
+ }
+
recheck:
/* Double check policy once rq lock held: */
if (policy < 0) {
@@ -5857,7 +5876,10 @@ void __init sched_init(void)
rq->calc_load_active = 0;
rq->calc_load_update = jiffies + LOAD_FREQ;
init_cfs_rq(&rq->cfs);
+#ifdef CONFIG_SCHED_RT
init_rt_rq(&rq->rt);
+ rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
+#endif
init_dl_rq(&rq->dl);
#ifdef CONFIG_FAIR_GROUP_SCHED
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
@@ -5886,7 +5908,6 @@ void __init sched_init(void)
init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
- rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1825,7 +1825,11 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
}
const struct sched_class dl_sched_class = {
+#ifdef CONFIG_SCHED_RT
.next = &rt_sched_class,
+#else
+ .next = &fair_sched_class,
+#endif
.enqueue_task = enqueue_task_dl,
.dequeue_task = dequeue_task_dl,
.yield_task = yield_task_dl,
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -645,7 +645,9 @@ do { \
spin_lock_irqsave(&sched_debug_lock, flags);
print_cfs_stats(m, cpu);
+#ifdef CONFIG_SCHED_RT
print_rt_stats(m, cpu);
+#endif
#ifdef CONFIG_SCHED_DL
print_dl_stats(m, cpu);
#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -132,7 +132,8 @@ static inline int fair_policy(int policy)
static inline int rt_policy(int policy)
{
- return policy == SCHED_FIFO || policy == SCHED_RR;
+ return IS_ENABLED(CONFIG_SCHED_RT) &&
+ (policy == SCHED_FIFO || policy == SCHED_RR);
}
static inline int dl_policy(int policy)
@@ -398,8 +399,6 @@ extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
-extern void free_rt_sched_group(struct task_group *tg);
-extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
struct sched_rt_entity *rt_se, int cpu,
struct sched_rt_entity *parent);
@@ -518,7 +517,7 @@ struct cfs_rq {
static inline int rt_bandwidth_enabled(void)
{
- return sysctl_sched_rt_runtime >= 0;
+ return IS_ENABLED(CONFIG_SCHED_RT) && sysctl_sched_rt_runtime >= 0;
}
/* RT IPI pull logic requires IRQ_WORK */
@@ -567,6 +566,24 @@ struct rt_rq {
#endif
};
+extern struct rt_bandwidth def_rt_bandwidth;
+
+#ifdef CONFIG_SCHED_RT
+#define rt_rr_nr_running(rq) (rq)->rt.rr_nr_running
+#define rt_rt_nr_running(rq) (rq)->rt.rt_nr_running
+extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
+extern void free_rt_sched_group(struct task_group *tg);
+extern void init_sched_rt_class(void);
+extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
+#else
+#define rt_rr_nr_running(rq) 0
+#define rt_rt_nr_running(rq) 0
+#define alloc_rt_sched_group(...) 1
+#define free_rt_sched_group(tg) do { } while (0)
+#define init_sched_rt_class() do { } while (0)
+#define init_rt_bandwidth(...) do { } while (0)
+#endif
+
/* Deadline class' related fields in a runqueue */
struct dl_rq {
/* runqueue is an rbtree, ordered by deadline */
@@ -1470,8 +1487,10 @@ static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
#define sched_class_highest (&stop_sched_class)
#elif defined(CONFIG_SCHED_DL)
#define sched_class_highest (&dl_sched_class)
-#else
+#elif defined(CONFIG_SCHED_RT)
#define sched_class_highest (&rt_sched_class)
+#else
+#define sched_class_highest (&fair_sched_class)
#endif
#define for_each_class(class) \
@@ -1524,15 +1543,11 @@ extern void sysrq_sched_debug_show(void);
extern void sched_init_granularity(void);
extern void update_max_interval(void);
-extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
extern void resched_curr(struct rq *rq);
extern void resched_cpu(int cpu);
-extern struct rt_bandwidth def_rt_bandwidth;
-extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
-
extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
unsigned long to_ratio(u64 period, u64 runtime);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -112,8 +112,10 @@ static void update_curr_stop(struct rq *rq)
const struct sched_class stop_sched_class = {
#ifdef CONFIG_SCHED_DL
.next = &dl_sched_class,
-#else
+#elif defined(CONFIG_SCHED_RT)
.next = &rt_sched_class,
+#else
+ .next = &fair_sched_class,
#endif
.enqueue_task = enqueue_task_stop,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -401,6 +401,7 @@ static struct ctl_table kern_table[] = {
},
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */
+#ifdef CONFIG_SCHED_RT
{
.procname = "sched_rt_period_us",
.data = &sysctl_sched_rt_period,
@@ -422,6 +423,7 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = sched_rr_handler,
},
+#endif
#ifdef CONFIG_SCHED_AUTOGROUP
{
.procname = "sched_autogroup_enabled",
@@ -1071,7 +1073,7 @@ static struct ctl_table kern_table[] = {
.extra1 = &neg_one,
},
#endif
-#ifdef CONFIG_RT_MUTEXES
+#if defined(CONFIG_RT_MUTEXES) && defined(CONFIG_SCHED_RT)
{
.procname = "max_lock_depth",
.data = &max_lock_depth,
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -4,6 +4,7 @@
#include <linux/sched/signal.h>
#include <linux/sched/cputime.h>
+#include <linux/sched/rt.h>
#include <linux/posix-timers.h>
#include <linux/errno.h>
#include <linux/math64.h>
@@ -814,13 +815,14 @@ static void check_thread_timers(struct task_struct *tsk,
/*
* Check for the special case thread timers.
*/
- soft = READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
+ soft = IS_ENABLED(CONFIG_SCHED_RT) ?
+ READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur) : RLIM_INFINITY;
if (soft != RLIM_INFINITY) {
unsigned long hard =
READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
if (hard != RLIM_INFINITY &&
- tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
+ rt_timeout(tsk) > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
/*
* At the hard limit, we just die.
* No need to calculate anything else now.
@@ -832,7 +834,7 @@ static void check_thread_timers(struct task_struct *tsk,
__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
return;
}
- if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
+ if (rt_timeout(tsk) > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
/*
* At the soft limit, send a SIGXCPU every second.
*/
@@ -1008,7 +1008,7 @@ menu "Lock Debugging (spinlocks, mutexes, etc...)"
config DEBUG_RT_MUTEXES
bool "RT Mutex debugging, deadlock detection"
- depends on DEBUG_KERNEL && RT_MUTEXES
+ depends on DEBUG_KERNEL && RT_MUTEXES && SCHED_RT
help
This allows rt mutex semantics violations and rt mutex related
deadlocks (lockups) to be detected and reported automatically.
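
A note on the CONFIG_SCHED_RT=n side: the rtmutex.h compatibility
wrappers referred to in the kernel/locking/Makefile comment above are
not part of this diff. As a rough sketch of the idea (names and details
hypothetical, not the actual implementation), they can map the rt_mutex
API onto plain mutexes, since priority inheritance has no effect once
the RT scheduling class is gone:

	/* Sketch only, not part of this patch: rt_mutex as plain mutex. */
	#include <linux/mutex.h>

	#define rt_mutex		mutex
	#define DEFINE_RT_MUTEX(m)	DEFINE_MUTEX(m)
	#define rt_mutex_init(m)	mutex_init(m)
	#define rt_mutex_lock(m)	mutex_lock(m)
	#define rt_mutex_trylock(m)	mutex_trylock(m)
	#define rt_mutex_unlock(m)	mutex_unlock(m)
	#define rt_mutex_is_locked(m)	mutex_is_locked(m)

Code that genuinely depends on priority inheritance (locktorture's
rtmutex torture type, FUTEX_PI, RCU_BOOST) cannot use such wrappers
meaningfully, which is why it is compiled out above instead.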
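
The IS_ENABLED() form used in rt_prio(), rt_policy() and
rt_bandwidth_enabled() keeps the code visible to the compiler: the
predicate constant-folds to 0 when CONFIG_SCHED_RT=n, so the RT-only
branches are eliminated while still being parsed and type-checked in
every configuration. A made-up illustration of the pattern (CONFIG_FOO
is fictitious):

	#include <linux/kconfig.h>

	static inline int foo_limit_active(int limit)
	{
		/* Folds to "return 0" when CONFIG_FOO=n, yet the whole
		 * expression is still type-checked in both configs. */
		return IS_ENABLED(CONFIG_FOO) && limit >= 0;
	}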

-- 
2.9.4