On most small systems where user space is tightly controlled, the realtime
scheduling class can often be dispensed with to reduce the kernel footprint.
Let's make it configurable.

The code that makes explicit assumptions about actual RT mutexes (i.e. where
the compatibility wrappers don't make sense) has to be made conditional on
CONFIG_SCHED_RT. This is also done here.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
---
 include/linux/init_task.h      | 15 +++++++++++----
 include/linux/rtmutex.h        |  2 +-
 include/linux/sched.h          |  2 ++
 include/linux/sched/rt.h       | 10 ++++++++--
 init/Kconfig                   | 14 +++++++++++---
 kernel/locking/Makefile        |  3 +++
 kernel/locking/locktorture.c   |  4 ++--
 kernel/sched/Makefile          |  4 ++--
 kernel/sched/core.c            | 33 +++++++++++++++++++++++++------
 kernel/sched/deadline.c        |  4 ++++
 kernel/sched/debug.c           |  2 ++
 kernel/sched/sched.h           | 33 ++++++++++++++++++++++++---------
 kernel/sched/stop_task.c       |  4 +++-
 kernel/sysctl.c                |  4 +++-
 kernel/time/posix-cpu-timers.c |  8 +++++---
 lib/Kconfig.debug              |  2 +-
 16 files changed, 109 insertions(+), 35 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -225,6 +225,16 @@ extern struct cred init_cred;
#define INIT_TASK_SECURITY
#endif
+#ifdef CONFIG_SCHED_RT
+#define INIT_TASK_RT(tsk) \
+ .rt = { \
+ .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \
+ .time_slice = RR_TIMESLICE, \
+ },
+#else
+#define INIT_TASK_RT(tsk)
+#endif
+
/*
* INIT_TASK is used to set up the first task table, touch at
* your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -250,10 +260,7 @@ extern struct cred init_cred;
.se = { \
.group_node = LIST_HEAD_INIT(tsk.se.group_node), \
}, \
- .rt = { \
- .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \
- .time_slice = RR_TIMESLICE, \
- }, \
+ INIT_TASK_RT(tsk) \
.tasks = LIST_HEAD_INIT(tsk.tasks), \
INIT_PUSHABLE_TASKS(tsk) \
INIT_CGROUP_SCHED(tsk) \
diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -12,7 +12,7 @@
#ifndef __LINUX_RT_MUTEX_H
#define __LINUX_RT_MUTEX_H
-#if 1 /* will become def CONFIG_SCHED_RT later */
+#ifdef CONFIG_SCHED_RT
#include <linux/linkage.h>
#include <linux/rbtree.h>
diff --git a/include/linux/sched.h b/include/linux/sched.h
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -518,7 +518,9 @@ struct task_struct {
const struct sched_class *sched_class;
struct sched_entity se;
+#ifdef CONFIG_SCHED_RT
struct sched_rt_entity rt;
+#endif
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
#endif
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -7,7 +7,7 @@ struct task_struct;
static inline int rt_prio(int prio)
{
- if (unlikely(prio < MAX_RT_PRIO))
+ if (IS_ENABLED(CONFIG_SCHED_RT) && unlikely(prio < MAX_RT_PRIO))
return 1;
return 0;
}
@@ -17,7 +17,7 @@ static inline int rt_task(struct task_struct *p)
return rt_prio(p->prio);
}
-#ifdef CONFIG_RT_MUTEXES
+#if defined(CONFIG_RT_MUTEXES) && defined(CONFIG_SCHED_RT)
/*
* Must hold either p->pi_lock or task_rq(p)->lock.
*/
@@ -52,4 +52,10 @@ extern void normalize_rt_tasks(void);
*/
#define RR_TIMESLICE (100 * HZ / 1000)
+#ifdef CONFIG_SCHED_RT
+#define rt_timeout(tsk) (tsk)->rt.timeout
+#else
+#define rt_timeout(tsk) 0
+#endif
+
#endif /* _LINUX_SCHED_RT_H */
diff --git a/init/Kconfig b/init/Kconfig
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -687,7 +687,7 @@ config TREE_RCU_TRACE
config RCU_BOOST
bool "Enable RCU priority boosting"
- depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
+ depends on SCHED_RT && RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
default n
help
This option boosts the priority of preempted RCU readers that
@@ -1090,7 +1090,7 @@ config CFS_BANDWIDTH
config RT_GROUP_SCHED
bool "Group scheduling for SCHED_RR/FIFO"
- depends on CGROUP_SCHED
+ depends on CGROUP_SCHED && SCHED_RT
default n
help
This feature lets you explicitly allocate real CPU bandwidth
@@ -1303,6 +1303,14 @@ config SCHED_AUTOGROUP
desktop applications. Task group autogeneration is currently based
upon task session.
+config SCHED_RT
+ bool "Real Time Task Scheduling" if EXPERT
+ default y
+ help
+ This adds the sched_rt scheduling class to the kernel, providing
+ support for the SCHED_FIFO and SCHED_RR policies. You might want
+ to disable this to reduce the kernel size. If unsure, say Y.
+
config SCHED_DL
bool "Deadline Task Scheduling" if EXPERT
default y
@@ -1640,7 +1648,7 @@ config FUTEX
config FUTEX_PI
bool
- depends on FUTEX && RT_MUTEXES
+ depends on FUTEX && RT_MUTEXES && SCHED_RT
default y
config HAVE_FUTEX_CMPXCHG
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -20,8 +20,11 @@ obj-$(CONFIG_SMP) += spinlock.o
obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
+# Compatibility wrappers in rtmutex.h are used when CONFIG_SCHED_RT=n
+ifeq ($(CONFIG_SCHED_RT),y)
obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
+endif
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -446,7 +446,7 @@ static struct lock_torture_ops ww_mutex_lock_ops = {
.name = "ww_mutex_lock"
};
-#ifdef CONFIG_RT_MUTEXES
+#if defined(CONFIG_RT_MUTEXES) && defined(CONFIG_SCHED_RT)
static DEFINE_RT_MUTEX(torture_rtmutex);
static int torture_rtmutex_lock(void) __acquires(torture_rtmutex)
@@ -872,7 +872,7 @@ static int __init lock_torture_init(void)
&rw_lock_ops, &rw_lock_irq_ops,
&mutex_lock_ops,
&ww_mutex_lock_ops,
-#ifdef CONFIG_RT_MUTEXES
+#if defined(CONFIG_RT_MUTEXES) && defined(CONFIG_SCHED_RT)
&rtmutex_lock_ops,
#endif
&rwsem_lock_ops,
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,8 +16,8 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
obj-y += core.o loadavg.o clock.o cputime.o
-obj-y += wait.o swait.o completion.o idle.o
-obj-y += idle_task.o fair.o rt.o
+obj-y += wait.o swait.o completion.o idle.o idle_task.o fair.o
+obj-$(CONFIG_SCHED_RT) += rt.o
obj-$(CONFIG_SCHED_DL) += deadline.o $(if $(CONFIG_SMP),cpudeadline.o)
obj-$(CONFIG_SMP) += cpupri.o topology.o stop_task.o
obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -642,8 +642,8 @@ bool sched_can_stop_tick(struct rq *rq)
* If there are more than one RR tasks, we need the tick to effect the
* actual RR behaviour.
*/
- if (rq->rt.rr_nr_running) {
- if (rq->rt.rr_nr_running == 1)
+ if (rt_rr_nr_running(rq)) {
+ if (rt_rr_nr_running(rq) == 1)
return true;
else
return false;
@@ -653,7 +653,7 @@ bool sched_can_stop_tick(struct rq *rq)
* If there's no RR tasks, but FIFO tasks, we can skip the tick, no
* forced preemption between FIFO tasks.
*/
- fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
+ fifo_nr_running = rt_rt_nr_running(rq) - rt_rr_nr_running(rq);
if (fifo_nr_running)
return true;
@@ -1584,7 +1584,7 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
* Reset it back to a normal scheduling class so that
* it can die in pieces.
*/
- old_stop->sched_class = &rt_sched_class;
+ old_stop->sched_class = stop_sched_class.next;
}
}
@@ -2180,11 +2180,13 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
__dl_clear_params(p);
#endif
+#ifdef CONFIG_SCHED_RT
INIT_LIST_HEAD(&p->rt.run_list);
p->rt.timeout = 0;
p->rt.time_slice = sched_rr_timeslice;
p->rt.on_rq = 0;
p->rt.on_list = 0;
+#endif
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -3595,7 +3597,7 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
}
EXPORT_SYMBOL(default_wake_function);
-#ifdef CONFIG_RT_MUTEXES
+#if defined(CONFIG_RT_MUTEXES) && defined(CONFIG_SCHED_RT)
static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
{
@@ -3994,6 +3996,23 @@ static int __sched_setscheduler(struct task_struct *p,
/* May grab non-irq protected spin_locks: */
BUG_ON(in_interrupt());
+
+ /*
+ * When the RT scheduling class is disabled, let's make sure kernel threads
+ * wanting RT still get the lowest nice value to give them the highest
+ * available priority rather than simply returning an error. Obviously
+ * we can't test rt_policy() here as it is always false in that case.
+ */
+ if (!IS_ENABLED(CONFIG_SCHED_RT) && !user &&
+ (policy == SCHED_FIFO || policy == SCHED_RR)) {
+ static const struct sched_attr k_attr = {
+ .sched_policy = SCHED_NORMAL,
+ .sched_nice = MIN_NICE,
+ };
+ attr = &k_attr;
+ policy = SCHED_NORMAL;
+ }
+
recheck:
/* Double check policy once rq lock held: */
if (policy < 0) {
@@ -5857,7 +5876,10 @@ void __init sched_init(void)
rq->calc_load_active = 0;
rq->calc_load_update = jiffies + LOAD_FREQ;
init_cfs_rq(&rq->cfs);
+#ifdef CONFIG_SCHED_RT
init_rt_rq(&rq->rt);
+ rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
+#endif
init_dl_rq(&rq->dl);
#ifdef CONFIG_FAIR_GROUP_SCHED
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
@@ -5886,7 +5908,6 @@ void __init sched_init(void)
init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
- rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1825,7 +1825,11 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
}
const struct sched_class dl_sched_class = {
+#ifdef CONFIG_SCHED_RT
.next = &rt_sched_class,
+#else
+ .next = &fair_sched_class,
+#endif
.enqueue_task = enqueue_task_dl,
.dequeue_task = dequeue_task_dl,
.yield_task = yield_task_dl,
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -645,7 +645,9 @@ do { \
spin_lock_irqsave(&sched_debug_lock, flags);
print_cfs_stats(m, cpu);
+#ifdef CONFIG_SCHED_RT
print_rt_stats(m, cpu);
+#endif
#ifdef CONFIG_SCHED_DL
print_dl_stats(m, cpu);
#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -132,7 +132,8 @@ static inline int fair_policy(int policy)
static inline int rt_policy(int policy)
{
- return policy == SCHED_FIFO || policy == SCHED_RR;
+ return IS_ENABLED(CONFIG_SCHED_RT) &&
+ (policy == SCHED_FIFO || policy == SCHED_RR);
}
static inline int dl_policy(int policy)
@@ -398,8 +399,6 @@ extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
-extern void free_rt_sched_group(struct task_group *tg);
-extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
struct sched_rt_entity *rt_se, int cpu,
struct sched_rt_entity *parent);
@@ -518,7 +517,7 @@ struct cfs_rq {
static inline int rt_bandwidth_enabled(void)
{
- return sysctl_sched_rt_runtime >= 0;
+ return IS_ENABLED(CONFIG_SCHED_RT) && sysctl_sched_rt_runtime >= 0;
}
/* RT IPI pull logic requires IRQ_WORK */
@@ -567,6 +566,24 @@ struct rt_rq {
#endif
};
+extern struct rt_bandwidth def_rt_bandwidth;
+
+#ifdef CONFIG_SCHED_RT
+#define rt_rr_nr_running(rq) (rq)->rt.rr_nr_running
+#define rt_rt_nr_running(rq) (rq)->rt.rt_nr_running
+extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
+extern void free_rt_sched_group(struct task_group *tg);
+extern void init_sched_rt_class(void);
+extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
+#else
+#define rt_rr_nr_running(rq) 0
+#define rt_rt_nr_running(rq) 0
+#define alloc_rt_sched_group(...) 1
+#define free_rt_sched_group(tg) do { } while (0)
+#define init_sched_rt_class() do { } while (0)
+#define init_rt_bandwidth(...) do { } while (0)
+#endif
+
/* Deadline class' related fields in a runqueue */
struct dl_rq {
/* runqueue is an rbtree, ordered by deadline */
@@ -1470,8 +1487,10 @@ static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
#define sched_class_highest (&stop_sched_class)
#elif defined(CONFIG_SCHED_DL)
#define sched_class_highest (&dl_sched_class)
-#else
+#elif defined(CONFIG_SCHED_RT)
#define sched_class_highest (&rt_sched_class)
+#else
+#define sched_class_highest (&fair_sched_class)
#endif
#define for_each_class(class) \
@@ -1524,15 +1543,11 @@ extern void sysrq_sched_debug_show(void);
extern void sched_init_granularity(void);
extern void update_max_interval(void);
-extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
extern void resched_curr(struct rq *rq);
extern void resched_cpu(int cpu);
-extern struct rt_bandwidth def_rt_bandwidth;
-extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
-
extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
unsigned long to_ratio(u64 period, u64 runtime);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -112,8 +112,10 @@ static void update_curr_stop(struct rq *rq)
const struct sched_class stop_sched_class = {
#ifdef CONFIG_SCHED_DL
.next = &dl_sched_class,
-#else
+#elif defined(CONFIG_SCHED_RT)
.next = &rt_sched_class,
+#else
+ .next = &fair_sched_class,
#endif
.enqueue_task = enqueue_task_stop,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -401,6 +401,7 @@ static struct ctl_table kern_table[] = {
},
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */
+#ifdef CONFIG_SCHED_RT
{
.procname = "sched_rt_period_us",
.data = &sysctl_sched_rt_period,
@@ -422,6 +423,7 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = sched_rr_handler,
},
+#endif
#ifdef CONFIG_SCHED_AUTOGROUP
{
.procname = "sched_autogroup_enabled",
@@ -1071,7 +1073,7 @@ static struct ctl_table kern_table[] = {
.extra1 = &neg_one,
},
#endif
-#ifdef CONFIG_RT_MUTEXES
+#if defined(CONFIG_RT_MUTEXES) && defined(CONFIG_SCHED_RT)
{
.procname = "max_lock_depth",
.data = &max_lock_depth,
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -4,6 +4,7 @@
#include <linux/sched/signal.h>
#include <linux/sched/cputime.h>
+#include <linux/sched/rt.h>
#include <linux/posix-timers.h>
#include <linux/errno.h>
#include <linux/math64.h>
@@ -814,13 +815,14 @@ static void check_thread_timers(struct task_struct *tsk,
/*
* Check for the special case thread timers.
*/
- soft = READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
+ soft = IS_ENABLED(CONFIG_SCHED_RT) ?
+ READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur) : RLIM_INFINITY;
if (soft != RLIM_INFINITY) {
unsigned long hard =
READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
if (hard != RLIM_INFINITY &&
- tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
+ rt_timeout(tsk) > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
/*
* At the hard limit, we just die.
* No need to calculate anything else now.
@@ -832,7 +834,7 @@ static void check_thread_timers(struct task_struct *tsk,
__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
return;
}
- if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
+ if (rt_timeout(tsk) > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
/*
* At the soft limit, send a SIGXCPU every second.
*/
@@ -1008,7 +1008,7 @@ menu "Lock Debugging (spinlocks, mutexes, etc...)"
config DEBUG_RT_MUTEXES
bool "RT Mutex debugging, deadlock detection"
- depends on DEBUG_KERNEL && RT_MUTEXES
+ depends on DEBUG_KERNEL && RT_MUTEXES && SCHED_RT
help
This allows rt mutex semantics violations and rt mutex related
deadlocks (lockups) to be detected and reported automatically.
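
A note on the CONFIG_SCHED_RT=n side: the rtmutex.h compatibility
wrappers referred to in the kernel/locking/Makefile comment above are
not part of this diff. As a rough sketch of the idea (names and details
hypothetical, not the actual implementation), they can map the rt_mutex
API onto plain mutexes, since priority inheritance has no effect once
the RT scheduling class is gone:

	/* Sketch only, not part of this patch: rt_mutex as plain mutex. */
	#include <linux/mutex.h>

	#define rt_mutex		mutex
	#define DEFINE_RT_MUTEX(m)	DEFINE_MUTEX(m)
	#define rt_mutex_init(m)	mutex_init(m)
	#define rt_mutex_lock(m)	mutex_lock(m)
	#define rt_mutex_trylock(m)	mutex_trylock(m)
	#define rt_mutex_unlock(m)	mutex_unlock(m)
	#define rt_mutex_is_locked(m)	mutex_is_locked(m)

Code that genuinely depends on priority inheritance (locktorture's
rtmutex torture type, FUTEX_PI, RCU_BOOST) cannot use such wrappers
meaningfully, which is why it is compiled out above instead.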
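
The IS_ENABLED() form used in rt_prio(), rt_policy() and
rt_bandwidth_enabled() keeps the code visible to the compiler: the
predicate constant-folds to 0 when CONFIG_SCHED_RT=n, so the RT-only
branches are eliminated while still being parsed and type-checked in
every configuration. A made-up illustration of the pattern (CONFIG_FOO
is fictitious):

	#include <linux/kconfig.h>

	static inline int foo_limit_active(int limit)
	{
		/* Folds to "return 0" when CONFIG_FOO=n, yet the whole
		 * expression is still type-checked in both configs. */
		return IS_ENABLED(CONFIG_FOO) && limit >= 0;
	}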

-- 
2.9.4