diff mbox series

[RFC/RFT,1/2] sched/core: Check and schedule ksoftirq

Message ID 20221215184300.1592872-2-srinivas.pandruvada@linux.intel.com
State New
Headers show
Series [RFC/RFT,1/2] sched/core: Check and schedule ksoftirq | expand

Commit Message

Srinivas Pandruvada Dec. 15, 2022, 6:42 p.m. UTC
From: Frederic Weisbecker <frederic@kernel.org>

It is possible that ksoftirqd is woken up and racing with forced
idle injection via play_idle_precise() called from a FIFO thread.

Here it is possible that need_resched() is false but ksoftirqd
is pending. This will result in printing of warning:

[147777.095484] NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #08!!!

Also the processing can be delayed upto the duration of forced idle.

For example in the following traces (with added traces for debug):

<idle>-0 [004] 207.087742: sched_wakeup: ksoftirqd/4
<idle>-0 [004] 207.087743: sched_switch: swapper/4:0 [120] R ==> idle_inject/4:37 [49]

idle_inject/4-37 [004] 207.087744: play_idle_enter
idle_inject/4-37 [004] 207.087746: bputs: can_stop_idle_tick.isra.16: soft irq pending

(Printed warning at this point
NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #08!!!

...
...
idle_inject/4-37 [004] 207.112127: play_idle_exit: state=24000000 cpu_id=4

idle_inject/4-37 [004] 207.112134: sched_switch: idle_inject/4:37 [49] S ==> ksoftirqd/4:39 [120]
ksoftirqd/4-39   [004] 207.112142: softirq_entry: vec=3 [action=NET_RX]
ksoftirqd/4-39   [004] 207.112213: irq_handler_entry: irq=142 name=enp0s

Delayed by :  207.112142 - 207.087742 = 0.024400
(250HZ configuration) and the idle duration for play_idle_precise is 24ms.
So, the softirq entered after the expiry of forced idle time.

The solution here is to give chance to run softirq. Currently do_idle()
checks for need_resched() and break. But here in addition to need_resched()
also added a check for task_is_running(__this_cpu_read(ksoftirqd)).
So if the ksoftirq is pending break the do_idle() loop and give 1 jiffie
to process via schedule_timeout(1). If the required idle time is
not expired, start hrtimer again and call do_idle() again.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
---
 kernel/sched/idle.c | 55 ++++++++++++++++++++++++++++++---------------
 1 file changed, 37 insertions(+), 18 deletions(-)
diff mbox series

Patch

diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index f26ab2675f7d..77d6168288cf 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -49,6 +49,11 @@  static int __init cpu_idle_nopoll_setup(char *__unused)
 __setup("hlt", cpu_idle_nopoll_setup);
 #endif
 
+static bool __cpuidle idle_loop_should_break(void)
+{
+	return need_resched() || task_is_running(__this_cpu_read(ksoftirqd));
+}
+
 static noinline int __cpuidle cpu_idle_poll(void)
 {
 	trace_cpu_idle(0, smp_processor_id());
@@ -56,7 +61,7 @@  static noinline int __cpuidle cpu_idle_poll(void)
 	ct_idle_enter();
 	local_irq_enable();
 
-	while (!tif_need_resched() &&
+	while (!idle_loop_should_break() &&
 	       (cpu_idle_force_poll || tick_check_broadcast_expired()))
 		cpu_relax();
 
@@ -174,7 +179,7 @@  static void cpuidle_idle_call(void)
 	 * Check if the idle task must be rescheduled. If it is the
 	 * case, exit the function after re-enabling the local irq.
 	 */
-	if (need_resched()) {
+	if (idle_loop_should_break()) {
 		local_irq_enable();
 		return;
 	}
@@ -276,7 +281,7 @@  static void do_idle(void)
 	__current_set_polling();
 	tick_nohz_idle_enter();
 
-	while (!need_resched()) {
+	while (!idle_loop_should_break()) {
 		rmb();
 
 		local_irq_disable();
@@ -358,6 +363,7 @@  static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer)
 void play_idle_precise(u64 duration_ns, u64 latency_ns)
 {
 	struct idle_timer it;
+	ktime_t remaining;
 
 	/*
 	 * Only FIFO tasks can disable the tick since they don't need the forced
@@ -370,25 +376,38 @@  void play_idle_precise(u64 duration_ns, u64 latency_ns)
 	WARN_ON_ONCE(!duration_ns);
 	WARN_ON_ONCE(current->mm);
 
-	rcu_sleep_check();
-	preempt_disable();
-	current->flags |= PF_IDLE;
-	cpuidle_use_deepest_state(latency_ns);
+	remaining = ns_to_ktime(duration_ns);
 
-	it.done = 0;
-	hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
-	it.timer.function = idle_inject_timer_fn;
-	hrtimer_start(&it.timer, ns_to_ktime(duration_ns),
-		      HRTIMER_MODE_REL_PINNED_HARD);
+	do {
+		rcu_sleep_check();
+		preempt_disable();
+		current->flags |= PF_IDLE;
+		cpuidle_use_deepest_state(latency_ns);
 
-	while (!READ_ONCE(it.done))
-		do_idle();
+		it.done = 0;
+		hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
+		it.timer.function = idle_inject_timer_fn;
+		hrtimer_start(&it.timer, remaining, HRTIMER_MODE_REL_PINNED_HARD);
+
+		while (!READ_ONCE(it.done) && !task_is_running(__this_cpu_read(ksoftirqd)))
+			do_idle();
 
-	cpuidle_use_deepest_state(0);
-	current->flags &= ~PF_IDLE;
+		remaining = hrtimer_get_remaining(&it.timer);
 
-	preempt_fold_need_resched();
-	preempt_enable();
+		hrtimer_cancel(&it.timer);
+
+		cpuidle_use_deepest_state(0);
+		current->flags &= ~PF_IDLE;
+
+		preempt_fold_need_resched();
+		preempt_enable();
+
+		/* Give ksoftirqd 1 jiffy to get a chance to start its job */
+		if (!READ_ONCE(it.done) && task_is_running(__this_cpu_read(ksoftirqd))) {
+			__set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_timeout(1);
+		}
+	} while (!READ_ONCE(it.done));
 }
 EXPORT_SYMBOL_GPL(play_idle_precise);