
[RT,1/3] eventfd: Make signal recursion protection a task bit

Message ID 20220107021502.1121597-2-lgoncalv@redhat.com
State New
Series Linux v5.10.90-rt61-rc1

Commit Message

Luis Claudio R. Goncalves Jan. 7, 2022, 2:15 a.m. UTC
From: Thomas Gleixner <tglx@linutronix.de>

v5.10.90-rt61-rc1 stable review patch.
If anyone has any objections, please let me know.

-----------


Upstream commit b542e383d8c005f06a131e2b40d5889b812f19c6

The recursion protection for eventfd_signal() is based on a per CPU
variable and relies on the !RT semantics of spin_lock_irqsave() for
protecting this per CPU variable. On RT kernels spin_lock_irqsave() neither
disables preemption nor interrupts which allows the spin lock held section
to be preempted. If the preempting task invokes eventfd_signal() as well,
then the recursion warning triggers.

Paolo suggested to protect the per CPU variable with a local lock, but
that's heavyweight and actually not necessary. The goal of this protection
is to prevent the task stack from overflowing, which can be achieved with a
per task recursion protection as well.

Replace the per CPU variable with a per task bit similar to other recursion
protection bits like task_struct::in_page_owner. This works on both !RT and
RT kernels and removes as a side effect the extra per CPU storage.

No functional change for !RT kernels.

Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Link: https://lore.kernel.org/r/87wnp9idso.ffs@tglx
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 fs/aio.c                |  2 +-
 fs/eventfd.c            | 12 +++++-------
 include/linux/eventfd.h | 11 +++++------
 include/linux/sched.h   |  4 ++++
 4 files changed, 15 insertions(+), 14 deletions(-)
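
Editorial note on the calling convention this helper supports: a wakeup path that may already be running inside eventfd_signal() checks eventfd_signal_allowed() first and defers the eventfd_signal() call to process context when it returns false — the intent of the aio.c hunk below (see the comments for a note on its missing negation). A minimal illustrative sketch of that pattern, assuming a hypothetical foo_req consumer that is not part of this series:

#include <linux/kernel.h>
#include <linux/eventfd.h>
#include <linux/workqueue.h>

/* Hypothetical consumer, for illustration only. */
struct foo_req {
	struct eventfd_ctx	*eventfd;
	struct work_struct	work;
};

static void foo_deferred_signal(struct work_struct *work)
{
	struct foo_req *req = container_of(work, struct foo_req, work);

	/* Process context: no longer nested inside eventfd_signal(). */
	eventfd_signal(req->eventfd, 1);
}

static void foo_complete(struct foo_req *req)
{
	if (eventfd_signal_allowed()) {
		/* Not called from within eventfd_signal(); signal directly. */
		eventfd_signal(req->eventfd, 1);
	} else {
		/* Nested wakeup: defer to a workqueue to avoid recursing. */
		INIT_WORK(&req->work, foo_deferred_signal);
		schedule_work(&req->work);
	}
}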

Comments

Daniel Vacek Feb. 3, 2022, 8:59 a.m. UTC | #1
Hi,

OT: Btw for some reason you're sending a copy to
"stable-rt@vger.kernel.org"@redhat.com.

On Fri, Jan 7, 2022 at 8:04 PM Luis Claudio R. Goncalves
<lgoncalv@redhat.com> wrote:
>
> From: Thomas Gleixner <tglx@linutronix.de>
>
> v5.10.90-rt61-rc1 stable review patch.
> If anyone has any objections, please let me know.
>
> -----------
>
>
> Upstream commit b542e383d8c005f06a131e2b40d5889b812f19c6
>
> The recursion protection for eventfd_signal() is based on a per CPU
> variable and relies on the !RT semantics of spin_lock_irqsave() for
> protecting this per CPU variable. On RT kernels spin_lock_irqsave() neither
> disables preemption nor interrupts which allows the spin lock held section
> to be preempted. If the preempting task invokes eventfd_signal() as well,
> then the recursion warning triggers.
>
> Paolo suggested to protect the per CPU variable with a local lock, but
> that's heavyweight and actually not necessary. The goal of this protection
> is to prevent the task stack from overflowing, which can be achieved with a
> per task recursion protection as well.
>
> Replace the per CPU variable with a per task bit similar to other recursion
> protection bits like task_struct::in_page_owner. This works on both !RT and
> RT kernels and removes as a side effect the extra per CPU storage.
>
> No functional change for !RT kernels.
>
> Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> Tested-by: Daniel Bristot de Oliveira <bristot@redhat.com>
> Acked-by: Jason Wang <jasowang@redhat.com>
> Cc: Al Viro <viro@zeniv.linux.org.uk>
> Link: https://lore.kernel.org/r/87wnp9idso.ffs@tglx
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> ---
>  fs/aio.c                |  2 +-
>  fs/eventfd.c            | 12 +++++-------
>  include/linux/eventfd.h | 11 +++++------
>  include/linux/sched.h   |  4 ++++
>  4 files changed, 15 insertions(+), 14 deletions(-)
>
> diff --git a/fs/aio.c b/fs/aio.c
> index c72b2c51b446c..f7d47c9ff6deb 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -1761,7 +1761,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
>                 list_del_init(&req->wait.entry);
>                 list_del(&iocb->ki_list);
>                 iocb->ki_res.res = mangle_poll(mask);
> -               if (iocb->ki_eventfd && eventfd_signal_count()) {
> +               if (iocb->ki_eventfd && eventfd_signal_allowed()) {

IIUC, the logic changed and this should read
!eventfd_signal_allowed(). Or am I missing something here?

Actually checking other stable branches and upstream, this looks to be
a typo only in v5.10.90-rt61-rc1

--nX




> [ rest of quoted patch snipped; the full diff appears in the Patch section below ]
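
To spell out the semantic inversion Daniel is pointing at: the old helper returns true when a signal is already in flight (meaning the caller must defer), while the new helper returns true when signalling is safe, so every converted call site needs a negation. The two helpers as they read before and after the patch, reproduced from the eventfd.h hunk for readability:

/* Before: true means "eventfd_signal() already in progress, defer". */
static inline bool eventfd_signal_count(void)
{
	return this_cpu_read(eventfd_wake_count);
}

/* After: true means "safe to call eventfd_signal() now". */
static inline bool eventfd_signal_allowed(void)
{
	return !current->in_eventfd_signal;
}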
Daniel Vacek Feb. 3, 2022, 9:16 a.m. UTC | #2
Actually I see it was fixed later with
4b3749865374899e115aa8c48681709b086fe6d3 so I guess it should be
included as well.

--nX

On Thu, Feb 3, 2022 at 9:59 AM Daniel Vacek <neelx.g@gmail.com> wrote:
>
> > -               if (iocb->ki_eventfd && eventfd_signal_count()) {
> > +               if (iocb->ki_eventfd && eventfd_signal_allowed()) {
>
> IIUC, the logic changed and this should read
> !eventfd_signal_allowed(). Or am I missing something here?
>
> Actually checking other stable branches and upstream, this looks to be
> a typo only in v5.10.90-rt61-rc1
>
> --nX
>
> [ rest of quoted mail snipped ]
Luis Claudio R. Goncalves Feb. 3, 2022, 9:42 p.m. UTC | #3
On Thu, Feb 3, 2022 at 6:17 AM Daniel Vacek <neelx.g@gmail.com> wrote:
>
> Actually I see it was fixed later with
> 4b3749865374899e115aa8c48681709b086fe6d3 so I guess it should be
> included as well.

Thank you for the feedback!
I will proceed with the changes and testing, and will probably release
the kernel on Monday.

Also, thank you for the note about the wrong email address!

Best regards,
Luis
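
For reference, the follow-up commit Daniel points to (upstream 4b3749865374) restores the intended logic by negating the converted check in aio_poll_wake(); per his reading, the condition should end up as sketched below (illustration only, not part of this series):

-		if (iocb->ki_eventfd && eventfd_signal_allowed()) {
+		if (iocb->ki_eventfd && !eventfd_signal_allowed()) {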

Patch

diff --git a/fs/aio.c b/fs/aio.c
index c72b2c51b446c..f7d47c9ff6deb 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1761,7 +1761,7 @@  static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 		list_del_init(&req->wait.entry);
 		list_del(&iocb->ki_list);
 		iocb->ki_res.res = mangle_poll(mask);
-		if (iocb->ki_eventfd && eventfd_signal_count()) {
+		if (iocb->ki_eventfd && eventfd_signal_allowed()) {
 			iocb = NULL;
 			INIT_WORK(&req->work, aio_poll_put_work);
 			schedule_work(&req->work);
diff --git a/fs/eventfd.c b/fs/eventfd.c
index df466ef81dddf..9035ca60bfcf3 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -25,8 +25,6 @@ 
 #include <linux/idr.h>
 #include <linux/uio.h>
 
-DEFINE_PER_CPU(int, eventfd_wake_count);
-
 static DEFINE_IDA(eventfd_ida);
 
 struct eventfd_ctx {
@@ -67,21 +65,21 @@  __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
 	 * Deadlock or stack overflow issues can happen if we recurse here
 	 * through waitqueue wakeup handlers. If the caller users potentially
 	 * nested waitqueues with custom wakeup handlers, then it should
-	 * check eventfd_signal_count() before calling this function. If
-	 * it returns true, the eventfd_signal() call should be deferred to a
+	 * check eventfd_signal_allowed() before calling this function. If
+	 * it returns false, the eventfd_signal() call should be deferred to a
 	 * safe context.
 	 */
-	if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
+	if (WARN_ON_ONCE(current->in_eventfd_signal))
 		return 0;
 
 	spin_lock_irqsave(&ctx->wqh.lock, flags);
-	this_cpu_inc(eventfd_wake_count);
+	current->in_eventfd_signal = 1;
 	if (ULLONG_MAX - ctx->count < n)
 		n = ULLONG_MAX - ctx->count;
 	ctx->count += n;
 	if (waitqueue_active(&ctx->wqh))
 		wake_up_locked_poll(&ctx->wqh, EPOLLIN);
-	this_cpu_dec(eventfd_wake_count);
+	current->in_eventfd_signal = 0;
 	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
 
 	return n;
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index dc4fd8a6644dd..836b4c021a0a4 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -14,6 +14,7 @@ 
 #include <linux/err.h>
 #include <linux/percpu-defs.h>
 #include <linux/percpu.h>
+#include <linux/sched.h>
 
 /*
  * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
@@ -42,11 +43,9 @@  __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n);
 int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
 				  __u64 *cnt);
 
-DECLARE_PER_CPU(int, eventfd_wake_count);
-
-static inline bool eventfd_signal_count(void)
+static inline bool eventfd_signal_allowed(void)
 {
-	return this_cpu_read(eventfd_wake_count);
+	return !current->in_eventfd_signal;
 }
 
 #else /* CONFIG_EVENTFD */
@@ -77,9 +76,9 @@  static inline int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx,
 	return -ENOSYS;
 }
 
-static inline bool eventfd_signal_count(void)
+static inline bool eventfd_signal_allowed(void)
 {
-	return false;
+	return true;
 }
 
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 409a24036952c..29e6ff1af1df9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -852,6 +852,10 @@  struct task_struct {
 	/* Stalled due to lack of memory */
 	unsigned			in_memstall:1;
 #endif
+#ifdef CONFIG_EVENTFD
+	/* Recursion prevention for eventfd_signal() */
+	unsigned			in_eventfd_signal:1;
+#endif
 
 	unsigned long			atomic_flags; /* Flags requiring atomic access. */