
[RFC,v3,01/11] eventfd: track eventfd_signal() recursion depth separately in different cases

Message ID 20210119045920.447-2-xieyongji@bytedance.com
State New
Series Introduce VDUSE - vDPA Device in Userspace

Commit Message

Yongji Xie Jan. 19, 2021, 4:59 a.m. UTC
Now we have a global percpu counter to limit the recursion depth
of eventfd_signal(), which can avoid both deadlock and stack
overflow. But in the stack overflow case, it should be OK to
increase the recursion depth if needed. So add a percpu counter
to eventfd_ctx to limit the recursion depth for the deadlock case.
It is then safe to increase the global percpu counter later.

Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
---
 fs/aio.c                |  3 ++-
 fs/eventfd.c            | 20 +++++++++++++++++++-
 include/linux/eventfd.h |  5 +----
 3 files changed, 22 insertions(+), 6 deletions(-)
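
A minimal sketch of how a caller might use the reworked check, modeled
on the fs/aio.c hunk below (the foo_* names and completion path are
illustrative, not part of this patch):

	/* Sketch only: defer the signal when eventfd_signal_count()
	 * says we are recursing on this ctx or nested too deep globally.
	 */
	struct foo_req {
		struct eventfd_ctx *efd;
		struct work_struct work;
	};

	/* Runs later in process context, where signaling is safe. */
	static void foo_signal_work(struct work_struct *work)
	{
		struct foo_req *req = container_of(work, struct foo_req, work);

		eventfd_signal(req->efd, 1);
	}

	static void foo_complete(struct foo_req *req)
	{
		if (eventfd_signal_count(req->efd)) {
			INIT_WORK(&req->work, foo_signal_work);
			schedule_work(&req->work);
		} else {
			eventfd_signal(req->efd, 1);
		}
	}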

Comments

Jason Wang Jan. 20, 2021, 4:24 a.m. UTC | #1
On 2021/1/19 12:59 PM, Xie Yongji wrote:
> Now we have a global percpu counter to limit the recursion depth
> of eventfd_signal(), which can avoid both deadlock and stack
> overflow. But in the stack overflow case, it should be OK to
> increase the recursion depth if needed. So add a percpu counter
> to eventfd_ctx to limit the recursion depth for the deadlock case.
> It is then safe to increase the global percpu counter later.


I wonder whether it's worth introducing a percpu counter for each
eventfd.

How about simply checking whether eventfd_signal_count() is greater
than 2?

Thanks


>
> Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
> ---
>   fs/aio.c                |  3 ++-
>   fs/eventfd.c            | 20 +++++++++++++++++++-
>   include/linux/eventfd.h |  5 +----
>   3 files changed, 22 insertions(+), 6 deletions(-)
>
> diff --git a/fs/aio.c b/fs/aio.c
> index 1f32da13d39e..5d82903161f5 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -1698,7 +1698,8 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
>   		list_del(&iocb->ki_list);
>   		iocb->ki_res.res = mangle_poll(mask);
>   		req->done = true;
> -		if (iocb->ki_eventfd && eventfd_signal_count()) {
> +		if (iocb->ki_eventfd &&
> +			eventfd_signal_count(iocb->ki_eventfd)) {
>   			iocb = NULL;
>   			INIT_WORK(&req->work, aio_poll_put_work);
>   			schedule_work(&req->work);
> diff --git a/fs/eventfd.c b/fs/eventfd.c
> index e265b6dd4f34..2df24f9bada3 100644
> --- a/fs/eventfd.c
> +++ b/fs/eventfd.c
> @@ -25,6 +25,8 @@
>   #include <linux/idr.h>
>   #include <linux/uio.h>
>
> +#define EVENTFD_WAKE_DEPTH 0
> +
>   DEFINE_PER_CPU(int, eventfd_wake_count);
>
>   static DEFINE_IDA(eventfd_ida);
> @@ -42,9 +44,17 @@ struct eventfd_ctx {
>   	 */
>   	__u64 count;
>   	unsigned int flags;
> +	int __percpu *wake_count;
>   	int id;
>   };
>
> +bool eventfd_signal_count(struct eventfd_ctx *ctx)
> +{
> +	return (this_cpu_read(*ctx->wake_count) ||
> +		this_cpu_read(eventfd_wake_count) > EVENTFD_WAKE_DEPTH);
> +}
> +EXPORT_SYMBOL_GPL(eventfd_signal_count);
> +
>   /**
>    * eventfd_signal - Adds @n to the eventfd counter.
>    * @ctx: [in] Pointer to the eventfd context.
> @@ -71,17 +81,19 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
>   	 * it returns true, the eventfd_signal() call should be deferred to a
>   	 * safe context.
>   	 */
> -	if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
> +	if (WARN_ON_ONCE(eventfd_signal_count(ctx)))
>   		return 0;
>
>   	spin_lock_irqsave(&ctx->wqh.lock, flags);
>   	this_cpu_inc(eventfd_wake_count);
> +	this_cpu_inc(*ctx->wake_count);
>   	if (ULLONG_MAX - ctx->count < n)
>   		n = ULLONG_MAX - ctx->count;
>   	ctx->count += n;
>   	if (waitqueue_active(&ctx->wqh))
>   		wake_up_locked_poll(&ctx->wqh, EPOLLIN);
>   	this_cpu_dec(eventfd_wake_count);
> +	this_cpu_dec(*ctx->wake_count);
>   	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
>
>   	return n;
> @@ -92,6 +104,7 @@ static void eventfd_free_ctx(struct eventfd_ctx *ctx)
>   {
>   	if (ctx->id >= 0)
>   		ida_simple_remove(&eventfd_ida, ctx->id);
> +	free_percpu(ctx->wake_count);
>   	kfree(ctx);
>   }
>
> @@ -423,6 +436,11 @@ static int do_eventfd(unsigned int count, int flags)
>
>   	kref_init(&ctx->kref);
>   	init_waitqueue_head(&ctx->wqh);
> +	ctx->wake_count = alloc_percpu(int);
> +	if (!ctx->wake_count) {
> +		kfree(ctx);
> +		return -ENOMEM;
> +	}
>   	ctx->count = count;
>   	ctx->flags = flags;
>   	ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);
> diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
> index fa0a524baed0..1a11ebbd74a9 100644
> --- a/include/linux/eventfd.h
> +++ b/include/linux/eventfd.h
> @@ -45,10 +45,7 @@ void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt);
>
>   DECLARE_PER_CPU(int, eventfd_wake_count);
>
> -static inline bool eventfd_signal_count(void)
> -{
> -	return this_cpu_read(eventfd_wake_count);
> -}
> +bool eventfd_signal_count(struct eventfd_ctx *ctx);
>
>   #else /* CONFIG_EVENTFD */
>
Yongji Xie Jan. 20, 2021, 6:52 a.m. UTC | #2
On Wed, Jan 20, 2021 at 12:24 PM Jason Wang <jasowang@redhat.com> wrote:
>
> On 2021/1/19 12:59 PM, Xie Yongji wrote:
> > Now we have a global percpu counter to limit the recursion depth
> > of eventfd_signal(), which can avoid both deadlock and stack
> > overflow. But in the stack overflow case, it should be OK to
> > increase the recursion depth if needed. So add a percpu counter
> > to eventfd_ctx to limit the recursion depth for the deadlock case.
> > It is then safe to increase the global percpu counter later.
>
> I wonder whether it's worth introducing a percpu counter for each
> eventfd.
>
> How about simply checking whether eventfd_signal_count() is greater
> than 2?
>

It can't avoid the deadlock that way. So we need a percpu counter
for each eventfd to limit the recursion depth for the deadlock case,
while the global percpu counter avoids stack overflow.

Thanks,
Yongji
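
To make the two cases concrete (a sketch in the thread's arrow
notation; A and B are distinct eventfds):

  bounded by the global counter, so it can be relaxed later:
    eventfd_signal(A) -> A's wakeup callback -> eventfd_signal(B)

  always refused by B's per-ctx counter:
    eventfd_signal(B) -> B's wakeup callback -> eventfd_signal(B)

The second chain would otherwise try to re-take B's ctx->wqh.lock,
which is already held, and deadlock; that is what the per-eventfd
counter guards against.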
Jason Wang Jan. 27, 2021, 3:37 a.m. UTC | #3
On 2021/1/20 2:52 PM, Yongji Xie wrote:
> On Wed, Jan 20, 2021 at 12:24 PM Jason Wang <jasowang@redhat.com> wrote:
>>
>> On 2021/1/19 12:59 PM, Xie Yongji wrote:
>>> Now we have a global percpu counter to limit the recursion depth
>>> of eventfd_signal(), which can avoid both deadlock and stack
>>> overflow. But in the stack overflow case, it should be OK to
>>> increase the recursion depth if needed. So add a percpu counter
>>> to eventfd_ctx to limit the recursion depth for the deadlock case.
>>> It is then safe to increase the global percpu counter later.
>>
>> I wonder whether it's worth introducing a percpu counter for each
>> eventfd.
>>
>> How about simply checking whether eventfd_signal_count() is greater
>> than 2?
>>
> It can't avoid the deadlock that way.


I may be missing something, but the count is there to avoid recursive
eventfd calls. So what VDUSE suffers from is e.g. the interrupt
injection path:

userspace writes IRQFD -> vq->cb() -> another IRQFD.

It looks like increasing EVENTFD_WAKE_DEPTH should be sufficient?

Thanks


> So we need a percpu counter
> for each eventfd to limit the recursion depth for the deadlock case,
> while the global percpu counter avoids stack overflow.
>
> Thanks,
> Yongji
>
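
For reference, a minimal sketch of this suggestion against the patch's
fs/eventfd.c (the value 1 is illustrative; it would permit one nested
cross-eventfd signal, as in the injection path above):

	-#define EVENTFD_WAKE_DEPTH 0
	+#define EVENTFD_WAKE_DEPTH 1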
Yongji Xie Jan. 27, 2021, 9:11 a.m. UTC | #4
On Wed, Jan 27, 2021 at 11:38 AM Jason Wang <jasowang@redhat.com> wrote:
>
> On 2021/1/20 2:52 PM, Yongji Xie wrote:
> > On Wed, Jan 20, 2021 at 12:24 PM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> On 2021/1/19 12:59 PM, Xie Yongji wrote:
> >>> Now we have a global percpu counter to limit the recursion depth
> >>> of eventfd_signal(), which can avoid both deadlock and stack
> >>> overflow. But in the stack overflow case, it should be OK to
> >>> increase the recursion depth if needed. So add a percpu counter
> >>> to eventfd_ctx to limit the recursion depth for the deadlock case.
> >>> It is then safe to increase the global percpu counter later.
> >>
> >> I wonder whether it's worth introducing a percpu counter for each
> >> eventfd.
> >>
> >> How about simply checking whether eventfd_signal_count() is greater
> >> than 2?
> >>
> > It can't avoid the deadlock that way.
>
> I may be missing something, but the count is there to avoid recursive
> eventfd calls. So what VDUSE suffers from is e.g. the interrupt
> injection path:
>
> userspace writes IRQFD -> vq->cb() -> another IRQFD.
>
> It looks like increasing EVENTFD_WAKE_DEPTH should be sufficient?
>

Actually I meant the deadlock described in commit f0b493e ("io_uring:
prevent potential eventfd recursion on poll"). Just increasing
EVENTFD_WAKE_DEPTH would break that bug fix.

Thanks,
Yongji
Jason Wang Jan. 28, 2021, 3:04 a.m. UTC | #5
On 2021/1/27 5:11 PM, Yongji Xie wrote:
> On Wed, Jan 27, 2021 at 11:38 AM Jason Wang <jasowang@redhat.com> wrote:
>>
>> On 2021/1/20 2:52 PM, Yongji Xie wrote:
>>> On Wed, Jan 20, 2021 at 12:24 PM Jason Wang <jasowang@redhat.com> wrote:
>>>> On 2021/1/19 12:59 PM, Xie Yongji wrote:
>>>>> Now we have a global percpu counter to limit the recursion depth
>>>>> of eventfd_signal(), which can avoid both deadlock and stack
>>>>> overflow. But in the stack overflow case, it should be OK to
>>>>> increase the recursion depth if needed. So add a percpu counter
>>>>> to eventfd_ctx to limit the recursion depth for the deadlock case.
>>>>> It is then safe to increase the global percpu counter later.
>>>> I wonder whether it's worth introducing a percpu counter for each
>>>> eventfd.
>>>>
>>>> How about simply checking whether eventfd_signal_count() is greater
>>>> than 2?
>>>>
>>> It can't avoid the deadlock that way.
>>
>> I may be missing something, but the count is there to avoid recursive
>> eventfd calls. So what VDUSE suffers from is e.g. the interrupt
>> injection path:
>>
>> userspace writes IRQFD -> vq->cb() -> another IRQFD.
>>
>> It looks like increasing EVENTFD_WAKE_DEPTH should be sufficient?
>>
> Actually I meant the deadlock described in commit f0b493e ("io_uring:
> prevent potential eventfd recursion on poll"). Just increasing
> EVENTFD_WAKE_DEPTH would break that bug fix.


OK, so can we do something similar to what that commit did (using
async stuff like a workqueue)?

Thanks


>
> Thanks,
> Yongji
>
Jens Axboe Jan. 28, 2021, 3:08 a.m. UTC | #6
On 1/27/21 8:04 PM, Jason Wang wrote:
>
> On 2021/1/27 5:11 PM, Yongji Xie wrote:
>> On Wed, Jan 27, 2021 at 11:38 AM Jason Wang <jasowang@redhat.com> wrote:
>>>
>>> On 2021/1/20 2:52 PM, Yongji Xie wrote:
>>>> On Wed, Jan 20, 2021 at 12:24 PM Jason Wang <jasowang@redhat.com> wrote:
>>>>> On 2021/1/19 12:59 PM, Xie Yongji wrote:
>>>>>> Now we have a global percpu counter to limit the recursion depth
>>>>>> of eventfd_signal(), which can avoid both deadlock and stack
>>>>>> overflow. But in the stack overflow case, it should be OK to
>>>>>> increase the recursion depth if needed. So add a percpu counter
>>>>>> to eventfd_ctx to limit the recursion depth for the deadlock case.
>>>>>> It is then safe to increase the global percpu counter later.
>>>>> I wonder whether it's worth introducing a percpu counter for each
>>>>> eventfd.
>>>>>
>>>>> How about simply checking whether eventfd_signal_count() is greater
>>>>> than 2?
>>>>>
>>>> It can't avoid the deadlock that way.
>>>
>>> I may be missing something, but the count is there to avoid recursive
>>> eventfd calls. So what VDUSE suffers from is e.g. the interrupt
>>> injection path:
>>>
>>> userspace writes IRQFD -> vq->cb() -> another IRQFD.
>>>
>>> It looks like increasing EVENTFD_WAKE_DEPTH should be sufficient?
>>>
>> Actually I meant the deadlock described in commit f0b493e ("io_uring:
>> prevent potential eventfd recursion on poll"). Just increasing
>> EVENTFD_WAKE_DEPTH would break that bug fix.
>
>
> OK, so can we do something similar to what that commit did (using
> async stuff like a workqueue)?

io_uring should be fine in current kernels, but aio would still be
affected by this. But just in terms of recursion, bumping it by one
more should probably still be fine.

--
Jens Axboe
Yongji Xie Jan. 28, 2021, 3:52 a.m. UTC | #7
On Thu, Jan 28, 2021 at 11:05 AM Jason Wang <jasowang@redhat.com> wrote:
>
> On 2021/1/27 5:11 PM, Yongji Xie wrote:
> > On Wed, Jan 27, 2021 at 11:38 AM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> On 2021/1/20 2:52 PM, Yongji Xie wrote:
> >>> On Wed, Jan 20, 2021 at 12:24 PM Jason Wang <jasowang@redhat.com> wrote:
> >>>> On 2021/1/19 12:59 PM, Xie Yongji wrote:
> >>>>> Now we have a global percpu counter to limit the recursion depth
> >>>>> of eventfd_signal(), which can avoid both deadlock and stack
> >>>>> overflow. But in the stack overflow case, it should be OK to
> >>>>> increase the recursion depth if needed. So add a percpu counter
> >>>>> to eventfd_ctx to limit the recursion depth for the deadlock case.
> >>>>> It is then safe to increase the global percpu counter later.
> >>>> I wonder whether it's worth introducing a percpu counter for each
> >>>> eventfd.
> >>>>
> >>>> How about simply checking whether eventfd_signal_count() is greater
> >>>> than 2?
> >>>>
> >>> It can't avoid the deadlock that way.
> >>
> >> I may be missing something, but the count is there to avoid recursive
> >> eventfd calls. So what VDUSE suffers from is e.g. the interrupt
> >> injection path:
> >>
> >> userspace writes IRQFD -> vq->cb() -> another IRQFD.
> >>
> >> It looks like increasing EVENTFD_WAKE_DEPTH should be sufficient?
> >>
> > Actually I meant the deadlock described in commit f0b493e ("io_uring:
> > prevent potential eventfd recursion on poll"). Just increasing
> > EVENTFD_WAKE_DEPTH would break that bug fix.
>
>
> OK, so can we do something similar to what that commit did (using
> async stuff like a workqueue)?
>

We can do that, but it would reduce performance, because the eventfd
recursion would be triggered every time KVM kicks the eventfd in the
vhost-vdpa case:

KVM writes KICKFD -> ops->kick_vq -> VDUSE writes KICKFD

Thanks,
Yongji
Jason Wang Jan. 28, 2021, 4:31 a.m. UTC | #8
On 2021/1/28 11:52 AM, Yongji Xie wrote:
> On Thu, Jan 28, 2021 at 11:05 AM Jason Wang <jasowang@redhat.com> wrote:
>>
>> On 2021/1/27 5:11 PM, Yongji Xie wrote:
>>> On Wed, Jan 27, 2021 at 11:38 AM Jason Wang <jasowang@redhat.com> wrote:
>>>> On 2021/1/20 2:52 PM, Yongji Xie wrote:
>>>>> On Wed, Jan 20, 2021 at 12:24 PM Jason Wang <jasowang@redhat.com> wrote:
>>>>>> On 2021/1/19 12:59 PM, Xie Yongji wrote:
>>>>>>> Now we have a global percpu counter to limit the recursion depth
>>>>>>> of eventfd_signal(), which can avoid both deadlock and stack
>>>>>>> overflow. But in the stack overflow case, it should be OK to
>>>>>>> increase the recursion depth if needed. So add a percpu counter
>>>>>>> to eventfd_ctx to limit the recursion depth for the deadlock case.
>>>>>>> It is then safe to increase the global percpu counter later.
>>>>>> I wonder whether it's worth introducing a percpu counter for each
>>>>>> eventfd.
>>>>>>
>>>>>> How about simply checking whether eventfd_signal_count() is greater
>>>>>> than 2?
>>>>>>
>>>>> It can't avoid the deadlock that way.
>>>> I may be missing something, but the count is there to avoid recursive
>>>> eventfd calls. So what VDUSE suffers from is e.g. the interrupt
>>>> injection path:
>>>>
>>>> userspace writes IRQFD -> vq->cb() -> another IRQFD.
>>>>
>>>> It looks like increasing EVENTFD_WAKE_DEPTH should be sufficient?
>>>>
>>> Actually I meant the deadlock described in commit f0b493e ("io_uring:
>>> prevent potential eventfd recursion on poll"). Just increasing
>>> EVENTFD_WAKE_DEPTH would break that bug fix.
>>
>> OK, so can we do something similar to what that commit did (using
>> async stuff like a workqueue)?
>>
> We can do that, but it would reduce performance, because the eventfd
> recursion would be triggered every time KVM kicks the eventfd in the
> vhost-vdpa case:
>
> KVM writes KICKFD -> ops->kick_vq -> VDUSE writes KICKFD
>
> Thanks,
> Yongji


Right, I think in the future we need to find a way to let KVM wake up
VDUSE directly.

I haven't thought it through deeply, but it might work like the irq
bypass manager.

Thanks
Yongji Xie Jan. 28, 2021, 5:12 a.m. UTC | #9
On Thu, Jan 28, 2021 at 11:08 AM Jens Axboe <axboe@kernel.dk> wrote:
>
> On 1/27/21 8:04 PM, Jason Wang wrote:
> >
> > On 2021/1/27 5:11 PM, Yongji Xie wrote:
> >> On Wed, Jan 27, 2021 at 11:38 AM Jason Wang <jasowang@redhat.com> wrote:
> >>>
> >>> On 2021/1/20 2:52 PM, Yongji Xie wrote:
> >>>> On Wed, Jan 20, 2021 at 12:24 PM Jason Wang <jasowang@redhat.com> wrote:
> >>>>> On 2021/1/19 12:59 PM, Xie Yongji wrote:
> >>>>>> Now we have a global percpu counter to limit the recursion depth
> >>>>>> of eventfd_signal(), which can avoid both deadlock and stack
> >>>>>> overflow. But in the stack overflow case, it should be OK to
> >>>>>> increase the recursion depth if needed. So add a percpu counter
> >>>>>> to eventfd_ctx to limit the recursion depth for the deadlock case.
> >>>>>> It is then safe to increase the global percpu counter later.
> >>>>> I wonder whether it's worth introducing a percpu counter for each
> >>>>> eventfd.
> >>>>>
> >>>>> How about simply checking whether eventfd_signal_count() is greater
> >>>>> than 2?
> >>>>>
> >>>> It can't avoid the deadlock that way.
> >>>
> >>> I may be missing something, but the count is there to avoid recursive
> >>> eventfd calls. So what VDUSE suffers from is e.g. the interrupt
> >>> injection path:
> >>>
> >>> userspace writes IRQFD -> vq->cb() -> another IRQFD.
> >>>
> >>> It looks like increasing EVENTFD_WAKE_DEPTH should be sufficient?
> >>>
> >> Actually I meant the deadlock described in commit f0b493e ("io_uring:
> >> prevent potential eventfd recursion on poll"). Just increasing
> >> EVENTFD_WAKE_DEPTH would break that bug fix.
> >
> >
> > OK, so can we do something similar to what that commit did (using
> > async stuff like a workqueue)?
>
> io_uring should be fine in current kernels, but aio would still be
> affected by this. But just in terms of recursion, bumping it by one
> more should probably still be fine.
>

OK, I see. It should be easy to avoid the A-A deadlock when coding.

Thanks,
Yongji
Yongji Xie Jan. 28, 2021, 6:08 a.m. UTC | #10
On Thu, Jan 28, 2021 at 12:31 PM Jason Wang <jasowang@redhat.com> wrote:
>
> On 2021/1/28 11:52 AM, Yongji Xie wrote:
> > On Thu, Jan 28, 2021 at 11:05 AM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> On 2021/1/27 5:11 PM, Yongji Xie wrote:
> >>> On Wed, Jan 27, 2021 at 11:38 AM Jason Wang <jasowang@redhat.com> wrote:
> >>>> On 2021/1/20 2:52 PM, Yongji Xie wrote:
> >>>>> On Wed, Jan 20, 2021 at 12:24 PM Jason Wang <jasowang@redhat.com> wrote:
> >>>>>> On 2021/1/19 12:59 PM, Xie Yongji wrote:
> >>>>>>> Now we have a global percpu counter to limit the recursion depth
> >>>>>>> of eventfd_signal(), which can avoid both deadlock and stack
> >>>>>>> overflow. But in the stack overflow case, it should be OK to
> >>>>>>> increase the recursion depth if needed. So add a percpu counter
> >>>>>>> to eventfd_ctx to limit the recursion depth for the deadlock case.
> >>>>>>> It is then safe to increase the global percpu counter later.
> >>>>>> I wonder whether it's worth introducing a percpu counter for each
> >>>>>> eventfd.
> >>>>>>
> >>>>>> How about simply checking whether eventfd_signal_count() is greater
> >>>>>> than 2?
> >>>>>>
> >>>>> It can't avoid the deadlock that way.
> >>>> I may be missing something, but the count is there to avoid recursive
> >>>> eventfd calls. So what VDUSE suffers from is e.g. the interrupt
> >>>> injection path:
> >>>>
> >>>> userspace writes IRQFD -> vq->cb() -> another IRQFD.
> >>>>
> >>>> It looks like increasing EVENTFD_WAKE_DEPTH should be sufficient?
> >>>>
> >>> Actually I meant the deadlock described in commit f0b493e ("io_uring:
> >>> prevent potential eventfd recursion on poll"). Just increasing
> >>> EVENTFD_WAKE_DEPTH would break that bug fix.
> >>
> >> OK, so can we do something similar to what that commit did (using
> >> async stuff like a workqueue)?
> >>
> > We can do that, but it would reduce performance, because the eventfd
> > recursion would be triggered every time KVM kicks the eventfd in the
> > vhost-vdpa case:
> >
> > KVM writes KICKFD -> ops->kick_vq -> VDUSE writes KICKFD
> >
> > Thanks,
> > Yongji
>
>
> Right, I think in the future we need to find a way to let KVM wake up
> VDUSE directly.
>

Yes, this would be better.

Thanks,
Yongji

Patch

diff --git a/fs/aio.c b/fs/aio.c
index 1f32da13d39e..5d82903161f5 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1698,7 +1698,8 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 		list_del(&iocb->ki_list);
 		iocb->ki_res.res = mangle_poll(mask);
 		req->done = true;
-		if (iocb->ki_eventfd && eventfd_signal_count()) {
+		if (iocb->ki_eventfd &&
+			eventfd_signal_count(iocb->ki_eventfd)) {
 			iocb = NULL;
 			INIT_WORK(&req->work, aio_poll_put_work);
 			schedule_work(&req->work);
diff --git a/fs/eventfd.c b/fs/eventfd.c
index e265b6dd4f34..2df24f9bada3 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -25,6 +25,8 @@
 #include <linux/idr.h>
 #include <linux/uio.h>
 
+#define EVENTFD_WAKE_DEPTH 0
+
 DEFINE_PER_CPU(int, eventfd_wake_count);
 
 static DEFINE_IDA(eventfd_ida);
@@ -42,9 +44,17 @@ struct eventfd_ctx {
 	 */
 	__u64 count;
 	unsigned int flags;
+	int __percpu *wake_count;
 	int id;
 };
 
+bool eventfd_signal_count(struct eventfd_ctx *ctx)
+{
+	return (this_cpu_read(*ctx->wake_count) ||
+		this_cpu_read(eventfd_wake_count) > EVENTFD_WAKE_DEPTH);
+}
+EXPORT_SYMBOL_GPL(eventfd_signal_count);
+
 /**
  * eventfd_signal - Adds @n to the eventfd counter.
  * @ctx: [in] Pointer to the eventfd context.
@@ -71,17 +81,19 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
 	 * it returns true, the eventfd_signal() call should be deferred to a
 	 * safe context.
 	 */
-	if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
+	if (WARN_ON_ONCE(eventfd_signal_count(ctx)))
 		return 0;
 
 	spin_lock_irqsave(&ctx->wqh.lock, flags);
 	this_cpu_inc(eventfd_wake_count);
+	this_cpu_inc(*ctx->wake_count);
 	if (ULLONG_MAX - ctx->count < n)
 		n = ULLONG_MAX - ctx->count;
 	ctx->count += n;
 	if (waitqueue_active(&ctx->wqh))
 		wake_up_locked_poll(&ctx->wqh, EPOLLIN);
 	this_cpu_dec(eventfd_wake_count);
+	this_cpu_dec(*ctx->wake_count);
 	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
 
 	return n;
@@ -92,6 +104,7 @@ static void eventfd_free_ctx(struct eventfd_ctx *ctx)
 {
 	if (ctx->id >= 0)
 		ida_simple_remove(&eventfd_ida, ctx->id);
+	free_percpu(ctx->wake_count);
 	kfree(ctx);
 }
 
@@ -423,6 +436,11 @@ static int do_eventfd(unsigned int count, int flags)
 
 	kref_init(&ctx->kref);
 	init_waitqueue_head(&ctx->wqh);
+	ctx->wake_count = alloc_percpu(int);
+	if (!ctx->wake_count) {
+		kfree(ctx);
+		return -ENOMEM;
+	}
 	ctx->count = count;
 	ctx->flags = flags;
 	ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index fa0a524baed0..1a11ebbd74a9 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -45,10 +45,7 @@ void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt);
 
 DECLARE_PER_CPU(int, eventfd_wake_count);
 
-static inline bool eventfd_signal_count(void)
-{
-	return this_cpu_read(eventfd_wake_count);
-}
+bool eventfd_signal_count(struct eventfd_ctx *ctx);
 
 #else /* CONFIG_EVENTFD */