@@ -183,8 +183,9 @@ struct poll_iocb {
struct file *file;
struct wait_queue_head *head;
__poll_t events;
- bool done;
bool cancelled;
+ bool work_scheduled;
+ bool work_need_resched;
struct wait_queue_entry wait;
struct work_struct work;
};
@@ -1645,14 +1646,26 @@ static void aio_poll_complete_work(struct work_struct *work)
* avoid further branches in the fast path.
*/
spin_lock_irq(&ctx->ctx_lock);
+ spin_lock(&req->head->lock);
if (!mask && !READ_ONCE(req->cancelled)) {
- add_wait_queue(req->head, &req->wait);
+ /*
+ * The request isn't actually ready to be completed yet.
+ * Reschedule completion if another wakeup came in.
+ */
+ if (req->work_need_resched) {
+ schedule_work(&req->work);
+ req->work_need_resched = false;
+ } else {
+ req->work_scheduled = false;
+ }
+ spin_unlock(&req->head->lock);
spin_unlock_irq(&ctx->ctx_lock);
return;
}
+ list_del_init(&req->wait.entry);
+ spin_unlock(&req->head->lock);
list_del_init(&iocb->ki_list);
iocb->ki_res.res = mangle_poll(mask);
- req->done = true;
spin_unlock_irq(&ctx->ctx_lock);
iocb_put(iocb);
@@ -1666,9 +1679,9 @@ static int aio_poll_cancel(struct kiocb *iocb)
spin_lock(&req->head->lock);
WRITE_ONCE(req->cancelled, true);
- if (!list_empty(&req->wait.entry)) {
- list_del_init(&req->wait.entry);
+ if (!req->work_scheduled) {
schedule_work(&aiocb->poll.work);
+ req->work_scheduled = true;
}
spin_unlock(&req->head->lock);
@@ -1687,20 +1700,26 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
if (mask && !(mask & req->events))
return 0;
- list_del_init(&req->wait.entry);
-
- if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
+ /*
+ * Complete the request inline if possible. This requires that three
+ * conditions be met:
+ * 1. An event mask must have been passed. If a plain wakeup was done
+ * instead, then mask == 0 and we have to call vfs_poll() to get
+ * the events, so inline completion isn't possible.
+ * 2. The completion work must not have already been scheduled.
+ * 3. ctx_lock must not be busy. We have to use trylock because we
+ * already hold the waitqueue lock, so this inverts the normal
+ * locking order. Use irqsave/irqrestore because not all
+ * filesystems (e.g. fuse) call this function with IRQs disabled,
+ * yet IRQs have to be disabled before ctx_lock is obtained.
+ */
+ if (mask && !req->work_scheduled &&
+ spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
struct kioctx *ctx = iocb->ki_ctx;
- /*
- * Try to complete the iocb inline if we can. Use
- * irqsave/irqrestore because not all filesystems (e.g. fuse)
- * call this function with IRQs disabled and because IRQs
- * have to be disabled before ctx_lock is obtained.
- */
+ list_del_init(&req->wait.entry);
list_del(&iocb->ki_list);
iocb->ki_res.res = mangle_poll(mask);
- req->done = true;
if (iocb->ki_eventfd && eventfd_signal_count()) {
iocb = NULL;
INIT_WORK(&req->work, aio_poll_put_work);
@@ -1710,7 +1729,20 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
if (iocb)
iocb_put(iocb);
} else {
- schedule_work(&req->work);
+ /*
+ * Schedule the completion work if needed. If it was already
+ * scheduled, record that another wakeup came in.
+ *
+ * Don't remove the request from the waitqueue here, as it might
+ * not actually be complete yet (we won't know until vfs_poll()
+ * is called), and we must not miss any wakeups.
+ */
+ if (req->work_scheduled) {
+ req->work_need_resched = true;
+ } else {
+ schedule_work(&req->work);
+ req->work_scheduled = true;
+ }
}
return 1;
}
@@ -1757,8 +1789,9 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
req->head = NULL;
- req->done = false;
req->cancelled = false;
+ req->work_scheduled = false;
+ req->work_need_resched = false;
apt.pt._qproc = aio_poll_queue_proc;
apt.pt._key = req->events;
@@ -1773,17 +1806,27 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
spin_lock_irq(&ctx->ctx_lock);
if (likely(req->head)) {
spin_lock(&req->head->lock);
- if (unlikely(list_empty(&req->wait.entry))) {
- if (apt.error)
+ if (list_empty(&req->wait.entry) || req->work_scheduled) {
+ /*
+ * aio_poll_wake() already either scheduled the async
+ * completion work, or completed the request inline.
+ */
+ if (apt.error) /* unsupported case: multiple queues */
cancel = true;
apt.error = 0;
mask = 0;
}
if (mask || apt.error) {
+ /* Steal to complete synchronously. */
list_del_init(&req->wait.entry);
} else if (cancel) {
+ /* Cancel if possible (may be too late though). */
WRITE_ONCE(req->cancelled, true);
- } else if (!req->done) { /* actually waiting for an event */
+ } else if (!list_empty(&req->wait.entry)) {
+ /*
+ * Actually waiting for an event, so add the request to
+ * active_reqs so that it can be cancelled if needed.
+ */
list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
aiocb->ki_cancel = aio_poll_cancel;
}