diff mbox series

[V2,11/13] block: move blk_exit_queue into disk_release

Message ID 20220122111054.1126146-12-ming.lei@redhat.com
State New
Headers show
Series block: don't drain file system I/O on del_gendisk | expand

Commit Message

Ming Lei Jan. 22, 2022, 11:10 a.m. UTC
There can't be FS IO in disk_release(), so move blk_exit_queue() there.

We still need to freeze the queue here, since requests are freed after
bios are ended, and passthrough requests rely on scheduler tags too.
However, either q->q_usage_counter is already in atomic mode (such as
for scsi), or the queue has been drained already (as for most other
drivers), so the added queue freeze is pretty fast and no RCU grace
period is supposed to be involved.

The disk can be released before or after the queue is cleaned up, and we
have to free the scheduler request pool before blk_cleanup_queue() returns;
meanwhile, the request pool has to be freed before exiting the scheduler.
So move the scheduler pool freeing into both functions.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq.c    |  3 +++
 block/blk-sysfs.c | 16 ----------------
 block/genhd.c     | 39 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 41 insertions(+), 17 deletions(-)

Comments

Christoph Hellwig Jan. 24, 2022, 1:22 p.m. UTC | #1
On Sat, Jan 22, 2022 at 07:10:52PM +0800, Ming Lei wrote:
>  3 files changed, 41 insertions(+), 17 deletions(-)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index d51b0aa2e4e4..72ae9955cf27 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -3101,6 +3101,9 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
>  	struct blk_mq_tags *drv_tags;
>  	struct page *page;
>  
> +	if (list_empty(&tags->page_list))
> +		return;

Please split this out and document why it is needed.

> +/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */
> +static void blk_exit_queue(struct request_queue *q)
> +{
> +	/*
> +	 * Since the I/O scheduler exit code may access cgroup information,
> +	 * perform I/O scheduler exit before disassociating from the block
> +	 * cgroup controller.
> +	 */
> +	if (q->elevator) {
> +		ioc_clear_queue(q);
> +
> +		mutex_lock(&q->sysfs_lock);
> +		blk_mq_sched_free_rqs(q);
> +		elevator_exit(q);
> +		mutex_unlock(&q->sysfs_lock);
> +	}
> +}

Given that this all deals with the I/O scheduler the naming seems
wrong.  It almost seems like we'd want to move ioc_clear_queue and
blk_mq_sched_free_rqs into elevator_exit to have somewhat sensible
helpers.  That would add extra locking of q->sysfs_lock for
ioc_clear_queue, but given that elevator_switch_mq already does
that, this seems harmless.
Ming Lei Jan. 24, 2022, 11:38 p.m. UTC | #2
On Mon, Jan 24, 2022 at 02:22:21PM +0100, Christoph Hellwig wrote:
> On Sat, Jan 22, 2022 at 07:10:52PM +0800, Ming Lei wrote:
> >  3 files changed, 41 insertions(+), 17 deletions(-)
> > 
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index d51b0aa2e4e4..72ae9955cf27 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -3101,6 +3101,9 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
> >  	struct blk_mq_tags *drv_tags;
> >  	struct page *page;
> >  
> > +	if (list_empty(&tags->page_list))
> > +		return;
> 
> Please split this out and document why it is needed.

Now blk_mq_sched_free_rqs() can be called from either blk_cleanup_queue()
or disk_release(). If the sched rq pool is freed in one of them, it must
not be freed again from the other one.

> 
> > +/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */
> > +static void blk_exit_queue(struct request_queue *q)
> > +{
> > +	/*
> > +	 * Since the I/O scheduler exit code may access cgroup information,
> > +	 * perform I/O scheduler exit before disassociating from the block
> > +	 * cgroup controller.
> > +	 */
> > +	if (q->elevator) {
> > +		ioc_clear_queue(q);
> > +
> > +		mutex_lock(&q->sysfs_lock);
> > +		blk_mq_sched_free_rqs(q);
> > +		elevator_exit(q);
> > +		mutex_unlock(&q->sysfs_lock);
> > +	}
> > +}
> 
> Given that this all deals with the I/O scheduler the naming seems
> wrong.  It almost seems like we'd want to move ioc_clear_queue and
> blk_mq_sched_free_rqs into elevator_exit to have somewhat sensible
> helpers.

Looks good idea.



thanks, 
Ming
diff mbox series

Patch

diff --git a/block/blk-mq.c b/block/blk-mq.c
index d51b0aa2e4e4..72ae9955cf27 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3101,6 +3101,9 @@  void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 	struct blk_mq_tags *drv_tags;
 	struct page *page;
 
+	if (list_empty(&tags->page_list))
+		return;
+
 	if (blk_mq_is_shared_tags(set->flags))
 		drv_tags = set->shared_tags;
 	else
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 5f14fd333182..e0f29b56e8e2 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -739,20 +739,6 @@  static void blk_free_queue_rcu(struct rcu_head *rcu_head)
 	kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q);
 }
 
-/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */
-static void blk_exit_queue(struct request_queue *q)
-{
-	/*
-	 * Since the I/O scheduler exit code may access cgroup information,
-	 * perform I/O scheduler exit before disassociating from the block
-	 * cgroup controller.
-	 */
-	if (q->elevator) {
-		ioc_clear_queue(q);
-		elevator_exit(q);
-	}
-}
-
 /**
  * blk_release_queue - releases all allocated resources of the request_queue
  * @kobj: pointer to a kobject, whose container is a request_queue
@@ -786,8 +772,6 @@  static void blk_release_queue(struct kobject *kobj)
 		blk_stat_remove_callback(q, q->poll_cb);
 	blk_stat_free_callback(q->poll_cb);
 
-	blk_exit_queue(q);
-
 	blk_free_queue_stats(q->stats);
 	kfree(q->poll_stat);
 
diff --git a/block/genhd.c b/block/genhd.c
index a86027619683..f1aef5d13afa 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1085,11 +1085,48 @@  static const struct attribute_group *disk_attr_groups[] = {
 	NULL
 };
 
+/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */
+static void blk_exit_queue(struct request_queue *q)
+{
+	/*
+	 * Since the I/O scheduler exit code may access cgroup information,
+	 * perform I/O scheduler exit before disassociating from the block
+	 * cgroup controller.
+	 */
+	if (q->elevator) {
+		ioc_clear_queue(q);
+
+		mutex_lock(&q->sysfs_lock);
+		blk_mq_sched_free_rqs(q);
+		elevator_exit(q);
+		mutex_unlock(&q->sysfs_lock);
+	}
+}
+
 static void disk_release_queue(struct gendisk *disk)
 {
 	struct request_queue *q = disk->queue;
 
-	blk_mq_cancel_work_sync(q);
+	if (queue_is_mq(q)) {
+		blk_mq_cancel_work_sync(q);
+
+		/*
+		 * All FS bios have been done, however FS request may not
+		 * be freed yet since we end bio before freeing request,
+		 * meantime passthrough request replies on scheduler tags,
+		 * so queue needs to be frozen here.
+		 *
+		 * Most of drivers release disk after blk_cleanup_queue()
+		 * returns, and SCSI may release disk before calling
+		 * blk_cleanup_queue, but request queue has been in atomic
+		 * mode already, see scsi_disk_release(), so the following
+		 * queue freeze is pretty fast, and RCU grace period isn't
+		 * supposed to be involved.
+		 */
+		blk_mq_freeze_queue(q);
+		blk_exit_queue(q);
+		__blk_mq_unfreeze_queue(q, true);
+	}
 
 	/*
 	 * Remove all references to @q from the block cgroup controller before