
[v2,10/11] blk-mq: Use shared tags for shared sbitmap support

Message ID 1628519378-211232-11-git-send-email-john.garry@huawei.com
State New
Series blk-mq: Reduce static requests memory footprint for shared sbitmap

Commit Message

John Garry Aug. 9, 2021, 2:29 p.m. UTC
Currently we use separate sbitmap pairs and active_queues atomic_t for
shared sbitmap support.

However, a full set of static requests is allocated per HW queue, which is
quite wasteful, considering that the total number of requests usable at
any given time across all HW queues is limited by the shared sbitmap depth.

As such, it is considerably more memory efficient in the case of shared
sbitmap to allocate a set of static rqs per tag set or request queue, and
not per HW queue.

So replace the sbitmap pairs and active_queues atomic_t with shared
tags per tagset and request queue.

Continue to use the term "shared sbitmap" for now, as its meaning is known.
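
(Rough illustration with made-up numbers, not figures from this series: for a
host with 64 HW queues sharing a tag depth of 1024, and static requests of
roughly 1 KB each once driver payload is included,

  per-HW-queue rqs:  64 x 1024 x ~1 KB ~= 64 MB
  shared rqs:         1 x 1024 x ~1 KB ~=  1 MB

since at most the shared depth's worth of requests can ever be in flight, the
difference is memory that could never actually be used.)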

Signed-off-by: John Garry <john.garry@huawei.com>

---
 block/blk-mq-sched.c   | 77 ++++++++++++++++++++-----------------
 block/blk-mq-tag.c     | 65 ++++++++++++-------------------
 block/blk-mq-tag.h     |  4 +-
 block/blk-mq.c         | 86 +++++++++++++++++++++++++-----------------
 block/blk-mq.h         |  8 ++--
 include/linux/blk-mq.h | 13 +++----
 include/linux/blkdev.h |  3 +-
 7 files changed, 131 insertions(+), 125 deletions(-)

-- 
2.26.2

Comments

Ming Lei Aug. 18, 2021, 8:16 a.m. UTC | #1
On Mon, Aug 09, 2021 at 10:29:37PM +0800, John Garry wrote:
> Currently we use separate sbitmap pairs and active_queues atomic_t for

> shared sbitmap support.

> 

> However a full set of static requests are used per HW queue, which is

> quite wasteful, considering that the total number of requests usable at

> any given time across all HW queues is limited by the shared sbitmap depth.

> 

> As such, it is considerably more memory efficient in the case of shared

> sbitmap to allocate a set of static rqs per tag set or request queue, and

> not per HW queue.

> 

> So replace the sbitmap pairs and active_queues atomic_t with a shared

> tags per tagset and request queue.


This idea looks good and the current implementation is simplified a bit
too.

> 

> Continue to use term "shared sbitmap" for now, as the meaning is known.


I guess shared tags is better.

> 

> Signed-off-by: John Garry <john.garry@huawei.com>

> ---

>  block/blk-mq-sched.c   | 77 ++++++++++++++++++++-----------------

>  block/blk-mq-tag.c     | 65 ++++++++++++-------------------

>  block/blk-mq-tag.h     |  4 +-

>  block/blk-mq.c         | 86 +++++++++++++++++++++++++-----------------

>  block/blk-mq.h         |  8 ++--

>  include/linux/blk-mq.h | 13 +++----

>  include/linux/blkdev.h |  3 +-

>  7 files changed, 131 insertions(+), 125 deletions(-)

> 

> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c

> index ac0408ebcd47..1101a2e4da9a 100644

> --- a/block/blk-mq-sched.c

> +++ b/block/blk-mq-sched.c

> @@ -522,14 +522,19 @@ static int blk_mq_sched_alloc_map_and_request(struct request_queue *q,

>  	struct blk_mq_tag_set *set = q->tag_set;

>  	int ret;

>  

> +	if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) {

> +		hctx->sched_tags = q->shared_sbitmap_tags;

> +		return 0;

> +	}

> +

>  	hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,

> -					       set->reserved_tags, set->flags);

> +					       set->reserved_tags);

>  	if (!hctx->sched_tags)

>  		return -ENOMEM;

>  

>  	ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);

>  	if (ret) {

> -		blk_mq_free_rq_map(hctx->sched_tags, set->flags);

> +		blk_mq_free_rq_map(hctx->sched_tags);

>  		hctx->sched_tags = NULL;

>  	}

>  

> @@ -544,35 +549,39 @@ static void blk_mq_sched_tags_teardown(struct request_queue *q)

>  

>  	queue_for_each_hw_ctx(q, hctx, i) {

>  		if (hctx->sched_tags) {

> -			blk_mq_free_rq_map(hctx->sched_tags, hctx->flags);

> +			if (!blk_mq_is_sbitmap_shared(q->tag_set->flags))

> +				blk_mq_free_rq_map(hctx->sched_tags);

>  			hctx->sched_tags = NULL;

>  		}

>  	}

>  }

>  

> +static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue)

> +{

> +	blk_mq_free_rq_map(queue->shared_sbitmap_tags);

> +	queue->shared_sbitmap_tags = NULL;

> +}

> +

>  static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue)

>  {

>  	struct blk_mq_tag_set *set = queue->tag_set;

> -	int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);

> -	struct blk_mq_hw_ctx *hctx;

> -	int ret, i;

> +	struct blk_mq_tags *tags;

> +	int ret;

>  

>  	/*

>  	 * Set initial depth at max so that we don't need to reallocate for

>  	 * updating nr_requests.

>  	 */

> -	ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags,

> -				  &queue->sched_breserved_tags,

> -				  MAX_SCHED_RQ, set->reserved_tags,

> -				  set->numa_node, alloc_policy);

> -	if (ret)

> -		return ret;

> +	tags = queue->shared_sbitmap_tags = blk_mq_alloc_rq_map(set, 0,

> +					  set->queue_depth,

> +					  set->reserved_tags);

> +	if (!queue->shared_sbitmap_tags)

> +		return -ENOMEM;

>  

> -	queue_for_each_hw_ctx(queue, hctx, i) {

> -		hctx->sched_tags->bitmap_tags =

> -					&queue->sched_bitmap_tags;

> -		hctx->sched_tags->breserved_tags =

> -					&queue->sched_breserved_tags;

> +	ret = blk_mq_alloc_rqs(set, tags, 0, set->queue_depth);

> +	if (ret) {

> +		blk_mq_exit_sched_shared_sbitmap(queue);

> +		return ret;


There are two such patterns that allocate the rq map and request pool
together; please put them into one helper (such as blk_mq_alloc_map_and_rqs)
which can return the allocated tags and handle failure inline. Also, we may
convert the current users over to this helper.
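
(For illustration only, not part of Ming's mail: a minimal sketch of the kind
of helper being suggested, built purely from the blk_mq_alloc_rq_map(),
blk_mq_alloc_rqs() and blk_mq_free_rq_map() calls already used in this patch.
The name blk_mq_alloc_map_and_rqs and the exact signature are assumptions.)

static struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
						    unsigned int hctx_idx,
						    unsigned int depth)
{
	struct blk_mq_tags *tags;

	/* allocate the tag map (tags, rqs and static_rqs arrays) */
	tags = blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags);
	if (!tags)
		return NULL;

	/* allocate the static requests; unwind the map on failure */
	if (blk_mq_alloc_rqs(set, tags, hctx_idx, depth)) {
		blk_mq_free_rq_map(tags);
		return NULL;
	}

	return tags;
}

Callers would then only need to check for a NULL return instead of
open-coding the two-step allocate/unwind sequence.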

>  	}

>  

>  	blk_mq_tag_update_sched_shared_sbitmap(queue);

> @@ -580,12 +589,6 @@ static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue)

>  	return 0;

>  }

>  

> -static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue)

> -{

> -	sbitmap_queue_free(&queue->sched_bitmap_tags);

> -	sbitmap_queue_free(&queue->sched_breserved_tags);

> -}

> -

>  int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)

>  {

>  	struct blk_mq_hw_ctx *hctx;

> @@ -607,21 +610,21 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)

>  	q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,

>  				   BLKDEV_DEFAULT_RQ);

>  

> -	queue_for_each_hw_ctx(q, hctx, i) {

> -		ret = blk_mq_sched_alloc_map_and_request(q, hctx, i);

> +	if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) {

> +		ret = blk_mq_init_sched_shared_sbitmap(q);

>  		if (ret)

> -			goto err_free_map_and_request;

> +			return ret;

>  	}

>  

> -	if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) {

> -		ret = blk_mq_init_sched_shared_sbitmap(q);

> +	queue_for_each_hw_ctx(q, hctx, i) {

> +		ret = blk_mq_sched_alloc_map_and_request(q, hctx, i);

>  		if (ret)

>  			goto err_free_map_and_request;

>  	}

>  

>  	ret = e->ops.init_sched(q, e);

>  	if (ret)

> -		goto err_free_sbitmap;

> +		goto err_free_map_and_request;

>  

>  	blk_mq_debugfs_register_sched(q);

>  

> @@ -641,12 +644,12 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)

>  

>  	return 0;

>  

> -err_free_sbitmap:

> -	if (blk_mq_is_sbitmap_shared(q->tag_set->flags))

> -		blk_mq_exit_sched_shared_sbitmap(q);

>  err_free_map_and_request:

>  	blk_mq_sched_free_requests(q);

>  	blk_mq_sched_tags_teardown(q);

> +	if (blk_mq_is_sbitmap_shared(q->tag_set->flags))

> +		blk_mq_exit_sched_shared_sbitmap(q);

> +

>  	q->elevator = NULL;

>  	return ret;

>  }

> @@ -660,9 +663,13 @@ void blk_mq_sched_free_requests(struct request_queue *q)

>  	struct blk_mq_hw_ctx *hctx;

>  	int i;

>  

> -	queue_for_each_hw_ctx(q, hctx, i) {

> -		if (hctx->sched_tags)

> -			blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);

> +	if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) {

> +		blk_mq_free_rqs(q->tag_set, q->shared_sbitmap_tags, 0);

> +	} else {

> +		queue_for_each_hw_ctx(q, hctx, i) {

> +			if (hctx->sched_tags)

> +				blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);

> +		}

>  	}

>  }

>  

> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c

> index 5f06ad6efc8f..e97bbf477b96 100644

> --- a/block/blk-mq-tag.c

> +++ b/block/blk-mq-tag.c

> @@ -27,10 +27,11 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)

>  	if (blk_mq_is_sbitmap_shared(hctx->flags)) {

>  		struct request_queue *q = hctx->queue;

>  		struct blk_mq_tag_set *set = q->tag_set;

> +		struct blk_mq_tags *tags = set->shared_sbitmap_tags;

>  

>  		if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) &&

>  		    !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))

> -			atomic_inc(&set->active_queues_shared_sbitmap);

> +			atomic_inc(&tags->active_queues);

>  	} else {

>  		if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&

>  		    !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))

> @@ -61,10 +62,12 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)

>  	struct blk_mq_tag_set *set = q->tag_set;

>  

>  	if (blk_mq_is_sbitmap_shared(hctx->flags)) {

> +		struct blk_mq_tags *tags = set->shared_sbitmap_tags;

> +

>  		if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE,

>  					&q->queue_flags))

>  			return;

> -		atomic_dec(&set->active_queues_shared_sbitmap);

> +		atomic_dec(&tags->active_queues);

>  	} else {

>  		if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))

>  			return;

> @@ -510,38 +513,16 @@ static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,

>  	return 0;

>  }

>  

> -int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set)

> -{

> -	int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);

> -	int i, ret;

> -

> -	ret = blk_mq_init_bitmaps(&set->__bitmap_tags, &set->__breserved_tags,

> -				  set->queue_depth, set->reserved_tags,

> -				  set->numa_node, alloc_policy);

> -	if (ret)

> -		return ret;

> -

> -	for (i = 0; i < set->nr_hw_queues; i++) {

> -		struct blk_mq_tags *tags = set->tags[i];

> -

> -		tags->bitmap_tags = &set->__bitmap_tags;

> -		tags->breserved_tags = &set->__breserved_tags;

> -	}

> -

> -	return 0;

> -}

> -

>  void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set)

>  {

> -	sbitmap_queue_free(&set->__bitmap_tags);

> -	sbitmap_queue_free(&set->__breserved_tags);

> +	blk_mq_free_rq_map(set->shared_sbitmap_tags);

> +	set->shared_sbitmap_tags = NULL;

>  }

>  

>  struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,

>  				     unsigned int reserved_tags,

> -				     int node, unsigned int flags)

> +				     int node, int alloc_policy)

>  {

> -	int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(flags);

>  	struct blk_mq_tags *tags;

>  

>  	if (total_tags > BLK_MQ_TAG_MAX) {

> @@ -557,9 +538,6 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,

>  	tags->nr_reserved_tags = reserved_tags;

>  	spin_lock_init(&tags->lock);

>  

> -	if (blk_mq_is_sbitmap_shared(flags))

> -		return tags;

> -

>  	if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 0) {

>  		kfree(tags);

>  		return NULL;

> @@ -567,12 +545,10 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,

>  	return tags;

>  }

>  

> -void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags)

> +void blk_mq_free_tags(struct blk_mq_tags *tags)

>  {

> -	if (!blk_mq_is_sbitmap_shared(flags)) {

> -		sbitmap_queue_free(tags->bitmap_tags);

> -		sbitmap_queue_free(tags->breserved_tags);

> -	}

> +	sbitmap_queue_free(tags->bitmap_tags);

> +	sbitmap_queue_free(tags->breserved_tags);

>  	kfree(tags);

>  }

>  

> @@ -604,18 +580,25 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,

>  		if (tdepth > MAX_SCHED_RQ)

>  			return -EINVAL;

>  

> +		if (blk_mq_is_sbitmap_shared(set->flags)) {

> +			/* No point in allowing this to happen */

> +			if (tdepth > set->queue_depth)

> +				return -EINVAL;

> +			return 0;

> +		}


The above looks wrong: it isn't unusual to see hardware with a small queue
depth, while we often have a scheduler queue depth of 2 * set->queue_depth.
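
(A worked example of the concern, with illustrative numbers rather than
anything from the series, and assuming BLKDEV_DEFAULT_RQ is at least as large
as the device depth:)

	/* small hardware queue depth */
	set->queue_depth = 32;

	/* default scheduler depth set in blk_mq_init_sched() in this patch */
	q->nr_requests = 2 * min_t(unsigned int, set->queue_depth,
				   BLKDEV_DEFAULT_RQ);		/* = 64 */

With the new check, any attempt to set a scheduler depth above 32 through
sysfs nr_requests would now fail with -EINVAL, even though
2 * set->queue_depth is the default this series itself picks.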

> +

>  		new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,

> -				tags->nr_reserved_tags, set->flags);

> +				tags->nr_reserved_tags);

>  		if (!new)

>  			return -ENOMEM;

>  		ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);

>  		if (ret) {

> -			blk_mq_free_rq_map(new, set->flags);

> +			blk_mq_free_rq_map(new);

>  			return -ENOMEM;

>  		}

>  

>  		blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);

> -		blk_mq_free_rq_map(*tagsptr, set->flags);

> +		blk_mq_free_rq_map(*tagsptr);

>  		*tagsptr = new;

>  	} else {

>  		/*

> @@ -631,12 +614,14 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,

>  

>  void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int size)

>  {

> -	sbitmap_queue_resize(&set->__bitmap_tags, size - set->reserved_tags);

> +	struct blk_mq_tags *tags = set->shared_sbitmap_tags;

> +

> +	sbitmap_queue_resize(&tags->__bitmap_tags, size - set->reserved_tags);

>  }

>  

>  void blk_mq_tag_update_sched_shared_sbitmap(struct request_queue *q)

>  {

> -	sbitmap_queue_resize(&q->sched_bitmap_tags,

> +	sbitmap_queue_resize(q->shared_sbitmap_tags->bitmap_tags,

>  			     q->nr_requests - q->tag_set->reserved_tags);

>  }

>  

> diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h

> index 88f3c6485543..c9fc52ee07c4 100644

> --- a/block/blk-mq-tag.h

> +++ b/block/blk-mq-tag.h

> @@ -30,8 +30,8 @@ struct blk_mq_tags {

>  

>  extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,

>  					unsigned int reserved_tags,

> -					int node, unsigned int flags);

> -extern void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags);

> +					int node, int alloc_policy);

> +extern void blk_mq_free_tags(struct blk_mq_tags *tags);

>  extern int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,

>  			       struct sbitmap_queue *breserved_tags,

>  			       unsigned int queue_depth,

> diff --git a/block/blk-mq.c b/block/blk-mq.c

> index 4d6723cfa582..d3dd5fab3426 100644

> --- a/block/blk-mq.c

> +++ b/block/blk-mq.c

> @@ -2348,6 +2348,9 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,

>  	struct blk_mq_tags *drv_tags;

>  	struct page *page;

>  

> +	if (blk_mq_is_sbitmap_shared(set->flags))

> +		drv_tags = set->shared_sbitmap_tags;

> +	else

>  		drv_tags = set->tags[hctx_idx];


Here I guess you need to avoid calling ->exit_request() twice?

>  

>  	if (tags->static_rqs && set->ops->exit_request) {

> @@ -2377,21 +2380,20 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,

>  	}

>  }

>  

> -void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags)

> +void blk_mq_free_rq_map(struct blk_mq_tags *tags)

>  {

>  	kfree(tags->rqs);

>  	tags->rqs = NULL;

>  	kfree(tags->static_rqs);

>  	tags->static_rqs = NULL;

>  

> -	blk_mq_free_tags(tags, flags);

> +	blk_mq_free_tags(tags);

>  }

>  

>  struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,

>  					unsigned int hctx_idx,

>  					unsigned int nr_tags,

> -					unsigned int reserved_tags,

> -					unsigned int flags)

> +					unsigned int reserved_tags)

>  {

>  	struct blk_mq_tags *tags;

>  	int node;

> @@ -2400,7 +2402,8 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,

>  	if (node == NUMA_NO_NODE)

>  		node = set->numa_node;

>  

> -	tags = blk_mq_init_tags(nr_tags, reserved_tags, node, flags);

> +	tags = blk_mq_init_tags(nr_tags, reserved_tags, node,

> +				BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));

>  	if (!tags)

>  		return NULL;

>  

> @@ -2408,7 +2411,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,

>  				 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,

>  				 node);

>  	if (!tags->rqs) {

> -		blk_mq_free_tags(tags, flags);

> +		blk_mq_free_tags(tags);

>  		return NULL;

>  	}

>  

> @@ -2417,7 +2420,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,

>  					node);

>  	if (!tags->static_rqs) {

>  		kfree(tags->rqs);

> -		blk_mq_free_tags(tags, flags);

> +		blk_mq_free_tags(tags);

>  		return NULL;

>  	}

>  

> @@ -2859,8 +2862,14 @@ static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,

>  	unsigned int flags = set->flags;

>  	int ret = 0;

>  

> +	if (blk_mq_is_sbitmap_shared(flags)) {

> +		set->tags[hctx_idx] = set->shared_sbitmap_tags;

> +

> +		return true;

> +	}

> +

>  	set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,

> -					set->queue_depth, set->reserved_tags, flags);

> +					set->queue_depth, set->reserved_tags);

>  	if (!set->tags[hctx_idx])

>  		return false;

>  

> @@ -2869,7 +2878,7 @@ static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,

>  	if (!ret)

>  		return true;

>  

> -	blk_mq_free_rq_map(set->tags[hctx_idx], flags);

> +	blk_mq_free_rq_map(set->tags[hctx_idx]);

>  	set->tags[hctx_idx] = NULL;

>  	return false;

>  }

> @@ -2877,11 +2886,11 @@ static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,

>  static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,

>  					 unsigned int hctx_idx)

>  {

> -	unsigned int flags = set->flags;

> -

>  	if (set->tags && set->tags[hctx_idx]) {

> -		blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);

> -		blk_mq_free_rq_map(set->tags[hctx_idx], flags);

> +		if (!blk_mq_is_sbitmap_shared(set->flags)) {


I remember you hate negative checks, :-)

> +			blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);

> +			blk_mq_free_rq_map(set->tags[hctx_idx]);


We can add a helper such as blk_mq_free_map_and_rqs(); there seem to be
several such patterns.
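
(Sketch only, not from the mail: the matching free-side helper Ming mentions
could look roughly like this, reusing the blk_mq_free_rqs() and
blk_mq_free_rq_map() calls from this patch; the name and signature are
assumptions.)

static void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
				    struct blk_mq_tags *tags,
				    unsigned int hctx_idx)
{
	if (!tags)
		return;

	/* free the static requests first, then the tag map itself */
	blk_mq_free_rqs(set, tags, hctx_idx);
	blk_mq_free_rq_map(tags);
}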

> +		}

>  		set->tags[hctx_idx] = NULL;

>  	}

>  }

> @@ -3348,6 +3357,21 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)

>  {

>  	int i;

>  

> +	if (blk_mq_is_sbitmap_shared(set->flags)) {

> +		int ret;

> +

> +		set->shared_sbitmap_tags = blk_mq_alloc_rq_map(set, 0,

> +						  set->queue_depth,

> +						  set->reserved_tags);

> +		if (!set->shared_sbitmap_tags)

> +			return -ENOMEM;

> +

> +		ret = blk_mq_alloc_rqs(set, set->shared_sbitmap_tags, 0,

> +				       set->queue_depth);

> +		if (ret)

> +			goto out_free_sbitmap_tags;

> +	}

> +

>  	for (i = 0; i < set->nr_hw_queues; i++) {

>  		if (!__blk_mq_alloc_map_and_request(set, i))

>  			goto out_unwind;

> @@ -3359,6 +3383,11 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)

>  out_unwind:

>  	while (--i >= 0)

>  		blk_mq_free_map_and_requests(set, i);

> +	if (blk_mq_is_sbitmap_shared(set->flags))

> +		blk_mq_free_rqs(set, set->shared_sbitmap_tags, 0);

> +out_free_sbitmap_tags:

> +	if (blk_mq_is_sbitmap_shared(set->flags))

> +		blk_mq_exit_shared_sbitmap(set);


Once a helper such as blk_mq_alloc_map_and_rqs() is added, the above failure
handling can be simplified too.
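
(Illustration of the point, assuming the hypothetical
blk_mq_alloc_map_and_rqs() sketched above existed: the shared-sbitmap branch
of __blk_mq_alloc_rq_maps() could then shrink to something like the fragment
below, with no separate out_free_sbitmap_tags label needed.)

	if (blk_mq_is_sbitmap_shared(set->flags)) {
		set->shared_sbitmap_tags = blk_mq_alloc_map_and_rqs(set, 0,
							set->queue_depth);
		if (!set->shared_sbitmap_tags)
			return -ENOMEM;
	}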


-- 
Ming
John Garry Aug. 18, 2021, 2:18 p.m. UTC | #2
On 18/08/2021 09:16, Ming Lei wrote:
> On Mon, Aug 09, 2021 at 10:29:37PM +0800, John Garry wrote:

>> Currently we use separate sbitmap pairs and active_queues atomic_t for

>> shared sbitmap support.

>>

>> However a full set of static requests are used per HW queue, which is

>> quite wasteful, considering that the total number of requests usable at

>> any given time across all HW queues is limited by the shared sbitmap depth.

>>

>> As such, it is considerably more memory efficient in the case of shared

>> sbitmap to allocate a set of static rqs per tag set or request queue, and

>> not per HW queue.

>>

>> So replace the sbitmap pairs and active_queues atomic_t with a shared

>> tags per tagset and request queue.

> 

> This idea looks good and the current implementation is simplified a bit

> too.


Good, but you did hint at it :)

> 

>>

>> Continue to use term "shared sbitmap" for now, as the meaning is known.

> 

> I guess shared tags is better.


Yeah, agreed. My preference would be to change later, once things settle 
down.

As I see it, the only thing close to an ABI is the debugfs "flags" code, 
but that's debugfs, so not a stable interface.

> 

>>

>> Signed-off-by: John Garry <john.garry@huawei.com>

>> ---

>>   block/blk-mq-sched.c   | 77 ++++++++++++++++++++-----------------

>>   block/blk-mq-tag.c     | 65 ++++++++++++-------------------

>>   block/blk-mq-tag.h     |  4 +-

>>   block/blk-mq.c         | 86 +++++++++++++++++++++++++-----------------

>>   block/blk-mq.h         |  8 ++--

>>   include/linux/blk-mq.h | 13 +++----

>>   include/linux/blkdev.h |  3 +-

>>   7 files changed, 131 insertions(+), 125 deletions(-)

>>

>> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c


...

>> +

>>   static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue)

>>   {

>>   	struct blk_mq_tag_set *set = queue->tag_set;

>> -	int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);

>> -	struct blk_mq_hw_ctx *hctx;

>> -	int ret, i;

>> +	struct blk_mq_tags *tags;

>> +	int ret;

>>   

>>   	/*

>>   	 * Set initial depth at max so that we don't need to reallocate for

>>   	 * updating nr_requests.

>>   	 */

>> -	ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags,

>> -				  &queue->sched_breserved_tags,

>> -				  MAX_SCHED_RQ, set->reserved_tags,

>> -				  set->numa_node, alloc_policy);

>> -	if (ret)

>> -		return ret;

>> +	tags = queue->shared_sbitmap_tags = blk_mq_alloc_rq_map(set, 0,

>> +					  set->queue_depth,

>> +					  set->reserved_tags);

>> +	if (!queue->shared_sbitmap_tags)

>> +		return -ENOMEM;

>>   

>> -	queue_for_each_hw_ctx(queue, hctx, i) {

>> -		hctx->sched_tags->bitmap_tags =

>> -					&queue->sched_bitmap_tags;

>> -		hctx->sched_tags->breserved_tags =

>> -					&queue->sched_breserved_tags;

>> +	ret = blk_mq_alloc_rqs(set, tags, 0, set->queue_depth);

>> +	if (ret) {

>> +		blk_mq_exit_sched_shared_sbitmap(queue);

>> +		return ret;

> 

> There are two such patterns for allocate rq map and request pool

> together, please put them into one helper(such as blk_mq_alloc_map_and_rqs)

> which can return the allocated tags and handle failure inline. Also we may

> convert current users into this helper.


I'll have a look, but I will mention the "free" helper below.

> 

>>   	}

>>   

>>   	blk_mq_tag_update_sched_shared_sbitmap(queue);

>> @@ -580,12 +589,6 @@ static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue)

>>   	return 0;

>>   }

>>   

>> -static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue)

>> -{

>> -	sbitmap_queue_free(&queue->sched_bitmap_tags);

>> -	sbitmap_queue_free(&queue->sched_breserved_tags);

>> -}

>> -


...

>>   

>> -void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags)

>> +void blk_mq_free_tags(struct blk_mq_tags *tags)

>>   {

>> -	if (!blk_mq_is_sbitmap_shared(flags)) {

>> -		sbitmap_queue_free(tags->bitmap_tags);

>> -		sbitmap_queue_free(tags->breserved_tags);

>> -	}

>> +	sbitmap_queue_free(tags->bitmap_tags);

>> +	sbitmap_queue_free(tags->breserved_tags);

>>   	kfree(tags);

>>   }

>>   

>> @@ -604,18 +580,25 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,

>>   		if (tdepth > MAX_SCHED_RQ)

>>   			return -EINVAL;

>>   

>> +		if (blk_mq_is_sbitmap_shared(set->flags)) {

>> +			/* No point in allowing this to happen */

>> +			if (tdepth > set->queue_depth)

>> +				return -EINVAL;

>> +			return 0;

>> +		}

> 

> The above looks wrong, it isn't unusual to see small queue depth

> hardware meantime we often have scheduler queue depth of 2 * set->queue_depth.


ok, I suppose you're right.

> 

>> +

>>   		new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,

>> -				tags->nr_reserved_tags, set->flags);

>> +				tags->nr_reserved_tags);

>>   		if (!new)

>>   			return -ENOMEM;

>>   		ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);

>>   		if (ret) {

>> -			blk_mq_free_rq_map(new, set->flags);

>> +			blk_mq_free_rq_map(new);

>>   			return -ENOMEM;

>>   		}

>>   

>>   		blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);

>> -		blk_mq_free_rq_map(*tagsptr, set->flags);

>> +		blk_mq_free_rq_map(*tagsptr);

>>   		*tagsptr = new;

>>   	} else {

>>   		/*

>> @@ -631,12 +614,14 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,

>>   

>>   void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int size)

>>   {

>> -	sbitmap_queue_resize(&set->__bitmap_tags, size - set->reserved_tags);

>> +	struct blk_mq_tags *tags = set->shared_sbitmap_tags;

>> +

>> +	sbitmap_queue_resize(&tags->__bitmap_tags, size - set->reserved_tags);

>>   }

>>   

>>   void blk_mq_tag_update_sched_shared_sbitmap(struct request_queue *q)

>>   {

>> -	sbitmap_queue_resize(&q->sched_bitmap_tags,

>> +	sbitmap_queue_resize(q->shared_sbitmap_tags->bitmap_tags,

>>   			     q->nr_requests - q->tag_set->reserved_tags);

>>   }

>>   

>> diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h

>> index 88f3c6485543..c9fc52ee07c4 100644

>> --- a/block/blk-mq-tag.h

>> +++ b/block/blk-mq-tag.h

>> @@ -30,8 +30,8 @@ struct blk_mq_tags {

>>   

>>   extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,

>>   					unsigned int reserved_tags,

>> -					int node, unsigned int flags);

>> -extern void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags);

>> +					int node, int alloc_policy);

>> +extern void blk_mq_free_tags(struct blk_mq_tags *tags);

>>   extern int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,

>>   			       struct sbitmap_queue *breserved_tags,

>>   			       unsigned int queue_depth,

>> diff --git a/block/blk-mq.c b/block/blk-mq.c

>> index 4d6723cfa582..d3dd5fab3426 100644

>> --- a/block/blk-mq.c

>> +++ b/block/blk-mq.c

>> @@ -2348,6 +2348,9 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,

>>   	struct blk_mq_tags *drv_tags;

>>   	struct page *page;

>>   

>> +	if (blk_mq_is_sbitmap_shared(set->flags))

>> +		drv_tags = set->shared_sbitmap_tags;

>> +	else

>>   		drv_tags = set->tags[hctx_idx];

> 

> Here I guess you need to avoid to double ->exit_request()?


I'll check that it doesn't occur, but I didn't think it did.

> 

>>   

>>   	if (tags->static_rqs && set->ops->exit_request) {

>> @@ -2377,21 +2380,20 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,

>>   	}

>>   }

>>   

>> -void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags)

>> +void blk_mq_free_rq_map(struct blk_mq_tags *tags)

>>   {

>>   	kfree(tags->rqs);

>>   	tags->rqs = NULL;

>>   	kfree(tags->static_rqs);

>>   	tags->static_rqs = NULL;

>>   

>> -	blk_mq_free_tags(tags, flags);

>> +	blk_mq_free_tags(tags);

>>   }

>>   


...

>>   }

>> @@ -2877,11 +2886,11 @@ static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,

>>   static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,

>>   					 unsigned int hctx_idx)

>>   {

>> -	unsigned int flags = set->flags;

>> -

>>   	if (set->tags && set->tags[hctx_idx]) {

>> -		blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);

>> -		blk_mq_free_rq_map(set->tags[hctx_idx], flags);

>> +		if (!blk_mq_is_sbitmap_shared(set->flags)) {

> 

> I remember you hate negative check, :-)


Not always, but sometimes I think the code is harder to read with them.

> 

>> +			blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);

>> +			blk_mq_free_rq_map(set->tags[hctx_idx]);

> 

> We can add one helper of blk_mq_free_map_and_rqs(), and there seems

> several such pattern.


OK, I can check, but I don't think it's useful in the blk-mq sched code, 
as the tags and rqs are freed separately there, so I'm not sure how much 
we gain.

> 

>> +		}

>>   		set->tags[hctx_idx] = NULL;

>>   	}

>>   }

>> @@ -3348,6 +3357,21 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)

>>   {

>>   	int i;

>>   

>> +	if (blk_mq_is_sbitmap_shared(set->flags)) {

>> +		int ret;

>> +

>> +		set->shared_sbitmap_tags = blk_mq_alloc_rq_map(set, 0,

>> +						  set->queue_depth,

>> +						  set->reserved_tags);

>> +		if (!set->shared_sbitmap_tags)

>> +			return -ENOMEM;

>> +

>> +		ret = blk_mq_alloc_rqs(set, set->shared_sbitmap_tags, 0,

>> +				       set->queue_depth);

>> +		if (ret)

>> +			goto out_free_sbitmap_tags;

>> +	}

>> +

>>   	for (i = 0; i < set->nr_hw_queues; i++) {

>>   		if (!__blk_mq_alloc_map_and_request(set, i))

>>   			goto out_unwind;

>> @@ -3359,6 +3383,11 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)

>>   out_unwind:

>>   	while (--i >= 0)

>>   		blk_mq_free_map_and_requests(set, i);

>> +	if (blk_mq_is_sbitmap_shared(set->flags))

>> +		blk_mq_free_rqs(set, set->shared_sbitmap_tags, 0);

>> +out_free_sbitmap_tags:

>> +	if (blk_mq_is_sbitmap_shared(set->flags))

>> +		blk_mq_exit_shared_sbitmap(set);

> 

> Once a helper of blk_mq_alloc_map_and_rqs() is added, the above failure

> handling can be simplified too.

> 

> 


OK

Thanks a lot,
John
John Garry Aug. 23, 2021, 8:55 a.m. UTC | #3
On 18/08/2021 09:16, Ming Lei wrote:
>> ret = blk_mq_alloc_rqs(set, tags, 0, set->queue_depth);

>> +	if (ret) {

>> +		blk_mq_exit_sched_shared_sbitmap(queue);

>> +		return ret;

> There are two such patterns for allocate rq map and request pool

> together, please put them into one helper(such as blk_mq_alloc_map_and_rqs)

> which can return the allocated tags and handle failure inline. Also we may

> convert current users into this helper.

> 

>>   	}


Hi Ming,

Do you have a preference for the API:

int blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, struct 
blk_mq_tags **tags, unsigned int hctx_idx, unsigned int depth);

void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
  struct blk_mq_tags **tags, unsigned int hctx_idx);


or

struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
unsigned int hctx_idx, unsigned int depth);

void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
  struct blk_mq_tags *tags, unsigned int hctx_idx);


The advantage of the first is that we don't need the pattern:
blk_mq_free_map_and_requests(tags)
tags = NULL;

But then passing struct blk_mq_tags ** is a bit overly complicated.
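
(For illustration, a sketch of how the first variant could be implemented,
which is what makes the double pointer worthwhile: the helper clears the
caller's reference itself, so the "free, then set to NULL" pattern disappears
from the call sites. Purely an assumption about the eventual shape, reusing
the free calls from this patch.)

void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
				  struct blk_mq_tags **tags,
				  unsigned int hctx_idx)
{
	if (!*tags)
		return;

	blk_mq_free_rqs(set, *tags, hctx_idx);
	blk_mq_free_rq_map(*tags);
	*tags = NULL;	/* caller's reference cleared here */
}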

Thanks,
John

Patch

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index ac0408ebcd47..1101a2e4da9a 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -522,14 +522,19 @@  static int blk_mq_sched_alloc_map_and_request(struct request_queue *q,
 	struct blk_mq_tag_set *set = q->tag_set;
 	int ret;
 
+	if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) {
+		hctx->sched_tags = q->shared_sbitmap_tags;
+		return 0;
+	}
+
 	hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
-					       set->reserved_tags, set->flags);
+					       set->reserved_tags);
 	if (!hctx->sched_tags)
 		return -ENOMEM;
 
 	ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
 	if (ret) {
-		blk_mq_free_rq_map(hctx->sched_tags, set->flags);
+		blk_mq_free_rq_map(hctx->sched_tags);
 		hctx->sched_tags = NULL;
 	}
 
@@ -544,35 +549,39 @@  static void blk_mq_sched_tags_teardown(struct request_queue *q)
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (hctx->sched_tags) {
-			blk_mq_free_rq_map(hctx->sched_tags, hctx->flags);
+			if (!blk_mq_is_sbitmap_shared(q->tag_set->flags))
+				blk_mq_free_rq_map(hctx->sched_tags);
 			hctx->sched_tags = NULL;
 		}
 	}
 }
 
+static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue)
+{
+	blk_mq_free_rq_map(queue->shared_sbitmap_tags);
+	queue->shared_sbitmap_tags = NULL;
+}
+
 static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue)
 {
 	struct blk_mq_tag_set *set = queue->tag_set;
-	int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
-	struct blk_mq_hw_ctx *hctx;
-	int ret, i;
+	struct blk_mq_tags *tags;
+	int ret;
 
 	/*
 	 * Set initial depth at max so that we don't need to reallocate for
 	 * updating nr_requests.
 	 */
-	ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags,
-				  &queue->sched_breserved_tags,
-				  MAX_SCHED_RQ, set->reserved_tags,
-				  set->numa_node, alloc_policy);
-	if (ret)
-		return ret;
+	tags = queue->shared_sbitmap_tags = blk_mq_alloc_rq_map(set, 0,
+					  set->queue_depth,
+					  set->reserved_tags);
+	if (!queue->shared_sbitmap_tags)
+		return -ENOMEM;
 
-	queue_for_each_hw_ctx(queue, hctx, i) {
-		hctx->sched_tags->bitmap_tags =
-					&queue->sched_bitmap_tags;
-		hctx->sched_tags->breserved_tags =
-					&queue->sched_breserved_tags;
+	ret = blk_mq_alloc_rqs(set, tags, 0, set->queue_depth);
+	if (ret) {
+		blk_mq_exit_sched_shared_sbitmap(queue);
+		return ret;
 	}
 
 	blk_mq_tag_update_sched_shared_sbitmap(queue);
@@ -580,12 +589,6 @@  static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue)
 	return 0;
 }
 
-static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue)
-{
-	sbitmap_queue_free(&queue->sched_bitmap_tags);
-	sbitmap_queue_free(&queue->sched_breserved_tags);
-}
-
 int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 {
 	struct blk_mq_hw_ctx *hctx;
@@ -607,21 +610,21 @@  int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 	q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
 				   BLKDEV_DEFAULT_RQ);
 
-	queue_for_each_hw_ctx(q, hctx, i) {
-		ret = blk_mq_sched_alloc_map_and_request(q, hctx, i);
+	if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) {
+		ret = blk_mq_init_sched_shared_sbitmap(q);
 		if (ret)
-			goto err_free_map_and_request;
+			return ret;
 	}
 
-	if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) {
-		ret = blk_mq_init_sched_shared_sbitmap(q);
+	queue_for_each_hw_ctx(q, hctx, i) {
+		ret = blk_mq_sched_alloc_map_and_request(q, hctx, i);
 		if (ret)
 			goto err_free_map_and_request;
 	}
 
 	ret = e->ops.init_sched(q, e);
 	if (ret)
-		goto err_free_sbitmap;
+		goto err_free_map_and_request;
 
 	blk_mq_debugfs_register_sched(q);
 
@@ -641,12 +644,12 @@  int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 
 	return 0;
 
-err_free_sbitmap:
-	if (blk_mq_is_sbitmap_shared(q->tag_set->flags))
-		blk_mq_exit_sched_shared_sbitmap(q);
 err_free_map_and_request:
 	blk_mq_sched_free_requests(q);
 	blk_mq_sched_tags_teardown(q);
+	if (blk_mq_is_sbitmap_shared(q->tag_set->flags))
+		blk_mq_exit_sched_shared_sbitmap(q);
+
 	q->elevator = NULL;
 	return ret;
 }
@@ -660,9 +663,13 @@  void blk_mq_sched_free_requests(struct request_queue *q)
 	struct blk_mq_hw_ctx *hctx;
 	int i;
 
-	queue_for_each_hw_ctx(q, hctx, i) {
-		if (hctx->sched_tags)
-			blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
+	if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) {
+		blk_mq_free_rqs(q->tag_set, q->shared_sbitmap_tags, 0);
+	} else {
+		queue_for_each_hw_ctx(q, hctx, i) {
+			if (hctx->sched_tags)
+				blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
+		}
 	}
 }
 
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 5f06ad6efc8f..e97bbf477b96 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -27,10 +27,11 @@  bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
 	if (blk_mq_is_sbitmap_shared(hctx->flags)) {
 		struct request_queue *q = hctx->queue;
 		struct blk_mq_tag_set *set = q->tag_set;
+		struct blk_mq_tags *tags = set->shared_sbitmap_tags;
 
 		if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) &&
 		    !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
-			atomic_inc(&set->active_queues_shared_sbitmap);
+			atomic_inc(&tags->active_queues);
 	} else {
 		if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
 		    !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
@@ -61,10 +62,12 @@  void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 	struct blk_mq_tag_set *set = q->tag_set;
 
 	if (blk_mq_is_sbitmap_shared(hctx->flags)) {
+		struct blk_mq_tags *tags = set->shared_sbitmap_tags;
+
 		if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE,
 					&q->queue_flags))
 			return;
-		atomic_dec(&set->active_queues_shared_sbitmap);
+		atomic_dec(&tags->active_queues);
 	} else {
 		if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
 			return;
@@ -510,38 +513,16 @@  static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
 	return 0;
 }
 
-int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set)
-{
-	int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
-	int i, ret;
-
-	ret = blk_mq_init_bitmaps(&set->__bitmap_tags, &set->__breserved_tags,
-				  set->queue_depth, set->reserved_tags,
-				  set->numa_node, alloc_policy);
-	if (ret)
-		return ret;
-
-	for (i = 0; i < set->nr_hw_queues; i++) {
-		struct blk_mq_tags *tags = set->tags[i];
-
-		tags->bitmap_tags = &set->__bitmap_tags;
-		tags->breserved_tags = &set->__breserved_tags;
-	}
-
-	return 0;
-}
-
 void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set)
 {
-	sbitmap_queue_free(&set->__bitmap_tags);
-	sbitmap_queue_free(&set->__breserved_tags);
+	blk_mq_free_rq_map(set->shared_sbitmap_tags);
+	set->shared_sbitmap_tags = NULL;
 }
 
 struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
 				     unsigned int reserved_tags,
-				     int node, unsigned int flags)
+				     int node, int alloc_policy)
 {
-	int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(flags);
 	struct blk_mq_tags *tags;
 
 	if (total_tags > BLK_MQ_TAG_MAX) {
@@ -557,9 +538,6 @@  struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
 	tags->nr_reserved_tags = reserved_tags;
 	spin_lock_init(&tags->lock);
 
-	if (blk_mq_is_sbitmap_shared(flags))
-		return tags;
-
 	if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 0) {
 		kfree(tags);
 		return NULL;
@@ -567,12 +545,10 @@  struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
 	return tags;
 }
 
-void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags)
+void blk_mq_free_tags(struct blk_mq_tags *tags)
 {
-	if (!blk_mq_is_sbitmap_shared(flags)) {
-		sbitmap_queue_free(tags->bitmap_tags);
-		sbitmap_queue_free(tags->breserved_tags);
-	}
+	sbitmap_queue_free(tags->bitmap_tags);
+	sbitmap_queue_free(tags->breserved_tags);
 	kfree(tags);
 }
 
@@ -604,18 +580,25 @@  int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
 		if (tdepth > MAX_SCHED_RQ)
 			return -EINVAL;
 
+		if (blk_mq_is_sbitmap_shared(set->flags)) {
+			/* No point in allowing this to happen */
+			if (tdepth > set->queue_depth)
+				return -EINVAL;
+			return 0;
+		}
+
 		new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
-				tags->nr_reserved_tags, set->flags);
+				tags->nr_reserved_tags);
 		if (!new)
 			return -ENOMEM;
 		ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
 		if (ret) {
-			blk_mq_free_rq_map(new, set->flags);
+			blk_mq_free_rq_map(new);
 			return -ENOMEM;
 		}
 
 		blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
-		blk_mq_free_rq_map(*tagsptr, set->flags);
+		blk_mq_free_rq_map(*tagsptr);
 		*tagsptr = new;
 	} else {
 		/*
@@ -631,12 +614,14 @@  int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
 
 void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int size)
 {
-	sbitmap_queue_resize(&set->__bitmap_tags, size - set->reserved_tags);
+	struct blk_mq_tags *tags = set->shared_sbitmap_tags;
+
+	sbitmap_queue_resize(&tags->__bitmap_tags, size - set->reserved_tags);
 }
 
 void blk_mq_tag_update_sched_shared_sbitmap(struct request_queue *q)
 {
-	sbitmap_queue_resize(&q->sched_bitmap_tags,
+	sbitmap_queue_resize(q->shared_sbitmap_tags->bitmap_tags,
 			     q->nr_requests - q->tag_set->reserved_tags);
 }
 
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 88f3c6485543..c9fc52ee07c4 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -30,8 +30,8 @@  struct blk_mq_tags {
 
 extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
 					unsigned int reserved_tags,
-					int node, unsigned int flags);
-extern void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags);
+					int node, int alloc_policy);
+extern void blk_mq_free_tags(struct blk_mq_tags *tags);
 extern int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
 			       struct sbitmap_queue *breserved_tags,
 			       unsigned int queue_depth,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4d6723cfa582..d3dd5fab3426 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2348,6 +2348,9 @@  void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 	struct blk_mq_tags *drv_tags;
 	struct page *page;
 
+	if (blk_mq_is_sbitmap_shared(set->flags))
+		drv_tags = set->shared_sbitmap_tags;
+	else
 		drv_tags = set->tags[hctx_idx];
 
 	if (tags->static_rqs && set->ops->exit_request) {
@@ -2377,21 +2380,20 @@  void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 	}
 }
 
-void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags)
+void blk_mq_free_rq_map(struct blk_mq_tags *tags)
 {
 	kfree(tags->rqs);
 	tags->rqs = NULL;
 	kfree(tags->static_rqs);
 	tags->static_rqs = NULL;
 
-	blk_mq_free_tags(tags, flags);
+	blk_mq_free_tags(tags);
 }
 
 struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
 					unsigned int hctx_idx,
 					unsigned int nr_tags,
-					unsigned int reserved_tags,
-					unsigned int flags)
+					unsigned int reserved_tags)
 {
 	struct blk_mq_tags *tags;
 	int node;
@@ -2400,7 +2402,8 @@  struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
 	if (node == NUMA_NO_NODE)
 		node = set->numa_node;
 
-	tags = blk_mq_init_tags(nr_tags, reserved_tags, node, flags);
+	tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
+				BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
 	if (!tags)
 		return NULL;
 
@@ -2408,7 +2411,7 @@  struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
 				 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
 				 node);
 	if (!tags->rqs) {
-		blk_mq_free_tags(tags, flags);
+		blk_mq_free_tags(tags);
 		return NULL;
 	}
 
@@ -2417,7 +2420,7 @@  struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
 					node);
 	if (!tags->static_rqs) {
 		kfree(tags->rqs);
-		blk_mq_free_tags(tags, flags);
+		blk_mq_free_tags(tags);
 		return NULL;
 	}
 
@@ -2859,8 +2862,14 @@  static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
 	unsigned int flags = set->flags;
 	int ret = 0;
 
+	if (blk_mq_is_sbitmap_shared(flags)) {
+		set->tags[hctx_idx] = set->shared_sbitmap_tags;
+
+		return true;
+	}
+
 	set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
-					set->queue_depth, set->reserved_tags, flags);
+					set->queue_depth, set->reserved_tags);
 	if (!set->tags[hctx_idx])
 		return false;
 
@@ -2869,7 +2878,7 @@  static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
 	if (!ret)
 		return true;
 
-	blk_mq_free_rq_map(set->tags[hctx_idx], flags);
+	blk_mq_free_rq_map(set->tags[hctx_idx]);
 	set->tags[hctx_idx] = NULL;
 	return false;
 }
@@ -2877,11 +2886,11 @@  static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
 static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
 					 unsigned int hctx_idx)
 {
-	unsigned int flags = set->flags;
-
 	if (set->tags && set->tags[hctx_idx]) {
-		blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
-		blk_mq_free_rq_map(set->tags[hctx_idx], flags);
+		if (!blk_mq_is_sbitmap_shared(set->flags)) {
+			blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
+			blk_mq_free_rq_map(set->tags[hctx_idx]);
+		}
 		set->tags[hctx_idx] = NULL;
 	}
 }
@@ -3348,6 +3357,21 @@  static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 {
 	int i;
 
+	if (blk_mq_is_sbitmap_shared(set->flags)) {
+		int ret;
+
+		set->shared_sbitmap_tags = blk_mq_alloc_rq_map(set, 0,
+						  set->queue_depth,
+						  set->reserved_tags);
+		if (!set->shared_sbitmap_tags)
+			return -ENOMEM;
+
+		ret = blk_mq_alloc_rqs(set, set->shared_sbitmap_tags, 0,
+				       set->queue_depth);
+		if (ret)
+			goto out_free_sbitmap_tags;
+	}
+
 	for (i = 0; i < set->nr_hw_queues; i++) {
 		if (!__blk_mq_alloc_map_and_request(set, i))
 			goto out_unwind;
@@ -3359,6 +3383,11 @@  static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 out_unwind:
 	while (--i >= 0)
 		blk_mq_free_map_and_requests(set, i);
+	if (blk_mq_is_sbitmap_shared(set->flags))
+		blk_mq_free_rqs(set, set->shared_sbitmap_tags, 0);
+out_free_sbitmap_tags:
+	if (blk_mq_is_sbitmap_shared(set->flags))
+		blk_mq_exit_shared_sbitmap(set);
 
 	return -ENOMEM;
 }
@@ -3492,6 +3521,9 @@  int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 	if (set->ops->init_request && set->ops->init_request_no_hctx)
 		return -EINVAL;
 
+	if (set->ops->init_request && blk_mq_is_sbitmap_shared(set->flags))
+		return -EINVAL;
+
 	if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
 		pr_info("blk-mq: reduced tag depth to %u\n",
 			BLK_MQ_MAX_DEPTH);
@@ -3541,23 +3573,11 @@  int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 	if (ret)
 		goto out_free_mq_map;
 
-	if (blk_mq_is_sbitmap_shared(set->flags)) {
-		atomic_set(&set->active_queues_shared_sbitmap, 0);
-
-		if (blk_mq_init_shared_sbitmap(set)) {
-			ret = -ENOMEM;
-			goto out_free_mq_rq_maps;
-		}
-	}
-
 	mutex_init(&set->tag_list_lock);
 	INIT_LIST_HEAD(&set->tag_list);
 
 	return 0;
 
-out_free_mq_rq_maps:
-	for (i = 0; i < set->nr_hw_queues; i++)
-		blk_mq_free_map_and_requests(set, i);
 out_free_mq_map:
 	for (i = 0; i < set->nr_maps; i++) {
 		kfree(set->map[i].mq_map);
@@ -3589,11 +3609,15 @@  void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 {
 	int i, j;
 
-	for (i = 0; i < set->nr_hw_queues; i++)
-		blk_mq_free_map_and_requests(set, i);
+	if (blk_mq_is_sbitmap_shared(set->flags)) {
+		struct blk_mq_tags *tags = set->shared_sbitmap_tags;
 
-	if (blk_mq_is_sbitmap_shared(set->flags))
+		blk_mq_free_rqs(set, tags, 0);
 		blk_mq_exit_shared_sbitmap(set);
+	}
+
+	for (i = 0; i < set->nr_hw_queues; i++)
+		blk_mq_free_map_and_requests(set, i);
 
 	for (j = 0; j < set->nr_maps; j++) {
 		kfree(set->map[j].mq_map);
@@ -3631,12 +3655,6 @@  int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
 		if (hctx->sched_tags) {
 			ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
 						      nr, true);
-			if (blk_mq_is_sbitmap_shared(set->flags)) {
-				hctx->sched_tags->bitmap_tags =
-					&q->sched_bitmap_tags;
-				hctx->sched_tags->breserved_tags =
-					&q->sched_breserved_tags;
-			}
 		} else {
 			ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
 						      false);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index d08779f77a26..f595521a4b3d 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -54,12 +54,11 @@  void blk_mq_put_rq_ref(struct request *rq);
  */
 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 		     unsigned int hctx_idx);
-void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags);
+void blk_mq_free_rq_map(struct blk_mq_tags *tags);
 struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
 					unsigned int hctx_idx,
 					unsigned int nr_tags,
-					unsigned int reserved_tags,
-					unsigned int flags);
+					unsigned int reserved_tags);
 int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 		     unsigned int hctx_idx, unsigned int depth);
 
@@ -334,10 +333,11 @@  static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
 	if (blk_mq_is_sbitmap_shared(hctx->flags)) {
 		struct request_queue *q = hctx->queue;
 		struct blk_mq_tag_set *set = q->tag_set;
+		struct blk_mq_tags *tags = set->shared_sbitmap_tags;
 
 		if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
 			return true;
-		users = atomic_read(&set->active_queues_shared_sbitmap);
+		users = atomic_read(&tags->active_queues);
 	} else {
 		if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
 			return true;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index c838b24944c2..e67068bf648c 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -232,13 +232,11 @@  enum hctx_type {
  * @flags:	   Zero or more BLK_MQ_F_* flags.
  * @driver_data:   Pointer to data owned by the block driver that created this
  *		   tag set.
- * @active_queues_shared_sbitmap:
- * 		   number of active request queues per tag set.
- * @__bitmap_tags: A shared tags sbitmap, used over all hctx's
- * @__breserved_tags:
- *		   A shared reserved tags sbitmap, used over all hctx's
  * @tags:	   Tag sets. One tag set per hardware queue. Has @nr_hw_queues
  *		   elements.
+ * @shared_sbitmap_tags:
+ *		   Shared sbitmap set of tags. Has @nr_hw_queues elements. If
+ *		   set, shared by all @tags.
  * @tag_list_lock: Serializes tag_list accesses.
  * @tag_list:	   List of the request queues that use this tag set. See also
  *		   request_queue.tag_set_list.
@@ -255,12 +253,11 @@  struct blk_mq_tag_set {
 	unsigned int		timeout;
 	unsigned int		flags;
 	void			*driver_data;
-	atomic_t		active_queues_shared_sbitmap;
 
-	struct sbitmap_queue	__bitmap_tags;
-	struct sbitmap_queue	__breserved_tags;
 	struct blk_mq_tags	**tags;
 
+	struct blk_mq_tags	*shared_sbitmap_tags;
+
 	struct mutex		tag_list_lock;
 	struct list_head	tag_list;
 };
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 56870a43ae4c..f5a039c251e3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -467,8 +467,7 @@  struct request_queue {
 
 	atomic_t		nr_active_requests_shared_sbitmap;
 
-	struct sbitmap_queue	sched_bitmap_tags;
-	struct sbitmap_queue	sched_breserved_tags;
+	struct blk_mq_tags	*shared_sbitmap_tags;
 
 	struct list_head	icq_list;
 #ifdef CONFIG_BLK_CGROUP