
[v4,2/7] block: Support configuring limits below the page size

Message ID 20230130212656.876311-3-bvanassche@acm.org
State New
Series Add support for limits below the page size

Commit Message

Bart Van Assche Jan. 30, 2023, 9:26 p.m. UTC
Allow block drivers to configure the following:
* A maximum number of hardware sectors (max_hw_sectors) below
  PAGE_SIZE >> SECTOR_SHIFT. With PAGE_SIZE = 4096 this means that values
  below 8 are supported.
* A maximum segment size below the page size. This is most useful
  for page sizes above 4096 bytes.

The blk_sub_page_limits static branch will be used in later patches to
prevent the performance of block drivers that support segment sizes >=
PAGE_SIZE and max_hw_sectors >= PAGE_SIZE >> SECTOR_SHIFT from being
affected.

Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Keith Busch <kbusch@kernel.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 block/blk-core.c       |  1 +
 block/blk-mq-debugfs.c |  5 +++
 block/blk-settings.c   | 82 +++++++++++++++++++++++++++++++++++++-----
 block/blk.h            | 10 ++++++
 include/linux/blkdev.h |  2 ++
 5 files changed, 92 insertions(+), 8 deletions(-)
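
For context on the performance note above: the new checks are gated
behind a static key, so queues that never configure sub-page limits pay
at most a patched-out branch. Below is a minimal sketch of the pattern
(mirroring the blk.h hunk at the bottom of this page; the
bio_needs_sub_page_split() caller is a hypothetical illustration, not
code from this series):

#include <linux/jump_label.h>
#include <linux/blkdev.h>

DECLARE_STATIC_KEY_FALSE(blk_sub_page_limits);

static inline bool blk_queue_sub_page_limits(const struct queue_limits *lim)
{
	/* Compiles to a no-op branch until the first queue opts in. */
	return static_branch_unlikely(&blk_sub_page_limits) &&
		lim->sub_page_limits;
}

/*
 * Hypothetical hot-path caller: the sub-page check is skipped entirely
 * unless at least one queue has enabled sub-page limits.
 */
static bool bio_needs_sub_page_split(struct request_queue *q, struct bio *bio)
{
	if (!blk_queue_sub_page_limits(&q->limits))
		return false;
	return bio->bi_io_vec[0].bv_len > q->limits.max_segment_size;
}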

Comments

Luis Chamberlain Feb. 1, 2023, 11:50 p.m. UTC | #1
On Mon, Jan 30, 2023 at 01:26:51PM -0800, Bart Van Assche wrote:
> Allow block drivers to configure the following:
> * A maximum number of hardware sectors (max_hw_sectors) below
>   PAGE_SIZE >> SECTOR_SHIFT. With PAGE_SIZE = 4096 this means that values
>   below 8 are supported.
> * A maximum segment size below the page size. This is most useful
>   for page sizes above 4096 bytes.
> 
> The blk_sub_page_limits static branch will be used in later patches to
> prevent the performance of block drivers that support segment sizes >=
> PAGE_SIZE and max_hw_sectors >= PAGE_SIZE >> SECTOR_SHIFT from being
> affected.

This commit log does not make it clear whether or not there is a
functional change.

> @@ -100,6 +106,55 @@ void blk_queue_bounce_limit(struct request_queue *q, enum blk_bounce bounce)
>  }
>  EXPORT_SYMBOL(blk_queue_bounce_limit);
>  
> +/* For debugfs. */
> +int blk_sub_page_limit_queues_get(void *data, u64 *val)
> +{
> +	*val = READ_ONCE(blk_nr_sub_page_limit_queues);
> +
> +	return 0;
> +}
> +
> +/**
> + * blk_enable_sub_page_limits - enable support for max_segment_size values smaller than PAGE_SIZE and for max_hw_sectors values below PAGE_SIZE >> SECTOR_SHIFT

That's a really long line for kdoc, can't we just trim it?

And why not update the docs to reflect the change?

> @@ -122,12 +177,17 @@ EXPORT_SYMBOL(blk_queue_bounce_limit);
>  void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors)
>  {
>  	struct queue_limits *limits = &q->limits;
> +	unsigned int min_max_hw_sectors = PAGE_SIZE >> SECTOR_SHIFT;
>  	unsigned int max_sectors;
>  
> -	if ((max_hw_sectors << 9) < PAGE_SIZE) {
> -		max_hw_sectors = 1 << (PAGE_SHIFT - 9);
> -		printk(KERN_INFO "%s: set to minimum %d\n",
> -		       __func__, max_hw_sectors);
> +	if (max_hw_sectors < min_max_hw_sectors) {
> +		blk_enable_sub_page_limits(limits);
> +		min_max_hw_sectors = 1;
> +	}

Up to this part this is a non-functional update, and so why not a
separate patch to clarify that?

> +
> +	if (max_hw_sectors < min_max_hw_sectors) {
> +		max_hw_sectors = min_max_hw_sectors;
> +		pr_info("%s: set to minimum %u\n", __func__, max_hw_sectors);

But if I understand correctly, here we're now changing
max_hw_sectors from 1 to whatever the driver set on
blk_queue_max_hw_sectors() if it's smaller than PAGE_SIZE.

To determine if this is a functional change, it begs the
question as to how many block drivers have a max hw sector
smaller than the equivalent PAGE_SIZE and whether that
could regress.

>  	}
>  
>  	max_hw_sectors = round_down(max_hw_sectors,
> @@ -282,10 +342,16 @@ EXPORT_SYMBOL_GPL(blk_queue_max_discard_segments);
>   **/
>  void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size)
>  {
> -	if (max_size < PAGE_SIZE) {
> -		max_size = PAGE_SIZE;
> -		printk(KERN_INFO "%s: set to minimum %d\n",
> -		       __func__, max_size);
> +	unsigned int min_max_segment_size = PAGE_SIZE;
> +
> +	if (max_size < min_max_segment_size) {
> +		blk_enable_sub_page_limits(&q->limits);
> +		min_max_segment_size = SECTOR_SIZE;
> +	}
> +
> +	if (max_size < min_max_segment_size) {
> +		max_size = min_max_segment_size;
> +		pr_info("%s: set to minimum %u\n", __func__, max_size);

And similar thing here.

  Luis
Bart Van Assche Feb. 7, 2023, 12:02 a.m. UTC | #2
On 2/1/23 15:50, Luis Chamberlain wrote:
> On Mon, Jan 30, 2023 at 01:26:51PM -0800, Bart Van Assche wrote:
>> @@ -122,12 +177,17 @@ EXPORT_SYMBOL(blk_queue_bounce_limit);
>>   void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors)
>>   {
>>   	struct queue_limits *limits = &q->limits;
>> +	unsigned int min_max_hw_sectors = PAGE_SIZE >> SECTOR_SHIFT;
>>   	unsigned int max_sectors;
>>   
>> -	if ((max_hw_sectors << 9) < PAGE_SIZE) {
>> -		max_hw_sectors = 1 << (PAGE_SHIFT - 9);
>> -		printk(KERN_INFO "%s: set to minimum %d\n",
>> -		       __func__, max_hw_sectors);
>> +	if (max_hw_sectors < min_max_hw_sectors) {
>> +		blk_enable_sub_page_limits(limits);
>> +		min_max_hw_sectors = 1;
>> +	}
> 
> Up to this part this is a non-functional update, and so why not a
> separate patch to clarify that?

Will do.

>> +
>> +	if (max_hw_sectors < min_max_hw_sectors) {
>> +		max_hw_sectors = min_max_hw_sectors;
>> +		pr_info("%s: set to minimum %u\n", __func__, max_hw_sectors);
> 
> But if I understand correctly, here we're now changing
> max_hw_sectors from 1 to whatever the driver set on
> blk_queue_max_hw_sectors() if it's smaller than PAGE_SIZE.
> 
> To determine if this is a functional change, it begs the
> question as to how many block drivers have a max hw sector
> smaller than the equivalent PAGE_SIZE and whether that
> could regress.

If a block driver passes an argument to blk_queue_max_hw_sectors() or 
blk_queue_max_segment_size() that is smaller than what is supported by 
the block layer, data corruption will be triggered if the block driver 
or the hardware supported by the block driver does not support the 
larger values chosen by the block layer. Changing limits that will 
trigger data corruption into limits that may work seems like an 
improvement to me?
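
To make the failure mode concrete (PAGE_SIZE = 16 KiB assumed for
illustration; this is the pre-patch clamp, simplified from the '-'
lines of the blk_queue_max_segment_size() hunk earlier in this thread):

	/* Driver states its real limit: blk_queue_max_segment_size(q, 4096) */
	if (max_size < PAGE_SIZE) {	/* 4096 < 16384 */
		max_size = PAGE_SIZE;	/* silently raised to 16384 */
		printk(KERN_INFO "%s: set to minimum %d\n",
		       __func__, max_size);
	}
	/*
	 * A DMA engine limited to 4 KiB per segment can now be handed
	 * 16 KiB segments, i.e. transfers outside its supported limits.
	 */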

Thanks,

Bart.
Luis Chamberlain Feb. 7, 2023, 12:19 a.m. UTC | #3
On Mon, Feb 06, 2023 at 04:02:46PM -0800, Bart Van Assche wrote:
> On 2/1/23 15:50, Luis Chamberlain wrote:
> > On Mon, Jan 30, 2023 at 01:26:51PM -0800, Bart Van Assche wrote:
> > > @@ -122,12 +177,17 @@ EXPORT_SYMBOL(blk_queue_bounce_limit);
> > >   void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors)
> > >   {
> > >   	struct queue_limits *limits = &q->limits;
> > > +	unsigned int min_max_hw_sectors = PAGE_SIZE >> SECTOR_SHIFT;
> > >   	unsigned int max_sectors;
> > > -	if ((max_hw_sectors << 9) < PAGE_SIZE) {
> > > -		max_hw_sectors = 1 << (PAGE_SHIFT - 9);
> > > -		printk(KERN_INFO "%s: set to minimum %d\n",
> > > -		       __func__, max_hw_sectors);
> > > +	if (max_hw_sectors < min_max_hw_sectors) {
> > > +		blk_enable_sub_page_limits(limits);
> > > +		min_max_hw_sectors = 1;
> > > +	}
> > 
> > Up to this part this is a non-functional update, and so why not a
> > separate patch to clarify that?
> 
> Will do.
> 
> > > +
> > > +	if (max_hw_sectors < min_max_hw_sectors) {
> > > +		max_hw_sectors = min_max_hw_sectors;
> > > +		pr_info("%s: set to minimum %u\n", __func__, max_hw_sectors);
> > 
> > But if I understand correctly, here we're now changing
> > max_hw_sectors from 1 to whatever the driver set on
> > blk_queue_max_hw_sectors() if it's smaller than PAGE_SIZE.
> > 
> > To determine if this is a functional change, it begs the
> > question as to how many block drivers have a max hw sector
> > smaller than the equivalent PAGE_SIZE and whether that
> > could regress.
> 
> If a block driver passes an argument to blk_queue_max_hw_sectors() or
> blk_queue_max_segment_size() that is smaller than what is supported by the
> block layer, data corruption will be triggered if the block driver or the
> hardware supported by the block driver does not support the larger values
> chosen by the block layer. Changing limits that will trigger data corruption
> into limits that may work seems like an improvement to me?

Wow, clearly! Without a doubt!

But I'm trying to do a careful review here.

The commit log did not describe what *does* happen in these situations today,
and you seem to now be suggesting in the worst case corruption can happen.
That changes the patch context quite a bit!

My question above still stands though, how many block drivers have a max
hw sector smaller than the equivalent PAGE_SIZE. If you make your
change, even if it fixes some new use case where corruption is seen, can
it regress some old use cases for some old controllers?

  Luis
Bart Van Assche Feb. 7, 2023, 12:31 a.m. UTC | #4
On 2/6/23 16:19, Luis Chamberlain wrote:
> But I'm trying to do a careful review here.

That's appreciated :-)

> The commit log did not describe what *does* happen in these situations today,
> and you seem to now be suggesting in the worst case corruption can happen.
> That changes the patch context quite a bit!
> 
> My question above still stands though, how many block drivers have a max
> hw sector smaller than the equivalent PAGE_SIZE. If you make your
> change, even if it fixes some new use case where corruption is seen, can
> it regress some old use cases for some old controllers?

The blk_queue_max_hw_sectors() change has been requested by a 
contributor to the MMC driver (I'm not familiar with the MMC driver).

I'm not aware of any storage controllers for which the maximum segment 
size is below 4 KiB.

For some storage controllers, e.g. the UFS Exynos controller, the 
maximum supported segment size is 4 KiB. This patch series makes such 
storage controllers compatible with larger page sizes, e.g. 16 KiB.
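
As a sketch of what that looks like from the driver side (hypothetical
call sequence; the numbers assume PAGE_SIZE = 16 KiB and are not taken
from this series):

	/* Controller whose DMA engine handles at most 4 KiB per segment. */
	blk_queue_max_segment_size(q, 4096);
	/*
	 * Before this series: the limit was silently raised to PAGE_SIZE
	 * (16384), beyond what the hardware supports.
	 * After this series: the 4096 limit is kept, sub-page limits are
	 * enabled for this queue only, and the static key leaves the hot
	 * path of all other queues unchanged.
	 */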

Does this answer your question?

Thanks,

Bart.
Luis Chamberlain Feb. 7, 2023, 2:08 a.m. UTC | #5
On Mon, Feb 06, 2023 at 04:31:58PM -0800, Bart Van Assche wrote:
> On 2/6/23 16:19, Luis Chamberlain wrote:
> > But I'm trying to do a careful review here.
> 
> That's appreciated :-)
> 
> > The commit log did not describe what *does* happen in these situations today,
> > and you seem to now be suggesting in the worst case corruption can happen.
> > That changes the patch context quite a bit!
> > 
> > My question above still stands though, how many block drivers have a max
> > hw sector smaller than the equivalent PAGE_SIZE. If you make your
> > change, even if it fixes some new use case where corruption is seen, can
> > it regress some old use cases for some old controllers?
> 
> The blk_queue_max_hw_sectors() change has been requested by a contributor to
> the MMC driver (I'm not familiar with the MMC driver).
> 
> I'm not aware of any storage controllers for which the maximum segment size
> is below 4 KiB.

Then the commit log should mention that. Because do you admit that it
could possibly change their behaviour?

> For some storage controllers, e.g. the UFS Exynos controller, the maximum
> supported segment size is 4 KiB. This patch series makes such storage
> controllers compatible with larger page sizes, e.g. 16 KiB.
> 
> Does this answer your question?

Does mine answer the reason why I am asking it? If we are sure these
don't exist then please mention that in the commit log. And also, more
importantly, mention the possible corruption issue you describe, which
could happen! Was corruption actually observed in real life or reported!?

  Luis
Bart Van Assche Feb. 7, 2023, 6:26 p.m. UTC | #6
On 2/6/23 18:08, Luis Chamberlain wrote:
> On Mon, Feb 06, 2023 at 04:31:58PM -0800, Bart Van Assche wrote:
>> On 2/6/23 16:19, Luis Chamberlain wrote:
>>> But I'm trying to do a careful review here.
>>
>> That's appreciated :-)
>>
>>> The commit log did not describe what *does* happen in these situations today,
>>> and you seem to now be suggesting in the worst case corruption can happen.
>>> That changes the patch context quite a bit!
>>>
>>> My question above still stands though, how many block drivers have a max
>>> hw sector smaller than the equivalent PAGE_SIZE. If you make your
>>> change, even if it fixes some new use case where corruption is seen, can
>>> it regress some old use cases for some old controllers?
>>
>> The blk_queue_max_hw_sectors() change has been requested by a contributor to
>> the MMC driver (I'm not familiar with the MMC driver).
>>
>> I'm not aware of any storage controllers for which the maximum segment size
>> is below 4 KiB.
> 
> Then the commit log should mention that. Because do you admit that it
> could possibly change their behaviour?

I will make the commit message more detailed.

>> For some storage controllers, e.g. the UFS Exynos controller, the maximum
>> supported segment size is 4 KiB. This patch series makes such storage
>> controllers compatible with larger page sizes, e.g. 16 KiB.
>>
>> Does this answer your question?
> 
> Does mine answer the reason why I am asking it? If we are sure these
> don't exist then please mention that in the commit log. And also, more
> importantly, mention the possible corruption issue you describe, which
> could happen! Was corruption actually observed in real life or reported!?

Incorrect data transfers have been observed in our tests: with the
Exynos controller and PAGE_SIZE = 16 KiB, booting fails without this
patch series. I think booting fails because the DMA engine in this
controller is asked to perform transfers that fall outside the
supported limits. I expect similar behavior for all other storage
controllers with a DMA engine.

Bart.

Patch

diff --git a/block/blk-core.c b/block/blk-core.c
index 0dacc2df9588..b193040c7c73 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -270,6 +270,7 @@  static void blk_free_queue(struct request_queue *q)
 
 	blk_free_queue_stats(q->stats);
 	kfree(q->poll_stat);
+	blk_disable_sub_page_limits(&q->limits);
 
 	if (queue_is_mq(q))
 		blk_mq_release(q);
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 60d1de0ce624..4f06e02961f3 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -875,7 +875,12 @@  void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx)
 	hctx->sched_debugfs_dir = NULL;
 }
 
+DEFINE_DEBUGFS_ATTRIBUTE(blk_sub_page_limit_queues_fops,
+			blk_sub_page_limit_queues_get, NULL, "%llu\n");
+
 void blk_mq_debugfs_init(void)
 {
 	blk_debugfs_root = debugfs_create_dir("block", NULL);
+	debugfs_create_file("sub_page_limit_queues", 0400, blk_debugfs_root,
+			    NULL, &blk_sub_page_limit_queues_fops);
 }
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 9c9713c9269c..46d43cef8377 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -18,6 +18,11 @@ 
 #include "blk.h"
 #include "blk-wbt.h"
 
+/* Protects blk_nr_sub_page_limit_queues and blk_sub_page_limits changes. */
+static DEFINE_MUTEX(blk_sub_page_limit_lock);
+static uint32_t blk_nr_sub_page_limit_queues;
+DEFINE_STATIC_KEY_FALSE(blk_sub_page_limits);
+
 void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout)
 {
 	q->rq_timeout = timeout;
@@ -58,6 +63,7 @@  void blk_set_default_limits(struct queue_limits *lim)
 	lim->zoned = BLK_ZONED_NONE;
 	lim->zone_write_granularity = 0;
 	lim->dma_alignment = 511;
+	lim->sub_page_limits = false;
 }
 
 /**
@@ -100,6 +106,55 @@  void blk_queue_bounce_limit(struct request_queue *q, enum blk_bounce bounce)
 }
 EXPORT_SYMBOL(blk_queue_bounce_limit);
 
+/* For debugfs. */
+int blk_sub_page_limit_queues_get(void *data, u64 *val)
+{
+	*val = READ_ONCE(blk_nr_sub_page_limit_queues);
+
+	return 0;
+}
+
+/**
+ * blk_enable_sub_page_limits - enable support for max_segment_size values smaller than PAGE_SIZE and for max_hw_sectors values below PAGE_SIZE >> SECTOR_SHIFT
+ * @lim: request queue limits for which to enable support of these features.
+ *
+ * Support for these features is not enabled all the time because of the
+ * runtime overhead of these features.
+ */
+static void blk_enable_sub_page_limits(struct queue_limits *lim)
+{
+	if (lim->sub_page_limits)
+		return;
+
+	lim->sub_page_limits = true;
+
+	mutex_lock(&blk_sub_page_limit_lock);
+	if (++blk_nr_sub_page_limit_queues == 1)
+		static_branch_enable(&blk_sub_page_limits);
+	mutex_unlock(&blk_sub_page_limit_lock);
+}
+
+/**
+ * blk_disable_sub_page_limits - disable support for max_segment_size values smaller than PAGE_SIZE and for max_hw_sectors values below PAGE_SIZE >> SECTOR_SHIFT
+ * @lim: request queue limits for which to enable support of these features.
+ *
+ * Support for these features is not enabled all the time because of the
+ * runtime overhead of these features.
+ */
+void blk_disable_sub_page_limits(struct queue_limits *lim)
+{
+	if (!lim->sub_page_limits)
+		return;
+
+	lim->sub_page_limits = false;
+
+	mutex_lock(&blk_sub_page_limit_lock);
+	WARN_ON_ONCE(blk_nr_sub_page_limit_queues <= 0);
+	if (--blk_nr_sub_page_limit_queues == 0)
+		static_branch_disable(&blk_sub_page_limits);
+	mutex_unlock(&blk_sub_page_limit_lock);
+}
+
 /**
  * blk_queue_max_hw_sectors - set max sectors for a request for this queue
  * @q:  the request queue for the device
@@ -122,12 +177,17 @@  EXPORT_SYMBOL(blk_queue_bounce_limit);
 void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors)
 {
 	struct queue_limits *limits = &q->limits;
+	unsigned int min_max_hw_sectors = PAGE_SIZE >> SECTOR_SHIFT;
 	unsigned int max_sectors;
 
-	if ((max_hw_sectors << 9) < PAGE_SIZE) {
-		max_hw_sectors = 1 << (PAGE_SHIFT - 9);
-		printk(KERN_INFO "%s: set to minimum %d\n",
-		       __func__, max_hw_sectors);
+	if (max_hw_sectors < min_max_hw_sectors) {
+		blk_enable_sub_page_limits(limits);
+		min_max_hw_sectors = 1;
+	}
+
+	if (max_hw_sectors < min_max_hw_sectors) {
+		max_hw_sectors = min_max_hw_sectors;
+		pr_info("%s: set to minimum %u\n", __func__, max_hw_sectors);
 	}
 
 	max_hw_sectors = round_down(max_hw_sectors,
@@ -282,10 +342,16 @@  EXPORT_SYMBOL_GPL(blk_queue_max_discard_segments);
  **/
 void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size)
 {
-	if (max_size < PAGE_SIZE) {
-		max_size = PAGE_SIZE;
-		printk(KERN_INFO "%s: set to minimum %d\n",
-		       __func__, max_size);
+	unsigned int min_max_segment_size = PAGE_SIZE;
+
+	if (max_size < min_max_segment_size) {
+		blk_enable_sub_page_limits(&q->limits);
+		min_max_segment_size = SECTOR_SIZE;
+	}
+
+	if (max_size < min_max_segment_size) {
+		max_size = min_max_segment_size;
+		pr_info("%s: set to minimum %u\n", __func__, max_size);
 	}
 
 	/* see blk_queue_virt_boundary() for the explanation */
diff --git a/block/blk.h b/block/blk.h
index 4c3b3325219a..9a56d7002efc 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -13,6 +13,7 @@  struct elevator_type;
 #define BLK_MAX_TIMEOUT		(5 * HZ)
 
 extern struct dentry *blk_debugfs_root;
+DECLARE_STATIC_KEY_FALSE(blk_sub_page_limits);
 
 struct blk_flush_queue {
 	unsigned int		flush_pending_idx:1;
@@ -32,6 +33,15 @@  struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
 					      gfp_t flags);
 void blk_free_flush_queue(struct blk_flush_queue *q);
 
+static inline bool blk_queue_sub_page_limits(const struct queue_limits *lim)
+{
+	return static_branch_unlikely(&blk_sub_page_limits) &&
+		lim->sub_page_limits;
+}
+
+int blk_sub_page_limit_queues_get(void *data, u64 *val);
+void blk_disable_sub_page_limits(struct queue_limits *q);
+
 void blk_freeze_queue(struct request_queue *q);
 void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
 void blk_queue_start_drain(struct request_queue *q);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b9637d63e6f0..af04bf241714 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -319,6 +319,8 @@  struct queue_limits {
 	 * due to possible offsets.
 	 */
 	unsigned int		dma_alignment;
+
+	bool			sub_page_limits;
 };
 
 typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx,