diff mbox series

[PATCHv9,3/7] block: allow ability to limit partition write hints

Message ID 20241025213645.3464331-4-kbusch@meta.com
State New
Headers show
Series write hints with nvme fdp, scsi streams | expand

Commit Message

Keith Busch Oct. 25, 2024, 9:36 p.m. UTC
From: Keith Busch <kbusch@kernel.org>

When multiple partitions are used, you may want to enforce different
subsets of the available write hints for each partition. Provide a
bitmap attribute of the available write hints, and allow an admin to
write a different mask to set the partition's allowed write hints.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/bdev.c              | 15 +++++++++++++
 block/partitions/core.c   | 46 +++++++++++++++++++++++++++++++++++++--
 include/linux/blk_types.h |  1 +
 3 files changed, 60 insertions(+), 2 deletions(-)

Comments

Christoph Hellwig Oct. 28, 2024, 11:58 a.m. UTC | #1
On Fri, Oct 25, 2024 at 02:36:41PM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> When multiple partitions are used, you may want to enforce different
> subsets of the available write hints for each partition. Provide a
> bitmap attribute of the available write hints, and allow an admin to
> write a different mask to set the partition's allowed write hints.

Trying my best Greg impersonator voice: This needs to be documented
in Documentation/ABI/stable/sysfs-block.

That would have also helped me understand it.  AFAIK the split here
is an opt-in, which means the use case I explained in the previous
case would still not work out of the box, right?

> +	max_write_hints = bdev_max_write_hints(bdev);
> +	if (max_write_hints) {
> +		int size = BITS_TO_LONGS(max_write_hints) * sizeof(long);
> +
> +		bdev->write_hint_mask = kmalloc(size, GFP_KERNEL);
> +		if (!bdev->write_hint_mask) {
> +			free_percpu(bdev->bd_stats);
> +			iput(inode);
> +			return NULL;
> +		}
> +		memset(bdev->write_hint_mask, 0xff, size);
> +	}

This could simply use bitmap_alloc().  Similarly the other uses
would probably benefit from using the bitmap API.

> +	struct block_device *bdev = dev_to_bdev(dev);
> +	unsigned short max_write_hints = bdev_max_write_hints(bdev);
> +
> +	if (max_write_hints)
> +		return sprintf(buf, "%*pb\n", max_write_hints, bdev->write_hint_mask);
> +	else
> +		return sprintf(buf, "0");

No need for the else.  And if you write this as:

	if (!max_write_hints)
		return sprintf(buf, "0");
	return sprintf(buf, "%*pb\n", max_write_hints, bdev->write_hint_mask);

you'd also avoid the overly long line.

> +
> +static ssize_t part_write_hint_mask_store(struct device *dev,
> +					  struct device_attribute *attr,
> +					  const char *buf, size_t count)
> +{
> +	struct block_device *bdev = dev_to_bdev(dev);
> +	unsigned short max_write_hints = bdev_max_write_hints(bdev);
> +	unsigned long *new_mask;
> +	int size;
> +
> +	if (!max_write_hints)
> +		return count;
> +
> +	size = BITS_TO_LONGS(max_write_hints) * sizeof(long);
> +	new_mask = kzalloc(size, GFP_KERNEL);
> +	if (!new_mask)
> +		return -ENOMEM;
> +
> +	bitmap_parse(buf, count, new_mask, max_write_hints);
> +	bitmap_copy(bdev->write_hint_mask, new_mask, max_write_hints);

What protects access to bdev->write_hint_mask?
Kanchan Joshi Oct. 28, 2024, 2:40 p.m. UTC | #2
On 10/26/2024 3:06 AM, Keith Busch wrote:
> +static ssize_t part_write_hint_mask_store(struct device *dev,
> +					  struct device_attribute *attr,
> +					  const char *buf, size_t count)
> +{
> +	struct block_device *bdev = dev_to_bdev(dev);
> +	unsigned short max_write_hints = bdev_max_write_hints(bdev);
> +	unsigned long *new_mask;
> +	int size;
> +
> +	if (!max_write_hints)
> +		return count;
> +
> +	size = BITS_TO_LONGS(max_write_hints) * sizeof(long);
> +	new_mask = kzalloc(size, GFP_KERNEL);
> +	if (!new_mask)
> +		return -ENOMEM;
> +
> +	bitmap_parse(buf, count, new_mask, max_write_hints);
> +	bitmap_copy(bdev->write_hint_mask, new_mask, max_write_hints);

kfree(new_mask) here.
Keith Busch Oct. 28, 2024, 2:49 p.m. UTC | #3
On Mon, Oct 28, 2024 at 12:58:05PM +0100, Christoph Hellwig wrote:
> On Fri, Oct 25, 2024 at 02:36:41PM -0700, Keith Busch wrote:
> > From: Keith Busch <kbusch@kernel.org>
> > 
> > When multiple partitions are used, you may want to enforce different
> > subsets of the available write hints for each partition. Provide a
> > bitmap attribute of the available write hints, and allow an admin to
> > write a different mask to set the partition's allowed write hints.
> 
> Trying my best Greg impersonator voice: This needs to be documented
> in Documentation/ABI/stable/sysfs-block.
> 
> That would have also helped me understanding it.  AFAIK the split here
> is an opt-in, which means the use case I explained in the previous
> case would still not work out of the box, right?

Right.
Bart Van Assche Oct. 28, 2024, 6:27 p.m. UTC | #4
On 10/25/24 2:36 PM, Keith Busch wrote:
> When multiple partitions are used, you may want to enforce different
> subsets of the available write hints for each partition. Provide a
> bitmap attribute of the available write hints, and allow an admin to
> write a different mask to set the partition's allowed write hints.

After /proc/irq/*/smp_affinity was introduced (a bitmask),
/proc/irq/*/smp_affinity_list (set of ranges) was introduced as a more
user-friendly alternative. Is the same expected to happen with the
write_hint_mask? If so, shouldn't we skip the bitmask user space
interface and directly introduce the more user friendly interface (set
of ranges)?

Thanks,

Bart.
Keith Busch Oct. 28, 2024, 7:46 p.m. UTC | #5
On Mon, Oct 28, 2024 at 11:27:33AM -0700, Bart Van Assche wrote:
> On 10/25/24 2:36 PM, Keith Busch wrote:
> > When multiple partitions are used, you may want to enforce different
> > subsets of the available write hints for each partition. Provide a
> > bitmap attribute of the available write hints, and allow an admin to
> > write a different mask to set the partition's allowed write hints.
> 
> After /proc/irq/*/smp_affinity was introduced (a bitmask),
> /proc/irq/*/smp_affinity_list (set of ranges) was introduced as a more
> user-friendly alternative. Is the same expected to happen with the
> write_hint_mask? If so, shouldn't we skip the bitmask user space
> interface and directly introduce the more user friendly interface (set
> of ranges)?

I don't have much of an opinion either way. One thing I like for the
bitmask representation is you write 0 to turn it off vs. the list type
writes a null string. Writing 0 to disable just feels more natural to
me, but not a big deal.
diff mbox series

Patch

diff --git a/block/bdev.c b/block/bdev.c
index 738e3c8457e7f..5d23648db457b 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -414,6 +414,7 @@  void __init bdev_cache_init(void)
 
 struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
 {
+	unsigned short max_write_hints;
 	struct block_device *bdev;
 	struct inode *inode;
 
@@ -440,6 +441,20 @@  struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
 		return NULL;
 	}
 	bdev->bd_disk = disk;
+
+	max_write_hints = bdev_max_write_hints(bdev);
+	if (max_write_hints) {
+		int size = BITS_TO_LONGS(max_write_hints) * sizeof(long);
+
+		bdev->write_hint_mask = kmalloc(size, GFP_KERNEL);
+		if (!bdev->write_hint_mask) {
+			free_percpu(bdev->bd_stats);
+			iput(inode);
+			return NULL;
+		}
+		memset(bdev->write_hint_mask, 0xff, size);
+	}
+
 	return bdev;
 }
 
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 815ed33caa1b8..c0ea0a7b6fa87 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -203,6 +203,42 @@  static ssize_t part_discard_alignment_show(struct device *dev,
 	return sprintf(buf, "%u\n", bdev_discard_alignment(dev_to_bdev(dev)));
 }
 
+static ssize_t part_write_hint_mask_show(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf)
+{
+	struct block_device *bdev = dev_to_bdev(dev);
+	unsigned short max_write_hints = bdev_max_write_hints(bdev);
+
+	if (max_write_hints)
+		return sprintf(buf, "%*pb\n", max_write_hints, bdev->write_hint_mask);
+	else
+		return sprintf(buf, "0");
+}
+
+static ssize_t part_write_hint_mask_store(struct device *dev,
+					  struct device_attribute *attr,
+					  const char *buf, size_t count)
+{
+	struct block_device *bdev = dev_to_bdev(dev);
+	unsigned short max_write_hints = bdev_max_write_hints(bdev);
+	unsigned long *new_mask;
+	int size;
+
+	if (!max_write_hints)
+		return count;
+
+	size = BITS_TO_LONGS(max_write_hints) * sizeof(long);
+	new_mask = kzalloc(size, GFP_KERNEL);
+	if (!new_mask)
+		return -ENOMEM;
+
+	bitmap_parse(buf, count, new_mask, max_write_hints);
+	bitmap_copy(bdev->write_hint_mask, new_mask, max_write_hints);
+
+	return count;
+}
+
 static DEVICE_ATTR(partition, 0444, part_partition_show, NULL);
 static DEVICE_ATTR(start, 0444, part_start_show, NULL);
 static DEVICE_ATTR(size, 0444, part_size_show, NULL);
@@ -211,6 +247,8 @@  static DEVICE_ATTR(alignment_offset, 0444, part_alignment_offset_show, NULL);
 static DEVICE_ATTR(discard_alignment, 0444, part_discard_alignment_show, NULL);
 static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
 static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
+static DEVICE_ATTR(write_hint_mask, 0644, part_write_hint_mask_show,
+		   part_write_hint_mask_store);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 static struct device_attribute dev_attr_fail =
 	__ATTR(make-it-fail, 0644, part_fail_show, part_fail_store);
@@ -225,6 +263,7 @@  static struct attribute *part_attrs[] = {
 	&dev_attr_discard_alignment.attr,
 	&dev_attr_stat.attr,
 	&dev_attr_inflight.attr,
+	&dev_attr_write_hint_mask.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	&dev_attr_fail.attr,
 #endif
@@ -245,8 +284,11 @@  static const struct attribute_group *part_attr_groups[] = {
 
 static void part_release(struct device *dev)
 {
-	put_disk(dev_to_bdev(dev)->bd_disk);
-	bdev_drop(dev_to_bdev(dev));
+	struct block_device *part = dev_to_bdev(dev);
+
+	kfree(part->write_hint_mask);
+	put_disk(part->bd_disk);
+	bdev_drop(part);
 }
 
 static int part_uevent(const struct device *dev, struct kobj_uevent_env *env)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 6737795220e18..af430e543f7f7 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -73,6 +73,7 @@  struct block_device {
 #ifdef CONFIG_SECURITY
 	void			*bd_security;
 #endif
+	unsigned long		*write_hint_mask;
 	/*
 	 * keep this out-of-line as it's both big and not needed in the fast
 	 * path