diff mbox series

[v3,10/15] block: Add fops atomic write support

Message ID 20240124113841.31824-11-john.g.garry@oracle.com
State New
Headers show
Series block atomic writes | expand

Commit Message

John Garry Jan. 24, 2024, 11:38 a.m. UTC
Add support for atomic writes, as follows:
- Ensure that the IO follows all the atomic write rules, such as being
  naturally aligned
- Set REQ_ATOMIC

IOCB_ATOMIC is always ignored for reads.

Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 block/fops.c | 44 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

Comments

Nilay Shroff Feb. 13, 2024, 9:36 a.m. UTC | #1
>+static bool blkdev_atomic_write_valid(struct block_device *bdev, loff_t pos,
>+				      struct iov_iter *iter)
>+{
>+	struct request_queue *q = bdev_get_queue(bdev);
>+	unsigned int min_bytes = queue_atomic_write_unit_min_bytes(q);
>+	unsigned int max_bytes = queue_atomic_write_unit_max_bytes(q);
>+
>+	if (!iter_is_ubuf(iter))
>+		return false;
>+	if (iov_iter_count(iter) & (min_bytes - 1))
>+		return false;
>+	if (!is_power_of_2(iov_iter_count(iter)))
>+		return false;
>+	if (pos & (iov_iter_count(iter) - 1))
>+		return false;
>+	if (iov_iter_count(iter) > max_bytes)
>+		return false;
>+	return true;
>+}

Do we also need to validate here that the IO doesn't straddle
the atomic boundary limit (if it's non-zero)? We do check that the IO
doesn't straddle the atomic boundary limit, but that happens very
late in the IO code path, either during blk-merge or in the NVMe driver
code.

Thanks,
--Nilay
John Garry Feb. 13, 2024, 9:58 a.m. UTC | #2
On 13/02/2024 09:36, Nilay Shroff wrote:
>> +static bool blkdev_atomic_write_valid(struct block_device *bdev, loff_t pos,
> 
>> +				      struct iov_iter *iter)
> 
>> +{
> 
>> +	struct request_queue *q = bdev_get_queue(bdev);
> 
>> +	unsigned int min_bytes = queue_atomic_write_unit_min_bytes(q);
> 
>> +	unsigned int max_bytes = queue_atomic_write_unit_max_bytes(q);
> 
>> +
> 
>> +	if (!iter_is_ubuf(iter))
> 
>> +		return false;
> 
>> +	if (iov_iter_count(iter) & (min_bytes - 1))
> 
>> +		return false;
> 
>> +	if (!is_power_of_2(iov_iter_count(iter)))
> 
>> +		return false;
> 
>> +	if (pos & (iov_iter_count(iter) - 1))
> 
>> +		return false;
> 
>> +	if (iov_iter_count(iter) > max_bytes)
> 
>> +		return false;
> 
>> +	return true;
> 
>> +}
> 
> 
> 
> Here do we need to also validate whether the IO doesn't straddle
> 
> the atomic boundary limit (if it's non-zero)? We do check that IO
> 
> doesn't straddle the atomic boundary limit but that happens very
> 
> late in the IO code path either during blk-merge or in NVMe driver
> 
> code.

This relies on atomic_write_unit_max being <= atomic_write_boundary, and on
both being a power-of-2. Please see the NVMe patch, where this is checked.
Indeed, it would not make sense if atomic_write_unit_max >
atomic_write_boundary (when non-zero).

So if the write is naturally aligned and its size is <= 
atomic_write_unit_max, then it cannot be straddling a boundary.

Thanks,
John
Nilay Shroff Feb. 13, 2024, 11:08 a.m. UTC | #3
On 2/13/24 15:28, John Garry wrote:
> On 13/02/2024 09:36, Nilay Shroff wrote:
>>> +static bool blkdev_atomic_write_valid(struct block_device *bdev, loff_t pos,
>>
>>> +                      struct iov_iter *iter)
>>
>>> +{
>>
>>> +    struct request_queue *q = bdev_get_queue(bdev);
>>
>>> +    unsigned int min_bytes = queue_atomic_write_unit_min_bytes(q);
>>
>>> +    unsigned int max_bytes = queue_atomic_write_unit_max_bytes(q);
>>
>>> +
>>
>>> +    if (!iter_is_ubuf(iter))
>>
>>> +        return false;
>>
>>> +    if (iov_iter_count(iter) & (min_bytes - 1))
>>
>>> +        return false;
>>
>>> +    if (!is_power_of_2(iov_iter_count(iter)))
>>
>>> +        return false;
>>
>>> +    if (pos & (iov_iter_count(iter) - 1))
>>
>>> +        return false;
>>
>>> +    if (iov_iter_count(iter) > max_bytes)
>>
>>> +        return false;
>>
>>> +    return true;
>>
>>> +}
>>
>>
>>
>> Here do we need to also validate whether the IO doesn't straddle
>>
>> the atomic boundary limit (if it's non-zero)? We do check that IO
>>
>> doesn't straddle the atomic boundary limit but that happens very
>>
>> late in the IO code path either during blk-merge or in NVMe driver
>>
>> code.
> 
> It's relied that atomic_write_unit_max is <= atomic_write_boundary and both are a power-of-2. Please see the NVMe patch, which this is checked. Indeed, it would not make sense if atomic_write_unit_max > atomic_write_boundary (when non-zero).
> 
> So if the write is naturally aligned and its size is <= atomic_write_unit_max, then it cannot be straddling a boundary.

OK, fine, but if the device doesn't support a namespace atomic boundary size (i.e. NABSPF is zero), do we still need
to restrict IO which crosses the atomic boundary?

I am quoting this from the NVMe spec (Command Set Specification, revision 1.0a, Section 2.1.4.3):
"To ensure backwards compatibility, the values reported for AWUN, AWUPF, and ACWU shall be set such that
they are supported even if a write crosses an atomic boundary. If a controller does not
guarantee atomicity across atomic boundaries, the controller shall set AWUN, AWUPF, and ACWU to 0h (1 LBA)."

Thanks,
--Nilay
John Garry Feb. 13, 2024, 11:52 a.m. UTC | #4
On 13/02/2024 11:08, Nilay Shroff wrote:
>> It's relied that atomic_write_unit_max is <= atomic_write_boundary and both are a power-of-2. Please see the NVMe patch, which this is checked. Indeed, it would not make sense if atomic_write_unit_max > atomic_write_boundary (when non-zero).
>>
>> So if the write is naturally aligned and its size is <= atomic_write_unit_max, then it cannot be straddling a boundary.
> Ok fine but in case the device doesn't support namespace atomic boundary size (i.e. NABSPF is zero) then still do we need
> to restrict IO which crosses the atomic boundary?

Is there a boundary if NABSPF is zero?

> 
> I am quoting this from NVMe spec (Command Set Specification, revision 1.0a, Section 2.1.4.3) :
> "To ensure backwards compatibility, the values reported for AWUN, AWUPF, and ACWU shall be set such that
> they  are  supported  even  if  a  write  crosses  an  atomic  boundary.  If  a  controller  does  not
> guarantee atomicity across atomic boundaries, the controller shall set AWUN, AWUPF, and ACWU to 0h (1 LBA)."

How about responding to the NVMe patch in this series and asking this question there?

I have my idea on how the boundary is determined, but I think that the 
spec could be made clearer.

Thanks,
John
Nilay Shroff Feb. 14, 2024, 9:38 a.m. UTC | #5
On 2/13/24 17:22, John Garry wrote:
> On 13/02/2024 11:08, Nilay Shroff wrote:
>>> It's relied that atomic_write_unit_max is <= atomic_write_boundary and both are a power-of-2. Please see the NVMe patch, which this is checked. Indeed, it would not make sense if atomic_write_unit_max > atomic_write_boundary (when non-zero).
>>>
>>> So if the write is naturally aligned and its size is <= atomic_write_unit_max, then it cannot be straddling a boundary.
>> Ok fine but in case the device doesn't support namespace atomic boundary size (i.e. NABSPF is zero) then still do we need
>> to restrict IO which crosses the atomic boundary?
> 
> Is there a boundary if NABSPF is zero?
If NABSPF is zero then there's no boundary, so we may not need to worry about IO crossing a boundary.

Even though the atomic boundary is not defined, this function doesn't allow an atomic write that crosses atomic_write_unit_max_bytes.
For instance, if AWUPF is 63 and an IO starts an atomic write at logical block #32, and the number of logical blocks to be written
in this IO is 64, then it's not allowed. However, if this same IO starts at logical block #0 then it's allowed.
So my question is: can this restriction be avoided when the atomic boundary is zero (or not defined)?

Also, it seems that the restrictions implemented for an atomic write to succeed are very strict. For example, an atomic write can't
succeed if an IO starts at logical block #8 and the number of logical blocks to be written in this IO is 16.
In this particular case, the IO is well within the atomic boundary (if it's defined) and the atomic size limit, so why do we NOT want to
allow it? Is it intentional? I think the spec doesn't mention such a limitation.

> 
>>
>> I am quoting this from NVMe spec (Command Set Specification, revision 1.0a, Section 2.1.4.3) :
>> "To ensure backwards compatibility, the values reported for AWUN, AWUPF, and ACWU shall be set such that
>> they  are  supported  even  if  a  write  crosses  an  atomic  boundary.  If  a  controller  does  not
>> guarantee atomicity across atomic boundaries, the controller shall set AWUN, AWUPF, and ACWU to 0h (1 LBA)."
> 
> How about respond to the NVMe patch in this series, asking this question?
> 
Yes I will send this query to the NVMe patch in this series.

Thanks,
--Nilay
John Garry Feb. 14, 2024, 11:29 a.m. UTC | #6
On 14/02/2024 09:38, Nilay Shroff wrote:
> 
> 
> On 2/13/24 17:22, John Garry wrote:
>> On 13/02/2024 11:08, Nilay Shroff wrote:
>>>> It's relied that atomic_write_unit_max is <= atomic_write_boundary and both are a power-of-2. Please see the NVMe patch, which this is checked. Indeed, it would not make sense if atomic_write_unit_max > atomic_write_boundary (when non-zero).
>>>>
>>>> So if the write is naturally aligned and its size is <= atomic_write_unit_max, then it cannot be straddling a boundary.
>>> Ok fine but in case the device doesn't support namespace atomic boundary size (i.e. NABSPF is zero) then still do we need
>>> to restrict IO which crosses the atomic boundary?
>>
>> Is there a boundary if NABSPF is zero?
> If NABSPF is zero then there's no boundary and so we may not need to worry about IO crossing boundary.
> 
> Even though, the atomic boundary is not defined, this function doesn't allow atomic write crossing atomic_write_unit_max_bytes.
> For instance, if AWUPF is 63 and an IO starts atomic write from logical block #32 and the number of logical blocks to be written

When you say "IO", you need to be clearer. Do you mean a write from 
userspace or a merged atomic write?

If userspace issues an atomic write which is 64 blocks at offset 32, 
then it will be rejected.

It will be rejected as it is not naturally aligned, e.g. a 64-block
write can only be at offset 0, 64, 128, ...

> in this IO equals to #64 then it's not allowed.
>  However if this same IO starts from logical block #0 then it's allowed.
> So my point here's that can this restriction be avoided when atomic boundary is zero (or not defined)?

We want a consistent set of rules for userspace to follow, whether the 
atomic boundary is zero or non-zero.

Currently the atomic boundary only comes into play for merging writes, 
i.e. we cannot merge a write in which the resultant IO straddles a boundary.

> 
> Also, it seems that the restriction implemented for atomic write to succeed are very strict. For example, atomic-write can't
> succeed if an IO starts from logical block #8 and the number of logical blocks to be written in this IO equals to #16.
> In this particular case, IO is well within atomic-boundary (if it's defined) and atomic-size-limit, so why do we NOT want to
> allow it? Is it intentional? I think, the spec doesn't mention about such limitation.

According to the NVMe spec, this is ok. However we don't want the user 
to have to deal with things like NVMe boundaries. Indeed, for FSes, we 
do not have a direct linear map from FS blocks to physical blocks, so it 
would be impossible for the user to know about a boundary condition in 
this context.

We are trying to formulate rules which work for the somewhat orthogonal 
HW features of both SCSI and NVMe for both block devices and FSes, while 
also dealing with alignment concerns of extent-based FSes, like XFS.

> 
>>
>>>
>>> I am quoting this from NVMe spec (Command Set Specification, revision 1.0a, Section 2.1.4.3) :
>>> "To ensure backwards compatibility, the values reported for AWUN, AWUPF, and ACWU shall be set such that
>>> they  are  supported  even  if  a  write  crosses  an  atomic  boundary.  If  a  controller  does  not
>>> guarantee atomicity across atomic boundaries, the controller shall set AWUN, AWUPF, and ACWU to 0h (1 LBA)."
>>
>> How about respond to the NVMe patch in this series, asking this question?
>>
> Yes I will send this query to the NVMe patch in this series.

Thanks,
John
Nilay Shroff Feb. 14, 2024, 11:47 a.m. UTC | #7
On 2/14/24 16:59, John Garry wrote:
> On 14/02/2024 09:38, Nilay Shroff wrote:
>>
>>
>> On 2/13/24 17:22, John Garry wrote:
>>> On 13/02/2024 11:08, Nilay Shroff wrote:
>>>>> It's relied that atomic_write_unit_max is <= atomic_write_boundary and both are a power-of-2. Please see the NVMe patch, which this is checked. Indeed, it would not make sense if atomic_write_unit_max > atomic_write_boundary (when non-zero).
>>>>>
>>>>> So if the write is naturally aligned and its size is <= atomic_write_unit_max, then it cannot be straddling a boundary.
>>>> Ok fine but in case the device doesn't support namespace atomic boundary size (i.e. NABSPF is zero) then still do we need
>>>> to restrict IO which crosses the atomic boundary?
>>>
>>> Is there a boundary if NABSPF is zero?
>> If NABSPF is zero then there's no boundary and so we may not need to worry about IO crossing boundary.
>>
>> Even though, the atomic boundary is not defined, this function doesn't allow atomic write crossing atomic_write_unit_max_bytes.
>> For instance, if AWUPF is 63 and an IO starts atomic write from logical block #32 and the number of logical blocks to be written
> 
> When you say "IO", you need to be clearer. Do you mean a write from userspace or a merged atomic write?
Yes I meant write from the userspace. Sorry for the confusion here.
> 
> If userspace issues an atomic write which is 64 blocks at offset 32, then it will be rejected.
> 
> It will be rejected as it is not naturally aligned, e.g. a 64 block writes can only be at offset 0, 64, 128,
So it means that even though the h/w may support an atomic write crossing a natural alignment boundary, the kernel would still reject it.
> 
>> in this IO equals to #64 then it's not allowed.
>>  However if this same IO starts from logical block #0 then it's allowed.
>> So my point here's that can this restriction be avoided when atomic boundary is zero (or not defined)?
> 
> We want a consistent set of rules for userspace to follow, whether the atomic boundary is zero or non-zero.
> 
> Currently the atomic boundary only comes into play for merging writes, i.e. we cannot merge a write in which the resultant IO straddles a boundary.
> 
>>
>> Also, it seems that the restriction implemented for atomic write to succeed are very strict. For example, atomic-write can't
>> succeed if an IO starts from logical block #8 and the number of logical blocks to be written in this IO equals to #16.
>> In this particular case, IO is well within atomic-boundary (if it's defined) and atomic-size-limit, so why do we NOT want to
>> allow it? Is it intentional? I think, the spec doesn't mention about such limitation.
> 
> According to the NVMe spec, this is ok. However we don't want the user to have to deal with things like NVMe boundaries. Indeed, for FSes, we do not have a direct linear map from FS blocks to physical blocks, so it would be impossible for the user to know about a boundary condition in this context.
> 
> We are trying to formulate rules which work for the somewhat orthogonal HW features of both SCSI and NVMe for both block devices and FSes, while also dealing with alignment concerns of extent-based FSes, like XFS.
Hmm OK, thanks for that explanation. 

Thanks,
--Nilay
diff mbox series

Patch

diff --git a/block/fops.c b/block/fops.c
index 0cf8cf72cdfa..9c8234373da9 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -41,6 +41,26 @@  static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos,
 		!bdev_iter_is_aligned(bdev, iter);
 }
 
+static bool blkdev_atomic_write_valid(struct block_device *bdev, loff_t pos,
+				      struct iov_iter *iter)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+	unsigned int min_bytes = queue_atomic_write_unit_min_bytes(q);
+	unsigned int max_bytes = queue_atomic_write_unit_max_bytes(q);
+
+	if (!iter_is_ubuf(iter))
+		return false;
+	if (iov_iter_count(iter) & (min_bytes - 1))
+		return false;
+	if (!is_power_of_2(iov_iter_count(iter)))
+		return false;
+	if (pos & (iov_iter_count(iter) - 1))
+		return false;
+	if (iov_iter_count(iter) > max_bytes)
+		return false;
+	return true;
+}
+
 #define DIO_INLINE_BIO_VECS 4
 
 static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
@@ -48,6 +68,8 @@  static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
 {
 	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
 	struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
+	bool is_read = iov_iter_rw(iter) == READ;
+	bool atomic_write = (iocb->ki_flags & IOCB_ATOMIC) && !is_read;
 	loff_t pos = iocb->ki_pos;
 	bool should_dirty = false;
 	struct bio bio;
@@ -56,6 +78,9 @@  static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
 	if (blkdev_dio_unaligned(bdev, pos, iter))
 		return -EINVAL;
 
+	if (atomic_write && !blkdev_atomic_write_valid(bdev, pos, iter))
+		return -EINVAL;
+
 	if (nr_pages <= DIO_INLINE_BIO_VECS)
 		vecs = inline_vecs;
 	else {
@@ -65,7 +90,7 @@  static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
 			return -ENOMEM;
 	}
 
-	if (iov_iter_rw(iter) == READ) {
+	if (is_read) {
 		bio_init(&bio, bdev, vecs, nr_pages, REQ_OP_READ);
 		if (user_backed_iter(iter))
 			should_dirty = true;
@@ -74,6 +99,8 @@  static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
 	}
 	bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
 	bio.bi_ioprio = iocb->ki_ioprio;
+	if (atomic_write)
+		bio.bi_opf |= REQ_ATOMIC;
 
 	ret = bio_iov_iter_get_pages(&bio, iter);
 	if (unlikely(ret))
@@ -171,6 +198,9 @@  static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	loff_t pos = iocb->ki_pos;
 	int ret = 0;
 
+	if ((iocb->ki_flags & IOCB_ATOMIC) && !is_read)
+		return -EINVAL;
+
 	if (blkdev_dio_unaligned(bdev, pos, iter))
 		return -EINVAL;
 
@@ -305,6 +335,7 @@  static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
 	bool is_read = iov_iter_rw(iter) == READ;
 	blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
+	bool atomic_write = (iocb->ki_flags & IOCB_ATOMIC) && !is_read;
 	struct blkdev_dio *dio;
 	struct bio *bio;
 	loff_t pos = iocb->ki_pos;
@@ -313,6 +344,9 @@  static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 	if (blkdev_dio_unaligned(bdev, pos, iter))
 		return -EINVAL;
 
+	if (atomic_write && !blkdev_atomic_write_valid(bdev, pos, iter))
+		return -EINVAL;
+
 	if (iocb->ki_flags & IOCB_ALLOC_CACHE)
 		opf |= REQ_ALLOC_CACHE;
 	bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
@@ -350,6 +384,9 @@  static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 		task_io_account_write(bio->bi_iter.bi_size);
 	}
 
+	if (atomic_write)
+		bio->bi_opf |= REQ_ATOMIC;
+
 	if (iocb->ki_flags & IOCB_NOWAIT)
 		bio->bi_opf |= REQ_NOWAIT;
 
@@ -620,6 +657,11 @@  static int blkdev_open(struct inode *inode, struct file *filp)
 	if (bdev_nowait(handle->bdev))
 		filp->f_mode |= FMODE_NOWAIT;
 
+	if (queue_atomic_write_unit_min_bytes(bdev_get_queue(handle->bdev)) &&
+	    (filp->f_flags & O_DIRECT)) {
+		filp->f_mode |= FMODE_CAN_ATOMIC_WRITE;
+	}
+
 	filp->f_mapping = handle->bdev->bd_inode->i_mapping;
 	filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
 	filp->private_data = handle;