diff mbox series

[v4,03/11] fs: Initial atomic write support

Message ID 20240219130109.341523-4-john.g.garry@oracle.com
State Superseded
Headers show
Series block atomic writes | expand

Commit Message

John Garry Feb. 19, 2024, 1:01 p.m. UTC
From: Prasad Singamsetty <prasad.singamsetty@oracle.com>

An atomic write is a write issued with torn-write protection, meaning
that for a power failure or any other hardware failure, all or none of the
data from the write will be stored, but never a mix of old and new data.

Userspace may add flag RWF_ATOMIC to pwritev2() to indicate that the
write is to be issued with torn-write prevention, according to special
alignment and length rules.

For any syscall interface utilizing struct iocb, add IOCB_ATOMIC for
iocb->ki_flags field to indicate the same.

A call to statx will give the relevant atomic write info for a file:
- atomic_write_unit_min
- atomic_write_unit_max
- atomic_write_segments_max

Both min and max values must be a power-of-2.

Applications can avail of atomic write feature by ensuring that the total
length of a write is a power-of-2 in size and also sized between
atomic_write_unit_min and atomic_write_unit_max, inclusive. Applications
must ensure that the write is at a naturally-aligned offset in the file
wrt the total write length. The value in atomic_write_segments_max
indicates the upper limit for IOV_ITER iovcnt.

Add file mode flag FMODE_CAN_ATOMIC_WRITE, so files which do not have the
flag set will have RWF_ATOMIC rejected and not just ignored.

Add a type argument to kiocb_set_rw_flags() to allows reads which have
RWF_ATOMIC set to be rejected.

Helper function atomic_write_valid() can be used by FSes to verify
compliant writes.

Signed-off-by: Prasad Singamsetty <prasad.singamsetty@oracle.com>
#jpg: merge into single patch and much rewrite
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 fs/aio.c                |  8 ++++----
 fs/btrfs/ioctl.c        |  2 +-
 fs/read_write.c         |  2 +-
 include/linux/fs.h      | 36 +++++++++++++++++++++++++++++++++++-
 include/uapi/linux/fs.h |  5 ++++-
 io_uring/rw.c           |  4 ++--
 6 files changed, 47 insertions(+), 10 deletions(-)

Comments

Ritesh Harjani (IBM) Feb. 24, 2024, 6:16 p.m. UTC | #1
John Garry <john.g.garry@oracle.com> writes:

> From: Prasad Singamsetty <prasad.singamsetty@oracle.com>
>
> An atomic write is a write issued with torn-write protection, meaning
> that for a power failure or any other hardware failure, all or none of the
> data from the write will be stored, but never a mix of old and new data.
>
> Userspace may add flag RWF_ATOMIC to pwritev2() to indicate that the
> write is to be issued with torn-write prevention, according to special
> alignment and length rules.
>
> For any syscall interface utilizing struct iocb, add IOCB_ATOMIC for
> iocb->ki_flags field to indicate the same.
>
> A call to statx will give the relevant atomic write info for a file:
> - atomic_write_unit_min
> - atomic_write_unit_max
> - atomic_write_segments_max
>
> Both min and max values must be a power-of-2.
>
> Applications can avail of atomic write feature by ensuring that the total
> length of a write is a power-of-2 in size and also sized between
> atomic_write_unit_min and atomic_write_unit_max, inclusive. Applications
> must ensure that the write is at a naturally-aligned offset in the file
> wrt the total write length. The value in atomic_write_segments_max
> indicates the upper limit for IOV_ITER iovcnt.
>
> Add file mode flag FMODE_CAN_ATOMIC_WRITE, so files which do not have the
> flag set will have RWF_ATOMIC rejected and not just ignored.
>
> Add a type argument to kiocb_set_rw_flags() to allows reads which have
> RWF_ATOMIC set to be rejected.
>
> Helper function atomic_write_valid() can be used by FSes to verify
> compliant writes.
>
> Signed-off-by: Prasad Singamsetty <prasad.singamsetty@oracle.com>
> #jpg: merge into single patch and much rewrite

^^^ this might be a miss I guess.

> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  fs/aio.c                |  8 ++++----
>  fs/btrfs/ioctl.c        |  2 +-
>  fs/read_write.c         |  2 +-
>  include/linux/fs.h      | 36 +++++++++++++++++++++++++++++++++++-
>  include/uapi/linux/fs.h |  5 ++++-
>  io_uring/rw.c           |  4 ++--
>  6 files changed, 47 insertions(+), 10 deletions(-)
>
> diff --git a/fs/aio.c b/fs/aio.c
> index bb2ff48991f3..21bcbc076fd0 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -1502,7 +1502,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res)
>  	iocb_put(iocb);
>  }
>  
> -static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
> +static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb, int type)

maybe rw_type?

>  {
>  	int ret;
>  
> @@ -1528,7 +1528,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
>  	} else
>  		req->ki_ioprio = get_current_ioprio();
>  
> -	ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
> +	ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags, type);
>  	if (unlikely(ret))
>  		return ret;
>  
> @@ -1580,7 +1580,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb,
>  	struct file *file;
>  	int ret;
>  
> -	ret = aio_prep_rw(req, iocb);
> +	ret = aio_prep_rw(req, iocb, READ);
>  	if (ret)
>  		return ret;
>  	file = req->ki_filp;
> @@ -1607,7 +1607,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
>  	struct file *file;
>  	int ret;
>  
> -	ret = aio_prep_rw(req, iocb);
> +	ret = aio_prep_rw(req, iocb, WRITE);
>  	if (ret)
>  		return ret;
>  	file = req->ki_filp;
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index ac3316e0d11c..455f06d94b11 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -4555,7 +4555,7 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
>  		goto out_iov;
>  
>  	init_sync_kiocb(&kiocb, file);
> -	ret = kiocb_set_rw_flags(&kiocb, 0);
> +	ret = kiocb_set_rw_flags(&kiocb, 0, WRITE);
>  	if (ret)
>  		goto out_iov;
>  	kiocb.ki_pos = pos;
> diff --git a/fs/read_write.c b/fs/read_write.c
> index d4c036e82b6c..a7dc1819192d 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -730,7 +730,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
>  	ssize_t ret;
>  
>  	init_sync_kiocb(&kiocb, filp);
> -	ret = kiocb_set_rw_flags(&kiocb, flags);
> +	ret = kiocb_set_rw_flags(&kiocb, flags, type);
>  	if (ret)
>  		return ret;
>  	kiocb.ki_pos = (ppos ? *ppos : 0);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 023f37c60709..7271640fd600 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -43,6 +43,7 @@
>  #include <linux/cred.h>
>  #include <linux/mnt_idmapping.h>
>  #include <linux/slab.h>
> +#include <linux/uio.h>
>  
>  #include <asm/byteorder.h>
>  #include <uapi/linux/fs.h>
> @@ -119,6 +120,10 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
>  #define FMODE_PWRITE		((__force fmode_t)0x10)
>  /* File is opened for execution with sys_execve / sys_uselib */
>  #define FMODE_EXEC		((__force fmode_t)0x20)
> +
> +/* File supports atomic writes */
> +#define FMODE_CAN_ATOMIC_WRITE	((__force fmode_t)0x40)
> +
>  /* 32bit hashes as llseek() offset (for directories) */
>  #define FMODE_32BITHASH         ((__force fmode_t)0x200)
>  /* 64bit hashes as llseek() offset (for directories) */
> @@ -328,6 +333,7 @@ enum rw_hint {
>  #define IOCB_SYNC		(__force int) RWF_SYNC
>  #define IOCB_NOWAIT		(__force int) RWF_NOWAIT
>  #define IOCB_APPEND		(__force int) RWF_APPEND
> +#define IOCB_ATOMIC		(__force int) RWF_ATOMIC
>  

You might also want to add this definition in here too

#define TRACE_IOCB_STRINGS \
<...>
<...>
{ IOCB_ATOMIC, "ATOMIC" }


>  /* non-RWF related bits - start at 16 */
>  #define IOCB_EVENTFD		(1 << 16)
> @@ -3321,7 +3327,7 @@ static inline int iocb_flags(struct file *file)
>  	return res;
>  }
>  
> -static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
> +static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, int type)

maybe rw_type? 

>  {
>  	int kiocb_flags = 0;
>  
> @@ -3338,6 +3344,12 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
>  			return -EOPNOTSUPP;
>  		kiocb_flags |= IOCB_NOIO;
>  	}
> +	if (flags & RWF_ATOMIC) {
> +		if (type == READ)
> +			return -EOPNOTSUPP;
> +		if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE))
> +			return -EOPNOTSUPP;
> +	}
>  	kiocb_flags |= (__force int) (flags & RWF_SUPPORTED);
>  	if (flags & RWF_SYNC)
>  		kiocb_flags |= IOCB_DSYNC;
> @@ -3523,4 +3535,26 @@ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
>  extern int generic_fadvise(struct file *file, loff_t offset, loff_t len,
>  			   int advice);
>  
> +static inline bool atomic_write_valid(loff_t pos, struct iov_iter *iter,
> +			   unsigned int unit_min, unsigned int unit_max)
> +{
> +	size_t len = iov_iter_count(iter);
> +
> +	if (!iter_is_ubuf(iter))
> +		return false;

There is no mention about this limitation in the commit message of this
patch. Maybe it will be good to capture why this limitation to only
support ubuf and/or any plans to lift this restriction in future
in the commit message?


> +
> +	if (len == unit_min || len == unit_max) {
> +		/* ok if exactly min or max */
> +	} else if (len < unit_min || len > unit_max) {
> +		return false;
> +	} else if (!is_power_of_2(len)) {
> +		return false;
> +	}

Checking for len == unit_min || len == unit_max is redundant when
unit_min and unit_max are already power of 2.


> +
> +	if (pos & (len - 1))
> +		return false;
> +
> +	return true;
> +}
> +
>  #endif /* _LINUX_FS_H */
> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> index 48ad69f7722e..a0975ae81e64 100644
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -301,9 +301,12 @@ typedef int __bitwise __kernel_rwf_t;
>  /* per-IO O_APPEND */
>  #define RWF_APPEND	((__force __kernel_rwf_t)0x00000010)
>  
> +/* Atomic Write */
> +#define RWF_ATOMIC	((__force __kernel_rwf_t)0x00000040)
> +
>  /* mask of flags supported by the kernel */
>  #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
> -			 RWF_APPEND)
> +			 RWF_APPEND | RWF_ATOMIC)
>  
>  /* Pagemap ioctl */
>  #define PAGEMAP_SCAN	_IOWR('f', 16, struct pm_scan_arg)
> diff --git a/io_uring/rw.c b/io_uring/rw.c
> index d5e79d9bdc71..f8c022301cf4 100644
> --- a/io_uring/rw.c
> +++ b/io_uring/rw.c
> @@ -719,7 +719,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
>  	struct kiocb *kiocb = &rw->kiocb;
>  	struct io_ring_ctx *ctx = req->ctx;
>  	struct file *file = req->file;
> -	int ret;
> +	int ret, type = (mode == FMODE_WRITE) ? WRITE : READ;
>  
>  	if (unlikely(!file || !(file->f_mode & mode)))
>  		return -EBADF;
> @@ -728,7 +728,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
>  		req->flags |= io_file_get_flags(file);
>  
>  	kiocb->ki_flags = file->f_iocb_flags;
> -	ret = kiocb_set_rw_flags(kiocb, rw->flags);
> +	ret = kiocb_set_rw_flags(kiocb, rw->flags, type);
>  	if (unlikely(ret))
>  		return ret;
>  	kiocb->ki_flags |= IOCB_ALLOC_CACHE;
> -- 
> 2.31.1
Ritesh Harjani (IBM) Feb. 24, 2024, 6:20 p.m. UTC | #2
Ritesh Harjani (IBM) <ritesh.list@gmail.com> writes:

> John Garry <john.g.garry@oracle.com> writes:
>
>> From: Prasad Singamsetty <prasad.singamsetty@oracle.com>
>>
>> An atomic write is a write issued with torn-write protection, meaning
>> that for a power failure or any other hardware failure, all or none of the
>> data from the write will be stored, but never a mix of old and new data.
>>
>> Userspace may add flag RWF_ATOMIC to pwritev2() to indicate that the
>> write is to be issued with torn-write prevention, according to special
>> alignment and length rules.
>>
>> For any syscall interface utilizing struct iocb, add IOCB_ATOMIC for
>> iocb->ki_flags field to indicate the same.
>>
>> A call to statx will give the relevant atomic write info for a file:
>> - atomic_write_unit_min
>> - atomic_write_unit_max
>> - atomic_write_segments_max
>>
>> Both min and max values must be a power-of-2.
>>
>> Applications can avail of atomic write feature by ensuring that the total
>> length of a write is a power-of-2 in size and also sized between
>> atomic_write_unit_min and atomic_write_unit_max, inclusive. Applications
>> must ensure that the write is at a naturally-aligned offset in the file
>> wrt the total write length. The value in atomic_write_segments_max
>> indicates the upper limit for IOV_ITER iovcnt.
>>
>> Add file mode flag FMODE_CAN_ATOMIC_WRITE, so files which do not have the
>> flag set will have RWF_ATOMIC rejected and not just ignored.
>>
>> Add a type argument to kiocb_set_rw_flags() to allows reads which have
>> RWF_ATOMIC set to be rejected.
>>
>> Helper function atomic_write_valid() can be used by FSes to verify
>> compliant writes.

Minor nit. 
maybe generic_atomic_write_valid()?
John Garry Feb. 26, 2024, 8:51 a.m. UTC | #3
...

>>
>> Helper function atomic_write_valid() can be used by FSes to verify
>> compliant writes.
>>
>> Signed-off-by: Prasad Singamsetty <prasad.singamsetty@oracle.com>
>> #jpg: merge into single patch and much rewrite
> 
> ^^^ this might be a miss I guess.

I'm not sure what you mean. Here I am just briefly commenting on much 
changes which I made.

> 
>> Signed-off-by: John Garry <john.g.garry@oracle.com>
>> ---
>>   fs/aio.c                |  8 ++++----
>>   fs/btrfs/ioctl.c        |  2 +-
>>   fs/read_write.c         |  2 +-
>>   include/linux/fs.h      | 36 +++++++++++++++++++++++++++++++++++-
>>   include/uapi/linux/fs.h |  5 ++++-
>>   io_uring/rw.c           |  4 ++--
>>   6 files changed, 47 insertions(+), 10 deletions(-)
>>
>> diff --git a/fs/aio.c b/fs/aio.c
>> index bb2ff48991f3..21bcbc076fd0 100644
>> --- a/fs/aio.c
>> +++ b/fs/aio.c
>> @@ -1502,7 +1502,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res)
>>   	iocb_put(iocb);
>>   }
>>   
>> -static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
>> +static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb, int type)
> 
> maybe rw_type?

ok

> 
>>   {
>>   	int ret;
>>   
>> @@ -1528,7 +1528,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
>>   	} else

...

>> +
>>   /* 32bit hashes as llseek() offset (for directories) */
>>   #define FMODE_32BITHASH         ((__force fmode_t)0x200)
>>   /* 64bit hashes as llseek() offset (for directories) */
>> @@ -328,6 +333,7 @@ enum rw_hint {
>>   #define IOCB_SYNC		(__force int) RWF_SYNC
>>   #define IOCB_NOWAIT		(__force int) RWF_NOWAIT
>>   #define IOCB_APPEND		(__force int) RWF_APPEND
>> +#define IOCB_ATOMIC		(__force int) RWF_ATOMIC
>>   
> 
> You might also want to add this definition in here too
> 
> #define TRACE_IOCB_STRINGS \
> <...>
> <...>
> { IOCB_ATOMIC, "ATOMIC" }

ok

I suppose that new flag RWF_NOAPPEND in linux-next also should have this

>>   
>> +static inline bool atomic_write_valid(loff_t pos, struct iov_iter *iter,
>> +			   unsigned int unit_min, unsigned int unit_max)
>> +{
>> +	size_t len = iov_iter_count(iter);
>> +
>> +	if (!iter_is_ubuf(iter))
>> +		return false;
> 
> There is no mention about this limitation in the commit message of this
> patch. Maybe it will be good to capture why this limitation to only
> support ubuf and/or any plans to lift this restriction in future
> in the commit message?

ok, I can mention this in the commit message.

> 
> 
>> +
>> +	if (len == unit_min || len == unit_max) {
>> +		/* ok if exactly min or max */
>> +	} else if (len < unit_min || len > unit_max) {
>> +		return false;
>> +	} else if (!is_power_of_2(len)) {
>> +		return false;
>> +	}
> 
> Checking for len == unit_min || len == unit_max is redundant when
> unit_min and unit_max are already power of 2.

Sure, but it was an optimization, considering that typically we will be 
issuing unit_max in anticipated FS scenario.

Anyway, I will be changing this according to an earlier comment.

Thanks,
John
John Garry Feb. 26, 2024, 8:58 a.m. UTC | #4
On 24/02/2024 18:20, Ritesh Harjani (IBM) wrote:
>>> Helper function atomic_write_valid() can be used by FSes to verify
>>> compliant writes.
> Minor nit.
> maybe generic_atomic_write_valid()?

Having "generic" in the name implies that there are other ways in which 
we can check if an atomic write is valid, but really this function 
should be good to use in scenarios so far considered.

Thanks,
John
diff mbox series

Patch

diff --git a/fs/aio.c b/fs/aio.c
index bb2ff48991f3..21bcbc076fd0 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1502,7 +1502,7 @@  static void aio_complete_rw(struct kiocb *kiocb, long res)
 	iocb_put(iocb);
 }
 
-static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
+static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb, int type)
 {
 	int ret;
 
@@ -1528,7 +1528,7 @@  static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
 	} else
 		req->ki_ioprio = get_current_ioprio();
 
-	ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
+	ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags, type);
 	if (unlikely(ret))
 		return ret;
 
@@ -1580,7 +1580,7 @@  static int aio_read(struct kiocb *req, const struct iocb *iocb,
 	struct file *file;
 	int ret;
 
-	ret = aio_prep_rw(req, iocb);
+	ret = aio_prep_rw(req, iocb, READ);
 	if (ret)
 		return ret;
 	file = req->ki_filp;
@@ -1607,7 +1607,7 @@  static int aio_write(struct kiocb *req, const struct iocb *iocb,
 	struct file *file;
 	int ret;
 
-	ret = aio_prep_rw(req, iocb);
+	ret = aio_prep_rw(req, iocb, WRITE);
 	if (ret)
 		return ret;
 	file = req->ki_filp;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ac3316e0d11c..455f06d94b11 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4555,7 +4555,7 @@  static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
 		goto out_iov;
 
 	init_sync_kiocb(&kiocb, file);
-	ret = kiocb_set_rw_flags(&kiocb, 0);
+	ret = kiocb_set_rw_flags(&kiocb, 0, WRITE);
 	if (ret)
 		goto out_iov;
 	kiocb.ki_pos = pos;
diff --git a/fs/read_write.c b/fs/read_write.c
index d4c036e82b6c..a7dc1819192d 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -730,7 +730,7 @@  static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
 	ssize_t ret;
 
 	init_sync_kiocb(&kiocb, filp);
-	ret = kiocb_set_rw_flags(&kiocb, flags);
+	ret = kiocb_set_rw_flags(&kiocb, flags, type);
 	if (ret)
 		return ret;
 	kiocb.ki_pos = (ppos ? *ppos : 0);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 023f37c60709..7271640fd600 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -43,6 +43,7 @@ 
 #include <linux/cred.h>
 #include <linux/mnt_idmapping.h>
 #include <linux/slab.h>
+#include <linux/uio.h>
 
 #include <asm/byteorder.h>
 #include <uapi/linux/fs.h>
@@ -119,6 +120,10 @@  typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 #define FMODE_PWRITE		((__force fmode_t)0x10)
 /* File is opened for execution with sys_execve / sys_uselib */
 #define FMODE_EXEC		((__force fmode_t)0x20)
+
+/* File supports atomic writes */
+#define FMODE_CAN_ATOMIC_WRITE	((__force fmode_t)0x40)
+
 /* 32bit hashes as llseek() offset (for directories) */
 #define FMODE_32BITHASH         ((__force fmode_t)0x200)
 /* 64bit hashes as llseek() offset (for directories) */
@@ -328,6 +333,7 @@  enum rw_hint {
 #define IOCB_SYNC		(__force int) RWF_SYNC
 #define IOCB_NOWAIT		(__force int) RWF_NOWAIT
 #define IOCB_APPEND		(__force int) RWF_APPEND
+#define IOCB_ATOMIC		(__force int) RWF_ATOMIC
 
 /* non-RWF related bits - start at 16 */
 #define IOCB_EVENTFD		(1 << 16)
@@ -3321,7 +3327,7 @@  static inline int iocb_flags(struct file *file)
 	return res;
 }
 
-static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
+static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, int type)
 {
 	int kiocb_flags = 0;
 
@@ -3338,6 +3344,12 @@  static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
 			return -EOPNOTSUPP;
 		kiocb_flags |= IOCB_NOIO;
 	}
+	if (flags & RWF_ATOMIC) {
+		if (type == READ)
+			return -EOPNOTSUPP;
+		if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE))
+			return -EOPNOTSUPP;
+	}
 	kiocb_flags |= (__force int) (flags & RWF_SUPPORTED);
 	if (flags & RWF_SYNC)
 		kiocb_flags |= IOCB_DSYNC;
@@ -3523,4 +3535,26 @@  extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
 extern int generic_fadvise(struct file *file, loff_t offset, loff_t len,
 			   int advice);
 
+static inline bool atomic_write_valid(loff_t pos, struct iov_iter *iter,
+			   unsigned int unit_min, unsigned int unit_max)
+{
+	size_t len = iov_iter_count(iter);
+
+	if (!iter_is_ubuf(iter))
+		return false;
+
+	if (len == unit_min || len == unit_max) {
+		/* ok if exactly min or max */
+	} else if (len < unit_min || len > unit_max) {
+		return false;
+	} else if (!is_power_of_2(len)) {
+		return false;
+	}
+
+	if (pos & (len - 1))
+		return false;
+
+	return true;
+}
+
 #endif /* _LINUX_FS_H */
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 48ad69f7722e..a0975ae81e64 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -301,9 +301,12 @@  typedef int __bitwise __kernel_rwf_t;
 /* per-IO O_APPEND */
 #define RWF_APPEND	((__force __kernel_rwf_t)0x00000010)
 
+/* Atomic Write */
+#define RWF_ATOMIC	((__force __kernel_rwf_t)0x00000040)
+
 /* mask of flags supported by the kernel */
 #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
-			 RWF_APPEND)
+			 RWF_APPEND | RWF_ATOMIC)
 
 /* Pagemap ioctl */
 #define PAGEMAP_SCAN	_IOWR('f', 16, struct pm_scan_arg)
diff --git a/io_uring/rw.c b/io_uring/rw.c
index d5e79d9bdc71..f8c022301cf4 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -719,7 +719,7 @@  static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
 	struct kiocb *kiocb = &rw->kiocb;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct file *file = req->file;
-	int ret;
+	int ret, type = (mode == FMODE_WRITE) ? WRITE : READ;
 
 	if (unlikely(!file || !(file->f_mode & mode)))
 		return -EBADF;
@@ -728,7 +728,7 @@  static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
 		req->flags |= io_file_get_flags(file);
 
 	kiocb->ki_flags = file->f_iocb_flags;
-	ret = kiocb_set_rw_flags(kiocb, rw->flags);
+	ret = kiocb_set_rw_flags(kiocb, rw->flags, type);
 	if (unlikely(ret))
 		return ret;
 	kiocb->ki_flags |= IOCB_ALLOC_CACHE;