diff mbox series

[v8,06/19] block, fs: Propagate write hints to the block device inode

Message ID 20231219000815.2739120-7-bvanassche@acm.org
State Superseded
Headers show
Series Pass data lifetime information to SCSI disk devices | expand

Commit Message

Bart Van Assche Dec. 19, 2023, 12:07 a.m. UTC
Write hints applied with F_SET_RW_HINT on a block device affect the
shmem inode only. Propagate these hints to the block device inode
because that is the inode used when writing back dirty pages.

Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 block/fops.c       | 11 +++++++++++
 fs/fcntl.c         |  4 ++++
 include/linux/fs.h |  1 +
 3 files changed, 16 insertions(+)

Comments

Christoph Hellwig Dec. 28, 2023, 7:12 a.m. UTC | #1
On Mon, Dec 18, 2023 at 04:07:39PM -0800, Bart Van Assche wrote:
> Write hints applied with F_SET_RW_HINT on a block device affect the
> shmem inode only. Propagate these hints to the block device inode
> because that is the inode used when writing back dirty pages.

What shmem inode?

> @@ -317,6 +318,9 @@ static long fcntl_set_rw_hint(struct file *file, unsigned int cmd,
>  
>  	inode_lock(inode);
>  	inode->i_write_hint = hint;
> +	apply_whint = inode->i_fop->apply_whint;
> +	if (apply_whint)
> +		apply_whint(file, hint);

Setting the hint in file->f_mapping->inode is the right thing here,
not adding a method.
Bart Van Assche Dec. 28, 2023, 10:41 p.m. UTC | #2
On 12/27/23 23:12, Christoph Hellwig wrote:
> On Mon, Dec 18, 2023 at 04:07:39PM -0800, Bart Van Assche wrote:
>> Write hints applied with F_SET_RW_HINT on a block device affect the
>> shmem inode only. Propagate these hints to the block device inode
>> because that is the inode used when writing back dirty pages.
> 
> What shmem inode?

The inode associated with the /dev file, e.g. /dev/sda. That is another
inode than the inode associated with the struct block_device instance.
Without this patch, when opening /dev/sda and calling fcntl(), the shmem
inode is modified but the struct block_device inode not. I think that
the code path for allocation of the shmem inode is as follows:

shmem_mknod()
   shmem_get_inode()
     __shmem_get_inode()
         new_inode(sb)
           alloc_inode(sb)
             ops->alloc_inode(sb) = shmem_alloc_inode(sb)
             inode_init_always(sb, inode)
               inode->i_mapping = &inode->i_data;

>> @@ -317,6 +318,9 @@ static long fcntl_set_rw_hint(struct file *file, unsigned int cmd,
>>   
>>   	inode_lock(inode);
>>   	inode->i_write_hint = hint;
>> +	apply_whint = inode->i_fop->apply_whint;
>> +	if (apply_whint)
>> +		apply_whint(file, hint);
> 
> Setting the hint in file->f_mapping->inode is the right thing here,
> not adding a method.

Is my understanding correct that the only way to reach the struct
block_device instance from the shmem code is by dereferencing
file->private_data? Shouldn't all dereferences of that pointer happen
in source file block/fops.c since the file->private_data pointer is
assigned in that file?

Please note that suggestions to improve this patch are definitely
welcome. As you probably know I'm not that familiar with the filesystem
code.

Thanks,

Bart.
Christoph Hellwig Jan. 3, 2024, 9:02 a.m. UTC | #3
On Thu, Dec 28, 2023 at 02:41:59PM -0800, Bart Van Assche wrote:
> On 12/27/23 23:12, Christoph Hellwig wrote:
>> On Mon, Dec 18, 2023 at 04:07:39PM -0800, Bart Van Assche wrote:
>>> Write hints applied with F_SET_RW_HINT on a block device affect the
>>> shmem inode only. Propagate these hints to the block device inode
>>> because that is the inode used when writing back dirty pages.
>>
>> What shmem inode?
>
> The inode associated with the /dev file, e.g. /dev/sda. That is another
> inode than the inode associated with the struct block_device instance.
> Without this patch, when opening /dev/sda and calling fcntl(), the shmem
> inode is modified but the struct block_device inode not. I think that
> the code path for allocation of the shmem inode is as follows:

So the block device node.  That can sit on any file system (or at least
any Unix-y file system that supports device nodes).

>>> @@ -317,6 +318,9 @@ static long fcntl_set_rw_hint(struct file *file, unsigned int cmd,
>>>     	inode_lock(inode);
>>>   	inode->i_write_hint = hint;
>>> +	apply_whint = inode->i_fop->apply_whint;
>>> +	if (apply_whint)
>>> +		apply_whint(file, hint);
>>
>> Setting the hint in file->f_mapping->inode is the right thing here,
>> not adding a method.
>
> Is my understanding correct that the only way to reach the struct
> block_device instance from the shmem code is by dereferencing
> file->private_data?

No.  See blkdev_open:

	filp->f_mapping = handle->bdev->bd_inode->i_mapping;

So you can use file->f_mapping->inode as I said in my previous mail.
Bart Van Assche Jan. 3, 2024, 11:09 p.m. UTC | #4
On 1/3/24 01:02, Christoph Hellwig wrote:
> So you can use file->f_mapping->inode as I said in my previous mail.

Since struct address_space does not have a member with the name "inode",
I assume that you meant "host" instead of "inode"? If so, how about
modifying patch 06 of this series as shown below? With the patch below
my tests still pass.

Thanks,

Bart.


---
  block/fops.c       | 11 -----------
  fs/fcntl.c         | 15 +++++++++++----
  include/linux/fs.h |  1 -
  3 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/block/fops.c b/block/fops.c
index 138b388b5cb1..787ce52bc2c6 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -620,16 +620,6 @@ static int blkdev_release(struct inode *inode, 
struct file *filp)
  	return 0;
  }

-static void blkdev_apply_whint(struct file *file, enum rw_hint hint)
-{
-	struct bdev_handle *handle = file->private_data;
-	struct inode *bd_inode = handle->bdev->bd_inode;
-
-	inode_lock(bd_inode);
-	bd_inode->i_write_hint = hint;
-	inode_unlock(bd_inode);
-}
-
  static ssize_t
  blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from)
  {
@@ -864,7 +854,6 @@ const struct file_operations def_blk_fops = {
  	.splice_read	= filemap_splice_read,
  	.splice_write	= iter_file_splice_write,
  	.fallocate	= blkdev_fallocate,
-	.apply_whint	= blkdev_apply_whint,
  };

  static __init int blkdev_init(void)
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 18407bf5bb9b..cfb52c3a4577 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -306,7 +306,6 @@ static long fcntl_get_rw_hint(struct file *file, 
unsigned int cmd,
  static long fcntl_set_rw_hint(struct file *file, unsigned int cmd,
  			      unsigned long arg)
  {
-	void (*apply_whint)(struct file *, enum rw_hint);
  	struct inode *inode = file_inode(file);
  	u64 __user *argp = (u64 __user *)arg;
  	u64 hint;
@@ -318,11 +317,19 @@ static long fcntl_set_rw_hint(struct file *file, 
unsigned int cmd,

  	inode_lock(inode);
  	inode->i_write_hint = hint;
-	apply_whint = inode->i_fop->apply_whint;
-	if (apply_whint)
-		apply_whint(file, hint);
  	inode_unlock(inode);

+	/*
+	 * file->f_mapping->host may differ from inode. As an example,
+	 * blkdev_open() modifies file->f_mapping.
+	 */
+	if (file->f_mapping->host != inode) {
+		inode = file->f_mapping->host;
+		inode_lock(inode);
+		inode->i_write_hint = hint;
+		inode_unlock(inode);
+	}
+
  	return 0;
  }

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 293017ea2466..a08014b68d6e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1944,7 +1944,6 @@ struct file_operations {
  	int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
  	int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *,
  				unsigned int poll_flags);
-	void (*apply_whint)(struct file *, enum rw_hint hint);
  } __randomize_layout;

  /* Wrap a directory iterator that needs exclusive inode access */
Bart Van Assche Jan. 18, 2024, 6:54 p.m. UTC | #5
On 1/18/24 10:51, Kanchan Joshi wrote:
> Are you considering to change this so that hint is set only on one inode
> (and not on two)?
> IOW, should not this fragment be like below:
> 
> --- a/fs/fcntl.c
> +++ b/fs/fcntl.c
> @@ -306,7 +306,6 @@ static long fcntl_get_rw_hint(struct file *file,
> unsigned int cmd,
>    static long fcntl_set_rw_hint(struct file *file, unsigned int cmd,
>                                 unsigned long arg)
>    {
> -       void (*apply_whint)(struct file *, enum rw_hint);
>           struct inode *inode = file_inode(file);
>           u64 __user *argp = (u64 __user *)arg;
>           u64 hint;
> @@ -316,11 +315,15 @@ static long fcntl_set_rw_hint(struct file *file,
> unsigned int cmd,
>           if (!rw_hint_valid(hint))
>                   return -EINVAL;
> 
> +       /*
> +        * file->f_mapping->host may differ from inode. As an example
> +        * blkdev_open() modifies file->f_mapping
> +        */
> +       if (file->f_mapping->host != inode)
> +               inode = file->f_mapping->host;
> +
>           inode_lock(inode);
>           inode->i_write_hint = hint;
> -       apply_whint = inode->i_fop->apply_whint;
> -       if (apply_whint)
> -               apply_whint(file, hint);
>           inode_unlock(inode);

I think the above proposal would introduce a bug: it would break the
F_GET_RW_HINT implementation.

Thanks,

Bart.
Kanchan Joshi Jan. 19, 2024, 1:56 p.m. UTC | #6
On 1/19/2024 12:24 AM, Bart Van Assche wrote:
> On 1/18/24 10:51, Kanchan Joshi wrote:
>> Are you considering to change this so that hint is set only on one inode
>> (and not on two)?
>> IOW, should not this fragment be like below:
>>
>> --- a/fs/fcntl.c
>> +++ b/fs/fcntl.c
>> @@ -306,7 +306,6 @@ static long fcntl_get_rw_hint(struct file *file,
>> unsigned int cmd,
>>    static long fcntl_set_rw_hint(struct file *file, unsigned int cmd,
>>                                 unsigned long arg)
>>    {
>> -       void (*apply_whint)(struct file *, enum rw_hint);
>>           struct inode *inode = file_inode(file);
>>           u64 __user *argp = (u64 __user *)arg;
>>           u64 hint;
>> @@ -316,11 +315,15 @@ static long fcntl_set_rw_hint(struct file *file,
>> unsigned int cmd,
>>           if (!rw_hint_valid(hint))
>>                   return -EINVAL;
>>
>> +       /*
>> +        * file->f_mapping->host may differ from inode. As an example
>> +        * blkdev_open() modifies file->f_mapping
>> +        */
>> +       if (file->f_mapping->host != inode)
>> +               inode = file->f_mapping->host;
>> +
>>           inode_lock(inode);
>>           inode->i_write_hint = hint;
>> -       apply_whint = inode->i_fop->apply_whint;
>> -       if (apply_whint)
>> -               apply_whint(file, hint);
>>           inode_unlock(inode);
> 
> I think the above proposal would introduce a bug: it would break the
> F_GET_RW_HINT implementation.

Right. I expected to keep the exact change in GET, too, but that will 
not be free from the side-effect.
The buffered-write path (block_write_full_page) picks the hint from one 
inode, and the direct-write path (__blkdev_direct_IO_simple) picks the 
hint from a different inode.
So, updating both seems needed here.
Kanchan Joshi Jan. 22, 2024, 9:31 a.m. UTC | #7
On 1/19/2024 7:26 PM, Kanchan Joshi wrote:
> On 1/19/2024 12:24 AM, Bart Van Assche wrote:
>> On 1/18/24 10:51, Kanchan Joshi wrote:
>>> Are you considering to change this so that hint is set only on one inode
>>> (and not on two)?
>>> IOW, should not this fragment be like below:
>>>
>>> --- a/fs/fcntl.c
>>> +++ b/fs/fcntl.c
>>> @@ -306,7 +306,6 @@ static long fcntl_get_rw_hint(struct file *file,
>>> unsigned int cmd,
>>>     static long fcntl_set_rw_hint(struct file *file, unsigned int cmd,
>>>                                  unsigned long arg)
>>>     {
>>> -       void (*apply_whint)(struct file *, enum rw_hint);
>>>            struct inode *inode = file_inode(file);
>>>            u64 __user *argp = (u64 __user *)arg;
>>>            u64 hint;
>>> @@ -316,11 +315,15 @@ static long fcntl_set_rw_hint(struct file *file,
>>> unsigned int cmd,
>>>            if (!rw_hint_valid(hint))
>>>                    return -EINVAL;
>>>
>>> +       /*
>>> +        * file->f_mapping->host may differ from inode. As an example
>>> +        * blkdev_open() modifies file->f_mapping
>>> +        */
>>> +       if (file->f_mapping->host != inode)
>>> +               inode = file->f_mapping->host;
>>> +
>>>            inode_lock(inode);
>>>            inode->i_write_hint = hint;
>>> -       apply_whint = inode->i_fop->apply_whint;
>>> -       if (apply_whint)
>>> -               apply_whint(file, hint);
>>>            inode_unlock(inode);
>>
>> I think the above proposal would introduce a bug: it would break the
>> F_GET_RW_HINT implementation.
> 
> Right. I expected to keep the exact change in GET, too, but that will
> not be free from the side-effect.
> The buffered-write path (block_write_full_page) picks the hint from one
> inode, and the direct-write path (__blkdev_direct_IO_simple) picks the
> hint from a different inode.
> So, updating both seems needed here.

I stand corrected. It's possible to do away with two updates.
The direct-io code (patch 8) should rather be changed to pick the hint 
from bdev inode (and not from file inode).
With that change, this patch only need to set the hint into only one 
inode (bdev one). What do you think?
Bart Van Assche Jan. 22, 2024, 8:09 p.m. UTC | #8
On 1/22/24 01:31, Kanchan Joshi wrote:
> On 1/19/2024 7:26 PM, Kanchan Joshi wrote:
>> On 1/19/2024 12:24 AM, Bart Van Assche wrote:
>>> I think the above proposal would introduce a bug: it would break the
>>> F_GET_RW_HINT implementation.
>>
>> Right. I expected to keep the exact change in GET, too, but that will
>> not be free from the side-effect.
>> The buffered-write path (block_write_full_page) picks the hint from one
>> inode, and the direct-write path (__blkdev_direct_IO_simple) picks the
>> hint from a different inode.
>> So, updating both seems needed here.
> 
> I stand corrected. It's possible to do away with two updates.
> The direct-io code (patch 8) should rather be changed to pick the hint
> from bdev inode (and not from file inode).
> With that change, this patch only need to set the hint into only one
> inode (bdev one). What do you think?

I think that would break direct I/O submitted by a filesystem.

Bart.
Kanchan Joshi Jan. 23, 2024, 12:16 p.m. UTC | #9
On 1/23/2024 1:39 AM, Bart Van Assche wrote:
> On 1/22/24 01:31, Kanchan Joshi wrote:
>> On 1/19/2024 7:26 PM, Kanchan Joshi wrote:
>>> On 1/19/2024 12:24 AM, Bart Van Assche wrote:
>>>> I think the above proposal would introduce a bug: it would break the
>>>> F_GET_RW_HINT implementation.
>>>
>>> Right. I expected to keep the exact change in GET, too, but that will
>>> not be free from the side-effect.
>>> The buffered-write path (block_write_full_page) picks the hint from one
>>> inode, and the direct-write path (__blkdev_direct_IO_simple) picks the
>>> hint from a different inode.
>>> So, updating both seems needed here.
>>
>> I stand corrected. It's possible to do away with two updates.
>> The direct-io code (patch 8) should rather be changed to pick the hint
>> from bdev inode (and not from file inode).
>> With that change, this patch only need to set the hint into only one
>> inode (bdev one). What do you think?
> 
> I think that would break direct I/O submitted by a filesystem.
> 

By breakage do you mean not being able to set/get the hint correctly?
I tested with XFS and Ext4 direct I/O. No breakage.
Bart Van Assche Jan. 23, 2024, 3:29 p.m. UTC | #10
On 1/23/24 04:16, Kanchan Joshi wrote:
> On 1/23/2024 1:39 AM, Bart Van Assche wrote:
>> On 1/22/24 01:31, Kanchan Joshi wrote:
>>> On 1/19/2024 7:26 PM, Kanchan Joshi wrote:
>>>> On 1/19/2024 12:24 AM, Bart Van Assche wrote:
>>>>> I think the above proposal would introduce a bug: it would break the
>>>>> F_GET_RW_HINT implementation.
>>>>
>>>> Right. I expected to keep the exact change in GET, too, but that will
>>>> not be free from the side-effect.
>>>> The buffered-write path (block_write_full_page) picks the hint from one
>>>> inode, and the direct-write path (__blkdev_direct_IO_simple) picks the
>>>> hint from a different inode.
>>>> So, updating both seems needed here.
>>>
>>> I stand corrected. It's possible to do away with two updates.
>>> The direct-io code (patch 8) should rather be changed to pick the hint
>>> from bdev inode (and not from file inode).
>>> With that change, this patch only need to set the hint into only one
>>> inode (bdev one). What do you think?
>>
>> I think that would break direct I/O submitted by a filesystem.
> 
> By breakage do you mean not being able to set/get the hint correctly?
> I tested with XFS and Ext4 direct I/O. No breakage.

The approach that you proposed is wrong from a conceptual point of view.
Zero, one or more block devices can be associated with a filesystem. It
would be wrong to try to access all associated block devices from inside
the F_SET_RW_HINT implementation. I don't think that there is any API in
the Linux kernel for iterating over all the block devices associated with
a filesystem.

Bart.
diff mbox series

Patch

diff --git a/block/fops.c b/block/fops.c
index 787ce52bc2c6..138b388b5cb1 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -620,6 +620,16 @@  static int blkdev_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+static void blkdev_apply_whint(struct file *file, enum rw_hint hint)
+{
+	struct bdev_handle *handle = file->private_data;
+	struct inode *bd_inode = handle->bdev->bd_inode;
+
+	inode_lock(bd_inode);
+	bd_inode->i_write_hint = hint;
+	inode_unlock(bd_inode);
+}
+
 static ssize_t
 blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from)
 {
@@ -854,6 +864,7 @@  const struct file_operations def_blk_fops = {
 	.splice_read	= filemap_splice_read,
 	.splice_write	= iter_file_splice_write,
 	.fallocate	= blkdev_fallocate,
+	.apply_whint	= blkdev_apply_whint,
 };
 
 static __init int blkdev_init(void)
diff --git a/fs/fcntl.c b/fs/fcntl.c
index fc73c5fae43c..18407bf5bb9b 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -306,6 +306,7 @@  static long fcntl_get_rw_hint(struct file *file, unsigned int cmd,
 static long fcntl_set_rw_hint(struct file *file, unsigned int cmd,
 			      unsigned long arg)
 {
+	void (*apply_whint)(struct file *, enum rw_hint);
 	struct inode *inode = file_inode(file);
 	u64 __user *argp = (u64 __user *)arg;
 	u64 hint;
@@ -317,6 +318,9 @@  static long fcntl_set_rw_hint(struct file *file, unsigned int cmd,
 
 	inode_lock(inode);
 	inode->i_write_hint = hint;
+	apply_whint = inode->i_fop->apply_whint;
+	if (apply_whint)
+		apply_whint(file, hint);
 	inode_unlock(inode);
 
 	return 0;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a08014b68d6e..293017ea2466 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1944,6 +1944,7 @@  struct file_operations {
 	int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
 	int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *,
 				unsigned int poll_flags);
+	void (*apply_whint)(struct file *, enum rw_hint hint);
 } __randomize_layout;
 
 /* Wrap a directory iterator that needs exclusive inode access */