
[05/14] ext4: Convert to use mapping->invalidate_lock

Message ID 20210615091814.28626-5-jack@suse.cz
State Superseded
Series fs: Hole punch vs page cache filling races

Commit Message

Jan Kara June 15, 2021, 9:17 a.m. UTC
Convert ext4 to use mapping->invalidate_lock instead of its private
EXT4_I(inode)->i_mmap_sem. This is mostly search-and-replace. By this
conversion we fix a long standing race between hole punching and read(2)
/ readahead(2) paths that can lead to stale page cache contents.

CC: <linux-ext4@vger.kernel.org>
CC: Ted Tso <tytso@mit.edu>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/ext4.h     | 10 ----------
 fs/ext4/extents.c  | 25 +++++++++++++-----------
 fs/ext4/file.c     | 13 +++++++------
 fs/ext4/inode.c    | 47 +++++++++++++++++-----------------------------
 fs/ext4/ioctl.c    |  4 ++--
 fs/ext4/super.c    | 13 +++++--------
 fs/ext4/truncate.h |  8 +++++---
 7 files changed, 50 insertions(+), 70 deletions(-)
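
For readers who have not followed the earlier patches in this series, the sketch
below (illustrative only, not part of this patch) shows the locking pattern the
conversion adopts: paths that throw away page cache, such as hole punching, take
mapping->invalidate_lock exclusively, while paths that instantiate pages take it
shared. The filemap_invalidate_lock()/filemap_invalidate_unlock() helpers and
their *_shared() variants come from earlier patches in this series; the function
names below are made up for illustration.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

/*
 * Writer side (e.g. hole punch): hold invalidate_lock exclusively so no
 * new pages can be faulted in or read ahead while the range is removed.
 * This replaces down_write(&EXT4_I(inode)->i_mmap_sem).
 */
static int example_punch_range(struct inode *inode, loff_t start, loff_t end)
{
	struct address_space *mapping = inode->i_mapping;
	int ret;

	filemap_invalidate_lock(mapping);
	ret = filemap_write_and_wait_range(mapping, start, end);
	if (!ret)
		truncate_pagecache_range(inode, start, end);
	/* ... remove the underlying blocks while still holding the lock ... */
	filemap_invalidate_unlock(mapping);
	return ret;
}

/*
 * Reader side (e.g. the DAX fault path in this patch): take the lock
 * shared around the work that instantiates pages, so it cannot run
 * concurrently with an invalidation holding the lock exclusively.
 * This replaces down_read(&EXT4_I(inode)->i_mmap_sem).
 */
static void example_fault_side(struct address_space *mapping)
{
	filemap_invalidate_lock_shared(mapping);
	/* ... look up blocks and map the page in here ... */
	filemap_invalidate_unlock_shared(mapping);
}

The regular (non-DAX) fault path no longer needs such a call in ext4: after the
earlier patches in this series, filemap_fault() and the readahead code take
invalidate_lock_shared() themselves when they have to fill the page cache, which
is why ext4_filemap_fault() is deleted below and ext4_file_vm_ops points at
filemap_fault() directly.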

Comments

Darrick J. Wong June 17, 2021, 4:22 p.m. UTC | #1
On Tue, Jun 15, 2021 at 11:17:55AM +0200, Jan Kara wrote:
> Convert ext4 to use mapping->invalidate_lock instead of its private
> EXT4_I(inode)->i_mmap_sem. This is mostly search-and-replace. By this
> conversion we fix a long standing race between hole punching and read(2)
> / readahead(2) paths that can lead to stale page cache contents.
> 
> CC: <linux-ext4@vger.kernel.org>
> CC: Ted Tso <tytso@mit.edu>

Hmm, still no ACK from Ted?

This looks like a pretty straightforward i_mmap_sem conversion, though
in general I'd like /some/ kind of response from anyone in the ext4
community who has been writing code more recently than me...

Reviewed-by: Darrick J. Wong <djwong@kernel.org>


--D

Jan Kara June 22, 2021, 9:54 a.m. UTC | #2
On Thu 17-06-21 09:22:40, Darrick J. Wong wrote:
> On Tue, Jun 15, 2021 at 11:17:55AM +0200, Jan Kara wrote:
> > Convert ext4 to use mapping->invalidate_lock instead of its private
> > EXT4_I(inode)->i_mmap_sem. This is mostly search-and-replace. By this
> > conversion we fix a long standing race between hole punching and read(2)
> > / readahead(2) paths that can lead to stale page cache contents.
> > 
> > CC: <linux-ext4@vger.kernel.org>
> > CC: Ted Tso <tytso@mit.edu>
> 
> Hmm, still no ACK from Ted?

On the ext4 call he mentioned he's fine with the patches and testing has passed
for him, but he has not given an official tag...

> This looks like a pretty straightforward i_mmap_sem conversion, though
> in general I'd like /some/ kind of response from anyone in the ext4
> community who has been writing code more recently than me...
> 
> Reviewed-by: Darrick J. Wong <djwong@kernel.org>

Yeah, this was basically search-and-replace. Thanks for review!

								Honza



-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR
Theodore Ts'o June 22, 2021, 5:34 p.m. UTC | #3
Sorry, forgot to send it out.

Acked-by: Theodore Ts'o <tytso@mit.edu>

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 37002663d521..ed64b4b217a1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1077,15 +1077,6 @@  struct ext4_inode_info {
 	 * by other means, so we have i_data_sem.
 	 */
 	struct rw_semaphore i_data_sem;
-	/*
-	 * i_mmap_sem is for serializing page faults with truncate / punch hole
-	 * operations. We have to make sure that new page cannot be faulted in
-	 * a section of the inode that is being punched. We cannot easily use
-	 * i_data_sem for this since we need protection for the whole punch
-	 * operation and i_data_sem ranks below transaction start so we have
-	 * to occasionally drop it.
-	 */
-	struct rw_semaphore i_mmap_sem;
 	struct inode vfs_inode;
 	struct jbd2_inode *jinode;
 
@@ -2962,7 +2953,6 @@  extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
 			     loff_t lstart, loff_t lend);
 extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf);
-extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
 extern void ext4_da_release_space(struct inode *inode, int to_free);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index cbf37b2cf871..db5d38af9ba8 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4470,6 +4470,7 @@  static long ext4_zero_range(struct file *file, loff_t offset,
 			    loff_t len, int mode)
 {
 	struct inode *inode = file_inode(file);
+	struct address_space *mapping = file->f_mapping;
 	handle_t *handle = NULL;
 	unsigned int max_blocks;
 	loff_t new_size = 0;
@@ -4556,17 +4557,17 @@  static long ext4_zero_range(struct file *file, loff_t offset,
 		 * Prevent page faults from reinstantiating pages we have
 		 * released from page cache.
 		 */
-		down_write(&EXT4_I(inode)->i_mmap_sem);
+		filemap_invalidate_lock(mapping);
 
 		ret = ext4_break_layouts(inode);
 		if (ret) {
-			up_write(&EXT4_I(inode)->i_mmap_sem);
+			filemap_invalidate_unlock(mapping);
 			goto out_mutex;
 		}
 
 		ret = ext4_update_disksize_before_punch(inode, offset, len);
 		if (ret) {
-			up_write(&EXT4_I(inode)->i_mmap_sem);
+			filemap_invalidate_unlock(mapping);
 			goto out_mutex;
 		}
 		/* Now release the pages and zero block aligned part of pages */
@@ -4575,7 +4576,7 @@  static long ext4_zero_range(struct file *file, loff_t offset,
 
 		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
 					     flags);
-		up_write(&EXT4_I(inode)->i_mmap_sem);
+		filemap_invalidate_unlock(mapping);
 		if (ret)
 			goto out_mutex;
 	}
@@ -5217,6 +5218,7 @@  ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 {
 	struct super_block *sb = inode->i_sb;
+	struct address_space *mapping = inode->i_mapping;
 	ext4_lblk_t punch_start, punch_stop;
 	handle_t *handle;
 	unsigned int credits;
@@ -5270,7 +5272,7 @@  static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 	 * Prevent page faults from reinstantiating pages we have released from
 	 * page cache.
 	 */
-	down_write(&EXT4_I(inode)->i_mmap_sem);
+	filemap_invalidate_lock(mapping);
 
 	ret = ext4_break_layouts(inode);
 	if (ret)
@@ -5285,15 +5287,15 @@  static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 	 * Write tail of the last page before removed range since it will get
 	 * removed from the page cache below.
 	 */
-	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset);
+	ret = filemap_write_and_wait_range(mapping, ioffset, offset);
 	if (ret)
 		goto out_mmap;
 	/*
 	 * Write data that will be shifted to preserve them when discarding
 	 * page cache below. We are also protected from pages becoming dirty
-	 * by i_mmap_sem.
+	 * by i_rwsem and invalidate_lock.
 	 */
-	ret = filemap_write_and_wait_range(inode->i_mapping, offset + len,
+	ret = filemap_write_and_wait_range(mapping, offset + len,
 					   LLONG_MAX);
 	if (ret)
 		goto out_mmap;
@@ -5346,7 +5348,7 @@  static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 	ext4_journal_stop(handle);
 	ext4_fc_stop_ineligible(sb);
 out_mmap:
-	up_write(&EXT4_I(inode)->i_mmap_sem);
+	filemap_invalidate_unlock(mapping);
 out_mutex:
 	inode_unlock(inode);
 	return ret;
@@ -5363,6 +5365,7 @@  static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
 {
 	struct super_block *sb = inode->i_sb;
+	struct address_space *mapping = inode->i_mapping;
 	handle_t *handle;
 	struct ext4_ext_path *path;
 	struct ext4_extent *extent;
@@ -5421,7 +5424,7 @@  static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
 	 * Prevent page faults from reinstantiating pages we have released from
 	 * page cache.
 	 */
-	down_write(&EXT4_I(inode)->i_mmap_sem);
+	filemap_invalidate_lock(mapping);
 
 	ret = ext4_break_layouts(inode);
 	if (ret)
@@ -5522,7 +5525,7 @@  static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
 	ext4_journal_stop(handle);
 	ext4_fc_stop_ineligible(sb);
 out_mmap:
-	up_write(&EXT4_I(inode)->i_mmap_sem);
+	filemap_invalidate_unlock(mapping);
 out_mutex:
 	inode_unlock(inode);
 	return ret;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 816dedcbd541..d3b4ed91aa68 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -704,22 +704,23 @@  static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
 	 */
 	bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
 		(vmf->vma->vm_flags & VM_SHARED);
+	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
 	pfn_t pfn;
 
 	if (write) {
 		sb_start_pagefault(sb);
 		file_update_time(vmf->vma->vm_file);
-		down_read(&EXT4_I(inode)->i_mmap_sem);
+		filemap_invalidate_lock_shared(mapping);
 retry:
 		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
 					       EXT4_DATA_TRANS_BLOCKS(sb));
 		if (IS_ERR(handle)) {
-			up_read(&EXT4_I(inode)->i_mmap_sem);
+			filemap_invalidate_unlock_shared(mapping);
 			sb_end_pagefault(sb);
 			return VM_FAULT_SIGBUS;
 		}
 	} else {
-		down_read(&EXT4_I(inode)->i_mmap_sem);
+		filemap_invalidate_lock_shared(mapping);
 	}
 	result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops);
 	if (write) {
@@ -731,10 +732,10 @@  static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
 		/* Handling synchronous page fault? */
 		if (result & VM_FAULT_NEEDDSYNC)
 			result = dax_finish_sync_fault(vmf, pe_size, pfn);
-		up_read(&EXT4_I(inode)->i_mmap_sem);
+		filemap_invalidate_unlock_shared(mapping);
 		sb_end_pagefault(sb);
 	} else {
-		up_read(&EXT4_I(inode)->i_mmap_sem);
+		filemap_invalidate_unlock_shared(mapping);
 	}
 
 	return result;
@@ -756,7 +757,7 @@  static const struct vm_operations_struct ext4_dax_vm_ops = {
 #endif
 
 static const struct vm_operations_struct ext4_file_vm_ops = {
-	.fault		= ext4_filemap_fault,
+	.fault		= filemap_fault,
 	.map_pages	= filemap_map_pages,
 	.page_mkwrite   = ext4_page_mkwrite,
 };
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index fe6045a46599..ee6e69d6f949 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3950,20 +3950,19 @@  int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
 	return ret;
 }
 
-static void ext4_wait_dax_page(struct ext4_inode_info *ei)
+static void ext4_wait_dax_page(struct inode *inode)
 {
-	up_write(&ei->i_mmap_sem);
+	filemap_invalidate_unlock(inode->i_mapping);
 	schedule();
-	down_write(&ei->i_mmap_sem);
+	filemap_invalidate_lock(inode->i_mapping);
 }
 
 int ext4_break_layouts(struct inode *inode)
 {
-	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct page *page;
 	int error;
 
-	if (WARN_ON_ONCE(!rwsem_is_locked(&ei->i_mmap_sem)))
+	if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock)))
 		return -EINVAL;
 
 	do {
@@ -3974,7 +3973,7 @@  int ext4_break_layouts(struct inode *inode)
 		error = ___wait_var_event(&page->_refcount,
 				atomic_read(&page->_refcount) == 1,
 				TASK_INTERRUPTIBLE, 0, 0,
-				ext4_wait_dax_page(ei));
+				ext4_wait_dax_page(inode));
 	} while (error == 0);
 
 	return error;
@@ -4005,9 +4004,9 @@  int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 
 	ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
 	if (ext4_has_inline_data(inode)) {
-		down_write(&EXT4_I(inode)->i_mmap_sem);
+		filemap_invalidate_lock(mapping);
 		ret = ext4_convert_inline_data(inode);
-		up_write(&EXT4_I(inode)->i_mmap_sem);
+		filemap_invalidate_unlock(mapping);
 		if (ret)
 			return ret;
 	}
@@ -4058,7 +4057,7 @@  int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 	 * Prevent page faults from reinstantiating pages we have released from
 	 * page cache.
 	 */
-	down_write(&EXT4_I(inode)->i_mmap_sem);
+	filemap_invalidate_lock(mapping);
 
 	ret = ext4_break_layouts(inode);
 	if (ret)
@@ -4131,7 +4130,7 @@  int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 out_stop:
 	ext4_journal_stop(handle);
 out_dio:
-	up_write(&EXT4_I(inode)->i_mmap_sem);
+	filemap_invalidate_unlock(mapping);
 out_mutex:
 	inode_unlock(inode);
 	return ret;
@@ -5426,11 +5425,11 @@  int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 			inode_dio_wait(inode);
 		}
 
-		down_write(&EXT4_I(inode)->i_mmap_sem);
+		filemap_invalidate_lock(inode->i_mapping);
 
 		rc = ext4_break_layouts(inode);
 		if (rc) {
-			up_write(&EXT4_I(inode)->i_mmap_sem);
+			filemap_invalidate_unlock(inode->i_mapping);
 			goto err_out;
 		}
 
@@ -5506,7 +5505,7 @@  int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 				error = rc;
 		}
 out_mmap_sem:
-		up_write(&EXT4_I(inode)->i_mmap_sem);
+		filemap_invalidate_unlock(inode->i_mapping);
 	}
 
 	if (!error) {
@@ -5983,10 +5982,10 @@  int ext4_change_inode_journal_flag(struct inode *inode, int val)
 	 * data (and journalled aops don't know how to handle these cases).
 	 */
 	if (val) {
-		down_write(&EXT4_I(inode)->i_mmap_sem);
+		filemap_invalidate_lock(inode->i_mapping);
 		err = filemap_write_and_wait(inode->i_mapping);
 		if (err < 0) {
-			up_write(&EXT4_I(inode)->i_mmap_sem);
+			filemap_invalidate_unlock(inode->i_mapping);
 			return err;
 		}
 	}
@@ -6019,7 +6018,7 @@  int ext4_change_inode_journal_flag(struct inode *inode, int val)
 	percpu_up_write(&sbi->s_writepages_rwsem);
 
 	if (val)
-		up_write(&EXT4_I(inode)->i_mmap_sem);
+		filemap_invalidate_unlock(inode->i_mapping);
 
 	/* Finally we can mark the inode as dirty. */
 
@@ -6063,7 +6062,7 @@  vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
 	sb_start_pagefault(inode->i_sb);
 	file_update_time(vma->vm_file);
 
-	down_read(&EXT4_I(inode)->i_mmap_sem);
+	filemap_invalidate_lock_shared(mapping);
 
 	err = ext4_convert_inline_data(inode);
 	if (err)
@@ -6176,7 +6175,7 @@  vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
 out_ret:
 	ret = block_page_mkwrite_return(err);
 out:
-	up_read(&EXT4_I(inode)->i_mmap_sem);
+	filemap_invalidate_unlock_shared(mapping);
 	sb_end_pagefault(inode->i_sb);
 	return ret;
 out_error:
@@ -6184,15 +6183,3 @@  vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
 	ext4_journal_stop(handle);
 	goto out;
 }
-
-vm_fault_t ext4_filemap_fault(struct vm_fault *vmf)
-{
-	struct inode *inode = file_inode(vmf->vma->vm_file);
-	vm_fault_t ret;
-
-	down_read(&EXT4_I(inode)->i_mmap_sem);
-	ret = filemap_fault(vmf);
-	up_read(&EXT4_I(inode)->i_mmap_sem);
-
-	return ret;
-}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 31627f7dc5cd..c5ed562b4185 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -148,7 +148,7 @@  static long swap_inode_boot_loader(struct super_block *sb,
 		goto journal_err_out;
 	}
 
-	down_write(&EXT4_I(inode)->i_mmap_sem);
+	filemap_invalidate_lock(inode->i_mapping);
 	err = filemap_write_and_wait(inode->i_mapping);
 	if (err)
 		goto err_out;
@@ -256,7 +256,7 @@  static long swap_inode_boot_loader(struct super_block *sb,
 	ext4_double_up_write_data_sem(inode, inode_bl);
 
 err_out:
-	up_write(&EXT4_I(inode)->i_mmap_sem);
+	filemap_invalidate_unlock(inode->i_mapping);
 journal_err_out:
 	unlock_two_nondirectories(inode, inode_bl);
 	iput(inode_bl);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d29f6aa7d96e..c3c3cd8b0966 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -90,12 +90,9 @@  static struct inode *ext4_get_journal_inode(struct super_block *sb,
 /*
  * Lock ordering
  *
- * Note the difference between i_mmap_sem (EXT4_I(inode)->i_mmap_sem) and
- * i_mmap_rwsem (inode->i_mmap_rwsem)!
- *
  * page fault path:
- * mmap_lock -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start ->
- *   page lock -> i_data_sem (rw)
+ * mmap_lock -> sb_start_pagefault -> invalidate_lock (r) -> transaction start
+ *   -> page lock -> i_data_sem (rw)
  *
  * buffered write path:
  * sb_start_write -> i_mutex -> mmap_lock
@@ -103,8 +100,9 @@  static struct inode *ext4_get_journal_inode(struct super_block *sb,
  *   i_data_sem (rw)
  *
  * truncate:
- * sb_start_write -> i_mutex -> i_mmap_sem (w) -> i_mmap_rwsem (w) -> page lock
- * sb_start_write -> i_mutex -> i_mmap_sem (w) -> transaction start ->
+ * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) ->
+ *   page lock
+ * sb_start_write -> i_mutex -> invalidate_lock (w) -> transaction start ->
  *   i_data_sem (rw)
  *
  * direct IO:
@@ -1350,7 +1348,6 @@  static void init_once(void *foo)
 	INIT_LIST_HEAD(&ei->i_orphan);
 	init_rwsem(&ei->xattr_sem);
 	init_rwsem(&ei->i_data_sem);
-	init_rwsem(&ei->i_mmap_sem);
 	inode_init_once(&ei->vfs_inode);
 	ext4_fc_init_inode(&ei->vfs_inode);
 }
diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h
index bcbe3668c1d4..ce84aa2786c7 100644
--- a/fs/ext4/truncate.h
+++ b/fs/ext4/truncate.h
@@ -11,14 +11,16 @@ 
  */
 static inline void ext4_truncate_failed_write(struct inode *inode)
 {
+	struct address_space *mapping = inode->i_mapping;
+
 	/*
 	 * We don't need to call ext4_break_layouts() because the blocks we
 	 * are truncating were never visible to userspace.
 	 */
-	down_write(&EXT4_I(inode)->i_mmap_sem);
-	truncate_inode_pages(inode->i_mapping, inode->i_size);
+	filemap_invalidate_lock(mapping);
+	truncate_inode_pages(mapping, inode->i_size);
 	ext4_truncate(inode);
-	up_write(&EXT4_I(inode)->i_mmap_sem);
+	filemap_invalidate_unlock(mapping);
 }
 
 /*