Message ID | 20220425115828.6966-1-xiubli@redhat.com |
---|---|
State | New |
Headers | show |
Series | ceph: fix possible deadlock when holding Fwb to get inline_data | expand |
On Mon, 2022-04-25 at 19:58 +0800, Xiubo Li wrote: > 1, mount with wsync. > 2, create a file with O_RDWR, and the request was sent to mds.0: > > ceph_atomic_open()--> > ceph_mdsc_do_request(openc) > finish_open(file, dentry, ceph_open)--> > ceph_open()--> > ceph_init_file()--> > ceph_init_file_info()--> > ceph_uninline_data()--> > { > ... > if (inline_version == 1 || /* initial version, no data */ > inline_version == CEPH_INLINE_NONE) > goto out_unlock; > ... > } > > The inline_version will be 1, which is the initial version for a > newly created file. And here ci->i_inline_version will stay at 1, > which is buggy. > > 3, do a buffered write to the file immediately: > > ceph_write_iter()--> > ceph_get_caps(file, need=Fw, want=Fb, ...); > generic_perform_write()--> > a_ops->write_begin()--> > ceph_write_begin()--> > netfs_write_begin()--> > netfs_begin_read()--> > netfs_rreq_submit_slice()--> > netfs_read_from_server()--> > rreq->netfs_ops->issue_read()--> > ceph_netfs_issue_read()--> > { > ... > if (ci->i_inline_version != CEPH_INLINE_NONE && > ceph_netfs_issue_op_inline(subreq)) > return; > ... > } > ceph_put_cap_refs(ci, Fwb); > > The ceph_netfs_issue_op_inline() will send a getattr(Fsr) request to > mds.1. > > 4, then mds.1 will request the rd lock for CInode::filelock from > the auth mds.0, and mds.0 will do the CInode::filelock state transition > from excl --> sync, but it needs to revoke the Fxwb caps back from the > clients. > > Meanwhile the kernel client already holds the Fwb caps and is waiting for > the getattr(Fsr). > > It's a deadlock! 
> > URL: https://tracker.ceph.com/issues/55377 > Signed-off-by: Xiubo Li <xiubli@redhat.com> > --- > fs/ceph/addr.c | 33 +++++++++++++++++++-------------- > 1 file changed, 19 insertions(+), 14 deletions(-) > > diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c > index 02722ac86d73..15e7b48cbc95 100644 > --- a/fs/ceph/addr.c > +++ b/fs/ceph/addr.c > @@ -1641,7 +1641,7 @@ int ceph_uninline_data(struct file *file) > struct inode *inode = file_inode(file); > struct ceph_inode_info *ci = ceph_inode(inode); > struct ceph_fs_client *fsc = ceph_inode_to_client(inode); > - struct ceph_osd_request *req; > + struct ceph_osd_request *req = NULL; > struct ceph_cap_flush *prealloc_cf; > struct folio *folio = NULL; > u64 inline_version = CEPH_INLINE_NONE; > @@ -1649,10 +1649,23 @@ int ceph_uninline_data(struct file *file) > int err = 0; > u64 len; > > + spin_lock(&ci->i_ceph_lock); > + inline_version = ci->i_inline_version; > + spin_unlock(&ci->i_ceph_lock); > + > + dout("uninline_data %p %llx.%llx inline_version %llu\n", > + inode, ceph_vinop(inode), inline_version); > + > + if (inline_version == CEPH_INLINE_NONE) > + return 0; > + > prealloc_cf = ceph_alloc_cap_flush(); > if (!prealloc_cf) > return -ENOMEM; > > + if (inline_version == 1) /* initial version, no data */ > + goto out_uninline; > + > folio = read_mapping_folio(inode->i_mapping, 0, file); > if (IS_ERR(folio)) { > err = PTR_ERR(folio); > @@ -1661,17 +1674,6 @@ int ceph_uninline_data(struct file *file) > > folio_lock(folio); > > - spin_lock(&ci->i_ceph_lock); > - inline_version = ci->i_inline_version; > - spin_unlock(&ci->i_ceph_lock); > - > - dout("uninline_data %p %llx.%llx inline_version %llu\n", > - inode, ceph_vinop(inode), inline_version); > - > - if (inline_version == 1 || /* initial version, no data */ > - inline_version == CEPH_INLINE_NONE) > - goto out_unlock; > - > len = i_size_read(inode); > if (len > folio_size(folio)) > len = folio_size(folio); > @@ -1736,6 +1738,7 @@ int ceph_uninline_data(struct file *file) 
> ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, > req->r_end_latency, len, err); > > +out_uninline: > if (!err) { > int dirty; > > @@ -1754,8 +1757,10 @@ int ceph_uninline_data(struct file *file) > if (err == -ECANCELED) > err = 0; > out_unlock: > - folio_unlock(folio); > - folio_put(folio); > + if (folio) { > + folio_unlock(folio); > + folio_put(folio); > + } > out: > ceph_free_cap_flush(prealloc_cf); > dout("uninline_data %p %llx.%llx inline_version %llu = %d\n", Nice catch! Reviewed-by: Jeff Layton <jlayton@kernel.org>
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 02722ac86d73..15e7b48cbc95 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1641,7 +1641,7 @@ int ceph_uninline_data(struct file *file) struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_osd_request *req; + struct ceph_osd_request *req = NULL; struct ceph_cap_flush *prealloc_cf; struct folio *folio = NULL; u64 inline_version = CEPH_INLINE_NONE; @@ -1649,10 +1649,23 @@ int ceph_uninline_data(struct file *file) int err = 0; u64 len; + spin_lock(&ci->i_ceph_lock); + inline_version = ci->i_inline_version; + spin_unlock(&ci->i_ceph_lock); + + dout("uninline_data %p %llx.%llx inline_version %llu\n", + inode, ceph_vinop(inode), inline_version); + + if (inline_version == CEPH_INLINE_NONE) + return 0; + prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) return -ENOMEM; + if (inline_version == 1) /* initial version, no data */ + goto out_uninline; + folio = read_mapping_folio(inode->i_mapping, 0, file); if (IS_ERR(folio)) { err = PTR_ERR(folio); @@ -1661,17 +1674,6 @@ int ceph_uninline_data(struct file *file) folio_lock(folio); - spin_lock(&ci->i_ceph_lock); - inline_version = ci->i_inline_version; - spin_unlock(&ci->i_ceph_lock); - - dout("uninline_data %p %llx.%llx inline_version %llu\n", - inode, ceph_vinop(inode), inline_version); - - if (inline_version == 1 || /* initial version, no data */ - inline_version == CEPH_INLINE_NONE) - goto out_unlock; - len = i_size_read(inode); if (len > folio_size(folio)) len = folio_size(folio); @@ -1736,6 +1738,7 @@ int ceph_uninline_data(struct file *file) ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, err); +out_uninline: if (!err) { int dirty; @@ -1754,8 +1757,10 @@ int ceph_uninline_data(struct file *file) if (err == -ECANCELED) err = 0; out_unlock: - folio_unlock(folio); - folio_put(folio); + if (folio) { + 
folio_unlock(folio); + folio_put(folio); + } out: ceph_free_cap_flush(prealloc_cf); dout("uninline_data %p %llx.%llx inline_version %llu = %d\n",
1, mount with wsync. 2, create a file with O_RDWR, and the request was sent to mds.0: ceph_atomic_open()--> ceph_mdsc_do_request(openc) finish_open(file, dentry, ceph_open)--> ceph_open()--> ceph_init_file()--> ceph_init_file_info()--> ceph_uninline_data()--> { ... if (inline_version == 1 || /* initial version, no data */ inline_version == CEPH_INLINE_NONE) goto out_unlock; ... } The inline_version will be 1, which is the initial version for a newly created file. And here ci->i_inline_version will stay at 1, which is buggy. 3, do a buffered write to the file immediately: ceph_write_iter()--> ceph_get_caps(file, need=Fw, want=Fb, ...); generic_perform_write()--> a_ops->write_begin()--> ceph_write_begin()--> netfs_write_begin()--> netfs_begin_read()--> netfs_rreq_submit_slice()--> netfs_read_from_server()--> rreq->netfs_ops->issue_read()--> ceph_netfs_issue_read()--> { ... if (ci->i_inline_version != CEPH_INLINE_NONE && ceph_netfs_issue_op_inline(subreq)) return; ... } ceph_put_cap_refs(ci, Fwb); The ceph_netfs_issue_op_inline() will send a getattr(Fsr) request to mds.1. 4, then mds.1 will request the rd lock for CInode::filelock from the auth mds.0, and mds.0 will do the CInode::filelock state transition from excl --> sync, but it needs to revoke the Fxwb caps back from the clients. Meanwhile the kernel client already holds the Fwb caps and is waiting for the getattr(Fsr). It's a deadlock! URL: https://tracker.ceph.com/issues/55377 Signed-off-by: Xiubo Li <xiubli@redhat.com> --- fs/ceph/addr.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-)