
[v3] ceph: fix potential use-after-free bug when trimming caps

Message ID 20230418014506.95428-1-xiubli@redhat.com
State New
Series [v3] ceph: fix potential use-after-free bug when trimming caps

Commit Message

Xiubo Li April 18, 2023, 1:45 a.m. UTC
From: Xiubo Li <xiubli@redhat.com>

When trimming the caps, just after the 'session->s_cap_lock' is
released in ceph_iterate_session_caps() the cap may be removed by
another thread, and using the stale cap memory in the callbacks
will trigger a use-after-free crash.

We need to check that the cap still exists just after the
'ci->i_ceph_lock' is acquired, and do nothing if it has already been
removed.

Cc: stable@vger.kernel.org
URL: https://bugzilla.redhat.com/show_bug.cgi?id=2186264
URL: https://tracker.ceph.com/issues/43272
Signed-off-by: Xiubo Li <xiubli@redhat.com>
---

V3:
- Fixed 3 issues reported by Luis and the kernel test robot <lkp@intel.com>

 fs/ceph/debugfs.c    |  7 ++++-
 fs/ceph/mds_client.c | 68 +++++++++++++++++++++++++++++---------------
 fs/ceph/mds_client.h |  2 +-
 3 files changed, 52 insertions(+), 25 deletions(-)
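
At the API level the series changes the ceph_iterate_session_caps() callback
prototype (see the full diff below), so that callbacks no longer receive a
possibly stale 'struct ceph_cap *' and instead re-derive the cap themselves
under 'ci->i_ceph_lock':

	/* before */
	int (*cb)(struct inode *, struct ceph_cap *, void *)
	/* after (this series, v3) */
	int (*cb)(struct inode *, struct rb_node *ci_node, void *)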

Comments

Luis Henriques April 19, 2023, 1:22 p.m. UTC | #1
Xiubo Li <xiubli@redhat.com> writes:

> On 4/18/23 22:20, Luís Henriques wrote:
>> xiubli@redhat.com writes:
>>
>>> From: Xiubo Li <xiubli@redhat.com>
>>>
>>> When trimming the caps, just after the 'session->s_cap_lock' is
>>> released in ceph_iterate_session_caps() the cap may be removed by
>>> another thread, and using the stale cap memory in the callbacks
>>> will trigger a use-after-free crash.
>>>
>>> We need to check that the cap still exists just after the
>>> 'ci->i_ceph_lock' is acquired, and do nothing if it has already been
>>> removed.
>> Your patch seems to be OK, but I'll be honest: the locking is *so* complex
>> that I can't say for sure it really solves the problem :-(
>>
>> ceph_put_cap() uses mdsc->caps_list_lock to protect the list, but I can't
>> be sure that holding ci->i_ceph_lock will prevent the race in the case
>> you're trying to solve.
>
> The 'mdsc->caps_list_lock' only protects these members of the mdsc:
>
>         /*
>          * Cap reservations
>          *
>          * Maintain a global pool of preallocated struct ceph_caps, referenced
>          * by struct ceph_caps_reservations.  This ensures that we preallocate
>          * memory needed to successfully process an MDS response. (If an MDS
>          * sends us cap information and we fail to process it, we will have
>          * problems due to the client and MDS being out of sync.)
>          *
>          * Reservations are 'owned' by a ceph_cap_reservation context.
>          */
>         spinlock_t      caps_list_lock;
>         struct          list_head caps_list; /* unused (reserved or
>                                                 unreserved) */
>         struct          list_head cap_wait_list;
>         int             caps_total_count;    /* total caps allocated */
>         int             caps_use_count;      /* in use */
>         int             caps_use_max;        /* max used caps */
>         int             caps_reserve_count;  /* unused, reserved */
>         int             caps_avail_count;    /* unused, unreserved */
>         int             caps_min_count;      /* keep at least this many
>
> It does not protect the cap list in the session or the inode.
>
>
> And the race is between the session's cap list and the inode's cap rbtree,
> which both hold a reference to the same 'cap'.
>
> So in 'ceph_iterate_session_caps()', after getting the 'cap' and releasing
> the 'session->s_cap_lock', just before the 'cap' is passed to _cb() another
> thread could run and release the 'cap'. The 'cap' is then stale, and when
> _cb() dereferences it the kernel will crash.
>
> And if the 'cap' is stale, it no longer exists in the inode's cap rbtree.
> Please note the lock order is:
>
> 1, spin_lock(&ci->i_ceph_lock)
>
> 2, spin_lock(&session->s_cap_lock)
>
>
> Before:
>
> ThreadA: __ceph_remove_caps() -->
>              spin_lock(&ci->i_ceph_lock)
>              ceph_remove_cap() -->
>                  __ceph_remove_cap() -->
>
> ThreadB: ceph_iterate_session_caps() -->
>              spin_lock(&session->s_cap_lock);
>              cap = list_entry(p, struct ceph_cap, session_caps);
>              spin_unlock(&session->s_cap_lock);
>
> ThreadA:     ... continues in __ceph_remove_cap():
>              spin_lock(&session->s_cap_lock);
>              // remove it from the session's cap list
>              list_del_init(&cap->session_caps);
>              spin_unlock(&session->s_cap_lock);
>              ceph_put_cap()
>              spin_unlock(&ci->i_ceph_lock)
>
> ThreadB: trim_caps_cb('cap') -->    // the _cb() could be deferred until after
>                                     // ThreadA has finished ceph_put_cap()
>              dereferencing cap->xxx will trigger the crash
>
> With this patch:
>
> ThreadA: __ceph_remove_caps() -->
>              spin_lock(&ci->i_ceph_lock)
>              ceph_remove_cap() -->
>                  __ceph_remove_cap() -->
>
> ThreadB: ceph_iterate_session_caps() -->
>              spin_lock(&session->s_cap_lock);
>              cap = list_entry(p, struct ceph_cap, session_caps);
>              ci_node = &cap->ci_node;
>              spin_unlock(&session->s_cap_lock);
>
> ThreadA:     ... continues in __ceph_remove_cap():
>              spin_lock(&session->s_cap_lock);
>              // remove it from the session's cap list
>              list_del_init(&cap->session_caps);
>              spin_unlock(&session->s_cap_lock);
>              ceph_put_cap()
>              spin_unlock(&ci->i_ceph_lock)
>
> ThreadB: trim_caps_cb('ci_node') -->
>              spin_lock(&ci->i_ceph_lock)
>              cap = rb_entry(ci_node, struct ceph_cap, ci_node);
>                  // This is buggy in this version, we should use the 'mds'
>                  // instead and I will fix it.
>              if (!cap) { release the spin lock and return directly }
>              spin_unlock(&ci->i_ceph_lock)

Thanks a lot for taking the time to explain all of this.  Much
appreciated.  It all seems to make sense, and, again, I don't have any
real objection to your patch.  It's just that I still find the whole
locking to be too complex, and every change that is made to it looks like
walking through a minefield :-)

> We should switch to using the cap's 'mds' instead of the 'ci_node', which
> is buggy. I will fix it in the next version.

Yeah, I've taken a quick look at v4 and it looks like it fixes this.

>> Is the issue in that bugzilla reproducible, or was that a one-time thing?
>
> No, I don't think so. Locally I have tried tuning the mds options to trigger
> cap reclaiming more frequently, but still couldn't reproduce it. It seems to
> be a very rare corner case.

Yeah, too bad.  It would help to gain some extra confidence in the patch.

Cheers,
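
A minimal sketch of the re-validation the discussion converges on for v4:
look the cap up again by its MDS rank once 'ci->i_ceph_lock' is held, and
bail out if it is gone. This is illustrative only and not part of the v3
patch; the callback name is hypothetical, and __get_cap_for_mds() is the
existing lookup helper in fs/ceph/caps.c, which must be called with
'i_ceph_lock' held.

	static int example_cap_cb(struct inode *inode, int mds, void *arg)
	{
		struct ceph_inode_info *ci = ceph_inode(inode);
		struct ceph_cap *cap;

		spin_lock(&ci->i_ceph_lock);
		/* re-derive the cap only after i_ceph_lock is held */
		cap = __get_cap_for_mds(ci, mds);
		if (!cap) {
			/* the cap was removed in the meantime; nothing to do */
			spin_unlock(&ci->i_ceph_lock);
			return 0;
		}
		/* cap->issued, cap->implemented, etc. are safe to use here */
		spin_unlock(&ci->i_ceph_lock);
		return 0;
	}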
Ilya Dryomov April 30, 2023, 8:39 a.m. UTC | #2
On Wed, Apr 20, 2023 at 3:22 PM Luís Henriques <lhenriques@suse.de> wrote:
>
> Xiubo Li <xiubli@redhat.com> writes:
>
> > [snip]
>
> Yeah, I've taken a quick look at v4 and it looks like it fixes this.

Hi Luís,

Do you mind if I put this down as a Reviewed-by? ;)

Thanks,

                Ilya
Luis Henriques May 1, 2023, 10:37 a.m. UTC | #3
Ilya Dryomov <idryomov@gmail.com> writes:

> On Wed, Apr 20, 2023 at 3:22 PM Luís Henriques <lhenriques@suse.de> wrote:
>>
>> [snip]
>>
>> Yeah, I've taken a quick look at v4 and it looks like it fixes this.
>
> Hi Luís,
>
> Do you mind if I put this down as a Reviewed-by? ;)

Sure, feel free to add my

Reviewed-by: Luís Henriques <lhenriques@suse.de>

(Sorry, forgot to send that explicitly.)

Cheers,

Patch

diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index bec3c4549c07..5c0f07df5b02 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -248,14 +248,19 @@  static int metrics_caps_show(struct seq_file *s, void *p)
 	return 0;
 }
 
-static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
+static int caps_show_cb(struct inode *inode, struct rb_node *ci_node, void *p)
 {
+	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct seq_file *s = p;
+	struct ceph_cap *cap;
 
+	spin_lock(&ci->i_ceph_lock);
+	cap = rb_entry(ci_node, struct ceph_cap, ci_node);
 	seq_printf(s, "0x%-17llx%-3d%-17s%-17s\n", ceph_ino(inode),
 		   cap->session->s_mds,
 		   ceph_cap_string(cap->issued),
 		   ceph_cap_string(cap->implemented));
+	spin_unlock(&ci->i_ceph_lock);
 	return 0;
 }
 
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 294af79c25c9..fb777add2257 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1786,7 +1786,7 @@  static void cleanup_session_requests(struct ceph_mds_client *mdsc,
  * Caller must hold session s_mutex.
  */
 int ceph_iterate_session_caps(struct ceph_mds_session *session,
-			      int (*cb)(struct inode *, struct ceph_cap *,
+			      int (*cb)(struct inode *, struct rb_node *ci_node,
 					void *), void *arg)
 {
 	struct list_head *p;
@@ -1799,6 +1799,8 @@  int ceph_iterate_session_caps(struct ceph_mds_session *session,
 	spin_lock(&session->s_cap_lock);
 	p = session->s_caps.next;
 	while (p != &session->s_caps) {
+		struct rb_node *ci_node;
+
 		cap = list_entry(p, struct ceph_cap, session_caps);
 		inode = igrab(&cap->ci->netfs.inode);
 		if (!inode) {
@@ -1806,6 +1808,7 @@  int ceph_iterate_session_caps(struct ceph_mds_session *session,
 			continue;
 		}
 		session->s_cap_iterator = cap;
+		ci_node = &cap->ci_node;
 		spin_unlock(&session->s_cap_lock);
 
 		if (last_inode) {
@@ -1817,7 +1820,7 @@  int ceph_iterate_session_caps(struct ceph_mds_session *session,
 			old_cap = NULL;
 		}
 
-		ret = cb(inode, cap, arg);
+		ret = cb(inode, ci_node, arg);
 		last_inode = inode;
 
 		spin_lock(&session->s_cap_lock);
@@ -1850,20 +1853,26 @@  int ceph_iterate_session_caps(struct ceph_mds_session *session,
 	return ret;
 }
 
-static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
+static int remove_session_caps_cb(struct inode *inode, struct rb_node *ci_node,
 				  void *arg)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	bool invalidate = false;
-	int iputs;
+	struct ceph_cap *cap;
+	int iputs = 0;
 
-	dout("removing cap %p, ci is %p, inode is %p\n",
-	     cap, ci, &ci->netfs.inode);
 	spin_lock(&ci->i_ceph_lock);
-	iputs = ceph_purge_inode_cap(inode, cap, &invalidate);
+	cap = rb_entry(ci_node, struct ceph_cap, ci_node);
+	if (cap) {
+		dout(" removing cap %p, ci is %p, inode is %p\n",
+		     cap, ci, &ci->netfs.inode);
+
+		iputs = ceph_purge_inode_cap(inode, cap, &invalidate);
+	}
 	spin_unlock(&ci->i_ceph_lock);
 
-	wake_up_all(&ci->i_cap_wq);
+	if (cap)
+		wake_up_all(&ci->i_cap_wq);
 	if (invalidate)
 		ceph_queue_invalidate(inode);
 	while (iputs--)
@@ -1934,8 +1943,7 @@  enum {
  *
  * caller must hold s_mutex.
  */
-static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
-			      void *arg)
+static int wake_up_session_cb(struct inode *inode, struct rb_node *ci_node, void *arg)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	unsigned long ev = (unsigned long)arg;
@@ -1946,12 +1954,14 @@  static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
 		ci->i_requested_max_size = 0;
 		spin_unlock(&ci->i_ceph_lock);
 	} else if (ev == RENEWCAPS) {
-		if (cap->cap_gen < atomic_read(&cap->session->s_cap_gen)) {
-			/* mds did not re-issue stale cap */
-			spin_lock(&ci->i_ceph_lock);
+		struct ceph_cap *cap;
+
+		spin_lock(&ci->i_ceph_lock);
+		cap = rb_entry(ci_node, struct ceph_cap, ci_node);
+		/* mds did not re-issue stale cap */
+		if (cap && cap->cap_gen < atomic_read(&cap->session->s_cap_gen))
 			cap->issued = cap->implemented = CEPH_CAP_PIN;
-			spin_unlock(&ci->i_ceph_lock);
-		}
+		spin_unlock(&ci->i_ceph_lock);
 	} else if (ev == FORCE_RO) {
 	}
 	wake_up_all(&ci->i_cap_wq);
@@ -2113,16 +2123,22 @@  static bool drop_negative_children(struct dentry *dentry)
  * Yes, this is a bit sloppy.  Our only real goal here is to respond to
  * memory pressure from the MDS, though, so it needn't be perfect.
  */
-static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
+static int trim_caps_cb(struct inode *inode, struct rb_node *ci_node, void *arg)
 {
 	int *remaining = arg;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int used, wanted, oissued, mine;
+	struct ceph_cap *cap;
 
 	if (*remaining <= 0)
 		return -1;
 
 	spin_lock(&ci->i_ceph_lock);
+	cap = rb_entry(ci_node, struct ceph_cap, ci_node);
+	if (!cap) {
+		spin_unlock(&ci->i_ceph_lock);
+		return 0;
+	}
 	mine = cap->issued | cap->implemented;
 	used = __ceph_caps_used(ci);
 	wanted = __ceph_caps_file_wanted(ci);
@@ -4265,26 +4281,23 @@  static struct dentry* d_find_primary(struct inode *inode)
 /*
  * Encode information about a cap for a reconnect with the MDS.
  */
-static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
+static int reconnect_caps_cb(struct inode *inode, struct rb_node *ci_node,
 			  void *arg)
 {
 	union {
 		struct ceph_mds_cap_reconnect v2;
 		struct ceph_mds_cap_reconnect_v1 v1;
 	} rec;
-	struct ceph_inode_info *ci = cap->ci;
+	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_reconnect_state *recon_state = arg;
 	struct ceph_pagelist *pagelist = recon_state->pagelist;
 	struct dentry *dentry;
+	struct ceph_cap *cap;
 	char *path;
-	int pathlen = 0, err;
+	int pathlen = 0, err = 0;
 	u64 pathbase;
 	u64 snap_follows;
 
-	dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
-	     inode, ceph_vinop(inode), cap, cap->cap_id,
-	     ceph_cap_string(cap->issued));
-
 	dentry = d_find_primary(inode);
 	if (dentry) {
 		/* set pathbase to parent dir when msg_version >= 2 */
@@ -4301,6 +4314,15 @@  static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
 	}
 
 	spin_lock(&ci->i_ceph_lock);
+	cap = rb_entry(ci_node, struct ceph_cap, ci_node);
+	if (!cap) {
+		spin_unlock(&ci->i_ceph_lock);
+		goto out_err;
+	}
+	dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
+	     inode, ceph_vinop(inode), cap, cap->cap_id,
+	     ceph_cap_string(cap->issued));
+
 	cap->seq = 0;        /* reset cap seq */
 	cap->issue_seq = 0;  /* and issue_seq */
 	cap->mseq = 0;       /* and migrate_seq */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 0f70ca3cdb21..001b69d04307 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -569,7 +569,7 @@  extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc);
 extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr);
 extern int ceph_iterate_session_caps(struct ceph_mds_session *session,
 				     int (*cb)(struct inode *,
-					       struct ceph_cap *, void *),
+					       struct rb_node *ci_node, void *),
 				     void *arg);
 extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
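
Callers of ceph_iterate_session_caps() should not need any changes, since only
the type of the callback's second argument differs. A rough sketch of how the
existing trim path keeps invoking the iterator (based on the current
ceph_trim_caps(); the local variable name here is illustrative and this is not
part of the diff above):

	int remaining = caps_to_trim;	/* how many caps we still want to drop */

	ceph_iterate_session_caps(session, trim_caps_cb, &remaining);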