
[v3] ceph: dump info about cap flushes when we're waiting too long for them

Message ID 20210729180442.177399-1-jlayton@kernel.org
State New

Commit Message

Jeff Layton July 29, 2021, 6:04 p.m. UTC
We've had some cases of hung umounts in teuthology testing. It looks
like the client is waiting for cap flushes to complete, but they aren't.

Add a field to the inode to track the highest cap flush tid seen for
that inode. Also, add a backpointer to the inode in the ceph_cap_flush
struct.

Change wait_caps_flush to wait in 60s intervals, dumping info about the
state of the outstanding cap flushes after each timeout.

Also, print pr_info messages if we end up dropping a FLUSH_ACK for an
inode onto the floor.

Reported-by: Patrick Donnelly <pdonnell@redhat.com>
URL: https://tracker.ceph.com/issues/51279
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/caps.c       | 17 +++++++++++++++--
 fs/ceph/inode.c      |  1 +
 fs/ceph/mds_client.c | 31 +++++++++++++++++++++++++++++--
 fs/ceph/super.h      |  2 ++
 4 files changed, 47 insertions(+), 4 deletions(-)

v3: more debugging has shown the client waiting on FLUSH_ACK messages
    that seem never to have arrived. Add some new printks if we end up
    dropping a FLUSH_ACK onto the floor.
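
For reference: wait_event_timeout() returns 0 if the timeout elapses
with the condition still false, and the number of jiffies remaining (at
least 1) if the condition is true, so a return of 1 means the flushes
completed just as the 60s timer expired — hence the extra pr_info in
that branch. When the client is stuck, the new dump would emit output
along these lines (values purely illustrative, derived from the format
strings in the patch below; fields are ino:snap, the caps being
flushed, the flush tid, the last FLUSH_ACK tid seen for that inode, and
the wake flag):

    dump_cap_flushes: still waiting for cap flushes through 1024:
    10000000123:fffffffffffffffe Fw 1019 1017 1
    10000000456:fffffffffffffffe Fwb 1022 0 0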

Comments

Luis Henriques July 30, 2021, 10:09 a.m. UTC | #1
Jeff Layton <jlayton@kernel.org> writes:

> We've had some cases of hung umounts in teuthology testing. It looks
> like the client is waiting for cap flushes to complete, but they aren't.
>
> Add a field to the inode to track the highest cap flush tid seen for
> that inode. Also, add a backpointer to the inode in the ceph_cap_flush
> struct.
>
> Change wait_caps_flush to wait in 60s intervals, dumping info about the
> state of the outstanding cap flushes after each timeout.
>
> Also, print pr_info messages if we end up dropping a FLUSH_ACK for an
> inode onto the floor.
>
> Reported-by: Patrick Donnelly <pdonnell@redhat.com>
> URL: https://tracker.ceph.com/issues/51279
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/ceph/caps.c       | 17 +++++++++++++++--
>  fs/ceph/inode.c      |  1 +
>  fs/ceph/mds_client.c | 31 +++++++++++++++++++++++++++++--
>  fs/ceph/super.h      |  2 ++
>  4 files changed, 47 insertions(+), 4 deletions(-)
>
> v3: more debugging has shown the client waiting on FLUSH_ACK messages
>     that seem never to have arrived. Add some new printks if we end up
>     dropping a FLUSH_ACK onto the floor.

Since you're adding debug printks, would it be worth also adding one in
mds_dispatch(), when __verify_registered_session(mdsc, s) < 0?

It's a wild guess, but the FLUSH_ACK could be dropped in that case too.
Not that I could spot any issue there, but since this seems to be
happening during umount...

Cheers,
-- 
Luis

Jeff Layton July 30, 2021, 1:32 p.m. UTC | #2
On Fri, 2021-07-30 at 11:09 +0100, Luis Henriques wrote:
> Jeff Layton <jlayton@kernel.org> writes:
>
> [...]
>
> > v3: more debugging has shown the client waiting on FLUSH_ACK messages
> >     that seem never to have arrived. Add some new printks if we end up
> >     dropping a FLUSH_ACK onto the floor.
>
> Since you're adding debug printks, would it be worth also adding one in
> mds_dispatch(), when __verify_registered_session(mdsc, s) < 0?
>
> It's a wild guess, but the FLUSH_ACK could be dropped in that case too.
> Not that I could spot any issue there, but since this seems to be
> happening during umount...


Good point. I had looked at that case and had sort of dismissed it in
this situation, but you're probably right. I've added a similar pr_info
for that case and pushed it to the repo after a little testing here. I
won't bother re-posting it though since the change is trivial.

Thanks,
-- 
Jeff Layton <jlayton@kernel.org>
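
The follow-up change Jeff mentions isn't shown in this thread. A minimal
sketch of what Luis suggested, assuming the mds_dispatch() structure in
fs/ceph/mds_client.c at the time — the message text and exact placement
here are illustrative, not the change Jeff actually pushed:

static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg)
{
	struct ceph_mds_session *s = con->private;
	struct ceph_mds_client *mdsc = s->s_mdsc;
	int type = le16_to_cpu(msg->hdr.type);

	mutex_lock(&mdsc->mutex);
	if (__verify_registered_session(mdsc, s) < 0) {
		/*
		 * Messages that race with session teardown are dropped
		 * here; a FLUSH_ACK lost this way would explain a stuck
		 * wait_caps_flush(). Log it rather than failing silently.
		 */
		pr_info("%s: dropping %s msg on unregistered mds%d session\n",
			__func__, ceph_msg_type_name(type), s->s_mds);
		mutex_unlock(&mdsc->mutex);
		goto out;
	}
	mutex_unlock(&mdsc->mutex);

	/* ... normal dispatch on type ... */
out:
	ceph_msg_put(msg);
}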

Patch

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 7ae83d06d48c..cb551c9e5867 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1829,6 +1829,7 @@ static u64 __mark_caps_flushing(struct inode *inode,
 	swap(cf, ci->i_prealloc_cap_flush);
 	cf->caps = flushing;
 	cf->wake = wake;
+	cf->ci = ci;
 
 	spin_lock(&mdsc->cap_dirty_lock);
 	list_del_init(&ci->i_dirty_item);
@@ -3588,6 +3589,10 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
 	bool wake_ci = false;
 	bool wake_mdsc = false;
 
+	/* track latest cap flush ack seen for this inode */
+	if (flush_tid > ci->i_last_cap_flush_ack)
+		ci->i_last_cap_flush_ack = flush_tid;
+
 	list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) {
 		/* Is this the one that was flushed? */
 		if (cf->tid == flush_tid)
@@ -4116,7 +4121,11 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	     (unsigned)seq);
 
 	if (!inode) {
-		dout(" i don't have ino %llx\n", vino.ino);
+		if (op == CEPH_CAP_OP_FLUSH_ACK)
+			pr_info("%s: can't find ino %llx:%llx for flush_ack!\n",
+				__func__, vino.snap, vino.ino);
+		else
+			dout(" i don't have ino %llx\n", vino.ino);
 
 		if (op == CEPH_CAP_OP_IMPORT) {
 			cap = ceph_get_cap(mdsc, NULL);
@@ -4169,10 +4178,14 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	spin_lock(&ci->i_ceph_lock);
 	cap = __get_cap_for_mds(ceph_inode(inode), session->s_mds);
 	if (!cap) {
-		dout(" no cap on %p ino %llx.%llx from mds%d\n",
+		dout(" no cap on %p ino %llx:%llx from mds%d\n",
 		     inode, ceph_ino(inode), ceph_snap(inode),
 		     session->s_mds);
 		spin_unlock(&ci->i_ceph_lock);
+		if (op == CEPH_CAP_OP_FLUSH_ACK)
+			pr_info("%s: no cap on %p ino %llx:%llx from mds%d for flush_ack!\n",
+				__func__, inode, ceph_ino(inode),
+				ceph_snap(inode), session->s_mds);
 		goto flush_cap_releases;
 	}
 
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 1bd2cc015913..84e4f112fc45 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -499,6 +499,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
 	INIT_LIST_HEAD(&ci->i_cap_snaps);
 	ci->i_head_snapc = NULL;
 	ci->i_snap_caps = 0;
+	ci->i_last_cap_flush_ack = 0;
 
 	ci->i_last_rd = ci->i_last_wr = jiffies - 3600 * HZ;
 	for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index c3fa0c0e4f6c..fc26527b8059 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2064,6 +2064,24 @@ static int check_caps_flush(struct ceph_mds_client *mdsc,
 	return ret;
 }
 
+static void dump_cap_flushes(struct ceph_mds_client *mdsc, u64 want_tid)
+{
+	struct ceph_cap_flush *cf;
+
+	pr_info("%s: still waiting for cap flushes through %llu\n:\n",
+		__func__, want_tid);
+	spin_lock(&mdsc->cap_dirty_lock);
+	list_for_each_entry(cf, &mdsc->cap_flush_list, g_list) {
+		if (cf->tid > want_tid)
+			break;
+		pr_info("%llx:%llx %s %llu %llu %d\n",
+			ceph_vinop(&cf->ci->vfs_inode),
+			ceph_cap_string(cf->caps), cf->tid,
+			cf->ci->i_last_cap_flush_ack, cf->wake);
+	}
+	spin_unlock(&mdsc->cap_dirty_lock);
+}
+
 /*
  * flush all dirty inode data to disk.
  *
@@ -2072,10 +2090,19 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc,
 static void wait_caps_flush(struct ceph_mds_client *mdsc,
 			    u64 want_flush_tid)
 {
+	long ret;
+
 	dout("check_caps_flush want %llu\n", want_flush_tid);
 
-	wait_event(mdsc->cap_flushing_wq,
-		   check_caps_flush(mdsc, want_flush_tid));
+	do {
+		ret = wait_event_timeout(mdsc->cap_flushing_wq,
+			   check_caps_flush(mdsc, want_flush_tid), 60 * HZ);
+		if (ret == 0)
+			dump_cap_flushes(mdsc, want_flush_tid);
+		else if (ret == 1)
+			pr_info("%s: condition evaluated to true after timeout!\n",
+				  __func__);
+	} while (ret == 0);
 
 	dout("check_caps_flush ok, flushed thru %llu\n", want_flush_tid);
 }
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 07eb542efa1d..d51d42a00f33 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -189,6 +189,7 @@ struct ceph_cap_flush {
 	bool wake; /* wake up flush waiters when finish ? */
 	struct list_head g_list; // global
 	struct list_head i_list; // per inode
+	struct ceph_inode_info *ci;
 };
 
 /*
@@ -388,6 +389,7 @@ struct ceph_inode_info {
 	struct ceph_snap_context *i_head_snapc;  /* set if wr_buffer_head > 0 or
 						    dirty|flushing caps */
 	unsigned i_snap_caps;           /* cap bits for snapped files */
+	u64 i_last_cap_flush_ack;		/* latest cap flush_ack tid for this inode */
 
 	unsigned long i_last_rd;
 	unsigned long i_last_wr;