diff mbox series

ceph: blocklist the kclient when receiving corrupt snap trace

Message ID 20221205061514.50423-1-xiubli@redhat.com
State Superseded
Headers show
Series ceph: blocklist the kclient when receiving corrupt snap trace | expand

Commit Message

Xiubo Li Dec. 5, 2022, 6:15 a.m. UTC
From: Xiubo Li <xiubli@redhat.com>

When received corrupted snap trace we don't know what exactly has
happened in MDS side. And we shouldn't continue writing to OSD,
which may corrupt the snapshot contents.

Just try to blocklist this client and If fails we need to crash the
client instead of leaving it writeable to OSDs.

URL: https://tracker.ceph.com/issues/57686
Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/ceph/mds_client.c |  3 ++-
 fs/ceph/mds_client.h |  1 +
 fs/ceph/snap.c       | 25 +++++++++++++++++++++++++
 3 files changed, 28 insertions(+), 1 deletion(-)

Comments

Aaron Tomlin Dec. 6, 2022, 10:32 a.m. UTC | #1
On Mon 2022-12-05 14:15 +0800, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
> 
> When received corrupted snap trace we don't know what exactly has
> happened in MDS side. And we shouldn't continue writing to OSD,
> which may corrupt the snapshot contents.
> 
> Just try to blocklist this client and If fails we need to crash the
> client instead of leaving it writeable to OSDs.
> 
> URL: https://tracker.ceph.com/issues/57686
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  fs/ceph/mds_client.c |  3 ++-
>  fs/ceph/mds_client.h |  1 +
>  fs/ceph/snap.c       | 25 +++++++++++++++++++++++++
>  3 files changed, 28 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index cbbaf334b6b8..59094944af28 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -5648,7 +5648,8 @@ static void mds_peer_reset(struct ceph_connection *con)
>  	struct ceph_mds_client *mdsc = s->s_mdsc;
>  
>  	pr_warn("mds%d closed our session\n", s->s_mds);
> -	send_mds_reconnect(mdsc, s);
> +	if (!mdsc->no_reconnect)
> +		send_mds_reconnect(mdsc, s);
>  }
>  
>  static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg)
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index 728b7d72bf76..8e8f0447c0ad 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -413,6 +413,7 @@ struct ceph_mds_client {
>  	atomic_t		num_sessions;
>  	int                     max_sessions;  /* len of sessions array */
>  	int                     stopping;      /* true if shutting down */
> +	int                     no_reconnect;  /* true if snap trace is corrupted */
>  
>  	atomic64_t		quotarealms_count; /* # realms with quota */
>  	/*
> diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
> index c1c452afa84d..5b211ec7d7f7 100644
> --- a/fs/ceph/snap.c
> +++ b/fs/ceph/snap.c
> @@ -767,8 +767,10 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
>  	struct ceph_snap_realm *realm;
>  	struct ceph_snap_realm *first_realm = NULL;
>  	struct ceph_snap_realm *realm_to_rebuild = NULL;
> +	struct ceph_client *client = mdsc->fsc->client;
>  	int rebuild_snapcs;
>  	int err = -ENOMEM;
> +	int ret;
>  	LIST_HEAD(dirty_realms);
>  
>  	lockdep_assert_held_write(&mdsc->snap_rwsem);
> @@ -885,6 +887,29 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
>  	if (first_realm)
>  		ceph_put_snap_realm(mdsc, first_realm);
>  	pr_err("%s error %d\n", __func__, err);
> +
> +	/*
> +	 * When received corrupted snap trace we don't know what
> +	 * exactly has happened in MDS side. And we shouldn't continue
> +	 * writing to OSD, which may corrupt the snapshot contents.
> +	 *
> +	 * Just try to blocklist this client and if fails we need to
> +	 * crash the client instead of leaving it writeable to OSDs.
> +	 *
> +	 * Then this kclient must be remounted to continue after the
> +	 * corrupted metadata fixed in MDS side.
> +	 */
> +	mdsc->no_reconnect = 1;
> +	ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr);
> +	if (ret) {
> +		pr_err("%s blocklist of %s failed: %d", __func__,
> +		       ceph_pr_addr(&client->msgr.inst.addr), ret);
> +		BUG();
> +	}
> +	pr_err("%s %s was blocklisted, do remount to continue%s",
> +	       __func__, ceph_pr_addr(&client->msgr.inst.addr),
> +	       err == -EIO ? " after corrupted snaptrace fixed": "");
> +
>  	return err;
>  }

Hi Xiubo,

How about using a WARN() so that we taint the Linux kernel too:

diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 864cdaa0d2bd..1941584b8fdb 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -766,8 +766,10 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
 	struct ceph_snap_realm *realm = NULL;
 	struct ceph_snap_realm *first_realm = NULL;
 	struct ceph_snap_realm *realm_to_rebuild = NULL;
+	struct ceph_client *client = mdsc->fsc->client;
 	int rebuild_snapcs;
 	int err = -ENOMEM;
+	int ret;
 	LIST_HEAD(dirty_realms);
 
 	lockdep_assert_held_write(&mdsc->snap_rwsem);
@@ -883,6 +885,31 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
 	if (first_realm)
 		ceph_put_snap_realm(mdsc, first_realm);
 	pr_err("%s error %d\n", __func__, err);
+
+
+	/*
+	 * When received corrupted snap trace we don't know what
+	 * exactly has happened in MDS side. And we shouldn't continue
+	 * writing to OSD, which may corrupt the snapshot contents.
+	 *
+	 * Just try to blocklist this client and if fails we need to
+	 * crash the client instead of leaving it writeable to OSDs.
+	 *
+	 * Then this kclient must be remounted to continue after the
+	 * corrupted metadata fixed in MDS side.
+	 */
+	mdsc->no_reconnect = 1;
+	ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr);
+	if (ret) {
+		pr_err("%s blocklist of %s failed: %d", __func__,
+		       ceph_pr_addr(&client->msgr.inst.addr), ret);
+		BUG();
+	}
+
+	WARN(1, "%s %s was blocklisted, do remount to continue%s",
+	     __func__, ceph_pr_addr(&client->msgr.inst.addr),
+	     err == -EIO ? " after corrupted snaptrace fixed": "");
+
 	return err;
 }
Xiubo Li Dec. 6, 2022, 12:12 p.m. UTC | #2
On 06/12/2022 18:32, Aaron Tomlin wrote:
> On Mon 2022-12-05 14:15 +0800, xiubli@redhat.com wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> When received corrupted snap trace we don't know what exactly has
>> happened in MDS side. And we shouldn't continue writing to OSD,
>> which may corrupt the snapshot contents.
>>
>> Just try to blocklist this client and If fails we need to crash the
>> client instead of leaving it writeable to OSDs.
>>
>> URL: https://tracker.ceph.com/issues/57686
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>>   fs/ceph/mds_client.c |  3 ++-
>>   fs/ceph/mds_client.h |  1 +
>>   fs/ceph/snap.c       | 25 +++++++++++++++++++++++++
>>   3 files changed, 28 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>> index cbbaf334b6b8..59094944af28 100644
>> --- a/fs/ceph/mds_client.c
>> +++ b/fs/ceph/mds_client.c
>> @@ -5648,7 +5648,8 @@ static void mds_peer_reset(struct ceph_connection *con)
>>   	struct ceph_mds_client *mdsc = s->s_mdsc;
>>   
>>   	pr_warn("mds%d closed our session\n", s->s_mds);
>> -	send_mds_reconnect(mdsc, s);
>> +	if (!mdsc->no_reconnect)
>> +		send_mds_reconnect(mdsc, s);
>>   }
>>   
>>   static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg)
>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
>> index 728b7d72bf76..8e8f0447c0ad 100644
>> --- a/fs/ceph/mds_client.h
>> +++ b/fs/ceph/mds_client.h
>> @@ -413,6 +413,7 @@ struct ceph_mds_client {
>>   	atomic_t		num_sessions;
>>   	int                     max_sessions;  /* len of sessions array */
>>   	int                     stopping;      /* true if shutting down */
>> +	int                     no_reconnect;  /* true if snap trace is corrupted */
>>   
>>   	atomic64_t		quotarealms_count; /* # realms with quota */
>>   	/*
>> diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
>> index c1c452afa84d..5b211ec7d7f7 100644
>> --- a/fs/ceph/snap.c
>> +++ b/fs/ceph/snap.c
>> @@ -767,8 +767,10 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
>>   	struct ceph_snap_realm *realm;
>>   	struct ceph_snap_realm *first_realm = NULL;
>>   	struct ceph_snap_realm *realm_to_rebuild = NULL;
>> +	struct ceph_client *client = mdsc->fsc->client;
>>   	int rebuild_snapcs;
>>   	int err = -ENOMEM;
>> +	int ret;
>>   	LIST_HEAD(dirty_realms);
>>   
>>   	lockdep_assert_held_write(&mdsc->snap_rwsem);
>> @@ -885,6 +887,29 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
>>   	if (first_realm)
>>   		ceph_put_snap_realm(mdsc, first_realm);
>>   	pr_err("%s error %d\n", __func__, err);
>> +
>> +	/*
>> +	 * When received corrupted snap trace we don't know what
>> +	 * exactly has happened in MDS side. And we shouldn't continue
>> +	 * writing to OSD, which may corrupt the snapshot contents.
>> +	 *
>> +	 * Just try to blocklist this client and if fails we need to
>> +	 * crash the client instead of leaving it writeable to OSDs.
>> +	 *
>> +	 * Then this kclient must be remounted to continue after the
>> +	 * corrupted metadata fixed in MDS side.
>> +	 */
>> +	mdsc->no_reconnect = 1;
>> +	ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr);
>> +	if (ret) {
>> +		pr_err("%s blocklist of %s failed: %d", __func__,
>> +		       ceph_pr_addr(&client->msgr.inst.addr), ret);
>> +		BUG();
>> +	}
>> +	pr_err("%s %s was blocklisted, do remount to continue%s",
>> +	       __func__, ceph_pr_addr(&client->msgr.inst.addr),
>> +	       err == -EIO ? " after corrupted snaptrace fixed": "");
>> +
>>   	return err;
>>   }
> Hi Xiubo,
>
> How about using a WARN() so that we taint the Linux kernel too:

Yeah, this looks good to me :-)

Thanks Aaron.

- Xiubo

> diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
> index 864cdaa0d2bd..1941584b8fdb 100644
> --- a/fs/ceph/snap.c
> +++ b/fs/ceph/snap.c
> @@ -766,8 +766,10 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
>   	struct ceph_snap_realm *realm = NULL;
>   	struct ceph_snap_realm *first_realm = NULL;
>   	struct ceph_snap_realm *realm_to_rebuild = NULL;
> +	struct ceph_client *client = mdsc->fsc->client;
>   	int rebuild_snapcs;
>   	int err = -ENOMEM;
> +	int ret;
>   	LIST_HEAD(dirty_realms);
>   
>   	lockdep_assert_held_write(&mdsc->snap_rwsem);
> @@ -883,6 +885,31 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
>   	if (first_realm)
>   		ceph_put_snap_realm(mdsc, first_realm);
>   	pr_err("%s error %d\n", __func__, err);
> +
> +
> +	/*
> +	 * When received corrupted snap trace we don't know what
> +	 * exactly has happened in MDS side. And we shouldn't continue
> +	 * writing to OSD, which may corrupt the snapshot contents.
> +	 *
> +	 * Just try to blocklist this client and if fails we need to
> +	 * crash the client instead of leaving it writeable to OSDs.
> +	 *
> +	 * Then this kclient must be remounted to continue after the
> +	 * corrupted metadata fixed in MDS side.
> +	 */
> +	mdsc->no_reconnect = 1;
> +	ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr);
> +	if (ret) {
> +		pr_err("%s blocklist of %s failed: %d", __func__,
> +		       ceph_pr_addr(&client->msgr.inst.addr), ret);
> +		BUG();
> +	}
> +
> +	WARN(1, "%s %s was blocklisted, do remount to continue%s",
> +	     __func__, ceph_pr_addr(&client->msgr.inst.addr),
> +	     err == -EIO ? " after corrupted snaptrace fixed": "");
> +
>   	return err;
>   }
>
>
diff mbox series

Patch

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index cbbaf334b6b8..59094944af28 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -5648,7 +5648,8 @@  static void mds_peer_reset(struct ceph_connection *con)
 	struct ceph_mds_client *mdsc = s->s_mdsc;
 
 	pr_warn("mds%d closed our session\n", s->s_mds);
-	send_mds_reconnect(mdsc, s);
+	if (!mdsc->no_reconnect)
+		send_mds_reconnect(mdsc, s);
 }
 
 static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg)
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 728b7d72bf76..8e8f0447c0ad 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -413,6 +413,7 @@  struct ceph_mds_client {
 	atomic_t		num_sessions;
 	int                     max_sessions;  /* len of sessions array */
 	int                     stopping;      /* true if shutting down */
+	int                     no_reconnect;  /* true if snap trace is corrupted */
 
 	atomic64_t		quotarealms_count; /* # realms with quota */
 	/*
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index c1c452afa84d..5b211ec7d7f7 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -767,8 +767,10 @@  int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
 	struct ceph_snap_realm *realm;
 	struct ceph_snap_realm *first_realm = NULL;
 	struct ceph_snap_realm *realm_to_rebuild = NULL;
+	struct ceph_client *client = mdsc->fsc->client;
 	int rebuild_snapcs;
 	int err = -ENOMEM;
+	int ret;
 	LIST_HEAD(dirty_realms);
 
 	lockdep_assert_held_write(&mdsc->snap_rwsem);
@@ -885,6 +887,29 @@  int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
 	if (first_realm)
 		ceph_put_snap_realm(mdsc, first_realm);
 	pr_err("%s error %d\n", __func__, err);
+
+	/*
+	 * When received corrupted snap trace we don't know what
+	 * exactly has happened in MDS side. And we shouldn't continue
+	 * writing to OSD, which may corrupt the snapshot contents.
+	 *
+	 * Just try to blocklist this client and if fails we need to
+	 * crash the client instead of leaving it writeable to OSDs.
+	 *
+	 * Then this kclient must be remounted to continue after the
+	 * corrupted metadata fixed in MDS side.
+	 */
+	mdsc->no_reconnect = 1;
+	ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr);
+	if (ret) {
+		pr_err("%s blocklist of %s failed: %d", __func__,
+		       ceph_pr_addr(&client->msgr.inst.addr), ret);
+		BUG();
+	}
+	pr_err("%s %s was blocklisted, do remount to continue%s",
+	       __func__, ceph_pr_addr(&client->msgr.inst.addr),
+	       err == -EIO ? " after corrupted snaptrace fixed": "");
+
 	return err;
 }