| Message ID | 20221205061514.50423-1-xiubli@redhat.com |
|---|---|
| State | Superseded |
| Headers | show |
| Series | ceph: blocklist the kclient when receiving corrupt snap trace \| expand |
On Mon 2022-12-05 14:15 +0800, xiubli@redhat.com wrote: > From: Xiubo Li <xiubli@redhat.com> > > When received corrupted snap trace we don't know what exactly has > happened in MDS side. And we shouldn't continue writing to OSD, > which may corrupt the snapshot contents. > > Just try to blocklist this client and If fails we need to crash the > client instead of leaving it writeable to OSDs. > > URL: https://tracker.ceph.com/issues/57686 > Signed-off-by: Xiubo Li <xiubli@redhat.com> > --- > fs/ceph/mds_client.c | 3 ++- > fs/ceph/mds_client.h | 1 + > fs/ceph/snap.c | 25 +++++++++++++++++++++++++ > 3 files changed, 28 insertions(+), 1 deletion(-) > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > index cbbaf334b6b8..59094944af28 100644 > --- a/fs/ceph/mds_client.c > +++ b/fs/ceph/mds_client.c > @@ -5648,7 +5648,8 @@ static void mds_peer_reset(struct ceph_connection *con) > struct ceph_mds_client *mdsc = s->s_mdsc; > > pr_warn("mds%d closed our session\n", s->s_mds); > - send_mds_reconnect(mdsc, s); > + if (!mdsc->no_reconnect) > + send_mds_reconnect(mdsc, s); > } > > static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg) > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h > index 728b7d72bf76..8e8f0447c0ad 100644 > --- a/fs/ceph/mds_client.h > +++ b/fs/ceph/mds_client.h > @@ -413,6 +413,7 @@ struct ceph_mds_client { > atomic_t num_sessions; > int max_sessions; /* len of sessions array */ > int stopping; /* true if shutting down */ > + int no_reconnect; /* true if snap trace is corrupted */ > > atomic64_t quotarealms_count; /* # realms with quota */ > /* > diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c > index c1c452afa84d..5b211ec7d7f7 100644 > --- a/fs/ceph/snap.c > +++ b/fs/ceph/snap.c > @@ -767,8 +767,10 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, > struct ceph_snap_realm *realm; > struct ceph_snap_realm *first_realm = NULL; > struct ceph_snap_realm *realm_to_rebuild = NULL; > + struct ceph_client *client = 
mdsc->fsc->client; > int rebuild_snapcs; > int err = -ENOMEM; > + int ret; > LIST_HEAD(dirty_realms); > > lockdep_assert_held_write(&mdsc->snap_rwsem); > @@ -885,6 +887,29 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, > if (first_realm) > ceph_put_snap_realm(mdsc, first_realm); > pr_err("%s error %d\n", __func__, err); > + > + /* > + * When received corrupted snap trace we don't know what > + * exactly has happened in MDS side. And we shouldn't continue > + * writing to OSD, which may corrupt the snapshot contents. > + * > + * Just try to blocklist this client and if fails we need to > + * crash the client instead of leaving it writeable to OSDs. > + * > + * Then this kclient must be remounted to continue after the > + * corrupted metadata fixed in MDS side. > + */ > + mdsc->no_reconnect = 1; > + ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr); > + if (ret) { > + pr_err("%s blocklist of %s failed: %d", __func__, > + ceph_pr_addr(&client->msgr.inst.addr), ret); > + BUG(); > + } > + pr_err("%s %s was blocklisted, do remount to continue%s", > + __func__, ceph_pr_addr(&client->msgr.inst.addr), > + err == -EIO ? 
" after corrupted snaptrace fixed": ""); > + > return err; > } Hi Xiubo, How about using a WARN() so that we taint the Linux kernel too: diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 864cdaa0d2bd..1941584b8fdb 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -766,8 +766,10 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm = NULL; struct ceph_snap_realm *first_realm = NULL; struct ceph_snap_realm *realm_to_rebuild = NULL; + struct ceph_client *client = mdsc->fsc->client; int rebuild_snapcs; int err = -ENOMEM; + int ret; LIST_HEAD(dirty_realms); lockdep_assert_held_write(&mdsc->snap_rwsem); @@ -883,6 +885,31 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, if (first_realm) ceph_put_snap_realm(mdsc, first_realm); pr_err("%s error %d\n", __func__, err); + + + /* + * When received corrupted snap trace we don't know what + * exactly has happened in MDS side. And we shouldn't continue + * writing to OSD, which may corrupt the snapshot contents. + * + * Just try to blocklist this client and if fails we need to + * crash the client instead of leaving it writeable to OSDs. + * + * Then this kclient must be remounted to continue after the + * corrupted metadata fixed in MDS side. + */ + mdsc->no_reconnect = 1; + ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr); + if (ret) { + pr_err("%s blocklist of %s failed: %d", __func__, + ceph_pr_addr(&client->msgr.inst.addr), ret); + BUG(); + } + + WARN(1, "%s %s was blocklisted, do remount to continue%s", + __func__, ceph_pr_addr(&client->msgr.inst.addr), + err == -EIO ? " after corrupted snaptrace fixed": ""); + return err; }
On 06/12/2022 18:32, Aaron Tomlin wrote: > On Mon 2022-12-05 14:15 +0800, xiubli@redhat.com wrote: >> From: Xiubo Li <xiubli@redhat.com> >> >> When received corrupted snap trace we don't know what exactly has >> happened in MDS side. And we shouldn't continue writing to OSD, >> which may corrupt the snapshot contents. >> >> Just try to blocklist this client and If fails we need to crash the >> client instead of leaving it writeable to OSDs. >> >> URL: https://tracker.ceph.com/issues/57686 >> Signed-off-by: Xiubo Li <xiubli@redhat.com> >> --- >> fs/ceph/mds_client.c | 3 ++- >> fs/ceph/mds_client.h | 1 + >> fs/ceph/snap.c | 25 +++++++++++++++++++++++++ >> 3 files changed, 28 insertions(+), 1 deletion(-) >> >> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c >> index cbbaf334b6b8..59094944af28 100644 >> --- a/fs/ceph/mds_client.c >> +++ b/fs/ceph/mds_client.c >> @@ -5648,7 +5648,8 @@ static void mds_peer_reset(struct ceph_connection *con) >> struct ceph_mds_client *mdsc = s->s_mdsc; >> >> pr_warn("mds%d closed our session\n", s->s_mds); >> - send_mds_reconnect(mdsc, s); >> + if (!mdsc->no_reconnect) >> + send_mds_reconnect(mdsc, s); >> } >> >> static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg) >> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h >> index 728b7d72bf76..8e8f0447c0ad 100644 >> --- a/fs/ceph/mds_client.h >> +++ b/fs/ceph/mds_client.h >> @@ -413,6 +413,7 @@ struct ceph_mds_client { >> atomic_t num_sessions; >> int max_sessions; /* len of sessions array */ >> int stopping; /* true if shutting down */ >> + int no_reconnect; /* true if snap trace is corrupted */ >> >> atomic64_t quotarealms_count; /* # realms with quota */ >> /* >> diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c >> index c1c452afa84d..5b211ec7d7f7 100644 >> --- a/fs/ceph/snap.c >> +++ b/fs/ceph/snap.c >> @@ -767,8 +767,10 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, >> struct ceph_snap_realm *realm; >> struct ceph_snap_realm *first_realm = 
NULL; >> struct ceph_snap_realm *realm_to_rebuild = NULL; >> + struct ceph_client *client = mdsc->fsc->client; >> int rebuild_snapcs; >> int err = -ENOMEM; >> + int ret; >> LIST_HEAD(dirty_realms); >> >> lockdep_assert_held_write(&mdsc->snap_rwsem); >> @@ -885,6 +887,29 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, >> if (first_realm) >> ceph_put_snap_realm(mdsc, first_realm); >> pr_err("%s error %d\n", __func__, err); >> + >> + /* >> + * When received corrupted snap trace we don't know what >> + * exactly has happened in MDS side. And we shouldn't continue >> + * writing to OSD, which may corrupt the snapshot contents. >> + * >> + * Just try to blocklist this client and if fails we need to >> + * crash the client instead of leaving it writeable to OSDs. >> + * >> + * Then this kclient must be remounted to continue after the >> + * corrupted metadata fixed in MDS side. >> + */ >> + mdsc->no_reconnect = 1; >> + ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr); >> + if (ret) { >> + pr_err("%s blocklist of %s failed: %d", __func__, >> + ceph_pr_addr(&client->msgr.inst.addr), ret); >> + BUG(); >> + } >> + pr_err("%s %s was blocklisted, do remount to continue%s", >> + __func__, ceph_pr_addr(&client->msgr.inst.addr), >> + err == -EIO ? " after corrupted snaptrace fixed": ""); >> + >> return err; >> } > Hi Xiubo, > > How about using a WARN() so that we taint the Linux kernel too: Yeah, this looks good to me :-) Thanks Aaron. 
- Xiubo > diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c > index 864cdaa0d2bd..1941584b8fdb 100644 > --- a/fs/ceph/snap.c > +++ b/fs/ceph/snap.c > @@ -766,8 +766,10 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, > struct ceph_snap_realm *realm = NULL; > struct ceph_snap_realm *first_realm = NULL; > struct ceph_snap_realm *realm_to_rebuild = NULL; > + struct ceph_client *client = mdsc->fsc->client; > int rebuild_snapcs; > int err = -ENOMEM; > + int ret; > LIST_HEAD(dirty_realms); > > lockdep_assert_held_write(&mdsc->snap_rwsem); > @@ -883,6 +885,31 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, > if (first_realm) > ceph_put_snap_realm(mdsc, first_realm); > pr_err("%s error %d\n", __func__, err); > + > + > + /* > + * When received corrupted snap trace we don't know what > + * exactly has happened in MDS side. And we shouldn't continue > + * writing to OSD, which may corrupt the snapshot contents. > + * > + * Just try to blocklist this client and if fails we need to > + * crash the client instead of leaving it writeable to OSDs. > + * > + * Then this kclient must be remounted to continue after the > + * corrupted metadata fixed in MDS side. > + */ > + mdsc->no_reconnect = 1; > + ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr); > + if (ret) { > + pr_err("%s blocklist of %s failed: %d", __func__, > + ceph_pr_addr(&client->msgr.inst.addr), ret); > + BUG(); > + } > + > + WARN(1, "%s %s was blocklisted, do remount to continue%s", > + __func__, ceph_pr_addr(&client->msgr.inst.addr), > + err == -EIO ? " after corrupted snaptrace fixed": ""); > + > return err; > } > >
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index cbbaf334b6b8..59094944af28 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -5648,7 +5648,8 @@ static void mds_peer_reset(struct ceph_connection *con) struct ceph_mds_client *mdsc = s->s_mdsc; pr_warn("mds%d closed our session\n", s->s_mds); - send_mds_reconnect(mdsc, s); + if (!mdsc->no_reconnect) + send_mds_reconnect(mdsc, s); } static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg) diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 728b7d72bf76..8e8f0447c0ad 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -413,6 +413,7 @@ struct ceph_mds_client { atomic_t num_sessions; int max_sessions; /* len of sessions array */ int stopping; /* true if shutting down */ + int no_reconnect; /* true if snap trace is corrupted */ atomic64_t quotarealms_count; /* # realms with quota */ /* diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index c1c452afa84d..5b211ec7d7f7 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -767,8 +767,10 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm; struct ceph_snap_realm *first_realm = NULL; struct ceph_snap_realm *realm_to_rebuild = NULL; + struct ceph_client *client = mdsc->fsc->client; int rebuild_snapcs; int err = -ENOMEM; + int ret; LIST_HEAD(dirty_realms); lockdep_assert_held_write(&mdsc->snap_rwsem); @@ -885,6 +887,29 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, if (first_realm) ceph_put_snap_realm(mdsc, first_realm); pr_err("%s error %d\n", __func__, err); + + /* + * When received corrupted snap trace we don't know what + * exactly has happened in MDS side. And we shouldn't continue + * writing to OSD, which may corrupt the snapshot contents. + * + * Just try to blocklist this client and if fails we need to + * crash the client instead of leaving it writeable to OSDs. 
+ * + * Then this kclient must be remounted to continue after the + * corrupted metadata fixed in MDS side. + */ + mdsc->no_reconnect = 1; + ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr); + if (ret) { + pr_err("%s blocklist of %s failed: %d", __func__, + ceph_pr_addr(&client->msgr.inst.addr), ret); + BUG(); + } + pr_err("%s %s was blocklisted, do remount to continue%s", + __func__, ceph_pr_addr(&client->msgr.inst.addr), + err == -EIO ? " after corrupted snaptrace fixed": ""); + return err; }