diff mbox series

ceph: try to reconnect to the export targets

Message ID 20210812041042.132984-1-xiubli@redhat.com
State New
Headers show
Series ceph: try to reconnect to the export targets | expand

Commit Message

Xiubo Li Aug. 12, 2021, 4:10 a.m. UTC
From: Xiubo Li <xiubli@redhat.com>

If the export MDS crashes just after the EImportStart journal is
flushed, a standby MDS takes over and, while replaying the
EImportStart journal, waits for the client to reconnect. The client,
however, may never have registered/opened the sessions.

We will try to reconnect to those MDSes if they're in the export
targets and in the RECONNECT state.

Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/ceph/mds_client.c | 58 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 57 insertions(+), 1 deletion(-)

Comments

Jeff Layton Aug. 16, 2021, 12:05 p.m. UTC | #1
On Thu, 2021-08-12 at 12:10 +0800, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>

> 

> In case the export MDS is crashed just after the EImportStart journal

> is flushed, so when a standby MDS takes over it and when replaying

> the EImportStart journal the MDS will wait the client to reconnect,

> but the client may never register/open the sessions yet.

> 

> We will try to reconnect that MDSes if they're in the export targets

> and in RECONNECT state.

> 

> Signed-off-by: Xiubo Li <xiubli@redhat.com>

> ---

>  fs/ceph/mds_client.c | 58 +++++++++++++++++++++++++++++++++++++++++++-

>  1 file changed, 57 insertions(+), 1 deletion(-)

> 

> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c

> index 14e44de05812..7dfe7a804320 100644

> --- a/fs/ceph/mds_client.c

> +++ b/fs/ceph/mds_client.c

> @@ -4182,13 +4182,24 @@ static void check_new_map(struct ceph_mds_client *mdsc,

>  			  struct ceph_mdsmap *newmap,

>  			  struct ceph_mdsmap *oldmap)

>  {

> -	int i;

> +	int i, err;

> +	int *export_targets;

>  	int oldstate, newstate;

>  	struct ceph_mds_session *s;

> +	struct ceph_mds_info *m_info;

>  

>  	dout("check_new_map new %u old %u\n",

>  	     newmap->m_epoch, oldmap->m_epoch);

>  

> +	m_info = newmap->m_info;

> +	export_targets = kcalloc(newmap->possible_max_rank, sizeof(int), GFP_NOFS);

> +	if (export_targets && m_info) {

> +		for (i = 0; i < m_info->num_export_targets; i++) {

> +			BUG_ON(m_info->export_targets[i] >= newmap->possible_max_rank);


In general, we shouldn't BUG() in response to bad info sent by the MDS.
It would probably be better to check these values in
ceph_mdsmap_decode() and return an error there if it doesn't look right.
That way we can just toss out the new map instead of crashing.

> +			export_targets[m_info->export_targets[i]] = 1;

> +		}

> +	}

> +

>  	for (i = 0; i < oldmap->possible_max_rank && i < mdsc->max_sessions; i++) {

>  		if (!mdsc->sessions[i])

>  			continue;

> @@ -4242,6 +4253,8 @@ static void check_new_map(struct ceph_mds_client *mdsc,

>  		if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&

>  		    newstate >= CEPH_MDS_STATE_RECONNECT) {

>  			mutex_unlock(&mdsc->mutex);

> +			if (export_targets)

> +				export_targets[i] = 0;

>  			send_mds_reconnect(mdsc, s);

>  			mutex_lock(&mdsc->mutex);

>  		}

> @@ -4264,6 +4277,47 @@ static void check_new_map(struct ceph_mds_client *mdsc,

>  		}

>  	}

>  

> +	for (i = 0; i < newmap->possible_max_rank; i++) {


The condition on this loop is slightly different from the one below it,
and I'm not sure why. Should this also be checking this?

    i < newmap->possible_max_rank && i < mdsc->max_sessions

...do we need to look at export targets where i >= mdsc->max_sessions ?

> +		if (!export_targets)

> +			break;

> +

> +		/*

> +		 * Only open and reconnect sessions that don't

> +		 * exist yet.

> +		 */

> +		if (!export_targets[i] || __have_session(mdsc, i))

> +			continue;

> +

> +		/*

> +		 * In case the export MDS is crashed just after

> +		 * the EImportStart journal is flushed, so when

> +		 * a standby MDS takes over it and is replaying

> +		 * the EImportStart journal the new MDS daemon

> +		 * will wait the client to reconnect it, but the

> +		 * client may never register/open the sessions

> +		 * yet.

> +		 *

> +		 * It will try to reconnect that MDS daemons if

> +		 * the MDSes are in the export targets and is the

> +		 * RECONNECT state.

> +		 */

> +		newstate = ceph_mdsmap_get_state(newmap, i);

> +		if (newstate != CEPH_MDS_STATE_RECONNECT)

> +			continue;

> +		s = __open_export_target_session(mdsc, i);

> +		if (IS_ERR(s)) {

> +			err = PTR_ERR(s);

> +			pr_err("failed to open export target session, err %d\n",

> +			       err);

> +			continue;

> +		}

> +		dout("send reconnect to target mds.%d\n", i);

> +		mutex_unlock(&mdsc->mutex);

> +		send_mds_reconnect(mdsc, s);

> +		mutex_lock(&mdsc->mutex);

> +		ceph_put_mds_session(s);


Suppose we end up in this part of the code, and we have to drop the
mdsc->mutex like this. What ensures that an earlier session in the array
won't end up going back into CEPH_MDS_STATE_RECONNECT before we can get
into the loop below? This looks racy.

> +	}

> +

>  	for (i = 0; i < newmap->possible_max_rank && i < mdsc->max_sessions; i++) {

>  		s = mdsc->sessions[i];

>  		if (!s)

> @@ -4278,6 +4332,8 @@ static void check_new_map(struct ceph_mds_client *mdsc,

>  			__open_export_target_sessions(mdsc, s);

>  		}

>  	}

> +

> +	kfree(export_targets);

>  }

>  

>  


-- 
Jeff Layton <jlayton@kernel.org>
Xiubo Li Aug. 17, 2021, 3:03 a.m. UTC | #2
On 8/16/21 8:05 PM, Jeff Layton wrote:
> On Thu, 2021-08-12 at 12:10 +0800, xiubli@redhat.com wrote:

>> From: Xiubo Li <xiubli@redhat.com>

>>

>> In case the export MDS is crashed just after the EImportStart journal

>> is flushed, so when a standby MDS takes over it and when replaying

>> the EImportStart journal the MDS will wait the client to reconnect,

>> but the client may never register/open the sessions yet.

>>

>> We will try to reconnect that MDSes if they're in the export targets

>> and in RECONNECT state.

>>

>> Signed-off-by: Xiubo Li <xiubli@redhat.com>

>> ---

>>   fs/ceph/mds_client.c | 58 +++++++++++++++++++++++++++++++++++++++++++-

>>   1 file changed, 57 insertions(+), 1 deletion(-)

>>

>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c

>> index 14e44de05812..7dfe7a804320 100644

>> --- a/fs/ceph/mds_client.c

>> +++ b/fs/ceph/mds_client.c

>> @@ -4182,13 +4182,24 @@ static void check_new_map(struct ceph_mds_client *mdsc,

>>   			  struct ceph_mdsmap *newmap,

>>   			  struct ceph_mdsmap *oldmap)

>>   {

>> -	int i;

>> +	int i, err;

>> +	int *export_targets;

>>   	int oldstate, newstate;

>>   	struct ceph_mds_session *s;

>> +	struct ceph_mds_info *m_info;

>>   

>>   	dout("check_new_map new %u old %u\n",

>>   	     newmap->m_epoch, oldmap->m_epoch);

>>   

>> +	m_info = newmap->m_info;

>> +	export_targets = kcalloc(newmap->possible_max_rank, sizeof(int), GFP_NOFS);

>> +	if (export_targets && m_info) {

>> +		for (i = 0; i < m_info->num_export_targets; i++) {

>> +			BUG_ON(m_info->export_targets[i] >= newmap->possible_max_rank);

> In general, we shouldn't BUG() in response to bad info sent by the MDS.

> It would probably be better to check these values in

> ceph_mdsmap_decode() and return an error there if it doesn't look right.

> That way we can just toss out the new map instead of crashing.


Sounds reasonable, will fix it.


>> +			export_targets[m_info->export_targets[i]] = 1;

>> +		}

>> +	}

>> +

>>   	for (i = 0; i < oldmap->possible_max_rank && i < mdsc->max_sessions; i++) {

>>   		if (!mdsc->sessions[i])

>>   			continue;

>> @@ -4242,6 +4253,8 @@ static void check_new_map(struct ceph_mds_client *mdsc,

>>   		if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&

>>   		    newstate >= CEPH_MDS_STATE_RECONNECT) {

>>   			mutex_unlock(&mdsc->mutex);

>> +			if (export_targets)

>> +				export_targets[i] = 0;

>>   			send_mds_reconnect(mdsc, s);

>>   			mutex_lock(&mdsc->mutex);

>>   		}

>> @@ -4264,6 +4277,47 @@ static void check_new_map(struct ceph_mds_client *mdsc,

>>   		}

>>   	}

>>   

>> +	for (i = 0; i < newmap->possible_max_rank; i++) {

> The condition on this loop is slightly different from the one below it,

> and I'm not sure why. Should this also be checking this?

>

>      i < newmap->possible_max_rank && i < mdsc->max_sessions

>

> ...do we need to look at export targets where i >= mdsc->max_sessions ?


No, in this loop I am skipping that check on purpose.

Just after the importing MDS daemon receives the export info, it saves
that info in the EImportStart journal and force-opens the related
client sessions. This force open only prepares the sessions in the
importing MDS daemon and journals the session info; the connections
with the clients are not established until the clients connect to it
seconds later. So if the importing MDS crashes before that, the clients
may have no record in mdsc->sessions[] for those sessions.

From my tests, the rank numbers in export_targets in the new mdsmap may
be larger than mdsc->max_sessions.

When a standby MDS replays that session journal, it restores the state
of those sessions and waits for the clients to reconnect. In this loop
we only need to establish connections for the sessions that are not yet
in mdsc->sessions[].


>> +		if (!export_targets)

>> +			break;

>> +

>> +		/*

>> +		 * Only open and reconnect sessions that don't

>> +		 * exist yet.

>> +		 */

>> +		if (!export_targets[i] || __have_session(mdsc, i))

>> +			continue;

>> +

>> +		/*

>> +		 * In case the export MDS is crashed just after

>> +		 * the EImportStart journal is flushed, so when

>> +		 * a standby MDS takes over it and is replaying

>> +		 * the EImportStart journal the new MDS daemon

>> +		 * will wait the client to reconnect it, but the

>> +		 * client may never register/open the sessions

>> +		 * yet.

>> +		 *

>> +		 * It will try to reconnect that MDS daemons if

>> +		 * the MDSes are in the export targets and is the

>> +		 * RECONNECT state.

>> +		 */

>> +		newstate = ceph_mdsmap_get_state(newmap, i);

>> +		if (newstate != CEPH_MDS_STATE_RECONNECT)

>> +			continue;

>> +		s = __open_export_target_session(mdsc, i);

>> +		if (IS_ERR(s)) {

>> +			err = PTR_ERR(s);

>> +			pr_err("failed to open export target session, err %d\n",

>> +			       err);

>> +			continue;

>> +		}

>> +		dout("send reconnect to target mds.%d\n", i);

>> +		mutex_unlock(&mdsc->mutex);

>> +		send_mds_reconnect(mdsc, s);

>> +		mutex_lock(&mdsc->mutex);

>> +		ceph_put_mds_session(s);

> Suppose we end up in this part of the code, and we have to drop the

> mdsc->mutex like this. What ensures that an earlier session in the array

> won't end up going back into CEPH_MDS_STATE_RECONNECT before we can get

> into the loop below? This looks racy.


I am not sure I'm totally understanding this.

If my understanding is correct, you may mean:

The session may be registered and opened by some request that picks a
random MDS during the window in which mdsc->mutex is unlocked in the
loop above.

If so, I can fix it by dropping the '__have_session(mdsc, i)' check and
trying to get the existing session instead: if it exists, just reconnect
it; if not, register --> open --> reconnect it.


The following loop only tries to open sessions for laggy MDSes that
have not been replaced by standby ones. Once a specific rank is replaced
by a standby MDS, it should no longer be in the laggy state. So once a
rank has been handled in the reconnect loop above, it should not be
handled in the following loop.

Thanks


>

>> +	}

>> +

>>   	for (i = 0; i < newmap->possible_max_rank && i < mdsc->max_sessions; i++) {

>>   		s = mdsc->sessions[i];

>>   		if (!s)

>> @@ -4278,6 +4332,8 @@ static void check_new_map(struct ceph_mds_client *mdsc,

>>   			__open_export_target_sessions(mdsc, s);

>>   		}

>>   	}

>> +

>> +	kfree(export_targets);

>>   }

>>   

>>
Gregory Farnum Aug. 17, 2021, 5:56 p.m. UTC | #3
On Mon, Aug 16, 2021 at 5:06 AM Jeff Layton <jlayton@kernel.org> wrote:
>

> On Thu, 2021-08-12 at 12:10 +0800, xiubli@redhat.com wrote:

> > From: Xiubo Li <xiubli@redhat.com>

> >

> > In case the export MDS is crashed just after the EImportStart journal

> > is flushed, so when a standby MDS takes over it and when replaying

> > the EImportStart journal the MDS will wait the client to reconnect,

> > but the client may never register/open the sessions yet.

> >

> > We will try to reconnect that MDSes if they're in the export targets

> > and in RECONNECT state.

> >

> > Signed-off-by: Xiubo Li <xiubli@redhat.com>

> > ---

> >  fs/ceph/mds_client.c | 58 +++++++++++++++++++++++++++++++++++++++++++-

> >  1 file changed, 57 insertions(+), 1 deletion(-)

> >

> > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c

> > index 14e44de05812..7dfe7a804320 100644

> > --- a/fs/ceph/mds_client.c

> > +++ b/fs/ceph/mds_client.c

> > @@ -4182,13 +4182,24 @@ static void check_new_map(struct ceph_mds_client *mdsc,

> >                         struct ceph_mdsmap *newmap,

> >                         struct ceph_mdsmap *oldmap)

> >  {

> > -     int i;

> > +     int i, err;

> > +     int *export_targets;

> >       int oldstate, newstate;

> >       struct ceph_mds_session *s;

> > +     struct ceph_mds_info *m_info;

> >

> >       dout("check_new_map new %u old %u\n",

> >            newmap->m_epoch, oldmap->m_epoch);

> >

> > +     m_info = newmap->m_info;

> > +     export_targets = kcalloc(newmap->possible_max_rank, sizeof(int), GFP_NOFS);

> > +     if (export_targets && m_info) {

> > +             for (i = 0; i < m_info->num_export_targets; i++) {

> > +                     BUG_ON(m_info->export_targets[i] >= newmap->possible_max_rank);

>

> In general, we shouldn't BUG() in response to bad info sent by the MDS.

> It would probably be better to check these values in

> ceph_mdsmap_decode() and return an error there if it doesn't look right.

> That way we can just toss out the new map instead of crashing.


While I agree we don’t want to crash on unexpected input from the
network, if we are tossing out a map we need to shut down the mount as
well. If we think the system metadata is invalid, that’s not really a
recoverable condition and continuing to do IO is a mistake from the
whole-system perspective — either the server has failed horribly or
there’s something the client doesn’t understand which may be critical
to correctness; either way there's a big problem with the basic system
operation. (I mean, if we hit this point obviously the server has
failed horribly since we should have gated it, but it may have failed
horribly in some non-code-logic fashion.)
-Greg

>

> > +                     export_targets[m_info->export_targets[i]] = 1;

> > +             }

> > +     }

> > +

> >       for (i = 0; i < oldmap->possible_max_rank && i < mdsc->max_sessions; i++) {

> >               if (!mdsc->sessions[i])

> >                       continue;

> > @@ -4242,6 +4253,8 @@ static void check_new_map(struct ceph_mds_client *mdsc,

> >               if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&

> >                   newstate >= CEPH_MDS_STATE_RECONNECT) {

> >                       mutex_unlock(&mdsc->mutex);

> > +                     if (export_targets)

> > +                             export_targets[i] = 0;

> >                       send_mds_reconnect(mdsc, s);

> >                       mutex_lock(&mdsc->mutex);

> >               }

> > @@ -4264,6 +4277,47 @@ static void check_new_map(struct ceph_mds_client *mdsc,

> >               }

> >       }

> >

> > +     for (i = 0; i < newmap->possible_max_rank; i++) {

>

> The condition on this loop is slightly different from the one below it,

> and I'm not sure why. Should this also be checking this?

>

>     i < newmap->possible_max_rank && i < mdsc->max_sessions

>

> ...do we need to look at export targets where i >= mdsc->max_sessions ?

>

> > +             if (!export_targets)

> > +                     break;

> > +

> > +             /*

> > +              * Only open and reconnect sessions that don't

> > +              * exist yet.

> > +              */

> > +             if (!export_targets[i] || __have_session(mdsc, i))

> > +                     continue;

> > +

> > +             /*

> > +              * In case the export MDS is crashed just after

> > +              * the EImportStart journal is flushed, so when

> > +              * a standby MDS takes over it and is replaying

> > +              * the EImportStart journal the new MDS daemon

> > +              * will wait the client to reconnect it, but the

> > +              * client may never register/open the sessions

> > +              * yet.

> > +              *

> > +              * It will try to reconnect that MDS daemons if

> > +              * the MDSes are in the export targets and is the

> > +              * RECONNECT state.

> > +              */

> > +             newstate = ceph_mdsmap_get_state(newmap, i);

> > +             if (newstate != CEPH_MDS_STATE_RECONNECT)

> > +                     continue;

> > +             s = __open_export_target_session(mdsc, i);

> > +             if (IS_ERR(s)) {

> > +                     err = PTR_ERR(s);

> > +                     pr_err("failed to open export target session, err %d\n",

> > +                            err);

> > +                     continue;

> > +             }

> > +             dout("send reconnect to target mds.%d\n", i);

> > +             mutex_unlock(&mdsc->mutex);

> > +             send_mds_reconnect(mdsc, s);

> > +             mutex_lock(&mdsc->mutex);

> > +             ceph_put_mds_session(s);

>

> Suppose we end up in this part of the code, and we have to drop the

> mdsc->mutex like this. What ensures that an earlier session in the array

> won't end up going back into CEPH_MDS_STATE_RECONNECT before we can get

> into the loop below? This looks racy.

>

> > +     }

> > +

> >       for (i = 0; i < newmap->possible_max_rank && i < mdsc->max_sessions; i++) {

> >               s = mdsc->sessions[i];

> >               if (!s)

> > @@ -4278,6 +4332,8 @@ static void check_new_map(struct ceph_mds_client *mdsc,

> >                       __open_export_target_sessions(mdsc, s);

> >               }

> >       }

> > +

> > +     kfree(export_targets);

> >  }

> >

> >

>

> --

> Jeff Layton <jlayton@kernel.org>

>
Jeff Layton Aug. 17, 2021, 6:14 p.m. UTC | #4
On Tue, 2021-08-17 at 10:56 -0700, Gregory Farnum wrote:
> On Mon, Aug 16, 2021 at 5:06 AM Jeff Layton <jlayton@kernel.org> wrote:

> > 

> > On Thu, 2021-08-12 at 12:10 +0800, xiubli@redhat.com wrote:

> > > From: Xiubo Li <xiubli@redhat.com>

> > > 

> > > In case the export MDS is crashed just after the EImportStart journal

> > > is flushed, so when a standby MDS takes over it and when replaying

> > > the EImportStart journal the MDS will wait the client to reconnect,

> > > but the client may never register/open the sessions yet.

> > > 

> > > We will try to reconnect that MDSes if they're in the export targets

> > > and in RECONNECT state.

> > > 

> > > Signed-off-by: Xiubo Li <xiubli@redhat.com>

> > > ---

> > >  fs/ceph/mds_client.c | 58 +++++++++++++++++++++++++++++++++++++++++++-

> > >  1 file changed, 57 insertions(+), 1 deletion(-)

> > > 

> > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c

> > > index 14e44de05812..7dfe7a804320 100644

> > > --- a/fs/ceph/mds_client.c

> > > +++ b/fs/ceph/mds_client.c

> > > @@ -4182,13 +4182,24 @@ static void check_new_map(struct ceph_mds_client *mdsc,

> > >                         struct ceph_mdsmap *newmap,

> > >                         struct ceph_mdsmap *oldmap)

> > >  {

> > > -     int i;

> > > +     int i, err;

> > > +     int *export_targets;

> > >       int oldstate, newstate;

> > >       struct ceph_mds_session *s;

> > > +     struct ceph_mds_info *m_info;

> > > 

> > >       dout("check_new_map new %u old %u\n",

> > >            newmap->m_epoch, oldmap->m_epoch);

> > > 

> > > +     m_info = newmap->m_info;

> > > +     export_targets = kcalloc(newmap->possible_max_rank, sizeof(int), GFP_NOFS);

> > > +     if (export_targets && m_info) {

> > > +             for (i = 0; i < m_info->num_export_targets; i++) {

> > > +                     BUG_ON(m_info->export_targets[i] >= newmap->possible_max_rank);

> > 

> > In general, we shouldn't BUG() in response to bad info sent by the MDS.

> > It would probably be better to check these values in

> > ceph_mdsmap_decode() and return an error there if it doesn't look right.

> > That way we can just toss out the new map instead of crashing.

> 

> While I agree we don’t want to crash on unexpected input from the

> network, if we are tossing out a map we need to shut down the mount as

> well. If we think the system metadata is invalid, that’s not really a

> recoverable condition and continuing to do IO is a mistake from the

> whole-system perspective — either the server has failed horribly or

> there’s something the client doesn’t understand which may be critical

> to correctness; either way there's a big problem with the basic system

> operation. (I mean, if we hit this point obviously the server has

> failed horribly since we should have gated it, but it may have failed

> horribly in some non-code-logic fashion.)

> -Greg

> 


I see this as essentially the same as any other parsing error in the
mdsmap. When we hit one of those, we currently just do this:

    pr_err("error decoding fsmap\n");

...and soldier on. It's been this way since the beginning, afaict.

If we want to do something more involved there, then that could probably
be done, but it's not as simple as throwing a switch. We may have open
files and dirty data to deal with. We do have some code to deal with
attempting to reconnect after a blacklist event, so you might be able to
treat this similarly. 

In any case, this would be a pretty unusual situation, and I don't see
us having the manpower to spend on coding up an elegant solution to this
potential problem anytime soon. It might be worth opening a tracker for
it though if that changes in the future.


> > 

> > > +                     export_targets[m_info->export_targets[i]] = 1;

> > > +             }

> > > +     }

> > > +

> > >       for (i = 0; i < oldmap->possible_max_rank && i < mdsc->max_sessions; i++) {

> > >               if (!mdsc->sessions[i])

> > >                       continue;

> > > @@ -4242,6 +4253,8 @@ static void check_new_map(struct ceph_mds_client *mdsc,

> > >               if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&

> > >                   newstate >= CEPH_MDS_STATE_RECONNECT) {

> > >                       mutex_unlock(&mdsc->mutex);

> > > +                     if (export_targets)

> > > +                             export_targets[i] = 0;

> > >                       send_mds_reconnect(mdsc, s);

> > >                       mutex_lock(&mdsc->mutex);

> > >               }

> > > @@ -4264,6 +4277,47 @@ static void check_new_map(struct ceph_mds_client *mdsc,

> > >               }

> > >       }

> > > 

> > > +     for (i = 0; i < newmap->possible_max_rank; i++) {

> > 

> > The condition on this loop is slightly different from the one below it,

> > and I'm not sure why. Should this also be checking this?

> > 

> >     i < newmap->possible_max_rank && i < mdsc->max_sessions

> > 

> > ...do we need to look at export targets where i >= mdsc->max_sessions ?

> > 

> > > +             if (!export_targets)

> > > +                     break;

> > > +

> > > +             /*

> > > +              * Only open and reconnect sessions that don't

> > > +              * exist yet.

> > > +              */

> > > +             if (!export_targets[i] || __have_session(mdsc, i))

> > > +                     continue;

> > > +

> > > +             /*

> > > +              * In case the export MDS is crashed just after

> > > +              * the EImportStart journal is flushed, so when

> > > +              * a standby MDS takes over it and is replaying

> > > +              * the EImportStart journal the new MDS daemon

> > > +              * will wait the client to reconnect it, but the

> > > +              * client may never register/open the sessions

> > > +              * yet.

> > > +              *

> > > +              * It will try to reconnect that MDS daemons if

> > > +              * the MDSes are in the export targets and is the

> > > +              * RECONNECT state.

> > > +              */

> > > +             newstate = ceph_mdsmap_get_state(newmap, i);

> > > +             if (newstate != CEPH_MDS_STATE_RECONNECT)

> > > +                     continue;

> > > +             s = __open_export_target_session(mdsc, i);

> > > +             if (IS_ERR(s)) {

> > > +                     err = PTR_ERR(s);

> > > +                     pr_err("failed to open export target session, err %d\n",

> > > +                            err);

> > > +                     continue;

> > > +             }

> > > +             dout("send reconnect to target mds.%d\n", i);

> > > +             mutex_unlock(&mdsc->mutex);

> > > +             send_mds_reconnect(mdsc, s);

> > > +             mutex_lock(&mdsc->mutex);

> > > +             ceph_put_mds_session(s);

> > 

> > Suppose we end up in this part of the code, and we have to drop the

> > mdsc->mutex like this. What ensures that an earlier session in the array

> > won't end up going back into CEPH_MDS_STATE_RECONNECT before we can get

> > into the loop below? This looks racy.

> > 

> > > +     }

> > > +

> > >       for (i = 0; i < newmap->possible_max_rank && i < mdsc->max_sessions; i++) {

> > >               s = mdsc->sessions[i];

> > >               if (!s)

> > > @@ -4278,6 +4332,8 @@ static void check_new_map(struct ceph_mds_client *mdsc,

> > >                       __open_export_target_sessions(mdsc, s);

> > >               }

> > >       }

> > > +

> > > +     kfree(export_targets);

> > >  }

> > > 

> > > 

> > 

> > --

> > Jeff Layton <jlayton@kernel.org>

> > 

> 


-- 
Jeff Layton <jlayton@kernel.org>
Gregory Farnum Aug. 17, 2021, 6:48 p.m. UTC | #5
On Tue, Aug 17, 2021 at 11:14 AM Jeff Layton <jlayton@kernel.org> wrote:
> On Tue, 2021-08-17 at 10:56 -0700, Gregory Farnum wrote:

> > While I agree we don’t want to crash on unexpected input from the

> > network, if we are tossing out a map we need to shut down the mount as

> > well. If we think the system metadata is invalid, that’s not really a

> > recoverable condition and continuing to do IO is a mistake from the

> > whole-system perspective — either the server has failed horribly or

> > there’s something the client doesn’t understand which may be critical

> > to correctness; either way there's a big problem with the basic system

> > operation. (I mean, if we hit this point obviously the server has

> > failed horribly since we should have gated it, but it may have failed

> > horribly in some non-code-logic fashion.)

> > -Greg

> >

>

> I see this as essentially the same as any other parsing error in the

> mdsmap. When we hit one of those, we currently just do this:

>

>     pr_err("error decoding fsmap\n");

>

> ...and soldier on. It's been this way since the beginning, afaict.


Oh. That's, uh, interesting.
I mean, you're right, this case isn't any more special. I just didn't
know that's how the kernel client handles it. (The userspace client
inherits the usual userspace decode logic and any accompanying
asserts.)

> If we want to do something more involved there, then that could probably

> be done, but it's not as simple as throwing a switch. We may have open

> files and dirty data to deal with. We do have some code to deal with

> attempting to reconnect after a blacklist event, so you might be able to

> treat this similarly.


Hmm, my guess is this only happens if the MDS is spewing nonsense out
over its pipe, or we've made a logic error and let a client join
across a non-backwards-compatible encoding/feature change. I think we
probably just start throwing EIO and don't try to remount, rather than
going for anything more polite. *shrug*

> In any case, this would be a pretty unusual situation, and I don't see

> us having the manpower to spend on coding up an elegant solution to this

> potential problem anytime soon. It might be worth opening a tracker for

> it though if that changes in the future.


Makes sense. Ticket done: https://tracker.ceph.com/issues/52303
-Greg

>

>

> > >

> > > > +                     export_targets[m_info->export_targets[i]] = 1;

> > > > +             }

> > > > +     }

> > > > +

> > > >       for (i = 0; i < oldmap->possible_max_rank && i < mdsc->max_sessions; i++) {

> > > >               if (!mdsc->sessions[i])

> > > >                       continue;

> > > > @@ -4242,6 +4253,8 @@ static void check_new_map(struct ceph_mds_client *mdsc,

> > > >               if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&

> > > >                   newstate >= CEPH_MDS_STATE_RECONNECT) {

> > > >                       mutex_unlock(&mdsc->mutex);

> > > > +                     if (export_targets)

> > > > +                             export_targets[i] = 0;

> > > >                       send_mds_reconnect(mdsc, s);

> > > >                       mutex_lock(&mdsc->mutex);

> > > >               }

> > > > @@ -4264,6 +4277,47 @@ static void check_new_map(struct ceph_mds_client *mdsc,

> > > >               }

> > > >       }

> > > >

> > > > +     for (i = 0; i < newmap->possible_max_rank; i++) {

> > >

> > > The condition on this loop is slightly different from the one below it,

> > > and I'm not sure why. Should this also be checking this?

> > >

> > >     i < newmap->possible_max_rank && i < mdsc->max_sessions

> > >

> > > ...do we need to look at export targets where i >= mdsc->max_sessions ?

> > >

> > > > +             if (!export_targets)

> > > > +                     break;

> > > > +

> > > > +             /*

> > > > +              * Only open and reconnect sessions that don't

> > > > +              * exist yet.

> > > > +              */

> > > > +             if (!export_targets[i] || __have_session(mdsc, i))

> > > > +                     continue;

> > > > +

> > > > +             /*

> > > > +              * In case the export MDS is crashed just after

> > > > +              * the EImportStart journal is flushed, so when

> > > > +              * a standby MDS takes over it and is replaying

> > > > +              * the EImportStart journal the new MDS daemon

> > > > +              * will wait the client to reconnect it, but the

> > > > +              * client may never register/open the sessions

> > > > +              * yet.

> > > > +              *

> > > > +              * It will try to reconnect that MDS daemons if

> > > > +              * the MDSes are in the export targets and is the

> > > > +              * RECONNECT state.

> > > > +              */

> > > > +             newstate = ceph_mdsmap_get_state(newmap, i);

> > > > +             if (newstate != CEPH_MDS_STATE_RECONNECT)

> > > > +                     continue;

> > > > +             s = __open_export_target_session(mdsc, i);

> > > > +             if (IS_ERR(s)) {

> > > > +                     err = PTR_ERR(s);

> > > > +                     pr_err("failed to open export target session, err %d\n",

> > > > +                            err);

> > > > +                     continue;

> > > > +             }

> > > > +             dout("send reconnect to target mds.%d\n", i);

> > > > +             mutex_unlock(&mdsc->mutex);

> > > > +             send_mds_reconnect(mdsc, s);

> > > > +             mutex_lock(&mdsc->mutex);

> > > > +             ceph_put_mds_session(s);

> > >

> > > Suppose we end up in this part of the code, and we have to drop the

> > > mdsc->mutex like this. What ensures that an earlier session in the array

> > > won't end up going back into CEPH_MDS_STATE_RECONNECT before we can get

> > > into the loop below? This looks racy.

> > >

> > > > +     }

> > > > +

> > > >       for (i = 0; i < newmap->possible_max_rank && i < mdsc->max_sessions; i++) {

> > > >               s = mdsc->sessions[i];

> > > >               if (!s)

> > > > @@ -4278,6 +4332,8 @@ static void check_new_map(struct ceph_mds_client *mdsc,

> > > >                       __open_export_target_sessions(mdsc, s);

> > > >               }

> > > >       }

> > > > +

> > > > +     kfree(export_targets);

> > > >  }

> > > >

> > > >

> > >

> > > --

> > > Jeff Layton <jlayton@kernel.org>

> > >

> >

>

> --

> Jeff Layton <jlayton@kernel.org>

>
diff mbox series

Patch

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 14e44de05812..7dfe7a804320 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -4182,13 +4182,24 @@  static void check_new_map(struct ceph_mds_client *mdsc,
 			  struct ceph_mdsmap *newmap,
 			  struct ceph_mdsmap *oldmap)
 {
-	int i;
+	int i, err;
+	int *export_targets;
 	int oldstate, newstate;
 	struct ceph_mds_session *s;
+	struct ceph_mds_info *m_info;
 
 	dout("check_new_map new %u old %u\n",
 	     newmap->m_epoch, oldmap->m_epoch);
 
+	m_info = newmap->m_info;
+	export_targets = kcalloc(newmap->possible_max_rank, sizeof(int), GFP_NOFS);
+	if (export_targets && m_info) {
+		for (i = 0; i < m_info->num_export_targets; i++) {
+			BUG_ON(m_info->export_targets[i] >= newmap->possible_max_rank);
+			export_targets[m_info->export_targets[i]] = 1;
+		}
+	}
+
 	for (i = 0; i < oldmap->possible_max_rank && i < mdsc->max_sessions; i++) {
 		if (!mdsc->sessions[i])
 			continue;
@@ -4242,6 +4253,8 @@  static void check_new_map(struct ceph_mds_client *mdsc,
 		if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
 		    newstate >= CEPH_MDS_STATE_RECONNECT) {
 			mutex_unlock(&mdsc->mutex);
+			if (export_targets)
+				export_targets[i] = 0;
 			send_mds_reconnect(mdsc, s);
 			mutex_lock(&mdsc->mutex);
 		}
@@ -4264,6 +4277,47 @@  static void check_new_map(struct ceph_mds_client *mdsc,
 		}
 	}
 
+	for (i = 0; i < newmap->possible_max_rank; i++) {
+		if (!export_targets)
+			break;
+
+		/*
+		 * Only open and reconnect sessions that don't
+		 * exist yet.
+		 */
+		if (!export_targets[i] || __have_session(mdsc, i))
+			continue;
+
+		/*
+		 * In case the export MDS is crashed just after
+		 * the EImportStart journal is flushed, so when
+		 * a standby MDS takes over it and is replaying
+		 * the EImportStart journal the new MDS daemon
+		 * will wait the client to reconnect it, but the
+		 * client may never register/open the sessions
+		 * yet.
+		 *
+		 * It will try to reconnect that MDS daemons if
+		 * the MDSes are in the export targets and is the
+		 * RECONNECT state.
+		 */
+		newstate = ceph_mdsmap_get_state(newmap, i);
+		if (newstate != CEPH_MDS_STATE_RECONNECT)
+			continue;
+		s = __open_export_target_session(mdsc, i);
+		if (IS_ERR(s)) {
+			err = PTR_ERR(s);
+			pr_err("failed to open export target session, err %d\n",
+			       err);
+			continue;
+		}
+		dout("send reconnect to target mds.%d\n", i);
+		mutex_unlock(&mdsc->mutex);
+		send_mds_reconnect(mdsc, s);
+		mutex_lock(&mdsc->mutex);
+		ceph_put_mds_session(s);
+	}
+
 	for (i = 0; i < newmap->possible_max_rank && i < mdsc->max_sessions; i++) {
 		s = mdsc->sessions[i];
 		if (!s)
@@ -4278,6 +4332,8 @@  static void check_new_map(struct ceph_mds_client *mdsc,
 			__open_export_target_sessions(mdsc, s);
 		}
 	}
+
+	kfree(export_targets);
 }