diff mbox series

[v2,4/4] ceph: flush the mdlog before waiting on unsafe reqs

Message ID 20210705012257.182669-5-xiubli@redhat.com
State New
Headers show
Series flush the mdlog before waiting on unsafe reqs | expand

Commit Message

Xiubo Li July 5, 2021, 1:22 a.m. UTC
From: Xiubo Li <xiubli@redhat.com>

For client requests that will get both unsafe and safe replies from the
MDS daemons, the MDS side won't flush the mdlog (journal log)
immediately, because it considers that unnecessary. That's true for
most cases but not all — for example, the fsync request. The fsync must
wait until all the unsafe-replied requests have been safely replied to.

Normally, if multiple threads or clients are running, the whole mdlog
in the MDS daemons gets flushed in time because some request will
trigger the mdlog submit thread. So usually we won't see normal
operations stall for long. But when there is only one client with a
single thread running, the stall can be obvious, and in the worst case
it must wait up to 5 seconds for the mdlog to be flushed by the MDS's
tick thread periodically.

This patch triggers a flush of the mdlog in the relevant and auth
MDSes to which the in-flight requests were sent, just before waiting
for the unsafe requests to finish.

Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/ceph/caps.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)

Comments

Jeff Layton July 6, 2021, 11:42 a.m. UTC | #1
On Mon, 2021-07-05 at 09:22 +0800, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>

> 

> For the client requests who will have unsafe and safe replies from

> MDS daemons, in the MDS side the MDS daemons won't flush the mdlog

> (journal log) immediatelly, because they think it's unnecessary.

> That's true for most cases but not all, likes the fsync request.

> The fsync will wait until all the unsafe replied requests to be

> safely replied.

> 

> Normally if there have multiple threads or clients are running, the

> whole mdlog in MDS daemons could be flushed in time if any request

> will trigger the mdlog submit thread. So usually we won't experience

> the normal operations will stuck for a long time. But in case there

> has only one client with only thread is running, the stuck phenomenon

> maybe obvious and the worst case it must wait at most 5 seconds to

> wait the mdlog to be flushed by the MDS's tick thread periodically.

> 

> This patch will trigger to flush the mdlog in the relevant and auth

> MDSes to which the in-flight requests are sent just before waiting

> the unsafe requests to finish.

> 

> Signed-off-by: Xiubo Li <xiubli@redhat.com>

> ---

>  fs/ceph/caps.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++

>  1 file changed, 78 insertions(+)

> 

> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c

> index c6a3352a4d52..4b966c29d9b5 100644

> --- a/fs/ceph/caps.c

> +++ b/fs/ceph/caps.c

> @@ -2286,6 +2286,7 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)

>   */

>  static int unsafe_request_wait(struct inode *inode)

>  {

> +	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;

>  	struct ceph_inode_info *ci = ceph_inode(inode);

>  	struct ceph_mds_request *req1 = NULL, *req2 = NULL;

>  	int ret, err = 0;

> @@ -2305,6 +2306,82 @@ static int unsafe_request_wait(struct inode *inode)

>  	}

>  	spin_unlock(&ci->i_unsafe_lock);

>  

> +	/*

> +	 * Trigger to flush the journal logs in all the relevant MDSes

> +	 * manually, or in the worst case we must wait at most 5 seconds

> +	 * to wait the journal logs to be flushed by the MDSes periodically.

> +	 */

> +	if (req1 || req2) {

> +		struct ceph_mds_session **sessions = NULL;

> +		struct ceph_mds_session *s;

> +		struct ceph_mds_request *req;

> +		unsigned int max;

> +		int i;

> +

> +		/*

> +		 * The mdsc->max_sessions is unlikely to be changed

> +		 * mostly, here we will retry it by reallocating the

> +		 * sessions arrary memory to get rid of the mdsc->mutex

> +		 * lock.

> +		 */

> +retry:

> +		max = mdsc->max_sessions;

> +		sessions = krealloc(sessions, max * sizeof(s), __GFP_ZERO);


The kerneldoc over krealloc() says:

 * The contents of the object pointed to are preserved up to the
 * lesser of the new and old sizes (__GFP_ZERO flag is effectively
ignored).

This code however relies on krealloc zeroing out the new part of the
allocation. Do you know for certain that that works?

> +		if (!sessions) {

> +			err = -ENOMEM;

> +			goto out;

> +		}

> +		spin_lock(&ci->i_unsafe_lock);

> +		if (req1) {

> +			list_for_each_entry(req, &ci->i_unsafe_dirops,

> +					    r_unsafe_dir_item) {

> +				s = req->r_session;

> +				if (unlikely(s->s_mds > max)) {

> +					spin_unlock(&ci->i_unsafe_lock);

> +					goto retry;

> +				}

> +				if (!sessions[s->s_mds]) {

> +					s = ceph_get_mds_session(s);

> +					sessions[s->s_mds] = s;


nit: maybe just do:

    sessions[s->s_mds] = ceph_get_mds_session(s);


> +				}

> +			}

> +		}

> +		if (req2) {

> +			list_for_each_entry(req, &ci->i_unsafe_iops,

> +					    r_unsafe_target_item) {

> +				s = req->r_session;

> +				if (unlikely(s->s_mds > max)) {

> +					spin_unlock(&ci->i_unsafe_lock);

> +					goto retry;

> +				}

> +				if (!sessions[s->s_mds]) {

> +					s = ceph_get_mds_session(s);

> +					sessions[s->s_mds] = s;

> +				}

> +			}

> +		}

> +		spin_unlock(&ci->i_unsafe_lock);

> +

> +		/* the auth MDS */

> +		spin_lock(&ci->i_ceph_lock);

> +		if (ci->i_auth_cap) {

> +		      s = ci->i_auth_cap->session;

> +		      if (!sessions[s->s_mds])

> +			      sessions[s->s_mds] = ceph_get_mds_session(s);

> +		}

> +		spin_unlock(&ci->i_ceph_lock);

> +

> +		/* send flush mdlog request to MDSes */

> +		for (i = 0; i < max; i++) {

> +			s = sessions[i];

> +			if (s) {

> +				send_flush_mdlog(s);

> +				ceph_put_mds_session(s);

> +			}

> +		}

> +		kfree(sessions);

> +	}

> +

>  	dout("unsafe_request_wait %p wait on tid %llu %llu\n",

>  	     inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);

>  	if (req1) {

> @@ -2321,6 +2398,7 @@ static int unsafe_request_wait(struct inode *inode)

>  			err = -EIO;

>  		ceph_mdsc_put_request(req2);

>  	}

> +out:

>  	return err;

>  }

>  


Otherwise the whole set looks pretty reasonable.

Thanks,
-- 
Jeff Layton <jlayton@kernel.org>
Xiubo Li July 6, 2021, 12:37 p.m. UTC | #2
On 7/6/21 7:42 PM, Jeff Layton wrote:
> On Mon, 2021-07-05 at 09:22 +0800, xiubli@redhat.com wrote:

>> From: Xiubo Li <xiubli@redhat.com>

>>

>> For the client requests who will have unsafe and safe replies from

>> MDS daemons, in the MDS side the MDS daemons won't flush the mdlog

>> (journal log) immediatelly, because they think it's unnecessary.

>> That's true for most cases but not all, likes the fsync request.

>> The fsync will wait until all the unsafe replied requests to be

>> safely replied.

>>

>> Normally if there have multiple threads or clients are running, the

>> whole mdlog in MDS daemons could be flushed in time if any request

>> will trigger the mdlog submit thread. So usually we won't experience

>> the normal operations will stuck for a long time. But in case there

>> has only one client with only thread is running, the stuck phenomenon

>> maybe obvious and the worst case it must wait at most 5 seconds to

>> wait the mdlog to be flushed by the MDS's tick thread periodically.

>>

>> This patch will trigger to flush the mdlog in the relevant and auth

>> MDSes to which the in-flight requests are sent just before waiting

>> the unsafe requests to finish.

>>

>> Signed-off-by: Xiubo Li <xiubli@redhat.com>

>> ---

>>   fs/ceph/caps.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++

>>   1 file changed, 78 insertions(+)

>>

>> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c

>> index c6a3352a4d52..4b966c29d9b5 100644

>> --- a/fs/ceph/caps.c

>> +++ b/fs/ceph/caps.c

>> @@ -2286,6 +2286,7 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)

>>    */

>>   static int unsafe_request_wait(struct inode *inode)

>>   {

>> +	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;

>>   	struct ceph_inode_info *ci = ceph_inode(inode);

>>   	struct ceph_mds_request *req1 = NULL, *req2 = NULL;

>>   	int ret, err = 0;

>> @@ -2305,6 +2306,82 @@ static int unsafe_request_wait(struct inode *inode)

>>   	}

>>   	spin_unlock(&ci->i_unsafe_lock);

>>   

>> +	/*

>> +	 * Trigger to flush the journal logs in all the relevant MDSes

>> +	 * manually, or in the worst case we must wait at most 5 seconds

>> +	 * to wait the journal logs to be flushed by the MDSes periodically.

>> +	 */

>> +	if (req1 || req2) {

>> +		struct ceph_mds_session **sessions = NULL;

>> +		struct ceph_mds_session *s;

>> +		struct ceph_mds_request *req;

>> +		unsigned int max;

>> +		int i;

>> +

>> +		/*

>> +		 * The mdsc->max_sessions is unlikely to be changed

>> +		 * mostly, here we will retry it by reallocating the

>> +		 * sessions arrary memory to get rid of the mdsc->mutex

>> +		 * lock.

>> +		 */

>> +retry:

>> +		max = mdsc->max_sessions;

>> +		sessions = krealloc(sessions, max * sizeof(s), __GFP_ZERO);

> The kerneldoc over krealloc() says:

>

>   * The contents of the object pointed to are preserved up to the

>   * lesser of the new and old sizes (__GFP_ZERO flag is effectively

> ignored).

>

> This code however relies on krealloc zeroing out the new part of the

> allocation. Do you know for certain that that works?


I read the krealloc() code; the "__GFP_ZERO flag will be ignored" note
applies only to the preserved contents. If the slab really needs to
allocate a new object, it will zero it first and then copy the old
contents into it, so the newly added part stays zeroed.


>> +		if (!sessions) {

>> +			err = -ENOMEM;

>> +			goto out;

>> +		}

>> +		spin_lock(&ci->i_unsafe_lock);

>> +		if (req1) {

>> +			list_for_each_entry(req, &ci->i_unsafe_dirops,

>> +					    r_unsafe_dir_item) {

>> +				s = req->r_session;

>> +				if (unlikely(s->s_mds > max)) {

>> +					spin_unlock(&ci->i_unsafe_lock);

>> +					goto retry;

>> +				}

>> +				if (!sessions[s->s_mds]) {

>> +					s = ceph_get_mds_session(s);

>> +					sessions[s->s_mds] = s;

> nit: maybe just do:

>

>      sessions[s->s_mds] = ceph_get_mds_session(s);


Then it will exceed 80 chars for this line. Should we ignore it here ?

Thanks.

>

>

>> +				}

>> +			}

>> +		}

>> +		if (req2) {

>> +			list_for_each_entry(req, &ci->i_unsafe_iops,

>> +					    r_unsafe_target_item) {

>> +				s = req->r_session;

>> +				if (unlikely(s->s_mds > max)) {

>> +					spin_unlock(&ci->i_unsafe_lock);

>> +					goto retry;

>> +				}

>> +				if (!sessions[s->s_mds]) {

>> +					s = ceph_get_mds_session(s);

>> +					sessions[s->s_mds] = s;

>> +				}

>> +			}

>> +		}

>> +		spin_unlock(&ci->i_unsafe_lock);

>> +

>> +		/* the auth MDS */

>> +		spin_lock(&ci->i_ceph_lock);

>> +		if (ci->i_auth_cap) {

>> +		      s = ci->i_auth_cap->session;

>> +		      if (!sessions[s->s_mds])

>> +			      sessions[s->s_mds] = ceph_get_mds_session(s);

>> +		}

>> +		spin_unlock(&ci->i_ceph_lock);

>> +

>> +		/* send flush mdlog request to MDSes */

>> +		for (i = 0; i < max; i++) {

>> +			s = sessions[i];

>> +			if (s) {

>> +				send_flush_mdlog(s);

>> +				ceph_put_mds_session(s);

>> +			}

>> +		}

>> +		kfree(sessions);

>> +	}

>> +

>>   	dout("unsafe_request_wait %p wait on tid %llu %llu\n",

>>   	     inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);

>>   	if (req1) {

>> @@ -2321,6 +2398,7 @@ static int unsafe_request_wait(struct inode *inode)

>>   			err = -EIO;

>>   		ceph_mdsc_put_request(req2);

>>   	}

>> +out:

>>   	return err;

>>   }

>>   

> Otherwise the whole set looks pretty reasonable.

>

> Thanks,
Jeff Layton July 6, 2021, 1:11 p.m. UTC | #3
On Tue, 2021-07-06 at 20:37 +0800, Xiubo Li wrote:
> On 7/6/21 7:42 PM, Jeff Layton wrote:

> > On Mon, 2021-07-05 at 09:22 +0800, xiubli@redhat.com wrote:

> > > From: Xiubo Li <xiubli@redhat.com>

> > > 

> > > For the client requests who will have unsafe and safe replies from

> > > MDS daemons, in the MDS side the MDS daemons won't flush the mdlog

> > > (journal log) immediatelly, because they think it's unnecessary.

> > > That's true for most cases but not all, likes the fsync request.

> > > The fsync will wait until all the unsafe replied requests to be

> > > safely replied.

> > > 

> > > Normally if there have multiple threads or clients are running, the

> > > whole mdlog in MDS daemons could be flushed in time if any request

> > > will trigger the mdlog submit thread. So usually we won't experience

> > > the normal operations will stuck for a long time. But in case there

> > > has only one client with only thread is running, the stuck phenomenon

> > > maybe obvious and the worst case it must wait at most 5 seconds to

> > > wait the mdlog to be flushed by the MDS's tick thread periodically.

> > > 

> > > This patch will trigger to flush the mdlog in the relevant and auth

> > > MDSes to which the in-flight requests are sent just before waiting

> > > the unsafe requests to finish.

> > > 

> > > Signed-off-by: Xiubo Li <xiubli@redhat.com>

> > > ---

> > >   fs/ceph/caps.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++

> > >   1 file changed, 78 insertions(+)

> > > 

> > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c

> > > index c6a3352a4d52..4b966c29d9b5 100644

> > > --- a/fs/ceph/caps.c

> > > +++ b/fs/ceph/caps.c

> > > @@ -2286,6 +2286,7 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)

> > >    */

> > >   static int unsafe_request_wait(struct inode *inode)

> > >   {

> > > +	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;

> > >   	struct ceph_inode_info *ci = ceph_inode(inode);

> > >   	struct ceph_mds_request *req1 = NULL, *req2 = NULL;

> > >   	int ret, err = 0;

> > > @@ -2305,6 +2306,82 @@ static int unsafe_request_wait(struct inode *inode)

> > >   	}

> > >   	spin_unlock(&ci->i_unsafe_lock);

> > >   

> > > +	/*

> > > +	 * Trigger to flush the journal logs in all the relevant MDSes

> > > +	 * manually, or in the worst case we must wait at most 5 seconds

> > > +	 * to wait the journal logs to be flushed by the MDSes periodically.

> > > +	 */

> > > +	if (req1 || req2) {

> > > +		struct ceph_mds_session **sessions = NULL;

> > > +		struct ceph_mds_session *s;

> > > +		struct ceph_mds_request *req;

> > > +		unsigned int max;

> > > +		int i;

> > > +

> > > +		/*

> > > +		 * The mdsc->max_sessions is unlikely to be changed

> > > +		 * mostly, here we will retry it by reallocating the

> > > +		 * sessions arrary memory to get rid of the mdsc->mutex

> > > +		 * lock.

> > > +		 */

> > > +retry:

> > > +		max = mdsc->max_sessions;

> > > +		sessions = krealloc(sessions, max * sizeof(s), __GFP_ZERO);

> > The kerneldoc over krealloc() says:

> > 

> >   * The contents of the object pointed to are preserved up to the

> >   * lesser of the new and old sizes (__GFP_ZERO flag is effectively

> > ignored).

> > 

> > This code however relies on krealloc zeroing out the new part of the

> > allocation. Do you know for certain that that works?

> 

> I readed the krealloc() code, the "__GFP_ZERO flag will be ignored" only 

> for the preserved contents. If the slab really needs to allocate a new 

> object, the slab will help zero it first and then copy the old contents 

> to it, the new part will keep zeroed.

> 

> 


Ok, and in the case where it's an initial kmalloc, that will be done
with __GFP_ZERO so any remaining space in the allocation will already be
zeroed. That works.

> > > +		if (!sessions) {

> > > +			err = -ENOMEM;

> > > +			goto out;

> > > +		}

> > > +		spin_lock(&ci->i_unsafe_lock);

> > > +		if (req1) {

> > > +			list_for_each_entry(req, &ci->i_unsafe_dirops,

> > > +					    r_unsafe_dir_item) {

> > > +				s = req->r_session;

> > > +				if (unlikely(s->s_mds > max)) {

> > > +					spin_unlock(&ci->i_unsafe_lock);

> > > +					goto retry;

> > > +				}

> > > +				if (!sessions[s->s_mds]) {

> > > +					s = ceph_get_mds_session(s);

> > > +					sessions[s->s_mds] = s;

> > nit: maybe just do:

> > 

> >      sessions[s->s_mds] = ceph_get_mds_session(s);

> 

> Then it will exceed 80 chars for this line. Should we ignore it here ?

> 


I probably would have but it's not worth respinning over all by itself.

It might also be possible to do all of this without taking the
i_unsafe_lock twice, but that too probably won't make much difference.

I'll give these a closer look and probably merge into testing branch
later today unless I see a problem.

Thanks!
Jeff

> > 

> > > +				}

> > > +			}

> > > +		}

> > > +		if (req2) {

> > > +			list_for_each_entry(req, &ci->i_unsafe_iops,

> > > +					    r_unsafe_target_item) {

> > > +				s = req->r_session;

> > > +				if (unlikely(s->s_mds > max)) {

> > > +					spin_unlock(&ci->i_unsafe_lock);

> > > +					goto retry;

> > > +				}

> > > +				if (!sessions[s->s_mds]) {

> > > +					s = ceph_get_mds_session(s);

> > > +					sessions[s->s_mds] = s;

> > > +				}

> > > +			}

> > > +		}

> > > +		spin_unlock(&ci->i_unsafe_lock);

> > > +

> > > +		/* the auth MDS */

> > > +		spin_lock(&ci->i_ceph_lock);

> > > +		if (ci->i_auth_cap) {

> > > +		      s = ci->i_auth_cap->session;

> > > +		      if (!sessions[s->s_mds])

> > > +			      sessions[s->s_mds] = ceph_get_mds_session(s);

> > > +		}

> > > +		spin_unlock(&ci->i_ceph_lock);

> > > +

> > > +		/* send flush mdlog request to MDSes */

> > > +		for (i = 0; i < max; i++) {

> > > +			s = sessions[i];

> > > +			if (s) {

> > > +				send_flush_mdlog(s);

> > > +				ceph_put_mds_session(s);

> > > +			}

> > > +		}

> > > +		kfree(sessions);

> > > +	}

> > > +

> > >   	dout("unsafe_request_wait %p wait on tid %llu %llu\n",

> > >   	     inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);

> > >   	if (req1) {

> > > @@ -2321,6 +2398,7 @@ static int unsafe_request_wait(struct inode *inode)

> > >   			err = -EIO;

> > >   		ceph_mdsc_put_request(req2);

> > >   	}

> > > +out:

> > >   	return err;

> > >   }

> > >   

> > Otherwise the whole set looks pretty reasonable.

> > 

> > Thanks,

> 


-- 
Jeff Layton <jlayton@kernel.org>
Xiubo Li July 6, 2021, 1:17 p.m. UTC | #4
On 7/6/21 9:11 PM, Jeff Layton wrote:
> On Tue, 2021-07-06 at 20:37 +0800, Xiubo Li wrote:

>> On 7/6/21 7:42 PM, Jeff Layton wrote:

>>> On Mon, 2021-07-05 at 09:22 +0800, xiubli@redhat.com wrote:

>>>> From: Xiubo Li <xiubli@redhat.com>

>>>>

>>>> For the client requests who will have unsafe and safe replies from

>>>> MDS daemons, in the MDS side the MDS daemons won't flush the mdlog

>>>> (journal log) immediatelly, because they think it's unnecessary.

>>>> That's true for most cases but not all, likes the fsync request.

>>>> The fsync will wait until all the unsafe replied requests to be

>>>> safely replied.

>>>>

>>>> Normally if there have multiple threads or clients are running, the

>>>> whole mdlog in MDS daemons could be flushed in time if any request

>>>> will trigger the mdlog submit thread. So usually we won't experience

>>>> the normal operations will stuck for a long time. But in case there

>>>> has only one client with only thread is running, the stuck phenomenon

>>>> maybe obvious and the worst case it must wait at most 5 seconds to

>>>> wait the mdlog to be flushed by the MDS's tick thread periodically.

>>>>

>>>> This patch will trigger to flush the mdlog in the relevant and auth

>>>> MDSes to which the in-flight requests are sent just before waiting

>>>> the unsafe requests to finish.

>>>>

>>>> Signed-off-by: Xiubo Li <xiubli@redhat.com>

>>>> ---

>>>>    fs/ceph/caps.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++

>>>>    1 file changed, 78 insertions(+)

>>>>

>>>> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c

>>>> index c6a3352a4d52..4b966c29d9b5 100644

>>>> --- a/fs/ceph/caps.c

>>>> +++ b/fs/ceph/caps.c

>>>> @@ -2286,6 +2286,7 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)

>>>>     */

>>>>    static int unsafe_request_wait(struct inode *inode)

>>>>    {

>>>> +	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;

>>>>    	struct ceph_inode_info *ci = ceph_inode(inode);

>>>>    	struct ceph_mds_request *req1 = NULL, *req2 = NULL;

>>>>    	int ret, err = 0;

>>>> @@ -2305,6 +2306,82 @@ static int unsafe_request_wait(struct inode *inode)

>>>>    	}

>>>>    	spin_unlock(&ci->i_unsafe_lock);

>>>>    

>>>> +	/*

>>>> +	 * Trigger to flush the journal logs in all the relevant MDSes

>>>> +	 * manually, or in the worst case we must wait at most 5 seconds

>>>> +	 * to wait the journal logs to be flushed by the MDSes periodically.

>>>> +	 */

>>>> +	if (req1 || req2) {

>>>> +		struct ceph_mds_session **sessions = NULL;

>>>> +		struct ceph_mds_session *s;

>>>> +		struct ceph_mds_request *req;

>>>> +		unsigned int max;

>>>> +		int i;

>>>> +

>>>> +		/*

>>>> +		 * The mdsc->max_sessions is unlikely to be changed

>>>> +		 * mostly, here we will retry it by reallocating the

>>>> +		 * sessions arrary memory to get rid of the mdsc->mutex

>>>> +		 * lock.

>>>> +		 */

>>>> +retry:

>>>> +		max = mdsc->max_sessions;

>>>> +		sessions = krealloc(sessions, max * sizeof(s), __GFP_ZERO);

>>> The kerneldoc over krealloc() says:

>>>

>>>    * The contents of the object pointed to are preserved up to the

>>>    * lesser of the new and old sizes (__GFP_ZERO flag is effectively

>>> ignored).

>>>

>>> This code however relies on krealloc zeroing out the new part of the

>>> allocation. Do you know for certain that that works?

>> I readed the krealloc() code, the "__GFP_ZERO flag will be ignored" only

>> for the preserved contents. If the slab really needs to allocate a new

>> object, the slab will help zero it first and then copy the old contents

>> to it, the new part will keep zeroed.

>>

>>

> Ok, and in the case where it's an initial kmalloc, that will be done

> with __GFP_ZERO so any remaining space in the allocation will already be

> zeroed. That works.


Yeah, it is.


>

>>>> +		if (!sessions) {

>>>> +			err = -ENOMEM;

>>>> +			goto out;

>>>> +		}

>>>> +		spin_lock(&ci->i_unsafe_lock);

>>>> +		if (req1) {

>>>> +			list_for_each_entry(req, &ci->i_unsafe_dirops,

>>>> +					    r_unsafe_dir_item) {

>>>> +				s = req->r_session;

>>>> +				if (unlikely(s->s_mds > max)) {

>>>> +					spin_unlock(&ci->i_unsafe_lock);

>>>> +					goto retry;

>>>> +				}

>>>> +				if (!sessions[s->s_mds]) {

>>>> +					s = ceph_get_mds_session(s);

>>>> +					sessions[s->s_mds] = s;

>>> nit: maybe just do:

>>>

>>>       sessions[s->s_mds] = ceph_get_mds_session(s);

>> Then it will exceed 80 chars for this line. Should we ignore it here ?

>>

> I probably would have but it's not worth respinning over all by itself.

>

> It might also be possible to do all of this without taking the

> i_unsafe_lock twice, but that too probably won't make much difference.

>

> I'll give these a closer look and probably merge into testing branch

> later today unless I see a problem.


Sure, thanks Jeff.


>

> Thanks!

> Jeff

>

>>>> +				}

>>>> +			}

>>>> +		}

>>>> +		if (req2) {

>>>> +			list_for_each_entry(req, &ci->i_unsafe_iops,

>>>> +					    r_unsafe_target_item) {

>>>> +				s = req->r_session;

>>>> +				if (unlikely(s->s_mds > max)) {

>>>> +					spin_unlock(&ci->i_unsafe_lock);

>>>> +					goto retry;

>>>> +				}

>>>> +				if (!sessions[s->s_mds]) {

>>>> +					s = ceph_get_mds_session(s);

>>>> +					sessions[s->s_mds] = s;

>>>> +				}

>>>> +			}

>>>> +		}

>>>> +		spin_unlock(&ci->i_unsafe_lock);

>>>> +

>>>> +		/* the auth MDS */

>>>> +		spin_lock(&ci->i_ceph_lock);

>>>> +		if (ci->i_auth_cap) {

>>>> +		      s = ci->i_auth_cap->session;

>>>> +		      if (!sessions[s->s_mds])

>>>> +			      sessions[s->s_mds] = ceph_get_mds_session(s);

>>>> +		}

>>>> +		spin_unlock(&ci->i_ceph_lock);

>>>> +

>>>> +		/* send flush mdlog request to MDSes */

>>>> +		for (i = 0; i < max; i++) {

>>>> +			s = sessions[i];

>>>> +			if (s) {

>>>> +				send_flush_mdlog(s);

>>>> +				ceph_put_mds_session(s);

>>>> +			}

>>>> +		}

>>>> +		kfree(sessions);

>>>> +	}

>>>> +

>>>>    	dout("unsafe_request_wait %p wait on tid %llu %llu\n",

>>>>    	     inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);

>>>>    	if (req1) {

>>>> @@ -2321,6 +2398,7 @@ static int unsafe_request_wait(struct inode *inode)

>>>>    			err = -EIO;

>>>>    		ceph_mdsc_put_request(req2);

>>>>    	}

>>>> +out:

>>>>    	return err;

>>>>    }

>>>>    

>>> Otherwise the whole set looks pretty reasonable.

>>>

>>> Thanks,
diff mbox series

Patch

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index c6a3352a4d52..4b966c29d9b5 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2286,6 +2286,7 @@  static int caps_are_flushed(struct inode *inode, u64 flush_tid)
  */
 static int unsafe_request_wait(struct inode *inode)
 {
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_mds_request *req1 = NULL, *req2 = NULL;
 	int ret, err = 0;
@@ -2305,6 +2306,82 @@  static int unsafe_request_wait(struct inode *inode)
 	}
 	spin_unlock(&ci->i_unsafe_lock);
 
+	/*
+	 * Trigger a flush of the journal logs in all the relevant MDSes
+	 * manually, or in the worst case we must wait up to 5 seconds
+	 * for the journal logs to be flushed by the MDSes periodically.
+	 */
+	if (req1 || req2) {
+		struct ceph_mds_session **sessions = NULL;
+		struct ceph_mds_session *s;
+		struct ceph_mds_request *req;
+		unsigned int max;
+		int i;
+
+		/*
+		 * mdsc->max_sessions is unlikely to change often; retry
+		 * by reallocating the sessions array whenever it grows,
+		 * which lets us avoid taking the mdsc->mutex lock.
+		 */
+retry:
+		max = mdsc->max_sessions;
+		sessions = krealloc(sessions, max * sizeof(s), __GFP_ZERO);
+		if (!sessions) {
+			err = -ENOMEM;
+			goto out;
+		}
+		spin_lock(&ci->i_unsafe_lock);
+		if (req1) {
+			list_for_each_entry(req, &ci->i_unsafe_dirops,
+					    r_unsafe_dir_item) {
+				s = req->r_session;
+				if (unlikely(s->s_mds >= max)) {
+					spin_unlock(&ci->i_unsafe_lock);
+					goto retry;
+				}
+				if (!sessions[s->s_mds]) {
+					s = ceph_get_mds_session(s);
+					sessions[s->s_mds] = s;
+				}
+			}
+		}
+		if (req2) {
+			list_for_each_entry(req, &ci->i_unsafe_iops,
+					    r_unsafe_target_item) {
+				s = req->r_session;
+				if (unlikely(s->s_mds >= max)) {
+					spin_unlock(&ci->i_unsafe_lock);
+					goto retry;
+				}
+				if (!sessions[s->s_mds]) {
+					s = ceph_get_mds_session(s);
+					sessions[s->s_mds] = s;
+				}
+			}
+		}
+		spin_unlock(&ci->i_unsafe_lock);
+
+		/* the auth MDS */
+		spin_lock(&ci->i_ceph_lock);
+		if (ci->i_auth_cap) {
+		      s = ci->i_auth_cap->session;
+		      if (!sessions[s->s_mds])
+			      sessions[s->s_mds] = ceph_get_mds_session(s);
+		}
+		spin_unlock(&ci->i_ceph_lock);
+
+		/* send flush mdlog request to MDSes */
+		for (i = 0; i < max; i++) {
+			s = sessions[i];
+			if (s) {
+				send_flush_mdlog(s);
+				ceph_put_mds_session(s);
+			}
+		}
+		kfree(sessions);
+	}
+
 	dout("unsafe_request_wait %p wait on tid %llu %llu\n",
 	     inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
 	if (req1) {
@@ -2321,6 +2398,7 @@  static int unsafe_request_wait(struct inode *inode)
 			err = -EIO;
 		ceph_mdsc_put_request(req2);
 	}
+out:
 	return err;
 }