diff mbox series

[V1] mmc: core: Enable force hw reset

Message ID 1650961818-13452-1-git-send-email-quic_spathi@quicinc.com
State New
Headers show
Series [V1] mmc: core: Enable force hw reset | expand

Commit Message

Srinivasarao Pathipati April 26, 2022, 8:30 a.m. UTC
From: Shaik Sajida Bhanu <sbhanu@codeaurora.org>

During error recovery set need hw reset to handle
ICE error where cqe reset is must.

Signed-off-by: Shaik Sajida Bhanu <sbhanu@codeaurora.org>
Signed-off-by: kamasali <quic_kamasali@quicinc.com>
Signed-off-by: Srinivasarao Pathipati <quic_spathi@quicinc.com>
---
 drivers/mmc/core/block.c      | 8 +++++---
 drivers/mmc/host/cqhci-core.c | 7 +++++--
 include/linux/mmc/host.h      | 1 +
 3 files changed, 11 insertions(+), 5 deletions(-)

Comments

Kamasali Satyanarayan (Consultant) (QUIC) May 24, 2022, 12:02 p.m. UTC | #1
Hi,
These patches will be further taken by Sarthak.

Thanks,
Satya

-----Original Message-----
From: Adrian Hunter <adrian.hunter@intel.com> 
Sent: Wednesday, April 27, 2022 6:04 PM
To: quic_spathi <quic_spathi@quicinc.com>; ulf.hansson@linaro.org; riteshh@codeaurora.org; asutoshd@codeaurora.org; axboe@kernel.dk; avri.altman@wdc.com; kch@nvidia.com; CLoehle@hyperstone.com; swboyd@chromium.org; digetx@gmail.com; bigeasy@linutronix.de; linux-mmc@vger.kernel.org; linux-kernel@vger.kernel.org
Cc: Shaik Sajida Bhanu <sbhanu@codeaurora.org>; Kamasali Satyanarayan (Consultant) (QUIC) <quic_kamasali@quicinc.com>
Subject: Re: [PATCH V1] mmc: core: Enable force hw reset

On 26/04/22 11:30, Srinivasarao Pathipati wrote:
> From: Shaik Sajida Bhanu <sbhanu@codeaurora.org>
> 
> During error recovery set need hw reset to handle ICE error where cqe 
> reset is must.

How do you get ICE errors?  Doesn't it mean either the hardware is broken or the configuration is broken?

> 
> Signed-off-by: Shaik Sajida Bhanu <sbhanu@codeaurora.org>
> Signed-off-by: kamasali <quic_kamasali@quicinc.com>
> Signed-off-by: Srinivasarao Pathipati <quic_spathi@quicinc.com>
> ---
>  drivers/mmc/core/block.c      | 8 +++++---
>  drivers/mmc/host/cqhci-core.c | 7 +++++--
>  include/linux/mmc/host.h      | 1 +
>  3 files changed, 11 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 
> b35e7a9..f63bf33 100644
> --- a/drivers/mmc/core/block.c
> +++ b/drivers/mmc/core/block.c
> @@ -1482,10 +1482,12 @@ void mmc_blk_cqe_recovery(struct mmc_queue *mq)
>  	pr_debug("%s: CQE recovery start\n", mmc_hostname(host));
>  
>  	err = mmc_cqe_recovery(host);
> -	if (err)
> +	if (err || host->need_hw_reset) {
>  		mmc_blk_reset(mq->blkdata, host, MMC_BLK_CQE_RECOVERY);
> -	else
> -		mmc_blk_reset_success(mq->blkdata, MMC_BLK_CQE_RECOVERY);
> +		if (host->need_hw_reset)
> +			host->need_hw_reset = false;
> +	}
> +	mmc_blk_reset_success(mq->blkdata, MMC_BLK_CQE_RECOVERY);
>  
>  	pr_debug("%s: CQE recovery done\n", mmc_hostname(host));  } diff 
> --git a/drivers/mmc/host/cqhci-core.c b/drivers/mmc/host/cqhci-core.c 
> index b0d30c3..311b510 100644
> --- a/drivers/mmc/host/cqhci-core.c
> +++ b/drivers/mmc/host/cqhci-core.c
> @@ -812,18 +812,21 @@ static void cqhci_finish_mrq(struct mmc_host 
> *mmc, unsigned int tag)  irqreturn_t cqhci_irq(struct mmc_host *mmc, u32 intmask, int cmd_error,
>  		      int data_error)
>  {
> -	u32 status;
> +	u32 status, ice_err;
>  	unsigned long tag = 0, comp_status;
>  	struct cqhci_host *cq_host = mmc->cqe_private;
>  
>  	status = cqhci_readl(cq_host, CQHCI_IS);
>  	cqhci_writel(cq_host, status, CQHCI_IS);
> +	ice_err = status & (CQHCI_IS_GCE | CQHCI_IS_ICCE);
>  
>  	pr_debug("%s: cqhci: IRQ status: 0x%08x\n", mmc_hostname(mmc), 
> status);
>  
>  	if ((status & (CQHCI_IS_RED | CQHCI_IS_GCE | CQHCI_IS_ICCE)) ||
> -	    cmd_error || data_error)
> +	    cmd_error || data_error || ice_err){
> +		mmc->need_hw_reset = true;
>  		cqhci_error_irq(mmc, status, cmd_error, data_error);
> +	}
>  
>  	if (status & CQHCI_IS_TCC) {
>  		/* read TCN and complete the request */ diff --git 
> a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 
> c193c50..3d00bcf 100644
> --- a/include/linux/mmc/host.h
> +++ b/include/linux/mmc/host.h
> @@ -492,6 +492,7 @@ struct mmc_host {
>  	int			cqe_qdepth;
>  	bool			cqe_enabled;
>  	bool			cqe_on;
> +	bool                    need_hw_reset;
>  
>  	/* Inline encryption support */
>  #ifdef CONFIG_MMC_CRYPTO
Sarthak Garg May 25, 2022, 7:06 a.m. UTC | #2
Hi Adrian,

Thanks for the review.
Please find comments inline.

Thanks,
Sarthak

> -----Original Message-----
> From: Kamasali Satyanarayan (Consultant) (QUIC)
> <quic_kamasali@quicinc.com>
> Sent: Tuesday, May 24, 2022 5:33 PM
> To: 'Adrian Hunter' <adrian.hunter@intel.com>; quic_spathi
> <quic_spathi@quicinc.com>; ulf.hansson@linaro.org; riteshh@codeaurora.org;
> asutoshd@codeaurora.org; axboe@kernel.dk; avri.altman@wdc.com;
> kch@nvidia.com; CLoehle@hyperstone.com; swboyd@chromium.org;
> digetx@gmail.com; bigeasy@linutronix.de; linux-mmc@vger.kernel.org; linux-
> kernel@vger.kernel.org; Sarthak Garg (QUIC) <quic_sartgarg@quicinc.com>
> Cc: Shaik Sajida Bhanu <sbhanu@codeaurora.org>
> Subject: RE: [PATCH V1] mmc: core: Enable force hw reset
> 
> Hi,
> These patches will be further taken by Sarthak.
> 
> Thanks,
> Satya
> 
> -----Original Message-----
> From: Adrian Hunter <adrian.hunter@intel.com>
> Sent: Wednesday, April 27, 2022 6:04 PM
> To: quic_spathi <quic_spathi@quicinc.com>; ulf.hansson@linaro.org;
> riteshh@codeaurora.org; asutoshd@codeaurora.org; axboe@kernel.dk;
> avri.altman@wdc.com; kch@nvidia.com; CLoehle@hyperstone.com;
> swboyd@chromium.org; digetx@gmail.com; bigeasy@linutronix.de; linux-
> mmc@vger.kernel.org; linux-kernel@vger.kernel.org
> Cc: Shaik Sajida Bhanu <sbhanu@codeaurora.org>; Kamasali Satyanarayan
> (Consultant) (QUIC) <quic_kamasali@quicinc.com>
> Subject: Re: [PATCH V1] mmc: core: Enable force hw reset
> 
> On 26/04/22 11:30, Srinivasarao Pathipati wrote:
> > From: Shaik Sajida Bhanu <sbhanu@codeaurora.org>
> >
> > During error recovery set need hw reset to handle ICE error where cqe
> > reset is must.
> 
> How do you get ICE errors?  Doesn't it mean either the hardware is broken or
> the configuration is broken?

This patch is not intended for ice errors and will update the commit text in V2.
Long back intermittent recovery failures were observed but after forcing hardware reset during error recovery we have no single instance of recovery failure. This have made recovery more robust for us.
Any suggestions on how we can take it forward will be highly appreciated.
> 
> >
> > Signed-off-by: Shaik Sajida Bhanu <sbhanu@codeaurora.org>
> > Signed-off-by: kamasali <quic_kamasali@quicinc.com>
> > Signed-off-by: Srinivasarao Pathipati <quic_spathi@quicinc.com>
> > ---
> >  drivers/mmc/core/block.c      | 8 +++++---
> >  drivers/mmc/host/cqhci-core.c | 7 +++++--
> >  include/linux/mmc/host.h      | 1 +
> >  3 files changed, 11 insertions(+), 5 deletions(-)
> >
> > diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index
> > b35e7a9..f63bf33 100644
> > --- a/drivers/mmc/core/block.c
> > +++ b/drivers/mmc/core/block.c
> > @@ -1482,10 +1482,12 @@ void mmc_blk_cqe_recovery(struct mmc_queue
> *mq)
> >  	pr_debug("%s: CQE recovery start\n", mmc_hostname(host));
> >
> >  	err = mmc_cqe_recovery(host);
> > -	if (err)
> > +	if (err || host->need_hw_reset) {
> >  		mmc_blk_reset(mq->blkdata, host,
> MMC_BLK_CQE_RECOVERY);
> > -	else
> > -		mmc_blk_reset_success(mq->blkdata,
> MMC_BLK_CQE_RECOVERY);
> > +		if (host->need_hw_reset)
> > +			host->need_hw_reset = false;
> > +	}
> > +	mmc_blk_reset_success(mq->blkdata, MMC_BLK_CQE_RECOVERY);
> >
> >  	pr_debug("%s: CQE recovery done\n", mmc_hostname(host));  } diff
> > --git a/drivers/mmc/host/cqhci-core.c b/drivers/mmc/host/cqhci-core.c
> > index b0d30c3..311b510 100644
> > --- a/drivers/mmc/host/cqhci-core.c
> > +++ b/drivers/mmc/host/cqhci-core.c
> > @@ -812,18 +812,21 @@ static void cqhci_finish_mrq(struct mmc_host
> > *mmc, unsigned int tag)  irqreturn_t cqhci_irq(struct mmc_host *mmc, u32
> intmask, int cmd_error,
> >  		      int data_error)
> >  {
> > -	u32 status;
> > +	u32 status, ice_err;
> >  	unsigned long tag = 0, comp_status;
> >  	struct cqhci_host *cq_host = mmc->cqe_private;
> >
> >  	status = cqhci_readl(cq_host, CQHCI_IS);
> >  	cqhci_writel(cq_host, status, CQHCI_IS);
> > +	ice_err = status & (CQHCI_IS_GCE | CQHCI_IS_ICCE);
> >
> >  	pr_debug("%s: cqhci: IRQ status: 0x%08x\n", mmc_hostname(mmc),
> > status);
> >
> >  	if ((status & (CQHCI_IS_RED | CQHCI_IS_GCE | CQHCI_IS_ICCE)) ||
> > -	    cmd_error || data_error)
> > +	    cmd_error || data_error || ice_err){
> > +		mmc->need_hw_reset = true;
> >  		cqhci_error_irq(mmc, status, cmd_error, data_error);
> > +	}
> >
> >  	if (status & CQHCI_IS_TCC) {
> >  		/* read TCN and complete the request */ diff --git
> > a/include/linux/mmc/host.h b/include/linux/mmc/host.h index
> > c193c50..3d00bcf 100644
> > --- a/include/linux/mmc/host.h
> > +++ b/include/linux/mmc/host.h
> > @@ -492,6 +492,7 @@ struct mmc_host {
> >  	int			cqe_qdepth;
> >  	bool			cqe_enabled;
> >  	bool			cqe_on;
> > +	bool                    need_hw_reset;
> >
> >  	/* Inline encryption support */
> >  #ifdef CONFIG_MMC_CRYPTO
>
Adrian Hunter May 27, 2022, 12:44 p.m. UTC | #3
On 25/05/22 10:06, Sarthak Garg (QUIC) wrote:
> Hi Adrian,
> 
> Thanks for the review.
> Please find comments inline.
> 
> Thanks,
> Sarthak
> 
>> -----Original Message-----
>> From: Kamasali Satyanarayan (Consultant) (QUIC)
>> <quic_kamasali@quicinc.com>
>> Sent: Tuesday, May 24, 2022 5:33 PM
>> To: 'Adrian Hunter' <adrian.hunter@intel.com>; quic_spathi
>> <quic_spathi@quicinc.com>; ulf.hansson@linaro.org; riteshh@codeaurora.org;
>> asutoshd@codeaurora.org; axboe@kernel.dk; avri.altman@wdc.com;
>> kch@nvidia.com; CLoehle@hyperstone.com; swboyd@chromium.org;
>> digetx@gmail.com; bigeasy@linutronix.de; linux-mmc@vger.kernel.org; linux-
>> kernel@vger.kernel.org; Sarthak Garg (QUIC) <quic_sartgarg@quicinc.com>
>> Cc: Shaik Sajida Bhanu <sbhanu@codeaurora.org>
>> Subject: RE: [PATCH V1] mmc: core: Enable force hw reset
>>
>> Hi,
>> These patches will be further taken by Sarthak.
>>
>> Thanks,
>> Satya
>>
>> -----Original Message-----
>> From: Adrian Hunter <adrian.hunter@intel.com>
>> Sent: Wednesday, April 27, 2022 6:04 PM
>> To: quic_spathi <quic_spathi@quicinc.com>; ulf.hansson@linaro.org;
>> riteshh@codeaurora.org; asutoshd@codeaurora.org; axboe@kernel.dk;
>> avri.altman@wdc.com; kch@nvidia.com; CLoehle@hyperstone.com;
>> swboyd@chromium.org; digetx@gmail.com; bigeasy@linutronix.de; linux-
>> mmc@vger.kernel.org; linux-kernel@vger.kernel.org
>> Cc: Shaik Sajida Bhanu <sbhanu@codeaurora.org>; Kamasali Satyanarayan
>> (Consultant) (QUIC) <quic_kamasali@quicinc.com>
>> Subject: Re: [PATCH V1] mmc: core: Enable force hw reset
>>
>> On 26/04/22 11:30, Srinivasarao Pathipati wrote:
>>> From: Shaik Sajida Bhanu <sbhanu@codeaurora.org>
>>>
>>> During error recovery set need hw reset to handle ICE error where cqe
>>> reset is must.
>>
>> How do you get ICE errors?  Doesn't it mean either the hardware is broken or
>> the configuration is broken?
> 
> This patch is not intended for ice errors and will update the commit text in V2.
> Long back intermittent recovery failures were observed but after forcing hardware reset during error recovery we have no single instance of recovery failure. This have made recovery more robust for us.
> Any suggestions on how we can take it forward will be highly appreciated.

We can definitely go forward, but with hopefully a little more
explanation first.

It is preferable to be able to explain why changes are being made.

Do you have any logs or other information on the recovery failures?
Are you able to reproduce the problem?

I notice you always do mmc_blk_reset_success().  Does that mean you
sometimes need several resets in a row?

A potential issue that I notice, is that the recovery does not
explicitly deal with the case that the card's command queue has
been disabled e.g. due to RPMB access or IOCTL commands. Are you
using either of those?

>>
>>>
>>> Signed-off-by: Shaik Sajida Bhanu <sbhanu@codeaurora.org>
>>> Signed-off-by: kamasali <quic_kamasali@quicinc.com>
>>> Signed-off-by: Srinivasarao Pathipati <quic_spathi@quicinc.com>
>>> ---
>>>  drivers/mmc/core/block.c      | 8 +++++---
>>>  drivers/mmc/host/cqhci-core.c | 7 +++++--
>>>  include/linux/mmc/host.h      | 1 +
>>>  3 files changed, 11 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index
>>> b35e7a9..f63bf33 100644
>>> --- a/drivers/mmc/core/block.c
>>> +++ b/drivers/mmc/core/block.c
>>> @@ -1482,10 +1482,12 @@ void mmc_blk_cqe_recovery(struct mmc_queue
>> *mq)
>>>  	pr_debug("%s: CQE recovery start\n", mmc_hostname(host));
>>>
>>>  	err = mmc_cqe_recovery(host);
>>> -	if (err)
>>> +	if (err || host->need_hw_reset) {
>>>  		mmc_blk_reset(mq->blkdata, host,
>> MMC_BLK_CQE_RECOVERY);
>>> -	else
>>> -		mmc_blk_reset_success(mq->blkdata,
>> MMC_BLK_CQE_RECOVERY);
>>> +		if (host->need_hw_reset)
>>> +			host->need_hw_reset = false;
>>> +	}
>>> +	mmc_blk_reset_success(mq->blkdata, MMC_BLK_CQE_RECOVERY);
>>>
>>>  	pr_debug("%s: CQE recovery done\n", mmc_hostname(host));  } diff
>>> --git a/drivers/mmc/host/cqhci-core.c b/drivers/mmc/host/cqhci-core.c
>>> index b0d30c3..311b510 100644
>>> --- a/drivers/mmc/host/cqhci-core.c
>>> +++ b/drivers/mmc/host/cqhci-core.c
>>> @@ -812,18 +812,21 @@ static void cqhci_finish_mrq(struct mmc_host
>>> *mmc, unsigned int tag)  irqreturn_t cqhci_irq(struct mmc_host *mmc, u32
>> intmask, int cmd_error,
>>>  		      int data_error)
>>>  {
>>> -	u32 status;
>>> +	u32 status, ice_err;
>>>  	unsigned long tag = 0, comp_status;
>>>  	struct cqhci_host *cq_host = mmc->cqe_private;
>>>
>>>  	status = cqhci_readl(cq_host, CQHCI_IS);
>>>  	cqhci_writel(cq_host, status, CQHCI_IS);
>>> +	ice_err = status & (CQHCI_IS_GCE | CQHCI_IS_ICCE);
>>>
>>>  	pr_debug("%s: cqhci: IRQ status: 0x%08x\n", mmc_hostname(mmc),
>>> status);
>>>
>>>  	if ((status & (CQHCI_IS_RED | CQHCI_IS_GCE | CQHCI_IS_ICCE)) ||
>>> -	    cmd_error || data_error)
>>> +	    cmd_error || data_error || ice_err){
>>> +		mmc->need_hw_reset = true;
>>>  		cqhci_error_irq(mmc, status, cmd_error, data_error);
>>> +	}
>>>
>>>  	if (status & CQHCI_IS_TCC) {
>>>  		/* read TCN and complete the request */ diff --git
>>> a/include/linux/mmc/host.h b/include/linux/mmc/host.h index
>>> c193c50..3d00bcf 100644
>>> --- a/include/linux/mmc/host.h
>>> +++ b/include/linux/mmc/host.h
>>> @@ -492,6 +492,7 @@ struct mmc_host {
>>>  	int			cqe_qdepth;
>>>  	bool			cqe_enabled;
>>>  	bool			cqe_on;
>>> +	bool                    need_hw_reset;
>>>
>>>  	/* Inline encryption support */
>>>  #ifdef CONFIG_MMC_CRYPTO
>>
>
Adrian Hunter June 1, 2022, 6:04 a.m. UTC | #4
On 27/05/22 15:44, Adrian Hunter wrote:
> On 25/05/22 10:06, Sarthak Garg (QUIC) wrote:
>> Hi Adrian,
>>
>> Thanks for the review.
>> Please find comments inline.
>>
>> Thanks,
>> Sarthak
>>
>>> -----Original Message-----
>>> From: Kamasali Satyanarayan (Consultant) (QUIC)
>>> <quic_kamasali@quicinc.com>
>>> Sent: Tuesday, May 24, 2022 5:33 PM
>>> To: 'Adrian Hunter' <adrian.hunter@intel.com>; quic_spathi
>>> <quic_spathi@quicinc.com>; ulf.hansson@linaro.org; riteshh@codeaurora.org;
>>> asutoshd@codeaurora.org; axboe@kernel.dk; avri.altman@wdc.com;
>>> kch@nvidia.com; CLoehle@hyperstone.com; swboyd@chromium.org;
>>> digetx@gmail.com; bigeasy@linutronix.de; linux-mmc@vger.kernel.org; linux-
>>> kernel@vger.kernel.org; Sarthak Garg (QUIC) <quic_sartgarg@quicinc.com>
>>> Cc: Shaik Sajida Bhanu <sbhanu@codeaurora.org>
>>> Subject: RE: [PATCH V1] mmc: core: Enable force hw reset
>>>
>>> Hi,
>>> These patches will be further taken by Sarthak.
>>>
>>> Thanks,
>>> Satya
>>>
>>> -----Original Message-----
>>> From: Adrian Hunter <adrian.hunter@intel.com>
>>> Sent: Wednesday, April 27, 2022 6:04 PM
>>> To: quic_spathi <quic_spathi@quicinc.com>; ulf.hansson@linaro.org;
>>> riteshh@codeaurora.org; asutoshd@codeaurora.org; axboe@kernel.dk;
>>> avri.altman@wdc.com; kch@nvidia.com; CLoehle@hyperstone.com;
>>> swboyd@chromium.org; digetx@gmail.com; bigeasy@linutronix.de; linux-
>>> mmc@vger.kernel.org; linux-kernel@vger.kernel.org
>>> Cc: Shaik Sajida Bhanu <sbhanu@codeaurora.org>; Kamasali Satyanarayan
>>> (Consultant) (QUIC) <quic_kamasali@quicinc.com>
>>> Subject: Re: [PATCH V1] mmc: core: Enable force hw reset
>>>
>>> On 26/04/22 11:30, Srinivasarao Pathipati wrote:
>>>> From: Shaik Sajida Bhanu <sbhanu@codeaurora.org>
>>>>
>>>> During error recovery set need hw reset to handle ICE error where cqe
>>>> reset is must.
>>>
>>> How do you get ICE errors?  Doesn't it mean either the hardware is broken or
>>> the configuration is broken?
>>
>> This patch is not intended for ice errors and will update the commit text in V2.
>> Long back intermittent recovery failures were observed but after forcing hardware reset during error recovery we have no single instance of recovery failure. This have made recovery more robust for us.
>> Any suggestions on how we can take it forward will be highly appreciated.
> 
> We can definitely go forward, but with hopefully a little more
> explanation first.
> 
> It is preferable to be able to explain why changes are being made.
> 
> Do you have any logs or other information on the recovery failures?
> Are you able to reproduce the problem?
> 
> I notice you always do mmc_blk_reset_success().  Does that mean you
> sometimes need several resets in a row?
> 
> A potential issue that I notice, is that the recovery does not
> explicitly deal with the case that the card's command queue has
> been disabled e.g. due to RPMB access or IOCTL commands. Are you
> using either of those?

Looking closer, the command queue is reenabled on the error
path so that is not a concern, but I did send:

https://lore.kernel.org/linux-mmc/20220531171922.76080-1-adrian.hunter@intel.com/

For your case, if it is not about ICE errors, why not add only:

diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index f4a1281658db..a2ee850a5c16 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -1497,7 +1497,7 @@ void mmc_blk_cqe_recovery(struct mmc_queue *mq)
 	pr_debug("%s: CQE recovery start\n", mmc_hostname(host));
 
 	err = mmc_cqe_recovery(host);
-	if (err)
+	if (err || host->cqe_recovery_reset_always)
 		mmc_blk_reset(mq->blkdata, host, MMC_BLK_CQE_RECOVERY);
 	mmc_blk_reset_success(mq->blkdata, MMC_BLK_CQE_RECOVERY);
 
And then just set it in your host controller driver probe function.

	host->cqe_recovery_reset_always = true;

> 
>>>
>>>>
>>>> Signed-off-by: Shaik Sajida Bhanu <sbhanu@codeaurora.org>
>>>> Signed-off-by: kamasali <quic_kamasali@quicinc.com>
>>>> Signed-off-by: Srinivasarao Pathipati <quic_spathi@quicinc.com>
>>>> ---
>>>>  drivers/mmc/core/block.c      | 8 +++++---
>>>>  drivers/mmc/host/cqhci-core.c | 7 +++++--
>>>>  include/linux/mmc/host.h      | 1 +
>>>>  3 files changed, 11 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index
>>>> b35e7a9..f63bf33 100644
>>>> --- a/drivers/mmc/core/block.c
>>>> +++ b/drivers/mmc/core/block.c
>>>> @@ -1482,10 +1482,12 @@ void mmc_blk_cqe_recovery(struct mmc_queue
>>> *mq)
>>>>  	pr_debug("%s: CQE recovery start\n", mmc_hostname(host));
>>>>
>>>>  	err = mmc_cqe_recovery(host);
>>>> -	if (err)
>>>> +	if (err || host->need_hw_reset) {
>>>>  		mmc_blk_reset(mq->blkdata, host,
>>> MMC_BLK_CQE_RECOVERY);
>>>> -	else
>>>> -		mmc_blk_reset_success(mq->blkdata,
>>> MMC_BLK_CQE_RECOVERY);
>>>> +		if (host->need_hw_reset)
>>>> +			host->need_hw_reset = false;
>>>> +	}
>>>> +	mmc_blk_reset_success(mq->blkdata, MMC_BLK_CQE_RECOVERY);
>>>>
>>>>  	pr_debug("%s: CQE recovery done\n", mmc_hostname(host));  } diff
>>>> --git a/drivers/mmc/host/cqhci-core.c b/drivers/mmc/host/cqhci-core.c
>>>> index b0d30c3..311b510 100644
>>>> --- a/drivers/mmc/host/cqhci-core.c
>>>> +++ b/drivers/mmc/host/cqhci-core.c
>>>> @@ -812,18 +812,21 @@ static void cqhci_finish_mrq(struct mmc_host
>>>> *mmc, unsigned int tag)  irqreturn_t cqhci_irq(struct mmc_host *mmc, u32
>>> intmask, int cmd_error,
>>>>  		      int data_error)
>>>>  {
>>>> -	u32 status;
>>>> +	u32 status, ice_err;
>>>>  	unsigned long tag = 0, comp_status;
>>>>  	struct cqhci_host *cq_host = mmc->cqe_private;
>>>>
>>>>  	status = cqhci_readl(cq_host, CQHCI_IS);
>>>>  	cqhci_writel(cq_host, status, CQHCI_IS);
>>>> +	ice_err = status & (CQHCI_IS_GCE | CQHCI_IS_ICCE);
>>>>
>>>>  	pr_debug("%s: cqhci: IRQ status: 0x%08x\n", mmc_hostname(mmc),
>>>> status);
>>>>
>>>>  	if ((status & (CQHCI_IS_RED | CQHCI_IS_GCE | CQHCI_IS_ICCE)) ||
>>>> -	    cmd_error || data_error)
>>>> +	    cmd_error || data_error || ice_err){
>>>> +		mmc->need_hw_reset = true;
>>>>  		cqhci_error_irq(mmc, status, cmd_error, data_error);
>>>> +	}
>>>>
>>>>  	if (status & CQHCI_IS_TCC) {
>>>>  		/* read TCN and complete the request */ diff --git
>>>> a/include/linux/mmc/host.h b/include/linux/mmc/host.h index
>>>> c193c50..3d00bcf 100644
>>>> --- a/include/linux/mmc/host.h
>>>> +++ b/include/linux/mmc/host.h
>>>> @@ -492,6 +492,7 @@ struct mmc_host {
>>>>  	int			cqe_qdepth;
>>>>  	bool			cqe_enabled;
>>>>  	bool			cqe_on;
>>>> +	bool                    need_hw_reset;
>>>>
>>>>  	/* Inline encryption support */
>>>>  #ifdef CONFIG_MMC_CRYPTO
>>>
>>
>
Sarthak Garg June 2, 2022, 6:12 p.m. UTC | #5
> -----Original Message-----
> From: Adrian Hunter <adrian.hunter@intel.com>
> Sent: Wednesday, June 1, 2022 11:35 AM
> To: Sarthak Garg (QUIC) <quic_sartgarg@quicinc.com>; Kamasali Satyanarayan
> (Consultant) (QUIC) <quic_kamasali@quicinc.com>; quic_spathi
> <quic_spathi@quicinc.com>; ulf.hansson@linaro.org; axboe@kernel.dk;
> avri.altman@wdc.com; kch@nvidia.com; CLoehle@hyperstone.com;
> swboyd@chromium.org; digetx@gmail.com; bigeasy@linutronix.de; linux-
> mmc@vger.kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH V1] mmc: core: Enable force hw reset
> 
> On 27/05/22 15:44, Adrian Hunter wrote:
> > On 25/05/22 10:06, Sarthak Garg (QUIC) wrote:
> >> Hi Adrian,
> >>
> >> Thanks for the review.
> >> Please find comments inline.
> >>
> >> Thanks,
> >> Sarthak
> >>
> >>> -----Original Message-----
> >>> From: Kamasali Satyanarayan (Consultant) (QUIC)
> >>> <quic_kamasali@quicinc.com>
> >>> Sent: Tuesday, May 24, 2022 5:33 PM
> >>> To: 'Adrian Hunter' <adrian.hunter@intel.com>; quic_spathi
> >>> <quic_spathi@quicinc.com>; ulf.hansson@linaro.org;
> >>> riteshh@codeaurora.org; asutoshd@codeaurora.org; axboe@kernel.dk;
> >>> avri.altman@wdc.com; kch@nvidia.com; CLoehle@hyperstone.com;
> >>> swboyd@chromium.org; digetx@gmail.com; bigeasy@linutronix.de;
> >>> linux-mmc@vger.kernel.org; linux- kernel@vger.kernel.org; Sarthak
> >>> Garg (QUIC) <quic_sartgarg@quicinc.com>
> >>> Cc: Shaik Sajida Bhanu <sbhanu@codeaurora.org>
> >>> Subject: RE: [PATCH V1] mmc: core: Enable force hw reset
> >>>
> >>> Hi,
> >>> These patches will be further taken by Sarthak.
> >>>
> >>> Thanks,
> >>> Satya
> >>>
> >>> -----Original Message-----
> >>> From: Adrian Hunter <adrian.hunter@intel.com>
> >>> Sent: Wednesday, April 27, 2022 6:04 PM
> >>> To: quic_spathi <quic_spathi@quicinc.com>; ulf.hansson@linaro.org;
> >>> riteshh@codeaurora.org; asutoshd@codeaurora.org; axboe@kernel.dk;
> >>> avri.altman@wdc.com; kch@nvidia.com; CLoehle@hyperstone.com;
> >>> swboyd@chromium.org; digetx@gmail.com; bigeasy@linutronix.de; linux-
> >>> mmc@vger.kernel.org; linux-kernel@vger.kernel.org
> >>> Cc: Shaik Sajida Bhanu <sbhanu@codeaurora.org>; Kamasali
> >>> Satyanarayan
> >>> (Consultant) (QUIC) <quic_kamasali@quicinc.com>
> >>> Subject: Re: [PATCH V1] mmc: core: Enable force hw reset
> >>>
> >>> On 26/04/22 11:30, Srinivasarao Pathipati wrote:
> >>>> From: Shaik Sajida Bhanu <sbhanu@codeaurora.org>
> >>>>
> >>>> During error recovery set need hw reset to handle ICE error where
> >>>> cqe reset is must.
> >>>
> >>> How do you get ICE errors?  Doesn't it mean either the hardware is
> >>> broken or the configuration is broken?
> >>
> >> This patch is not intended for ice errors and will update the commit text in V2.
> >> Long back intermittent recovery failures were observed but after forcing
> hardware reset during error recovery we have no single instance of recovery
> failure. This have made recovery more robust for us.
> >> Any suggestions on how we can take it forward will be highly appreciated.
> >
> > We can definitely go forward, but with hopefully a little more
> > explanation first.
> >
> > It is preferable to be able to explain why changes are being made.
> >
> > Do you have any logs or other information on the recovery failures?
> > Are you able to reproduce the problem?
> >
> > I notice you always do mmc_blk_reset_success().  Does that mean you
> > sometimes need several resets in a row?
> >
> > A potential issue that I notice, is that the recovery does not
> > explicitly deal with the case that the card's command queue has been
> > disabled e.g. due to RPMB access or IOCTL commands. Are you using
> > either of those?
> 
> Looking closer, the command queue is reenabled on the error path so that is not
> a concern, but I did send:
> 
> https://lore.kernel.org/linux-mmc/20220531171922.76080-1-
> adrian.hunter@intel.com/
> 
> For your case, if it is not about ICE errors, why not add only:
> 
> diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index
> f4a1281658db..a2ee850a5c16 100644
> --- a/drivers/mmc/core/block.c
> +++ b/drivers/mmc/core/block.c
> @@ -1497,7 +1497,7 @@ void mmc_blk_cqe_recovery(struct mmc_queue *mq)
>  	pr_debug("%s: CQE recovery start\n", mmc_hostname(host));
> 
>  	err = mmc_cqe_recovery(host);
> -	if (err)
> +	if (err || host->cqe_recovery_reset_always)
>  		mmc_blk_reset(mq->blkdata, host,
> MMC_BLK_CQE_RECOVERY);
>  	mmc_blk_reset_success(mq->blkdata, MMC_BLK_CQE_RECOVERY);
> 
> And then just set it in your host controller driver probe function.
> 
> 	host->cqe_recovery_reset_always = true;
> 
> >
Thanks Adrian for considering and posting https://lore.kernel.org/linux-mmc/20220531171922.76080-1-adrian.hunter@intel.com/ and for the above suggestion as well.
Will soon post the updated patches.
> >>>
> >>>>
> >>>> Signed-off-by: Shaik Sajida Bhanu <sbhanu@codeaurora.org>
> >>>> Signed-off-by: kamasali <quic_kamasali@quicinc.com>
> >>>> Signed-off-by: Srinivasarao Pathipati <quic_spathi@quicinc.com>
> >>>> ---
> >>>>  drivers/mmc/core/block.c      | 8 +++++---
> >>>>  drivers/mmc/host/cqhci-core.c | 7 +++++--
> >>>>  include/linux/mmc/host.h      | 1 +
> >>>>  3 files changed, 11 insertions(+), 5 deletions(-)
> >>>>
> >>>> diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
> >>>> index
> >>>> b35e7a9..f63bf33 100644
> >>>> --- a/drivers/mmc/core/block.c
> >>>> +++ b/drivers/mmc/core/block.c
> >>>> @@ -1482,10 +1482,12 @@ void mmc_blk_cqe_recovery(struct
> mmc_queue
> >>> *mq)
> >>>>  	pr_debug("%s: CQE recovery start\n", mmc_hostname(host));
> >>>>
> >>>>  	err = mmc_cqe_recovery(host);
> >>>> -	if (err)
> >>>> +	if (err || host->need_hw_reset) {
> >>>>  		mmc_blk_reset(mq->blkdata, host,
> >>> MMC_BLK_CQE_RECOVERY);
> >>>> -	else
> >>>> -		mmc_blk_reset_success(mq->blkdata,
> >>> MMC_BLK_CQE_RECOVERY);
> >>>> +		if (host->need_hw_reset)
> >>>> +			host->need_hw_reset = false;
> >>>> +	}
> >>>> +	mmc_blk_reset_success(mq->blkdata, MMC_BLK_CQE_RECOVERY);
> >>>>
> >>>>  	pr_debug("%s: CQE recovery done\n", mmc_hostname(host));  } diff
> >>>> --git a/drivers/mmc/host/cqhci-core.c
> >>>> b/drivers/mmc/host/cqhci-core.c index b0d30c3..311b510 100644
> >>>> --- a/drivers/mmc/host/cqhci-core.c
> >>>> +++ b/drivers/mmc/host/cqhci-core.c
> >>>> @@ -812,18 +812,21 @@ static void cqhci_finish_mrq(struct mmc_host
> >>>> *mmc, unsigned int tag)  irqreturn_t cqhci_irq(struct mmc_host
> >>>> *mmc, u32
> >>> intmask, int cmd_error,
> >>>>  		      int data_error)
> >>>>  {
> >>>> -	u32 status;
> >>>> +	u32 status, ice_err;
> >>>>  	unsigned long tag = 0, comp_status;
> >>>>  	struct cqhci_host *cq_host = mmc->cqe_private;
> >>>>
> >>>>  	status = cqhci_readl(cq_host, CQHCI_IS);
> >>>>  	cqhci_writel(cq_host, status, CQHCI_IS);
> >>>> +	ice_err = status & (CQHCI_IS_GCE | CQHCI_IS_ICCE);
> >>>>
> >>>>  	pr_debug("%s: cqhci: IRQ status: 0x%08x\n", mmc_hostname(mmc),
> >>>> status);
> >>>>
> >>>>  	if ((status & (CQHCI_IS_RED | CQHCI_IS_GCE | CQHCI_IS_ICCE)) ||
> >>>> -	    cmd_error || data_error)
> >>>> +	    cmd_error || data_error || ice_err){
> >>>> +		mmc->need_hw_reset = true;
> >>>>  		cqhci_error_irq(mmc, status, cmd_error, data_error);
> >>>> +	}
> >>>>
> >>>>  	if (status & CQHCI_IS_TCC) {
> >>>>  		/* read TCN and complete the request */ diff --git
> >>>> a/include/linux/mmc/host.h b/include/linux/mmc/host.h index
> >>>> c193c50..3d00bcf 100644
> >>>> --- a/include/linux/mmc/host.h
> >>>> +++ b/include/linux/mmc/host.h
> >>>> @@ -492,6 +492,7 @@ struct mmc_host {
> >>>>  	int			cqe_qdepth;
> >>>>  	bool			cqe_enabled;
> >>>>  	bool			cqe_on;
> >>>> +	bool                    need_hw_reset;
> >>>>
> >>>>  	/* Inline encryption support */
> >>>>  #ifdef CONFIG_MMC_CRYPTO
> >>>
> >>
> >
diff mbox series

Patch

diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index b35e7a9..f63bf33 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -1482,10 +1482,12 @@  void mmc_blk_cqe_recovery(struct mmc_queue *mq)
 	pr_debug("%s: CQE recovery start\n", mmc_hostname(host));
 
 	err = mmc_cqe_recovery(host);
-	if (err)
+	if (err || host->need_hw_reset) {
 		mmc_blk_reset(mq->blkdata, host, MMC_BLK_CQE_RECOVERY);
-	else
-		mmc_blk_reset_success(mq->blkdata, MMC_BLK_CQE_RECOVERY);
+		if (host->need_hw_reset)
+			host->need_hw_reset = false;
+	}
+	mmc_blk_reset_success(mq->blkdata, MMC_BLK_CQE_RECOVERY);
 
 	pr_debug("%s: CQE recovery done\n", mmc_hostname(host));
 }
diff --git a/drivers/mmc/host/cqhci-core.c b/drivers/mmc/host/cqhci-core.c
index b0d30c3..311b510 100644
--- a/drivers/mmc/host/cqhci-core.c
+++ b/drivers/mmc/host/cqhci-core.c
@@ -812,18 +812,21 @@  static void cqhci_finish_mrq(struct mmc_host *mmc, unsigned int tag)
 irqreturn_t cqhci_irq(struct mmc_host *mmc, u32 intmask, int cmd_error,
 		      int data_error)
 {
-	u32 status;
+	u32 status, ice_err;
 	unsigned long tag = 0, comp_status;
 	struct cqhci_host *cq_host = mmc->cqe_private;
 
 	status = cqhci_readl(cq_host, CQHCI_IS);
 	cqhci_writel(cq_host, status, CQHCI_IS);
+	ice_err = status & (CQHCI_IS_GCE | CQHCI_IS_ICCE);
 
 	pr_debug("%s: cqhci: IRQ status: 0x%08x\n", mmc_hostname(mmc), status);
 
 	if ((status & (CQHCI_IS_RED | CQHCI_IS_GCE | CQHCI_IS_ICCE)) ||
-	    cmd_error || data_error)
+	    cmd_error || data_error || ice_err){
+		mmc->need_hw_reset = true;
 		cqhci_error_irq(mmc, status, cmd_error, data_error);
+	}
 
 	if (status & CQHCI_IS_TCC) {
 		/* read TCN and complete the request */
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index c193c50..3d00bcf 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -492,6 +492,7 @@  struct mmc_host {
 	int			cqe_qdepth;
 	bool			cqe_enabled;
 	bool			cqe_on;
+	bool                    need_hw_reset;
 
 	/* Inline encryption support */
 #ifdef CONFIG_MMC_CRYPTO