diff mbox series

mmc: meson-gx: fix SDIO interrupt handling

Message ID 20221114093857.491695-1-peter.suti@streamunlimited.com
State New
Headers show
Series mmc: meson-gx: fix SDIO interrupt handling | expand

Commit Message

Peter Suti Nov. 14, 2022, 9:38 a.m. UTC
With the interrupt support introduced in commit 066ecde sometimes the
Marvell-8987 wifi chip entered a deadlock using the marvell-sd-uapsta-8987
vendor driver. The cause seems to be that sometimes the interrupt handler
handles 2 IRQs and one of them disables the interrupts which are not reenabled
when all interrupts are finished. To work around this, disable all interrupts
when we are in the IRQ context and reenable them when the current IRQ is handled.

Fixes: 066ecde ("mmc: meson-gx: add SDIO interrupt support")

Signed-off-by: Peter Suti <peter.suti@streamunlimited.com>
---
 drivers/mmc/host/meson-gx-mmc.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

Comments

Ulf Hansson Nov. 16, 2022, 4:13 p.m. UTC | #1
On Mon, 14 Nov 2022 at 10:40, Peter Suti <peter.suti@streamunlimited.com> wrote:
>
> With the interrupt support introduced in commit 066ecde sometimes the
> Marvell-8987 wifi chip entered a deadlock using the marvell-sd-uapsta-8987
> vendor driver. The cause seems to be that sometimes the interrupt handler
> handles 2 IRQs and one of them disables the interrupts which are not reenabled
> when all interrupts are finished. To work around this, disable all interrupts
> when we are in the IRQ context and reenable them when the current IRQ is handled.
>
> Fixes: 066ecde ("mmc: meson-gx: add SDIO interrupt support")
>
> Signed-off-by: Peter Suti <peter.suti@streamunlimited.com>
> ---
>  drivers/mmc/host/meson-gx-mmc.c | 21 ++++++++++++++-------
>  1 file changed, 14 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c
> index 6e5ea0213b47..972024d57d1c 100644
> --- a/drivers/mmc/host/meson-gx-mmc.c
> +++ b/drivers/mmc/host/meson-gx-mmc.c
> @@ -950,6 +950,10 @@ static irqreturn_t meson_mmc_irq(int irq, void *dev_id)
>         struct mmc_command *cmd;
>         u32 status, raw_status;
>         irqreturn_t ret = IRQ_NONE;
> +       unsigned long flags;
> +
> +       spin_lock_irqsave(&host->lock, flags);
> +       __meson_mmc_enable_sdio_irq(host->mmc, 0);
>
>         raw_status = readl(host->regs + SD_EMMC_STATUS);
>         status = raw_status & (IRQ_EN_MASK | IRQ_SDIO);
> @@ -958,11 +962,11 @@ static irqreturn_t meson_mmc_irq(int irq, void *dev_id)
>                 dev_dbg(host->dev,
>                         "Unexpected IRQ! irq_en 0x%08lx - status 0x%08x\n",
>                          IRQ_EN_MASK | IRQ_SDIO, raw_status);
> -               return IRQ_NONE;
> +               goto out_unlock;

This may end up re-enabling the sdio irqs, even if it was not enabled
to start with. This is probably not what we want.

>         }
>
>         if (WARN_ON(!host))
> -               return IRQ_NONE;
> +               goto out_unlock;
>
>         /* ack all raised interrupts */
>         writel(status, host->regs + SD_EMMC_STATUS);
> @@ -970,17 +974,16 @@ static irqreturn_t meson_mmc_irq(int irq, void *dev_id)
>         cmd = host->cmd;
>
>         if (status & IRQ_SDIO) {
> -               spin_lock(&host->lock);
> -               __meson_mmc_enable_sdio_irq(host->mmc, 0);
>                 sdio_signal_irq(host->mmc);
> -               spin_unlock(&host->lock);
>                 status &= ~IRQ_SDIO;
> -               if (!status)
> +               if (!status) {
> +                       spin_unlock_irqrestore(&host->lock, flags);
>                         return IRQ_HANDLED;
> +               }
>         }
>
>         if (WARN_ON(!cmd))
> -               return IRQ_NONE;
> +               goto out_unlock;
>
>         cmd->error = 0;
>         if (status & IRQ_CRC_ERR) {
> @@ -1023,6 +1026,10 @@ static irqreturn_t meson_mmc_irq(int irq, void *dev_id)
>         if (ret == IRQ_HANDLED)
>                 meson_mmc_request_done(host->mmc, cmd->mrq);
>
> +out_unlock:
> +       __meson_mmc_enable_sdio_irq(host->mmc, 1);
> +       spin_unlock_irqrestore(&host->lock, flags);
> +
>         return ret;
>  }
>

Kind regards
Uffe
Heiner Kallweit Nov. 22, 2022, 9:41 p.m. UTC | #2
On 22.11.2022 14:23, Peter Suti wrote:
> With the interrupt support introduced in commit 066ecde sometimes the
> Marvell-8987 wifi chip entered a deadlock using the marvell-sd-uapsta-8987
> vendor driver. The cause seems to be that sometimes the interrupt handler
> handles 2 IRQs and one of them disables the interrupts which are not reenabled
> when all interrupts are finished. To work around this, disable all interrupts
> when we are in the IRQ context and reenable them when the current IRQ is handled.
> 
To me it's still not clear what you mean with "handles 2 IRQs".
Hard irq handlers aren't re-entrant. Can you provide the exact call sequence
for the issue you're facing?
Are SDIO interrupts handled on more than one CPU in your case?
Are you concerned about the hard irq handler running in parallel on more than one CPU?

The proposed patch looks hacky and somewhat bypasses the SDIO core logic
by partially re-enabling SDIO interrupts in the hard irq handler.

In the first review round I wrote the following but didn't see a feedback yet.
Did you check the linked discussion whether it may be related to your issue?

-> IIRC I had a similar/same problem in mind when discussing the following:
-> https://lore.kernel.org/linux-arm-kernel/CAPDyKFoameOb7d3cn8_ki1O6DbMEAFvkQh1uUsYp4S-Lkq41oQ@mail.gmail.com/
-> Not sure though whether it's related to the issue you're facing.



> Fixes: 066ecde ("mmc: meson-gx: add SDIO interrupt support")
> 
> Signed-off-by: Peter Suti <peter.suti@streamunlimited.com>
> ---
> Changes in v2:
> 	- use spin_lock instead of spin_lock_irqsave
> 	- only reenable interrupts if they were enabled already
> 
>  drivers/mmc/host/meson-gx-mmc.c | 30 +++++++++++++++++++++++-------
>  1 file changed, 23 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c
> index 6e5ea0213b47..0c95f8640b34 100644
> --- a/drivers/mmc/host/meson-gx-mmc.c
> +++ b/drivers/mmc/host/meson-gx-mmc.c
> @@ -934,6 +934,13 @@ static void meson_mmc_read_resp(struct mmc_host *mmc, struct mmc_command *cmd)
>  	}
>  }
>  
> +static bool __meson_mmc_sdio_irq_is_enabled(struct mmc_host *mmc)
> +{
> +	struct meson_host *host = mmc_priv(mmc);
> +
> +	return readl(host->regs + SD_EMMC_IRQ_EN) & IRQ_SDIO;
> +}
> +
>  static void __meson_mmc_enable_sdio_irq(struct mmc_host *mmc, int enable)
>  {
>  	struct meson_host *host = mmc_priv(mmc);
> @@ -950,6 +957,11 @@ static irqreturn_t meson_mmc_irq(int irq, void *dev_id)
>  	struct mmc_command *cmd;
>  	u32 status, raw_status;
>  	irqreturn_t ret = IRQ_NONE;
> +	bool irq_enabled;
> +
> +	spin_lock(&host->lock);
> +	irq_enabled = __meson_mmc_sdio_irq_is_enabled(host->mmc);
> +	__meson_mmc_enable_sdio_irq(host->mmc, 0);
>  
>  	raw_status = readl(host->regs + SD_EMMC_STATUS);
>  	status = raw_status & (IRQ_EN_MASK | IRQ_SDIO);
> @@ -958,11 +970,11 @@ static irqreturn_t meson_mmc_irq(int irq, void *dev_id)
>  		dev_dbg(host->dev,
>  			"Unexpected IRQ! irq_en 0x%08lx - status 0x%08x\n",
>  			 IRQ_EN_MASK | IRQ_SDIO, raw_status);
> -		return IRQ_NONE;
> +		goto out_unlock;
>  	}
>  
>  	if (WARN_ON(!host))
> -		return IRQ_NONE;
> +		goto out_unlock;
>  
>  	/* ack all raised interrupts */
>  	writel(status, host->regs + SD_EMMC_STATUS);
> @@ -970,17 +982,16 @@ static irqreturn_t meson_mmc_irq(int irq, void *dev_id)
>  	cmd = host->cmd;
>  
>  	if (status & IRQ_SDIO) {
> -		spin_lock(&host->lock);
> -		__meson_mmc_enable_sdio_irq(host->mmc, 0);
>  		sdio_signal_irq(host->mmc);
> -		spin_unlock(&host->lock);
>  		status &= ~IRQ_SDIO;
> -		if (!status)
> +		if (!status) {
> +			spin_unlock(&host->lock);
>  			return IRQ_HANDLED;
> +		}
>  	}
>  
>  	if (WARN_ON(!cmd))
> -		return IRQ_NONE;
> +		goto out_unlock;
>  
>  	cmd->error = 0;
>  	if (status & IRQ_CRC_ERR) {
> @@ -1023,6 +1034,11 @@ static irqreturn_t meson_mmc_irq(int irq, void *dev_id)
>  	if (ret == IRQ_HANDLED)
>  		meson_mmc_request_done(host->mmc, cmd->mrq);
>  
> +out_unlock:
> +	if (irq_enabled)
> +		__meson_mmc_enable_sdio_irq(host->mmc, 1);
> +	spin_unlock(&host->lock);
> +
>  	return ret;
>  }
>
Heiner Kallweit Dec. 14, 2022, 9:33 p.m. UTC | #3
On 14.12.2022 14:46, Peter Suti wrote:
> With the interrupt support introduced in commit 066ecde sometimes the
> Marvell-8987 wifi chip got stuck using the marvell-sd-uapsta-8987
> vendor driver. The cause seems to be that after sending ack to all interrupts
> the IRQ_SDIO still happens, but it is ignored.
> 
> To work around this, recheck the IRQ_SDIO after meson_mmc_request_done().
> 
> Inspired by 9e2582e ("mmc: mediatek: fix SDIO irq issue") which used a
> similar fix to handle lost interrupts.
> 
The commit description of the referenced fix isn't clear with regard to
who's fault it is that an interrupt can be lost. I'd interpret it being
a silicon bug rather than a kernel/driver bug.
Not sure whether it's the case, but it's possible that both vendors use
at least parts of the same IP in the MMC block, and therefore the issue
pops up here too.

> Fixes: 066ecde ("mmc: meson-gx: add SDIO interrupt support")
> 
> Signed-off-by: Peter Suti <peter.suti@streamunlimited.com>
> ---
> Changes in v2:
> 	- use spin_lock instead of spin_lock_irqsave
> 	- only reenable interrupts if they were enabled already
> 
> Changes in v3:
> 	- Rework the patch based on feedback from Heiner Kallweit.
> 		The IRQ does not happen on 2 CPUs and the hard IRQ is not re-entrant.
> 		But still one SDIO IRQ is lost without this change.
> 		After the ack, reading the SD_EMMC_STATUS BIT(15) is set, but 
> 		meson_mmc_irq() is never called again.
> 
> 		The fix is similar to Mediatek msdc_recheck_sdio_irq().
> 		That platform also loses an IRQ in some cases it seems.
> 
>  drivers/mmc/host/meson-gx-mmc.c | 16 ++++++++++++++++
>  1 file changed, 16 insertions(+)
> 
> diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c
> index 6e5ea0213b47..7d3ee2f9a7f6 100644
> --- a/drivers/mmc/host/meson-gx-mmc.c
> +++ b/drivers/mmc/host/meson-gx-mmc.c
> @@ -1023,6 +1023,22 @@ static irqreturn_t meson_mmc_irq(int irq, void *dev_id)
>  	if (ret == IRQ_HANDLED)
>  		meson_mmc_request_done(host->mmc, cmd->mrq);
>  
> +	/*
> +	* Sometimes after we ack all raised interrupts,
> +	* an IRQ_SDIO can still be pending, which can get lost.
> +	*

A reader may scratch his head here and wonder how the interrupt can get lost,
and why adding a workaround instead of eliminating the root cause for losing
the interrupt. If you can't provide an explanation why the root cause for
losing the interrupt can't be fixed, presumably you would have to say that
you're adding a workaround for a suspected silicon bug.

> +	* To prevent this, recheck the IRQ_SDIO here and schedule
> +	* it to be processed.
> +	*/
> +	raw_status = readl(host->regs + SD_EMMC_STATUS);
> +	status = raw_status & (IRQ_EN_MASK | IRQ_SDIO);

This isn't needed here. Why not simply:

status = readl(host->regs + SD_EMMC_STATUS);
if (status & IRQ_SDIO)
  ...


> +	if (status & IRQ_SDIO) {
> +		spin_lock(&host->lock);
> +		__meson_mmc_enable_sdio_irq(host->mmc, 0);
> +		sdio_signal_irq(host->mmc);
> +		spin_unlock(&host->lock);
> +	}
> +
>  	return ret;
>  }
>
Peter Suti Jan. 9, 2023, 11:52 a.m. UTC | #4
On Wed, Dec 14, 2022 at 10:33 PM Heiner Kallweit <hkallweit1@gmail.com> wrote:
>
> On 14.12.2022 14:46, Peter Suti wrote:
> > With the interrupt support introduced in commit 066ecde sometimes the
> > Marvell-8987 wifi chip got stuck using the marvell-sd-uapsta-8987
> > vendor driver. The cause seems to be that after sending ack to all interrupts
> > the IRQ_SDIO still happens, but it is ignored.
> >
> > To work around this, recheck the IRQ_SDIO after meson_mmc_request_done().
> >
> > Inspired by 9e2582e ("mmc: mediatek: fix SDIO irq issue") which used a
> > similar fix to handle lost interrupts.
> >
> The commit description of the referenced fix isn't clear with regard to
> who's fault it is that an interrupt can be lost. I'd interpret it being
> a silicon bug rather than a kernel/driver bug.
Unfortunately I can't confirm that the referenced bug is in the
silicon for the original commit either.
However a similar workaround works in this case too which is why I
referenced that commit.

> Not sure whether it's the case, but it's possible that both vendors use
> at least parts of the same IP in the MMC block, and therefore the issue
> pops up here too.
>
> > Fixes: 066ecde ("mmc: meson-gx: add SDIO interrupt support")
> >
> > Signed-off-by: Peter Suti <peter.suti@streamunlimited.com>
> > ---
> > Changes in v2:
> >       - use spin_lock instead of spin_lock_irqsave
> >       - only reenable interrupts if they were enabled already
> >
> > Changes in v3:
> >       - Rework the patch based on feedback from Heiner Kallweit.
> >               The IRQ does not happen on 2 CPUs and the hard IRQ is not re-entrant.
> >               But still one SDIO IRQ is lost without this change.
> >               After the ack, reading the SD_EMMC_STATUS BIT(15) is set, but
> >               meson_mmc_irq() is never called again.
> >
> >               The fix is similar to Mediatek msdc_recheck_sdio_irq().
> >               That platform also loses an IRQ in some cases it seems.
> >
> >  drivers/mmc/host/meson-gx-mmc.c | 16 ++++++++++++++++
> >  1 file changed, 16 insertions(+)
> >
> > diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c
> > index 6e5ea0213b47..7d3ee2f9a7f6 100644
> > --- a/drivers/mmc/host/meson-gx-mmc.c
> > +++ b/drivers/mmc/host/meson-gx-mmc.c
> > @@ -1023,6 +1023,22 @@ static irqreturn_t meson_mmc_irq(int irq, void *dev_id)
> >       if (ret == IRQ_HANDLED)
> >               meson_mmc_request_done(host->mmc, cmd->mrq);
> >
> > +     /*
> > +     * Sometimes after we ack all raised interrupts,
> > +     * an IRQ_SDIO can still be pending, which can get lost.
> > +     *
>
> A reader may scratch his head here and wonder how the interrupt can get lost,
> and why adding a workaround instead of eliminating the root cause for losing
> the interrupt. If you can't provide an explanation why the root cause for
> losing the interrupt can't be fixed, presumably you would have to say that
> you're adding a workaround for a suspected silicon bug.
After talking to the manufacturer, we got the following explanation
for this situation:
"wifi may have dat1 interrupt coming in, without this the dat1
interrupt would be missed"
Supposedly this is fixed in their codebase.
Unfortunately we were not able to find out more and can't prepare a
patch with a proper explanation.
Thank you for reviewing.
>
> > +     * To prevent this, recheck the IRQ_SDIO here and schedule
> > +     * it to be processed.
> > +     */
> > +     raw_status = readl(host->regs + SD_EMMC_STATUS);
> > +     status = raw_status & (IRQ_EN_MASK | IRQ_SDIO);
>
> This isn't needed here. Why not simply:
>
> status = readl(host->regs + SD_EMMC_STATUS);
> if (status & IRQ_SDIO)
>   ...
>
>
> > +     if (status & IRQ_SDIO) {
> > +             spin_lock(&host->lock);
> > +             __meson_mmc_enable_sdio_irq(host->mmc, 0);
> > +             sdio_signal_irq(host->mmc);
> > +             spin_unlock(&host->lock);
> > +     }
> > +
> >       return ret;
> >  }
> >
>
Heiner Kallweit Jan. 12, 2023, 9:27 p.m. UTC | #5
On 09.01.2023 12:52, Peter Suti wrote:
> On Wed, Dec 14, 2022 at 10:33 PM Heiner Kallweit <hkallweit1@gmail.com> wrote:
>>
>> On 14.12.2022 14:46, Peter Suti wrote:
>>> With the interrupt support introduced in commit 066ecde sometimes the
>>> Marvell-8987 wifi chip got stuck using the marvell-sd-uapsta-8987
>>> vendor driver. The cause seems to be that after sending ack to all interrupts
>>> the IRQ_SDIO still happens, but it is ignored.
>>>
>>> To work around this, recheck the IRQ_SDIO after meson_mmc_request_done().
>>>
>>> Inspired by 9e2582e ("mmc: mediatek: fix SDIO irq issue") which used a
>>> similar fix to handle lost interrupts.
>>>
>> The commit description of the referenced fix isn't clear with regard to
>> who's fault it is that an interrupt can be lost. I'd interpret it being
>> a silicon bug rather than a kernel/driver bug.
> Unfortunately I can't confirm that the referenced bug is in the
> silicon for the original commit either.
> However a similar workaround works in this case too which is why I
> referenced that commit.
> 
>> Not sure whether it's the case, but it's possible that both vendors use
>> at least parts of the same IP in the MMC block, and therefore the issue
>> pops up here too.
>>
>>> Fixes: 066ecde ("mmc: meson-gx: add SDIO interrupt support")
>>>
>>> Signed-off-by: Peter Suti <peter.suti@streamunlimited.com>
>>> ---
>>> Changes in v2:
>>>       - use spin_lock instead of spin_lock_irqsave
>>>       - only reenable interrupts if they were enabled already
>>>
>>> Changes in v3:
>>>       - Rework the patch based on feedback from Heiner Kallweit.
>>>               The IRQ does not happen on 2 CPUs and the hard IRQ is not re-entrant.
>>>               But still one SDIO IRQ is lost without this change.
>>>               After the ack, reading the SD_EMMC_STATUS BIT(15) is set, but
>>>               meson_mmc_irq() is never called again.
>>>
>>>               The fix is similar to Mediatek msdc_recheck_sdio_irq().
>>>               That platform also loses an IRQ in some cases it seems.
>>>
>>>  drivers/mmc/host/meson-gx-mmc.c | 16 ++++++++++++++++
>>>  1 file changed, 16 insertions(+)
>>>
>>> diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c
>>> index 6e5ea0213b47..7d3ee2f9a7f6 100644
>>> --- a/drivers/mmc/host/meson-gx-mmc.c
>>> +++ b/drivers/mmc/host/meson-gx-mmc.c
>>> @@ -1023,6 +1023,22 @@ static irqreturn_t meson_mmc_irq(int irq, void *dev_id)
>>>       if (ret == IRQ_HANDLED)
>>>               meson_mmc_request_done(host->mmc, cmd->mrq);
>>>
>>> +     /*
>>> +     * Sometimes after we ack all raised interrupts,
>>> +     * an IRQ_SDIO can still be pending, which can get lost.
>>> +     *
>>
>> A reader may scratch his head here and wonder how the interrupt can get lost,
>> and why adding a workaround instead of eliminating the root cause for losing
>> the interrupt. If you can't provide an explanation why the root cause for
>> losing the interrupt can't be fixed, presumably you would have to say that
>> you're adding a workaround for a suspected silicon bug.
> After talking to the manufacturer, we got the following explanation
> for this situation:

To which manufacturer did you talk, Marvell or Amlogic?

> "wifi may have dat1 interrupt coming in, without this the dat1
> interrupt would be missed"

I don't understand this sentence. W/o the interrupt coming in
the interrupt would be missed? Can you explain it?

> Supposedly this is fixed in their codebase.

Which codebase do you mean? A specific vendor driver? Or firmware?

> Unfortunately we were not able to find out more and can't prepare a
> patch with a proper explanation.

The workaround is rather simple, so we should add it.
It's just unfortunate that we have no idea about the root cause of the issue.

> Thank you for reviewing.
>>
>>> +     * To prevent this, recheck the IRQ_SDIO here and schedule
>>> +     * it to be processed.
>>> +     */
>>> +     raw_status = readl(host->regs + SD_EMMC_STATUS);
>>> +     status = raw_status & (IRQ_EN_MASK | IRQ_SDIO);
>>
>> This isn't needed here. Why not simply:
>>
>> status = readl(host->regs + SD_EMMC_STATUS);
>> if (status & IRQ_SDIO)
>>   ...
>>
>>
>>> +     if (status & IRQ_SDIO) {
>>> +             spin_lock(&host->lock);
>>> +             __meson_mmc_enable_sdio_irq(host->mmc, 0);
>>> +             sdio_signal_irq(host->mmc);
>>> +             spin_unlock(&host->lock);
>>> +     }
>>> +
>>>       return ret;
>>>  }
>>>
>>
Peter Suti Jan. 18, 2023, 1:32 p.m. UTC | #6
On Thu, Jan 12, 2023 at 10:27 PM Heiner Kallweit <hkallweit1@gmail.com> wrote:
>
> On 09.01.2023 12:52, Peter Suti wrote:
> > On Wed, Dec 14, 2022 at 10:33 PM Heiner Kallweit <hkallweit1@gmail.com> wrote:
> >>
> >> On 14.12.2022 14:46, Peter Suti wrote:
> >>>
> >>> diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c
> >>> index 6e5ea0213b47..7d3ee2f9a7f6 100644
> >>> --- a/drivers/mmc/host/meson-gx-mmc.c
> >>> +++ b/drivers/mmc/host/meson-gx-mmc.c
> >>> @@ -1023,6 +1023,22 @@ static irqreturn_t meson_mmc_irq(int irq, void *dev_id)
> >>>       if (ret == IRQ_HANDLED)
> >>>               meson_mmc_request_done(host->mmc, cmd->mrq);
> >>>
> >>> +     /*
> >>> +     * Sometimes after we ack all raised interrupts,
> >>> +     * an IRQ_SDIO can still be pending, which can get lost.
> >>> +     *
> >>
> >> A reader may scratch his head here and wonder how the interrupt can get lost,
> >> and why adding a workaround instead of eliminating the root cause for losing
> >> the interrupt. If you can't provide an explanation why the root cause for
> >> losing the interrupt can't be fixed, presumably you would have to say that
> >> you're adding a workaround for a suspected silicon bug.
> > After talking to the manufacturer, we got the following explanation
> > for this situation:
>
> To which manufacturer did you talk, Marvell or Amlogic?

Amlogic

>
> > "wifi may have dat1 interrupt coming in, without this the dat1
> > interrupt would be missed"
>
> I don't understand this sentence. W/o the interrupt coming in
> the interrupt would be missed? Can you explain it?

So the "without this" part of that sentence referred to the patch.
Which means that without the patch, the interrupt can be missed.

>
> > Supposedly this is fixed in their codebase.
>
> Which codebase do you mean? A specific vendor driver? Or firmware?

The vendor driver from openlinux2.amlogic.com handles SDIO interrupts
a bit differently. It uses mmc_signal_sdio_irq()

>
> > Unfortunately we were not able to find out more and can't prepare a
> > patch with a proper explanation.
>
> The workaround is rather simple, so we should add it.
> It's just unfortunate that we have no idea about the root cause of the issue.

After doing a more long term stress test, it was revealed that this
patch is still not sufficient, but only masks the underlying problem
better. Reverting 066ecde fixes the issue fully for us (verified
overnight with iperf).
v1 and v2 also fix the issue, but v3 does not correct the bug when
WiFi is stressed for a longer time. And therefore it should not be
used.
Could you please give us some advice on how to investigate this further?
Heiner Kallweit Jan. 19, 2023, 7:53 a.m. UTC | #7
On 18.01.2023 14:32, Peter Suti wrote:
> On Thu, Jan 12, 2023 at 10:27 PM Heiner Kallweit <hkallweit1@gmail.com> wrote:
>>
>> On 09.01.2023 12:52, Peter Suti wrote:
>>> On Wed, Dec 14, 2022 at 10:33 PM Heiner Kallweit <hkallweit1@gmail.com> wrote:
>>>>
>>>> On 14.12.2022 14:46, Peter Suti wrote:
>>>>>
>>>>> diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c
>>>>> index 6e5ea0213b47..7d3ee2f9a7f6 100644
>>>>> --- a/drivers/mmc/host/meson-gx-mmc.c
>>>>> +++ b/drivers/mmc/host/meson-gx-mmc.c
>>>>> @@ -1023,6 +1023,22 @@ static irqreturn_t meson_mmc_irq(int irq, void *dev_id)
>>>>>       if (ret == IRQ_HANDLED)
>>>>>               meson_mmc_request_done(host->mmc, cmd->mrq);
>>>>>
>>>>> +     /*
>>>>> +     * Sometimes after we ack all raised interrupts,
>>>>> +     * an IRQ_SDIO can still be pending, which can get lost.
>>>>> +     *
>>>>
>>>> A reader may scratch his head here and wonder how the interrupt can get lost,
>>>> and why adding a workaround instead of eliminating the root cause for losing
>>>> the interrupt. If you can't provide an explanation why the root cause for
>>>> losing the interrupt can't be fixed, presumably you would have to say that
>>>> you're adding a workaround for a suspected silicon bug.
>>> After talking to the manufacturer, we got the following explanation
>>> for this situation:
>>
>> To which manufacturer did you talk, Marvell or Amlogic?
> 
> Amlogic
> 
>>
>>> "wifi may have dat1 interrupt coming in, without this the dat1
>>> interrupt would be missed"
>>
>> I don't understand this sentence. W/o the interrupt coming in
>> the interrupt would be missed? Can you explain it?
> 
> So the "without this" part of that sentence referred to the patch.
> Which means that without the patch, the interrupt can be missed.
> 

Did you ask Amlogic for the root cause of the problem? Silicon bug?

>>
>>> Supposedly this is fixed in their codebase.
>>
>> Which codebase do you mean? A specific vendor driver? Or firmware?
> 
> The vendor driver from openlinux2.amlogic.com handles SDIO interrupts
> a bit differently. It uses mmc_signal_sdio_irq()
> 
This is the legacy API for SDIO irq.

>>
>>> Unfortunately we were not able to find out more and can't prepare a
>>> patch with a proper explanation.
>>
>> The workaround is rather simple, so we should add it.
>> It's just unfortunate that we have no idea about the root cause of the issue.
> 
> After doing a more long term stress test, it was revealed that this
> patch is still not sufficient, but only masks the underlying problem
> better. Reverting 066ecde fixes the issue fully for us (verified
> overnight with iperf).
> v1 and v2 also fix the issue, but v3 does not correct the bug when
> WiFi is stressed for a longer time. And therefore it should not be
> used.
> Could you please give us some advice on how to investigate this further?

When talking about v2 of the patch, it includes a number of actual changes:

- hold lock from start to end of interrupt handler
- if SDIO irq was disabled when entering the irq handler I see no change
- if SDIO irq was enabled
  - disable SDIO irq at start of irq handler, no matter which type of interrupt was received
    -> means disable SDIO irq during irq handler even if some other interrupt was received
  - if SDIO and another interrupt source flag is set, leave irq handler with SDIO irq enabled.
    Not sure whether that's intentional, you can use the small patch listed below on top of v2
    to check whether SDIO irq still works ok or not.
    

At least to me it's not clear which (or all?) of the changes are needed to work
around the problem. So you could check whether SDIO irq still works fine if you
remove a single change from the patch.


diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c
index f4b102b85..711431706 100644
--- a/drivers/mmc/host/meson-gx-mmc.c
+++ b/drivers/mmc/host/meson-gx-mmc.c
@@ -994,6 +994,7 @@ static irqreturn_t meson_mmc_irq(int irq, void *dev_id)
 			spin_unlock(&host->lock);
 			return IRQ_HANDLED;
 		}
+		irq_enabled = false;
 	}
 
 	if (WARN_ON(!cmd))
Heiner Kallweit Jan. 19, 2023, 7:37 p.m. UTC | #8
On 18.01.2023 14:32, Peter Suti wrote:
> On Thu, Jan 12, 2023 at 10:27 PM Heiner Kallweit <hkallweit1@gmail.com> wrote:
>>
>> On 09.01.2023 12:52, Peter Suti wrote:
>>> On Wed, Dec 14, 2022 at 10:33 PM Heiner Kallweit <hkallweit1@gmail.com> wrote:
>>>>
>>>> On 14.12.2022 14:46, Peter Suti wrote:
>>>>>
>>>>> diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c
>>>>> index 6e5ea0213b47..7d3ee2f9a7f6 100644
>>>>> --- a/drivers/mmc/host/meson-gx-mmc.c
>>>>> +++ b/drivers/mmc/host/meson-gx-mmc.c
>>>>> @@ -1023,6 +1023,22 @@ static irqreturn_t meson_mmc_irq(int irq, void *dev_id)
>>>>>       if (ret == IRQ_HANDLED)
>>>>>               meson_mmc_request_done(host->mmc, cmd->mrq);
>>>>>
>>>>> +     /*
>>>>> +     * Sometimes after we ack all raised interrupts,
>>>>> +     * an IRQ_SDIO can still be pending, which can get lost.
>>>>> +     *
>>>>
>>>> A reader may scratch his head here and wonder how the interrupt can get lost,
>>>> and why adding a workaround instead of eliminating the root cause for losing
>>>> the interrupt. If you can't provide an explanation why the root cause for
>>>> losing the interrupt can't be fixed, presumably you would have to say that
>>>> you're adding a workaround for a suspected silicon bug.
>>> After talking to the manufacturer, we got the following explanation
>>> for this situation:
>>
>> To which manufacturer did you talk, Marvell or Amlogic?
> 
> Amlogic
> 
>>
>>> "wifi may have dat1 interrupt coming in, without this the dat1
>>> interrupt would be missed"
>>
>> I don't understand this sentence. W/o the interrupt coming in
>> the interrupt would be missed? Can you explain it?
> 
> So the "without this" part of that sentence referred to the patch.
> Which means that without the patch, the interrupt can be missed.
> 
>>
>>> Supposedly this is fixed in their codebase.
>>
>> Which codebase do you mean? A specific vendor driver? Or firmware?
> 
> The vendor driver from openlinux2.amlogic.com handles SDIO interrupts
> a bit differently. It uses mmc_signal_sdio_irq()
> 
>>
>>> Unfortunately we were not able to find out more and can't prepare a
>>> patch with a proper explanation.
>>
>> The workaround is rather simple, so we should add it.
>> It's just unfortunate that we have no idea about the root cause of the issue.
> 
> After doing a more long term stress test, it was revealed that this
> patch is still not sufficient, but only masks the underlying problem
> better. Reverting 066ecde fixes the issue fully for us (verified
> overnight with iperf).
> v1 and v2 also fix the issue, but v3 does not correct the bug when
> WiFi is stressed for a longer time. And therefore it should not be
> used.
> Could you please give us some advice on how to investigate this further?

One more thought:
When checking device tree I found that my system uses IRQ_TYPE_LEVEL_HIGH
for the SDIO interrupt. meson-g12-common.dtsi uses IRQ_TYPE_EDGE_RISING
in mainline Linux, however vendor kernel uses IRQ_TYPE_LEVEL_HIGH
in meson-g12-common.dtsi. A wrong interrupt trigger type may result in
lost interrupts.
So you could check which trigger type your system uses for the SDIO interrupt.
If it's IRQ_TYPE_EDGE_RISING, re-test with IRQ_TYPE_LEVEL_HIGH.
Peter Suti Jan. 24, 2023, 7:29 a.m. UTC | #9
On Thu, Jan 19, 2023 at 8:37 PM Heiner Kallweit <hkallweit1@gmail.com> wrote:
>
> > v1 and v2 also fix the issue, but v3 does not correct the bug when
> > WiFi is stressed for a longer time. And therefore it should not be
> > used.
> > Could you please give us some advice on how to investigate this further?
>
> One more thought:
> When checking device tree I found that my system uses IRQ_TYPE_LEVEL_HIGH
> for the SDIO interrupt. meson-g12-common.dtsi uses IRQ_TYPE_EDGE_RISING
> in mainline Linux, however vendor kernel uses IRQ_TYPE_LEVEL_HIGH
> in meson-g12-common.dtsi. A wrong interrupt trigger type may result in
> lost interrupts.
> So you could check which trigger type your system uses for the SDIO interrupt.
> If it's IRQ_TYPE_EDGE_RISING, re-test with IRQ_TYPE_LEVEL_HIGH.

Thank you for this suggestion, this change fixes the problem fully.
With the interrupt trigger type set to IRQ_TYPE_LEVEL_HIGH everything
works as expected even when WiFI is stressed for a long time.
diff mbox series

Patch

diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c
index 6e5ea0213b47..972024d57d1c 100644
--- a/drivers/mmc/host/meson-gx-mmc.c
+++ b/drivers/mmc/host/meson-gx-mmc.c
@@ -950,6 +950,10 @@  static irqreturn_t meson_mmc_irq(int irq, void *dev_id)
 	struct mmc_command *cmd;
 	u32 status, raw_status;
 	irqreturn_t ret = IRQ_NONE;
+	unsigned long flags;
+
+	spin_lock_irqsave(&host->lock, flags);
+	__meson_mmc_enable_sdio_irq(host->mmc, 0);
 
 	raw_status = readl(host->regs + SD_EMMC_STATUS);
 	status = raw_status & (IRQ_EN_MASK | IRQ_SDIO);
@@ -958,11 +962,11 @@  static irqreturn_t meson_mmc_irq(int irq, void *dev_id)
 		dev_dbg(host->dev,
 			"Unexpected IRQ! irq_en 0x%08lx - status 0x%08x\n",
 			 IRQ_EN_MASK | IRQ_SDIO, raw_status);
-		return IRQ_NONE;
+		goto out_unlock;
 	}
 
 	if (WARN_ON(!host))
-		return IRQ_NONE;
+		goto out_unlock;
 
 	/* ack all raised interrupts */
 	writel(status, host->regs + SD_EMMC_STATUS);
@@ -970,17 +974,16 @@  static irqreturn_t meson_mmc_irq(int irq, void *dev_id)
 	cmd = host->cmd;
 
 	if (status & IRQ_SDIO) {
-		spin_lock(&host->lock);
-		__meson_mmc_enable_sdio_irq(host->mmc, 0);
 		sdio_signal_irq(host->mmc);
-		spin_unlock(&host->lock);
 		status &= ~IRQ_SDIO;
-		if (!status)
+		if (!status) {
+			spin_unlock_irqrestore(&host->lock, flags);
 			return IRQ_HANDLED;
+		}
 	}
 
 	if (WARN_ON(!cmd))
-		return IRQ_NONE;
+		goto out_unlock;
 
 	cmd->error = 0;
 	if (status & IRQ_CRC_ERR) {
@@ -1023,6 +1026,10 @@  static irqreturn_t meson_mmc_irq(int irq, void *dev_id)
 	if (ret == IRQ_HANDLED)
 		meson_mmc_request_done(host->mmc, cmd->mrq);
 
+out_unlock:
+	__meson_mmc_enable_sdio_irq(host->mmc, 1);
+	spin_unlock_irqrestore(&host->lock, flags);
+
 	return ret;
 }