diff mbox series

[v2] mmc: sdhci-pci-gli: GL975x: Mask rootport's replay timer timeout during suspend

Message ID 20231221032147.434647-1-kai.heng.feng@canonical.com
State New
Headers show
Series [v2] mmc: sdhci-pci-gli: GL975x: Mask rootport's replay timer timeout during suspend | expand

Commit Message

Kai-Heng Feng Dec. 21, 2023, 3:21 a.m. UTC
Spamming `lspci -vv` can still observe the replay timer timeout error
even after commit 015c9cbcf0ad ("mmc: sdhci-pci-gli: GL9750: Mask the
replay timer timeout of AER"), albeit with a lower reproduce rate.

Such AER interrupt can still prevent the system from suspending, so let
root port mask and unmask replay timer timeout during suspend and
resume, respectively.

Cc: Victor Shih <victor.shih@genesyslogic.com.tw>
Cc: Ben Chuang <benchuanggli@gmail.com>
Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
---
v2:
 - Change subject to reflect it works on GL9750 & GL9755
 - Fix when aer_cap is missing
 
 drivers/mmc/host/sdhci-pci-core.c |  2 +-
 drivers/mmc/host/sdhci-pci-gli.c  | 55 +++++++++++++++++++++++++++++--
 drivers/mmc/host/sdhci-pci.h      |  1 +
 3 files changed, 55 insertions(+), 3 deletions(-)

Comments

Ulf Hansson Jan. 3, 2024, 10:52 a.m. UTC | #1
On Thu, 21 Dec 2023 at 04:23, Kai-Heng Feng <kai.heng.feng@canonical.com> wrote:
>
> Spamming `lspci -vv` can still observe the replay timer timeout error
> even after commit 015c9cbcf0ad ("mmc: sdhci-pci-gli: GL9750: Mask the
> replay timer timeout of AER"), albeit with a lower reproduce rate.
>
> Such AER interrupt can still prevent the system from suspending, so let
> root port mask and unmask replay timer timeout during suspend and
> resume, respectively.
>
> Cc: Victor Shih <victor.shih@genesyslogic.com.tw>
> Cc: Ben Chuang <benchuanggli@gmail.com>
> Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
> ---
> v2:
>  - Change subject to reflect it works on GL9750 & GL9755
>  - Fix when aer_cap is missing
>
>  drivers/mmc/host/sdhci-pci-core.c |  2 +-
>  drivers/mmc/host/sdhci-pci-gli.c  | 55 +++++++++++++++++++++++++++++--
>  drivers/mmc/host/sdhci-pci.h      |  1 +
>  3 files changed, 55 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/mmc/host/sdhci-pci-core.c b/drivers/mmc/host/sdhci-pci-core.c
> index 025b31aa712c..59ae4da72974 100644
> --- a/drivers/mmc/host/sdhci-pci-core.c
> +++ b/drivers/mmc/host/sdhci-pci-core.c
> @@ -68,7 +68,7 @@ static int sdhci_pci_init_wakeup(struct sdhci_pci_chip *chip)
>         return 0;
>  }
>
> -static int sdhci_pci_suspend_host(struct sdhci_pci_chip *chip)
> +int sdhci_pci_suspend_host(struct sdhci_pci_chip *chip)
>  {
>         int i, ret;
>
> diff --git a/drivers/mmc/host/sdhci-pci-gli.c b/drivers/mmc/host/sdhci-pci-gli.c
> index 77911a57b12c..54943e9df835 100644
> --- a/drivers/mmc/host/sdhci-pci-gli.c
> +++ b/drivers/mmc/host/sdhci-pci-gli.c
> @@ -1429,6 +1429,55 @@ static int sdhci_pci_gli_resume(struct sdhci_pci_chip *chip)
>         return sdhci_pci_resume_host(chip);
>  }
>
> +#ifdef CONFIG_PCIEAER
> +static void mask_replay_timer_timeout(struct pci_dev *pdev)
> +{
> +       struct pci_dev *parent = pci_upstream_bridge(pdev);
> +       u32 val;
> +
> +       if (!parent || !parent->aer_cap)

Wouldn't it be more correct to use pci_aer_available(), rather than
just checking the aer_cap?

If pci_aer_available() can be used, we wouldn't even need the stubs as
the is already stubs for pci_aer_available().

> +               return;
> +
> +       pci_read_config_dword(parent, parent->aer_cap + PCI_ERR_COR_MASK, &val);
> +       val |= PCI_ERR_COR_REP_TIMER;
> +       pci_write_config_dword(parent, parent->aer_cap + PCI_ERR_COR_MASK, val);
> +}
> +
> +static void unmask_replay_timer_timeout(struct pci_dev *pdev)
> +{
> +       struct pci_dev *parent = pci_upstream_bridge(pdev);
> +       u32 val;
> +
> +       if (!parent || !parent->aer_cap)
> +               return;
> +
> +       pci_read_config_dword(pdev, parent->aer_cap + PCI_ERR_COR_MASK, &val);
> +       val &= ~PCI_ERR_COR_REP_TIMER;
> +       pci_write_config_dword(pdev, parent->aer_cap + PCI_ERR_COR_MASK, val);
> +}
> +#else
> +static inline void mask_replay_timer_timeout(struct pci_dev *pdev) { }
> +static inline void unmask_replay_timer_timeout(struct pci_dev *pdev) {  }
> +#endif
> +
> +static int sdhci_pci_gl975x_suspend(struct sdhci_pci_chip *chip)
> +{
> +       mask_replay_timer_timeout(chip->pdev);
> +
> +       return sdhci_pci_suspend_host(chip);
> +}
> +
> +static int sdhci_pci_gl975x_resume(struct sdhci_pci_chip *chip)
> +{
> +       int ret;
> +
> +       ret = sdhci_pci_gli_resume(chip);
> +
> +       unmask_replay_timer_timeout(chip->pdev);
> +
> +       return ret;
> +}
> +
>  static int gl9763e_resume(struct sdhci_pci_chip *chip)
>  {
>         struct sdhci_pci_slot *slot = chip->slots[0];
> @@ -1547,7 +1596,8 @@ const struct sdhci_pci_fixes sdhci_gl9755 = {
>         .probe_slot     = gli_probe_slot_gl9755,
>         .ops            = &sdhci_gl9755_ops,
>  #ifdef CONFIG_PM_SLEEP
> -       .resume         = sdhci_pci_gli_resume,
> +       .suspend        = sdhci_pci_gl975x_suspend,
> +       .resume         = sdhci_pci_gl975x_resume,
>  #endif
>  };
>
> @@ -1570,7 +1620,8 @@ const struct sdhci_pci_fixes sdhci_gl9750 = {
>         .probe_slot     = gli_probe_slot_gl9750,
>         .ops            = &sdhci_gl9750_ops,
>  #ifdef CONFIG_PM_SLEEP
> -       .resume         = sdhci_pci_gli_resume,
> +       .suspend        = sdhci_pci_gl975x_suspend,
> +       .resume         = sdhci_pci_gl975x_resume,
>  #endif
>  };
>
> diff --git a/drivers/mmc/host/sdhci-pci.h b/drivers/mmc/host/sdhci-pci.h
> index 153704f812ed..19253dce687d 100644
> --- a/drivers/mmc/host/sdhci-pci.h
> +++ b/drivers/mmc/host/sdhci-pci.h
> @@ -190,6 +190,7 @@ static inline void *sdhci_pci_priv(struct sdhci_pci_slot *slot)
>  }
>
>  #ifdef CONFIG_PM_SLEEP
> +int sdhci_pci_suspend_host(struct sdhci_pci_chip *chip);
>  int sdhci_pci_resume_host(struct sdhci_pci_chip *chip);
>  #endif
>  int sdhci_pci_enable_dma(struct sdhci_host *host);

Kind regards
Uffe
Kai-Heng Feng Jan. 4, 2024, 4:10 a.m. UTC | #2
On Wed, Jan 3, 2024 at 6:53 PM Ulf Hansson <ulf.hansson@linaro.org> wrote:
>
> On Thu, 21 Dec 2023 at 04:23, Kai-Heng Feng <kai.heng.feng@canonical.com> wrote:
> >
> > Spamming `lspci -vv` can still observe the replay timer timeout error
> > even after commit 015c9cbcf0ad ("mmc: sdhci-pci-gli: GL9750: Mask the
> > replay timer timeout of AER"), albeit with a lower reproduce rate.
> >
> > Such AER interrupt can still prevent the system from suspending, so let
> > root port mask and unmask replay timer timeout during suspend and
> > resume, respectively.
> >
> > Cc: Victor Shih <victor.shih@genesyslogic.com.tw>
> > Cc: Ben Chuang <benchuanggli@gmail.com>
> > Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
> > ---
> > v2:
> >  - Change subject to reflect it works on GL9750 & GL9755
> >  - Fix when aer_cap is missing
> >
> >  drivers/mmc/host/sdhci-pci-core.c |  2 +-
> >  drivers/mmc/host/sdhci-pci-gli.c  | 55 +++++++++++++++++++++++++++++--
> >  drivers/mmc/host/sdhci-pci.h      |  1 +
> >  3 files changed, 55 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/mmc/host/sdhci-pci-core.c b/drivers/mmc/host/sdhci-pci-core.c
> > index 025b31aa712c..59ae4da72974 100644
> > --- a/drivers/mmc/host/sdhci-pci-core.c
> > +++ b/drivers/mmc/host/sdhci-pci-core.c
> > @@ -68,7 +68,7 @@ static int sdhci_pci_init_wakeup(struct sdhci_pci_chip *chip)
> >         return 0;
> >  }
> >
> > -static int sdhci_pci_suspend_host(struct sdhci_pci_chip *chip)
> > +int sdhci_pci_suspend_host(struct sdhci_pci_chip *chip)
> >  {
> >         int i, ret;
> >
> > diff --git a/drivers/mmc/host/sdhci-pci-gli.c b/drivers/mmc/host/sdhci-pci-gli.c
> > index 77911a57b12c..54943e9df835 100644
> > --- a/drivers/mmc/host/sdhci-pci-gli.c
> > +++ b/drivers/mmc/host/sdhci-pci-gli.c
> > @@ -1429,6 +1429,55 @@ static int sdhci_pci_gli_resume(struct sdhci_pci_chip *chip)
> >         return sdhci_pci_resume_host(chip);
> >  }
> >
> > +#ifdef CONFIG_PCIEAER
> > +static void mask_replay_timer_timeout(struct pci_dev *pdev)
> > +{
> > +       struct pci_dev *parent = pci_upstream_bridge(pdev);
> > +       u32 val;
> > +
> > +       if (!parent || !parent->aer_cap)
>
> Wouldn't it be more correct to use pci_aer_available(), rather than
> just checking the aer_cap?

pci_aer_available() is more of a global check, so checking aer_cap is
still required for the device.

>
> If pci_aer_available() can be used, we wouldn't even need the stubs as
> the is already stubs for pci_aer_available().

A helper that checks both aer_cap and  pci_aer_available() can be
added for such purpose, but there aren't many users of that.

Kai-Heng

>
> > +               return;
> > +
> > +       pci_read_config_dword(parent, parent->aer_cap + PCI_ERR_COR_MASK, &val);
> > +       val |= PCI_ERR_COR_REP_TIMER;
> > +       pci_write_config_dword(parent, parent->aer_cap + PCI_ERR_COR_MASK, val);
> > +}
> > +
> > +static void unmask_replay_timer_timeout(struct pci_dev *pdev)
> > +{
> > +       struct pci_dev *parent = pci_upstream_bridge(pdev);
> > +       u32 val;
> > +
> > +       if (!parent || !parent->aer_cap)
> > +               return;
> > +
> > +       pci_read_config_dword(pdev, parent->aer_cap + PCI_ERR_COR_MASK, &val);
> > +       val &= ~PCI_ERR_COR_REP_TIMER;
> > +       pci_write_config_dword(pdev, parent->aer_cap + PCI_ERR_COR_MASK, val);
> > +}
> > +#else
> > +static inline void mask_replay_timer_timeout(struct pci_dev *pdev) { }
> > +static inline void unmask_replay_timer_timeout(struct pci_dev *pdev) {  }
> > +#endif
> > +
> > +static int sdhci_pci_gl975x_suspend(struct sdhci_pci_chip *chip)
> > +{
> > +       mask_replay_timer_timeout(chip->pdev);
> > +
> > +       return sdhci_pci_suspend_host(chip);
> > +}
> > +
> > +static int sdhci_pci_gl975x_resume(struct sdhci_pci_chip *chip)
> > +{
> > +       int ret;
> > +
> > +       ret = sdhci_pci_gli_resume(chip);
> > +
> > +       unmask_replay_timer_timeout(chip->pdev);
> > +
> > +       return ret;
> > +}
> > +
> >  static int gl9763e_resume(struct sdhci_pci_chip *chip)
> >  {
> >         struct sdhci_pci_slot *slot = chip->slots[0];
> > @@ -1547,7 +1596,8 @@ const struct sdhci_pci_fixes sdhci_gl9755 = {
> >         .probe_slot     = gli_probe_slot_gl9755,
> >         .ops            = &sdhci_gl9755_ops,
> >  #ifdef CONFIG_PM_SLEEP
> > -       .resume         = sdhci_pci_gli_resume,
> > +       .suspend        = sdhci_pci_gl975x_suspend,
> > +       .resume         = sdhci_pci_gl975x_resume,
> >  #endif
> >  };
> >
> > @@ -1570,7 +1620,8 @@ const struct sdhci_pci_fixes sdhci_gl9750 = {
> >         .probe_slot     = gli_probe_slot_gl9750,
> >         .ops            = &sdhci_gl9750_ops,
> >  #ifdef CONFIG_PM_SLEEP
> > -       .resume         = sdhci_pci_gli_resume,
> > +       .suspend        = sdhci_pci_gl975x_suspend,
> > +       .resume         = sdhci_pci_gl975x_resume,
> >  #endif
> >  };
> >
> > diff --git a/drivers/mmc/host/sdhci-pci.h b/drivers/mmc/host/sdhci-pci.h
> > index 153704f812ed..19253dce687d 100644
> > --- a/drivers/mmc/host/sdhci-pci.h
> > +++ b/drivers/mmc/host/sdhci-pci.h
> > @@ -190,6 +190,7 @@ static inline void *sdhci_pci_priv(struct sdhci_pci_slot *slot)
> >  }
> >
> >  #ifdef CONFIG_PM_SLEEP
> > +int sdhci_pci_suspend_host(struct sdhci_pci_chip *chip);
> >  int sdhci_pci_resume_host(struct sdhci_pci_chip *chip);
> >  #endif
> >  int sdhci_pci_enable_dma(struct sdhci_host *host);
>
> Kind regards
> Uffe
Adrian Hunter Jan. 4, 2024, 7:42 p.m. UTC | #3
On 4/01/24 06:10, Kai-Heng Feng wrote:
> On Wed, Jan 3, 2024 at 6:53 PM Ulf Hansson <ulf.hansson@linaro.org> wrote:
>>
>> On Thu, 21 Dec 2023 at 04:23, Kai-Heng Feng <kai.heng.feng@canonical.com> wrote:
>>>
>>> Spamming `lspci -vv` can still observe the replay timer timeout error
>>> even after commit 015c9cbcf0ad ("mmc: sdhci-pci-gli: GL9750: Mask the
>>> replay timer timeout of AER"), albeit with a lower reproduce rate.
>>>
>>> Such AER interrupt can still prevent the system from suspending, so let
>>> root port mask and unmask replay timer timeout during suspend and
>>> resume, respectively.
>>>
>>> Cc: Victor Shih <victor.shih@genesyslogic.com.tw>
>>> Cc: Ben Chuang <benchuanggli@gmail.com>
>>> Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
>>> ---
>>> v2:
>>>  - Change subject to reflect it works on GL9750 & GL9755
>>>  - Fix when aer_cap is missing
>>>
>>>  drivers/mmc/host/sdhci-pci-core.c |  2 +-
>>>  drivers/mmc/host/sdhci-pci-gli.c  | 55 +++++++++++++++++++++++++++++--
>>>  drivers/mmc/host/sdhci-pci.h      |  1 +
>>>  3 files changed, 55 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/mmc/host/sdhci-pci-core.c b/drivers/mmc/host/sdhci-pci-core.c
>>> index 025b31aa712c..59ae4da72974 100644
>>> --- a/drivers/mmc/host/sdhci-pci-core.c
>>> +++ b/drivers/mmc/host/sdhci-pci-core.c
>>> @@ -68,7 +68,7 @@ static int sdhci_pci_init_wakeup(struct sdhci_pci_chip *chip)
>>>         return 0;
>>>  }
>>>
>>> -static int sdhci_pci_suspend_host(struct sdhci_pci_chip *chip)
>>> +int sdhci_pci_suspend_host(struct sdhci_pci_chip *chip)
>>>  {
>>>         int i, ret;
>>>
>>> diff --git a/drivers/mmc/host/sdhci-pci-gli.c b/drivers/mmc/host/sdhci-pci-gli.c
>>> index 77911a57b12c..54943e9df835 100644
>>> --- a/drivers/mmc/host/sdhci-pci-gli.c
>>> +++ b/drivers/mmc/host/sdhci-pci-gli.c
>>> @@ -1429,6 +1429,55 @@ static int sdhci_pci_gli_resume(struct sdhci_pci_chip *chip)
>>>         return sdhci_pci_resume_host(chip);
>>>  }
>>>
>>> +#ifdef CONFIG_PCIEAER
>>> +static void mask_replay_timer_timeout(struct pci_dev *pdev)
>>> +{
>>> +       struct pci_dev *parent = pci_upstream_bridge(pdev);
>>> +       u32 val;
>>> +
>>> +       if (!parent || !parent->aer_cap)
>>
>> Wouldn't it be more correct to use pci_aer_available(), rather than
>> just checking the aer_cap?
> 
> pci_aer_available() is more of a global check, so checking aer_cap is
> still required for the device.

It is not obvious whether aer_cap is meant to be used outside PCI
internal code.  Maybe reading the offset directly is more
appropriate?

	aer_pos = pci_find_ext_capability(root, PCI_EXT_CAP_ID_ERR);


> 
>>
>> If pci_aer_available() can be used, we wouldn't even need the stubs as
>> the is already stubs for pci_aer_available().
> 
> A helper that checks both aer_cap and  pci_aer_available() can be
> added for such purpose, but there aren't many users of that.
> 
> Kai-Heng
> 
>>
>>> +               return;
>>> +
>>> +       pci_read_config_dword(parent, parent->aer_cap + PCI_ERR_COR_MASK, &val);
>>> +       val |= PCI_ERR_COR_REP_TIMER;
>>> +       pci_write_config_dword(parent, parent->aer_cap + PCI_ERR_COR_MASK, val);
>>> +}
>>> +
>>> +static void unmask_replay_timer_timeout(struct pci_dev *pdev)
>>> +{
>>> +       struct pci_dev *parent = pci_upstream_bridge(pdev);
>>> +       u32 val;
>>> +
>>> +       if (!parent || !parent->aer_cap)
>>> +               return;
>>> +
>>> +       pci_read_config_dword(pdev, parent->aer_cap + PCI_ERR_COR_MASK, &val);
>>> +       val &= ~PCI_ERR_COR_REP_TIMER;
>>> +       pci_write_config_dword(pdev, parent->aer_cap + PCI_ERR_COR_MASK, val);
>>> +}
>>> +#else
>>> +static inline void mask_replay_timer_timeout(struct pci_dev *pdev) { }
>>> +static inline void unmask_replay_timer_timeout(struct pci_dev *pdev) {  }
>>> +#endif
>>> +
>>> +static int sdhci_pci_gl975x_suspend(struct sdhci_pci_chip *chip)
>>> +{
>>> +       mask_replay_timer_timeout(chip->pdev);
>>> +
>>> +       return sdhci_pci_suspend_host(chip);
>>> +}
>>> +
>>> +static int sdhci_pci_gl975x_resume(struct sdhci_pci_chip *chip)
>>> +{
>>> +       int ret;
>>> +
>>> +       ret = sdhci_pci_gli_resume(chip);
>>> +
>>> +       unmask_replay_timer_timeout(chip->pdev);
>>> +
>>> +       return ret;
>>> +}
>>> +
>>>  static int gl9763e_resume(struct sdhci_pci_chip *chip)
>>>  {
>>>         struct sdhci_pci_slot *slot = chip->slots[0];
>>> @@ -1547,7 +1596,8 @@ const struct sdhci_pci_fixes sdhci_gl9755 = {
>>>         .probe_slot     = gli_probe_slot_gl9755,
>>>         .ops            = &sdhci_gl9755_ops,
>>>  #ifdef CONFIG_PM_SLEEP
>>> -       .resume         = sdhci_pci_gli_resume,
>>> +       .suspend        = sdhci_pci_gl975x_suspend,
>>> +       .resume         = sdhci_pci_gl975x_resume,
>>>  #endif
>>>  };
>>>
>>> @@ -1570,7 +1620,8 @@ const struct sdhci_pci_fixes sdhci_gl9750 = {
>>>         .probe_slot     = gli_probe_slot_gl9750,
>>>         .ops            = &sdhci_gl9750_ops,
>>>  #ifdef CONFIG_PM_SLEEP
>>> -       .resume         = sdhci_pci_gli_resume,
>>> +       .suspend        = sdhci_pci_gl975x_suspend,
>>> +       .resume         = sdhci_pci_gl975x_resume,
>>>  #endif
>>>  };
>>>
>>> diff --git a/drivers/mmc/host/sdhci-pci.h b/drivers/mmc/host/sdhci-pci.h
>>> index 153704f812ed..19253dce687d 100644
>>> --- a/drivers/mmc/host/sdhci-pci.h
>>> +++ b/drivers/mmc/host/sdhci-pci.h
>>> @@ -190,6 +190,7 @@ static inline void *sdhci_pci_priv(struct sdhci_pci_slot *slot)
>>>  }
>>>
>>>  #ifdef CONFIG_PM_SLEEP
>>> +int sdhci_pci_suspend_host(struct sdhci_pci_chip *chip);
>>>  int sdhci_pci_resume_host(struct sdhci_pci_chip *chip);
>>>  #endif
>>>  int sdhci_pci_enable_dma(struct sdhci_host *host);
>>
>> Kind regards
>> Uffe
Bjorn Helgaas Jan. 5, 2024, 9:19 p.m. UTC | #4
On Thu, Dec 21, 2023 at 11:21:47AM +0800, Kai-Heng Feng wrote:
> Spamming `lspci -vv` can still observe the replay timer timeout error
> even after commit 015c9cbcf0ad ("mmc: sdhci-pci-gli: GL9750: Mask the
> replay timer timeout of AER"), albeit with a lower reproduce rate.

I'm not sure what this is telling me.  By "spamming `lspci -vv`, do
you mean that if you run lspci continually, you still see Replay Timer
Timeout logged, e.g.,

  CESta:	... Timeout+

015c9cbcf0ad uses hard-coded PCI_GLI_9750_CORRERR_MASK offset and
PCI_GLI_9750_CORRERR_MASK_REPLAY_TIMER_TIMEOUT value, which look like
they *could* be PCI_ERR_COR_MASK and PCI_ERR_COR_REP_TIMER, but
without the lspci output I can't tell for sure.  If they are, it would
be nice to use the generic macros instead of defining new ones so it's
easier to analyze PCI_ERR_COR_MASK usage.

If 015c9cbcf0ad is updating the generic PCI_ERR_COR_MASK, it should
only prevent sending ERR_COR.  It should not affect the *logging* in
PCI_ERR_COR_STATUS (see PCIe r6.0, sec 6.2.3.2.2), so it shouldn't
affect the lspci output.

> Such AER interrupt can still prevent the system from suspending, so let
> root port mask and unmask replay timer timeout during suspend and
> resume, respectively.

015c9cbcf0ad looks like it masks PCI_ERR_COR_REP_TIMER in the gl975x
Endpoint, while this patch masks it in the upstream bridge (which
might be either a Root Port or a Switch Downstream Port, so the
subject and this sentence are not quite right).

015c9cbcf0ad says it is related to a hardware defect, and maybe this
patch is also (mention it if so).  Both patches can prevent ERR_COR
messages and the eventual AER interrupt, depending on whether the
Downstream Port or the Endpoint detects the Replay Timer Timeout.
Maybe this should have a Fixes: tag for 015c9cbcf0ad to try to keep
these together?

If 015c9cbcf0ad is actually updating PCI_ERR_COR_MASK, it would be
nice to clean that up, too.  And maybe PCI_ERR_COR_REP_TIMER should be
masked/restored at the same place for both the Downstream Port and the
Endpoint?

> +#ifdef CONFIG_PCIEAER
> +static void mask_replay_timer_timeout(struct pci_dev *pdev)
> +{
> +	struct pci_dev *parent = pci_upstream_bridge(pdev);
> +	u32 val;
> +
> +	if (!parent || !parent->aer_cap)
> +		return;
> +
> +	pci_read_config_dword(parent, parent->aer_cap + PCI_ERR_COR_MASK, &val);
> +	val |= PCI_ERR_COR_REP_TIMER;
> +	pci_write_config_dword(parent, parent->aer_cap + PCI_ERR_COR_MASK, val);
> +}
> +
> +static void unmask_replay_timer_timeout(struct pci_dev *pdev)
> +{
> +	struct pci_dev *parent = pci_upstream_bridge(pdev);
> +	u32 val;
> +
> +	if (!parent || !parent->aer_cap)
> +		return;
> +
> +	pci_read_config_dword(pdev, parent->aer_cap + PCI_ERR_COR_MASK, &val);
> +	val &= ~PCI_ERR_COR_REP_TIMER;

I think I would save the previous PCI_ERR_COR_REP_TIMER value and
restore it here, so it is preserved if there is ever a generic
interface via sysfs or similar to manage correctable error masking.

> +	pci_write_config_dword(pdev, parent->aer_cap + PCI_ERR_COR_MASK, val);
> +}
Kai-Heng Feng Jan. 12, 2024, 5:14 a.m. UTC | #5
On Sat, Jan 6, 2024 at 5:19 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
>
> On Thu, Dec 21, 2023 at 11:21:47AM +0800, Kai-Heng Feng wrote:
> > Spamming `lspci -vv` can still observe the replay timer timeout error
> > even after commit 015c9cbcf0ad ("mmc: sdhci-pci-gli: GL9750: Mask the
> > replay timer timeout of AER"), albeit with a lower reproduce rate.
>
> I'm not sure what this is telling me.  By "spamming `lspci -vv`, do
> you mean that if you run lspci continually, you still see Replay Timer
> Timeout logged, e.g.,
>
>   CESta:        ... Timeout+

Yes it's logged and the AER IRQ is raised.

>
> 015c9cbcf0ad uses hard-coded PCI_GLI_9750_CORRERR_MASK offset and
> PCI_GLI_9750_CORRERR_MASK_REPLAY_TIMER_TIMEOUT value, which look like
> they *could* be PCI_ERR_COR_MASK and PCI_ERR_COR_REP_TIMER, but
> without the lspci output I can't tell for sure.  If they are, it would
> be nice to use the generic macros instead of defining new ones so it's
> easier to analyze PCI_ERR_COR_MASK usage.
>
> If 015c9cbcf0ad is updating the generic PCI_ERR_COR_MASK, it should
> only prevent sending ERR_COR.  It should not affect the *logging* in
> PCI_ERR_COR_STATUS (see PCIe r6.0, sec 6.2.3.2.2), so it shouldn't
> affect the lspci output.

PCI_GLI_9750_CORRERR_MASK is specific to GLI 975x devices, so it
doesn't conform to generic PCI_ERR_COR_STATUS behavior.

The Timeout is masked with or without commit 015c9cbcf0ad:
CEMsk:  ... Timeout+


>
> > Such AER interrupt can still prevent the system from suspending, so let
> > root port mask and unmask replay timer timeout during suspend and
> > resume, respectively.
>
> 015c9cbcf0ad looks like it masks PCI_ERR_COR_REP_TIMER in the gl975x
> Endpoint, while this patch masks it in the upstream bridge (which
> might be either a Root Port or a Switch Downstream Port, so the
> subject and this sentence are not quite right).

OK, will change it to upstream bridge in next revision.

>
> 015c9cbcf0ad says it is related to a hardware defect, and maybe this
> patch is also (mention it if so).  Both patches can prevent ERR_COR
> messages and the eventual AER interrupt, depending on whether the
> Downstream Port or the Endpoint detects the Replay Timer Timeout.
> Maybe this should have a Fixes: tag for 015c9cbcf0ad to try to keep
> these together?

Sure. This patch is intend to cover more ground based on 015c9cbcf0ad.


>
> If 015c9cbcf0ad is actually updating PCI_ERR_COR_MASK, it would be
> nice to clean that up, too.  And maybe PCI_ERR_COR_REP_TIMER should be
> masked/restored at the same place for both the Downstream Port and the
> Endpoint?

Since PCI_ERR_COR_REP_TIMER is already masked before 015c9cbcf0ad, so
I didn't think that's necessary.
Do you think it should still be masked just to be safe?

>
> > +#ifdef CONFIG_PCIEAER
> > +static void mask_replay_timer_timeout(struct pci_dev *pdev)
> > +{
> > +     struct pci_dev *parent = pci_upstream_bridge(pdev);
> > +     u32 val;
> > +
> > +     if (!parent || !parent->aer_cap)
> > +             return;
> > +
> > +     pci_read_config_dword(parent, parent->aer_cap + PCI_ERR_COR_MASK, &val);
> > +     val |= PCI_ERR_COR_REP_TIMER;
> > +     pci_write_config_dword(parent, parent->aer_cap + PCI_ERR_COR_MASK, val);
> > +}
> > +
> > +static void unmask_replay_timer_timeout(struct pci_dev *pdev)
> > +{
> > +     struct pci_dev *parent = pci_upstream_bridge(pdev);
> > +     u32 val;
> > +
> > +     if (!parent || !parent->aer_cap)
> > +             return;
> > +
> > +     pci_read_config_dword(pdev, parent->aer_cap + PCI_ERR_COR_MASK, &val);
> > +     val &= ~PCI_ERR_COR_REP_TIMER;
>
> I think I would save the previous PCI_ERR_COR_REP_TIMER value and
> restore it here, so it is preserved if there is ever a generic
> interface via sysfs or similar to manage correctable error masking.

Makes sense, will do in next revision.

Kai-Heng

>
> > +     pci_write_config_dword(pdev, parent->aer_cap + PCI_ERR_COR_MASK, val);
> > +}
Bjorn Helgaas Jan. 12, 2024, 5:37 p.m. UTC | #6
On Fri, Jan 12, 2024 at 01:14:42PM +0800, Kai-Heng Feng wrote:
> On Sat, Jan 6, 2024 at 5:19 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> >
> > On Thu, Dec 21, 2023 at 11:21:47AM +0800, Kai-Heng Feng wrote:
> > > Spamming `lspci -vv` can still observe the replay timer timeout error
> > > even after commit 015c9cbcf0ad ("mmc: sdhci-pci-gli: GL9750: Mask the
> > > replay timer timeout of AER"), albeit with a lower reproduce rate.
> >
> > I'm not sure what this is telling me.  By "spamming `lspci -vv`, do
> > you mean that if you run lspci continually, you still see Replay Timer
> > Timeout logged, e.g.,
> >
> >   CESta:        ... Timeout+
> 
> Yes it's logged and the AER IRQ is raised.

IIUC the AER IRQ is the important thing.

Neither 015c9cbcf0ad nor this patch affects logging in
PCI_ERR_COR_STATUS, so the lspci output won't change and mentioning it
here doesn't add useful information.

I'd suggest more specific wording than "spamming `lspci -vv`", e.g.,

  015c9cbcf0ad ("mmc: sdhci-pci-gli: GL9750: Mask the replay timer
  timeout of AER") masks Replay Timer Timeout errors at the GL975x
  Endpoint.  When the Endpoint detects these errors, it still logs
  them in its PCI_ERR_COR_STATUS, but masking prevents it from sending
  ERR_COR messages upstream.

  The Downstream Port leading to a GL975x Endpoint is unaffected by
  015c9cbcf0ad.  Previously, when that Port detected a Replay Timer
  Timeout, it sent an ERR_COR message upstream, which eventually
  caused an AER IRQ, which prevented the system from suspending.

  Mask Replay Timer Timeout errors at the Downstream Port.  The errors
  will still be logged in PCI_ERR_COR_STATUS, but no ERR_COR will be
  sent.

> > 015c9cbcf0ad uses hard-coded PCI_GLI_9750_CORRERR_MASK offset and
> > PCI_GLI_9750_CORRERR_MASK_REPLAY_TIMER_TIMEOUT value, which look like
> > they *could* be PCI_ERR_COR_MASK and PCI_ERR_COR_REP_TIMER, but
> > without the lspci output I can't tell for sure.  If they are, it would
> > be nice to use the generic macros instead of defining new ones so it's
> > easier to analyze PCI_ERR_COR_MASK usage.
> >
> > If 015c9cbcf0ad is updating the generic PCI_ERR_COR_MASK, it should
> > only prevent sending ERR_COR.  It should not affect the *logging* in
> > PCI_ERR_COR_STATUS (see PCIe r6.0, sec 6.2.3.2.2), so it shouldn't
> > affect the lspci output.
> 
> PCI_GLI_9750_CORRERR_MASK is specific to GLI 975x devices, so it
> doesn't conform to generic PCI_ERR_COR_STATUS behavior.

*Could* 015c9cbcf0ad have used the generic PCI_ERR_COR_MASK to
accomplish the same effect?  Is there an advantage to using the
device-specific PCI_GLI_9750_CORRERR_MASK?

If masking via PCI_ERR_COR_MASK would work, that would be much better
because the PCI core can see, manage, and make that visible, e.g., via
sysfs.  The core doesn't do that today, but people are working on it.

> > If 015c9cbcf0ad is actually updating PCI_ERR_COR_MASK, it would be
> > nice to clean that up, too.  And maybe PCI_ERR_COR_REP_TIMER should be
> > masked/restored at the same place for both the Downstream Port and the
> > Endpoint?
> 
> Since PCI_ERR_COR_REP_TIMER is already masked before 015c9cbcf0ad,
> so I didn't think that's necessary.  Do you think it should still be
> masked just to be safe?

Did you mean "PCI_ERR_COR_REP_TIMER is already masked *by*
015c9cbcf0ad"?

If masking PCI_ERR_COR_REP_TIMER using the generic PCI_ERR_COR_MASK in
the GL975x would have the same effect as masking it with
PCI_GLI_9750_CORRERR_MASK, then I think you should *only* use the
generic PCI_ERR_COR_MASK.

No need to do both if the generic one is sufficient.  And I think both
should be done in the same place since they're basically solving the
same problem, just at both ends of the link.

Bjorn
Kai-Heng Feng Jan. 18, 2024, 6:40 a.m. UTC | #7
On Sat, Jan 13, 2024 at 1:37 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
>
> On Fri, Jan 12, 2024 at 01:14:42PM +0800, Kai-Heng Feng wrote:
> > On Sat, Jan 6, 2024 at 5:19 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > >
> > > On Thu, Dec 21, 2023 at 11:21:47AM +0800, Kai-Heng Feng wrote:
> > > > Spamming `lspci -vv` can still observe the replay timer timeout error
> > > > even after commit 015c9cbcf0ad ("mmc: sdhci-pci-gli: GL9750: Mask the
> > > > replay timer timeout of AER"), albeit with a lower reproduce rate.
> > >
> > > I'm not sure what this is telling me.  By "spamming `lspci -vv`, do
> > > you mean that if you run lspci continually, you still see Replay Timer
> > > Timeout logged, e.g.,
> > >
> > >   CESta:        ... Timeout+
> >
> > Yes it's logged and the AER IRQ is raised.
>
> IIUC the AER IRQ is the important thing.
>
> Neither 015c9cbcf0ad nor this patch affects logging in
> PCI_ERR_COR_STATUS, so the lspci output won't change and mentioning it
> here doesn't add useful information.

You are right. That's just a way to access config space to reproduce the issue.

>
> I'd suggest more specific wording than "spamming `lspci -vv`", e.g.,
>
>   015c9cbcf0ad ("mmc: sdhci-pci-gli: GL9750: Mask the replay timer
>   timeout of AER") masks Replay Timer Timeout errors at the GL975x
>   Endpoint.  When the Endpoint detects these errors, it still logs
>   them in its PCI_ERR_COR_STATUS, but masking prevents it from sending
>   ERR_COR messages upstream.
>
>   The Downstream Port leading to a GL975x Endpoint is unaffected by
>   015c9cbcf0ad.  Previously, when that Port detected a Replay Timer
>   Timeout, it sent an ERR_COR message upstream, which eventually
>   caused an AER IRQ, which prevented the system from suspending.
>
>   Mask Replay Timer Timeout errors at the Downstream Port.  The errors
>   will still be logged in PCI_ERR_COR_STATUS, but no ERR_COR will be
>   sent.

That's phrased much better then mine :)

>
> > > 015c9cbcf0ad uses hard-coded PCI_GLI_9750_CORRERR_MASK offset and
> > > PCI_GLI_9750_CORRERR_MASK_REPLAY_TIMER_TIMEOUT value, which look like
> > > they *could* be PCI_ERR_COR_MASK and PCI_ERR_COR_REP_TIMER, but
> > > without the lspci output I can't tell for sure.  If they are, it would
> > > be nice to use the generic macros instead of defining new ones so it's
> > > easier to analyze PCI_ERR_COR_MASK usage.
> > >
> > > If 015c9cbcf0ad is updating the generic PCI_ERR_COR_MASK, it should
> > > only prevent sending ERR_COR.  It should not affect the *logging* in
> > > PCI_ERR_COR_STATUS (see PCIe r6.0, sec 6.2.3.2.2), so it shouldn't
> > > affect the lspci output.
> >
> > PCI_GLI_9750_CORRERR_MASK is specific to GLI 975x devices, so it
> > doesn't conform to generic PCI_ERR_COR_STATUS behavior.
>
> *Could* 015c9cbcf0ad have used the generic PCI_ERR_COR_MASK to
> accomplish the same effect?  Is there an advantage to using the
> device-specific PCI_GLI_9750_CORRERR_MASK?
>
> If masking via PCI_ERR_COR_MASK would work, that would be much better
> because the PCI core can see, manage, and make that visible, e.g., via
> sysfs.  The core doesn't do that today, but people are working on it.

I don't think so. Please see below.

>
> > > If 015c9cbcf0ad is actually updating PCI_ERR_COR_MASK, it would be
> > > nice to clean that up, too.  And maybe PCI_ERR_COR_REP_TIMER should be
> > > masked/restored at the same place for both the Downstream Port and the
> > > Endpoint?
> >
> > Since PCI_ERR_COR_REP_TIMER is already masked before 015c9cbcf0ad,
> > so I didn't think that's necessary.  Do you think it should still be
> > masked just to be safe?
>
> Did you mean "PCI_ERR_COR_REP_TIMER is already masked *by*
> 015c9cbcf0ad"?

No. The PCI_ERR_COR_REP_TIMER is masked with or without 015c9cbcf0ad.
That means before 015c9cbcf0ad, Reply Timeout error was reported with
PCI_ERR_COR_REP_TIMER masked.

So using PCI_GLI_9750_CORRERR_MASK is necessary for the endpoint.

>
> If masking PCI_ERR_COR_REP_TIMER using the generic PCI_ERR_COR_MASK in
> the GL975x would have the same effect as masking it with
> PCI_GLI_9750_CORRERR_MASK, then I think you should *only* use the
> generic PCI_ERR_COR_MASK.
>
> No need to do both if the generic one is sufficient.  And I think both
> should be done in the same place since they're basically solving the
> same problem, just at both ends of the link.

Do you mean only mask PCI_GLI_9750_CORRERR_MASK during suspend?
That will not be ideal because accessing its config space (e.g. `lspci
-vv`) will have many errors logged.

Kai-Heng

>
> Bjorn
Bjorn Helgaas Jan. 19, 2024, 10:40 p.m. UTC | #8
On Thu, Jan 18, 2024 at 02:40:50PM +0800, Kai-Heng Feng wrote:
> On Sat, Jan 13, 2024 at 1:37 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > On Fri, Jan 12, 2024 at 01:14:42PM +0800, Kai-Heng Feng wrote:
> > > On Sat, Jan 6, 2024 at 5:19 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > > > On Thu, Dec 21, 2023 at 11:21:47AM +0800, Kai-Heng Feng wrote:
> > > > > Spamming `lspci -vv` can still observe the replay timer timeout error
> > > > > even after commit 015c9cbcf0ad ("mmc: sdhci-pci-gli: GL9750: Mask the
> > > > > replay timer timeout of AER"), albeit with a lower reproduce rate.
> > > >
> > > > I'm not sure what this is telling me.  By "spamming `lspci -vv`, do
> > > > you mean that if you run lspci continually, you still see Replay Timer
> > > > Timeout logged, e.g.,
> > > >
> > > >   CESta:        ... Timeout+
> > >
> > > Yes it's logged and the AER IRQ is raised.
> >
> > IIUC the AER IRQ is the important thing.
> >
> > Neither 015c9cbcf0ad nor this patch affects logging in
> > PCI_ERR_COR_STATUS, so the lspci output won't change and mentioning it
> > here doesn't add useful information.
> 
> You are right. That's just a way to access config space to reproduce
> the issue.

Oh, I think I completely misunderstood you!  I thought you were saying
that suspending the device caused the PCI_ERR_COR_REP_TIMER error, and
you happened to see that it was logged when you ran lspci.

But I guess you mean that running lspci actually *causes* the error?
I.e., lspci does a config access while we're suspending the device
causes the error, and the config access itself causes the error, which
causes the ERR_COR message and ultimately the AER interrupt, and that
interrupt prevents the system suspend.

If that's the case, I wonder if this is a generic problem that could
happen with *any* device, not just GL975x.

What power state do we put the GL975x in during system suspend?
D3hot?  D3cold?  Is there anything that prevents config access while
we suspend it?

We do have dev->block_cfg_access, and there's a comment that says
"we're required to prevent config accesses during D-state
transitions," but I don't see it being used during D-state
transitions.

Also, it doesn't seem suitable for preventing config accesses during
suspend because pci_wait_cfg() just busy-waits and never returns an
error.

Bjorn
Kai-Heng Feng March 21, 2024, 10:05 a.m. UTC | #9
Hi Bjorn,

Sorry for the belated response.

On Sat, Jan 20, 2024 at 6:41 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
>
> On Thu, Jan 18, 2024 at 02:40:50PM +0800, Kai-Heng Feng wrote:
> > On Sat, Jan 13, 2024 at 1:37 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > > On Fri, Jan 12, 2024 at 01:14:42PM +0800, Kai-Heng Feng wrote:
> > > > On Sat, Jan 6, 2024 at 5:19 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > > > > On Thu, Dec 21, 2023 at 11:21:47AM +0800, Kai-Heng Feng wrote:
> > > > > > Spamming `lspci -vv` can still observe the replay timer timeout error
> > > > > > even after commit 015c9cbcf0ad ("mmc: sdhci-pci-gli: GL9750: Mask the
> > > > > > replay timer timeout of AER"), albeit with a lower reproduce rate.
> > > > >
> > > > > I'm not sure what this is telling me.  By "spamming `lspci -vv`, do
> > > > > you mean that if you run lspci continually, you still see Replay Timer
> > > > > Timeout logged, e.g.,
> > > > >
> > > > >   CESta:        ... Timeout+
> > > >
> > > > Yes it's logged and the AER IRQ is raised.
> > >
> > > IIUC the AER IRQ is the important thing.
> > >
> > > Neither 015c9cbcf0ad nor this patch affects logging in
> > > PCI_ERR_COR_STATUS, so the lspci output won't change and mentioning it
> > > here doesn't add useful information.
> >
> > You are right. That's just a way to access config space to reproduce
> > the issue.
>
> Oh, I think I completely misunderstood you!  I thought you were saying
> that suspending the device caused the PCI_ERR_COR_REP_TIMER error, and
> you happened to see that it was logged when you ran lspci.

Both running lspci and suspending the device can observe the error,
because both are accessing the config space.

>
> But I guess you mean that running lspci actually *causes* the error?
> I.e., lspci does a config access while we're suspending the device
> causes the error, and the config access itself causes the error, which
> causes the ERR_COR message and ultimately the AER interrupt, and that
> interrupt prevents the system suspend.

My point was that any kind of PCI config access can cause the error.
Using lspci is just make the error more easier to reproduce.

>
> If that's the case, I wonder if this is a generic problem that could
> happen with *any* device, not just GL975x.

For now, it's just GL975x.

>
> What power state do we put the GL975x in during system suspend?
> D3hot?  D3cold?  Is there anything that prevents config access while
> we suspend it?

The target device state is D3hot.
However, the issue happens when the devices is in D0, when the PCI
core is saving the device's config space.

So I think the issue isn't related to the device state.

>
> We do have dev->block_cfg_access, and there's a comment that says
> "we're required to prevent config accesses during D-state
> transitions," but I don't see it being used during D-state
> transitions.

Yes, there isn't any D-state change happens here.

Kai-Heng

>
> Also, it doesn't seem suitable for preventing config accesses during
> suspend because pci_wait_cfg() just busy-waits and never returns an
> error.
>
> Bjorn
Bjorn Helgaas March 22, 2024, 4:43 p.m. UTC | #10
On Thu, Mar 21, 2024 at 06:05:33PM +0800, Kai-Heng Feng wrote:
> On Sat, Jan 20, 2024 at 6:41 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > On Thu, Jan 18, 2024 at 02:40:50PM +0800, Kai-Heng Feng wrote:
> > > On Sat, Jan 13, 2024 at 1:37 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > > > On Fri, Jan 12, 2024 at 01:14:42PM +0800, Kai-Heng Feng wrote:
> > > > > On Sat, Jan 6, 2024 at 5:19 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > > > > > On Thu, Dec 21, 2023 at 11:21:47AM +0800, Kai-Heng Feng wrote:
> > > > > > > Spamming `lspci -vv` can still observe the replay timer timeout error
> > > > > > > even after commit 015c9cbcf0ad ("mmc: sdhci-pci-gli: GL9750: Mask the
> > > > > > > replay timer timeout of AER"), albeit with a lower reproduce rate.
> > > > > >
> > > > > > I'm not sure what this is telling me.  By "spamming `lspci -vv`, do
> > > > > > you mean that if you run lspci continually, you still see Replay Timer
> > > > > > Timeout logged, e.g.,
> > > > > >
> > > > > >   CESta:        ... Timeout+
> > > > >
> > > > > Yes it's logged and the AER IRQ is raised.
> > > >
> > > > IIUC the AER IRQ is the important thing.
> > > >
> > > > Neither 015c9cbcf0ad nor this patch affects logging in
> > > > PCI_ERR_COR_STATUS, so the lspci output won't change and mentioning it
> > > > here doesn't add useful information.
> > >
> > > You are right. That's just a way to access config space to reproduce
> > > the issue.
> >
> > Oh, I think I completely misunderstood you!  I thought you were saying
> > that suspending the device caused the PCI_ERR_COR_REP_TIMER error, and
> > you happened to see that it was logged when you ran lspci.
> 
> Both running lspci and suspending the device can observe the error,
> because both are accessing the config space.
> 
> > But I guess you mean that running lspci actually *causes* the error?
> > I.e., lspci does a config access while we're suspending the device
> > causes the error, and the config access itself causes the error, which
> > causes the ERR_COR message and ultimately the AER interrupt, and that
> > interrupt prevents the system suspend.
> 
> My point was that any kind of PCI config access can cause the error.
> Using lspci is just make the error more easier to reproduce.
> 
> > If that's the case, I wonder if this is a generic problem that could
> > happen with *any* device, not just GL975x.
> 
> For now, it's just GL975x.
> 
> > What power state do we put the GL975x in during system suspend?
> > D3hot?  D3cold?  Is there anything that prevents config access while
> > we suspend it?
> 
> The target device state is D3hot.
> However, the issue happens when the devices is in D0, when the PCI
> core is saving the device's config space.
> 
> So I think the issue isn't related to the device state.
> 
> > We do have dev->block_cfg_access, and there's a comment that says
> > "we're required to prevent config accesses during D-state
> > transitions," but I don't see it being used during D-state
> > transitions.
> 
> Yes, there isn't any D-state change happens here.

So the timeout happens sometimes on any config accesses to the device,
no matter what the power state is?  If that's the case, why do the
masking in the suspend/resume callbacks?

If it's not related to a power state change, it sounds like something
that should be a quirk or done at probe time.

Bjorn
Kai-Heng Feng March 25, 2024, 2:02 a.m. UTC | #11
On Sat, Mar 23, 2024 at 12:43 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
>
> On Thu, Mar 21, 2024 at 06:05:33PM +0800, Kai-Heng Feng wrote:
> > On Sat, Jan 20, 2024 at 6:41 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > > On Thu, Jan 18, 2024 at 02:40:50PM +0800, Kai-Heng Feng wrote:
> > > > On Sat, Jan 13, 2024 at 1:37 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > > > > On Fri, Jan 12, 2024 at 01:14:42PM +0800, Kai-Heng Feng wrote:
> > > > > > On Sat, Jan 6, 2024 at 5:19 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > > > > > > On Thu, Dec 21, 2023 at 11:21:47AM +0800, Kai-Heng Feng wrote:
> > > > > > > > Spamming `lspci -vv` can still observe the replay timer timeout error
> > > > > > > > even after commit 015c9cbcf0ad ("mmc: sdhci-pci-gli: GL9750: Mask the
> > > > > > > > replay timer timeout of AER"), albeit with a lower reproduce rate.
> > > > > > >
> > > > > > > I'm not sure what this is telling me.  By "spamming `lspci -vv`, do
> > > > > > > you mean that if you run lspci continually, you still see Replay Timer
> > > > > > > Timeout logged, e.g.,
> > > > > > >
> > > > > > >   CESta:        ... Timeout+
> > > > > >
> > > > > > Yes it's logged and the AER IRQ is raised.
> > > > >
> > > > > IIUC the AER IRQ is the important thing.
> > > > >
> > > > > Neither 015c9cbcf0ad nor this patch affects logging in
> > > > > PCI_ERR_COR_STATUS, so the lspci output won't change and mentioning it
> > > > > here doesn't add useful information.
> > > >
> > > > You are right. That's just a way to access config space to reproduce
> > > > the issue.
> > >
> > > Oh, I think I completely misunderstood you!  I thought you were saying
> > > that suspending the device caused the PCI_ERR_COR_REP_TIMER error, and
> > > you happened to see that it was logged when you ran lspci.
> >
> > Both running lspci and suspending the device can observe the error,
> > because both are accessing the config space.
> >
> > > But I guess you mean that running lspci actually *causes* the error?
> > > I.e., lspci does a config access while we're suspending the device
> > > causes the error, and the config access itself causes the error, which
> > > causes the ERR_COR message and ultimately the AER interrupt, and that
> > > interrupt prevents the system suspend.
> >
> > My point was that any kind of PCI config access can cause the error.
> > Using lspci is just make the error more easier to reproduce.
> >
> > > If that's the case, I wonder if this is a generic problem that could
> > > happen with *any* device, not just GL975x.
> >
> > For now, it's just GL975x.
> >
> > > What power state do we put the GL975x in during system suspend?
> > > D3hot?  D3cold?  Is there anything that prevents config access while
> > > we suspend it?
> >
> > The target device state is D3hot.
> > However, the issue happens when the devices is in D0, when the PCI
> > core is saving the device's config space.
> >
> > So I think the issue isn't related to the device state.
> >
> > > We do have dev->block_cfg_access, and there's a comment that says
> > > "we're required to prevent config accesses during D-state
> > > transitions," but I don't see it being used during D-state
> > > transitions.
> >
> > Yes, there isn't any D-state change happens here.
>
> So the timeout happens sometimes on any config accesses to the device,
> no matter what the power state is?

Yes.

> If that's the case, why do the
> masking in the suspend/resume callbacks?

Because there's no functional impact when the error happens, other
than suspend/resume.

>
> If it's not related to a power state change, it sounds like something
> that should be a quirk or done at probe time.

Sure, I'll change that to be done at probe time.

Kai-Heng

>
> Bjorn
Bjorn Helgaas March 25, 2024, 7:02 p.m. UTC | #12
On Mon, Mar 25, 2024 at 10:02:27AM +0800, Kai-Heng Feng wrote:
> On Sat, Mar 23, 2024 at 12:43 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > On Thu, Mar 21, 2024 at 06:05:33PM +0800, Kai-Heng Feng wrote:
> > > On Sat, Jan 20, 2024 at 6:41 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > > > On Thu, Jan 18, 2024 at 02:40:50PM +0800, Kai-Heng Feng wrote:
> > > > > On Sat, Jan 13, 2024 at 1:37 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > > > > > On Fri, Jan 12, 2024 at 01:14:42PM +0800, Kai-Heng Feng wrote:
> > > > > > > On Sat, Jan 6, 2024 at 5:19 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > > > > > > > On Thu, Dec 21, 2023 at 11:21:47AM +0800, Kai-Heng Feng wrote:
> > > > > > > > > Spamming `lspci -vv` can still observe the replay timer timeout error
> > > > > > > > > even after commit 015c9cbcf0ad ("mmc: sdhci-pci-gli: GL9750: Mask the
> > > > > > > > > replay timer timeout of AER"), albeit with a lower reproduce rate.
> > > > > > > >
> > > > > > > > I'm not sure what this is telling me.  By "spamming `lspci -vv`, do
> > > > > > > > you mean that if you run lspci continually, you still see Replay Timer
> > > > > > > > Timeout logged, e.g.,
> > > > > > > >
> > > > > > > >   CESta:        ... Timeout+
> > > > > > >
> > > > > > > Yes it's logged and the AER IRQ is raised.
> > > > > >
> > > > > > IIUC the AER IRQ is the important thing.
> > > > > >
> > > > > > Neither 015c9cbcf0ad nor this patch affects logging in
> > > > > > PCI_ERR_COR_STATUS, so the lspci output won't change and mentioning it
> > > > > > here doesn't add useful information.
> > > > >
> > > > > You are right. That's just a way to access config space to reproduce
> > > > > the issue.
> > > >
> > > > Oh, I think I completely misunderstood you!  I thought you were saying
> > > > that suspending the device caused the PCI_ERR_COR_REP_TIMER error, and
> > > > you happened to see that it was logged when you ran lspci.
> > >
> > > Both running lspci and suspending the device can observe the error,
> > > because both are accessing the config space.
> > >
> > > > But I guess you mean that running lspci actually *causes* the error?
> > > > I.e., lspci does a config access while we're suspending the device
> > > > causes the error, and the config access itself causes the error, which
> > > > causes the ERR_COR message and ultimately the AER interrupt, and that
> > > > interrupt prevents the system suspend.
> > >
> > > My point was that any kind of PCI config access can cause the error.
> > > Using lspci is just make the error more easier to reproduce.
> > >
> > > > If that's the case, I wonder if this is a generic problem that could
> > > > happen with *any* device, not just GL975x.
> > >
> > > For now, it's just GL975x.
> > >
> > > > What power state do we put the GL975x in during system suspend?
> > > > D3hot?  D3cold?  Is there anything that prevents config access while
> > > > we suspend it?
> > >
> > > The target device state is D3hot.
> > > However, the issue happens when the devices is in D0, when the PCI
> > > core is saving the device's config space.
> > >
> > > So I think the issue isn't related to the device state.
> > >
> > > > We do have dev->block_cfg_access, and there's a comment that says
> > > > "we're required to prevent config accesses during D-state
> > > > transitions," but I don't see it being used during D-state
> > > > transitions.
> > >
> > > Yes, there isn't any D-state change happens here.
> >
> > So the timeout happens sometimes on any config accesses to the device,
> > no matter what the power state is?
> 
> Yes.
> 
> > If that's the case, why do the
> > masking in the suspend/resume callbacks?
> 
> Because there's no functional impact when the error happens, other
> than suspend/resume.

Oh, I think I see.  Is this accurate?

  Due to a hardware defect in GL975x, config accesses when ASPM is
  enabled frequently cause Replay Timer Timeouts in the Port leading
  to the device.

  These are Correctable Errors, so the Downstream Port logs it in its
  PCI_ERR_COR_STATUS and, when the error is not masked, sends an
  ERR_COR message upstream.  The message terminates at a Root Port,
  which may generate an AER interrupt so the OS can log it.

  The Correctable Error logging is an annoyance but normally not a
  major issue.  But when the AER interrupt happens during suspend, it
  can prevent the system from suspending.

> > If it's not related to a power state change, it sounds like something
> > that should be a quirk or done at probe time.
> 
> Sure, I'll change that to be done at probe time.

In general, we want to log Correctable Errors because they give an
indication of link integrity.  But I'm guessing that since this is
related to a GL975x defect, the errors occur pretty frequently and on
all systems, so they aren't an indication of poor link quality and
they only break suspend, annoy users, and cause problem reports.

Masking them at suspend time would avoid the suspend/resume problem,
but we would still have the annoying logging and get the problem
reports.

If this is all true, I think masking via a quirk is probably the right
thing.  That way we won't get reports that "lspci causes errors" even
when the sdhci driver is not loaded.

I think we should log a hint in dmesg that we're masking
PCI_ERR_COR_REP_TIMER because the error will still be logged in the
PCI_ERR_COR_STATUS register, and that will be visible via lspci, and a
dmesg hint will save debugging time when people report that.

Bjorn
Kai-Heng Feng March 26, 2024, 1:52 a.m. UTC | #13
On Tue, Mar 26, 2024 at 3:02 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
>
> On Mon, Mar 25, 2024 at 10:02:27AM +0800, Kai-Heng Feng wrote:
> > On Sat, Mar 23, 2024 at 12:43 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > > On Thu, Mar 21, 2024 at 06:05:33PM +0800, Kai-Heng Feng wrote:
> > > > On Sat, Jan 20, 2024 at 6:41 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > > > > On Thu, Jan 18, 2024 at 02:40:50PM +0800, Kai-Heng Feng wrote:
> > > > > > On Sat, Jan 13, 2024 at 1:37 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > > > > > > On Fri, Jan 12, 2024 at 01:14:42PM +0800, Kai-Heng Feng wrote:
> > > > > > > > On Sat, Jan 6, 2024 at 5:19 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > > > > > > > > On Thu, Dec 21, 2023 at 11:21:47AM +0800, Kai-Heng Feng wrote:
> > > > > > > > > > Spamming `lspci -vv` can still observe the replay timer timeout error
> > > > > > > > > > even after commit 015c9cbcf0ad ("mmc: sdhci-pci-gli: GL9750: Mask the
> > > > > > > > > > replay timer timeout of AER"), albeit with a lower reproduce rate.
> > > > > > > > >
> > > > > > > > > I'm not sure what this is telling me.  By "spamming `lspci -vv`, do
> > > > > > > > > you mean that if you run lspci continually, you still see Replay Timer
> > > > > > > > > Timeout logged, e.g.,
> > > > > > > > >
> > > > > > > > >   CESta:        ... Timeout+
> > > > > > > >
> > > > > > > > Yes it's logged and the AER IRQ is raised.
> > > > > > >
> > > > > > > IIUC the AER IRQ is the important thing.
> > > > > > >
> > > > > > > Neither 015c9cbcf0ad nor this patch affects logging in
> > > > > > > PCI_ERR_COR_STATUS, so the lspci output won't change and mentioning it
> > > > > > > here doesn't add useful information.
> > > > > >
> > > > > > You are right. That's just a way to access config space to reproduce
> > > > > > the issue.
> > > > >
> > > > > Oh, I think I completely misunderstood you!  I thought you were saying
> > > > > that suspending the device caused the PCI_ERR_COR_REP_TIMER error, and
> > > > > you happened to see that it was logged when you ran lspci.
> > > >
> > > > Both running lspci and suspending the device can observe the error,
> > > > because both are accessing the config space.
> > > >
> > > > > But I guess you mean that running lspci actually *causes* the error?
> > > > > I.e., lspci does a config access while we're suspending the device
> > > > > causes the error, and the config access itself causes the error, which
> > > > > causes the ERR_COR message and ultimately the AER interrupt, and that
> > > > > interrupt prevents the system suspend.
> > > >
> > > > My point was that any kind of PCI config access can cause the error.
> > > > Using lspci is just make the error more easier to reproduce.
> > > >
> > > > > If that's the case, I wonder if this is a generic problem that could
> > > > > happen with *any* device, not just GL975x.
> > > >
> > > > For now, it's just GL975x.
> > > >
> > > > > What power state do we put the GL975x in during system suspend?
> > > > > D3hot?  D3cold?  Is there anything that prevents config access while
> > > > > we suspend it?
> > > >
> > > > The target device state is D3hot.
> > > > However, the issue happens when the devices is in D0, when the PCI
> > > > core is saving the device's config space.
> > > >
> > > > So I think the issue isn't related to the device state.
> > > >
> > > > > We do have dev->block_cfg_access, and there's a comment that says
> > > > > "we're required to prevent config accesses during D-state
> > > > > transitions," but I don't see it being used during D-state
> > > > > transitions.
> > > >
> > > > Yes, there isn't any D-state change happens here.
> > >
> > > So the timeout happens sometimes on any config accesses to the device,
> > > no matter what the power state is?
> >
> > Yes.
> >
> > > If that's the case, why do the
> > > masking in the suspend/resume callbacks?
> >
> > Because there's no functional impact when the error happens, other
> > than suspend/resume.
>
> Oh, I think I see.  Is this accurate?
>
>   Due to a hardware defect in GL975x, config accesses when ASPM is
>   enabled frequently cause Replay Timer Timeouts in the Port leading
>   to the device.
>
>   These are Correctable Errors, so the Downstream Port logs it in its
>   PCI_ERR_COR_STATUS and, when the error is not masked, sends an
>   ERR_COR message upstream.  The message terminates at a Root Port,
>   which may generate an AER interrupt so the OS can log it.
>
>   The Correctable Error logging is an annoyance but normally not a
>   major issue.  But when the AER interrupt happens during suspend, it
>   can prevent the system from suspending.

That's totally the case here.

This brings up another different but related topic  - should the port
driver disable AER/DPC IRQ during suspend?
We've discussed this many times, I still think that's the right
approach to "quiesce" many unexpected errors during system state
transition.


>
> > > If it's not related to a power state change, it sounds like something
> > > that should be a quirk or done at probe time.
> >
> > Sure, I'll change that to be done at probe time.
>
> In general, we want to log Correctable Errors because they give an
> indication of link integrity.  But I'm guessing that since this is
> related to a GL975x defect, the errors occur pretty frequently and on
> all systems, so they aren't an indication of poor link quality and
> they only break suspend, annoy users, and cause problem reports.
>
> Masking them at suspend time would avoid the suspend/resume problem,
> but we would still have the annoying logging and get the problem
> reports.
>
> If this is all true, I think masking via a quirk is probably the right
> thing.  That way we won't get reports that "lspci causes errors" even
> when the sdhci driver is not loaded.
>
> I think we should log a hint in dmesg that we're masking
> PCI_ERR_COR_REP_TIMER because the error will still be logged in the
> PCI_ERR_COR_STATUS register, and that will be visible via lspci, and a
> dmesg hint will save debugging time when people report that.

Sure. Where do you think it's a better place to implement the quirk? I
Assume PCI quirk is a better place than driver's probe routine?

Kai-Heng

>
> Bjorn
diff mbox series

Patch

diff --git a/drivers/mmc/host/sdhci-pci-core.c b/drivers/mmc/host/sdhci-pci-core.c
index 025b31aa712c..59ae4da72974 100644
--- a/drivers/mmc/host/sdhci-pci-core.c
+++ b/drivers/mmc/host/sdhci-pci-core.c
@@ -68,7 +68,7 @@  static int sdhci_pci_init_wakeup(struct sdhci_pci_chip *chip)
 	return 0;
 }
 
-static int sdhci_pci_suspend_host(struct sdhci_pci_chip *chip)
+int sdhci_pci_suspend_host(struct sdhci_pci_chip *chip)
 {
 	int i, ret;
 
diff --git a/drivers/mmc/host/sdhci-pci-gli.c b/drivers/mmc/host/sdhci-pci-gli.c
index 77911a57b12c..54943e9df835 100644
--- a/drivers/mmc/host/sdhci-pci-gli.c
+++ b/drivers/mmc/host/sdhci-pci-gli.c
@@ -1429,6 +1429,55 @@  static int sdhci_pci_gli_resume(struct sdhci_pci_chip *chip)
 	return sdhci_pci_resume_host(chip);
 }
 
+#ifdef CONFIG_PCIEAER
+static void mask_replay_timer_timeout(struct pci_dev *pdev)
+{
+	struct pci_dev *parent = pci_upstream_bridge(pdev);
+	u32 val;
+
+	if (!parent || !parent->aer_cap)
+		return;
+
+	pci_read_config_dword(parent, parent->aer_cap + PCI_ERR_COR_MASK, &val);
+	val |= PCI_ERR_COR_REP_TIMER;
+	pci_write_config_dword(parent, parent->aer_cap + PCI_ERR_COR_MASK, val);
+}
+
+static void unmask_replay_timer_timeout(struct pci_dev *pdev)
+{
+	struct pci_dev *parent = pci_upstream_bridge(pdev);
+	u32 val;
+
+	if (!parent || !parent->aer_cap)
+		return;
+
+	pci_read_config_dword(pdev, parent->aer_cap + PCI_ERR_COR_MASK, &val);
+	val &= ~PCI_ERR_COR_REP_TIMER;
+	pci_write_config_dword(pdev, parent->aer_cap + PCI_ERR_COR_MASK, val);
+}
+#else
+static inline void mask_replay_timer_timeout(struct pci_dev *pdev) { }
+static inline void unmask_replay_timer_timeout(struct pci_dev *pdev) {  }
+#endif
+
+static int sdhci_pci_gl975x_suspend(struct sdhci_pci_chip *chip)
+{
+	mask_replay_timer_timeout(chip->pdev);
+
+	return sdhci_pci_suspend_host(chip);
+}
+
+static int sdhci_pci_gl975x_resume(struct sdhci_pci_chip *chip)
+{
+	int ret;
+
+	ret = sdhci_pci_gli_resume(chip);
+
+	unmask_replay_timer_timeout(chip->pdev);
+
+	return ret;
+}
+
 static int gl9763e_resume(struct sdhci_pci_chip *chip)
 {
 	struct sdhci_pci_slot *slot = chip->slots[0];
@@ -1547,7 +1596,8 @@  const struct sdhci_pci_fixes sdhci_gl9755 = {
 	.probe_slot	= gli_probe_slot_gl9755,
 	.ops            = &sdhci_gl9755_ops,
 #ifdef CONFIG_PM_SLEEP
-	.resume         = sdhci_pci_gli_resume,
+	.suspend	= sdhci_pci_gl975x_suspend,
+	.resume         = sdhci_pci_gl975x_resume,
 #endif
 };
 
@@ -1570,7 +1620,8 @@  const struct sdhci_pci_fixes sdhci_gl9750 = {
 	.probe_slot	= gli_probe_slot_gl9750,
 	.ops            = &sdhci_gl9750_ops,
 #ifdef CONFIG_PM_SLEEP
-	.resume         = sdhci_pci_gli_resume,
+	.suspend	= sdhci_pci_gl975x_suspend,
+	.resume         = sdhci_pci_gl975x_resume,
 #endif
 };
 
diff --git a/drivers/mmc/host/sdhci-pci.h b/drivers/mmc/host/sdhci-pci.h
index 153704f812ed..19253dce687d 100644
--- a/drivers/mmc/host/sdhci-pci.h
+++ b/drivers/mmc/host/sdhci-pci.h
@@ -190,6 +190,7 @@  static inline void *sdhci_pci_priv(struct sdhci_pci_slot *slot)
 }
 
 #ifdef CONFIG_PM_SLEEP
+int sdhci_pci_suspend_host(struct sdhci_pci_chip *chip);
 int sdhci_pci_resume_host(struct sdhci_pci_chip *chip);
 #endif
 int sdhci_pci_enable_dma(struct sdhci_host *host);