diff mbox series

[v17,2/4] PCI: Add support for drivers to register optin or veto of D3

Message ID 20230906184354.45846-3-mario.limonciello@amd.com
State New
Headers show
Series Allow drivers to influence D3 behavior for bridges | expand

Commit Message

Mario Limonciello Sept. 6, 2023, 6:43 p.m. UTC
commit 9d26d3a8f1b0 ("PCI: Put PCIe ports into D3 during suspend")
changed pci_bridge_d3_possible() so that any vendor's PCIe ports
from modern machines (>=2015) are allowed to be put into D3.

This policy change has worked for most machines, but the behavior
is improved with `pcie_port_pm=off` on some others.

Add support for drivers to register both 'optin' and 'veto' callbacks
and a priority which can be used for deciding whether a device should
use D3. When drivers register the callbacks, the list is sorted by
priority.

Suggested-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
 drivers/pci/pci.c   | 143 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/pci.h |   9 +++
 2 files changed, 152 insertions(+)

Comments

Rafael J. Wysocki Sept. 11, 2023, 6:34 p.m. UTC | #1
On Wed, Sep 6, 2023 at 9:16 PM Mario Limonciello
<mario.limonciello@amd.com> wrote:
>
> commit 9d26d3a8f1b0 ("PCI: Put PCIe ports into D3 during suspend")
> changed pci_bridge_d3_possible() so that any vendor's PCIe ports
> from modern machines (>=2015) are allowed to be put into D3.
>
> This policy change has worked for most machines, but the behavior
> is improved with `pcie_port_pm=off` on some others.
>
> Add support for drivers to register both 'optin' and 'veto' callbacks
> and a priority which can be used for deciding whether a device should
> use D3. When drivers register the callbacks, the list is sorted by
> priority.
>
> Suggested-by: Hans de Goede <hdegoede@redhat.com>
> Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
> ---
>  drivers/pci/pci.c   | 143 ++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/pci.h |   9 +++
>  2 files changed, 152 insertions(+)
>
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index 59c01d68c6d5..06ab73f58adf 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -21,6 +21,7 @@
>  #include <linux/module.h>
>  #include <linux/spinlock.h>
>  #include <linux/string.h>
> +#include <linux/list_sort.h>
>  #include <linux/log2.h>
>  #include <linux/logic_pio.h>
>  #include <linux/pm_wakeup.h>
> @@ -54,6 +55,8 @@ unsigned int pci_pm_d3hot_delay;
>  static void pci_pme_list_scan(struct work_struct *work);
>
>  static LIST_HEAD(pci_pme_list);
> +static LIST_HEAD(d3_possible_list);
> +static DEFINE_MUTEX(d3_possible_list_mutex);
>  static DEFINE_MUTEX(pci_pme_list_mutex);
>  static DECLARE_DELAYED_WORK(pci_pme_work, pci_pme_list_scan);
>
> @@ -161,6 +164,14 @@ static bool pcie_ats_disabled;
>  /* If set, the PCI config space of each device is printed during boot. */
>  bool pci_early_dump;
>
> +/* Preferences set by a driver for D3 optin/veto */
> +enum driver_d3_pref {
> +       D3_PREF_UNSET,  /* Not configured by driver */
> +       D3_PREF_NONE,   /* Driver does not care */
> +       D3_PREF_OPTIN,  /* Driver prefers D3 */
> +       D3_PREF_VETO,   /* Driver vetoes D3 */
> +};
> +
>  bool pci_ats_disabled(void)
>  {
>         return pcie_ats_disabled;
> @@ -3031,6 +3042,10 @@ bool pci_bridge_d3_possible(struct pci_dev *bridge)
>                 if (pci_bridge_d3_force)
>                         return true;
>
> +               /* check for any driver optin of D3 for the bridge */
> +               if (bridge->driver_d3 == D3_PREF_OPTIN)
> +                       return true;
> +
>                 /* Even the oldest 2010 Thunderbolt controller supports D3. */
>                 if (bridge->is_thunderbolt)
>                         return true;
> @@ -3050,6 +3065,10 @@ bool pci_bridge_d3_possible(struct pci_dev *bridge)
>                 if (dmi_check_system(bridge_d3_blacklist))
>                         return false;
>
> +               /* check for any driver veto for D3 for the bridge */
> +               if (bridge->driver_d3 == D3_PREF_VETO)
> +                       return false;
> +
>                 /*
>                  * It should be safe to put PCIe ports from 2015 or newer
>                  * to D3.
> @@ -3168,6 +3187,130 @@ void pci_d3cold_disable(struct pci_dev *dev)
>  }
>  EXPORT_SYMBOL_GPL(pci_d3cold_disable);
>
> +static struct pci_dev *pci_get_upstream_port(struct pci_dev *pci_dev)
> +{
> +       struct pci_dev *bridge;
> +
> +       bridge = pci_upstream_bridge(pci_dev);
> +       if (!bridge)
> +               return NULL;
> +
> +       if (!pci_is_pcie(bridge))
> +               return NULL;
> +
> +       switch (pci_pcie_type(bridge)) {
> +       case PCI_EXP_TYPE_ROOT_PORT:
> +       case PCI_EXP_TYPE_UPSTREAM:
> +       case PCI_EXP_TYPE_DOWNSTREAM:
> +               return bridge;
> +       default:
> +               break;
> +       };
> +
> +       return NULL;
> +}
> +
> +static void pci_refresh_d3_possible(void)
> +{
> +       struct pci_d3_driver_ops *driver;
> +       struct pci_dev *pci_dev = NULL;
> +       struct pci_dev *bridge;
> +
> +       /* 1st pass: unset any preferences set a previous invocation */
> +       while ((pci_dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pci_dev))) {
> +               bridge = pci_get_upstream_port(pci_dev);
> +               if (!bridge)
> +                       continue;
> +
> +               if (bridge->driver_d3 != D3_PREF_UNSET)
> +                       bridge->driver_d3 = D3_PREF_UNSET;
> +       }
> +
> +       pci_dev = NULL;
> +
> +       /* 2nd pass: update the preference and refresh bridge_d3 */
> +       while ((pci_dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pci_dev))) {
> +               bridge = pci_get_upstream_port(pci_dev);
> +               if (!bridge)
> +                       continue;
> +
> +               /* don't make multiple passes on the same device */
> +               if (bridge->driver_d3 != D3_PREF_UNSET)
> +                       continue;
> +
> +               /* the list is pre-sorted by highest priority */
> +               mutex_lock(&d3_possible_list_mutex);
> +               list_for_each_entry(driver, &d3_possible_list, list_node) {
> +                       /* another higher priority driver already set preference */
> +                       if (bridge->driver_d3 != D3_PREF_UNSET)
> +                               break;
> +                       /* check for opt in to D3 */
> +                       if (driver->optin && driver->optin(bridge))
> +                               bridge->driver_d3 = D3_PREF_OPTIN;
> +                       /* check for opt out of D3 */
> +                       else if (driver->veto && driver->veto(bridge))
> +                               bridge->driver_d3 = D3_PREF_VETO;
> +               }
> +               mutex_unlock(&d3_possible_list_mutex);
> +
> +               /* no driver set a preference */
> +               if (bridge->driver_d3 == D3_PREF_UNSET)
> +                       bridge->driver_d3 = D3_PREF_NONE;
> +
> +               /* update bridge_d3 */
> +               pci_bridge_d3_update(pci_dev);
> +       }
> +}
> +
> +static int pci_d3_driver_cmp(void *priv, const struct list_head *_a,
> +                          const struct list_head *_b)
> +{
> +       struct pci_d3_driver_ops *a = container_of(_a, typeof(*a), list_node);
> +       struct pci_d3_driver_ops *b = container_of(_b, typeof(*b), list_node);
> +
> +       if (a->priority < b->priority)
> +               return -1;
> +       else if (a->priority > b->priority)
> +               return 1;
> +       return 0;
> +}
> +
> +/**
> + * pci_register_d3_possible_cb - Register a driver callback for checking d3 veto
> + * @arg: driver provided structure with function pointer and priority
> + *
> + * This function can be used by drivers to register a callback that can be
> + * used to veto a device going into D3.
> + * Returns 0 on success, error code on failure.
> + */
> +int pci_register_d3_possible_cb(struct pci_d3_driver_ops *arg)
> +{
> +       mutex_lock(&d3_possible_list_mutex);
> +       list_add(&arg->list_node, &d3_possible_list);
> +       list_sort(NULL, &d3_possible_list, pci_d3_driver_cmp);
> +       mutex_unlock(&d3_possible_list_mutex);
> +       pci_refresh_d3_possible();
> +       return 0;
> +}
> +EXPORT_SYMBOL_GPL(pci_register_d3_possible_cb);
> +
> +/**
> + * pci_unregister_d3_possible_cb - Unregister a driver callback for checking d3 veto
> + * @arg: driver provided structure with function pointer and priority
> + *
> + * This function can be used by drivers to unregister a callback that can be
> + * used to veto a device going into D3.
> + */
> +void pci_unregister_d3_possible_cb(struct pci_d3_driver_ops *arg)
> +{
> +       mutex_lock(&d3_possible_list_mutex);
> +       list_del(&arg->list_node);
> +       list_sort(NULL, &d3_possible_list, pci_d3_driver_cmp);
> +       mutex_unlock(&d3_possible_list_mutex);
> +       pci_refresh_d3_possible();
> +}
> +EXPORT_SYMBOL_GPL(pci_unregister_d3_possible_cb);
> +
>  /**
>   * pci_pm_init - Initialize PM functions of given PCI device
>   * @dev: PCI device to handle.
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index 8c7c2c3c6c65..863399e78869 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -389,6 +389,7 @@ struct pci_dev {
>                                                    bit manually */
>         unsigned int    d3hot_delay;    /* D3hot->D0 transition time in ms */
>         unsigned int    d3cold_delay;   /* D3cold->D0 transition time in ms */
> +       unsigned int    driver_d3;      /* Driver D3 request preference */
>
>  #ifdef CONFIG_PCIEASPM
>         struct pcie_link_state  *link_state;    /* ASPM link state */
> @@ -1404,6 +1405,14 @@ void pci_d3cold_disable(struct pci_dev *dev);
>  bool pcie_relaxed_ordering_enabled(struct pci_dev *dev);
>  void pci_resume_bus(struct pci_bus *bus);
>  void pci_bus_set_current_state(struct pci_bus *bus, pci_power_t state);
> +struct pci_d3_driver_ops {
> +       struct list_head list_node;
> +       int priority;
> +       bool (*optin)(struct pci_dev *pdev);
> +       bool (*veto)(struct pci_dev *pdev);
> +};
> +int pci_register_d3_possible_cb(struct pci_d3_driver_ops *arg);
> +void pci_unregister_d3_possible_cb(struct pci_d3_driver_ops *arg);
>
>  /* For use by arch with custom probe code */
>  void set_pcie_port_type(struct pci_dev *pdev);
> --

I don't like this too much TBH.  It appears overly complicated to me
and it totally misses the wakeup vs non-wakeup point I've been talking
about recently.

IMV, the underlying issue all boils down to the platform firmware
inadequately describing the behavior of the system to the OS.
Specifically, had it provided a _S0W returning 0 for the Root Port(s)
in question, wakeup signaling would have worked (or else there would
have been a defect in the kernel code to be addressed).  Instead, it
decided to kind-of guide Windows in the "right" direction through PEP
constraints which doesn't have the same effect on Linux and honestly
I'm not even sure if it is a good idea to adjust Linux to that.

It looks to me like the issue could be addressed by a PCI device quirk
playing the role of a missing _S0W (but slightly more precise, because
it may distinguish suspend-to-idle from PM-runtime, which _S0W cannot
do).
Mario Limonciello Sept. 11, 2023, 8:23 p.m. UTC | #2
On 9/11/2023 13:34, Rafael J. Wysocki wrote:
> On Wed, Sep 6, 2023 at 9:16 PM Mario Limonciello
> <mario.limonciello@amd.com> wrote:
>>
>> commit 9d26d3a8f1b0 ("PCI: Put PCIe ports into D3 during suspend")
>> changed pci_bridge_d3_possible() so that any vendor's PCIe ports
>> from modern machines (>=2015) are allowed to be put into D3.
>>
>> This policy change has worked for most machines, but the behavior
>> is improved with `pcie_port_pm=off` on some others.
>>
>> Add support for drivers to register both 'optin' and 'veto' callbacks
>> and a priority which can be used for deciding whether a device should
>> use D3. When drivers register the callbacks, the list is sorted by
>> priority.
>>
>> Suggested-by: Hans de Goede <hdegoede@redhat.com>
>> Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
>> ---
>>   drivers/pci/pci.c   | 143 ++++++++++++++++++++++++++++++++++++++++++++
>>   include/linux/pci.h |   9 +++
>>   2 files changed, 152 insertions(+)
>>
>> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
>> index 59c01d68c6d5..06ab73f58adf 100644
>> --- a/drivers/pci/pci.c
>> +++ b/drivers/pci/pci.c
>> @@ -21,6 +21,7 @@
>>   #include <linux/module.h>
>>   #include <linux/spinlock.h>
>>   #include <linux/string.h>
>> +#include <linux/list_sort.h>
>>   #include <linux/log2.h>
>>   #include <linux/logic_pio.h>
>>   #include <linux/pm_wakeup.h>
>> @@ -54,6 +55,8 @@ unsigned int pci_pm_d3hot_delay;
>>   static void pci_pme_list_scan(struct work_struct *work);
>>
>>   static LIST_HEAD(pci_pme_list);
>> +static LIST_HEAD(d3_possible_list);
>> +static DEFINE_MUTEX(d3_possible_list_mutex);
>>   static DEFINE_MUTEX(pci_pme_list_mutex);
>>   static DECLARE_DELAYED_WORK(pci_pme_work, pci_pme_list_scan);
>>
>> @@ -161,6 +164,14 @@ static bool pcie_ats_disabled;
>>   /* If set, the PCI config space of each device is printed during boot. */
>>   bool pci_early_dump;
>>
>> +/* Preferences set by a driver for D3 optin/veto */
>> +enum driver_d3_pref {
>> +       D3_PREF_UNSET,  /* Not configured by driver */
>> +       D3_PREF_NONE,   /* Driver does not care */
>> +       D3_PREF_OPTIN,  /* Driver prefers D3 */
>> +       D3_PREF_VETO,   /* Driver vetoes D3 */
>> +};
>> +
>>   bool pci_ats_disabled(void)
>>   {
>>          return pcie_ats_disabled;
>> @@ -3031,6 +3042,10 @@ bool pci_bridge_d3_possible(struct pci_dev *bridge)
>>                  if (pci_bridge_d3_force)
>>                          return true;
>>
>> +               /* check for any driver optin of D3 for the bridge */
>> +               if (bridge->driver_d3 == D3_PREF_OPTIN)
>> +                       return true;
>> +
>>                  /* Even the oldest 2010 Thunderbolt controller supports D3. */
>>                  if (bridge->is_thunderbolt)
>>                          return true;
>> @@ -3050,6 +3065,10 @@ bool pci_bridge_d3_possible(struct pci_dev *bridge)
>>                  if (dmi_check_system(bridge_d3_blacklist))
>>                          return false;
>>
>> +               /* check for any driver veto for D3 for the bridge */
>> +               if (bridge->driver_d3 == D3_PREF_VETO)
>> +                       return false;
>> +
>>                  /*
>>                   * It should be safe to put PCIe ports from 2015 or newer
>>                   * to D3.
>> @@ -3168,6 +3187,130 @@ void pci_d3cold_disable(struct pci_dev *dev)
>>   }
>>   EXPORT_SYMBOL_GPL(pci_d3cold_disable);
>>
>> +static struct pci_dev *pci_get_upstream_port(struct pci_dev *pci_dev)
>> +{
>> +       struct pci_dev *bridge;
>> +
>> +       bridge = pci_upstream_bridge(pci_dev);
>> +       if (!bridge)
>> +               return NULL;
>> +
>> +       if (!pci_is_pcie(bridge))
>> +               return NULL;
>> +
>> +       switch (pci_pcie_type(bridge)) {
>> +       case PCI_EXP_TYPE_ROOT_PORT:
>> +       case PCI_EXP_TYPE_UPSTREAM:
>> +       case PCI_EXP_TYPE_DOWNSTREAM:
>> +               return bridge;
>> +       default:
>> +               break;
>> +       };
>> +
>> +       return NULL;
>> +}
>> +
>> +static void pci_refresh_d3_possible(void)
>> +{
>> +       struct pci_d3_driver_ops *driver;
>> +       struct pci_dev *pci_dev = NULL;
>> +       struct pci_dev *bridge;
>> +
>> +       /* 1st pass: unset any preferences set a previous invocation */
>> +       while ((pci_dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pci_dev))) {
>> +               bridge = pci_get_upstream_port(pci_dev);
>> +               if (!bridge)
>> +                       continue;
>> +
>> +               if (bridge->driver_d3 != D3_PREF_UNSET)
>> +                       bridge->driver_d3 = D3_PREF_UNSET;
>> +       }
>> +
>> +       pci_dev = NULL;
>> +
>> +       /* 2nd pass: update the preference and refresh bridge_d3 */
>> +       while ((pci_dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pci_dev))) {
>> +               bridge = pci_get_upstream_port(pci_dev);
>> +               if (!bridge)
>> +                       continue;
>> +
>> +               /* don't make multiple passes on the same device */
>> +               if (bridge->driver_d3 != D3_PREF_UNSET)
>> +                       continue;
>> +
>> +               /* the list is pre-sorted by highest priority */
>> +               mutex_lock(&d3_possible_list_mutex);
>> +               list_for_each_entry(driver, &d3_possible_list, list_node) {
>> +                       /* another higher priority driver already set preference */
>> +                       if (bridge->driver_d3 != D3_PREF_UNSET)
>> +                               break;
>> +                       /* check for opt in to D3 */
>> +                       if (driver->optin && driver->optin(bridge))
>> +                               bridge->driver_d3 = D3_PREF_OPTIN;
>> +                       /* check for opt out of D3 */
>> +                       else if (driver->veto && driver->veto(bridge))
>> +                               bridge->driver_d3 = D3_PREF_VETO;
>> +               }
>> +               mutex_unlock(&d3_possible_list_mutex);
>> +
>> +               /* no driver set a preference */
>> +               if (bridge->driver_d3 == D3_PREF_UNSET)
>> +                       bridge->driver_d3 = D3_PREF_NONE;
>> +
>> +               /* update bridge_d3 */
>> +               pci_bridge_d3_update(pci_dev);
>> +       }
>> +}
>> +
>> +static int pci_d3_driver_cmp(void *priv, const struct list_head *_a,
>> +                          const struct list_head *_b)
>> +{
>> +       struct pci_d3_driver_ops *a = container_of(_a, typeof(*a), list_node);
>> +       struct pci_d3_driver_ops *b = container_of(_b, typeof(*b), list_node);
>> +
>> +       if (a->priority < b->priority)
>> +               return -1;
>> +       else if (a->priority > b->priority)
>> +               return 1;
>> +       return 0;
>> +}
>> +
>> +/**
>> + * pci_register_d3_possible_cb - Register a driver callback for checking d3 veto
>> + * @arg: driver provided structure with function pointer and priority
>> + *
>> + * This function can be used by drivers to register a callback that can be
>> + * used to veto a device going into D3.
>> + * Returns 0 on success, error code on failure.
>> + */
>> +int pci_register_d3_possible_cb(struct pci_d3_driver_ops *arg)
>> +{
>> +       mutex_lock(&d3_possible_list_mutex);
>> +       list_add(&arg->list_node, &d3_possible_list);
>> +       list_sort(NULL, &d3_possible_list, pci_d3_driver_cmp);
>> +       mutex_unlock(&d3_possible_list_mutex);
>> +       pci_refresh_d3_possible();
>> +       return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(pci_register_d3_possible_cb);
>> +
>> +/**
>> + * pci_unregister_d3_possible_cb - Unregister a driver callback for checking d3 veto
>> + * @arg: driver provided structure with function pointer and priority
>> + *
>> + * This function can be used by drivers to unregister a callback that can be
>> + * used to veto a device going into D3.
>> + */
>> +void pci_unregister_d3_possible_cb(struct pci_d3_driver_ops *arg)
>> +{
>> +       mutex_lock(&d3_possible_list_mutex);
>> +       list_del(&arg->list_node);
>> +       list_sort(NULL, &d3_possible_list, pci_d3_driver_cmp);
>> +       mutex_unlock(&d3_possible_list_mutex);
>> +       pci_refresh_d3_possible();
>> +}
>> +EXPORT_SYMBOL_GPL(pci_unregister_d3_possible_cb);
>> +
>>   /**
>>    * pci_pm_init - Initialize PM functions of given PCI device
>>    * @dev: PCI device to handle.
>> diff --git a/include/linux/pci.h b/include/linux/pci.h
>> index 8c7c2c3c6c65..863399e78869 100644
>> --- a/include/linux/pci.h
>> +++ b/include/linux/pci.h
>> @@ -389,6 +389,7 @@ struct pci_dev {
>>                                                     bit manually */
>>          unsigned int    d3hot_delay;    /* D3hot->D0 transition time in ms */
>>          unsigned int    d3cold_delay;   /* D3cold->D0 transition time in ms */
>> +       unsigned int    driver_d3;      /* Driver D3 request preference */
>>
>>   #ifdef CONFIG_PCIEASPM
>>          struct pcie_link_state  *link_state;    /* ASPM link state */
>> @@ -1404,6 +1405,14 @@ void pci_d3cold_disable(struct pci_dev *dev);
>>   bool pcie_relaxed_ordering_enabled(struct pci_dev *dev);
>>   void pci_resume_bus(struct pci_bus *bus);
>>   void pci_bus_set_current_state(struct pci_bus *bus, pci_power_t state);
>> +struct pci_d3_driver_ops {
>> +       struct list_head list_node;
>> +       int priority;
>> +       bool (*optin)(struct pci_dev *pdev);
>> +       bool (*veto)(struct pci_dev *pdev);
>> +};
>> +int pci_register_d3_possible_cb(struct pci_d3_driver_ops *arg);
>> +void pci_unregister_d3_possible_cb(struct pci_d3_driver_ops *arg);
>>
>>   /* For use by arch with custom probe code */
>>   void set_pcie_port_type(struct pci_dev *pdev);
>> --
> 
> I don't like this too much TBH.  It appears overly complicated to me
> and it totally misses the wakeup vs non-wakeup point I've been talking
> about recently.

Yeah it does miss the wakeup point at the moment.

> 
> IMV, the underlying issue all boils down to the platform firmware
> inadequately describing the behavior of the system to the OS.
> Specifically, had it provided a _S0W returning 0 for the Root Port(s)
> in question, wakeup signaling would have worked (or else there would
> have been a defect in the kernel code to be addressed).  

I think you're right.  I'll try and get BIOS guys to provide a test BIOS 
to prove this direction is correct.

It wouldn't help all the machines already in the field but if it can be 
done without harm to Windows maybe future SoCs could use it.

> Instead, it
> decided to kind-of guide Windows in the "right" direction through PEP
> constraints which doesn't have the same effect on Linux and honestly
> I'm not even sure if it is a good idea to adjust Linux to that.
> 

What is the worry with bringing Linux in this direction (using constraints)?

My main hope is that by generalizing this fundamental difference in how 
Windows and Linux handle Modern Standby / suspend-to-idle we can avoid 
other future bugs.

> It looks to me like the issue could be addressed by a PCI device quirk
> playing the role of a missing _S0W (but slightly more precise, because
> it may distinguish suspend-to-idle from PM-runtime, which _S0W cannot
> do).

Thanks for this idea.  I'll experiment with this.
Rafael J. Wysocki Sept. 12, 2023, 9:11 a.m. UTC | #3
On Mon, Sep 11, 2023 at 10:23 PM Mario Limonciello
<mario.limonciello@amd.com> wrote:
>
> On 9/11/2023 13:34, Rafael J. Wysocki wrote:
> > On Wed, Sep 6, 2023 at 9:16 PM Mario Limonciello
> > <mario.limonciello@amd.com> wrote:
> >>

[cut]

> >
> > IMV, the underlying issue all boils down to the platform firmware
> > inadequately describing the behavior of the system to the OS.
> > Specifically, had it provided a _S0W returning 0 for the Root Port(s)
> > in question, wakeup signaling would have worked (or else there would
> > have been a defect in the kernel code to be addressed).
>
> I think you're right.  I'll try and get BIOS guys to provide a test BIOS
> to prove this direction is correct.
>
> It wouldn't help all the machines already in the field but if it can be
> done without harm to Windows maybe future SoCs could use it.
>
> > Instead, it
> > decided to kind-of guide Windows in the "right" direction through PEP
> > constraints which doesn't have the same effect on Linux and honestly
> > I'm not even sure if it is a good idea to adjust Linux to that.
> >
>
> What is the worry with bringing Linux in this direction (using constraints)?

First off, ostensibly the purpose of the constraints is to indicate to
Windows when it can attempt to put the system into the deepest power
state.  Specifically, AFAICS, Windows is not expected to do so when
the current power state of a given device is shallower than the
relevant constraint.  Consequently, a constraint of D0 means that
effectively Windows is expected to ignore the given device as far as
Modern Standby goes.

In any case, this has no bearing on the behavior of suspend-to-idle in Linux.

Now, there may be other undocumented side-effects of setting a
constraint of D0 in Windows, but it is generally risky to rely on such
things.

Second, it is not entirely clear to me whether or not the future
versions of Windows will continue to use the constraints in the same
way.

> My main hope is that by generalizing this fundamental difference in how
> Windows and Linux handle Modern Standby / suspend-to-idle we can avoid
> other future bugs.

There is a fundamental difference between Modern Standby and
suspend-to-idle already, as the former is opportunistic and the latter
is on-demand.  They can both follow the exact same set of rules.
diff mbox series

Patch

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 59c01d68c6d5..06ab73f58adf 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -21,6 +21,7 @@ 
 #include <linux/module.h>
 #include <linux/spinlock.h>
 #include <linux/string.h>
+#include <linux/list_sort.h>
 #include <linux/log2.h>
 #include <linux/logic_pio.h>
 #include <linux/pm_wakeup.h>
@@ -54,6 +55,8 @@  unsigned int pci_pm_d3hot_delay;
 static void pci_pme_list_scan(struct work_struct *work);
 
 static LIST_HEAD(pci_pme_list);
+static LIST_HEAD(d3_possible_list);
+static DEFINE_MUTEX(d3_possible_list_mutex);
 static DEFINE_MUTEX(pci_pme_list_mutex);
 static DECLARE_DELAYED_WORK(pci_pme_work, pci_pme_list_scan);
 
@@ -161,6 +164,14 @@  static bool pcie_ats_disabled;
 /* If set, the PCI config space of each device is printed during boot. */
 bool pci_early_dump;
 
+/* Preferences set by a driver for D3 optin/veto */
+enum driver_d3_pref {
+	D3_PREF_UNSET,	/* Not configured by driver */
+	D3_PREF_NONE,	/* Driver does not care */
+	D3_PREF_OPTIN,	/* Driver prefers D3 */
+	D3_PREF_VETO,	/* Driver vetoes D3 */
+};
+
 bool pci_ats_disabled(void)
 {
 	return pcie_ats_disabled;
@@ -3031,6 +3042,10 @@  bool pci_bridge_d3_possible(struct pci_dev *bridge)
 		if (pci_bridge_d3_force)
 			return true;
 
+		/* check for any driver optin of D3 for the bridge */
+		if (bridge->driver_d3 == D3_PREF_OPTIN)
+			return true;
+
 		/* Even the oldest 2010 Thunderbolt controller supports D3. */
 		if (bridge->is_thunderbolt)
 			return true;
@@ -3050,6 +3065,10 @@  bool pci_bridge_d3_possible(struct pci_dev *bridge)
 		if (dmi_check_system(bridge_d3_blacklist))
 			return false;
 
+		/* check for any driver veto for D3 for the bridge */
+		if (bridge->driver_d3 == D3_PREF_VETO)
+			return false;
+
 		/*
 		 * It should be safe to put PCIe ports from 2015 or newer
 		 * to D3.
@@ -3168,6 +3187,130 @@  void pci_d3cold_disable(struct pci_dev *dev)
 }
 EXPORT_SYMBOL_GPL(pci_d3cold_disable);
 
+static struct pci_dev *pci_get_upstream_port(struct pci_dev *pci_dev)
+{
+	struct pci_dev *bridge;
+
+	bridge = pci_upstream_bridge(pci_dev);
+	if (!bridge)
+		return NULL;
+
+	if (!pci_is_pcie(bridge))
+		return NULL;
+
+	switch (pci_pcie_type(bridge)) {
+	case PCI_EXP_TYPE_ROOT_PORT:
+	case PCI_EXP_TYPE_UPSTREAM:
+	case PCI_EXP_TYPE_DOWNSTREAM:
+		return bridge;
+	default:
+		break;
+	};
+
+	return NULL;
+}
+
+static void pci_refresh_d3_possible(void)
+{
+	struct pci_d3_driver_ops *driver;
+	struct pci_dev *pci_dev = NULL;
+	struct pci_dev *bridge;
+
+	/* 1st pass: unset any preferences set a previous invocation */
+	while ((pci_dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pci_dev))) {
+		bridge = pci_get_upstream_port(pci_dev);
+		if (!bridge)
+			continue;
+
+		if (bridge->driver_d3 != D3_PREF_UNSET)
+			bridge->driver_d3 = D3_PREF_UNSET;
+	}
+
+	pci_dev = NULL;
+
+	/* 2nd pass: update the preference and refresh bridge_d3 */
+	while ((pci_dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pci_dev))) {
+		bridge = pci_get_upstream_port(pci_dev);
+		if (!bridge)
+			continue;
+
+		/* don't make multiple passes on the same device */
+		if (bridge->driver_d3 != D3_PREF_UNSET)
+			continue;
+
+		/* the list is pre-sorted by highest priority */
+		mutex_lock(&d3_possible_list_mutex);
+		list_for_each_entry(driver, &d3_possible_list, list_node) {
+			/* another higher priority driver already set preference */
+			if (bridge->driver_d3 != D3_PREF_UNSET)
+				break;
+			/* check for opt in to D3 */
+			if (driver->optin && driver->optin(bridge))
+				bridge->driver_d3 = D3_PREF_OPTIN;
+			/* check for opt out of D3 */
+			else if (driver->veto && driver->veto(bridge))
+				bridge->driver_d3 = D3_PREF_VETO;
+		}
+		mutex_unlock(&d3_possible_list_mutex);
+
+		/* no driver set a preference */
+		if (bridge->driver_d3 == D3_PREF_UNSET)
+			bridge->driver_d3 = D3_PREF_NONE;
+
+		/* update bridge_d3 */
+		pci_bridge_d3_update(pci_dev);
+	}
+}
+
+static int pci_d3_driver_cmp(void *priv, const struct list_head *_a,
+			   const struct list_head *_b)
+{
+	struct pci_d3_driver_ops *a = container_of(_a, typeof(*a), list_node);
+	struct pci_d3_driver_ops *b = container_of(_b, typeof(*b), list_node);
+
+	if (a->priority < b->priority)
+		return -1;
+	else if (a->priority > b->priority)
+		return 1;
+	return 0;
+}
+
+/**
+ * pci_register_d3_possible_cb - Register a driver callback for checking d3 veto
+ * @arg: driver provided structure with function pointer and priority
+ *
+ * This function can be used by drivers to register a callback that can be
+ * used to veto a device going into D3.
+ * Returns 0 on success, error code on failure.
+ */
+int pci_register_d3_possible_cb(struct pci_d3_driver_ops *arg)
+{
+	mutex_lock(&d3_possible_list_mutex);
+	list_add(&arg->list_node, &d3_possible_list);
+	list_sort(NULL, &d3_possible_list, pci_d3_driver_cmp);
+	mutex_unlock(&d3_possible_list_mutex);
+	pci_refresh_d3_possible();
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pci_register_d3_possible_cb);
+
+/**
+ * pci_unregister_d3_possible_cb - Unregister a driver callback for checking d3 veto
+ * @arg: driver provided structure with function pointer and priority
+ *
+ * This function can be used by drivers to unregister a callback that can be
+ * used to veto a device going into D3.
+ */
+void pci_unregister_d3_possible_cb(struct pci_d3_driver_ops *arg)
+{
+	mutex_lock(&d3_possible_list_mutex);
+	list_del(&arg->list_node);
+	list_sort(NULL, &d3_possible_list, pci_d3_driver_cmp);
+	mutex_unlock(&d3_possible_list_mutex);
+	pci_refresh_d3_possible();
+}
+EXPORT_SYMBOL_GPL(pci_unregister_d3_possible_cb);
+
 /**
  * pci_pm_init - Initialize PM functions of given PCI device
  * @dev: PCI device to handle.
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8c7c2c3c6c65..863399e78869 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -389,6 +389,7 @@  struct pci_dev {
 						   bit manually */
 	unsigned int	d3hot_delay;	/* D3hot->D0 transition time in ms */
 	unsigned int	d3cold_delay;	/* D3cold->D0 transition time in ms */
+	unsigned int	driver_d3;	/* Driver D3 request preference */
 
 #ifdef CONFIG_PCIEASPM
 	struct pcie_link_state	*link_state;	/* ASPM link state */
@@ -1404,6 +1405,14 @@  void pci_d3cold_disable(struct pci_dev *dev);
 bool pcie_relaxed_ordering_enabled(struct pci_dev *dev);
 void pci_resume_bus(struct pci_bus *bus);
 void pci_bus_set_current_state(struct pci_bus *bus, pci_power_t state);
+struct pci_d3_driver_ops {
+	struct list_head list_node;
+	int priority;
+	bool (*optin)(struct pci_dev *pdev);
+	bool (*veto)(struct pci_dev *pdev);
+};
+int pci_register_d3_possible_cb(struct pci_d3_driver_ops *arg);
+void pci_unregister_d3_possible_cb(struct pci_d3_driver_ops *arg);
 
 /* For use by arch with custom probe code */
 void set_pcie_port_type(struct pci_dev *pdev);