
[v2,8/9] PCI: Exclude PCIe ports used for tunneling in pcie_bandwidth_available()

Message ID 20231103190758.82911-9-mario.limonciello@amd.com
State New
Series Improvements to pcie_bandwidth_available() for eGPUs

Commit Message

Mario Limonciello Nov. 3, 2023, 7:07 p.m. UTC
The USB4 spec requires that PCIe ports used for tunneling PCIe traffic
over the USB4 fabric be hardcoded to advertise 2.5 GT/s and behave as
PCIe Gen1 devices. The actual performance of these ports is controlled
by the fabric implementation.

Downstream drivers such as amdgpu, which use pcie_bandwidth_available()
to program the device, will always find the PCIe ports used for
tunneling to be a limiting factor, potentially leading to incorrect
performance decisions.

To prevent problems in downstream drivers, check explicitly for ports
being used for PCIe tunneling and skip them when looking for bandwidth
limitations in the hierarchy. If the only device connected is a root
port used for tunneling, then report that device.

Downstream drivers could make this change on their own, but then they
wouldn't be able to detect other potential speed bottlenecks in the
hierarchy without duplicating the pcie_bandwidth_available() logic.

Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2925#note_2145860
Link: https://www.usb.org/document-library/usb4r-specification-v20
      USB4 V2 with Errata and ECN through June 2023
      Section 11.2.1
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
 drivers/pci/pci.c | 74 +++++++++++++++++++++++++++++++----------------
 1 file changed, 49 insertions(+), 25 deletions(-)

Comments

Lazar, Lijo Nov. 4, 2023, 6:57 a.m. UTC | #1
On 11/4/2023 12:37 AM, Mario Limonciello wrote:
> The USB4 spec requires that PCIe ports used for tunneling PCIe traffic
> over the USB4 fabric be hardcoded to advertise 2.5 GT/s and behave as
> PCIe Gen1 devices. The actual performance of these ports is controlled
> by the fabric implementation.

The code below ties the generic term 'tunneling' to the USB4 spec. I
think it should be something like if (is_USB4 && is_tunneled), exclude
from bandwidth calculations - it should specifically identify USB4-based
tunneling rather than applying to all 'tunneled' cases.
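
For illustration, a minimal sketch of that check (is_usb4 is a
hypothetical flag that USB4/Thunderbolt enumeration would have to set;
the series as posted only adds is_tunneled):

	/* Skip only links tunneled over USB4, not every tunneled link */
	if (dev->is_usb4 && dev->is_tunneled) {
		if (!tdev)
			tdev = dev;
		goto skip;
	}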

Thanks,
Lijo

> [snip]
Ilpo Järvinen Nov. 6, 2023, 12:52 p.m. UTC | #2
On Fri, 3 Nov 2023, Mario Limonciello wrote:

> [snip]

This patch should be split into two, where one just moves the code to the 
new function.

Also note that this will conflict with the FIELD_GET() changes (try to 
not reintroduce non-FIELD_GET() code when you rebase this on top of 
v6.7-rc1 :-)).
Mario Limonciello Nov. 6, 2023, 4:51 p.m. UTC | #3
On 11/6/2023 06:52, Ilpo Järvinen wrote:
> On Fri, 3 Nov 2023, Mario Limonciello wrote:
> 
>> [snip]
> 
> This patch should be split into two, where one just moves the code to the
> new function.

Good idea, thanks.

> 
> Also note that this will conflict with the FIELD_GET() changes (try to
> not reintroduce non-FIELD_GET() code when you rebase this on top of
> v6.7-rc1 :-)).
> 

Sure, will adjust for that when it's rebased to 6.7-rc1.
Lukas Wunner Nov. 6, 2023, 6:10 p.m. UTC | #4
On Fri, Nov 03, 2023 at 02:07:57PM -0500, Mario Limonciello wrote:
> The USB4 spec requires that PCIe ports used for tunneling PCIe traffic
> over the USB4 fabric be hardcoded to advertise 2.5 GT/s and behave as
> PCIe Gen1 devices. The actual performance of these ports is controlled
> by the fabric implementation.
> 
> Downstream drivers such as amdgpu, which use pcie_bandwidth_available()
> to program the device, will always find the PCIe ports used for
> tunneling to be a limiting factor, potentially leading to incorrect
> performance decisions.
> 
> To prevent problems in downstream drivers, check explicitly for ports
> being used for PCIe tunneling and skip them when looking for bandwidth
> limitations in the hierarchy. If the only device connected is a root
> port used for tunneling, then report that device.

I think a better approach would be to define three new bandwidths for
Thunderbolt in enum pci_bus_speed and add appropriate descriptions in
pci_speed_string().  Those three bandwidths would be 10 GBit/s for
Thunderbolt 1, 20 GBit/s for Thunderbolt 2, 40 GBit/s for Thunderbolt 3
and 4.
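
A rough sketch of what such enum additions could look like (names and
comments are illustrative, not an agreed-upon design):

	/* hypothetical additions to enum pci_bus_speed in include/linux/pci.h */
	PCI_SPEED_THUNDERBOLT_10GT,	/* Thunderbolt 1: 10 GBit/s */
	PCI_SPEED_THUNDERBOLT_20GT,	/* Thunderbolt 2: 20 GBit/s */
	PCI_SPEED_THUNDERBOLT_40GT,	/* Thunderbolt 3 and 4: 40 GBit/s */

with matching strings added to pci_speed_string().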

Code to determine the Thunderbolt generation from the PCI ID already exists
in tb_switch_get_generation().

This will not only address the amdgpu issue you're trying to solve,
but also emit an accurate speed from __pcie_print_link_status().

The speed you're reporting with your approach is not necessarily
accurate because the next non-tunneled device in the hierarchy might
be connected with a far higher PCIe speed than what the Thunderbolt
fabric allows.

Thanks,

Lukas
Mario Limonciello Nov. 6, 2023, 6:44 p.m. UTC | #5
On 11/6/2023 12:10, Lukas Wunner wrote:
> On Fri, Nov 03, 2023 at 02:07:57PM -0500, Mario Limonciello wrote:
>> [snip]
> 
> I think a better approach would be to define three new bandwidths for
> Thunderbolt in enum pci_bus_speed and add appropriate descriptions in
> pci_speed_string().  Those three bandwidths would be 10 GBit/s for
> Thunderbolt 1, 20 GBit/s for Thunderbolt 2, 40 GBit/s for Thunderbolt 3
> and 4.

It's an interesting idea, but there are a few shortcomings I can think of.

1) The USB4 specification doesn't actually require 40 Gb/s support; that
is only a Thunderbolt 4 requirement.

https://www.tomsguide.com/features/thunderbolt-4-vs-usb4-whats-the-difference

The TBT/USB4 link speed can be discovered, but it's a property of the
*switch*, not of the PCIe tunneling port.

Tangentially related: the link speed is currently symmetric but there
are two sysfs files. Mika left a comment in drivers/thunderbolt/switch.c
that it may be asymmetric in the future, so we may need to keep that in
mind in any design that builds on top of them.

On an AMD Phoenix system connected to a TBT3 Alpine Ridge-based eGPU
enclosure I can see:

$ cat /sys/bus/thunderbolt/devices/1-0/generation
4
$ cat /sys/bus/thunderbolt/devices/1-2/generation
3
$ cat /sys/bus/thunderbolt/devices/1-2/tx_speed
20.0 Gb/s
$ cat /sys/bus/thunderbolt/devices/1-2/rx_speed
20.0 Gb/s

2) This works until you end up with USB4 v2, which supports 80 Gbit/s.

So this might mean an extra 80 Gb/s enum and porting some variation of
usb4_switch_version() outside of the thunderbolt driver so that the PCI
core can use it too.

> 
> Code to determine the Thunderbolt generation from the PCI ID already exists
> in tb_switch_get_generation().

As 'thunderbolt' can be a module or built in, we need to bring code into 
PCI core so that it works in early boot before it loads.

On the presumption that no more "new" TBT3 devices will be released to
the market, I suppose that *a lot* of that code could come over to the
PCI core, especially if we can bring some variation of
usb4_switch_version() over too.

The other option is that we can stop allowing thunderbolt as a module 
and require it to be built-in. If we do this we can export symbols from 
it and use some of them in PCI core too.

> 
> This will not only address the amdgpu issue you're trying to solve,
> but also emit an accurate speed from __pcie_print_link_status().
> 
> The speed you're reporting with your approach is not necessarily
> accurate because the next non-tunneled device in the hierarchy might
> be connected with a far higher PCIe speed than what the Thunderbolt
> fabric allows.
> 
> Thanks,
> 
> Lukas
Lukas Wunner Nov. 6, 2023, 6:56 p.m. UTC | #6
On Mon, Nov 06, 2023 at 12:44:25PM -0600, Mario Limonciello wrote:
> Tangentially related: the link speed is currently symmetric but there
> are two sysfs files. Mika left a comment in drivers/thunderbolt/switch.c
> that it may be asymmetric in the future, so we may need to keep that in
> mind in any design that builds on top of them.

Aren't asymmetric Thunderbolt speeds just a DisplayPort thing?


> As 'thunderbolt' can be a module or built in, we need to bring code into PCI
> core so that it works in early boot before it loads.

tb_switch_get_generation() is small enough that it could be moved to the
PCI core.  I doubt that we need to make thunderbolt built-in only
or move a large amount of code to the PCI core.
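
For scale, the moved helper would essentially be a switch statement over
the controller's PCI device ID, roughly like the sketch below (the
device IDs shown are placeholders; the real table lives in
tb_switch_get_generation() in drivers/thunderbolt/switch.c):

	/* Sketch: derive the Thunderbolt generation from the host
	 * controller's PCI device ID; the IDs here are placeholders. */
	static unsigned int pci_tb_generation(struct pci_dev *pdev)
	{
		if (pdev->vendor != PCI_VENDOR_ID_INTEL)
			return 0;
		switch (pdev->device) {
		case 0x1513:	/* placeholder: a Thunderbolt 1 controller */
			return 1;
		case 0x156c:	/* placeholder: a Thunderbolt 2 controller */
			return 2;
		case 0x15d3:	/* placeholder: a Thunderbolt 3 controller */
			return 3;
		default:	/* assume USB4/Thunderbolt 4 otherwise */
			return 4;
		}
	}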

Thanks,

Lukas
Mika Westerberg Nov. 7, 2023, 5:45 a.m. UTC | #7
Hi,

On Mon, Nov 06, 2023 at 07:56:52PM +0100, Lukas Wunner wrote:
> On Mon, Nov 06, 2023 at 12:44:25PM -0600, Mario Limonciello wrote:
> > Tangentially related: the link speed is currently symmetric but there
> > are two sysfs files. Mika left a comment in drivers/thunderbolt/switch.c
> > that it may be asymmetric in the future, so we may need to keep that in
> > mind in any design that builds on top of them.
> 
> Aren't asymmetric Thunderbolt speeds just a DisplayPort thing?

No, they affect the whole fabric. We have the initial code for
asymmetric switching in v6.7-rc1.

> > As 'thunderbolt' can be a module or built in, we need to bring code into PCI
> > core so that it works in early boot before it loads.
> 
> tb_switch_get_generation() is small enough that it could be moved to the
> PCI core.  I doubt that we need to make thunderbolt built-in only
> or move a large amount of code to the PCI core.

If at all possible I would like to avoid this and avoid littering the
PCI side with non-PCI stuff. There could be other similar "mediums" in
the future where you can transfer packets of "native" protocols such as
PCIe, so instead of making it Thunderbolt/USB4-specific it should be
generic enough to support future extensions.

In the case of Thunderbolt/USB4 there is no real way to figure out how
much bandwidth each PCIe tunnel gets (it is kind of bulk traffic that
gets what is left over from isochronous protocols), so I would not even
try that and instead use the real PCIe links in
pcie_bandwidth_available() and skip all the "virtual" ones.
Mika Westerberg Nov. 7, 2023, 6:24 a.m. UTC | #8
On Tue, Nov 07, 2023 at 07:45:26AM +0200, Mika Westerberg wrote:
> [snip]
> 
> In the case of Thunderbolt/USB4 there is no real way to figure out how
> much bandwidth each PCIe tunnel gets (it is kind of bulk traffic that
> gets what is left over from isochronous protocols), so I would not even
> try that and instead use the real PCIe links in
> pcie_bandwidth_available() and skip all the "virtual" ones.

Actually, can we call the new function something like pci_link_is_virtual()
instead and make pcie_bandwidth_available() call it? That would be more
future-proof IMHO.
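
A sketch of that shape, reusing the is_tunneled flag from this series
(pci_link_is_virtual() is the name suggested above, not merged code):

	/* True for links whose advertised PCIe speed is not the real
	 * transport speed; today that is only USB4/TBT PCIe tunnels. */
	static bool pci_link_is_virtual(struct pci_dev *dev)
	{
		return dev->is_tunneled;
	}

The loop in pcie_bandwidth_available() would then test
pci_link_is_virtual(dev) instead of checking dev->is_tunneled directly.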

Patch

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index d9aa5a39f585..15e37164ce56 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -6223,6 +6223,35 @@  int pcie_set_mps(struct pci_dev *dev, int mps)
 }
 EXPORT_SYMBOL(pcie_set_mps);
 
+static u32 pcie_calc_bw_limits(struct pci_dev *dev, u32 bw,
+			       struct pci_dev **limiting_dev,
+			       enum pci_bus_speed *speed,
+			       enum pcie_link_width *width)
+{
+	enum pcie_link_width next_width;
+	enum pci_bus_speed next_speed;
+	u32 next_bw;
+	u16 lnksta;
+
+	pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &lnksta);
+	next_speed = pcie_link_speed[lnksta & PCI_EXP_LNKSTA_CLS];
+	next_width = (lnksta & PCI_EXP_LNKSTA_NLW) >> PCI_EXP_LNKSTA_NLW_SHIFT;
+	next_bw = next_width * PCIE_SPEED2MBS_ENC(next_speed);
+
+	/* Check if current device limits the total bandwidth */
+	if (!bw || next_bw <= bw) {
+		bw = next_bw;
+		if (limiting_dev)
+			*limiting_dev = dev;
+		if (speed)
+			*speed = next_speed;
+		if (width)
+			*width = next_width;
+	}
+
+	return bw;
+}
+
 /**
  * pcie_bandwidth_available - determine minimum link settings of a PCIe
  *			      device and its bandwidth limitation
@@ -6236,47 +6265,42 @@  EXPORT_SYMBOL(pcie_set_mps);
  * limiting_dev, speed, and width pointers are supplied) information about
  * that point.  The bandwidth returned is in Mb/s, i.e., megabits/second of
  * raw bandwidth.
+ *
+ * This excludes the bandwidth from any PCIe device used for transmitting
+ * tunneled PCIe traffic over a Thunderbolt or USB4 link that is part of a
+ * larger hierarchy. The bandwidth is excluded because the USB4 specification
+ * requires that the max link speed returned from the PCIe configuration
+ * registers of a tunneling link is always 2.5 GT/s x1. When only tunneled
+ * devices are present, the bandwidth returned is the bandwidth available
+ * from the first tunneled device.
  */
 u32 pcie_bandwidth_available(struct pci_dev *dev, struct pci_dev **limiting_dev,
 			     enum pci_bus_speed *speed,
 			     enum pcie_link_width *width)
 {
-	u16 lnksta;
-	enum pci_bus_speed next_speed;
-	enum pcie_link_width next_width;
-	u32 bw, next_bw;
+	struct pci_dev *tdev = NULL;
+	u32 bw = 0;
 
 	if (speed)
 		*speed = PCI_SPEED_UNKNOWN;
 	if (width)
 		*width = PCIE_LNK_WIDTH_UNKNOWN;
 
-	bw = 0;
-
 	while (dev) {
-		pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &lnksta);
-
-		next_speed = pcie_link_speed[lnksta & PCI_EXP_LNKSTA_CLS];
-		next_width = (lnksta & PCI_EXP_LNKSTA_NLW) >>
-			PCI_EXP_LNKSTA_NLW_SHIFT;
-
-		next_bw = next_width * PCIE_SPEED2MBS_ENC(next_speed);
-
-		/* Check if current device limits the total bandwidth */
-		if (!bw || next_bw <= bw) {
-			bw = next_bw;
-
-			if (limiting_dev)
-				*limiting_dev = dev;
-			if (speed)
-				*speed = next_speed;
-			if (width)
-				*width = next_width;
+		if (dev->is_tunneled) {
+			if (!tdev)
+				tdev = dev;
+			goto skip;
 		}
-
+		bw = pcie_calc_bw_limits(dev, bw, limiting_dev, speed, width);
+skip:
 		dev = pci_upstream_bridge(dev);
 	}
 
+	/* If nothing "faster" found on link, limit to first tunneled device */
+	if (tdev && !bw)
+		bw = pcie_calc_bw_limits(tdev, bw, limiting_dev, speed, width);
+
 	return bw;
 }
 EXPORT_SYMBOL(pcie_bandwidth_available);