diff mbox series

wifi: ath12k: use 128 bytes aligned iova in transmit path for WCN7850

Message ID 20240715023814.20242-1-quic_bqiang@quicinc.com
State New
Headers show
Series wifi: ath12k: use 128 bytes aligned iova in transmit path for WCN7850 | expand

Commit Message

Baochen Qiang July 15, 2024, 2:38 a.m. UTC
In transmit path, it is likely that the iova is not aligned to PCIe TLP
max payload size, which is 128 for WCN7850. Normally in such cases hardware
is expected to split the packet into several parts in a manner such that
they, other than the first one, have aligned iova. However due to hardware
limitations, WCN7850 does not behave like that properly with some specific
unaligned iova in transmit path. This easily results in target hang in a
KPI transmit test: packet send/receive failure, WMI command send timeout
etc. Also fatal error seen in PCIe level:

	...
	Capabilities: ...
		...
		DevSta: ... FatalErr+ ...
		...
	...

Work around this by manually moving/reallocating payload buffer such that
we can map it to a 128 bytes aligned iova. The moving requires sufficient
head room or tail room in skb: for the former we can do ourselves a favor
by asking some extra bytes when registering with mac80211, while for the
latter we can do nothing.

Moving/reallocating buffer consumes additional CPU cycles, but the good news
is that an aligned iova increases PCIe efficiency. In my tests on some X86
platforms the KPI results are almost consistent.

Since this is seen only with WCN7850, add a new hardware parameter to
differentiate from others.

Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3

Signed-off-by: Baochen Qiang <quic_bqiang@quicinc.com>
---
 drivers/net/wireless/ath/ath12k/dp_tx.c | 72 +++++++++++++++++++++++++
 drivers/net/wireless/ath/ath12k/hw.c    |  6 +++
 drivers/net/wireless/ath/ath12k/hw.h    |  4 ++
 drivers/net/wireless/ath/ath12k/mac.c   |  1 +
 4 files changed, 83 insertions(+)


base-commit: db1ce56e6e1d395dd42a3cd6332a871d9be59c45

Comments

Mark Pearson July 19, 2024, 1:07 p.m. UTC | #1
On Sun, Jul 14, 2024, at 10:38 PM, Baochen Qiang wrote:
> In transmit path, it is likely that the iova is not aligned to PCIe TLP
> max payload size, which is 128 for WCN7850. Normally in such cases hardware
> is expected to split the packet into several parts in a manner such that
> they, other than the first one, have aligned iova. However due to hardware
> limitations, WCN7850 does not behave like that properly with some specific
> unaligned iova in transmit path. This easily results in target hang in a
> KPI transmit test: packet send/receive failure, WMI command send timeout
> etc. Also fatal error seen in PCIe level:
>
> 	...
> 	Capabilities: ...
> 		...
> 		DevSta: ... FatalErr+ ...
> 		...
> 	...
>
> Work around this by manually moving/reallocating payload buffer such that
> we can map it to a 128 bytes aligned iova. The moving requires sufficient
> head room or tail room in skb: for the former we can do ourselves a favor
> by asking some extra bytes when registering with mac80211, while for the
> latter we can do nothing.
>
> Moving/reallocating buffer consumes additional CPU cycles, but the good news
> is that an aligned iova increases PCIe efficiency. In my tests on some X86
> platforms the KPI results are almost consistent.
>
> Since this is seen only with WCN7850, add a new hardware parameter to
> differentiate from others.
>
> Tested-on: WCN7850 hw2.0 PCI 
> WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3
>
> Signed-off-by: Baochen Qiang <quic_bqiang@quicinc.com>
> ---
>  drivers/net/wireless/ath/ath12k/dp_tx.c | 72 +++++++++++++++++++++++++
>  drivers/net/wireless/ath/ath12k/hw.c    |  6 +++
>  drivers/net/wireless/ath/ath12k/hw.h    |  4 ++
>  drivers/net/wireless/ath/ath12k/mac.c   |  1 +
>  4 files changed, 83 insertions(+)
>
>
> base-commit: db1ce56e6e1d395dd42a3cd6332a871d9be59c45
>
> diff --git a/drivers/net/wireless/ath/ath12k/dp_tx.c 
> b/drivers/net/wireless/ath/ath12k/dp_tx.c
> index d08c04343e90..00475d0848e1 100644
> --- a/drivers/net/wireless/ath/ath12k/dp_tx.c
> +++ b/drivers/net/wireless/ath/ath12k/dp_tx.c
> @@ -162,6 +162,60 @@ static int ath12k_dp_prepare_htt_metadata(struct 
> sk_buff *skb)
>  	return 0;
>  }
> 
> +static void ath12k_dp_tx_move_payload(struct sk_buff *skb,
> +				      unsigned long delta,
> +				      bool head)
> +{
> +	unsigned long len = skb->len;
> +
> +	if (head) {
> +		skb_push(skb, delta);
> +		memmove(skb->data, skb->data + delta, len);
> +		skb_trim(skb, len);
> +	} else {
> +		skb_put(skb, delta);
> +		memmove(skb->data + delta, skb->data, len);
> +		skb_pull(skb, delta);
> +	}
> +}
> +
> +static int ath12k_dp_tx_align_payload(struct ath12k_base *ab,
> +				      struct sk_buff **pskb)
> +{
> +	u32 iova_mask = ab->hw_params->iova_mask;
> +	unsigned long offset, delta1, delta2;
> +	struct sk_buff *skb2, *skb = *pskb;
> +	unsigned int headroom = skb_headroom(skb);
> +	int tailroom = skb_tailroom(skb);
> +	int ret = 0;
> +
> +	offset = (unsigned long)skb->data & iova_mask;
> +	delta1 = offset;
> +	delta2 = iova_mask - offset + 1;
> +
> +	if (headroom >= delta1) {
> +		ath12k_dp_tx_move_payload(skb, delta1, true);
> +	} else if (tailroom >= delta2) {
> +		ath12k_dp_tx_move_payload(skb, delta2, false);
> +	} else {
> +		skb2 = skb_realloc_headroom(skb, iova_mask);
> +		if (!skb2) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +
> +		dev_kfree_skb_any(skb);
> +
> +		offset = (unsigned long)skb2->data & iova_mask;
> +		if (offset)
> +			ath12k_dp_tx_move_payload(skb2, offset, true);
> +		*pskb = skb2;
> +	}
> +
> +out:
> +	return ret;
> +}
> +
>  int ath12k_dp_tx(struct ath12k *ar, struct ath12k_vif *arvif,
>  		 struct sk_buff *skb)
>  {
> @@ -184,6 +238,7 @@ int ath12k_dp_tx(struct ath12k *ar, struct 
> ath12k_vif *arvif,
>  	bool tcl_ring_retry;
>  	bool msdu_ext_desc = false;
>  	bool add_htt_metadata = false;
> +	u32 iova_mask = ab->hw_params->iova_mask;
> 
>  	if (test_bit(ATH12K_FLAG_CRASH_FLUSH, &ar->ab->dev_flags))
>  		return -ESHUTDOWN;
> @@ -279,6 +334,23 @@ int ath12k_dp_tx(struct ath12k *ar, struct 
> ath12k_vif *arvif,
>  		goto fail_remove_tx_buf;
>  	}
> 
> +	if (iova_mask &&
> +	    (unsigned long)skb->data & iova_mask) {
> +		ret = ath12k_dp_tx_align_payload(ab, &skb);
> +		if (ret) {
> +			dev_warn_once(ab->dev, "failed to align TX buffer %d\n", ret);
> +			/* don't bail out, give original buffer
> +			 * a chance even unaligned.
> +			 */
> +			goto map;
> +		}
> +
> +		/* hdr is pointing to a wrong place after alignment,
> +		 * so refresh it for later use.
> +		 */
> +		hdr = (void *)skb->data;
> +	}
> +map:
>  	ti.paddr = dma_map_single(ab->dev, skb->data, skb->len, 
> DMA_TO_DEVICE);
>  	if (dma_mapping_error(ab->dev, ti.paddr)) {
>  		atomic_inc(&ab->soc_stats.tx_err.misc_fail);
> diff --git a/drivers/net/wireless/ath/ath12k/hw.c 
> b/drivers/net/wireless/ath/ath12k/hw.c
> index 2e11ea763574..7b3e2420e3c5 100644
> --- a/drivers/net/wireless/ath/ath12k/hw.c
> +++ b/drivers/net/wireless/ath/ath12k/hw.c
> @@ -924,6 +924,8 @@ static const struct ath12k_hw_params 
> ath12k_hw_params[] = {
> 
>  		.acpi_guid = NULL,
>  		.supports_dynamic_smps_6ghz = true,
> +
> +		.iova_mask = 0,
>  	},
>  	{
>  		.name = "wcn7850 hw2.0",
> @@ -1000,6 +1002,8 @@ static const struct ath12k_hw_params 
> ath12k_hw_params[] = {
> 
>  		.acpi_guid = &wcn7850_uuid,
>  		.supports_dynamic_smps_6ghz = false,
> +
> +		.iova_mask = PCIE_MAX_PAYLOAD_SIZE - 1,
>  	},
>  	{
>  		.name = "qcn9274 hw2.0",
> @@ -1072,6 +1076,8 @@ static const struct ath12k_hw_params 
> ath12k_hw_params[] = {
> 
>  		.acpi_guid = NULL,
>  		.supports_dynamic_smps_6ghz = true,
> +
> +		.iova_mask = 0,
>  	},
>  };
> 
> diff --git a/drivers/net/wireless/ath/ath12k/hw.h 
> b/drivers/net/wireless/ath/ath12k/hw.h
> index e792eb6b249b..49668aa0efc8 100644
> --- a/drivers/net/wireless/ath/ath12k/hw.h
> +++ b/drivers/net/wireless/ath/ath12k/hw.h
> @@ -96,6 +96,8 @@
>  #define ATH12K_M3_FILE			"m3.bin"
>  #define ATH12K_REGDB_FILE_NAME		"regdb.bin"
> 
> +#define PCIE_MAX_PAYLOAD_SIZE		128
> +
>  enum ath12k_hw_rate_cck {
>  	ATH12K_HW_RATE_CCK_LP_11M = 0,
>  	ATH12K_HW_RATE_CCK_LP_5_5M,
> @@ -215,6 +217,8 @@ struct ath12k_hw_params {
> 
>  	const guid_t *acpi_guid;
>  	bool supports_dynamic_smps_6ghz;
> +
> +	u32 iova_mask;
>  };
> 
>  struct ath12k_hw_ops {
> diff --git a/drivers/net/wireless/ath/ath12k/mac.c 
> b/drivers/net/wireless/ath/ath12k/mac.c
> index 8106297f0bc1..ce41c8153080 100644
> --- a/drivers/net/wireless/ath/ath12k/mac.c
> +++ b/drivers/net/wireless/ath/ath12k/mac.c
> @@ -9193,6 +9193,7 @@ static int ath12k_mac_hw_register(struct 
> ath12k_hw *ah)
> 
>  	hw->vif_data_size = sizeof(struct ath12k_vif);
>  	hw->sta_data_size = sizeof(struct ath12k_sta);
> +	hw->extra_tx_headroom = ab->hw_params->iova_mask;
> 
>  	wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_CQM_RSSI_LIST);
>  	wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_STA_TX_PWR);

We've tested this in the Lenovo lab using the T14 G5 AMD with a 6.10.0-rc7+ kernel from wireless-next and this patch applied.
Previously we had stability issues under traffic load. With the patch applied we can no longer reproduce the issue.

Tested-by: Mark Pearson <mpearson-lenovo@squebb.ca>

Can this be tagged for stable backporting? It's an important fix.

Thanks for getting this fix done. Very much appreciated.

Mark
Jeff Johnson July 19, 2024, 2:10 p.m. UTC | #2
On 7/14/2024 7:38 PM, Baochen Qiang wrote:
> In transmit path, it is likely that the iova is not aligned to PCIe TLP
> max payload size, which is 128 for WCN7850. Normally in such cases hardware
> is expected to split the packet into several parts in a manner such that
> they, other than the first one, have aligned iova. However due to hardware
> limitations, WCN7850 does not behave like that properly with some specific
> unaligned iova in transmit path. This easily results in target hang in a
> KPI transmit test: packet send/receive failure, WMI command send timeout
> etc. Also fatal error seen in PCIe level:
> 
> 	...
> 	Capabilities: ...
> 		...
> 		DevSta: ... FatalErr+ ...
> 		...
> 	...
> 
> Work around this by manually moving/reallocating payload buffer such that
> we can map it to a 128 bytes aligned iova. The moving requires sufficient
> head room or tail room in skb: for the former we can do ourselves a favor
> by asking some extra bytes when registering with mac80211, while for the
> latter we can do nothing.
> 
> Moving/reallocating buffer consumes additional CPU cycles, but the good news
> is that an aligned iova increases PCIe efficiency. In my tests on some X86
> platforms the KPI results are almost consistent.
> 
> Since this is seen only with WCN7850, add a new hardware parameter to
> differentiate from others.

I asked for expert opinion on this patch and received the following response.
Baochen, can you take a look at this suggestion?

> Aligning headers is sometimes done, but it appears the driver
> doesn't support scatter gather? I think the author may want to advertise
> scatter and linearize manually in the driver, to a correct offset.
> Because now core is linearizing the skb in validate_xmit_skb()
> and then the driver moves it a second time..
Baochen Qiang July 22, 2024, 6:54 a.m. UTC | #3
On 7/19/2024 10:10 PM, Jeff Johnson wrote:
> On 7/14/2024 7:38 PM, Baochen Qiang wrote:
>> In transmit path, it is likely that the iova is not aligned to PCIe TLP
>> max payload size, which is 128 for WCN7850. Normally in such cases hardware
>> is expected to split the packet into several parts in a manner such that
>> they, other than the first one, have aligned iova. However due to hardware
>> limitations, WCN7850 does not behave like that properly with some specific
>> unaligned iova in transmit path. This easily results in target hang in a
>> KPI transmit test: packet send/receive failure, WMI command send timeout
>> etc. Also fatal error seen in PCIe level:
>>
>> 	...
>> 	Capabilities: ...
>> 		...
>> 		DevSta: ... FatalErr+ ...
>> 		...
>> 	...
>>
>> Work around this by manually moving/reallocating payload buffer such that
>> we can map it to a 128 bytes aligned iova. The moving requires sufficient
>> head room or tail room in skb: for the former we can do ourselves a favor
>> by asking some extra bytes when registering with mac80211, while for the
>> latter we can do nothing.
>>
>> Moving/reallocating buffer consumes additional CPU cycles, but the good news
>> is that an aligned iova increases PCIe efficiency. In my tests on some X86
>> platforms the KPI results are almost consistent.
>>
>> Since this is seen only with WCN7850, add a new hardware parameter to
>> differentiate from others.
> 
> I asked for expert opinion on this patch and received the following response.
> Baochen, can you take a look at this suggestion?
> 
>> Aligning headers is sometimes done, but it appears the driver
>> doesn't support scatter gather? I think the author may want to advertise
right, ath12k does not support SG currently.

>> scatter and linearize manually in the driver, to a correct offset.
is there an existing skb API or API combinations which can do that for me? I checked __skb_linearize() and it does not take an 'offset' argument.

>> Because now core is linearizing the skb in validate_xmit_skb()
>> and then the driver moves it a second time..
> 
>
Baochen Qiang July 22, 2024, 7:02 a.m. UTC | #4
On 7/22/2024 2:54 PM, Baochen Qiang wrote:
> 
> 
> On 7/19/2024 10:10 PM, Jeff Johnson wrote:
>> On 7/14/2024 7:38 PM, Baochen Qiang wrote:
>>> In transmit path, it is likely that the iova is not aligned to PCIe TLP
>>> max payload size, which is 128 for WCN7850. Normally in such cases hardware
>>> is expected to split the packet into several parts in a manner such that
>>> they, other than the first one, have aligned iova. However due to hardware
>>> limitations, WCN7850 does not behave like that properly with some specific
>>> unaligned iova in transmit path. This easily results in target hang in a
>>> KPI transmit test: packet send/receive failure, WMI command send timeout
>>> etc. Also fatal error seen in PCIe level:
>>>
>>> 	...
>>> 	Capabilities: ...
>>> 		...
>>> 		DevSta: ... FatalErr+ ...
>>> 		...
>>> 	...
>>>
>>> Work around this by manually moving/reallocating payload buffer such that
>>> we can map it to a 128 bytes aligned iova. The moving requires sufficient
>>> head room or tail room in skb: for the former we can do ourselves a favor
>>> by asking some extra bytes when registering with mac80211, while for the
>>> latter we can do nothing.
>>>
>>> Moving/reallocating buffer consumes additional CPU cycles, but the good news
>>> is that an aligned iova increases PCIe efficiency. In my tests on some X86
>>> platforms the KPI results are almost consistent.
>>>
>>> Since this is seen only with WCN7850, add a new hardware parameter to
>>> differentiate from others.
>>
>> I asked for expert opinion on this patch and received the following response.
>> Baochen, can you take a look at this suggestion?
>>
>>> Aligning headers is sometimes done, but it appears the driver
>>> doesn't support scatter gather? I think the author may want to advertise
> right, ath12k does not support SG currently.
> 
>>> scatter and linearize manually in the driver, to a correct offset.
> is there an existing skb API or API combinations which can do that for me? I checked __skb_linearize() and it does not take an 'offset' argument.
or do I need to implement it myself from a very low level basis? like (if required) allocating skb structure, allocating/aligning payload buffer, copying/freeing paged frag/frag list, etc..

> 
>>> Because now core is linearizing the skb in validate_xmit_skb()
>>> and then the driver moves it a second time..
>>
>>
>
Kalle Valo Aug. 1, 2024, 3:07 p.m. UTC | #5
Baochen Qiang <quic_bqiang@quicinc.com> writes:

> In transmit path, it is likely that the iova is not aligned to PCIe TLP
> max payload size, which is 128 for WCN7850. Normally in such cases hardware
> is expected to split the packet into several parts in a manner such that
> they, other than the first one, have aligned iova. However due to hardware
> limitations, WCN7850 does not behave like that properly with some specific
> unaligned iova in transmit path. This easily results in target hang in a
> KPI transmit test: packet send/receive failure, WMI command send timeout
> etc. Also fatal error seen in PCIe level:
>
> 	...
> 	Capabilities: ...
> 		...
> 		DevSta: ... FatalErr+ ...
> 		...
> 	...
>
> Work around this by manually moving/reallocating payload buffer such that
> we can map it to a 128 bytes aligned iova. The moving requires sufficient
> head room or tail room in skb: for the former we can do ourselves a favor
> by asking some extra bytes when registering with mac80211, while for the
> latter we can do nothing.
>
> Moving/reallocating buffer consumes additional CPU cycles, but the good news
> is that an aligned iova increases PCIe efficiency. In my tests on some X86
> platforms the KPI results are almost consistent.
>
> Since this is seen only with WCN7850, add a new hardware parameter to
> differentiate from others.
>
> Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3
>
> Signed-off-by: Baochen Qiang <quic_bqiang@quicinc.com>

[...]

> --- a/drivers/net/wireless/ath/ath12k/dp_tx.c
> +++ b/drivers/net/wireless/ath/ath12k/dp_tx.c
> @@ -162,6 +162,60 @@ static int ath12k_dp_prepare_htt_metadata(struct sk_buff *skb)
>  	return 0;
>  }
>  
> +static void ath12k_dp_tx_move_payload(struct sk_buff *skb,
> +				      unsigned long delta,
> +				      bool head)
> +{
> +	unsigned long len = skb->len;
> +
> +	if (head) {
> +		skb_push(skb, delta);
> +		memmove(skb->data, skb->data + delta, len);
> +		skb_trim(skb, len);
> +	} else {
> +		skb_put(skb, delta);
> +		memmove(skb->data + delta, skb->data, len);
> +		skb_pull(skb, delta);
> +	}
> +}

I'm nitpicking, but usually booleans like the head variable here don't
help with readability. Having two separate functions would be easier to
read, but this is fine as it's so small.

> @@ -279,6 +334,23 @@ int ath12k_dp_tx(struct ath12k *ar, struct ath12k_vif *arvif,
>  		goto fail_remove_tx_buf;
>  	}
>  
> +	if (iova_mask &&
> +	    (unsigned long)skb->data & iova_mask) {
> +		ret = ath12k_dp_tx_align_payload(ab, &skb);
> +		if (ret) {
> +			dev_warn_once(ab->dev, "failed to align TX buffer %d\n", ret);

Why dev_warn_once()? I changed it to ath12k_warn() in the pending
branch.

> --- a/drivers/net/wireless/ath/ath12k/hw.h
> +++ b/drivers/net/wireless/ath/ath12k/hw.h
> @@ -96,6 +96,8 @@
>  #define ATH12K_M3_FILE			"m3.bin"
>  #define ATH12K_REGDB_FILE_NAME		"regdb.bin"
>  
> +#define PCIE_MAX_PAYLOAD_SIZE		128

PCIE prefix implies that this is in PCI subsystem. I renamed it to
ATH12K_PCIE_MAX_PAYLOAD_SIZE.

Please check my changes:

https://git.kernel.org/pub/scm/linux/kernel/git/ath/ath.git/commit/?h=pending&id=b603c1e0d94fb1eb0576ef48ebe37c8c1ce86328
Kalle Valo Aug. 1, 2024, 3:20 p.m. UTC | #6
"Mark Pearson" <mpearson-lenovo@squebb.ca> writes:

> On Sun, Jul 14, 2024, at 10:38 PM, Baochen Qiang wrote:
>
>> In transmit path, it is likely that the iova is not aligned to PCIe TLP
>> max payload size, which is 128 for WCN7850. Normally in such cases hardware
>> is expected to split the packet into several parts in a manner such that
>> they, other than the first one, have aligned iova. However due to hardware
>> limitations, WCN7850 does not behave like that properly with some specific
>> unaligned iova in transmit path. This easily results in target hang in a
>> KPI transmit test: packet send/receive failure, WMI command send timeout
>> etc. Also fatal error seen in PCIe level:
>>
>> 	...
>> 	Capabilities: ...
>> 		...
>> 		DevSta: ... FatalErr+ ...
>> 		...
>> 	...
>>
>> Work around this by manually moving/reallocating payload buffer such that
>> we can map it to a 128 bytes aligned iova. The moving requires sufficient
>> head room or tail room in skb: for the former we can do ourselves a favor
>> by asking some extra bytes when registering with mac80211, while for the
>> latter we can do nothing.
>>
>> Moving/reallocating buffer consumes additional CPU cycles, but the good news
>> is that an aligned iova increases PCIe efficiency. In my tests on some X86
>> platforms the KPI results are almost consistent.
>>
>> Since this is seen only with WCN7850, add a new hardware parameter to
>> differentiate from others.
>>
>> Tested-on: WCN7850 hw2.0 PCI 
>> WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3
>>
>> Signed-off-by: Baochen Qiang <quic_bqiang@quicinc.com>
>
> We've tested this in the Lenovo lab using the T14 G5 AMD with a
> 6.10.0-rc7+ kernel from wireless-next and this patch applied.
> Previously we had stability issues under traffic load. With the patch
> applied we can no longer reproduce the issue.
>
> Tested-by: Mark Pearson <mpearson-lenovo@squebb.ca>
>
> Can this be tagged for stable backporting? It's an important fix.

I added cc stable to the commit message. I forgot to do it before I
pushed my changes out, but it's in my local branch.
Kalle Valo Aug. 1, 2024, 3:22 p.m. UTC | #7
Baochen Qiang <quic_bqiang@quicinc.com> writes:

> On 7/22/2024 2:54 PM, Baochen Qiang wrote:
>> 
>> 
>> On 7/19/2024 10:10 PM, Jeff Johnson wrote:
>>> On 7/14/2024 7:38 PM, Baochen Qiang wrote:
>>>> In transmit path, it is likely that the iova is not aligned to PCIe TLP
>>>> max payload size, which is 128 for WCN7850. Normally in such cases hardware
>>>> is expected to split the packet into several parts in a manner such that
>>>> they, other than the first one, have aligned iova. However due to hardware
>>>> limitations, WCN7850 does not behave like that properly with some specific
>>>> unaligned iova in transmit path. This easily results in target hang in a
>>>> KPI transmit test: packet send/receive failure, WMI command send timeout
>>>> etc. Also fatal error seen in PCIe level:
>>>>
>>>> 	...
>>>> 	Capabilities: ...
>>>> 		...
>>>> 		DevSta: ... FatalErr+ ...
>>>> 		...
>>>> 	...
>>>>
>>>> Work around this by manually moving/reallocating payload buffer such that
>>>> we can map it to a 128 bytes aligned iova. The moving requires sufficient
>>>> head room or tail room in skb: for the former we can do ourselves a favor
>>>> by asking some extra bytes when registering with mac80211, while for the
>>>> latter we can do nothing.
>>>>
>>>> Moving/reallocating buffer consumes additional CPU cycles, but the good news
>>>> is that an aligned iova increases PCIe efficiency. In my tests on some X86
>>>> platforms the KPI results are almost consistent.
>>>>
>>>> Since this is seen only with WCN7850, add a new hardware parameter to
>>>> differentiate from others.
>>>
>>> I asked for expert opinion on this patch and received the following response.
>>> Baochen, can you take a look at this suggestion?
>>>
>>>> Aligning headers is sometimes done, but it appears the driver
>>>> doesn't support scatter gather? I think the author may want to advertise
>> right, ath12k does not support SG currently.
>> 
>>>> scatter and linearize manually in the driver, to a correct offset.
>> is there an existing skb API or API combinations which can do that
>> for me? I checked __skb_linearize() and it does not take an 'offset'
>> argument.
>
> or do I need to implement it myself from a very low level basis? like
> (if required) allocating skb structure, allocating/aligning payload
> buffer, copying/freeing paged frag/frag list, etc..

I don't know __skb_linearize() well enough to really comment here. I'm
leaning towards we just take this patch to ath-current for v6.11 and, if
needed, optimise it later. Thoughts?
Jeff Johnson Aug. 1, 2024, 6:04 p.m. UTC | #8
On 8/1/2024 8:07 AM, Kalle Valo wrote:
> Baochen Qiang <quic_bqiang@quicinc.com> writes:
> 
>> In transmit path, it is likely that the iova is not aligned to PCIe TLP
>> max payload size, which is 128 for WCN7850. Normally in such cases hardware
>> is expected to split the packet into several parts in a manner such that
>> they, other than the first one, have aligned iova. However due to hardware
>> limitations, WCN7850 does not behave like that properly with some specific
>> unaligned iova in transmit path. This easily results in target hang in a
>> KPI transmit test: packet send/receive failure, WMI command send timeout
>> etc. Also fatal error seen in PCIe level:
>>
>> 	...
>> 	Capabilities: ...
>> 		...
>> 		DevSta: ... FatalErr+ ...
>> 		...
>> 	...
>>
>> Work around this by manually moving/reallocating payload buffer such that
>> we can map it to a 128 bytes aligned iova. The moving requires sufficient
>> head room or tail room in skb: for the former we can do ourselves a favor
>> by asking some extra bytes when registering with mac80211, while for the
>> latter we can do nothing.
>>
>> Moving/reallocating buffer consumes additional CPU cycles, but the good news
>> is that an aligned iova increases PCIe efficiency. In my tests on some X86
>> platforms the KPI results are almost consistent.
>>
>> Since this is seen only with WCN7850, add a new hardware parameter to
>> differentiate from others.
>>
>> Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3
>>
>> Signed-off-by: Baochen Qiang <quic_bqiang@quicinc.com>
> 
> [...]
> 
>> --- a/drivers/net/wireless/ath/ath12k/dp_tx.c
>> +++ b/drivers/net/wireless/ath/ath12k/dp_tx.c
>> @@ -162,6 +162,60 @@ static int ath12k_dp_prepare_htt_metadata(struct sk_buff *skb)
>>  	return 0;
>>  }
>>  
>> +static void ath12k_dp_tx_move_payload(struct sk_buff *skb,
>> +				      unsigned long delta,
>> +				      bool head)
>> +{
>> +	unsigned long len = skb->len;
>> +
>> +	if (head) {
>> +		skb_push(skb, delta);
>> +		memmove(skb->data, skb->data + delta, len);
>> +		skb_trim(skb, len);>> +	} else {
>> +		skb_put(skb, delta);
>> +		memmove(skb->data + delta, skb->data, len);
>> +		skb_pull(skb, delta);
>> +	}
>> +}
> 
> I'm nitpicking, but usually booleans like the head variable here don't
> help with readability. Having two separate functions would be easier to
> read, but this is fine as it's so small.
> 
>> @@ -279,6 +334,23 @@ int ath12k_dp_tx(struct ath12k *ar, struct ath12k_vif *arvif,
>>  		goto fail_remove_tx_buf;
>>  	}
>>  
>> +	if (iova_mask &&
>> +	    (unsigned long)skb->data & iova_mask) {
>> +		ret = ath12k_dp_tx_align_payload(ab, &skb);
>> +		if (ret) {
>> +			dev_warn_once(ab->dev, "failed to align TX buffer %d\n", ret);
> 
> Why dev_warn_once()? I changed it to ath12k_warn() in the pending
> branch.

My concern was that if this is an ongoing issue that you'd end up spamming the
kernel log. But I guess the rate limiting will reduce the spam to no more than
10 logs in a 5 second interval

> 
>> --- a/drivers/net/wireless/ath/ath12k/hw.h
>> +++ b/drivers/net/wireless/ath/ath12k/hw.h
>> @@ -96,6 +96,8 @@
>>  #define ATH12K_M3_FILE			"m3.bin"
>>  #define ATH12K_REGDB_FILE_NAME		"regdb.bin"
>>  
>> +#define PCIE_MAX_PAYLOAD_SIZE		128
> 
> PCIE prefix implies that this is in PCI subsystem. I renamed it to
> ATH12K_PCIE_MAX_PAYLOAD_SIZE.
> 
> Please check my changes:
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/ath/ath.git/commit/?h=pending&id=b603c1e0d94fb1eb0576ef48ebe37c8c1ce86328

Acked-by: Jeff Johnson <quic_jjohnson@quicinc.com>
Baochen Qiang Aug. 5, 2024, 3:20 a.m. UTC | #9
On 8/1/2024 11:07 PM, Kalle Valo wrote:
> Baochen Qiang <quic_bqiang@quicinc.com> writes:
> 
>> In transmit path, it is likely that the iova is not aligned to PCIe TLP
>> max payload size, which is 128 for WCN7850. Normally in such cases hardware
>> is expected to split the packet into several parts in a manner such that
>> they, other than the first one, have aligned iova. However due to hardware
>> limitations, WCN7850 does not behave like that properly with some specific
>> unaligned iova in transmit path. This easily results in target hang in a
>> KPI transmit test: packet send/receive failure, WMI command send timeout
>> etc. Also fatal error seen in PCIe level:
>>
>> 	...
>> 	Capabilities: ...
>> 		...
>> 		DevSta: ... FatalErr+ ...
>> 		...
>> 	...
>>
>> Work around this by manually moving/reallocating payload buffer such that
>> we can map it to a 128 bytes aligned iova. The moving requires sufficient
>> head room or tail room in skb: for the former we can do ourselves a favor
>> by asking some extra bytes when registering with mac80211, while for the
>> latter we can do nothing.
>>
>> Moving/reallocating buffer consumes additional CPU cycles, but the good news
>> is that an aligned iova increases PCIe efficiency. In my tests on some X86
>> platforms the KPI results are almost consistent.
>>
>> Since this is seen only with WCN7850, add a new hardware parameter to
>> differentiate from others.
>>
>> Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3
>>
>> Signed-off-by: Baochen Qiang <quic_bqiang@quicinc.com>
> 
> [...]
> 
>> --- a/drivers/net/wireless/ath/ath12k/dp_tx.c
>> +++ b/drivers/net/wireless/ath/ath12k/dp_tx.c
>> @@ -162,6 +162,60 @@ static int ath12k_dp_prepare_htt_metadata(struct sk_buff *skb)
>>  	return 0;
>>  }
>>  
>> +static void ath12k_dp_tx_move_payload(struct sk_buff *skb,
>> +				      unsigned long delta,
>> +				      bool head)
>> +{
>> +	unsigned long len = skb->len;
>> +
>> +	if (head) {
>> +		skb_push(skb, delta);
>> +		memmove(skb->data, skb->data + delta, len);
>> +		skb_trim(skb, len);
>> +	} else {
>> +		skb_put(skb, delta);
>> +		memmove(skb->data + delta, skb->data, len);
>> +		skb_pull(skb, delta);
>> +	}
>> +}
> 
> I'm nitpicking, but usually booleans like the head variable here don't
> help with readability. Having two separate functions would be easier to
> read, but this is fine as it's so small.
> 
>> @@ -279,6 +334,23 @@ int ath12k_dp_tx(struct ath12k *ar, struct ath12k_vif *arvif,
>>  		goto fail_remove_tx_buf;
>>  	}
>>  
>> +	if (iova_mask &&
>> +	    (unsigned long)skb->data & iova_mask) {
>> +		ret = ath12k_dp_tx_align_payload(ab, &skb);
>> +		if (ret) {
>> +			dev_warn_once(ab->dev, "failed to align TX buffer %d\n", ret);
> 
> Why dev_warn_once()? I changed it to ath12k_warn() in the pending
> branch.
> 
>> --- a/drivers/net/wireless/ath/ath12k/hw.h
>> +++ b/drivers/net/wireless/ath/ath12k/hw.h
>> @@ -96,6 +96,8 @@
>>  #define ATH12K_M3_FILE			"m3.bin"
>>  #define ATH12K_REGDB_FILE_NAME		"regdb.bin"
>>  
>> +#define PCIE_MAX_PAYLOAD_SIZE		128
> 
> PCIE prefix implies that this is in PCI subsystem. I renamed it to
> ATH12K_PCIE_MAX_PAYLOAD_SIZE.
> 
> Please check my changes:
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/ath/ath.git/commit/?h=pending&id=b603c1e0d94fb1eb0576ef48ebe37c8c1ce86328
looks good to me, thanks.

>
Kalle Valo Aug. 5, 2024, 9:28 a.m. UTC | #10
Baochen Qiang <quic_bqiang@quicinc.com> wrote:

> In transmit path, it is likely that the iova is not aligned to PCIe TLP
> max payload size, which is 128 for WCN7850. Normally in such cases hardware
> is expected to split the packet into several parts in a manner such that
> they, other than the first one, have aligned iova. However due to hardware
> limitations, WCN7850 does not behave like that properly with some specific
> unaligned iova in transmit path. This easily results in target hang in a
> KPI transmit test: packet send/receive failure, WMI command send timeout
> etc. Also fatal error seen in PCIe level:
> 
>         ...
>         Capabilities: ...
>                 ...
>                 DevSta: ... FatalErr+ ...
>                 ...
>         ...
> 
> Work around this by manually moving/reallocating payload buffer such that
> we can map it to a 128 bytes aligned iova. The moving requires sufficient
> head room or tail room in skb: for the former we can do ourselves a favor
> by asking some extra bytes when registering with mac80211, while for the
> latter we can do nothing.
> 
> Moving/reallocating buffer consumes additional CPU cycles, but the good news
> is that an aligned iova increases PCIe efficiency. In my tests on some X86
> platforms the KPI results are almost consistent.
> 
> Since this is seen only with WCN7850, add a new hardware parameter to
> differentiate from others.
> 
> Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3
> 
> Signed-off-by: Baochen Qiang <quic_bqiang@quicinc.com>
> Cc: <stable@vger.kernel.org>
> Tested-by: Mark Pearson <mpearson-lenovo@squebb.ca>
> Signed-off-by: Kalle Valo <quic_kvalo@quicinc.com>

Patch applied to ath-current branch of ath.git, thanks.

38055789d151 wifi: ath12k: use 128 bytes aligned iova in transmit path for WCN7850
Kalle Valo Aug. 5, 2024, 9:36 a.m. UTC | #11
Jeff Johnson <quic_jjohnson@quicinc.com> writes:

>>> @@ -279,6 +334,23 @@ int ath12k_dp_tx(struct ath12k *ar, struct ath12k_vif *arvif,
>>>  		goto fail_remove_tx_buf;
>>>  	}
>>>  
>>> +	if (iova_mask &&
>>> +	    (unsigned long)skb->data & iova_mask) {
>>> +		ret = ath12k_dp_tx_align_payload(ab, &skb);
>>> +		if (ret) {
>>> +			dev_warn_once(ab->dev, "failed to align TX buffer %d\n", ret);
>> 
>> Why dev_warn_once()? I changed it to ath12k_warn() in the pending
>> branch.
>
> My concern was that if this is an ongoing issue that you'd end up spamming the
> kernel log. But I guess the rate limiting will reduce the spam to no more than
> 10 logs in a 5 second interval

Yeah, ratelimiting used by ath12k_warn() should be safe. It would be
consistent to have ath12k_warn_once() instead of using dev_warn_once()
but in this it's better to print the warning more than once so that
users don't miss it.
diff mbox series

Patch

diff --git a/drivers/net/wireless/ath/ath12k/dp_tx.c b/drivers/net/wireless/ath/ath12k/dp_tx.c
index d08c04343e90..00475d0848e1 100644
--- a/drivers/net/wireless/ath/ath12k/dp_tx.c
+++ b/drivers/net/wireless/ath/ath12k/dp_tx.c
@@ -162,6 +162,60 @@  static int ath12k_dp_prepare_htt_metadata(struct sk_buff *skb)
 	return 0;
 }
 
+static void ath12k_dp_tx_move_payload(struct sk_buff *skb,
+				      unsigned long delta,
+				      bool head)
+{
+	unsigned long len = skb->len;
+
+	if (head) {
+		skb_push(skb, delta);
+		memmove(skb->data, skb->data + delta, len);
+		skb_trim(skb, len);
+	} else {
+		skb_put(skb, delta);
+		memmove(skb->data + delta, skb->data, len);
+		skb_pull(skb, delta);
+	}
+}
+
+static int ath12k_dp_tx_align_payload(struct ath12k_base *ab,
+				      struct sk_buff **pskb)
+{
+	u32 iova_mask = ab->hw_params->iova_mask;
+	unsigned long offset, delta1, delta2;
+	struct sk_buff *skb2, *skb = *pskb;
+	unsigned int headroom = skb_headroom(skb);
+	int tailroom = skb_tailroom(skb);
+	int ret = 0;
+
+	offset = (unsigned long)skb->data & iova_mask;
+	delta1 = offset;
+	delta2 = iova_mask - offset + 1;
+
+	if (headroom >= delta1) {
+		ath12k_dp_tx_move_payload(skb, delta1, true);
+	} else if (tailroom >= delta2) {
+		ath12k_dp_tx_move_payload(skb, delta2, false);
+	} else {
+		skb2 = skb_realloc_headroom(skb, iova_mask);
+		if (!skb2) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		dev_kfree_skb_any(skb);
+
+		offset = (unsigned long)skb2->data & iova_mask;
+		if (offset)
+			ath12k_dp_tx_move_payload(skb2, offset, true);
+		*pskb = skb2;
+	}
+
+out:
+	return ret;
+}
+
 int ath12k_dp_tx(struct ath12k *ar, struct ath12k_vif *arvif,
 		 struct sk_buff *skb)
 {
@@ -184,6 +238,7 @@  int ath12k_dp_tx(struct ath12k *ar, struct ath12k_vif *arvif,
 	bool tcl_ring_retry;
 	bool msdu_ext_desc = false;
 	bool add_htt_metadata = false;
+	u32 iova_mask = ab->hw_params->iova_mask;
 
 	if (test_bit(ATH12K_FLAG_CRASH_FLUSH, &ar->ab->dev_flags))
 		return -ESHUTDOWN;
@@ -279,6 +334,23 @@  int ath12k_dp_tx(struct ath12k *ar, struct ath12k_vif *arvif,
 		goto fail_remove_tx_buf;
 	}
 
+	if (iova_mask &&
+	    (unsigned long)skb->data & iova_mask) {
+		ret = ath12k_dp_tx_align_payload(ab, &skb);
+		if (ret) {
+			dev_warn_once(ab->dev, "failed to align TX buffer %d\n", ret);
+			/* don't bail out, give original buffer
+			 * a chance even unaligned.
+			 */
+			goto map;
+		}
+
+		/* hdr is pointing to a wrong place after alignment,
+		 * so refresh it for later use.
+		 */
+		hdr = (void *)skb->data;
+	}
+map:
 	ti.paddr = dma_map_single(ab->dev, skb->data, skb->len, DMA_TO_DEVICE);
 	if (dma_mapping_error(ab->dev, ti.paddr)) {
 		atomic_inc(&ab->soc_stats.tx_err.misc_fail);
diff --git a/drivers/net/wireless/ath/ath12k/hw.c b/drivers/net/wireless/ath/ath12k/hw.c
index 2e11ea763574..7b3e2420e3c5 100644
--- a/drivers/net/wireless/ath/ath12k/hw.c
+++ b/drivers/net/wireless/ath/ath12k/hw.c
@@ -924,6 +924,8 @@  static const struct ath12k_hw_params ath12k_hw_params[] = {
 
 		.acpi_guid = NULL,
 		.supports_dynamic_smps_6ghz = true,
+
+		.iova_mask = 0,
 	},
 	{
 		.name = "wcn7850 hw2.0",
@@ -1000,6 +1002,8 @@  static const struct ath12k_hw_params ath12k_hw_params[] = {
 
 		.acpi_guid = &wcn7850_uuid,
 		.supports_dynamic_smps_6ghz = false,
+
+		.iova_mask = PCIE_MAX_PAYLOAD_SIZE - 1,
 	},
 	{
 		.name = "qcn9274 hw2.0",
@@ -1072,6 +1076,8 @@  static const struct ath12k_hw_params ath12k_hw_params[] = {
 
 		.acpi_guid = NULL,
 		.supports_dynamic_smps_6ghz = true,
+
+		.iova_mask = 0,
 	},
 };
 
diff --git a/drivers/net/wireless/ath/ath12k/hw.h b/drivers/net/wireless/ath/ath12k/hw.h
index e792eb6b249b..49668aa0efc8 100644
--- a/drivers/net/wireless/ath/ath12k/hw.h
+++ b/drivers/net/wireless/ath/ath12k/hw.h
@@ -96,6 +96,8 @@ 
 #define ATH12K_M3_FILE			"m3.bin"
 #define ATH12K_REGDB_FILE_NAME		"regdb.bin"
 
+#define PCIE_MAX_PAYLOAD_SIZE		128
+
 enum ath12k_hw_rate_cck {
 	ATH12K_HW_RATE_CCK_LP_11M = 0,
 	ATH12K_HW_RATE_CCK_LP_5_5M,
@@ -215,6 +217,8 @@  struct ath12k_hw_params {
 
 	const guid_t *acpi_guid;
 	bool supports_dynamic_smps_6ghz;
+
+	u32 iova_mask;
 };
 
 struct ath12k_hw_ops {
diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c
index 8106297f0bc1..ce41c8153080 100644
--- a/drivers/net/wireless/ath/ath12k/mac.c
+++ b/drivers/net/wireless/ath/ath12k/mac.c
@@ -9193,6 +9193,7 @@  static int ath12k_mac_hw_register(struct ath12k_hw *ah)
 
 	hw->vif_data_size = sizeof(struct ath12k_vif);
 	hw->sta_data_size = sizeof(struct ath12k_sta);
+	hw->extra_tx_headroom = ab->hw_params->iova_mask;
 
 	wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_CQM_RSSI_LIST);
 	wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_STA_TX_PWR);