diff mbox series

[1/3] wifi: ath11k: fix dest ring-buffer corruption

Message ID 20250526114803.2122-2-johan+linaro@kernel.org
State New
Headers show
Series wifi: ath11k: fix dest ring-buffer corruption | expand

Commit Message

Johan Hovold May 26, 2025, 11:48 a.m. UTC
Add the missing memory barriers to make sure that destination ring
descriptors are read after the head pointers to avoid using stale data
on weakly ordered architectures like aarch64.

Tested-on: WCN6855 hw2.1 WLAN.HSP.1.1-03125-QCAHSPSWPL_V1_V2_SILICONZ_LITE-3.6510.41

Fixes: d5c65159f289 ("ath11k: driver for Qualcomm IEEE 802.11ax devices")
Cc: stable@vger.kernel.org	# 5.6
Signed-off-by: Johan Hovold <johan+linaro@kernel.org>
---
 drivers/net/wireless/ath/ath11k/dp_rx.c | 19 +++++++++++++++++++
 drivers/net/wireless/ath/ath11k/dp_tx.c |  3 +++
 2 files changed, 22 insertions(+)

Comments

Miaoqing Pan May 29, 2025, 7:03 a.m. UTC | #1
On 5/26/2025 7:48 PM, Johan Hovold wrote:
> Add the missing memory barriers to make sure that destination ring
> descriptors are read after the head pointers to avoid using stale data
> on weakly ordered architectures like aarch64.
> 
> Tested-on: WCN6855 hw2.1 WLAN.HSP.1.1-03125-QCAHSPSWPL_V1_V2_SILICONZ_LITE-3.6510.41
> 
> Fixes: d5c65159f289 ("ath11k: driver for Qualcomm IEEE 802.11ax devices")
> Cc: stable@vger.kernel.org	# 5.6
> Signed-off-by: Johan Hovold <johan+linaro@kernel.org>
> ---
>   drivers/net/wireless/ath/ath11k/dp_rx.c | 19 +++++++++++++++++++
>   drivers/net/wireless/ath/ath11k/dp_tx.c |  3 +++
>   2 files changed, 22 insertions(+)
> 
> diff --git a/drivers/net/wireless/ath/ath11k/dp_rx.c b/drivers/net/wireless/ath/ath11k/dp_rx.c
> index ea2959305dec..dfe2d889c20f 100644
> --- a/drivers/net/wireless/ath/ath11k/dp_rx.c
> +++ b/drivers/net/wireless/ath/ath11k/dp_rx.c
> @@ -3851,6 +3851,9 @@ int ath11k_dp_process_rx_err(struct ath11k_base *ab, struct napi_struct *napi,
>   
>   	ath11k_hal_srng_access_begin(ab, srng);
>   
> +	/* Make sure descriptor is read after the head pointer. */
> +	dma_rmb();
> +

Thanks Johan, for continuing to follow up on this issue. I have some 
different opinions.

This change somewhat deviates from the fix approach described in 
https://lore.kernel.org/all/20250321095219.19369-1-johan+linaro@kernel.org/. 
In this case, the descriptor might be accessed before it is updated or 
while it is still being updated. Therefore, a dma_rmb() should be added 
after the call to ath11k_hal_srng_dst_get_next_entry() and before 
accessing ath11k_hal_ce_dst_status_get_length(), to ensure that the DMA 
has completed before reading the descriptor.

However, in this patch, the memory barrier is used to protect the head 
pointer (HP). I don't think a memory barrier is necessary for HP, 
because even if an outdated HP is fetched, 
ath11k_hal_srng_dst_get_next_entry() will return NULL and exit safely. 
So, placing the memory barrier inside 
ath11k_hal_srng_dst_get_next_entry() would be more appropriate.

@@ -678,6 +678,8 @@ u32 *ath11k_hal_srng_dst_get_next_entry(struct 
ath11k_base *ab,
         if (srng->flags & HAL_SRNG_FLAGS_CACHED)
                 ath11k_hal_srng_prefetch_desc(ab, srng);

+       dma_rmb();
+
         return desc;
  }


>   	while (budget &&
>   	       (desc = ath11k_hal_srng_dst_get_next_entry(ab, srng))) {
>   		struct hal_reo_dest_ring *reo_desc = (struct hal_reo_dest_ring *)desc;
> @@ -4154,6 +4157,9 @@ int ath11k_dp_rx_process_wbm_err(struct ath11k_base *ab,
>   
>   	ath11k_hal_srng_access_begin(ab, srng);
>   
> +	/* Make sure descriptor is read after the head pointer. */
> +	dma_rmb();
> +
>   	while (budget) {
>   		rx_desc = ath11k_hal_srng_dst_get_next_entry(ab, srng);
>   		if (!rx_desc)
> @@ -4280,6 +4286,9 @@ int ath11k_dp_process_rxdma_err(struct ath11k_base *ab, int mac_id, int budget)
>   
>   	ath11k_hal_srng_access_begin(ab, srng);
>   
> +	/* Make sure descriptor is read after the head pointer. */
> +	dma_rmb();
> +
>   	while (quota-- &&
>   	       (desc = ath11k_hal_srng_dst_get_next_entry(ab, srng))) {
>   		ath11k_hal_rx_reo_ent_paddr_get(ab, desc, &paddr, &desc_bank);
> @@ -4353,6 +4362,9 @@ void ath11k_dp_process_reo_status(struct ath11k_base *ab)
>   
>   	ath11k_hal_srng_access_begin(ab, srng);
>   
> +	/* Make sure descriptor is read after the head pointer. */
> +	dma_rmb();
> +
>   	while ((reo_desc = ath11k_hal_srng_dst_get_next_entry(ab, srng))) {
>   		tag = FIELD_GET(HAL_SRNG_TLV_HDR_TAG, *reo_desc);
>   
> @@ -5168,6 +5180,9 @@ static void ath11k_dp_rx_mon_dest_process(struct ath11k *ar, int mac_id,
>   	rx_bufs_used = 0;
>   	rx_mon_stats = &pmon->rx_mon_stats;
>   
> +	/* Make sure descriptor is read after the head pointer. */
> +	dma_rmb();
> +
>   	while ((ring_entry = ath11k_hal_srng_dst_peek(ar->ab, mon_dst_srng))) {
>   		struct sk_buff *head_msdu, *tail_msdu;
>   
> @@ -5630,6 +5645,10 @@ static int ath11k_dp_full_mon_process_rx(struct ath11k_base *ab, int mac_id,
>   	spin_lock_bh(&mon_dst_srng->lock);
>   
>   	ath11k_hal_srng_access_begin(ar->ab, mon_dst_srng);
> +
> +	/* Make sure descriptor is read after the head pointer. */
> +	dma_rmb();
> +
>   	while ((ring_entry = ath11k_hal_srng_dst_peek(ar->ab, mon_dst_srng))) {
>   		head_msdu = NULL;
>   		tail_msdu = NULL;
> diff --git a/drivers/net/wireless/ath/ath11k/dp_tx.c b/drivers/net/wireless/ath/ath11k/dp_tx.c
> index 8522c67baabf..549d17d90503 100644
> --- a/drivers/net/wireless/ath/ath11k/dp_tx.c
> +++ b/drivers/net/wireless/ath/ath11k/dp_tx.c
> @@ -700,6 +700,9 @@ void ath11k_dp_tx_completion_handler(struct ath11k_base *ab, int ring_id)
>   
>   	ath11k_hal_srng_access_begin(ab, status_ring);
>   
> +	/* Make sure descriptor is read after the head pointer. */
> +	dma_rmb();
> +
>   	while ((ATH11K_TX_COMPL_NEXT(tx_ring->tx_status_head) !=
>   		tx_ring->tx_status_tail) &&
>   	       (desc = ath11k_hal_srng_dst_get_next_entry(ab, status_ring))) {
Johan Hovold June 2, 2025, 8:03 a.m. UTC | #2
On Thu, May 29, 2025 at 03:03:38PM +0800, Miaoqing Pan wrote:
> On 5/26/2025 7:48 PM, Johan Hovold wrote:
> > Add the missing memory barriers to make sure that destination ring
> > descriptors are read after the head pointers to avoid using stale data
> > on weakly ordered architectures like aarch64.

> > @@ -3851,6 +3851,9 @@ int ath11k_dp_process_rx_err(struct ath11k_base *ab, struct napi_struct *napi,
> >   
> >   	ath11k_hal_srng_access_begin(ab, srng);
> >   
> > +	/* Make sure descriptor is read after the head pointer. */
> > +	dma_rmb();
> > +
> 
> Thanks Johan, for continuing to follow up on this issue. I have some 
> different opinions.
> 
> This change somewhat deviates from the fix approach described in 
> https://lore.kernel.org/all/20250321095219.19369-1-johan+linaro@kernel.org/. 
> In this case, the descriptor might be accessed before it is updated or 
> while it is still being updated. Therefore, a dma_rmb() should be added 
> after the call to ath11k_hal_srng_dst_get_next_entry() and before 
> accessing ath11k_hal_ce_dst_status_get_length(), to ensure that the DMA 
> has completed before reading the descriptor.
> 
> However, in this patch, the memory barrier is used to protect the head 
> pointer (HP). I don't think a memory barrier is necessary for HP, 
> because even if an outdated HP is fetched, 
> ath11k_hal_srng_dst_get_next_entry() will return NULL and exit safely. 

No, the barrier is needed between reading the head pointer and accessing
descriptor fields, that's what matters.

You can still end up with reading stale descriptor data even when
ath11k_hal_srng_dst_get_next_entry() returns non-NULL due to speculation
(that's what happens on the X13s).

Whether to place it before or after (or inside)
ath11k_hal_srng_dst_get_next_entry() is a trade off between readability, 
maintainability and whether we want to avoid unnecessary barriers in
cases like the above where we strictly only need one barrier before the
loop (or if we want to avoid the barrier in case the ring is ever
empty).

> So, placing the memory barrier inside 
> ath11k_hal_srng_dst_get_next_entry() would be more appropriate.
> 
> @@ -678,6 +678,8 @@ u32 *ath11k_hal_srng_dst_get_next_entry(struct 
> ath11k_base *ab,
>          if (srng->flags & HAL_SRNG_FLAGS_CACHED)
>                  ath11k_hal_srng_prefetch_desc(ab, srng);
> 
> +       dma_rmb();
> +
>          return desc;
>   }

So this will add a barrier in each iteration of the loop, but we only
need a single one after reading the head pointer.

[ Also note that ath11k_hal_srng_dst_peek() would similarly need a
barrier if we were to move them into those helpers. ]

Johan
Baochen Qiang June 3, 2025, 10:52 a.m. UTC | #3
On 6/2/2025 4:03 PM, Johan Hovold wrote:
> On Thu, May 29, 2025 at 03:03:38PM +0800, Miaoqing Pan wrote:
>> On 5/26/2025 7:48 PM, Johan Hovold wrote:
>>> Add the missing memory barriers to make sure that destination ring
>>> descriptors are read after the head pointers to avoid using stale data
>>> on weakly ordered architectures like aarch64.
> 
>>> @@ -3851,6 +3851,9 @@ int ath11k_dp_process_rx_err(struct ath11k_base *ab, struct napi_struct *napi,
>>>   
>>>   	ath11k_hal_srng_access_begin(ab, srng);
>>>   
>>> +	/* Make sure descriptor is read after the head pointer. */
>>> +	dma_rmb();
>>> +
>>
>> Thanks Johan, for continuing to follow up on this issue. I have some 
>> different opinions.
>>
>> This change somewhat deviates from the fix approach described in 
>> https://lore.kernel.org/all/20250321095219.19369-1-johan+linaro@kernel.org/. 
>> In this case, the descriptor might be accessed before it is updated or 
>> while it is still being updated. Therefore, a dma_rmb() should be added 
>> after the call to ath11k_hal_srng_dst_get_next_entry() and before 
>> accessing ath11k_hal_ce_dst_status_get_length(), to ensure that the DMA 
>> has completed before reading the descriptor.
>>
>> However, in this patch, the memory barrier is used to protect the head 
>> pointer (HP). I don't think a memory barrier is necessary for HP, 
>> because even if an outdated HP is fetched, 
>> ath11k_hal_srng_dst_get_next_entry() will return NULL and exit safely. 
> 
> No, the barrier is needed between reading the head pointer and accessing
> descriptor fields, that's what matters.
> 
> You can still end up with reading stale descriptor data even when
> ath11k_hal_srng_dst_get_next_entry() returns non-NULL due to speculation
> (that's what happens on the X13s).

The fact is that a dma_rmb() does not even prevent speculation, no matter where it is
placed, right? If so the whole point of dma_rmb() is to prevent from compiler reordering
or CPU reordering, but is it really possible?

The sequence is

	1# reading HP
		srng->u.dst_ring.cached_hp = READ_ONCE(*srng->u.dst_ring.hp_addr);

	2# validate HP
		if (srng->u.dst_ring.tp == srng->u.dst_ring.cached_hp)
			return NULL;

	3# get desc
		desc = srng->ring_base_vaddr + srng->u.dst_ring.tp;

	4# accessing desc
		ath11k_hal_desc_reo_parse_err(... desc, ...)

Clearly each step depends on the results of previous steps. In this case the compiler/CPU
is expected to be smart enough to not do any reordering, isn't it?

> 
> Whether to place it before or after (or inside)
> ath11k_hal_srng_dst_get_next_entry() is a trade off between readability, 
> maintainability and whether we want to avoid unnecessary barriers in
> cases like the above where we strictly only need one barrier before the
> loop (or if we want to avoid the barrier in case the ring is ever
> empty).
> 
>> So, placing the memory barrier inside 
>> ath11k_hal_srng_dst_get_next_entry() would be more appropriate.
>>
>> @@ -678,6 +678,8 @@ u32 *ath11k_hal_srng_dst_get_next_entry(struct 
>> ath11k_base *ab,
>>          if (srng->flags & HAL_SRNG_FLAGS_CACHED)
>>                  ath11k_hal_srng_prefetch_desc(ab, srng);
>>
>> +       dma_rmb();
>> +
>>          return desc;
>>   }
> 
> So this will add a barrier in each iteration of the loop, but we only
> need a single one after reading the head pointer.
> 
> [ Also note that ath11k_hal_srng_dst_peek() would similarly need a
> barrier if we were to move them into those helpers. ]
> 
> Johan
>
Johan Hovold June 3, 2025, 11:51 a.m. UTC | #4
On Tue, Jun 03, 2025 at 06:52:37PM +0800, Baochen Qiang wrote:
> On 6/2/2025 4:03 PM, Johan Hovold wrote:

> > No, the barrier is needed between reading the head pointer and accessing
> > descriptor fields, that's what matters.
> > 
> > You can still end up with reading stale descriptor data even when
> > ath11k_hal_srng_dst_get_next_entry() returns non-NULL due to speculation
> > (that's what happens on the X13s).
> 
> The fact is that a dma_rmb() does not even prevent speculation, no matter where it is
> placed, right?

It prevents the speculated load from being used.

> If so the whole point of dma_rmb() is to prevent from compiler reordering
> or CPU reordering, but is it really possible?
> 
> The sequence is
> 
> 	1# reading HP
> 		srng->u.dst_ring.cached_hp = READ_ONCE(*srng->u.dst_ring.hp_addr);
> 
> 	2# validate HP
> 		if (srng->u.dst_ring.tp == srng->u.dst_ring.cached_hp)
> 			return NULL;
> 
> 	3# get desc
> 		desc = srng->ring_base_vaddr + srng->u.dst_ring.tp;
> 
> 	4# accessing desc
> 		ath11k_hal_desc_reo_parse_err(... desc, ...)
> 
> Clearly each step depends on the results of previous steps. In this case the compiler/CPU
> is expected to be smart enough to not do any reordering, isn't it?

Steps 3 and 4 can be done speculatively before the load in step 1 is
complete as long as the result is discarded if it turns out not to be
needed.

Johan
Baochen Qiang June 4, 2025, 2:16 a.m. UTC | #5
On 6/3/2025 7:51 PM, Johan Hovold wrote:
> On Tue, Jun 03, 2025 at 06:52:37PM +0800, Baochen Qiang wrote:
>> On 6/2/2025 4:03 PM, Johan Hovold wrote:
> 
>>> No, the barrier is needed between reading the head pointer and accessing
>>> descriptor fields, that's what matters.
>>>
>>> You can still end up with reading stale descriptor data even when
>>> ath11k_hal_srng_dst_get_next_entry() returns non-NULL due to speculation
>>> (that's what happens on the X13s).
>>
>> The fact is that a dma_rmb() does not even prevent speculation, no matter where it is
>> placed, right?
> 
> It prevents the speculated load from being used.

Sorry, still not get it. To my knowledge whether the speculated load (steps 3 and 4) would
get used depends on whether the condition check pass in step 2. How does a dma_rmb() make
any difference in this process?

Could you help give a detailed explanation of how dma_rmb() works here? I mean, what issue
is there without the barrier, and how does the barrier prevent that issue? It would
be better if you could follow the pattern below in the explanation, i.e., steps and code lines ...

> 
>> If so the whole point of dma_rmb() is to prevent from compiler reordering
>> or CPU reordering, but is it really possible?
>>
>> The sequence is
>>
>> 	1# reading HP
>> 		srng->u.dst_ring.cached_hp = READ_ONCE(*srng->u.dst_ring.hp_addr);
>>
>> 	2# validate HP
>> 		if (srng->u.dst_ring.tp == srng->u.dst_ring.cached_hp)
>> 			return NULL;
>>
>> 	3# get desc
>> 		desc = srng->ring_base_vaddr + srng->u.dst_ring.tp;
>>
>> 	4# accessing desc
>> 		ath11k_hal_desc_reo_parse_err(... desc, ...)
>>
>> Clearly each step depends on the results of previous steps. In this case the compiler/CPU
>> is expected to be smart enough to not do any reordering, isn't it?
> 
> Steps 3 and 4 can be done speculatively before the load in step 1 is
> complete as long as the result is discarded if it turns out not to be
> needed.
> 
> Johan
Miaoqing Pan June 4, 2025, 2:34 a.m. UTC | #6
On 6/3/2025 7:51 PM, Johan Hovold wrote:
> On Tue, Jun 03, 2025 at 06:52:37PM +0800, Baochen Qiang wrote:
>> On 6/2/2025 4:03 PM, Johan Hovold wrote:
> 
>>> No, the barrier is needed between reading the head pointer and accessing
>>> descriptor fields, that's what matters.
>>>
>>> You can still end up with reading stale descriptor data even when
>>> ath11k_hal_srng_dst_get_next_entry() returns non-NULL due to speculation
>>> (that's what happens on the X13s).
>>
>> The fact is that a dma_rmb() does not even prevent speculation, no matter where it is
>> placed, right?
> 
> It prevents the speculated load from being used.
> 
>> If so the whole point of dma_rmb() is to prevent from compiler reordering
>> or CPU reordering, but is it really possible?
>>
>> The sequence is
>>
>> 	1# reading HP
>> 		srng->u.dst_ring.cached_hp = READ_ONCE(*srng->u.dst_ring.hp_addr);
>>
>> 	2# validate HP
>> 		if (srng->u.dst_ring.tp == srng->u.dst_ring.cached_hp)
>> 			return NULL;
>>
>> 	3# get desc
>> 		desc = srng->ring_base_vaddr + srng->u.dst_ring.tp;
>>
>> 	4# accessing desc
>> 		ath11k_hal_desc_reo_parse_err(... desc, ...)
>>
>> Clearly each step depends on the results of previous steps. In this case the compiler/CPU
>> is expected to be smart enough to not do any reordering, isn't it?
> 
> Steps 3 and 4 can be done speculatively before the load in step 1 is
> complete as long as the result is discarded if it turns out not to be
> needed.
> 

If the condition in step 2 is true and step 3 speculatively loads 
descriptor from TP before step 1, could this cause issues?

We previously had extensive discussions on this topic in the 
https://lore.kernel.org/linux-wireless/ecfe850c-b263-4bee-b888-c34178e690fc@quicinc.com/ 
thread. On my platform, dma_rmb() did not work as expected. The issue 
only disappeared after disabling PCIe endpoint relaxed ordering in 
firmware side. So it seems that HP was updated (Memory write) before 
descriptor (Memory write), which led to the problem.
Miaoqing Pan June 4, 2025, 5:32 a.m. UTC | #7
On 6/4/2025 10:34 AM, Miaoqing Pan wrote:
> 
> 
> On 6/3/2025 7:51 PM, Johan Hovold wrote:
>> On Tue, Jun 03, 2025 at 06:52:37PM +0800, Baochen Qiang wrote:
>>> On 6/2/2025 4:03 PM, Johan Hovold wrote:
>>
>>>> No, the barrier is needed between reading the head pointer and 
>>>> accessing
>>>> descriptor fields, that's what matters.
>>>>
>>>> You can still end up with reading stale descriptor data even when
>>>> ath11k_hal_srng_dst_get_next_entry() returns non-NULL due to 
>>>> speculation
>>>> (that's what happens on the X13s).
>>>
>>> The fact is that a dma_rmb() does not even prevent speculation, no 
>>> matter where it is
>>> placed, right?
>>
>> It prevents the speculated load from being used.
>>
>>> If so the whole point of dma_rmb() is to prevent from compiler 
>>> reordering
>>> or CPU reordering, but is it really possible?
>>>
>>> The sequence is
>>>
>>>     1# reading HP
>>>         srng->u.dst_ring.cached_hp = READ_ONCE(*srng- 
>>> >u.dst_ring.hp_addr);
>>>
>>>     2# validate HP
>>>         if (srng->u.dst_ring.tp == srng->u.dst_ring.cached_hp)
>>>             return NULL;
>>>
>>>     3# get desc
>>>         desc = srng->ring_base_vaddr + srng->u.dst_ring.tp;
>>>
>>>     4# accessing desc
>>>         ath11k_hal_desc_reo_parse_err(... desc, ...)
>>>
>>> Clearly each step depends on the results of previous steps. In this 
>>> case the compiler/CPU
>>> is expected to be smart enough to not do any reordering, isn't it?
>>
>> Steps 3 and 4 can be done speculatively before the load in step 1 is
>> complete as long as the result is discarded if it turns out not to be
>> needed.
>>
> 
> If the condition in step 2 is true and step 3 speculatively loads 
> descriptor from TP before step 1, could this cause issues?

Sorry for typo, if the condition in step 2 is false and step 3 
speculatively loads descriptor from TP before step 1, could this cause 
issues?

> 
> We previously had extensive discussions on this topic in the https:// 
> lore.kernel.org/linux-wireless/ecfe850c-b263-4bee-b888- 
> c34178e690fc@quicinc.com/ thread. On my platform, dma_rmb() did not work 
> as expected. The issue only disappeared after disabling PCIe endpoint 
> relaxed ordering in firmware side. So it seems that HP was updated 
> (Memory write) before descriptor (Memory write), which led to the problem.
> 
> 
> 
> 
>
Johan Hovold June 4, 2025, 6:59 a.m. UTC | #8
On Wed, Jun 04, 2025 at 10:16:23AM +0800, Baochen Qiang wrote:
> On 6/3/2025 7:51 PM, Johan Hovold wrote:
> > On Tue, Jun 03, 2025 at 06:52:37PM +0800, Baochen Qiang wrote:
> >> On 6/2/2025 4:03 PM, Johan Hovold wrote:
> > 
> >>> No, the barrier is needed between reading the head pointer and accessing
> >>> descriptor fields, that's what matters.
> >>>
> >>> You can still end up with reading stale descriptor data even when
> >>> ath11k_hal_srng_dst_get_next_entry() returns non-NULL due to speculation
> >>> (that's what happens on the X13s).
> >>
> >> The fact is that a dma_rmb() does not even prevent speculation, no matter where it is
> >> placed, right?
> > 
> > It prevents the speculated load from being used.
> 
> Sorry, still not get it. To my knowledge whether the speculated load (steps 3 and 4) would
> get used depends on whether the condition check pass in step 2. How does a dma_rmb() make
> any difference in this process?

It orders the two loads from the device so that the descriptor is not
(speculatively) loaded before the head pointer.

When the CPU sees the updated head pointer it may otherwise proceed with
using stale descriptor data. The barrier prevents this.

Johan
Johan Hovold June 4, 2025, 7:06 a.m. UTC | #9
On Wed, Jun 04, 2025 at 01:32:08PM +0800, Miaoqing Pan wrote:
> On 6/4/2025 10:34 AM, Miaoqing Pan wrote:
> > On 6/3/2025 7:51 PM, Johan Hovold wrote:
> >> On Tue, Jun 03, 2025 at 06:52:37PM +0800, Baochen Qiang wrote:
> >>> On 6/2/2025 4:03 PM, Johan Hovold wrote:
> >>
> >>>> No, the barrier is needed between reading the head pointer and 
> >>>> accessing
> >>>> descriptor fields, that's what matters.
> >>>>
> >>>> You can still end up with reading stale descriptor data even when
> >>>> ath11k_hal_srng_dst_get_next_entry() returns non-NULL due to 
> >>>> speculation
> >>>> (that's what happens on the X13s).
> >>>
> >>> The fact is that a dma_rmb() does not even prevent speculation, no 
> >>> matter where it is
> >>> placed, right?
> >>
> >> It prevents the speculated load from being used.
> >>
> >>> If so the whole point of dma_rmb() is to prevent from compiler 
> >>> reordering
> >>> or CPU reordering, but is it really possible?
> >>>
> >>> The sequence is
> >>>
> >>>     1# reading HP
> >>>         srng->u.dst_ring.cached_hp = READ_ONCE(*srng- 
> >>> >u.dst_ring.hp_addr);
> >>>
> >>>     2# validate HP
> >>>         if (srng->u.dst_ring.tp == srng->u.dst_ring.cached_hp)
> >>>             return NULL;
> >>>
> >>>     3# get desc
> >>>         desc = srng->ring_base_vaddr + srng->u.dst_ring.tp;
> >>>
> >>>     4# accessing desc
> >>>         ath11k_hal_desc_reo_parse_err(... desc, ...)
> >>>
> >>> Clearly each step depends on the results of previous steps. In this 
> >>> case the compiler/CPU
> >>> is expected to be smart enough to not do any reordering, isn't it?
> >>
> >> Steps 3 and 4 can be done speculatively before the load in step 1 is
> >> complete as long as the result is discarded if it turns out not to be
> >> needed.

> > If the condition in step 2 is true and step 3 speculatively loads 
> > descriptor from TP before step 1, could this cause issues?
> 
> Sorry for typo, if the condition in step 2 is false and step 3 
> speculatively loads descriptor from TP before step 1, could this cause 
> issues?

Almost correct; the descriptor can be loaded (from TP) before the head
pointer is loaded and thus before the condition in step 2 has been
evaluated. And if the condition in step 2 later turns out to be false,
step 4 may use stale data from before the head pointer was updated.

Johan
Miaoqing Pan June 4, 2025, 7:57 a.m. UTC | #10
On 6/4/2025 3:06 PM, Johan Hovold wrote:
> On Wed, Jun 04, 2025 at 01:32:08PM +0800, Miaoqing Pan wrote:
>> On 6/4/2025 10:34 AM, Miaoqing Pan wrote:
>>> On 6/3/2025 7:51 PM, Johan Hovold wrote:
>>>> On Tue, Jun 03, 2025 at 06:52:37PM +0800, Baochen Qiang wrote:
>>>>> On 6/2/2025 4:03 PM, Johan Hovold wrote:
>>>>
>>>>>> No, the barrier is needed between reading the head pointer and
>>>>>> accessing
>>>>>> descriptor fields, that's what matters.
>>>>>>
>>>>>> You can still end up with reading stale descriptor data even when
>>>>>> ath11k_hal_srng_dst_get_next_entry() returns non-NULL due to
>>>>>> speculation
>>>>>> (that's what happens on the X13s).
>>>>>
>>>>> The fact is that a dma_rmb() does not even prevent speculation, no
>>>>> matter where it is
>>>>> placed, right?
>>>>
>>>> It prevents the speculated load from being used.
>>>>
>>>>> If so the whole point of dma_rmb() is to prevent from compiler
>>>>> reordering
>>>>> or CPU reordering, but is it really possible?
>>>>>
>>>>> The sequence is
>>>>>
>>>>>      1# reading HP
>>>>>          srng->u.dst_ring.cached_hp = READ_ONCE(*srng-
>>>>>> u.dst_ring.hp_addr);
>>>>>
>>>>>      2# validate HP
>>>>>          if (srng->u.dst_ring.tp == srng->u.dst_ring.cached_hp)
>>>>>              return NULL;
>>>>>
>>>>>      3# get desc
>>>>>          desc = srng->ring_base_vaddr + srng->u.dst_ring.tp;
>>>>>
>>>>>      4# accessing desc
>>>>>          ath11k_hal_desc_reo_parse_err(... desc, ...)
>>>>>
>>>>> Clearly each step depends on the results of previous steps. In this
>>>>> case the compiler/CPU
>>>>> is expected to be smart enough to not do any reordering, isn't it?
>>>>
>>>> Steps 3 and 4 can be done speculatively before the load in step 1 is
>>>> complete as long as the result is discarded if it turns out not to be
>>>> needed.
> 
>>> If the condition in step 2 is true and step 3 speculatively loads
>>> descriptor from TP before step 1, could this cause issues?
>>
>> Sorry for typo, if the condition in step 2 is false and step 3
>> speculatively loads descriptor from TP before step 1, could this cause
>> issues?
> 
> Almost correct; the descriptor can be loaded (from TP) before the head
> pointer is loaded and thus before the condition in step 2 has been
> evaluated. And if the condition in step 2 later turns out to be false,
> step 4 may use stale data from before the head pointer was updated.
> 

Actually, there's a missing step between step 3 and step 4: TP+1.

TP+1:
	srng->u.dst_ring.tp += srng->entry_size

TP is managed by the CPU and points to the current first unprocessed 
descriptor, while HP and the descriptor are asynchronously updated by 
DMA. So are you saying that the descriptor obtained through speculative 
loading has not yet been updated, or is in the process of being updated?
Johan Hovold June 4, 2025, 8:07 a.m. UTC | #11
On Wed, Jun 04, 2025 at 03:57:57PM +0800, Miaoqing Pan wrote:
> On 6/4/2025 3:06 PM, Johan Hovold wrote:
> > On Wed, Jun 04, 2025 at 01:32:08PM +0800, Miaoqing Pan wrote:
> >> On 6/4/2025 10:34 AM, Miaoqing Pan wrote:
> >>> On 6/3/2025 7:51 PM, Johan Hovold wrote:
> >>>> On Tue, Jun 03, 2025 at 06:52:37PM +0800, Baochen Qiang wrote:

> >>>>> The sequence is
> >>>>>
> >>>>>      1# reading HP
> >>>>>          srng->u.dst_ring.cached_hp = READ_ONCE(*srng-
> >>>>>> u.dst_ring.hp_addr);
> >>>>>
> >>>>>      2# validate HP
> >>>>>          if (srng->u.dst_ring.tp == srng->u.dst_ring.cached_hp)
> >>>>>              return NULL;
> >>>>>
> >>>>>      3# get desc
> >>>>>          desc = srng->ring_base_vaddr + srng->u.dst_ring.tp;
> >>>>>
> >>>>>      4# accessing desc
> >>>>>          ath11k_hal_desc_reo_parse_err(... desc, ...)
> >>>>>
> >>>>> Clearly each step depends on the results of previous steps. In this
> >>>>> case the compiler/CPU
> >>>>> is expected to be smart enough to not do any reordering, isn't it?
> >>>>
> >>>> Steps 3 and 4 can be done speculatively before the load in step 1 is
> >>>> complete as long as the result is discarded if it turns out not to be
> >>>> needed.
> > 
> >>> If the condition in step 2 is true and step 3 speculatively loads
> >>> descriptor from TP before step 1, could this cause issues?
> >>
> >> Sorry for typo, if the condition in step 2 is false and step 3
> >> speculatively loads descriptor from TP before step 1, could this cause
> >> issues?
> > 
> > Almost correct; the descriptor can be loaded (from TP) before the head
> > pointer is loaded and thus before the condition in step 2 has been
> > evaluated. And if the condition in step 2 later turns out to be false,
> > step 4 may use stale data from before the head pointer was updated.
> 
> Actually, there's a missing step between step 3 and step 4: TP+1.
> 
> TP+1:
> 	srng->u.dst_ring.tp += srng->entry_size

Sure, but that is not relevant for the issue at hand.

> TP is managed by the CPU and points to the current first unprocessed 
> descriptor, while HP and the descriptor are asynchronously updated by 
> DMA. So are you saying that the descriptor obtained through speculative 
> loading has not yet been updated, or is in the process of being updated?

Exactly.

Johan
Miaoqing Pan June 4, 2025, 8:18 a.m. UTC | #12
On 6/4/2025 4:07 PM, Johan Hovold wrote:
> On Wed, Jun 04, 2025 at 03:57:57PM +0800, Miaoqing Pan wrote:
>> On 6/4/2025 3:06 PM, Johan Hovold wrote:
>>> On Wed, Jun 04, 2025 at 01:32:08PM +0800, Miaoqing Pan wrote:
>>>> On 6/4/2025 10:34 AM, Miaoqing Pan wrote:
>>>>> On 6/3/2025 7:51 PM, Johan Hovold wrote:
>>>>>> On Tue, Jun 03, 2025 at 06:52:37PM +0800, Baochen Qiang wrote:
> 
>>>>>>> The sequence is
>>>>>>>
>>>>>>>       1# reading HP
>>>>>>>           srng->u.dst_ring.cached_hp = READ_ONCE(*srng-
>>>>>>>> u.dst_ring.hp_addr);
>>>>>>>
>>>>>>>       2# validate HP
>>>>>>>           if (srng->u.dst_ring.tp == srng->u.dst_ring.cached_hp)
>>>>>>>               return NULL;
>>>>>>>
>>>>>>>       3# get desc
>>>>>>>           desc = srng->ring_base_vaddr + srng->u.dst_ring.tp;
>>>>>>>
>>>>>>>       4# accessing desc
>>>>>>>           ath11k_hal_desc_reo_parse_err(... desc, ...)
>>>>>>>
>>>>>>> Clearly each step depends on the results of previous steps. In this
>>>>>>> case the compiler/CPU
>>>>>>> is expected to be smart enough to not do any reordering, isn't it?
>>>>>>
>>>>>> Steps 3 and 4 can be done speculatively before the load in step 1 is
>>>>>> complete as long as the result is discarded if it turns out not to be
>>>>>> needed.
>>>
>>>>> If the condition in step 2 is true and step 3 speculatively loads
>>>>> descriptor from TP before step 1, could this cause issues?
>>>>
>>>> Sorry for typo, if the condition in step 2 is false and step 3
>>>> speculatively loads descriptor from TP before step 1, could this cause
>>>> issues?
>>>
>>> Almost correct; the descriptor can be loaded (from TP) before the head
>>> pointer is loaded and thus before the condition in step 2 has been
>>> evaluated. And if the condition in step 2 later turns out to be false,
>>> step 4 may use stale data from before the head pointer was updated.
>>
>> Actually, there's a missing step between step 3 and step 4: TP+1.
>>
>> TP+1:
>> 	srng->u.dst_ring.tp += srng->entry_size
> 
> Sure, but that is not relevant for the issue at hand.
> 
>> TP is managed by the CPU and points to the current first unprocessed
>> descriptor, while HP and the descriptor are asynchronously updated by
>> DMA. So are you saying that the descriptor obtained through speculative
>> loading has not yet been updated, or is in the process of being updated?
> 
> Exactly.
> 
> Johan

Thanks, makes sense.

Reviewed-by: Miaoqing Pan <quic_miaoqing@quicinc.com>
Jeff Johnson June 4, 2025, 4:24 p.m. UTC | #13
On 6/3/2025 7:34 PM, Miaoqing Pan wrote:
> We previously had extensive discussions on this topic in the 
> https://lore.kernel.org/linux-wireless/ecfe850c-b263-4bee-b888-c34178e690fc@quicinc.com/ 
> thread. On my platform, dma_rmb() did not work as expected. The issue 
> only disappeared after disabling PCIe endpoint relaxed ordering in 
> firmware side. So it seems that HP was updated (Memory write) before 
> descriptor (Memory write), which led to the problem.

Have all ath11k and ath12k firmware been updated to prevent this problem from
the firmware side?

Or do we need both the barriers and the logic to retry reading the descriptor?
Miaoqing Pan June 5, 2025, 4:01 a.m. UTC | #14
On 6/5/2025 12:24 AM, Jeff Johnson wrote:
> On 6/3/2025 7:34 PM, Miaoqing Pan wrote:
>> We previously had extensive discussions on this topic in the
>> https://lore.kernel.org/linux-wireless/ecfe850c-b263-4bee-b888-c34178e690fc@quicinc.com/
>> thread. On my platform, dma_rmb() did not work as expected. The issue
>> only disappeared after disabling PCIe endpoint relaxed ordering in
>> firmware side. So it seems that HP was updated (Memory write) before
>> descriptor (Memory write), which led to the problem.
> 
> Have all ath11k and ath12k firmware been updated to prevent this problem from
> the firmware side?
> 
No, as this is not a widespread issue, and addressing it would require 
modifying the core underlying modules of the firmware. Therefore, we 
chose not to proceed with that approach but instead used the workaround 
patch I previously submitted.

> Or do we need both the barriers and the logic to retry reading the descriptor?
> 

I think so, that would be best.
Johan Hovold June 5, 2025, 10:17 a.m. UTC | #15
On Thu, Jun 05, 2025 at 12:01:29PM +0800, Miaoqing Pan wrote:
> On 6/5/2025 12:24 AM, Jeff Johnson wrote:
> > On 6/3/2025 7:34 PM, Miaoqing Pan wrote:
> >> We previously had extensive discussions on this topic in the
> >> https://lore.kernel.org/linux-wireless/ecfe850c-b263-4bee-b888-c34178e690fc@quicinc.com/
> >> thread. On my platform, dma_rmb() did not work as expected. The issue
> >> only disappeared after disabling PCIe endpoint relaxed ordering in
> >> firmware side. So it seems that HP was updated (Memory write) before
> >> descriptor (Memory write), which led to the problem.
> > 
> > Have all ath11k and ath12k firmware been updated to prevent this problem from
> > the firmware side?
> > 
> No, as this is not a widespread issue, and addressing it would require 
> modifying the core underlying modules of the firmware. Therefore, we 
> chose not to proceed with that approach but instead used the workaround 
> patch I previously submitted.

I strongly suggest you fix this at the firmware level rather than try to
work around it in the kernel to avoid playing whack-a-mole whenever a
new (hard to track down) bug shows up.

The barriers should be enough, but if they are not then the firmware
must be fixed.

Johan
Baochen Qiang June 5, 2025, 10:54 a.m. UTC | #16
On 6/5/2025 6:17 PM, Johan Hovold wrote:
> On Thu, Jun 05, 2025 at 12:01:29PM +0800, Miaoqing Pan wrote:
>> On 6/5/2025 12:24 AM, Jeff Johnson wrote:
>>> On 6/3/2025 7:34 PM, Miaoqing Pan wrote:
>>>> We previously had extensive discussions on this topic in the
>>>> https://lore.kernel.org/linux-wireless/ecfe850c-b263-4bee-b888-c34178e690fc@quicinc.com/
>>>> thread. On my platform, dma_rmb() did not work as expected. The issue
>>>> only disappeared after disabling PCIe endpoint relaxed ordering in
>>>> firmware side. So it seems that HP was updated (Memory write) before
>>>> descriptor (Memory write), which led to the problem.
>>>
>>> Have all ath11k and ath12k firmware been updated to prevent this problem from
>>> the firmware side?
>>>
>> No, as this is not a widespread issue, and addressing it would require 
>> modifying the core underlying modules of the firmware. Therefore, we 
>> chose not to proceed with that approach but instead used the workaround 
>> patch I previously submitted.

If firmware has a concern, how about doing it on the host side? As far as I know, it is only a
register in PCI config space?

> 
> I strongly suggest you fix this at the firmware level rather than try to
> work around it in the kernel to avoid playing whack-a-mole whenever a
> new (hard to track down) bug shows up.
> 
> The barriers should be enough, but if they are not then the firmware
> must be fixed.
> 
> Johan
diff mbox series

Patch

diff --git a/drivers/net/wireless/ath/ath11k/dp_rx.c b/drivers/net/wireless/ath/ath11k/dp_rx.c
index ea2959305dec..dfe2d889c20f 100644
--- a/drivers/net/wireless/ath/ath11k/dp_rx.c
+++ b/drivers/net/wireless/ath/ath11k/dp_rx.c
@@ -3851,6 +3851,9 @@  int ath11k_dp_process_rx_err(struct ath11k_base *ab, struct napi_struct *napi,
 
 	ath11k_hal_srng_access_begin(ab, srng);
 
+	/* Make sure descriptor is read after the head pointer. */
+	dma_rmb();
+
 	while (budget &&
 	       (desc = ath11k_hal_srng_dst_get_next_entry(ab, srng))) {
 		struct hal_reo_dest_ring *reo_desc = (struct hal_reo_dest_ring *)desc;
@@ -4154,6 +4157,9 @@  int ath11k_dp_rx_process_wbm_err(struct ath11k_base *ab,
 
 	ath11k_hal_srng_access_begin(ab, srng);
 
+	/* Make sure descriptor is read after the head pointer. */
+	dma_rmb();
+
 	while (budget) {
 		rx_desc = ath11k_hal_srng_dst_get_next_entry(ab, srng);
 		if (!rx_desc)
@@ -4280,6 +4286,9 @@  int ath11k_dp_process_rxdma_err(struct ath11k_base *ab, int mac_id, int budget)
 
 	ath11k_hal_srng_access_begin(ab, srng);
 
+	/* Make sure descriptor is read after the head pointer. */
+	dma_rmb();
+
 	while (quota-- &&
 	       (desc = ath11k_hal_srng_dst_get_next_entry(ab, srng))) {
 		ath11k_hal_rx_reo_ent_paddr_get(ab, desc, &paddr, &desc_bank);
@@ -4353,6 +4362,9 @@  void ath11k_dp_process_reo_status(struct ath11k_base *ab)
 
 	ath11k_hal_srng_access_begin(ab, srng);
 
+	/* Make sure descriptor is read after the head pointer. */
+	dma_rmb();
+
 	while ((reo_desc = ath11k_hal_srng_dst_get_next_entry(ab, srng))) {
 		tag = FIELD_GET(HAL_SRNG_TLV_HDR_TAG, *reo_desc);
 
@@ -5168,6 +5180,9 @@  static void ath11k_dp_rx_mon_dest_process(struct ath11k *ar, int mac_id,
 	rx_bufs_used = 0;
 	rx_mon_stats = &pmon->rx_mon_stats;
 
+	/* Make sure descriptor is read after the head pointer. */
+	dma_rmb();
+
 	while ((ring_entry = ath11k_hal_srng_dst_peek(ar->ab, mon_dst_srng))) {
 		struct sk_buff *head_msdu, *tail_msdu;
 
@@ -5630,6 +5645,10 @@  static int ath11k_dp_full_mon_process_rx(struct ath11k_base *ab, int mac_id,
 	spin_lock_bh(&mon_dst_srng->lock);
 
 	ath11k_hal_srng_access_begin(ar->ab, mon_dst_srng);
+
+	/* Make sure descriptor is read after the head pointer. */
+	dma_rmb();
+
 	while ((ring_entry = ath11k_hal_srng_dst_peek(ar->ab, mon_dst_srng))) {
 		head_msdu = NULL;
 		tail_msdu = NULL;
diff --git a/drivers/net/wireless/ath/ath11k/dp_tx.c b/drivers/net/wireless/ath/ath11k/dp_tx.c
index 8522c67baabf..549d17d90503 100644
--- a/drivers/net/wireless/ath/ath11k/dp_tx.c
+++ b/drivers/net/wireless/ath/ath11k/dp_tx.c
@@ -700,6 +700,9 @@  void ath11k_dp_tx_completion_handler(struct ath11k_base *ab, int ring_id)
 
 	ath11k_hal_srng_access_begin(ab, status_ring);
 
+	/* Make sure descriptor is read after the head pointer. */
+	dma_rmb();
+
 	while ((ATH11K_TX_COMPL_NEXT(tx_ring->tx_status_head) !=
 		tx_ring->tx_status_tail) &&
 	       (desc = ath11k_hal_srng_dst_get_next_entry(ab, status_ring))) {