diff mbox series

[12/15] xhci: Prevent early endpoint restart when handling STALL errors.

Message ID 20250306144954.3507700-13-mathias.nyman@linux.intel.com
State New
Headers show
Series xhci features for usb-next | expand

Commit Message

Mathias Nyman March 6, 2025, 2:49 p.m. UTC
Ensure that an endpoint halted due to device STALL is not
restarted before a Clear_Feature(ENDPOINT_HALT) request is sent to
the device.

The host side of the endpoint may otherwise be started early by the
'Set TR Deq' command completion handler which is called if dequeue
is moved past a cancelled or halted TD.

Prevent this with a new flag set for bulk and interrupt endpoints
when a Stall Error is received. Clear it in hcd->endpoint_reset()
which is called after Clear_Feature(ENDPOINT_HALT) is sent.

Also add a debug message if a class driver queues a new URB after the
STALL. Note that class driver might not be aware of the STALL
yet when it submits the URB as URBs are given back in BH.

Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>
---
 drivers/usb/host/xhci-ring.c | 7 +++++--
 drivers/usb/host/xhci.c      | 6 ++++++
 drivers/usb/host/xhci.h      | 3 ++-
 3 files changed, 13 insertions(+), 3 deletions(-)

Comments

Michał Pecio March 7, 2025, 6:54 a.m. UTC | #1
> Ensure that an endpoint halted due to device STALL is not
> restarted before a Clear_Feature(ENDPOINT_HALT) request is sent to
> the device.
> 
> The host side of the endpoint may otherwise be started early by the
> 'Set TR Deq' command completion handler which is called if dequeue
> is moved past a cancelled or halted TD.
> 
> Prevent this with a new flag set for bulk and interrupt endpoints
> when a Stall Error is received. Clear it in hcd->endpoint_reset()
> which is called after Clear_Feature(ENDPOINT_HALT) is sent.
> 
> Also add a debug message if a class driver queues a new URB after
> the STALL. Note that class driver might not be aware of the STALL
> yet when it submits the URB as URBs are given back in BH.
> 
> Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>

Sorry for coming this late, but I haven't looked closely at some
of those xhci/for-next patches before.

This one is unfortunately incomplete, as follows:

> drivers/usb/host/xhci-ring.c | 7 +++++--
> drivers/usb/host/xhci.c      | 6 ++++++
> drivers/usb/host/xhci.h      | 3 ++-
> 3 files changed, 13 insertions(+), 3 deletions(-)
>
>diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
>index c2e15a27338b..7643ab9ec3b4 100644
>--- a/drivers/usb/host/xhci-ring.c
>+++ b/drivers/usb/host/xhci-ring.c
>@@ -556,8 +556,8 @@ void xhci_ring_ep_doorbell(struct xhci_hcd *xhci,
> 	 * pointer command pending because the device can choose to start any
> 	 * stream once the endpoint is on the HW schedule.
> 	 */
>-	if ((ep_state & EP_STOP_CMD_PENDING) || (ep_state & SET_DEQ_PENDING) ||
>-	    (ep_state & EP_HALTED) || (ep_state & EP_CLEARING_TT))
>+	if (ep_state & (EP_STOP_CMD_PENDING | SET_DEQ_PENDING | EP_HALTED |
>+			EP_CLEARING_TT | EP_STALLED))
> 		return;

Any flag added to this list needs to be added to xhci_urb_dequeue() too
so it knowns that the endpoint is held in Stopped state and URBs can be
unlinked without trying to stop it again.

There really should be a helper function used both here and there, but
those Stop EP patches were meant for stable and I strived to make them
small and noninvasive. Then I forgot about this cleanup.

NB: I also forgot about a bunch of low-impact halted EP handling bugs,
I will try to rebase and send them out today or over the weekend.

>  	trace_xhci_ring_ep_doorbell(slot_id, DB_VALUE(ep_index, stream_id));
> @@ -2555,6 +2555,9 @@ static void process_bulk_intr_td(struct xhci_hcd *xhci, struct xhci_virt_ep *ep,
>  
>  		xhci_handle_halted_endpoint(xhci, ep, td, EP_SOFT_RESET);
>  		return;
> +	case COMP_STALL_ERROR:
> +		ep->ep_state |= EP_STALLED;
> +		break;
>  	default:
>  		/* do nothing */
>  		break;
> diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
> index 3f2cd546a7a2..0c22b78358b9 100644
> --- a/drivers/usb/host/xhci.c
> +++ b/drivers/usb/host/xhci.c
> @@ -1604,6 +1604,11 @@ static int xhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag
>  		goto free_priv;
>  	}
>  
> +	/* Class driver might not be aware ep halted due to async URB giveback */
> +	if (*ep_state & EP_STALLED)
> +		dev_dbg(&urb->dev->dev, "URB %p queued before clearing halt\n",
> +			urb);
> +
>  	switch (usb_endpoint_type(&urb->ep->desc)) {
>  
>  	case USB_ENDPOINT_XFER_CONTROL:
> @@ -3202,6 +3207,7 @@ static void xhci_endpoint_reset(struct usb_hcd *hcd,
>  		return;
>  
>  	ep = &vdev->eps[ep_index];
> +	ep->ep_state &= ~EP_STALLED;

... and clearing any of those flags has always been followed by calling
xhci_ring_ep_doorbell() again, to ensure that the endpoint is restarted
if it has URBs on it but restart was held off due to the flag.

xhci_urb_dequeue() relies on this too, because it looked lke sensible
design: if you have reasons not to run the EP, you set a flag. Reasons
are gone, you clear the flag and it's running again.

> 	/* Bail out if toggle is already being cleared by a endpoint reset */
> 	spin_lock_irqsave(&xhci->lock, flags);
>diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h
>index cd96e0a8c593..4ee14f651d36 100644
>--- a/drivers/usb/host/xhci.h
>+++ b/drivers/usb/host/xhci.h
>@@ -664,7 +664,7 @@ struct xhci_virt_ep {
> 	unsigned int			err_count;
> 	unsigned int			ep_state;
> #define SET_DEQ_PENDING		(1 << 0)
>-#define EP_HALTED		(1 << 1)	/* For stall handling */
>+#define EP_HALTED		(1 << 1)	/* Halted host ep handling */
> #define EP_STOP_CMD_PENDING	(1 << 2)	/* For URB cancellation */
> /* Transitioning the endpoint to using streams, don't enqueue URBs */
> #define EP_GETTING_STREAMS	(1 << 3)
>@@ -675,6 +675,7 @@ struct xhci_virt_ep {
> #define EP_SOFT_CLEAR_TOGGLE	(1 << 7)
> /* usb_hub_clear_tt_buffer is in progress */
> #define EP_CLEARING_TT		(1 << 8)
>+#define EP_STALLED		(1 << 9)	/* For stall handling */

I guess usage rules of those flags should be documented somewhere here
and helpers added such as:

xhci_ep_cancel_pending()
xhci_ep_held_stopped()

to improve maintainability and prevent similar problems in the future.


I could sit and write something, I still have this stuff quite fresh
in memory after spending a few weeks debugging those crazy HW races.

Regards,
Michal
Mathias Nyman March 7, 2025, 4:18 p.m. UTC | #2
On 7.3.2025 17.44, Michał Pecio wrote:
> On Fri, 7 Mar 2025 16:23:17 +0200, Mathias Nyman wrote:
>>> Any flag added to this list needs to be added to xhci_urb_dequeue()
>>> too so it knowns that the endpoint is held in Stopped state and
>>> URBs can be unlinked without trying to stop it again.
>>
>> In this case it's intentional.
>>
>> If we prevent xhci_urb_dequeue() from queuing a stop endpoint command
>> due to a flag, then we must make sure the cancelled URB is given back
>> in the same place we clear the flag, like we do in the command
>> completion handlers that clear EP_HALTED and SET_DEQ_PENDING.
> 
> I'm not sure why this would be, what's the problem with the approach
> used for EP_CLEARING_TT currently? And if there is a problem, doesn't
> EP_CLEARING_TT also have this problem?
> 
> In this case, xhci_urb_dequeue() simply takes xhci->lock and calls:
> 
> void xhci_process_cancelled_tds(struct xhci_virt_ep *ep)
> {
>          xhci_invalidate_cancelled_tds(ep);
>          xhci_giveback_invalidated_tds(ep);
> }
> 
> Unlinked URBs are either given back instantly, or Set TR Dequeue is
> queued (and flagged on ep->ep_state) and the rest of the process goes
> same way as usual when called from xhci_handle_cmd_stop_ep().
> 
> The EP will be restarted when the last flag is cleared, which may be
> either SET_DEQ_PENDING or EP_CLEARING_TT/EP_STALLED.
> 
> It's practically an optimization which eliminates the dummy Stop EP
> command from the process. I thought EP_STALLED could use it.
> 

This should work, and avoid that unnecessary stop endpoint command.

Just need to make sure we check for EP_STALLED flag after the other
(EP_STOP_CMD_PENDING | EP_HALTED | SET_DEQ_PENDING) flags in
xhci_urb_dequeue(), just like EP_CLEARING_TT case.

Also need to protect clearing the EP_STALLED flag with the lock

I'll either send an update patch next week, or during rc cycle if
that's too late.

Thanks
Mathias
diff mbox series

Patch

diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index c2e15a27338b..7643ab9ec3b4 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -556,8 +556,8 @@  void xhci_ring_ep_doorbell(struct xhci_hcd *xhci,
 	 * pointer command pending because the device can choose to start any
 	 * stream once the endpoint is on the HW schedule.
 	 */
-	if ((ep_state & EP_STOP_CMD_PENDING) || (ep_state & SET_DEQ_PENDING) ||
-	    (ep_state & EP_HALTED) || (ep_state & EP_CLEARING_TT))
+	if (ep_state & (EP_STOP_CMD_PENDING | SET_DEQ_PENDING | EP_HALTED |
+			EP_CLEARING_TT | EP_STALLED))
 		return;
 
 	trace_xhci_ring_ep_doorbell(slot_id, DB_VALUE(ep_index, stream_id));
@@ -2555,6 +2555,9 @@  static void process_bulk_intr_td(struct xhci_hcd *xhci, struct xhci_virt_ep *ep,
 
 		xhci_handle_halted_endpoint(xhci, ep, td, EP_SOFT_RESET);
 		return;
+	case COMP_STALL_ERROR:
+		ep->ep_state |= EP_STALLED;
+		break;
 	default:
 		/* do nothing */
 		break;
diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index 3f2cd546a7a2..0c22b78358b9 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -1604,6 +1604,11 @@  static int xhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag
 		goto free_priv;
 	}
 
+	/* Class driver might not be aware ep halted due to async URB giveback */
+	if (*ep_state & EP_STALLED)
+		dev_dbg(&urb->dev->dev, "URB %p queued before clearing halt\n",
+			urb);
+
 	switch (usb_endpoint_type(&urb->ep->desc)) {
 
 	case USB_ENDPOINT_XFER_CONTROL:
@@ -3202,6 +3207,7 @@  static void xhci_endpoint_reset(struct usb_hcd *hcd,
 		return;
 
 	ep = &vdev->eps[ep_index];
+	ep->ep_state &= ~EP_STALLED;
 
 	/* Bail out if toggle is already being cleared by a endpoint reset */
 	spin_lock_irqsave(&xhci->lock, flags);
diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h
index cd96e0a8c593..4ee14f651d36 100644
--- a/drivers/usb/host/xhci.h
+++ b/drivers/usb/host/xhci.h
@@ -664,7 +664,7 @@  struct xhci_virt_ep {
 	unsigned int			err_count;
 	unsigned int			ep_state;
 #define SET_DEQ_PENDING		(1 << 0)
-#define EP_HALTED		(1 << 1)	/* For stall handling */
+#define EP_HALTED		(1 << 1)	/* Halted host ep handling */
 #define EP_STOP_CMD_PENDING	(1 << 2)	/* For URB cancellation */
 /* Transitioning the endpoint to using streams, don't enqueue URBs */
 #define EP_GETTING_STREAMS	(1 << 3)
@@ -675,6 +675,7 @@  struct xhci_virt_ep {
 #define EP_SOFT_CLEAR_TOGGLE	(1 << 7)
 /* usb_hub_clear_tt_buffer is in progress */
 #define EP_CLEARING_TT		(1 << 8)
+#define EP_STALLED		(1 << 9)	/* For stall handling */
 	/* ----  Related to URB cancellation ---- */
 	struct list_head	cancelled_td_list;
 	struct xhci_hcd		*xhci;