diff mbox series

[v5,3/5] iommu/arm-smmu: Fix spurious interrupts with stall-on-fault

Message ID 20250319-msm-gpu-fault-fixes-next-v5-3-97561209dd8c@gmail.com
State New
Headers show
Series iommu/arm-smmu, drm/msm: Fixes for stall-on-fault | expand

Commit Message

Connor Abbott March 19, 2025, 2:44 p.m. UTC
On some SMMUv2 implementations, including MMU-500, SMMU_CBn_FSR.SS
asserts an interrupt. The only way to clear that bit is to resume the
transaction by writing SMMU_CBn_RESUME, but typically resuming the
transaction requires complex operations (copying in pages, etc.) that
can't be done in IRQ context. drm/msm already has a problem, because
its fault handler sometimes schedules a job to dump the GPU state and
doesn't resume translation until this is complete.

Work around this by disabling context fault interrupts until after the
transaction is resumed. Because other context banks can share an IRQ
line, we may still get an interrupt intended for another context bank.
In that case only SMMU_CBn_FSR.SS will be asserted for this bank, and we
skip it on the assumption that its interrupts are disabled; this is
accomplished by removing the SS bit from ARM_SMMU_CB_FSR_FAULT.
SMMU_CBn_FSR.SS won't be asserted unless an external user enabled
stall-on-fault, and they are expected to resume the transaction and
re-enable interrupts.

Signed-off-by: Connor Abbott <cwabbott0@gmail.com>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Reviewed-by: Rob Clark <robdclark@gmail.com>
---
 drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 15 ++++++++++-
 drivers/iommu/arm/arm-smmu/arm-smmu.c      | 41 +++++++++++++++++++++++++++++-
 drivers/iommu/arm/arm-smmu/arm-smmu.h      |  1 -
 3 files changed, 54 insertions(+), 3 deletions(-)

Comments

Will Deacon May 6, 2025, 12:24 p.m. UTC | #1
On Wed, Mar 19, 2025 at 10:44:02AM -0400, Connor Abbott wrote:
> On some SMMUv2 implementations, including MMU-500, SMMU_CBn_FSR.SS
> asserts an interrupt. The only way to clear that bit is to resume the
> transaction by writing SMMU_CBn_RESUME, but typically resuming the
> transaction requires complex operations (copying in pages, etc.) that
> can't be done in IRQ context. drm/msm already has a problem, because
> its fault handler sometimes schedules a job to dump the GPU state and
> doesn't resume translation until this is complete.
> 
> Work around this by disabling context fault interrupts until after the
> transaction is resumed. Because other context banks can share an IRQ
> line, we may still get an interrupt intended for another context bank,
> but in this case only SMMU_CBn_FSR.SS will be asserted and we can skip
> it assuming that interrupts are disabled which is accomplished by
> removing the bit from ARM_SMMU_CB_FSR_FAULT. SMMU_CBn_FSR.SS won't be
> asserted unless an external user enabled stall-on-fault, and they are
> expected to resume the translation and re-enable interrupts.
> 
> Signed-off-by: Connor Abbott <cwabbott0@gmail.com>
> Reviewed-by Robin Murphy <robin.murphy@arm.com>
> Reviewed-by: Rob Clark <robdclark@gmail.com>
> ---
>  drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 15 ++++++++++-
>  drivers/iommu/arm/arm-smmu/arm-smmu.c      | 41 +++++++++++++++++++++++++++++-
>  drivers/iommu/arm/arm-smmu/arm-smmu.h      |  1 -
>  3 files changed, 54 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
> index 186d6ad4fd1c990398df4dec53f4d58ada9e658c..a428e53add08d451fb2152e3ab80e0fba936e214 100644
> --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
> +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
> @@ -90,12 +90,25 @@ static void qcom_adreno_smmu_resume_translation(const void *cookie, bool termina
>  	struct arm_smmu_domain *smmu_domain = (void *)cookie;
>  	struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
>  	struct arm_smmu_device *smmu = smmu_domain->smmu;
> -	u32 reg = 0;
> +	u32 reg = 0, sctlr;
> +	unsigned long flags;
>  
>  	if (terminate)
>  		reg |= ARM_SMMU_RESUME_TERMINATE;
>  
> +	spin_lock_irqsave(&smmu_domain->cb_lock, flags);
> +
>  	arm_smmu_cb_write(smmu, cfg->cbndx, ARM_SMMU_CB_RESUME, reg);
> +
> +	/*
> +	 * Re-enable interrupts after they were disabled by
> +	 * arm_smmu_context_fault().
> +	 */
> +	sctlr = arm_smmu_cb_read(smmu, cfg->cbndx, ARM_SMMU_CB_SCTLR);
> +	sctlr |= ARM_SMMU_SCTLR_CFIE;
> +	arm_smmu_cb_write(smmu, cfg->cbndx, ARM_SMMU_CB_SCTLR, sctlr);
> +
> +	spin_unlock_irqrestore(&smmu_domain->cb_lock, flags);
>  }
>  
>  #define QCOM_ADRENO_SMMU_GPU_SID 0
> diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
> index c7b5d7c093e71050d29a834c8d33125e96b04d81..9927f3431a2eab913750e6079edc6393d1938c98 100644
> --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
> +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
> @@ -470,13 +470,52 @@ static irqreturn_t arm_smmu_context_fault(int irq, void *dev)
>  	if (!(cfi->fsr & ARM_SMMU_CB_FSR_FAULT))
>  		return IRQ_NONE;
>  
> +	/*
> +	 * On some implementations FSR.SS asserts a context fault
> +	 * interrupt. We do not want this behavior, because resolving the
> +	 * original context fault typically requires operations that cannot be
> +	 * performed in IRQ context but leaving the stall unacknowledged will
> +	 * immediately lead to another spurious interrupt as FSR.SS is still
> +	 * set. Work around this by disabling interrupts for this context bank.
> +	 * It's expected that interrupts are re-enabled after resuming the
> +	 * translation.

s/translation/transaction/

> +	 *
> +	 * We have to do this before report_iommu_fault() so that we don't
> +	 * leave interrupts disabled in case the downstream user decides the
> +	 * fault can be resolved inside its fault handler.
> +	 *
> +	 * There is a possible race if there are multiple context banks sharing
> +	 * the same interrupt and both signal an interrupt in between writing
> +	 * RESUME and SCTLR. We could disable interrupts here before we
> +	 * re-enable them in the resume handler, leaving interrupts enabled.
> +	 * Lock the write to serialize it with the resume handler.
> +	 */

I'm struggling to understand this last part. If the resume handler runs
synchronously from report_iommu_fault(), then there's no need for
locking because we're in interrupt context. If the resume handler can
run asynchronously from report_iommu_fault(), then the locking doesn't
help because the code below could clear CFIE right after the resume
handler has set it.

I think the broader problem is that we can end up reporting a spurious
fault when the irq is shared between context banks. However, your change
to ARM_SMMU_CB_FSR_FAULT fixes that afaict:

> diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h
> index ff84ce3b8d8567d3402e945e8277ca2a87df9a4e..5fe8e482457f905529a08aea14ea5656d3e31328 100644
> --- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
> +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
> @@ -214,7 +214,6 @@ enum arm_smmu_cbar_type {
>  					 ARM_SMMU_CB_FSR_TLBLKF)
>  
>  #define ARM_SMMU_CB_FSR_FAULT		(ARM_SMMU_CB_FSR_MULTI |	\
> -					 ARM_SMMU_CB_FSR_SS |		\
>  					 ARM_SMMU_CB_FSR_UUT |		\
>  					 ARM_SMMU_CB_FSR_EF |		\
>  					 ARM_SMMU_CB_FSR_PF |		\

because now any subsequent spurious IRQs will return IRQ_NONE. So I
don't see why we need the locking at all. What am I missing?

Also, I think your change probably exposes a benign bug in the qcom
debug code as qcom_tbu_halt() has the following:

if ((fsr & ARM_SMMU_CB_FSR_FAULT) && (fsr & ARM_SMMU_CB_FSR_SS))
	...

Will
Connor Abbott May 6, 2025, 2:08 p.m. UTC | #2
On Tue, May 6, 2025 at 8:24 AM Will Deacon <will@kernel.org> wrote:
>
> On Wed, Mar 19, 2025 at 10:44:02AM -0400, Connor Abbott wrote:
> > On some SMMUv2 implementations, including MMU-500, SMMU_CBn_FSR.SS
> > asserts an interrupt. The only way to clear that bit is to resume the
> > transaction by writing SMMU_CBn_RESUME, but typically resuming the
> > transaction requires complex operations (copying in pages, etc.) that
> > can't be done in IRQ context. drm/msm already has a problem, because
> > its fault handler sometimes schedules a job to dump the GPU state and
> > doesn't resume translation until this is complete.
> >
> > Work around this by disabling context fault interrupts until after the
> > transaction is resumed. Because other context banks can share an IRQ
> > line, we may still get an interrupt intended for another context bank,
> > but in this case only SMMU_CBn_FSR.SS will be asserted and we can skip
> > it assuming that interrupts are disabled which is accomplished by
> > removing the bit from ARM_SMMU_CB_FSR_FAULT. SMMU_CBn_FSR.SS won't be
> > asserted unless an external user enabled stall-on-fault, and they are
> > expected to resume the translation and re-enable interrupts.
> >
> > Signed-off-by: Connor Abbott <cwabbott0@gmail.com>
> > Reviewed-by Robin Murphy <robin.murphy@arm.com>
> > Reviewed-by: Rob Clark <robdclark@gmail.com>
> > ---
> >  drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 15 ++++++++++-
> >  drivers/iommu/arm/arm-smmu/arm-smmu.c      | 41 +++++++++++++++++++++++++++++-
> >  drivers/iommu/arm/arm-smmu/arm-smmu.h      |  1 -
> >  3 files changed, 54 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
> > index 186d6ad4fd1c990398df4dec53f4d58ada9e658c..a428e53add08d451fb2152e3ab80e0fba936e214 100644
> > --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
> > +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
> > @@ -90,12 +90,25 @@ static void qcom_adreno_smmu_resume_translation(const void *cookie, bool termina
> >       struct arm_smmu_domain *smmu_domain = (void *)cookie;
> >       struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
> >       struct arm_smmu_device *smmu = smmu_domain->smmu;
> > -     u32 reg = 0;
> > +     u32 reg = 0, sctlr;
> > +     unsigned long flags;
> >
> >       if (terminate)
> >               reg |= ARM_SMMU_RESUME_TERMINATE;
> >
> > +     spin_lock_irqsave(&smmu_domain->cb_lock, flags);
> > +
> >       arm_smmu_cb_write(smmu, cfg->cbndx, ARM_SMMU_CB_RESUME, reg);
> > +
> > +     /*
> > +      * Re-enable interrupts after they were disabled by
> > +      * arm_smmu_context_fault().
> > +      */
> > +     sctlr = arm_smmu_cb_read(smmu, cfg->cbndx, ARM_SMMU_CB_SCTLR);
> > +     sctlr |= ARM_SMMU_SCTLR_CFIE;
> > +     arm_smmu_cb_write(smmu, cfg->cbndx, ARM_SMMU_CB_SCTLR, sctlr);
> > +
> > +     spin_unlock_irqrestore(&smmu_domain->cb_lock, flags);
> >  }
> >
> >  #define QCOM_ADRENO_SMMU_GPU_SID 0
> > diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
> > index c7b5d7c093e71050d29a834c8d33125e96b04d81..9927f3431a2eab913750e6079edc6393d1938c98 100644
> > --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
> > +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
> > @@ -470,13 +470,52 @@ static irqreturn_t arm_smmu_context_fault(int irq, void *dev)
> >       if (!(cfi->fsr & ARM_SMMU_CB_FSR_FAULT))
> >               return IRQ_NONE;
> >
> > +     /*
> > +      * On some implementations FSR.SS asserts a context fault
> > +      * interrupt. We do not want this behavior, because resolving the
> > +      * original context fault typically requires operations that cannot be
> > +      * performed in IRQ context but leaving the stall unacknowledged will
> > +      * immediately lead to another spurious interrupt as FSR.SS is still
> > +      * set. Work around this by disabling interrupts for this context bank.
> > +      * It's expected that interrupts are re-enabled after resuming the
> > +      * translation.
>
> s/translation/transaction/
>
> > +      *
> > +      * We have to do this before report_iommu_fault() so that we don't
> > +      * leave interrupts disabled in case the downstream user decides the
> > +      * fault can be resolved inside its fault handler.
> > +      *
> > +      * There is a possible race if there are multiple context banks sharing
> > +      * the same interrupt and both signal an interrupt in between writing
> > +      * RESUME and SCTLR. We could disable interrupts here before we
> > +      * re-enable them in the resume handler, leaving interrupts enabled.
> > +      * Lock the write to serialize it with the resume handler.
> > +      */
>
> I'm struggling to understand this last part. If the resume handler runs
> synchronously from report_iommu_fault(), then there's no need for
> locking because we're in interrupt context. If the resume handler can
> run asynchronously from report_iommu_fault(), then the locking doesn't
> help because the code below could clear CFIE right after the resume
> handler has set it.

The problem is indeed when the resume handler runs asynchronously.
Clearing CFIE right after the resume handler has set it is normal and
expected. The issue is the opposite, i.e. something like:

- Resume handler writes RESUME and stalls for some reason
- The interrupt handler runs through and clears CFIE while it's already cleared
- Resume handler sets CFIE, assuming that the handler hasn't run yet
but it actually has

This wouldn't happen with only one context bank, because we wouldn't
get an interrupt until the resume handler sets CFIE, but with multiple
context banks and a shared interrupt line we could get a "spurious"
interrupt due to a fault in an earlier context bank that becomes not
spurious if the resume handler writes RESUME before the context fault
handler for this bank reads FSR above.

Connor

>
> I think the broader problem is that we can end up reporting a spurious
> fault when the irq is shared between context banks. However, your change
> to ARM_SMMU_CB_FSR_FAULT fixes that afaict:
>
> > diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h
> > index ff84ce3b8d8567d3402e945e8277ca2a87df9a4e..5fe8e482457f905529a08aea14ea5656d3e31328 100644
> > --- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
> > +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
> > @@ -214,7 +214,6 @@ enum arm_smmu_cbar_type {
> >                                        ARM_SMMU_CB_FSR_TLBLKF)
> >
> >  #define ARM_SMMU_CB_FSR_FAULT                (ARM_SMMU_CB_FSR_MULTI |        \
> > -                                      ARM_SMMU_CB_FSR_SS |           \
> >                                        ARM_SMMU_CB_FSR_UUT |          \
> >                                        ARM_SMMU_CB_FSR_EF |           \
> >                                        ARM_SMMU_CB_FSR_PF |           \
>
> because now any subsequent spurious IRQs will return IRQ_NONE. So I
> don't see why we need the locking at all. What am I missing?
>
> Also, I think your change probably exposes a benign bug in the qcom
> debug code as qcom_tbu_halt() has the following:
>
> if ((fsr & ARM_SMMU_CB_FSR_FAULT) && (fsr & ARM_SMMU_CB_FSR_SS))
>         ...
>
> Will
Will Deacon May 6, 2025, 2:53 p.m. UTC | #3
On Tue, May 06, 2025 at 10:08:05AM -0400, Connor Abbott wrote:
> On Tue, May 6, 2025 at 8:24 AM Will Deacon <will@kernel.org> wrote:
> > On Wed, Mar 19, 2025 at 10:44:02AM -0400, Connor Abbott wrote:
> > > diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
> > > index c7b5d7c093e71050d29a834c8d33125e96b04d81..9927f3431a2eab913750e6079edc6393d1938c98 100644
> > > --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
> > > +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
> > > @@ -470,13 +470,52 @@ static irqreturn_t arm_smmu_context_fault(int irq, void *dev)
> > >       if (!(cfi->fsr & ARM_SMMU_CB_FSR_FAULT))
> > >               return IRQ_NONE;
> > >
> > > +     /*
> > > +      * On some implementations FSR.SS asserts a context fault
> > > +      * interrupt. We do not want this behavior, because resolving the
> > > +      * original context fault typically requires operations that cannot be
> > > +      * performed in IRQ context but leaving the stall unacknowledged will
> > > +      * immediately lead to another spurious interrupt as FSR.SS is still
> > > +      * set. Work around this by disabling interrupts for this context bank.
> > > +      * It's expected that interrupts are re-enabled after resuming the
> > > +      * translation.
> >
> > s/translation/transaction/
> >
> > > +      *
> > > +      * We have to do this before report_iommu_fault() so that we don't
> > > +      * leave interrupts disabled in case the downstream user decides the
> > > +      * fault can be resolved inside its fault handler.
> > > +      *
> > > +      * There is a possible race if there are multiple context banks sharing
> > > +      * the same interrupt and both signal an interrupt in between writing
> > > +      * RESUME and SCTLR. We could disable interrupts here before we
> > > +      * re-enable them in the resume handler, leaving interrupts enabled.
> > > +      * Lock the write to serialize it with the resume handler.
> > > +      */
> >
> > I'm struggling to understand this last part. If the resume handler runs
> > synchronously from report_iommu_fault(), then there's no need for
> > locking because we're in interrupt context. If the resume handler can
> > run asynchronously from report_iommu_fault(), then the locking doesn't
> > help because the code below could clear CFIE right after the resume
> > handler has set it.
> 
> The problem is indeed when the resume handler runs asynchronously.
> Clearing CFIE right after the resume handler has set it is normal and
> expected. The issue is the opposite, i.e. something like:
> 
> - Resume handler writes RESUME and stalls for some reason
> - The interrupt handler runs through and clears CFIE while it's already cleared
> - Resume handler sets CFIE, assuming that the handler hasn't run yet
> but it actually has
> 
> This wouldn't happen with only one context bank, because we wouldn't
> get an interrupt until the resume handler sets CFIE, but with multiple
> context banks and a shared interrupt line we could get a "spurious"
> interrupt due to a fault in an earlier context bank that becomes not
> spurious if the resume handler writes RESUME before the context fault
> handler for this bank reads FSR above.

Ah, gotcha. Thanks for the explanation.

If we moved the RESUME+CFIE into the interrupt handler after the call
to report_iommu_fault(), would it be possible to run the handler as a
threaded irq (see 'context_fault_needs_threaded_irq') and handle the
callback synchronously? In that case, I think we could avoid taking the
lock if we wrote CFIE _before_ RESUME.

Will
Connor Abbott May 6, 2025, 3:18 p.m. UTC | #4
On Tue, May 6, 2025 at 10:53 AM Will Deacon <will@kernel.org> wrote:
>
> On Tue, May 06, 2025 at 10:08:05AM -0400, Connor Abbott wrote:
> > On Tue, May 6, 2025 at 8:24 AM Will Deacon <will@kernel.org> wrote:
> > > On Wed, Mar 19, 2025 at 10:44:02AM -0400, Connor Abbott wrote:
> > > > diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
> > > > index c7b5d7c093e71050d29a834c8d33125e96b04d81..9927f3431a2eab913750e6079edc6393d1938c98 100644
> > > > --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
> > > > +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
> > > > @@ -470,13 +470,52 @@ static irqreturn_t arm_smmu_context_fault(int irq, void *dev)
> > > >       if (!(cfi->fsr & ARM_SMMU_CB_FSR_FAULT))
> > > >               return IRQ_NONE;
> > > >
> > > > +     /*
> > > > +      * On some implementations FSR.SS asserts a context fault
> > > > +      * interrupt. We do not want this behavior, because resolving the
> > > > +      * original context fault typically requires operations that cannot be
> > > > +      * performed in IRQ context but leaving the stall unacknowledged will
> > > > +      * immediately lead to another spurious interrupt as FSR.SS is still
> > > > +      * set. Work around this by disabling interrupts for this context bank.
> > > > +      * It's expected that interrupts are re-enabled after resuming the
> > > > +      * translation.
> > >
> > > s/translation/transaction/
> > >
> > > > +      *
> > > > +      * We have to do this before report_iommu_fault() so that we don't
> > > > +      * leave interrupts disabled in case the downstream user decides the
> > > > +      * fault can be resolved inside its fault handler.
> > > > +      *
> > > > +      * There is a possible race if there are multiple context banks sharing
> > > > +      * the same interrupt and both signal an interrupt in between writing
> > > > +      * RESUME and SCTLR. We could disable interrupts here before we
> > > > +      * re-enable them in the resume handler, leaving interrupts enabled.
> > > > +      * Lock the write to serialize it with the resume handler.
> > > > +      */
> > >
> > > I'm struggling to understand this last part. If the resume handler runs
> > > synchronously from report_iommu_fault(), then there's no need for
> > > locking because we're in interrupt context. If the resume handler can
> > > run asynchronously from report_iommu_fault(), then the locking doesn't
> > > help because the code below could clear CFIE right after the resume
> > > handler has set it.
> >
> > The problem is indeed when the resume handler runs asynchronously.
> > Clearing CFIE right after the resume handler has set it is normal and
> > expected. The issue is the opposite, i.e. something like:
> >
> > - Resume handler writes RESUME and stalls for some reason
> > - The interrupt handler runs through and clears CFIE while it's already cleared
> > - Resume handler sets CFIE, assuming that the handler hasn't run yet
> > but it actually has
> >
> > This wouldn't happen with only one context bank, because we wouldn't
> > get an interrupt until the resume handler sets CFIE, but with multiple
> > context banks and a shared interrupt line we could get a "spurious"
> > interrupt due to a fault in an earlier context bank that becomes not
> > spurious if the resume handler writes RESUME before the context fault
> > handler for this bank reads FSR above.
>
> Ah, gotcha. Thanks for the explanation.
>
> If we moved the RESUME+CFIE into the interrupt handler after the call
> to report_iommu_fault(), would it be possible to run the handler as a
> threaded irq (see 'context_fault_needs_threaded_irq') and handle the
> callback synchronously? In that case, I think we could avoid taking the
> lock if we wrote CFIE _before_ RESUME.
>
> Will

We need the lock anyway due to the parallel manipulation of CFCFG in
the same register introduced in the next patch. Expanding it to also
cover the write to RESUME is not a huge deal. Also, doing it
synchronously would require rewriting the fault handling in drm/msm
and again I'm trying to fix this serious stability problem now as soon
as possible without getting dragged into rewriting the whole thing.

Connor
diff mbox series

Patch

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
index 186d6ad4fd1c990398df4dec53f4d58ada9e658c..a428e53add08d451fb2152e3ab80e0fba936e214 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
@@ -90,12 +90,25 @@  static void qcom_adreno_smmu_resume_translation(const void *cookie, bool termina
 	struct arm_smmu_domain *smmu_domain = (void *)cookie;
 	struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
 	struct arm_smmu_device *smmu = smmu_domain->smmu;
-	u32 reg = 0;
+	u32 reg = 0, sctlr;
+	unsigned long flags;
 
 	if (terminate)
 		reg |= ARM_SMMU_RESUME_TERMINATE;
 
+	spin_lock_irqsave(&smmu_domain->cb_lock, flags);
+
 	arm_smmu_cb_write(smmu, cfg->cbndx, ARM_SMMU_CB_RESUME, reg);
+
+	/*
+	 * Re-enable interrupts after they were disabled by
+	 * arm_smmu_context_fault().
+	 */
+	sctlr = arm_smmu_cb_read(smmu, cfg->cbndx, ARM_SMMU_CB_SCTLR);
+	sctlr |= ARM_SMMU_SCTLR_CFIE;
+	arm_smmu_cb_write(smmu, cfg->cbndx, ARM_SMMU_CB_SCTLR, sctlr);
+
+	spin_unlock_irqrestore(&smmu_domain->cb_lock, flags);
 }
 
 #define QCOM_ADRENO_SMMU_GPU_SID 0
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index c7b5d7c093e71050d29a834c8d33125e96b04d81..9927f3431a2eab913750e6079edc6393d1938c98 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -470,13 +470,52 @@  static irqreturn_t arm_smmu_context_fault(int irq, void *dev)
 	if (!(cfi->fsr & ARM_SMMU_CB_FSR_FAULT))
 		return IRQ_NONE;
 
+	/*
+	 * On some implementations FSR.SS asserts a context fault
+	 * interrupt. We do not want this behavior, because resolving the
+	 * original context fault typically requires operations that cannot be
+	 * performed in IRQ context but leaving the stall unacknowledged will
+	 * immediately lead to another spurious interrupt as FSR.SS is still
+	 * set. Work around this by disabling interrupts for this context bank.
+	 * It's expected that interrupts are re-enabled after resuming the
+	 * transaction.
+	 *
+	 * We have to do this before report_iommu_fault() so that we don't
+	 * leave interrupts disabled in case the downstream user decides the
+	 * fault can be resolved inside its fault handler.
+	 *
+	 * There is a possible race if there are multiple context banks sharing
+	 * the same interrupt and this handler runs in between the resume
+	 * handler writing RESUME and SCTLR. We would clear CFIE here before
+	 * the resume handler sets it again, leaving interrupts enabled. Lock
+	 * the write to serialize it with the resume handler.
+	 */
+	if (cfi->fsr & ARM_SMMU_CB_FSR_SS) {
+		u32 val;
+
+		spin_lock(&smmu_domain->cb_lock);
+		val = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_SCTLR);
+		val &= ~ARM_SMMU_SCTLR_CFIE;
+		arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, val);
+		spin_unlock(&smmu_domain->cb_lock);
+	}
+
+	/*
+	 * The SMMUv2 architecture specification says that if stall-on-fault is
+	 * enabled the correct sequence is to write to SMMU_CBn_FSR to clear
+	 * the fault and then write to SMMU_CBn_RESUME. Clear the interrupt
+	 * first before running the user's fault handler to make sure we follow
+	 * this sequence. It should be ok if there is another fault in the
+	 * meantime because we have already read the fault info.
+	 */
+	arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, cfi->fsr);
+
 	ret = report_iommu_fault(&smmu_domain->domain, NULL, cfi->iova,
 		cfi->fsynr0 & ARM_SMMU_CB_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ);
 
 	if (ret == -ENOSYS && __ratelimit(&rs))
 		arm_smmu_print_context_fault_info(smmu, idx, cfi);
 
-	arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, cfi->fsr);
 	return IRQ_HANDLED;
 }
 
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h
index ff84ce3b8d8567d3402e945e8277ca2a87df9a4e..5fe8e482457f905529a08aea14ea5656d3e31328 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
@@ -214,7 +214,6 @@  enum arm_smmu_cbar_type {
 					 ARM_SMMU_CB_FSR_TLBLKF)
 
 #define ARM_SMMU_CB_FSR_FAULT		(ARM_SMMU_CB_FSR_MULTI |	\
-					 ARM_SMMU_CB_FSR_SS |		\
 					 ARM_SMMU_CB_FSR_UUT |		\
 					 ARM_SMMU_CB_FSR_EF |		\
 					 ARM_SMMU_CB_FSR_PF |		\