diff mbox series

iommu/arm-smmu: Pretty-print context fault related regs

Message ID 20240604150136.493962-1-robdclark@gmail.com
State New
Headers show
Series iommu/arm-smmu: Pretty-print context fault related regs | expand

Commit Message

Rob Clark June 4, 2024, 3:01 p.m. UTC
From: Rob Clark <robdclark@chromium.org>

Parse out the bitfields for easier-to-read fault messages.

Signed-off-by: Rob Clark <robdclark@chromium.org>
---
Stephen was wanting easier to read fault messages.. so I typed this up.

Resend with the new iommu list address

 drivers/iommu/arm/arm-smmu/arm-smmu.c | 53 +++++++++++++++++++++++++--
 drivers/iommu/arm/arm-smmu/arm-smmu.h |  5 +++
 2 files changed, 54 insertions(+), 4 deletions(-)

Comments

Rob Clark June 17, 2024, 4:18 p.m. UTC | #1
On Mon, Jun 17, 2024 at 6:07 AM Robin Murphy <robin.murphy@arm.com> wrote:
>
> On 04/06/2024 4:01 pm, Rob Clark wrote:
> > From: Rob Clark <robdclark@chromium.org>
> >
> > Parse out the bitfields for easier-to-read fault messages.
> >
> > Signed-off-by: Rob Clark <robdclark@chromium.org>
> > ---
> > Stephen was wanting easier to read fault messages.. so I typed this up.
> >
> > Resend with the new iommu list address
> >
> >   drivers/iommu/arm/arm-smmu/arm-smmu.c | 53 +++++++++++++++++++++++++--
> >   drivers/iommu/arm/arm-smmu/arm-smmu.h |  5 +++
> >   2 files changed, 54 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
> > index c572d877b0e1..06712d73519c 100644
> > --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
> > +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
> > @@ -411,6 +411,8 @@ static irqreturn_t arm_smmu_context_fault(int irq, void *dev)
> >       unsigned long iova;
> >       struct arm_smmu_domain *smmu_domain = dev;
> >       struct arm_smmu_device *smmu = smmu_domain->smmu;
> > +     static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
> > +                                   DEFAULT_RATELIMIT_BURST);
> >       int idx = smmu_domain->cfg.cbndx;
> >       int ret;
> >
> > @@ -425,10 +427,53 @@ static irqreturn_t arm_smmu_context_fault(int irq, void *dev)
> >       ret = report_iommu_fault(&smmu_domain->domain, NULL, iova,
> >               fsynr & ARM_SMMU_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ);
> >
> > -     if (ret == -ENOSYS)
> > -             dev_err_ratelimited(smmu->dev,
> > -             "Unhandled context fault: fsr=0x%x, iova=0x%08lx, fsynr=0x%x, cbfrsynra=0x%x, cb=%d\n",
> > -                         fsr, iova, fsynr, cbfrsynra, idx);
> > +     if (ret == -ENOSYS && __ratelimit(&rs)) {
> > +             static const struct {
> > +                     u32 mask; const char *name;
> > +             } fsr_bits[] = {
> > +                     { ARM_SMMU_FSR_MULTI,  "MULTI" },
> > +                     { ARM_SMMU_FSR_SS,     "SS"    },
> > +                     { ARM_SMMU_FSR_UUT,    "UUT"   },
> > +                     { ARM_SMMU_FSR_ASF,    "ASF"   },
> > +                     { ARM_SMMU_FSR_TLBLKF, "TLBLKF" },
> > +                     { ARM_SMMU_FSR_TLBMCF, "TLBMCF" },
> > +                     { ARM_SMMU_FSR_EF,     "EF"     },
> > +                     { ARM_SMMU_FSR_PF,     "PF"     },
> > +                     { ARM_SMMU_FSR_AFF,    "AFF"    },
> > +                     { ARM_SMMU_FSR_TF,     "TF"     },
> > +             }, fsynr0_bits[] = {
> > +                     { ARM_SMMU_FSYNR0_WNR,    "WNR"    },
> > +                     { ARM_SMMU_FSYNR0_PNU,    "PNU"    },
> > +                     { ARM_SMMU_FSYNR0_IND,    "IND"    },
> > +                     { ARM_SMMU_FSYNR0_NSATTR, "NSATTR" },
> > +                     { ARM_SMMU_FSYNR0_PTWF,   "PTWF"   },
> > +                     { ARM_SMMU_FSYNR0_AFR,    "AFR"    },
> > +             };
> > +
> > +             pr_err("%s %s: Unhandled context fault: fsr=0x%x (",
> > +                    dev_driver_string(smmu->dev), dev_name(smmu->dev), fsr);
> > +
> > +             for (int i = 0, n = 0; i < ARRAY_SIZE(fsr_bits); i++) {
> > +                     if (fsr & fsr_bits[i].mask) {
> > +                             pr_cont("%s%s", (n > 0) ? "|" : "", fsr_bits[i].name);
>
> Given that SMMU faults have a high likelihood of correlating with other
> errors, e.g. the initiating device also reporting that it got an abort
> back, this much pr_cont is a recipe for an unreadable mess. Furthermore,
> just imagine how "helpful" this would be when faults in two contexts are
> reported by two different CPUs at the same time ;)

It looks like arm_smmu_context_fault() is only used with non-threaded
irq's.  And this fallback is only used if driver doesn't register it's
own fault handler.  So I don't think this will be a problem.

> I'd prefer to retain the original message as-is, so there is at least
> still an unambiguous "atomic" view of a fault's entire state, then
> follow it with a decode more in the style of arm64's ESR logging. TBH I
> also wouldn't disapprove of hiding the additional decode behind a
> command-line/runtime parameter, since a fault storm can cripple a system
> enough as it is, without making the interrupt handler spend even longer
> printing to a potentially slow console.

It _is_ ratelimited.  But we could perhaps use a higher loglevel (pr_debug?)

BR,
-R

> > +                             n++;
> > +                     }
> > +             }
> > +
> > +             pr_cont("), iova=0x%08lx, fsynr=0x%x (S1CBNDX=%u", iova, fsynr,
> > +                     (fsynr >> 16) & 0xff);
>
> Please define all the bitfields properly (and I agree with Pranjal about
> the naming).
>
> Thanks,
> Robin.
>
> > +
> > +             for (int i = 0; i < ARRAY_SIZE(fsynr0_bits); i++) {
> > +                     if (fsynr & fsynr0_bits[i].mask) {
> > +                             pr_cont("|%s", fsynr0_bits[i].name);
> > +                     }
> > +             }
> > +
> > +             pr_cont("|PLVL=%u), cbfrsynra=0x%x, cb=%d\n",
> > +                     fsynr & 0x3,   /* FSYNR0.PLV */
> > +                     cbfrsynra, idx);
> > +
> > +     }
> >
> >       arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr);
> >       return IRQ_HANDLED;
> > diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h
> > index 836ed6799a80..3b051273718b 100644
> > --- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
> > +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
> > @@ -223,6 +223,11 @@ enum arm_smmu_cbar_type {
> >
> >   #define ARM_SMMU_CB_FSYNR0          0x68
> >   #define ARM_SMMU_FSYNR0_WNR         BIT(4)
> > +#define ARM_SMMU_FSYNR0_PNU          BIT(5)
> > +#define ARM_SMMU_FSYNR0_IND          BIT(6)
> > +#define ARM_SMMU_FSYNR0_NSATTR               BIT(8)
> > +#define ARM_SMMU_FSYNR0_PTWF         BIT(10)
> > +#define ARM_SMMU_FSYNR0_AFR          BIT(11)
> >
> >   #define ARM_SMMU_CB_FSYNR1          0x6c
> >
Robin Murphy June 17, 2024, 5:33 p.m. UTC | #2
On 2024-06-17 5:18 pm, Rob Clark wrote:
> On Mon, Jun 17, 2024 at 6:07 AM Robin Murphy <robin.murphy@arm.com> wrote:
>>
>> On 04/06/2024 4:01 pm, Rob Clark wrote:
>>> From: Rob Clark <robdclark@chromium.org>
>>>
>>> Parse out the bitfields for easier-to-read fault messages.
>>>
>>> Signed-off-by: Rob Clark <robdclark@chromium.org>
>>> ---
>>> Stephen was wanting easier to read fault messages.. so I typed this up.
>>>
>>> Resend with the new iommu list address
>>>
>>>    drivers/iommu/arm/arm-smmu/arm-smmu.c | 53 +++++++++++++++++++++++++--
>>>    drivers/iommu/arm/arm-smmu/arm-smmu.h |  5 +++
>>>    2 files changed, 54 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
>>> index c572d877b0e1..06712d73519c 100644
>>> --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
>>> +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
>>> @@ -411,6 +411,8 @@ static irqreturn_t arm_smmu_context_fault(int irq, void *dev)
>>>        unsigned long iova;
>>>        struct arm_smmu_domain *smmu_domain = dev;
>>>        struct arm_smmu_device *smmu = smmu_domain->smmu;
>>> +     static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
>>> +                                   DEFAULT_RATELIMIT_BURST);
>>>        int idx = smmu_domain->cfg.cbndx;
>>>        int ret;
>>>
>>> @@ -425,10 +427,53 @@ static irqreturn_t arm_smmu_context_fault(int irq, void *dev)
>>>        ret = report_iommu_fault(&smmu_domain->domain, NULL, iova,
>>>                fsynr & ARM_SMMU_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ);
>>>
>>> -     if (ret == -ENOSYS)
>>> -             dev_err_ratelimited(smmu->dev,
>>> -             "Unhandled context fault: fsr=0x%x, iova=0x%08lx, fsynr=0x%x, cbfrsynra=0x%x, cb=%d\n",
>>> -                         fsr, iova, fsynr, cbfrsynra, idx);
>>> +     if (ret == -ENOSYS && __ratelimit(&rs)) {
>>> +             static const struct {
>>> +                     u32 mask; const char *name;
>>> +             } fsr_bits[] = {
>>> +                     { ARM_SMMU_FSR_MULTI,  "MULTI" },
>>> +                     { ARM_SMMU_FSR_SS,     "SS"    },
>>> +                     { ARM_SMMU_FSR_UUT,    "UUT"   },
>>> +                     { ARM_SMMU_FSR_ASF,    "ASF"   },
>>> +                     { ARM_SMMU_FSR_TLBLKF, "TLBLKF" },
>>> +                     { ARM_SMMU_FSR_TLBMCF, "TLBMCF" },
>>> +                     { ARM_SMMU_FSR_EF,     "EF"     },
>>> +                     { ARM_SMMU_FSR_PF,     "PF"     },
>>> +                     { ARM_SMMU_FSR_AFF,    "AFF"    },
>>> +                     { ARM_SMMU_FSR_TF,     "TF"     },
>>> +             }, fsynr0_bits[] = {
>>> +                     { ARM_SMMU_FSYNR0_WNR,    "WNR"    },
>>> +                     { ARM_SMMU_FSYNR0_PNU,    "PNU"    },
>>> +                     { ARM_SMMU_FSYNR0_IND,    "IND"    },
>>> +                     { ARM_SMMU_FSYNR0_NSATTR, "NSATTR" },
>>> +                     { ARM_SMMU_FSYNR0_PTWF,   "PTWF"   },
>>> +                     { ARM_SMMU_FSYNR0_AFR,    "AFR"    },
>>> +             };
>>> +
>>> +             pr_err("%s %s: Unhandled context fault: fsr=0x%x (",
>>> +                    dev_driver_string(smmu->dev), dev_name(smmu->dev), fsr);
>>> +
>>> +             for (int i = 0, n = 0; i < ARRAY_SIZE(fsr_bits); i++) {
>>> +                     if (fsr & fsr_bits[i].mask) {
>>> +                             pr_cont("%s%s", (n > 0) ? "|" : "", fsr_bits[i].name);
>>
>> Given that SMMU faults have a high likelihood of correlating with other
>> errors, e.g. the initiating device also reporting that it got an abort
>> back, this much pr_cont is a recipe for an unreadable mess. Furthermore,
>> just imagine how "helpful" this would be when faults in two contexts are
>> reported by two different CPUs at the same time ;)
> 
> It looks like arm_smmu_context_fault() is only used with non-threaded
> irq's.  And this fallback is only used if driver doesn't register it's
> own fault handler.  So I don't think this will be a problem.

You don't think two different IRQs can fire on two different CPUs at the 
same time (or close to)? It's already bad enough when multiple CPUs 
panic and one has to pick apart line-by-line interleaving of the 
registers/stacktraces - imagine how much more utterly unusable that 
would be with bits of different dumps randomly pr_cont'ed together onto 
the same line(s)...

Even when unrelated stuff gets interleaved because other CPUs just 
happen to be calling printk() at the same time for unrelated reasons 
it's still annoying, and pr_cont makes a bigger mess than not.
>> I'd prefer to retain the original message as-is, so there is at least
>> still an unambiguous "atomic" view of a fault's entire state, then
>> follow it with a decode more in the style of arm64's ESR logging. TBH I
>> also wouldn't disapprove of hiding the additional decode behind a
>> command-line/runtime parameter, since a fault storm can cripple a system
>> enough as it is, without making the interrupt handler spend even longer
>> printing to a potentially slow console.
> 
> It _is_ ratelimited.  But we could perhaps use a higher loglevel (pr_debug?)

Yeah, I'd have no complaint with pr_debug/dev_dbg either, if that suits 
your use case. True that the ratelimit may typically mitigate the 
overall impact, but still in the worst case with a sufficiently slow 
console and/or a sufficiently large amount to print per __ratelimit() 
call, it can end up being slow enough to stay below the threshold. Don't 
ask me how I know that :)

Thanks,
Robin.
diff mbox series

Patch

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index c572d877b0e1..06712d73519c 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -411,6 +411,8 @@  static irqreturn_t arm_smmu_context_fault(int irq, void *dev)
 	unsigned long iova;
 	struct arm_smmu_domain *smmu_domain = dev;
 	struct arm_smmu_device *smmu = smmu_domain->smmu;
+	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
+				      DEFAULT_RATELIMIT_BURST);
 	int idx = smmu_domain->cfg.cbndx;
 	int ret;
 
@@ -425,10 +427,53 @@  static irqreturn_t arm_smmu_context_fault(int irq, void *dev)
 	ret = report_iommu_fault(&smmu_domain->domain, NULL, iova,
 		fsynr & ARM_SMMU_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ);
 
-	if (ret == -ENOSYS)
-		dev_err_ratelimited(smmu->dev,
-		"Unhandled context fault: fsr=0x%x, iova=0x%08lx, fsynr=0x%x, cbfrsynra=0x%x, cb=%d\n",
-			    fsr, iova, fsynr, cbfrsynra, idx);
+	if (ret == -ENOSYS && __ratelimit(&rs)) {
+		static const struct {
+			u32 mask; const char *name;
+		} fsr_bits[] = {
+			{ ARM_SMMU_FSR_MULTI,  "MULTI" },
+			{ ARM_SMMU_FSR_SS,     "SS"    },
+			{ ARM_SMMU_FSR_UUT,    "UUT"   },
+			{ ARM_SMMU_FSR_ASF,    "ASF"   },
+			{ ARM_SMMU_FSR_TLBLKF, "TLBLKF" },
+			{ ARM_SMMU_FSR_TLBMCF, "TLBMCF" },
+			{ ARM_SMMU_FSR_EF,     "EF"     },
+			{ ARM_SMMU_FSR_PF,     "PF"     },
+			{ ARM_SMMU_FSR_AFF,    "AFF"    },
+			{ ARM_SMMU_FSR_TF,     "TF"     },
+		}, fsynr0_bits[] = {
+			{ ARM_SMMU_FSYNR0_WNR,    "WNR"    },
+			{ ARM_SMMU_FSYNR0_PNU,    "PNU"    },
+			{ ARM_SMMU_FSYNR0_IND,    "IND"    },
+			{ ARM_SMMU_FSYNR0_NSATTR, "NSATTR" },
+			{ ARM_SMMU_FSYNR0_PTWF,   "PTWF"   },
+			{ ARM_SMMU_FSYNR0_AFR,    "AFR"    },
+		};
+
+		pr_err("%s %s: Unhandled context fault: fsr=0x%x (",
+		       dev_driver_string(smmu->dev), dev_name(smmu->dev), fsr);
+
+		for (int i = 0, n = 0; i < ARRAY_SIZE(fsr_bits); i++) {
+			if (fsr & fsr_bits[i].mask) {
+				pr_cont("%s%s", (n > 0) ? "|" : "", fsr_bits[i].name);
+				n++;
+			}
+		}
+
+		pr_cont("), iova=0x%08lx, fsynr=0x%x (S1CBNDX=%u", iova, fsynr,
+			(fsynr >> 16) & 0xff);
+
+		for (int i = 0; i < ARRAY_SIZE(fsynr0_bits); i++) {
+			if (fsynr & fsynr0_bits[i].mask) {
+				pr_cont("|%s", fsynr0_bits[i].name);
+			}
+		}
+
+		pr_cont("|PLVL=%u), cbfrsynra=0x%x, cb=%d\n",
+			fsynr & 0x3,   /* FSYNR0.PLV */
+			cbfrsynra, idx);
+
+	}
 
 	arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr);
 	return IRQ_HANDLED;
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h
index 836ed6799a80..3b051273718b 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
@@ -223,6 +223,11 @@  enum arm_smmu_cbar_type {
 
 #define ARM_SMMU_CB_FSYNR0		0x68
 #define ARM_SMMU_FSYNR0_WNR		BIT(4)
+#define ARM_SMMU_FSYNR0_PNU		BIT(5)
+#define ARM_SMMU_FSYNR0_IND		BIT(6)
+#define ARM_SMMU_FSYNR0_NSATTR		BIT(8)
+#define ARM_SMMU_FSYNR0_PTWF		BIT(10)
+#define ARM_SMMU_FSYNR0_AFR		BIT(11)
 
 #define ARM_SMMU_CB_FSYNR1		0x6c