diff mbox series

[3/4,v2] ACPI: extlog: Trace CPER PCI Express Error Section

Message ID 20250429172109.3199192-4-fabio.m.de.francesco@linux.intel.com
State New
Headers show
Series [1/4,v2] ACPI: extlog: Trace CPER Non-standard Section Body | expand

Commit Message

Fabio M. De Francesco April 29, 2025, 5:21 p.m. UTC
I/O Machine Check Arcitecture events may signal failing PCIe components
or links. The AER event contains details on what was happening on the wire
when the error was signaled.

Trace the CPER PCIe Error section (UEFI v2.10, Appendix N.2.7) reported
by the I/O MCA.

Cc: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Fabio M. De Francesco <fabio.m.de.francesco@linux.intel.com>
---
 drivers/acpi/acpi_extlog.c | 30 ++++++++++++++++++++++++++++++
 drivers/pci/pcie/aer.c     |  2 +-
 include/linux/aer.h        | 13 +++++++++++--
 3 files changed, 42 insertions(+), 3 deletions(-)

Comments

Fabio M. De Francesco June 2, 2025, 4:59 p.m. UTC | #1
On Tuesday, April 29, 2025 8:02:42 PM Central European Summer Time Yazen Ghannam wrote:
> On Tue, Apr 29, 2025 at 07:21:08PM +0200, Fabio M. De Francesco wrote:
> > I/O Machine Check Arcitecture events may signal failing PCIe components
> > or links. The AER event contains details on what was happening on the wire
> > when the error was signaled.
> > 
> > Trace the CPER PCIe Error section (UEFI v2.10, Appendix N.2.7) reported
> > by the I/O MCA.
> > 
> > Cc: Dan Williams <dan.j.williams@intel.com>
> > Signed-off-by: Fabio M. De Francesco <fabio.m.de.francesco@linux.intel.com>
> > ---
> >  drivers/acpi/acpi_extlog.c | 30 ++++++++++++++++++++++++++++++
> >  drivers/pci/pcie/aer.c     |  2 +-
> >  include/linux/aer.h        | 13 +++++++++++--
> >  3 files changed, 42 insertions(+), 3 deletions(-)
> > 
> > diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c
> > index caca6ccd6e99..7d7a813169f1 100644
> > --- a/drivers/acpi/acpi_extlog.c
> > +++ b/drivers/acpi/acpi_extlog.c
> > @@ -131,6 +131,32 @@ static int print_extlog_rcd(const char *pfx,
> >  	return 1;
> >  }
> >  
> > +static void extlog_print_pcie(struct cper_sec_pcie *pcie_err,
> > +			      int severity)
> > +{
> > +	struct aer_capability_regs *aer;
> > +	struct pci_dev *pdev;
> > +	unsigned int devfn;
> > +	unsigned int bus;
> > +	int aer_severity;
> > +	int domain;
> > +
> > +	if (pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
> > +	    pcie_err->validation_bits & CPER_PCIE_VALID_AER_INFO) {
> 
> You can save an indentation level by inverting this check and returning
> early.
> 
Nice idea, I'll do it.
>
> > +		aer_severity = cper_severity_to_aer(severity);
> 
> I think it would help with clarity if all these lines were aligned on
> the "=".
> 
> > +		aer = (struct aer_capability_regs *)pcie_err->aer_info;
> > +		domain = pcie_err->device_id.segment;
> > +		bus = pcie_err->device_id.bus;
> 
> Many of these variables are passed unchanged to a single function below.
> 
> Why not pass them directly to the function?
> 
> Even if you split the function parameters across multiple lines, you
> will still have fewer lines. Plus you will not need to allocate the
> variables.
> 
I think that the cost is minimal and readability is improved.
>
> > +		devfn = PCI_DEVFN(pcie_err->device_id.device,
> > +				  pcie_err->device_id.function);
> > +		pdev = pci_get_domain_bus_and_slot(domain, bus, devfn);
> > +		if (!pdev)
> > +			return;
> 
> Newline here, please.
> 
Sure.
>
> > +		pci_print_aer(KERN_DEBUG, pdev, aer_severity, aer);
> 
> Why use a debug log level?
>
Dan Williams suggested the debug log level commenting v1.
>
> > +		pci_dev_put(pdev);
> > +	}
> > +}
> > +
> >  static int extlog_print(struct notifier_block *nb, unsigned long val,
> >  			void *data)
> >  {
> > @@ -182,6 +208,10 @@ static int extlog_print(struct notifier_block *nb, unsigned long val,
> >  			if (gdata->error_data_length >= sizeof(*mem))
> >  				trace_extlog_mem_event(mem, err_seq, fru_id, fru_text,
> >  						       (u8)gdata->error_severity);
> > +		} else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
> > +			struct cper_sec_pcie *pcie_err = acpi_hest_get_payload(gdata);
> > +
> > +			extlog_print_pcie(pcie_err, gdata->error_severity);
> >  		} else {
> >  			void *err = acpi_hest_get_payload(gdata);
> >  
> > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> > index d0ebf7c15afa..627fcf434698 100644
> > --- a/drivers/pci/pcie/aer.c
> > +++ b/drivers/pci/pcie/aer.c
> > @@ -801,7 +801,7 @@ void pci_print_aer(char *level, struct pci_dev *dev, int aer_severity,
> >  	trace_aer_event(dev_name(&dev->dev), (status & ~mask),
> >  			aer_severity, tlp_header_valid, &aer->header_log);
> >  }
> > -EXPORT_SYMBOL_NS_GPL(pci_print_aer, "CXL");
> > +EXPORT_SYMBOL_GPL(pci_print_aer);
> >  
> >  /**
> >   * add_error_device - list device to be handled
> > diff --git a/include/linux/aer.h b/include/linux/aer.h
> > index 45d0fb2e2e75..737db92e6570 100644
> > --- a/include/linux/aer.h
> > +++ b/include/linux/aer.h
> > @@ -56,17 +56,26 @@ struct aer_capability_regs {
> >  #if defined(CONFIG_PCIEAER)
> >  int pci_aer_clear_nonfatal_status(struct pci_dev *dev);
> >  int pcie_aer_is_native(struct pci_dev *dev);
> > +void pci_print_aer(char *level, struct pci_dev *dev, int aer_severity,
> > +		   struct aer_capability_regs *aer);
> >  #else
> >  static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
> >  {
> >  	return -EINVAL;
> >  }
> >  static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; }
> > +static inline void pci_print_aer(char *level, struct pci_dev *dev,
> > +				 int aer_severity,
> > +				 struct aer_capability_regs *aer)
> > +{ }
> 
> I think the "{ }" can just go at the end of the parameters.
> 
> >  #endif
> >  
> > -void pci_print_aer(char *level, struct pci_dev *dev, int aer_severity,
> > -		   struct aer_capability_regs *aer);
> > +#if defined(CONFIG_ACPI_APEI_PCIEAER)
> >  int cper_severity_to_aer(int cper_severity);
> > +#else
> > +static inline int cper_severity_to_aer(int cper_severity) { return 0; }
> 
> This may have an unintentional side effect.
> 
> '0' means AER_NONFATAL.
> 
> So the function will return that the error is an uncorrectable AER error
> that is potentially recoverable. At a minimum, this will incorrectly
> classify the error for data collection, and it could cause incorrect
> handling.
> 
> I guess the risk is minimal, since CONFIG_ACPI_APEI_PCIEAER will likely
> be enabled on systems that would use this.
>
Noted. Kconfig will select CONFIG_ACPI_APEI_PCIEAER.
>
> Thanks,
> Yazen
>

Thanks,

Fabio
diff mbox series

Patch

diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c
index caca6ccd6e99..7d7a813169f1 100644
--- a/drivers/acpi/acpi_extlog.c
+++ b/drivers/acpi/acpi_extlog.c
@@ -131,6 +131,32 @@  static int print_extlog_rcd(const char *pfx,
 	return 1;
 }
 
+static void extlog_print_pcie(struct cper_sec_pcie *pcie_err,
+			      int severity)
+{
+	struct aer_capability_regs *aer;
+	struct pci_dev *pdev;
+	unsigned int devfn;
+	unsigned int bus;
+	int aer_severity;
+	int domain;
+
+	if (pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
+	    pcie_err->validation_bits & CPER_PCIE_VALID_AER_INFO) {
+		aer_severity = cper_severity_to_aer(severity);
+		aer = (struct aer_capability_regs *)pcie_err->aer_info;
+		domain = pcie_err->device_id.segment;
+		bus = pcie_err->device_id.bus;
+		devfn = PCI_DEVFN(pcie_err->device_id.device,
+				  pcie_err->device_id.function);
+		pdev = pci_get_domain_bus_and_slot(domain, bus, devfn);
+		if (!pdev)
+			return;
+		pci_print_aer(KERN_DEBUG, pdev, aer_severity, aer);
+		pci_dev_put(pdev);
+	}
+}
+
 static int extlog_print(struct notifier_block *nb, unsigned long val,
 			void *data)
 {
@@ -182,6 +208,10 @@  static int extlog_print(struct notifier_block *nb, unsigned long val,
 			if (gdata->error_data_length >= sizeof(*mem))
 				trace_extlog_mem_event(mem, err_seq, fru_id, fru_text,
 						       (u8)gdata->error_severity);
+		} else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
+			struct cper_sec_pcie *pcie_err = acpi_hest_get_payload(gdata);
+
+			extlog_print_pcie(pcie_err, gdata->error_severity);
 		} else {
 			void *err = acpi_hest_get_payload(gdata);
 
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index d0ebf7c15afa..627fcf434698 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -801,7 +801,7 @@  void pci_print_aer(char *level, struct pci_dev *dev, int aer_severity,
 	trace_aer_event(dev_name(&dev->dev), (status & ~mask),
 			aer_severity, tlp_header_valid, &aer->header_log);
 }
-EXPORT_SYMBOL_NS_GPL(pci_print_aer, "CXL");
+EXPORT_SYMBOL_GPL(pci_print_aer);
 
 /**
  * add_error_device - list device to be handled
diff --git a/include/linux/aer.h b/include/linux/aer.h
index 45d0fb2e2e75..737db92e6570 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -56,17 +56,26 @@  struct aer_capability_regs {
 #if defined(CONFIG_PCIEAER)
 int pci_aer_clear_nonfatal_status(struct pci_dev *dev);
 int pcie_aer_is_native(struct pci_dev *dev);
+void pci_print_aer(char *level, struct pci_dev *dev, int aer_severity,
+		   struct aer_capability_regs *aer);
 #else
 static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
 {
 	return -EINVAL;
 }
 static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; }
+static inline void pci_print_aer(char *level, struct pci_dev *dev,
+				 int aer_severity,
+				 struct aer_capability_regs *aer)
+{ }
 #endif
 
-void pci_print_aer(char *level, struct pci_dev *dev, int aer_severity,
-		   struct aer_capability_regs *aer);
+#if defined(CONFIG_ACPI_APEI_PCIEAER)
 int cper_severity_to_aer(int cper_severity);
+#else
+static inline int cper_severity_to_aer(int cper_severity) { return 0; }
+#endif
+
 void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,
 		       int severity, struct aer_capability_regs *aer_regs);
 #endif //_AER_H_