Message ID | 20240125062802.50819-4-qingshun.wang@linux.intel.com |
---|---|
State | New |
Headers | show |
Series | PCI/AER: Handle Advisory Non-Fatal properly | expand |
Wang, Qingshun wrote: > Fetch and store the data of 3 more registers: "Link Status", "Device > Control 2", and "Advanced Error Capabilities and Control". This data is > needed for external observation to better understand ANFE. > > Signed-off-by: "Wang, Qingshun" <qingshun.wang@linux.intel.com> > --- > drivers/acpi/apei/ghes.c | 8 +++++++- > drivers/cxl/core/pci.c | 11 ++++++++++- > drivers/pci/pci.h | 4 ++++ > drivers/pci/pcie/aer.c | 26 ++++++++++++++++++++------ > include/linux/aer.h | 6 ++++-- > 5 files changed, 45 insertions(+), 10 deletions(-) > > diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c > index 6034039d5cff..047cc01be68c 100644 > --- a/drivers/acpi/apei/ghes.c > +++ b/drivers/acpi/apei/ghes.c > @@ -594,7 +594,9 @@ static void ghes_handle_aer(struct acpi_hest_generic_data *gdata) > if (pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID && > pcie_err->validation_bits & CPER_PCIE_VALID_AER_INFO) { > struct pcie_capability_regs *pcie_caps; > + u16 device_control_2 = 0; > u16 device_status = 0; > + u16 link_status = 0; > unsigned int devfn; > int aer_severity; > u8 *aer_info; > @@ -619,7 +621,9 @@ static void ghes_handle_aer(struct acpi_hest_generic_data *gdata) > > if (pcie_err->validation_bits & CPER_PCIE_VALID_CAPABILITY) { > pcie_caps = (struct pcie_capability_regs *)pcie_err->capability; > + device_control_2 = pcie_caps->device_control_2; > device_status = pcie_caps->device_status; > + link_status = pcie_caps->link_status; > } > > aer_recover_queue(pcie_err->device_id.segment, > @@ -627,7 +631,9 @@ static void ghes_handle_aer(struct acpi_hest_generic_data *gdata) > devfn, aer_severity, > (struct aer_capability_regs *) > aer_info, > - device_status); > + device_status, > + link_status, > + device_control_2); > } > #endif > } > diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c > index 9111a4415a63..3aa57fe8db42 100644 > --- a/drivers/cxl/core/pci.c > +++ b/drivers/cxl/core/pci.c > @@ -903,7 +903,9 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) > struct aer_capability_regs aer_regs; > struct cxl_dport *dport; > struct cxl_port *port; > + u16 device_control_2; > u16 device_status; > + u16 link_status; > int severity; > > port = cxl_pci_find_port(pdev, &dport); > @@ -918,10 +920,17 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) > if (!cxl_rch_get_aer_severity(&aer_regs, &severity)) > return; > > + if (pcie_capability_read_word(pdev, PCI_EXP_DEVCTL2, &device_control_2)) > + return; > + > if (pcie_capability_read_word(pdev, PCI_EXP_DEVSTA, &device_status)) > return; > > - pci_print_aer(pdev, severity, &aer_regs, device_status); > + if (pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &link_status)) > + return; > + > + pci_print_aer(pdev, severity, &aer_regs, device_status, > + link_status, device_control_2); Rather than complicate the calling convention of pci_print_aer(), update the internals of pci_print_aer() to get these extra registers, or provide a new wrapper interface that satisfies the dependencies and switch users over to that. Otherwise multiple touches of the same code path in one patch set is indicative of the need for a higher level helper.
On Fri, Feb 02, 2024 at 10:01:40AM -0800, Dan Williams wrote: > Wang, Qingshun wrote: > > Fetch and store the data of 3 more registers: "Link Status", "Device > > Control 2", and "Advanced Error Capabilities and Control". This data is > > needed for external observation to better understand ANFE. > > > > Signed-off-by: "Wang, Qingshun" <qingshun.wang@linux.intel.com> > > --- > > drivers/acpi/apei/ghes.c | 8 +++++++- > > drivers/cxl/core/pci.c | 11 ++++++++++- > > drivers/pci/pci.h | 4 ++++ > > drivers/pci/pcie/aer.c | 26 ++++++++++++++++++++------ > > include/linux/aer.h | 6 ++++-- > > 5 files changed, 45 insertions(+), 10 deletions(-) > > > > diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c > > index 6034039d5cff..047cc01be68c 100644 > > --- a/drivers/acpi/apei/ghes.c > > +++ b/drivers/acpi/apei/ghes.c > > @@ -594,7 +594,9 @@ static void ghes_handle_aer(struct acpi_hest_generic_data *gdata) > > if (pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID && > > pcie_err->validation_bits & CPER_PCIE_VALID_AER_INFO) { > > struct pcie_capability_regs *pcie_caps; > > + u16 device_control_2 = 0; > > u16 device_status = 0; > > + u16 link_status = 0; > > unsigned int devfn; > > int aer_severity; > > u8 *aer_info; > > @@ -619,7 +621,9 @@ static void ghes_handle_aer(struct acpi_hest_generic_data *gdata) > > > > if (pcie_err->validation_bits & CPER_PCIE_VALID_CAPABILITY) { > > pcie_caps = (struct pcie_capability_regs *)pcie_err->capability; > > + device_control_2 = pcie_caps->device_control_2; > > device_status = pcie_caps->device_status; > > + link_status = pcie_caps->link_status; > > } > > > > aer_recover_queue(pcie_err->device_id.segment, > > @@ -627,7 +631,9 @@ static void ghes_handle_aer(struct acpi_hest_generic_data *gdata) > > devfn, aer_severity, > > (struct aer_capability_regs *) > > aer_info, > > - device_status); > > + device_status, > > + link_status, > > + device_control_2); > > } > > #endif > > } > > diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c > > index 9111a4415a63..3aa57fe8db42 100644 > > --- a/drivers/cxl/core/pci.c > > +++ b/drivers/cxl/core/pci.c > > @@ -903,7 +903,9 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) > > struct aer_capability_regs aer_regs; > > struct cxl_dport *dport; > > struct cxl_port *port; > > + u16 device_control_2; > > u16 device_status; > > + u16 link_status; > > int severity; > > > > port = cxl_pci_find_port(pdev, &dport); > > @@ -918,10 +920,17 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) > > if (!cxl_rch_get_aer_severity(&aer_regs, &severity)) > > return; > > > > + if (pcie_capability_read_word(pdev, PCI_EXP_DEVCTL2, &device_control_2)) > > + return; > > + > > if (pcie_capability_read_word(pdev, PCI_EXP_DEVSTA, &device_status)) > > return; > > > > - pci_print_aer(pdev, severity, &aer_regs, device_status); > > + if (pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &link_status)) > > + return; > > + > > + pci_print_aer(pdev, severity, &aer_regs, device_status, > > + link_status, device_control_2); > > Rather than complicate the calling convention of pci_print_aer(), update > the internals of pci_print_aer() to get these extra registers, or > provide a new wrapper interface that satisfies the dependencies and > switch users over to that. Otherwise multiple touches of the same code > path in one patch set is indicative of the need for a higher level > helper. Thanks for the advice, it does make sense. Will reconsider the implementation. -- Best regards, Wang, Qingshun
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 6034039d5cff..047cc01be68c 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -594,7 +594,9 @@ static void ghes_handle_aer(struct acpi_hest_generic_data *gdata) if (pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID && pcie_err->validation_bits & CPER_PCIE_VALID_AER_INFO) { struct pcie_capability_regs *pcie_caps; + u16 device_control_2 = 0; u16 device_status = 0; + u16 link_status = 0; unsigned int devfn; int aer_severity; u8 *aer_info; @@ -619,7 +621,9 @@ static void ghes_handle_aer(struct acpi_hest_generic_data *gdata) if (pcie_err->validation_bits & CPER_PCIE_VALID_CAPABILITY) { pcie_caps = (struct pcie_capability_regs *)pcie_err->capability; + device_control_2 = pcie_caps->device_control_2; device_status = pcie_caps->device_status; + link_status = pcie_caps->link_status; } aer_recover_queue(pcie_err->device_id.segment, @@ -627,7 +631,9 @@ static void ghes_handle_aer(struct acpi_hest_generic_data *gdata) devfn, aer_severity, (struct aer_capability_regs *) aer_info, - device_status); + device_status, + link_status, + device_control_2); } #endif } diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 9111a4415a63..3aa57fe8db42 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -903,7 +903,9 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) struct aer_capability_regs aer_regs; struct cxl_dport *dport; struct cxl_port *port; + u16 device_control_2; u16 device_status; + u16 link_status; int severity; port = cxl_pci_find_port(pdev, &dport); @@ -918,10 +920,17 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) if (!cxl_rch_get_aer_severity(&aer_regs, &severity)) return; + if (pcie_capability_read_word(pdev, PCI_EXP_DEVCTL2, &device_control_2)) + return; + if (pcie_capability_read_word(pdev, PCI_EXP_DEVSTA, &device_status)) return; - pci_print_aer(pdev, severity, &aer_regs, device_status); + if (pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &link_status)) + return; + + pci_print_aer(pdev, severity, &aer_regs, device_status, + link_status, device_control_2); if (severity == AER_CORRECTABLE) cxl_handle_rdport_cor_ras(cxlds, dport); diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index f881a1b42f14..5788a94b4e95 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -412,7 +412,11 @@ struct aer_err_info { u32 uncor_mask; /* UNCOR Error Mask */ u32 uncor_status; /* UNCOR Error Status */ u32 uncor_severity; /* UNCOR Error Severity */ + + u16 link_status; + u32 aer_cap_ctrl; /* AER Capabilities and Control */ u16 device_status; + u16 device_control_2; struct aer_header_log_regs tlp; /* TLP Header */ }; diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 713cbf625d3f..eec3406f727a 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -825,7 +825,8 @@ EXPORT_SYMBOL_GPL(cper_severity_to_aer); #endif void pci_print_aer(struct pci_dev *dev, int aer_severity, - struct aer_capability_regs *aer, u16 device_status) + struct aer_capability_regs *aer, u16 device_status, + u16 link_status, u16 device_control_2) { int layer, agent, tlp_header_valid = 0; u32 status, mask; @@ -850,7 +851,10 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity, info.uncor_status = aer->uncor_status; info.uncor_severity = aer->uncor_severity; info.uncor_mask = aer->uncor_mask; + info.link_status = link_status; + info.aer_cap_ctrl = aer->cap_control; info.device_status = device_status; + info.device_control_2 = device_control_2; info.first_error = PCI_ERR_CAP_FEP(aer->cap_control); pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask); @@ -1205,7 +1209,9 @@ struct aer_recover_entry { u8 devfn; u16 domain; int severity; + u16 link_status; u16 device_status; + u16 device_control_2; struct aer_capability_regs *regs; }; @@ -1226,7 +1232,8 @@ static void aer_recover_work_func(struct work_struct *work) PCI_SLOT(entry.devfn), PCI_FUNC(entry.devfn)); continue; } - pci_print_aer(pdev, entry.severity, entry.regs, entry.device_status); + pci_print_aer(pdev, entry.severity, entry.regs, entry.device_status, + entry.link_status, entry.device_control_2); /* * Memory for aer_capability_regs(entry.regs) is being allocated from the * ghes_estatus_pool to protect it from overwriting when multiple sections @@ -1255,7 +1262,8 @@ static DEFINE_SPINLOCK(aer_recover_ring_lock); static DECLARE_WORK(aer_recover_work, aer_recover_work_func); void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn, - int severity, struct aer_capability_regs *aer_regs, u16 device_status) + int severity, struct aer_capability_regs *aer_regs, u16 device_status, + u16 link_status, u16 device_control_2) { struct aer_recover_entry entry = { .bus = bus, @@ -1263,7 +1271,9 @@ void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn, .domain = domain, .severity = severity, .regs = aer_regs, + .link_status = link_status, .device_status = device_status, + .device_control_2 = device_control_2, }; if (kfifo_in_spinlocked(&aer_recover_ring, &entry, 1, @@ -1289,7 +1299,6 @@ int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info) { int type = pci_pcie_type(dev); int aer = dev->aer_cap; - int temp; /* Must reset in this function */ info->cor_status = 0; @@ -1317,8 +1326,14 @@ int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info) &info->uncor_severity); pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, &info->uncor_mask); + pci_read_config_dword(dev, aer + PCI_ERR_CAP, + &info->aer_cap_ctrl); + pcie_capability_read_word(dev, PCI_EXP_LNKSTA, + &info->link_status); pcie_capability_read_word(dev, PCI_EXP_DEVSTA, &info->device_status); + pcie_capability_read_word(dev, PCI_EXP_DEVCTL2, + &info->device_control_2); } else { return 1; } @@ -1331,8 +1346,7 @@ int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info) return 0; /* Get First Error Pointer */ - pci_read_config_dword(dev, aer + PCI_ERR_CAP, &temp); - info->first_error = PCI_ERR_CAP_FEP(temp); + info->first_error = PCI_ERR_CAP_FEP(info->aer_cap_ctrl); if (info->uncor_status & AER_LOG_TLP_MASKS) { info->tlp_header_valid = 1; diff --git a/include/linux/aer.h b/include/linux/aer.h index 38ac802250ac..327ebec1e4b3 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -52,9 +52,11 @@ static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; } #endif void pci_print_aer(struct pci_dev *dev, int aer_severity, - struct aer_capability_regs *aer, u16 device_status); + struct aer_capability_regs *aer, u16 device_status, + u16 link_status, u16 device_control_2); int cper_severity_to_aer(int cper_severity); void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn, - int severity, struct aer_capability_regs *aer_regs, u16 device_status); + int severity, struct aer_capability_regs *aer_regs, u16 device_status, + u16 link_status, u16 device_control_2); #endif //_AER_H_
Fetch and store the data of 3 more registers: "Link Status", "Device Control 2", and "Advanced Error Capabilities and Control". This data is needed for external observation to better understand ANFE. Signed-off-by: "Wang, Qingshun" <qingshun.wang@linux.intel.com> --- drivers/acpi/apei/ghes.c | 8 +++++++- drivers/cxl/core/pci.c | 11 ++++++++++- drivers/pci/pci.h | 4 ++++ drivers/pci/pcie/aer.c | 26 ++++++++++++++++++++------ include/linux/aer.h | 6 ++++-- 5 files changed, 45 insertions(+), 10 deletions(-)