@@ -83,8 +83,11 @@ struct memarr_dmi_entry {
struct ghes_dimm_info {
struct dimm_info dimm_info;
+ struct dimm_info *dimm;
int idx;
int numa_node;
+ int card;
+ int module;
phys_addr_t start;
phys_addr_t end;
u16 phys_handle;
@@ -119,6 +122,8 @@ static void ghes_dimm_info_init(void)
for_each_dimm(dimm) {
dimm->idx = idx;
dimm->numa_node = NUMA_NO_NODE;
+ dimm->card = -1;
+ dimm->module = -1;
idx++;
}
}
@@ -401,6 +406,13 @@ static void mci_add_dimm_info(struct mem_ctl_info *mci)
if (*dmi_dimm->label)
strcpy(mci_dimm->label, dmi_dimm->label);
+
+ /*
+ * From here on do not use any longer &dimm.dimm_info.
+ * Instead switch to the mci's dimm info which might
+ * contain updated data, such as the label.
+ */
+ dimm->dimm = mci_dimm;
}
if (index != mci->tot_dimms)
@@ -408,24 +420,46 @@ static void mci_add_dimm_info(struct mem_ctl_info *mci)
index, mci->tot_dimms);
}
-static struct mem_ctl_info *get_mc_by_node(int nid)
+/*
+ * Find the DIMM whose SMBIOS handle equals @handle and remember the
+ * card/module location reported for it.
+ *
+ * @nid, @card and @module are the location values taken from the error
+ * record; a negative @card or @module means "not provided". The first
+ * valid card/module seen for a DIMM is stored in its ghes_dimm_info.
+ * A warning is logged if a later record disagrees with the stored
+ * values, or (on multi-node systems) if @nid differs from the mc index
+ * of the DIMM's memory controller.
+ *
+ * Returns the matching ghes_dimm_info, or NULL if no DIMM has @handle.
+ *
+ * Requires ghes_lock being set.
+ */
+static struct ghes_dimm_info *
+get_and_prepare_dimm_info(int nid, int card, int module, int handle)
{
- struct mem_ctl_info *mci = edac_mc_find(nid);
+ struct ghes_dimm_info *dimm;
+ struct dimm_info *di;
- if (mci)
- return mci;
+ /*
+ * We require smbios_handle being set in the error report for
+ * per layer reporting (SMBIOS handle for the Type 17 Memory
+ * Device Structure that represents the Memory Module)
+ */
+ for_each_dimm(dimm) {
+ di = dimm->dimm;
+ /*
+ * NOTE(review): ->dimm is only assigned in mci_add_dimm_info();
+ * skip entries where it is still unset - confirm such entries
+ * are NULL-initialized.
+ */
+ if (di && di->smbios_handle == handle)
+ goto found;
+ }
- if (num_possible_nodes() > 1) {
- edac_mc_printk(fallback, KERN_WARNING,
- "Invalid or no node information, falling back to first node: %s",
- fallback->dev_name);
+ return NULL;
+found:
+ /* Latch the first valid card/module reported for this DIMM. */
+ if (dimm->card < 0 && card >= 0)
+ dimm->card = card;
+ if (dimm->module < 0 && module >= 0)
+ dimm->module = module;
+
+ if ((num_possible_nodes() > 1 && di->mci->mc_idx != nid) ||
+ (card >= 0 && card != dimm->card) ||
+ (module >= 0 && module != dimm->module)) {
+ edac_mc_printk(di->mci, KERN_WARNING,
+ "Inconsistent error report (nid/card/module): %d/%d/%d (dimm%d: %d/%d/%d)\n",
+ nid, card, module, di->idx,
+ di->mci->mc_idx, dimm->card, dimm->module);
}
- return fallback;
+ return dimm;
}
void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
{
+ struct ghes_dimm_info *dimm;
struct dimm_info *dimm_info;
enum hw_event_mc_err_type type;
struct edac_raw_error_desc *e;
@@ -434,6 +468,9 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
unsigned long flags;
char *p;
int nid = NUMA_NO_NODE;
+ int card = -1;
+ int module = -1;
+ int handle = -1;
/* We need at least one mc */
if (WARN_ON_ONCE(!fallback))
@@ -449,10 +486,23 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
spin_lock_irqsave(&ghes_lock, flags);
- /* select the node's mc device */
if (mem_err->validation_bits & CPER_MEM_VALID_NODE)
nid = mem_err->node;
- mci = get_mc_by_node(nid);
+ if (mem_err->validation_bits & CPER_MEM_VALID_CARD)
+ card = mem_err->card;
+ if (mem_err->validation_bits & CPER_MEM_VALID_MODULE)
+ module = mem_err->module;
+ if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE)
+ handle = mem_err->mem_dev_handle;
+
+ dimm = get_and_prepare_dimm_info(nid, card, module, handle);
+ if (dimm)
+ mci = dimm->dimm->mci;
+ else
+ mci = edac_mc_find(nid);
+ if (!mci)
+ mci = fallback;
+
pvt = mci->pvt_info;
e = &mci->error_desc;
@@ -670,7 +720,7 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
if (p > pvt->other_detail)
*(p - 1) = '\0';
- dimm_info = edac_get_dimm_by_index(mci, e->top_layer);
+ dimm_info = dimm ? dimm->dimm : NULL;
edac_raw_mc_handle_error(type, mci, dimm_info, e, -1, -1);
According to UEFI Spec. 2.7 (N.2.5 Memory Error Section), a failing DIMM (module or rank number) can be identified by its error location, which consists of node, card and module. A module handle is used to map the error to the DIMMs listed in the DMI table. Collect all of this data from the error record and select the DIMM accordingly. Inconsistent error records are reported, i.e. when errors for the same DIMM handle carry a different node, card or module. This change makes it possible to enable per-layer reporting based on node, card and module in the next patch. Signed-off-by: Robert Richter <rrichter@marvell.com> --- drivers/edac/ghes_edac.c | 74 +++++++++++++++++++++++++++++++++------- 1 file changed, 62 insertions(+), 12 deletions(-) -- 2.20.1