[v2,14/24] EDAC, ghes: Rework memory hierarchy detection

Message ID 20190624150758.6695-15-rrichter@marvell.com
State New
Headers show
Series
  • EDAC, mc, ghes: Fixes and updates to improve memory error reporting
Related show

Commit Message

Robert Richter June 24, 2019, 3:09 p.m.
In a later patch we want to add more information about the memory
hierarchy (NUMA topology, DIMM label information). Rework memory
hierarchy detection to make the code extendable for this.

The general approach is roughly like:

	mem_info_setup();
	for_each_node(nid) {
		mci = edac_mc_alloc(nid);
		mem_info_prepare_mci(mci);
		edac_mc_add_mc(mci);
	};

This patch introduces mem_info_setup() and mem_info_prepare_mci().

All data of the memory hierarchy is collected in a local struct
ghes_mem_info.

Note: Per (NUMA) node registration will be implemented in a later
patch.

Signed-off-by: Robert Richter <rrichter@marvell.com>

---
 drivers/edac/ghes_edac.c | 166 ++++++++++++++++++++++++++++++---------
 1 file changed, 127 insertions(+), 39 deletions(-)

-- 
2.20.1

Comments

Borislav Petkov Aug. 20, 2019, 8:56 a.m. | #1
On Mon, Jun 24, 2019 at 03:09:24PM +0000, Robert Richter wrote:
> In a later patch we want to add more information about the memory

> hierarchy (NUMA topology, DIMM label information). Rework memory

> hierarchy detection to make the code extendable for this.

> 

> The general approach is roughly like:

> 

> 	mem_info_setup();

> 	for_each_node(nid) {

> 		mci = edac_mc_alloc(nid);

> 		mem_info_prepare_mci(mci);

> 		edac_mc_add_mc(mci);

> 	};

> 

> This patch introduces mem_info_setup() and mem_info_prepare_mci().


Avoid having "This patch" or "This commit" in the commit message. It is
tautologically useless.

> All data of the memory hierarchy is collected in a local struct

> ghes_mem_info.

> 

> Note: Per (NUMA) node registration will be implemented in a later

> patch.


That sentence is not needed in the commit message.

> Signed-off-by: Robert Richter <rrichter@marvell.com>

> ---

>  drivers/edac/ghes_edac.c | 166 ++++++++++++++++++++++++++++++---------

>  1 file changed, 127 insertions(+), 39 deletions(-)

> 

> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c

> index 8063996a311d..44bfb499b147 100644

> --- a/drivers/edac/ghes_edac.c

> +++ b/drivers/edac/ghes_edac.c

> @@ -65,17 +65,53 @@ struct memdev_dmi_entry {

>  	u16 conf_mem_clk_speed;

>  } __attribute__((__packed__));

>  

> -struct ghes_edac_dimm_fill {

> -	struct mem_ctl_info *mci;

> -	unsigned count;


All those "dimm" and "info" words everywhere are making my head spin.
Let's make it more readable:

> +struct ghes_dimm_info {

> +	struct dimm_info dimm_info;

> +	int		idx;

> +};

> +

> +struct ghes_mem_info {

> +	int num_dimm;

> +	struct ghes_dimm_info *dimms;

>  };

>  

> +static struct ghes_mem_info mem_info;


/* A DIMM */
struct ghes_dimm {
        struct dimm_info dimm;
        int idx;
};

/* The memory layout of the system */
struct ghes_memory {
        struct ghes_dimm *dimms;
        int num_dimms;
};

static struct ghes_memory mem;

> +

> +#define for_each_dimm(dimm)				\

> +	for (dimm = mem_info.dimms;			\

> +	     dimm < mem_info.dimms + mem_info.num_dimm;	\

> +	     dimm++)

> +

>  static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)

>  {

> -	int *num_dimm = arg;

> +	int *num = arg;

>  

>  	if (dh->type == DMI_ENTRY_MEM_DEVICE)

> -		(*num_dimm)++;

> +		(*num)++;

> +}

> +

> +static int ghes_dimm_info_init(int num)


ghes_dimm_init()

... you get the idea - let's drop the _info crap.

> +{

> +	struct ghes_dimm_info *dimm;

> +	int idx = 0;

> +

> +	memset(&mem_info, 0, sizeof(mem_info));

> +

> +	if (num <= 0)

> +		return -EINVAL;


Move that check into the caller mem_info_setup() so that you don't do
the memset unnecessarily.

> +

> +	mem_info.dimms = kcalloc(num, sizeof(*mem_info.dimms), GFP_KERNEL);

> +	if (!mem_info.dimms)

> +		return -ENOMEM;

> +

> +	mem_info.num_dimm = num;

> +

> +	for_each_dimm(dimm) {

> +		dimm->idx	= idx;

> +		idx++;

> +	}


or simply

	for_each_dimm(dimm)
		dimm->idx = idx++;

> +

> +	return 0;

>  }

>  

>  static int get_dimm_smbios_index(u16 handle)


-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

Patch

diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 8063996a311d..44bfb499b147 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -65,17 +65,53 @@  struct memdev_dmi_entry {
 	u16 conf_mem_clk_speed;
 } __attribute__((__packed__));
 
-struct ghes_edac_dimm_fill {
-	struct mem_ctl_info *mci;
-	unsigned count;
+struct ghes_dimm_info {
+	struct dimm_info dimm_info;
+	int		idx;
+};
+
+struct ghes_mem_info {
+	int num_dimm;
+	struct ghes_dimm_info *dimms;
 };
 
+static struct ghes_mem_info mem_info;
+
+#define for_each_dimm(dimm)				\
+	for (dimm = mem_info.dimms;			\
+	     dimm < mem_info.dimms + mem_info.num_dimm;	\
+	     dimm++)
+
 static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
 {
-	int *num_dimm = arg;
+	int *num = arg;
 
 	if (dh->type == DMI_ENTRY_MEM_DEVICE)
-		(*num_dimm)++;
+		(*num)++;
+}
+
+static int ghes_dimm_info_init(int num)
+{
+	struct ghes_dimm_info *dimm;
+	int idx = 0;
+
+	memset(&mem_info, 0, sizeof(mem_info));
+
+	if (num <= 0)
+		return -EINVAL;
+
+	mem_info.dimms = kcalloc(num, sizeof(*mem_info.dimms), GFP_KERNEL);
+	if (!mem_info.dimms)
+		return -ENOMEM;
+
+	mem_info.num_dimm = num;
+
+	for_each_dimm(dimm) {
+		dimm->idx	= idx;
+		idx++;
+	}
+
+	return 0;
 }
 
 static int get_dimm_smbios_index(u16 handle)
@@ -92,18 +128,15 @@  static int get_dimm_smbios_index(u16 handle)
 
 static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
 {
-	struct ghes_edac_dimm_fill *dimm_fill = arg;
-	struct mem_ctl_info *mci = dimm_fill->mci;
-
 	if (dh->type == DMI_ENTRY_MEM_DEVICE) {
+		int *idx = arg;
 		struct memdev_dmi_entry *entry = (struct memdev_dmi_entry *)dh;
-		struct dimm_info *dimm = edac_get_dimm(mci, dimm_fill->count,
-						       0, 0);
+		struct ghes_dimm_info *mi = &mem_info.dimms[*idx];
+		struct dimm_info *dimm = &mi->dimm_info;
 		u16 rdr_mask = BIT(7) | BIT(13);
 
 		if (entry->size == 0xffff) {
-			pr_info("Can't get DIMM%i size\n",
-				dimm_fill->count);
+			pr_info("Can't get DIMM%i size\n", mi->idx);
 			dimm->nr_pages = MiB_TO_PAGES(32);/* Unknown */
 		} else if (entry->size == 0x7fff) {
 			dimm->nr_pages = MiB_TO_PAGES(entry->extended_size);
@@ -177,7 +210,7 @@  static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
 
 		if (dimm->nr_pages) {
 			edac_dbg(1, "DIMM%i: %s size = %d MB%s\n",
-				dimm_fill->count, edac_mem_types[dimm->mtype],
+				mi->idx, edac_mem_types[dimm->mtype],
 				PAGES_TO_MiB(dimm->nr_pages),
 				(dimm->edac_mode != EDAC_NONE) ? "(ECC)" : "");
 			edac_dbg(2, "\ttype %d, detail 0x%02x, width %d(total %d)\n",
@@ -187,10 +220,74 @@  static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
 
 		dimm->smbios_handle = entry->handle;
 
-		dimm_fill->count++;
+		(*idx)++;
 	}
 }
 
+static int mem_info_setup(void)
+{
+	int num = 0;
+	int idx = 0;
+	int ret;
+
+	/* Get the number of DIMMs */
+	dmi_walk(ghes_edac_count_dimms, &num);
+
+	ret = ghes_dimm_info_init(num);
+	if (ret)
+		return ret;
+
+	dmi_walk(ghes_edac_dmidecode, &idx);
+
+	return 0;
+}
+
+static int mem_info_setup_fake(void)
+{
+	struct dimm_info *dimm;
+	int ret;
+
+	ret = ghes_dimm_info_init(1);
+	if (ret)
+		return ret;
+
+	dimm = &mem_info.dimms->dimm_info;
+	dimm->nr_pages = 1;
+	dimm->grain = 128;
+	dimm->mtype = MEM_UNKNOWN;
+	dimm->dtype = DEV_UNKNOWN;
+	dimm->edac_mode = EDAC_SECDED;
+
+	return 0;
+}
+
+static void mem_info_prepare_mci(struct mem_ctl_info *mci)
+{
+	struct dimm_info *mci_dimm, *dmi_dimm;
+	struct ghes_dimm_info *dimm;
+	int index = 0;
+
+	for_each_dimm(dimm) {
+		dmi_dimm = &dimm->dimm_info;
+		mci_dimm = edac_get_dimm_by_index(mci, index);
+
+		index++;
+		if (index > mci->tot_dimms)
+			break;
+
+		mci_dimm->nr_pages	= dmi_dimm->nr_pages;
+		mci_dimm->mtype		= dmi_dimm->mtype;
+		mci_dimm->edac_mode	= dmi_dimm->edac_mode;
+		mci_dimm->dtype		= dmi_dimm->dtype;
+		mci_dimm->grain		= dmi_dimm->grain;
+		mci_dimm->smbios_handle = dmi_dimm->smbios_handle;
+	}
+
+	if (index != mci->tot_dimms)
+		pr_warn("Unexpected number of DIMMs: %d (exp. %d)\n",
+			index, mci->tot_dimms);
+}
+
 void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
 {
 	struct dimm_info *dimm_info;
@@ -450,10 +547,9 @@  static struct acpi_platform_list plat_list[] = {
 int ghes_edac_register(struct ghes *ghes, struct device *dev)
 {
 	bool fake = false;
-	int rc, num_dimm = 0;
+	int rc;
 	struct mem_ctl_info *mci;
 	struct edac_mc_layer layers[1];
-	struct ghes_edac_dimm_fill dimm_fill;
 	int idx = -1;
 
 	if (IS_ENABLED(CONFIG_X86)) {
@@ -471,22 +567,24 @@  int ghes_edac_register(struct ghes *ghes, struct device *dev)
 	if (atomic_inc_return(&ghes_init) > 1)
 		return 0;
 
-	/* Get the number of DIMMs */
-	dmi_walk(ghes_edac_count_dimms, &num_dimm);
-
-	/* Check if we've got a bogus BIOS */
-	if (num_dimm == 0) {
+	rc = mem_info_setup();
+	if (rc == -EINVAL) {
+		/* we've got a bogus BIOS */
 		fake = true;
-		num_dimm = 1;
+		rc = mem_info_setup_fake();
+	}
+	if (rc < 0) {
+		pr_err("Can't allocate memory for DIMM data\n");
+		return rc;
 	}
 
 	layers[0].type = EDAC_MC_LAYER_ALL_MEM;
-	layers[0].size = num_dimm;
+	layers[0].size = mem_info.num_dimm;
 	layers[0].is_virt_csrow = true;
 
 	mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(struct ghes_edac_pvt));
 	if (!mci) {
-		pr_info("Can't allocate memory for EDAC data\n");
+		pr_err("Can't allocate memory for EDAC data\n");
 		return -ENOMEM;
 	}
 
@@ -512,26 +610,14 @@  int ghes_edac_register(struct ghes *ghes, struct device *dev)
 		pr_info("So, the end result of using this driver varies from vendor to vendor.\n");
 		pr_info("If you find incorrect reports, please contact your hardware vendor\n");
 		pr_info("to correct its BIOS.\n");
-		pr_info("This system has %d DIMM sockets.\n", num_dimm);
+		pr_info("This system has %d DIMM sockets.\n", mem_info.num_dimm);
 	}
 
-	if (!fake) {
-		dimm_fill.count = 0;
-		dimm_fill.mci = mci;
-		dmi_walk(ghes_edac_dmidecode, &dimm_fill);
-	} else {
-		struct dimm_info *dimm = edac_get_dimm(mci, 0, 0, 0);
-
-		dimm->nr_pages = 1;
-		dimm->grain = 128;
-		dimm->mtype = MEM_UNKNOWN;
-		dimm->dtype = DEV_UNKNOWN;
-		dimm->edac_mode = EDAC_SECDED;
-	}
+	mem_info_prepare_mci(mci);
 
 	rc = edac_mc_add_mc(mci);
 	if (rc < 0) {
-		pr_info("Can't register at EDAC core\n");
+		pr_err("Can't register at EDAC core\n");
 		edac_mc_free(mci);
 		return -ENODEV;
 	}
@@ -548,4 +634,6 @@  void ghes_edac_unregister(struct ghes *ghes)
 	mci = ghes_pvt->mci;
 	edac_mc_del_mc(mci->pdev);
 	edac_mc_free(mci);
+
+	kfree(mem_info.dimms);
 }