[v2,10/24] EDAC, mc: Remove per layer counters

Message ID 20190624150758.6695-11-rrichter@marvell.com
State Superseded
Headers show
Series
  • EDAC, mc, ghes: Fixes and updates to improve memory error reporting
Related show

Commit Message

Robert Richter June 24, 2019, 3:09 p.m.
Looking at how mci->{ue,ce}_per_layer[EDAC_MAX_LAYERS] is used, it
turns out that only the leaves in the memory hierarchy are consumed
(in sysfs), but not the intermediate layers, e.g.:

 count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][dimm->idx];

So let's get rid of the unused counters that just add complexity.

Error counter values are directly stored in struct dimm_info now.

Signed-off-by: Robert Richter <rrichter@marvell.com>

---
 drivers/edac/edac_mc.c       | 98 ++++++++++++------------------------
 drivers/edac/edac_mc_sysfs.c | 20 +++-----
 drivers/edac/ghes_edac.c     |  5 +-
 include/linux/edac.h         |  7 ++-
 4 files changed, 44 insertions(+), 86 deletions(-)

-- 
2.20.1

Comments

Borislav Petkov Aug. 16, 2019, 9:24 a.m. | #1
On Mon, Jun 24, 2019 at 03:09:15PM +0000, Robert Richter wrote:
> Looking at how mci->{ue,ce}_per_layer[EDAC_MAX_LAYERS] is used, it

> turns out that only the leaves in the memory hierarchy are consumed

> (in sysfs), but not the intermediate layers, e.g.:

> 

>  count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][dimm->idx];

> 

> So let's get rid of the unused counters that just add complexity.

> 

> Error counter values are directly stored in struct dimm_info now.

> 

> Signed-off-by: Robert Richter <rrichter@marvell.com>

> ---

>  drivers/edac/edac_mc.c       | 98 ++++++++++++------------------------

>  drivers/edac/edac_mc_sysfs.c | 20 +++-----

>  drivers/edac/ghes_edac.c     |  5 +-

>  include/linux/edac.h         |  7 ++-

>  4 files changed, 44 insertions(+), 86 deletions(-)

> 

> diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c

> index f2acdab34eb7..bce39b2e10c9 100644

> --- a/drivers/edac/edac_mc.c

> +++ b/drivers/edac/edac_mc.c

> @@ -313,10 +313,9 @@ struct mem_ctl_info *edac_mc_alloc(unsigned mc_num,

>  	struct csrow_info *csr;

>  	struct rank_info *chan;

>  	struct dimm_info *dimm;

> -	u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];

>  	unsigned pos[EDAC_MAX_LAYERS];

> -	unsigned size, tot_dimms = 1, count = 1;

> -	unsigned tot_csrows = 1, tot_channels = 1, tot_errcount = 0;

> +	unsigned size, tot_dimms = 1;

> +	unsigned tot_csrows = 1, tot_channels = 1;


Pls fix those while touching this:

WARNING: Prefer 'unsigned int' to bare use of 'unsigned'
#48: FILE: drivers/edac/edac_mc.c:317:
+       unsigned size, tot_dimms = 1;

WARNING: Prefer 'unsigned int' to bare use of 'unsigned'
#49: FILE: drivers/edac/edac_mc.c:318:
+       unsigned tot_csrows = 1, tot_channels = 1;

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

Patch

diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index f2acdab34eb7..bce39b2e10c9 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -313,10 +313,9 @@  struct mem_ctl_info *edac_mc_alloc(unsigned mc_num,
 	struct csrow_info *csr;
 	struct rank_info *chan;
 	struct dimm_info *dimm;
-	u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
 	unsigned pos[EDAC_MAX_LAYERS];
-	unsigned size, tot_dimms = 1, count = 1;
-	unsigned tot_csrows = 1, tot_channels = 1, tot_errcount = 0;
+	unsigned size, tot_dimms = 1;
+	unsigned tot_csrows = 1, tot_channels = 1;
 	void *pvt, *p, *ptr = NULL;
 	int idx, i, j, row, chn, n, len;
 	bool per_rank = false;
@@ -342,19 +341,10 @@  struct mem_ctl_info *edac_mc_alloc(unsigned mc_num,
 	 * stringent as what the compiler would provide if we could simply
 	 * hardcode everything into a single struct.
 	 */
-	mci = edac_align_ptr(&ptr, sizeof(*mci), 1);
-	layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers);
-	for (i = 0; i < n_layers; i++) {
-		count *= layers[i].size;
-		edac_dbg(4, "errcount layer %d size %d\n", i, count);
-		ce_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
-		ue_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
-		tot_errcount += 2 * count;
-	}
-
-	edac_dbg(4, "allocating %d error counters\n", tot_errcount);
-	pvt = edac_align_ptr(&ptr, sz_pvt, 1);
-	size = ((unsigned long)pvt) + sz_pvt;
+	mci	= edac_align_ptr(&ptr, sizeof(*mci), 1);
+	layer	= edac_align_ptr(&ptr, sizeof(*layer), n_layers);
+	pvt	= edac_align_ptr(&ptr, sz_pvt, 1);
+	size	= ((unsigned long)pvt) + sz_pvt;
 
 	edac_dbg(1, "allocating %u bytes for mci data (%d %s, %d csrows/channels)\n",
 		 size,
@@ -370,10 +360,6 @@  struct mem_ctl_info *edac_mc_alloc(unsigned mc_num,
 	 * rather than an imaginary chunk of memory located at address 0.
 	 */
 	layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned long)layer));
-	for (i = 0; i < n_layers; i++) {
-		mci->ce_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ce_per_layer[i]));
-		mci->ue_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ue_per_layer[i]));
-	}
 	pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL;
 
 	/* setup index and various internal pointers */
@@ -903,53 +889,31 @@  const char *edac_layer_name[] = {
 EXPORT_SYMBOL_GPL(edac_layer_name);
 
 static void edac_inc_ce_error(struct mem_ctl_info *mci,
-			      bool enable_per_layer_report,
 			      const int pos[EDAC_MAX_LAYERS],
 			      const u16 count)
 {
-	int i, index = 0;
+	struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]);
 
 	mci->ce_mc += count;
 
-	if (!enable_per_layer_report) {
+	if (dimm)
+		dimm->ce_count += count;
+	else
 		mci->ce_noinfo_count += count;
-		return;
-	}
-
-	for (i = 0; i < mci->n_layers; i++) {
-		if (pos[i] < 0)
-			break;
-		index += pos[i];
-		mci->ce_per_layer[i][index] += count;
-
-		if (i < mci->n_layers - 1)
-			index *= mci->layers[i + 1].size;
-	}
 }
 
 static void edac_inc_ue_error(struct mem_ctl_info *mci,
-				    bool enable_per_layer_report,
 				    const int pos[EDAC_MAX_LAYERS],
 				    const u16 count)
 {
-	int i, index = 0;
+	struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]);
 
 	mci->ue_mc += count;
 
-	if (!enable_per_layer_report) {
+	if (dimm)
+		dimm->ue_count += count;
+	else
 		mci->ue_noinfo_count += count;
-		return;
-	}
-
-	for (i = 0; i < mci->n_layers; i++) {
-		if (pos[i] < 0)
-			break;
-		index += pos[i];
-		mci->ue_per_layer[i][index] += count;
-
-		if (i < mci->n_layers - 1)
-			index *= mci->layers[i + 1].size;
-	}
 }
 
 static void edac_ce_error(struct mem_ctl_info *mci,
@@ -960,7 +924,6 @@  static void edac_ce_error(struct mem_ctl_info *mci,
 			  const char *label,
 			  const char *detail,
 			  const char *other_detail,
-			  const bool enable_per_layer_report,
 			  const unsigned long page_frame_number,
 			  const unsigned long offset_in_page,
 			  long grain)
@@ -983,7 +946,7 @@  static void edac_ce_error(struct mem_ctl_info *mci,
 				       error_count, msg, msg_aux, label,
 				       location, detail);
 	}
-	edac_inc_ce_error(mci, enable_per_layer_report, pos, error_count);
+	edac_inc_ce_error(mci, pos, error_count);
 
 	if (mci->scrub_mode == SCRUB_SW_SRC) {
 		/*
@@ -1013,8 +976,7 @@  static void edac_ue_error(struct mem_ctl_info *mci,
 			  const char *location,
 			  const char *label,
 			  const char *detail,
-			  const char *other_detail,
-			  const bool enable_per_layer_report)
+			  const char *other_detail)
 {
 	char *msg_aux = "";
 
@@ -1043,7 +1005,7 @@  static void edac_ue_error(struct mem_ctl_info *mci,
 			      msg, msg_aux, label, location, detail);
 	}
 
-	edac_inc_ue_error(mci, enable_per_layer_report, pos, error_count);
+	edac_inc_ue_error(mci, pos, error_count);
 }
 
 void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
@@ -1076,16 +1038,16 @@  void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
 			"page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx",
 			e->page_frame_number, e->offset_in_page,
 			e->grain, e->syndrome);
-		edac_ce_error(mci, e->error_count, pos, e->msg, e->location, e->label,
-			      detail, e->other_detail, e->enable_per_layer_report,
+		edac_ce_error(mci, e->error_count, pos, e->msg, e->location,
+			      e->label, detail, e->other_detail,
 			      e->page_frame_number, e->offset_in_page, e->grain);
 	} else {
 		snprintf(detail, sizeof(detail),
 			"page:0x%lx offset:0x%lx grain:%ld",
 			e->page_frame_number, e->offset_in_page, e->grain);
 
-		edac_ue_error(mci, e->error_count, pos, e->msg, e->location, e->label,
-			      detail, e->other_detail, e->enable_per_layer_report);
+		edac_ue_error(mci, e->error_count, pos, e->msg, e->location,
+			      e->label, detail, e->other_detail);
 	}
 
 
@@ -1110,6 +1072,7 @@  void edac_mc_handle_error(const enum hw_event_mc_err_type type,
 	int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer };
 	int i, n_labels = 0;
 	struct edac_raw_error_desc *e = &mci->error_desc;
+	bool per_layer_report = false;
 
 	edac_dbg(3, "MC%d\n", mci->mc_idx);
 
@@ -1127,9 +1090,9 @@  void edac_mc_handle_error(const enum hw_event_mc_err_type type,
 
 	/*
 	 * Check if the event report is consistent and if the memory
-	 * location is known. If it is known, enable_per_layer_report will be
-	 * true, the DIMM(s) label info will be filled and the per-layer
-	 * error counters will be incremented.
+	 * location is known. If it is known, the DIMM(s) label info
+	 * will be filled and the per-layer error counters will be
+	 * incremented.
 	 */
 	for (i = 0; i < mci->n_layers; i++) {
 		if (pos[i] >= (int)mci->layers[i].size) {
@@ -1147,7 +1110,7 @@  void edac_mc_handle_error(const enum hw_event_mc_err_type type,
 			pos[i] = -1;
 		}
 		if (pos[i] >= 0)
-			e->enable_per_layer_report = true;
+			per_layer_report = true;
 	}
 
 	/*
@@ -1176,15 +1139,18 @@  void edac_mc_handle_error(const enum hw_event_mc_err_type type,
 		if (dimm->grain > e->grain)
 			e->grain = dimm->grain;
 
+		if (!per_layer_report)
+			continue;
+
 		/*
 		 * If the error is memory-controller wide, there's no need to
 		 * seek for the affected DIMMs because the whole
 		 * channel/memory controller/...  may be affected.
 		 * Also, don't show errors for empty DIMM slots.
 		 */
-		if (e->enable_per_layer_report && dimm->nr_pages) {
+		if (dimm->nr_pages) {
 			if (n_labels >= EDAC_MAX_LABELS) {
-				e->enable_per_layer_report = false;
+				per_layer_report = false;
 				break;
 			}
 			n_labels++;
@@ -1215,7 +1181,7 @@  void edac_mc_handle_error(const enum hw_event_mc_err_type type,
 		}
 	}
 
-	if (!e->enable_per_layer_report) {
+	if (!per_layer_report) {
 		strcpy(e->label, "any memory");
 	} else {
 		edac_dbg(4, "csrow/channel to increment: (%d,%d)\n", row, chan);
diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c
index 4d15c88a52cd..a4c1b8501ff3 100644
--- a/drivers/edac/edac_mc_sysfs.c
+++ b/drivers/edac/edac_mc_sysfs.c
@@ -558,10 +558,8 @@  static ssize_t dimmdev_ce_count_show(struct device *dev,
 				      char *data)
 {
 	struct dimm_info *dimm = to_dimm(dev);
-	u32 count;
 
-	count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][dimm->idx];
-	return sprintf(data, "%u\n", count);
+	return sprintf(data, "%u\n", dimm->ce_count);
 }
 
 static ssize_t dimmdev_ue_count_show(struct device *dev,
@@ -569,10 +567,8 @@  static ssize_t dimmdev_ue_count_show(struct device *dev,
 				      char *data)
 {
 	struct dimm_info *dimm = to_dimm(dev);
-	u32 count;
 
-	count = dimm->mci->ue_per_layer[dimm->mci->n_layers-1][dimm->idx];
-	return sprintf(data, "%u\n", count);
+	return sprintf(data, "%u\n", dimm->ue_count);
 }
 
 /* dimm/rank attribute files */
@@ -660,7 +656,9 @@  static ssize_t mci_reset_counters_store(struct device *dev,
 					const char *data, size_t count)
 {
 	struct mem_ctl_info *mci = to_mci(dev);
-	int cnt, row, chan, i;
+	struct dimm_info *dimm;
+	int row, chan;
+
 	mci->ue_mc = 0;
 	mci->ce_mc = 0;
 	mci->ue_noinfo_count = 0;
@@ -676,11 +674,9 @@  static ssize_t mci_reset_counters_store(struct device *dev,
 			ri->channels[chan]->ce_count = 0;
 	}
 
-	cnt = 1;
-	for (i = 0; i < mci->n_layers; i++) {
-		cnt *= mci->layers[i].size;
-		memset(mci->ce_per_layer[i], 0, cnt * sizeof(u32));
-		memset(mci->ue_per_layer[i], 0, cnt * sizeof(u32));
+	mci_for_each_dimm(mci, dimm) {
+		dimm->ue_count = 0;
+		dimm->ce_count = 0;
 	}
 
 	mci->start_time = jiffies;
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 72e75ea5526c..757a02f2ce49 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -348,11 +348,8 @@  void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
 				     mem_err->mem_dev_handle);
 
 		index = get_dimm_smbios_index(mem_err->mem_dev_handle);
-		if (index >= 0) {
+		if (index >= 0)
 			e->top_layer = index;
-			e->enable_per_layer_report = true;
-		}
-
 	}
 	if (p > e->location)
 		*(p - 1) = '\0';
diff --git a/include/linux/edac.h b/include/linux/edac.h
index 20a04f48616c..4dcf075e9dff 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -383,6 +383,9 @@  struct dimm_info {
 	unsigned csrow, cschannel;	/* Points to the old API data */
 
 	u16 smbios_handle;              /* Handle for SMBIOS type 17 */
+
+	u32 ce_count;
+	u32 ue_count;
 };
 
 /**
@@ -453,8 +456,6 @@  struct errcount_attribute_data {
  * @location:			location of the error
  * @label:			label of the affected DIMM(s)
  * @other_detail:		other driver-specific detail about the error
- * @enable_per_layer_report:	if false, the error affects all layers
- *				(typically, a memory controller error)
  */
 struct edac_raw_error_desc {
 	/*
@@ -475,7 +476,6 @@  struct edac_raw_error_desc {
 	unsigned long syndrome;
 	const char *msg;
 	const char *other_detail;
-	bool enable_per_layer_report;
 };
 
 /* MEMORY controller information structure
@@ -565,7 +565,6 @@  struct mem_ctl_info {
 	 */
 	u32 ce_noinfo_count, ue_noinfo_count;
 	u32 ue_mc, ce_mc;
-	u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
 
 	struct completion complete;