From patchwork Fri Oct  2 12:22:29 2020
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Shiju Jose <shiju.jose@huawei.com>
X-Patchwork-Id: 313983
Delivered-To: patch@linaro.org
Received: by 2002:a05:6e02:1081:0:0:0:0 with SMTP id r1csp1411903ilj;
 Fri, 2 Oct 2020 05:27:39 -0700 (PDT)
X-Google-Smtp-Source: ABdhPJxlvDJnyf9dKj4pUDkDCjd958NEVe1++ActvuMmFsvccMcBEbnDlx+FRNnuY26RwKKZJjMi
X-Received: by 2002:a05:6402:1642:: with SMTP id
 s2mr2012950edx.295.1601641659157; 
 Fri, 02 Oct 2020 05:27:39 -0700 (PDT)
ARC-Seal: i=1; a=rsa-sha256; t=1601641659; cv=none;
 d=google.com; s=arc-20160816;
 b=VknFA7lD1+uYukr9vceSNG+qiqmiMpPQtwEDgKn3Yn0h+4eXZPbDZeB/UyRh3Krg9U
 L7wNut84kNQH4G6Eq+C1SW7WWJceMjcOhnqWi/Ltd0QuI0y5DwTPsLerdLPKrE6dA4nX
 eQ4ZnoDEbNLOTOQDIqqBFGhehqw+GS1tQ/WlKgzDnhZJW48OT+9fDHsExLgB4qadRzNK
 qDK4elqxhqi+hezP+RwrjkcGFHDiahpNEgCOgFuNDE52TsJNEnFiGGg0o8YnrsSeIl0T
 CshnqJWgn0qP24CXug9Nclzarp5r8ILAZo6rFARNwo/dz7KBUQRneBsctVxD4UyfkRzc
 PV7w==
ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com;
 s=arc-20160816; 
 h=list-id:precedence:content-transfer-encoding:mime-version
 :references:in-reply-to:message-id:date:subject:cc:to:from;
 bh=rAqsse5LlfdL3l/wgvLOMqKomXdjECvIXVtGbfUp5uM=;
 b=YrkHLTEnpYsb3sqAKbNIk8rF0HYtu/dAEXdFfcpP5r5AVWEjlyPvxDaJQG+0TI3fke
 GsIbVv84AauFSAsgz9AlHyFDllx9sxTODnpAVrxEHRF01kpwrU9mBzBe94HqN2/pIcAr
 ZL0rY7AZ967MCOI3VD3NDCWF9x79SkGCHs5IfTo8bw+n3XB4bucDwf4QLLQ5a+tWViaS
 nKn6NEtb3XV787VIiWVh9TJeWEZTgwKyMtQ+AWHCpx081niTe44e362Lrky/VsBjuSAA
 3s29jx9wqrRMKquJ8xqv2KQWDMxOQPkUHHHJrwDM80WFNy6nXdRmzSawiRGWXGwiEtnN
 +/vA==
ARC-Authentication-Results: i=1; mx.google.com;
 spf=pass (google.com: domain of linux-acpi-owner@vger.kernel.org
 designates 23.128.96.18 as permitted sender)
 smtp.mailfrom=linux-acpi-owner@vger.kernel.org
Return-Path: <linux-acpi-owner@vger.kernel.org>
Received: from vger.kernel.org (vger.kernel.org. [23.128.96.18])
 by mx.google.com with ESMTP id i7si1027217edf.103.2020.10.02.05.27.38;
 Fri, 02 Oct 2020 05:27:39 -0700 (PDT)
Received-SPF: pass (google.com: domain of linux-acpi-owner@vger.kernel.org
 designates 23.128.96.18 as permitted sender)
 client-ip=23.128.96.18; 
Authentication-Results: mx.google.com;
 spf=pass (google.com: domain of linux-acpi-owner@vger.kernel.org
 designates 23.128.96.18 as permitted sender)
 smtp.mailfrom=linux-acpi-owner@vger.kernel.org
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
 id S2387787AbgJBM1f (ORCPT <rfc822;patch@linaro.org> + 7 others);
 Fri, 2 Oct 2020 08:27:35 -0400
Received: from lhrrgout.huawei.com ([185.176.76.210]:2943 "EHLO huawei.com"
 rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP
 id S1726176AbgJBM1f (ORCPT <rfc822;linux-acpi@vger.kernel.org>);
 Fri, 2 Oct 2020 08:27:35 -0400
Received: from lhreml715-chm.china.huawei.com (unknown [172.18.7.106])
 by Forcepoint Email with ESMTP id 9466F8D17E98730A081B;
 Fri,  2 Oct 2020 13:27:33 +0100 (IST)
Received: from DESKTOP-6T4S3DQ.china.huawei.com (10.47.84.119) by
 lhreml715-chm.china.huawei.com (10.201.108.66) with Microsoft SMTP
 Server
 (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id
 15.1.1913.5; Fri, 2 Oct 2020 13:27:33 +0100
From: Shiju Jose <shiju.jose@huawei.com>
To: <linux-edac@vger.kernel.org>, <linux-acpi@vger.kernel.org>,
 <linux-kernel@vger.kernel.org>, <bp@alien8.de>,
 <tony.luck@intel.com>, <rjw@rjwysocki.net>, <james.morse@arm.com>,
 <lenb@kernel.org>
CC: <linuxarm@huawei.com>, <shiju.jose@huawei.com>
Subject: [RFC PATCH 1/7] RAS/CEC: Replace the macro PFN with ELEM_NO
Date: Fri, 2 Oct 2020 13:22:29 +0100
Message-ID: <20201002122235.1280-2-shiju.jose@huawei.com>
X-Mailer: git-send-email 2.26.0.windows.1
In-Reply-To: <20201002122235.1280-1-shiju.jose@huawei.com>
References: <20201002122235.1280-1-shiju.jose@huawei.com>
MIME-Version: 1.0
X-Originating-IP: [10.47.84.119]
X-ClientProxiedBy: lhreml720-chm.china.huawei.com (10.201.108.71) To
 lhreml715-chm.china.huawei.com (10.201.108.66)
X-CFilter-Loop: Reflected
Precedence: bulk
List-ID: <linux-acpi.vger.kernel.org>
X-Mailing-List: linux-acpi@vger.kernel.org

Replace the macro PFN with ELEM_NO, which takes the shift count as a
parameter, so that it can be used for elements other than pages.

Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
---
 drivers/ras/cec.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

-- 
2.17.1

diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index 569d9ad2c594..22d11c66c266 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -86,7 +86,7 @@
  * u64: [ 63 ... 12 | DECAY_BITS | COUNT_BITS ]
  */
 
-#define PFN(e)			((e) >> PAGE_SHIFT)
+#define ELEM_NO(e, shift)	((e) >> (shift))
 #define DECAY(e)		(((e) >> COUNT_BITS) & DECAY_MASK)
 #define COUNT(e)		((unsigned int)(e) & COUNT_MASK)
 #define FULL_COUNT(e)		((e) & (PAGE_SIZE - 1))
@@ -113,6 +113,10 @@ static struct ce_array {
 					 * Times we did spring cleaning.
 					 */
 
+	u8 id_shift;			/*
+					 * shift for element id.
+					 */
+
 	union {
 		struct {
 			__u32	disabled : 1,	/* cmdline disabled */
@@ -191,7 +195,7 @@ static int __find_elem(struct ce_array *ca, u64 pfn, unsigned int *to)
 	while (min <= max) {
 		int i = (min + max) >> 1;
 
-		this_pfn = PFN(ca->array[i]);
+		this_pfn = ELEM_NO(ca->array[i], ca->id_shift);
 
 		if (this_pfn < pfn)
 			min = i + 1;
@@ -258,7 +262,7 @@ static u64 del_lru_elem_unlocked(struct ce_array *ca)
 
 	del_elem(ca, min_idx);
 
-	return PFN(ca->array[min_idx]);
+	return ELEM_NO(ca->array[min_idx], ca->id_shift);
 }
 
 /*
@@ -287,7 +291,7 @@ static bool sanity_check(struct ce_array *ca)
 	int i;
 
 	for (i = 0; i < ca->n; i++) {
-		u64 this = PFN(ca->array[i]);
+		u64 this = ELEM_NO(ca->array[i], ca->id_shift);
 
 		if (WARN(prev > this, "prev: 0x%016llx <-> this: 0x%016llx\n", prev, this))
 			ret = true;
@@ -300,7 +304,7 @@ static bool sanity_check(struct ce_array *ca)
 
 	pr_info("Sanity check dump:\n{ n: %d\n", ca->n);
 	for (i = 0; i < ca->n; i++) {
-		u64 this = PFN(ca->array[i]);
+		u64 this = ELEM_NO(ca->array[i], ca->id_shift);
 
 		pr_info(" %03d: [%016llx|%03llx]\n", i, this, FULL_COUNT(ca->array[i]));
 	}
@@ -444,7 +448,7 @@ static int array_dump(struct seq_file *m, void *v)
 
 	seq_printf(m, "{ n: %d\n", ca->n);
 	for (i = 0; i < ca->n; i++) {
-		u64 this = PFN(ca->array[i]);
+		u64 this = ELEM_NO(ca->array[i], ca->id_shift);
 
 		seq_printf(m, " %3d: [%016llx|%s|%03llx]\n",
 			   i, this, bins[DECAY(ca->array[i])], COUNT(ca->array[i]));
@@ -569,6 +573,7 @@ static void __init cec_init(void)
 		return;
 	}
 
+	ce_arr.id_shift = PAGE_SHIFT;
 	INIT_DELAYED_WORK(&cec_work, cec_work_fn);
 	schedule_delayed_work(&cec_work, CEC_DECAY_DEFAULT_INTERVAL);
 

From patchwork Fri Oct  2 12:22:30 2020
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Shiju Jose <shiju.jose@huawei.com>
X-Patchwork-Id: 313984
Delivered-To: patch@linaro.org
Received: by 2002:a05:6e02:1081:0:0:0:0 with SMTP id r1csp1412436ilj;
 Fri, 2 Oct 2020 05:28:13 -0700 (PDT)
X-Google-Smtp-Source: ABdhPJxMlK77VKNTh8ub7/xUNZspS8LRfpnhu9IxJAKOs1YqKQA+sYng28nzBvo9sgXbZRFfR3RE
X-Received: by 2002:a17:906:1955:: with SMTP id
 b21mr2096557eje.42.1601641692936; 
 Fri, 02 Oct 2020 05:28:12 -0700 (PDT)
ARC-Seal: i=1; a=rsa-sha256; t=1601641692; cv=none;
 d=google.com; s=arc-20160816;
 b=K0T4hw3LFk0J69gsjXp9cwZfoY8Mn1cEgWLd3TZdIBBIpXgT1NTihvqTu7QfCoGK+u
 h2lrGBtCK0ON6CWU88jItrSz3G7Siz20jALsvaide6QDlvILvsG7NaGFnMkFZ4pgyDbT
 E2O8/WGQ9uLvapr7gysmG+qfuwMYD403zdLySnApax7OFLBJChfCzxcQPSZCvNbt+GrK
 vTfXHTvtn6RzrPvPlnOudHTHOujW7OlqHf478djz3dpOx6ZWCh+OreODhAf2MByup2JT
 BDeHWmOEDF23VLGgU8RhBzh22npSzLTTvQfOzPaFhxm4QMleNDCnavtrZr706JdNcLue
 +Pzw==
ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com;
 s=arc-20160816; 
 h=list-id:precedence:content-transfer-encoding:mime-version
 :references:in-reply-to:message-id:date:subject:cc:to:from;
 bh=LWxvSKSbAeA4iwQB3M5oIlzzEeacfrTY+9gi3P9/wP0=;
 b=hZDJzVCFbwDzyiI/3O1VGJ2jxoR4dabNYRdLs8GgofOfs8ysSnMfFq0kPM/IccCVCc
 +A1xuB4Xij0qK+9ToIS8VWf7iSazhVE6ogxE2febi6vnne8FFJpo4EEhhb2PcVgeu+Yw
 k6fOIsiqyGEFMx4F/WAJOVOWqnAKMVgV0nrIaDPCGy6uK6c+1HmZpFB9aPvHoSzr//+l
 Hbq01m7S5xH/aL8kUjQHAQa7gknmBASfPKZfXxMaezgcyEytSO4NQ/E23kkKo/k6RevP
 BvibSlEyaglzQLSG6QFmZDjnxceZM0xNRXviolqDcsL+g4fZOlKThmwNJxxMzSwIVaio
 klTw==
ARC-Authentication-Results: i=1; mx.google.com;
 spf=pass (google.com: domain of linux-acpi-owner@vger.kernel.org
 designates 23.128.96.18 as permitted sender)
 smtp.mailfrom=linux-acpi-owner@vger.kernel.org
Return-Path: <linux-acpi-owner@vger.kernel.org>
Received: from vger.kernel.org (vger.kernel.org. [23.128.96.18])
 by mx.google.com with ESMTP id n2si968319edi.564.2020.10.02.05.28.12; 
 Fri, 02 Oct 2020 05:28:12 -0700 (PDT)
Received-SPF: pass (google.com: domain of linux-acpi-owner@vger.kernel.org
 designates 23.128.96.18 as permitted sender)
 client-ip=23.128.96.18; 
Authentication-Results: mx.google.com;
 spf=pass (google.com: domain of linux-acpi-owner@vger.kernel.org
 designates 23.128.96.18 as permitted sender)
 smtp.mailfrom=linux-acpi-owner@vger.kernel.org
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
 id S2387806AbgJBM2J (ORCPT <rfc822;patch@linaro.org> + 7 others);
 Fri, 2 Oct 2020 08:28:09 -0400
Received: from lhrrgout.huawei.com ([185.176.76.210]:2944 "EHLO huawei.com"
 rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP
 id S1726010AbgJBM2I (ORCPT <rfc822;linux-acpi@vger.kernel.org>);
 Fri, 2 Oct 2020 08:28:08 -0400
Received: from lhreml715-chm.china.huawei.com (unknown [172.18.7.106])
 by Forcepoint Email with ESMTP id 2E84F37E9A5E1E12C15D;
 Fri,  2 Oct 2020 13:28:07 +0100 (IST)
Received: from DESKTOP-6T4S3DQ.china.huawei.com (10.47.84.119) by
 lhreml715-chm.china.huawei.com (10.201.108.66) with Microsoft SMTP
 Server
 (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id
 15.1.1913.5; Fri, 2 Oct 2020 13:28:06 +0100
From: Shiju Jose <shiju.jose@huawei.com>
To: <linux-edac@vger.kernel.org>, <linux-acpi@vger.kernel.org>,
 <linux-kernel@vger.kernel.org>, <bp@alien8.de>,
 <tony.luck@intel.com>, <rjw@rjwysocki.net>, <james.morse@arm.com>,
 <lenb@kernel.org>
CC: <linuxarm@huawei.com>, <shiju.jose@huawei.com>
Subject: [RFC PATCH 2/7] RAS/CEC: Replace pfns_poisoned with elems_poisoned
Date: Fri, 2 Oct 2020 13:22:30 +0100
Message-ID: <20201002122235.1280-3-shiju.jose@huawei.com>
X-Mailer: git-send-email 2.26.0.windows.1
In-Reply-To: <20201002122235.1280-1-shiju.jose@huawei.com>
References: <20201002122235.1280-1-shiju.jose@huawei.com>
MIME-Version: 1.0
X-Originating-IP: [10.47.84.119]
X-ClientProxiedBy: lhreml720-chm.china.huawei.com (10.201.108.71) To
 lhreml715-chm.china.huawei.com (10.201.108.66)
X-CFilter-Loop: Reflected
Precedence: bulk
List-ID: <linux-acpi.vger.kernel.org>
X-Mailing-List: linux-acpi@vger.kernel.org

Replace the variable pfns_poisoned with elems_poisoned
for common use.

Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
---
 drivers/ras/cec.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

-- 
2.17.1

diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index 22d11c66c266..f20da1103f27 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -100,8 +100,8 @@ static struct ce_array {
 					 * since the last spring cleaning.
 					 */
 
-	u64 pfns_poisoned;		/*
-					 * number of PFNs which got poisoned.
+	u64 elems_poisoned;		/*
+					 * number of elements which got poisoned.
 					 */
 
 	u64 ces_entered;		/*
@@ -362,7 +362,7 @@ static int cec_add_elem(u64 pfn)
 			/* We have reached max count for this page, soft-offline it. */
 			pr_err("Soft-offlining pfn: 0x%llx\n", pfn);
 			memory_failure_queue(pfn, MF_SOFT_OFFLINE);
-			ca->pfns_poisoned++;
+			ca->elems_poisoned++;
 		}
 
 		del_elem(ca, to);
@@ -457,7 +457,7 @@ static int array_dump(struct seq_file *m, void *v)
 	seq_printf(m, "}\n");
 
 	seq_printf(m, "Stats:\nCEs: %llu\nofflined pages: %llu\n",
-		   ca->ces_entered, ca->pfns_poisoned);
+		   ca->ces_entered, ca->elems_poisoned);
 
 	seq_printf(m, "Flags: 0x%x\n", ca->flags);
 

From patchwork Fri Oct  2 12:22:31 2020
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Shiju Jose <shiju.jose@huawei.com>
X-Patchwork-Id: 266968
Return-Path: <SRS0=bNam=DJ=vger.kernel.org=linux-acpi-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
 aws-us-west-2-korg-lkml-1.web.codeaurora.org
X-Spam-Level: 
X-Spam-Status: No, score=-12.7 required=3.0 tests=BAYES_00,
 HEADER_FROM_DIFFERENT_DOMAINS, INCLUDES_PATCH, MAILING_LIST_MULTI,
 SIGNED_OFF_BY, 
 SPF_HELO_NONE,SPF_PASS,URIBL_BLOCKED,USER_AGENT_GIT autolearn=ham
 autolearn_force=no version=3.4.0
Received: from mail.kernel.org (mail.kernel.org [198.145.29.99])
 by smtp.lore.kernel.org (Postfix) with ESMTP id 6A87DC47423
 for <linux-acpi@archiver.kernel.org>;
 Fri,  2 Oct 2020 12:29:01 +0000 (UTC)
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
 by mail.kernel.org (Postfix) with ESMTP id 3651D21D6C
 for <linux-acpi@archiver.kernel.org>;
 Fri,  2 Oct 2020 12:29:01 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
 id S1726386AbgJBM3A (ORCPT <rfc822; linux-acpi@archiver.kernel.org>); 
 Fri, 2 Oct 2020 08:29:00 -0400
Received: from lhrrgout.huawei.com ([185.176.76.210]:2945 "EHLO huawei.com"
 rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP
 id S1726090AbgJBM3A (ORCPT <rfc822;linux-acpi@vger.kernel.org>);
 Fri, 2 Oct 2020 08:29:00 -0400
Received: from lhreml715-chm.china.huawei.com (unknown [172.18.7.106])
 by Forcepoint Email with ESMTP id 49972A71E05F8C9D331E;
 Fri,  2 Oct 2020 13:28:59 +0100 (IST)
Received: from DESKTOP-6T4S3DQ.china.huawei.com (10.47.84.119) by
 lhreml715-chm.china.huawei.com (10.201.108.66) with Microsoft SMTP
 Server
 (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id
 15.1.1913.5; Fri, 2 Oct 2020 13:28:58 +0100
From: Shiju Jose <shiju.jose@huawei.com>
To: <linux-edac@vger.kernel.org>, <linux-acpi@vger.kernel.org>,
 <linux-kernel@vger.kernel.org>, <bp@alien8.de>,
 <tony.luck@intel.com>, <rjw@rjwysocki.net>, <james.morse@arm.com>,
 <lenb@kernel.org>
CC: <linuxarm@huawei.com>, <shiju.jose@huawei.com>
Subject: [RFC PATCH 3/7] RAS/CEC: Move X86 MCE specific code under
 CONFIG_X86_MCE
Date: Fri, 2 Oct 2020 13:22:31 +0100
Message-ID: <20201002122235.1280-4-shiju.jose@huawei.com>
X-Mailer: git-send-email 2.26.0.windows.1
In-Reply-To: <20201002122235.1280-1-shiju.jose@huawei.com>
References: <20201002122235.1280-1-shiju.jose@huawei.com>
MIME-Version: 1.0
X-Originating-IP: [10.47.84.119]
X-ClientProxiedBy: lhreml720-chm.china.huawei.com (10.201.108.71) To
 lhreml715-chm.china.huawei.com (10.201.108.66)
X-CFilter-Loop: Reflected
Precedence: bulk
List-ID: <linux-acpi.vger.kernel.org>
X-Mailing-List: linux-acpi@vger.kernel.org

CEC may need to support other architectures such as ARM64.
Move X86 MCE specific code under CONFIG_X86_MCE to support
building for other architectures.

Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
---
 drivers/ras/cec.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index f20da1103f27..803e641d8e5c 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -8,7 +8,9 @@
 #include <linux/kernel.h>
 #include <linux/workqueue.h>
 
+#if defined(CONFIG_X86_MCE)
 #include <asm/mce.h>
+#endif
 
 #include "debugfs.h"
 
@@ -511,6 +513,7 @@ static int __init create_debugfs_nodes(void)
 	if (!IS_ENABLED(CONFIG_RAS_CEC_DEBUG))
 		return 0;
 
+#if defined(CONFIG_X86_MCE)
 	pfn = debugfs_create_file("pfn", S_IRUSR | S_IWUSR, d, &dfs_pfn, &pfn_ops);
 	if (!pfn) {
 		pr_warn("Error creating pfn debugfs node!\n");
@@ -522,6 +525,7 @@ static int __init create_debugfs_nodes(void)
 		pr_warn("Error creating array debugfs node!\n");
 		goto err;
 	}
+#endif
 
 	return 0;
 
@@ -531,6 +535,7 @@ static int __init create_debugfs_nodes(void)
 	return 1;
 }
 
+#if defined(CONFIG_X86_MCE)
 static int cec_notifier(struct notifier_block *nb, unsigned long val,
 			void *data)
 {
@@ -556,28 +561,33 @@ static struct notifier_block cec_nb = {
 	.notifier_call	= cec_notifier,
 	.priority	= MCE_PRIO_CEC,
 };
+#endif
 
 static void __init cec_init(void)
 {
 	if (ce_arr.disabled)
 		return;
 
+#if defined(CONFIG_X86_MCE)
 	ce_arr.array = (void *)get_zeroed_page(GFP_KERNEL);
 	if (!ce_arr.array) {
 		pr_err("Error allocating CE array page!\n");
 		return;
 	}
+#endif
 
 	if (create_debugfs_nodes()) {
 		free_page((unsigned long)ce_arr.array);
 		return;
 	}
 
+#if defined(CONFIG_X86_MCE)
 	ce_arr.id_shift = PAGE_SHIFT;
 	INIT_DELAYED_WORK(&cec_work, cec_work_fn);
 	schedule_delayed_work(&cec_work, CEC_DECAY_DEFAULT_INTERVAL);
 
 	mce_register_decode_chain(&cec_nb);
+#endif
 
 	pr_info("Correctable Errors collector initialized.\n");
 }

From patchwork Fri Oct  2 12:22:32 2020
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Shiju Jose <shiju.jose@huawei.com>
X-Patchwork-Id: 313986
Delivered-To: patch@linaro.org
Received: by 2002:a05:6e02:1081:0:0:0:0 with SMTP id r1csp1414415ilj;
 Fri, 2 Oct 2020 05:29:55 -0700 (PDT)
X-Google-Smtp-Source: ABdhPJwbmwc4pfrWecmPiZGRwoA4QizTljZttodxS0Eu2AfIo0Z+W3x/5Ksy7TtR6b+40YeFphpX
X-Received: by 2002:a17:907:10db:: with SMTP id
 rv27mr1930091ejb.223.1601641795623; 
 Fri, 02 Oct 2020 05:29:55 -0700 (PDT)
ARC-Seal: i=1; a=rsa-sha256; t=1601641795; cv=none;
 d=google.com; s=arc-20160816;
 b=KL9BCXxNrKbgj185kqS3zHNH6Jd3x999abxdpH2S/LQBEiNopod+bCLWlVMm3jIKsj
 KLgeJ13bVpxTvRZ1DqU7WIdoh9lw2+Vy3fRQ/fGYGAWkSXVbPZwflwiJWTYWpyuqk34G
 oOiIjDZAU/s+i3J0c9jgboY6g2okO9gwzOb+0QaxTrMq8js7Y7N0mBRlB+mqrKz1TNSh
 E0lRbvp2JLuFyEwL7AOgDiU2ziX3l85C32RMv4hzYi/XLb+T+/RCDHky3gvbcoVRrsM4
 qBeC4m82+VPSilmM3NmOiU+t5GQ1Fr8EiFSvQrOJ0FXjV9jLGEJ8ZSmNykQcGukkRrgc
 6wGA==
ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com;
 s=arc-20160816; 
 h=list-id:precedence:content-transfer-encoding:mime-version
 :references:in-reply-to:message-id:date:subject:cc:to:from;
 bh=iQ0qMQK7aOX3td805pw6eHxd8gAEcGzc/sM+nt+GUi4=;
 b=mECs/4IQJXfozzEji18j4n4KNKZCT5u/RO3inkNAHntgEHnbv9fj9Aav9DStq3zyq7
 qIalZN3h2d0DwetwuInHJiyD+UIG5cTm9NvuIZaK5CBG15WNgoHQx9HQBPCBUwH1BwH+
 oanwHUPFYGiK8+z+e2IuVL1Vq2MYr/zlE8hH9RJjbN+ZVRGjR6XIVJFDT1wJSJvZytxE
 tU6MO2k5MOG/t2R9LA0CthQ/fF7HL31VZQEh5UsGGJj4AOvywmEYbSXUrU60RH9wSTHd
 KtxNrV9ImdmjaT4UfiWYQKQj9nyLxHNIiVvkl7mRiwDIbhsWKx1VwrIuVPp2riRybsZr
 MVPQ==
ARC-Authentication-Results: i=1; mx.google.com;
 spf=pass (google.com: domain of linux-acpi-owner@vger.kernel.org
 designates 23.128.96.18 as permitted sender)
 smtp.mailfrom=linux-acpi-owner@vger.kernel.org
Return-Path: <linux-acpi-owner@vger.kernel.org>
Received: from vger.kernel.org (vger.kernel.org. [23.128.96.18])
 by mx.google.com with ESMTP id n2si968319edi.564.2020.10.02.05.29.55; 
 Fri, 02 Oct 2020 05:29:55 -0700 (PDT)
Received-SPF: pass (google.com: domain of linux-acpi-owner@vger.kernel.org
 designates 23.128.96.18 as permitted sender)
 client-ip=23.128.96.18; 
Authentication-Results: mx.google.com;
 spf=pass (google.com: domain of linux-acpi-owner@vger.kernel.org
 designates 23.128.96.18 as permitted sender)
 smtp.mailfrom=linux-acpi-owner@vger.kernel.org
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
 id S2387677AbgJBM3y (ORCPT <rfc822;patch@linaro.org> + 7 others);
 Fri, 2 Oct 2020 08:29:54 -0400
Received: from lhrrgout.huawei.com ([185.176.76.210]:2946 "EHLO huawei.com"
 rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP
 id S1726090AbgJBM3x (ORCPT <rfc822;linux-acpi@vger.kernel.org>);
 Fri, 2 Oct 2020 08:29:53 -0400
Received: from lhreml715-chm.china.huawei.com (unknown [172.18.7.107])
 by Forcepoint Email with ESMTP id BE7E84547CA7565F6D7E;
 Fri,  2 Oct 2020 13:29:51 +0100 (IST)
Received: from DESKTOP-6T4S3DQ.china.huawei.com (10.47.84.119) by
 lhreml715-chm.china.huawei.com (10.201.108.66) with Microsoft SMTP
 Server
 (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id
 15.1.1913.5; Fri, 2 Oct 2020 13:29:51 +0100
From: Shiju Jose <shiju.jose@huawei.com>
To: <linux-edac@vger.kernel.org>, <linux-acpi@vger.kernel.org>,
 <linux-kernel@vger.kernel.org>, <bp@alien8.de>,
 <tony.luck@intel.com>, <rjw@rjwysocki.net>, <james.morse@arm.com>,
 <lenb@kernel.org>
CC: <linuxarm@huawei.com>, <shiju.jose@huawei.com>
Subject: [RFC PATCH 4/7] RAS/CEC: Modify cec_mod_work() for common use
Date: Fri, 2 Oct 2020 13:22:32 +0100
Message-ID: <20201002122235.1280-5-shiju.jose@huawei.com>
X-Mailer: git-send-email 2.26.0.windows.1
In-Reply-To: <20201002122235.1280-1-shiju.jose@huawei.com>
References: <20201002122235.1280-1-shiju.jose@huawei.com>
MIME-Version: 1.0
X-Originating-IP: [10.47.84.119]
X-ClientProxiedBy: lhreml720-chm.china.huawei.com (10.201.108.71) To
 lhreml715-chm.china.huawei.com (10.201.108.66)
X-CFilter-Loop: Reflected
Precedence: bulk
List-ID: <linux-acpi.vger.kernel.org>
X-Mailing-List: linux-acpi@vger.kernel.org

Modify the function cec_mod_work() to take the delayed work as a
parameter, so that it can be used with other error sources.

Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
---
 drivers/ras/cec.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

-- 
2.17.1

diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index 803e641d8e5c..f869e7a270b8 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -167,12 +167,12 @@ static void do_spring_cleaning(struct ce_array *ca)
 /*
  * @interval in seconds
  */
-static void cec_mod_work(unsigned long interval)
+static void cec_mod_work(struct delayed_work *dwork, unsigned long interval)
 {
 	unsigned long iv;
 
 	iv = interval * HZ;
-	mod_delayed_work(system_wq, &cec_work, round_jiffies(iv));
+	mod_delayed_work(system_wq, dwork, round_jiffies(iv));
 }
 
 static void cec_work_fn(struct work_struct *work)
@@ -181,7 +181,7 @@ static void cec_work_fn(struct work_struct *work)
 	do_spring_cleaning(&ce_arr);
 	mutex_unlock(&ce_mutex);
 
-	cec_mod_work(decay_interval);
+	cec_mod_work(&cec_work, decay_interval);
 }
 
 /*
@@ -420,7 +420,7 @@ static int decay_interval_set(void *data, u64 val)
 	*(u64 *)data   = val;
 	decay_interval = val;
 
-	cec_mod_work(decay_interval);
+	cec_mod_work(&cec_work, decay_interval);
 
 	return 0;
 }

From patchwork Fri Oct  2 12:22:33 2020
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Shiju Jose <shiju.jose@huawei.com>
X-Patchwork-Id: 266967
Return-Path: <SRS0=bNam=DJ=vger.kernel.org=linux-acpi-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
 aws-us-west-2-korg-lkml-1.web.codeaurora.org
X-Spam-Level: 
X-Spam-Status: No, score=-12.7 required=3.0 tests=BAYES_00,
 HEADER_FROM_DIFFERENT_DOMAINS, INCLUDES_PATCH, MAILING_LIST_MULTI,
 SIGNED_OFF_BY, SPF_HELO_NONE, SPF_PASS, URIBL_BLOCKED,
 USER_AGENT_GIT autolearn=unavailable
 autolearn_force=no version=3.4.0
Received: from mail.kernel.org (mail.kernel.org [198.145.29.99])
 by smtp.lore.kernel.org (Postfix) with ESMTP id B5804C47425
 for <linux-acpi@archiver.kernel.org>;
 Fri,  2 Oct 2020 12:30:31 +0000 (UTC)
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
 by mail.kernel.org (Postfix) with ESMTP id 80A94207EA
 for <linux-acpi@archiver.kernel.org>;
 Fri,  2 Oct 2020 12:30:31 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
 id S2387789AbgJBMa2 (ORCPT <rfc822; linux-acpi@archiver.kernel.org>); 
 Fri, 2 Oct 2020 08:30:28 -0400
Received: from lhrrgout.huawei.com ([185.176.76.210]:2947 "EHLO huawei.com"
 rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP
 id S1726090AbgJBMa2 (ORCPT <rfc822;linux-acpi@vger.kernel.org>);
 Fri, 2 Oct 2020 08:30:28 -0400
Received: from lhreml715-chm.china.huawei.com (unknown [172.18.7.106])
 by Forcepoint Email with ESMTP id B99506BA6F7C194C561B;
 Fri,  2 Oct 2020 13:30:26 +0100 (IST)
Received: from DESKTOP-6T4S3DQ.china.huawei.com (10.47.84.119) by
 lhreml715-chm.china.huawei.com (10.201.108.66) with Microsoft SMTP
 Server
 (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id
 15.1.1913.5; Fri, 2 Oct 2020 13:30:26 +0100
From: Shiju Jose <shiju.jose@huawei.com>
To: <linux-edac@vger.kernel.org>, <linux-acpi@vger.kernel.org>,
 <linux-kernel@vger.kernel.org>, <bp@alien8.de>,
 <tony.luck@intel.com>, <rjw@rjwysocki.net>, <james.morse@arm.com>,
 <lenb@kernel.org>
CC: <linuxarm@huawei.com>, <shiju.jose@huawei.com>
Subject: [RFC PATCH 5/7] RAS/CEC: Add support for errors count check on
 short time period
Date: Fri, 2 Oct 2020 13:22:33 +0100
Message-ID: <20201002122235.1280-6-shiju.jose@huawei.com>
X-Mailer: git-send-email 2.26.0.windows.1
In-Reply-To: <20201002122235.1280-1-shiju.jose@huawei.com>
References: <20201002122235.1280-1-shiju.jose@huawei.com>
MIME-Version: 1.0
X-Originating-IP: [10.47.84.119]
X-ClientProxiedBy: lhreml720-chm.china.huawei.com (10.201.108.71) To
 lhreml715-chm.china.huawei.com (10.201.108.66)
X-CFilter-Loop: Reflected
Precedence: bulk
List-ID: <linux-acpi.vger.kernel.org>
X-Mailing-List: linux-acpi@vger.kernel.org

Some types of elements, for example a CPU core, should be isolated
when corrected errors are reported too often. This is used for
early fault prediction and helps to prevent serious faults by
taking corrective action.
Modify CEC to support checking the error count over a short
time period. Implementation details are described in the file.

Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
---
 drivers/ras/cec.c | 125 ++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 109 insertions(+), 16 deletions(-)

diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index f869e7a270b8..ca52917d514c 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -119,6 +119,23 @@ static struct ce_array {
 					 * shift for element id.
 					 */
 
+	struct delayed_work work;	/*
+					 * delayed work.
+					 */
+
+	bool short_period;		/* Indicates threshold check for the error count
+					 * over short time period.
+					 */
+
+	u8 time_slot;			/*
+					 * time slot's number within the decay interval.
+					 */
+
+	union {
+		struct mutex	mutex;
+		spinlock_t	spin_lock;
+	};
+
 	union {
 		struct {
 			__u32	disabled : 1,	/* cmdline disabled */
@@ -128,7 +145,6 @@ static struct ce_array {
 	};
 } ce_arr;
 
-static DEFINE_MUTEX(ce_mutex);
 static u64 dfs_pfn;
 
 /* Amount of errors after which we offline */
@@ -138,9 +154,35 @@ static u64 action_threshold = COUNT_MASK;
 #define CEC_DECAY_DEFAULT_INTERVAL	24 * 60 * 60	/* 24 hrs */
 #define CEC_DECAY_MIN_INTERVAL		 1 * 60 * 60	/* 1h */
 #define CEC_DECAY_MAX_INTERVAL	   30 *	24 * 60 * 60	/* one month */
-static struct delayed_work cec_work;
 static u64 decay_interval = CEC_DECAY_DEFAULT_INTERVAL;
 
+/* Definitions for elements (for example a CPU) for which the
+ * error count over a short time period is checked against a threshold.
+ *
+ * An element such as a CPU core may need to be isolated when a large number
+ * of correctable errors are reported on that element too often, i.e. when
+ * the CE count exceeds the threshold value within a short time period.
+ *
+ * The decay interval is divided into a number of time slots. The CE collector
+ * calculates the average error count at the end of each decay interval. The
+ * average count is then subtracted from the total count in each of the
+ * following time slots. The work function for the decay interval is scheduled
+ * with the reduced period = decay interval / number of time slots. When a new
+ * CE is added for a CPU, the element is offlined if the sum of the most
+ * recent CE counts exceeds the CE threshold value.
+ */
+
+/*
+ * u64: [ 63 ELEM ID 23 | ELEM_STATUS_BIT 22 | 21 AVG_COUNT_BITS 12 | 11 DECAY_BITS 10 | 9 COUNT_BITS 0]
+ */
+
+/* Number of time slots in the decay interval */
+#define RAS_CEC_NUM_TIME_SLOTS	10
+
+#define AVG_COUNT_SHIFT	(DECAY_BITS + COUNT_BITS)
+#define ELEM_STATUS_BIT	BIT(22)	/* Indicates an element offlined by CEC */
+#define ELEM_ID_SHIFT	(1 + AVG_COUNT_SHIFT + COUNT_BITS)
+
 /*
  * Decrement decay value. We're using DECAY_BITS bits to denote decay of an
  * element in the array. On insertion and any access, it gets reset to max.
@@ -177,11 +219,62 @@ static void cec_mod_work(struct delayed_work *dwork, unsigned long interval)
 
 static void cec_work_fn(struct work_struct *work)
 {
-	mutex_lock(&ce_mutex);
-	do_spring_cleaning(&ce_arr);
-	mutex_unlock(&ce_mutex);
+	struct ce_array *ca;
+	unsigned long flags;
+	u64 avg_count;
+	int i, time_slots = 1;
+	struct delayed_work *d_work = container_of(work, struct delayed_work, work);
+
+	if (!d_work)
+		return;
+
+	ca = container_of(d_work, struct ce_array, work);
+	if (!ca->array || ca->disabled)
+		return;
 
-	cec_mod_work(&cec_work, decay_interval);
+	if (!ca->short_period) {
+		mutex_lock(&ca->mutex);
+		do_spring_cleaning(ca);
+		mutex_unlock(&ca->mutex);
+	} else {
+		time_slots = RAS_CEC_NUM_TIME_SLOTS;
+		spin_lock_irqsave(&ca->spin_lock, flags);
+		ca->time_slot = (ca->time_slot + 1) % RAS_CEC_NUM_TIME_SLOTS;
+
+		for (i = 0; i < ca->n; i++) {
+			if (ca->array[i] & ELEM_STATUS_BIT)
+				continue;
+
+			/* clear old errors count approximately by subtracting the avg count
+			 * from the total errors count.
+			 */
+			avg_count = (ca->array[i] >> AVG_COUNT_SHIFT) & COUNT_MASK;
+			ca->array[i] -= avg_count;
+		}
+
+		if (ca->time_slot) {
+			spin_unlock_irqrestore(&ca->spin_lock, flags);
+			goto exit;
+		}
+
+		for (i = 0; i < ca->n; i++) {
+			if (ca->array[i] & ELEM_STATUS_BIT)
+				continue;
+
+			/* calculate average error count for the completed time period */
+			avg_count = COUNT(ca->array[i]) / RAS_CEC_NUM_TIME_SLOTS;
+			ca->array[i] -= (COUNT(ca->array[i]) % RAS_CEC_NUM_TIME_SLOTS);
+			/* store average error count */
+			ca->array[i] &= ~(COUNT_MASK << AVG_COUNT_SHIFT);
+			ca->array[i] |= (avg_count << AVG_COUNT_SHIFT);
+		}
+
+		do_spring_cleaning(ca);
+		spin_unlock_irqrestore(&ca->spin_lock, flags);
+	}
+
+exit:
+	cec_mod_work(&ca->work, decay_interval/time_slots);
 }
 
 /*
@@ -279,9 +372,9 @@ static u64 __maybe_unused del_lru_elem(void)
 	if (!ca->n)
 		return 0;
 
-	mutex_lock(&ce_mutex);
+	mutex_lock(&ca->mutex);
 	pfn = del_lru_elem_unlocked(ca);
-	mutex_unlock(&ce_mutex);
+	mutex_unlock(&ca->mutex);
 
 	return pfn;
 }
@@ -328,7 +421,7 @@ static int cec_add_elem(u64 pfn)
 	if (!ce_arr.array || ce_arr.disabled)
 		return -ENODEV;
 
-	mutex_lock(&ce_mutex);
+	mutex_lock(&ca->mutex);
 
 	ca->ces_entered++;
 
@@ -386,7 +479,7 @@ static int cec_add_elem(u64 pfn)
 	WARN_ON_ONCE(sanity_check(ca));
 
 unlock:
-	mutex_unlock(&ce_mutex);
+	mutex_unlock(&ca->mutex);
 
 	return ret;
 }
@@ -420,7 +513,7 @@ static int decay_interval_set(void *data, u64 val)
 	*(u64 *)data   = val;
 	decay_interval = val;
 
-	cec_mod_work(&cec_work, decay_interval);
+	cec_mod_work(&ce_arr.work, decay_interval);
 
 	return 0;
 }
@@ -446,7 +539,7 @@ static int array_dump(struct seq_file *m, void *v)
 	struct ce_array *ca = &ce_arr;
 	int i;
 
-	mutex_lock(&ce_mutex);
+	mutex_lock(&ca->mutex);
 
 	seq_printf(m, "{ n: %d\n", ca->n);
 	for (i = 0; i < ca->n; i++) {
@@ -468,7 +561,7 @@ static int array_dump(struct seq_file *m, void *v)
 
 	seq_printf(m, "Action threshold: %lld\n", action_threshold);
 
-	mutex_unlock(&ce_mutex);
+	mutex_unlock(&ca->mutex);
 
 	return 0;
 }
@@ -583,9 +676,9 @@ static void __init cec_init(void)
 
 #if defined(CONFIG_X86_MCE)
 	ce_arr.id_shift = PAGE_SHIFT;
-	INIT_DELAYED_WORK(&cec_work, cec_work_fn);
-	schedule_delayed_work(&cec_work, CEC_DECAY_DEFAULT_INTERVAL);
-
+	mutex_init(&ce_arr.mutex);
+	INIT_DELAYED_WORK(&ce_arr.work, cec_work_fn);
+	schedule_delayed_work(&ce_arr.work, CEC_DECAY_DEFAULT_INTERVAL);
 	mce_register_decode_chain(&cec_nb);
 #endif
 

From patchwork Fri Oct  2 12:22:34 2020
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Shiju Jose <shiju.jose@huawei.com>
X-Patchwork-Id: 313988
Delivered-To: patch@linaro.org
Received: by 2002:a05:6e02:1081:0:0:0:0 with SMTP id r1csp1416239ilj;
 Fri, 2 Oct 2020 05:31:27 -0700 (PDT)
X-Google-Smtp-Source: ABdhPJwu8cMpqRpaIys5Vi4+kI2ackRUmmUGI1PVCDZHQ9mCY655/N8E0Tw0HP8u426ZFXfk0WGr
X-Received: by 2002:a50:d64f:: with SMTP id c15mr2019216edj.248.1601641886812; 
 Fri, 02 Oct 2020 05:31:26 -0700 (PDT)
ARC-Seal: i=1; a=rsa-sha256; t=1601641886; cv=none;
 d=google.com; s=arc-20160816;
 b=j/NJL7zMUVBAX7wnSHVdInH/sP5bNdbe92GeDv6C+31PA8wTv4HYgiiHQ1SnBiZ+JO
 CZFFZz70V1Kb3gDnHVEHJFKV2TcTWadbykw+9lPX4qEAk927184CdvdeKRLfVltmD+To
 l8Na3nNltEBPxLrpsVg4dAg+2Pm/oBYcCdQ2R9e7micGzVE05vP6fs4qdGfb2pm3fBNj
 MNEyILFrNd6PZa9boMLENpuUUJsKpGAW/IthGEKgmNpKbJ9kz5pNJIZZxyf4MEFS8mDj
 nSbEgrujZ+YSaIKaEws96HXmAFV9XRxrL9deGtmc8xusL5hoTqK2AG5ncJ72E/0wEv7C
 qezQ==
ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com;
 s=arc-20160816; 
 h=list-id:precedence:content-transfer-encoding:mime-version
 :references:in-reply-to:message-id:date:subject:cc:to:from;
 bh=v87pQUlepHvyxBhvahWapd3XfQgAbDNn3b+ROKkMl7g=;
 b=vJKEXKlrgaunsiIk+GmYF3SM9ZN1Tc3oVdoPLWJeIcNyl/4PwM7ME0A8N/G6+qLi1x
 w/E5ZSGlqE4ZDVtqtvB0zBWdJJlzT6sqD995vQuMjBC8fSSNDBB1FBCnW6b9y6mFrPIK
 yRG8fU0ihkvhpZoYCO3l1qNwc1eTjYdVGPr6n/oJ0M4jj/6JOQolI8MOLoCqnDslyLf3
 Ih4qGvWYj+McGCoESXc6pQ8ci7p3MxX7ZCeqAhLWuAYOKBxyd6DYRaCCo0uoW8+ze79Q
 cIXl12frPvOfTUVL9Zg+WoDcttdV1WPrfOf1cm5zcJLhXuvLRm3zfyh9+pQNDuLMpFHN
 JqBw==
ARC-Authentication-Results: i=1; mx.google.com;
 spf=pass (google.com: domain of linux-acpi-owner@vger.kernel.org
 designates 23.128.96.18 as permitted sender)
 smtp.mailfrom=linux-acpi-owner@vger.kernel.org
Return-Path: <linux-acpi-owner@vger.kernel.org>
Received: from vger.kernel.org (vger.kernel.org. [23.128.96.18])
 by mx.google.com with ESMTP id g7si925667edu.307.2020.10.02.05.31.26; 
 Fri, 02 Oct 2020 05:31:26 -0700 (PDT)
Received-SPF: pass (google.com: domain of linux-acpi-owner@vger.kernel.org
 designates 23.128.96.18 as permitted sender)
 client-ip=23.128.96.18; 
Authentication-Results: mx.google.com;
 spf=pass (google.com: domain of linux-acpi-owner@vger.kernel.org
 designates 23.128.96.18 as permitted sender)
 smtp.mailfrom=linux-acpi-owner@vger.kernel.org
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
 id S2387856AbgJBMbW (ORCPT <rfc822;patch@linaro.org> + 7 others);
 Fri, 2 Oct 2020 08:31:22 -0400
Received: from lhrrgout.huawei.com ([185.176.76.210]:2948 "EHLO huawei.com"
 rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP
 id S1726090AbgJBMbV (ORCPT <rfc822;linux-acpi@vger.kernel.org>);
 Fri, 2 Oct 2020 08:31:21 -0400
Received: from lhreml715-chm.china.huawei.com (unknown [172.18.7.106])
 by Forcepoint Email with ESMTP id 4821A485866C963EC62F;
 Fri,  2 Oct 2020 13:31:20 +0100 (IST)
Received: from DESKTOP-6T4S3DQ.china.huawei.com (10.47.84.119) by
 lhreml715-chm.china.huawei.com (10.201.108.66) with Microsoft SMTP
 Server
 (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id
 15.1.1913.5; Fri, 2 Oct 2020 13:31:19 +0100
From: Shiju Jose <shiju.jose@huawei.com>
To: <linux-edac@vger.kernel.org>, <linux-acpi@vger.kernel.org>,
 <linux-kernel@vger.kernel.org>, <bp@alien8.de>,
 <tony.luck@intel.com>, <rjw@rjwysocki.net>, <james.morse@arm.com>,
 <lenb@kernel.org>
CC: <linuxarm@huawei.com>, <shiju.jose@huawei.com>
Subject: [RFC PATCH 6/7] RAS/CEC: Add CPU Correctable Error Collector to
 isolate an erroneous CPU core
Date: Fri, 2 Oct 2020 13:22:34 +0100
Message-ID: <20201002122235.1280-7-shiju.jose@huawei.com>
X-Mailer: git-send-email 2.26.0.windows.1
In-Reply-To: <20201002122235.1280-1-shiju.jose@huawei.com>
References: <20201002122235.1280-1-shiju.jose@huawei.com>
MIME-Version: 1.0
X-Originating-IP: [10.47.84.119]
X-ClientProxiedBy: lhreml720-chm.china.huawei.com (10.201.108.71) To
 lhreml715-chm.china.huawei.com (10.201.108.66)
X-CFilter-Loop: Reflected
Precedence: bulk
List-ID: <linux-acpi.vger.kernel.org>
X-Mailing-List: linux-acpi@vger.kernel.org

When CPU correctable errors, for example L1/L2 cache errors, are
reported on an ARM64 CPU core too often, the core should be isolated.
Add the CPU correctable error collector to store the CPU correctable
error count.

When the correctable error count for a CPU exceeds the threshold
value within a short time period, the collector tries to isolate the CPU core.

If disabling the entire CPU core is not acceptable, please suggest a
method to disable the L1 and L2 caches on an ARM64 core.

Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
---
 arch/arm64/ras/Kconfig |  17 +++
 drivers/ras/Kconfig    |   1 +
 drivers/ras/cec.c      | 231 +++++++++++++++++++++++++++++++++++++++--
 include/linux/ras.h    |   9 ++
 4 files changed, 247 insertions(+), 11 deletions(-)
 create mode 100644 arch/arm64/ras/Kconfig

-- 
2.17.1

diff --git a/arch/arm64/ras/Kconfig b/arch/arm64/ras/Kconfig
new file mode 100644
index 000000000000..bfa14157cd2e
--- /dev/null
+++ b/arch/arm64/ras/Kconfig
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: GPL-2.0
+config RAS_CEC
+        bool "Correctable Errors Collector"
+        depends on ARM64 && HOTPLUG_CPU && DEBUG_FS
+        help
+          This is a small cache which collects correctable CPU errors and
+          counts their repeated occurrence. Once the counter for a CPU
+          overflows in a short time period, we try to offline that CPU
+          as we take it to mean that it has reached a relatively high error
+          count and would probably be best if we don't use it anymore.
+
+          Presently, CPU error correction is enabled for the ARM64 platform only.
+
+config RAS_CEC_DEBUG
+        bool "CEC debugging machinery"
+        default n
+        depends on RAS_CEC
diff --git a/drivers/ras/Kconfig b/drivers/ras/Kconfig
index c2a236f2e846..d2f877e5f7ad 100644
--- a/drivers/ras/Kconfig
+++ b/drivers/ras/Kconfig
@@ -32,5 +32,6 @@ menuconfig RAS
 if RAS
 
 source "arch/x86/ras/Kconfig"
+source "arch/arm64/ras/Kconfig"
 
 endif
diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index ca52917d514c..408bf2ac2461 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -7,6 +7,8 @@
 #include <linux/ras.h>
 #include <linux/kernel.h>
 #include <linux/workqueue.h>
+#include <linux/cpu.h>
+#include <linux/slab.h>
 
 #if defined(CONFIG_X86_MCE)
 #include <asm/mce.h>
@@ -143,7 +145,7 @@ static struct ce_array {
 		};
 		__u32 flags;
 	};
-} ce_arr;
+} ce_arr, cpu_ce_arr;
 
 static u64 dfs_pfn;
 
@@ -156,6 +158,8 @@ static u64 action_threshold = COUNT_MASK;
 #define CEC_DECAY_MAX_INTERVAL	   30 *	24 * 60 * 60	/* one month */
 static u64 decay_interval = CEC_DECAY_DEFAULT_INTERVAL;
 
+static const char * const bins[] = { "00", "01", "10", "11" };
+
 /* Definitions for elements (for example CPU) for which
  * error count on shrot time period is checked with threshold.
  *
@@ -484,6 +488,172 @@ static int cec_add_elem(u64 pfn)
 	return ret;
 }
 
+struct cec_elem_offline {	/* deferred request to offline one element */
+	struct work_struct work;	/* runs cec_cpu_offline_work_fn() */
+	struct ce_array *ca;		/* array holding the element */
+	int array_index;		/* element's index in ca->array when queued */
+	int elem_id;			/* element id; the CPU number for CPUs */
+};
+
+/*
+ * Work function to offline a CPU; offlining has to be done in process
+ * context. Always frees the request allocated by cec_cpu_add_elem().
+ */
+static void cec_cpu_offline_work_fn(struct work_struct *work)
+{
+	struct cec_elem_offline *elem;
+	int rc, cpu;
+
+	elem = container_of(work, struct cec_elem_offline, work);
+	cpu = elem->elem_id;
+
+	/* Nothing to do if the CPU went offline in the meantime. */
+	if (!cpu_online(cpu))
+		goto out;
+
+	rc = remove_cpu(cpu);
+	if (rc)
+		pr_warn("Failed to offline CPU%d, error %d\n", cpu, rc);
+	else	/* NOTE(review): unlocked write, array_index may be stale */
+		elem->ca->array[elem->array_index] |= ELEM_STATUS_BIT;
+
+out:
+	/* The request is owned by this work item, free it on every path. */
+	kfree(elem);
+}
+
+/*
+ * Account @ce_count correctable errors to @cpu. Returns 1 once the action
+ * threshold is reached, -ENODEV if not collecting for @cpu, 0 otherwise.
+ */
+int cec_cpu_add_elem(int cpu, u64 ce_count)
+{
+	struct ce_array *ca = &cpu_ce_arr;
+	struct cec_elem_offline *elem;
+	unsigned long flags;
+	unsigned int to = 0;
+	int count, ret = 0;
+
+	/* Bail out if not initialized yet, disabled, or the CPU is offline. */
+	if (!ca->array || ca->disabled || !cpu_online(cpu))
+		return -ENODEV;
+
+	spin_lock_irqsave(&ca->spin_lock, flags);
+
+	ca->ces_entered++;
+
+	/* Keep ret at 0: find_elem()'s result must not become our return. */
+	if (find_elem(ca, cpu, &to) < 0) {
+		/* Shift range [to-end] to make room for one more element. */
+		memmove((void *)&ca->array[to + 1], (void *)&ca->array[to],
+			(ca->n - to) * sizeof(u64));
+
+		ca->array[to] = (u64)cpu << ca->id_shift;
+		ca->n++;
+	}
+
+	/*
+	 * Error received for a CPU which was previously offlined by the CEC
+	 * and brought online again later: reset its status and count.
+	 */
+	if (ca->array[to] & ELEM_STATUS_BIT) {
+		ca->array[to] &= ~(ELEM_STATUS_BIT);
+		ca->array[to] &= ~(COUNT_MASK);
+	}
+
+	/* Add/refresh element generation and increment count */
+	ca->array[to] |= DECAY_MASK << COUNT_BITS;
+	ca->array[to] += ce_count;
+
+	/* Check action threshold and offline, if reached. */
+	count = COUNT(ca->array[to]);
+	if (count >= action_threshold) {
+		if (!cpu_online(cpu)) {
+			pr_warn("CEC: Invalid cpu: %d\n", cpu);
+		} else {
+			/* We have reached max count for this cpu, offline it. */
+			ca->elems_poisoned++;
+			/* schedule work function to offline the cpu */
+			elem = kmalloc(sizeof(*elem), GFP_NOWAIT);
+			if (elem) {
+				pr_info("CEC: offlining cpu: %d\n", cpu);
+				elem->ca = ca;
+				elem->array_index = to;
+				elem->elem_id = cpu;
+				INIT_WORK(&elem->work, cec_cpu_offline_work_fn);
+				schedule_work(&elem->work);
+			} else {
+				pr_warn("CEC: offlining cpu: out of memory %d\n", cpu);
+			}
+		}
+
+		/*
+		 * Return a >0 value to callers, to denote that we've reached
+		 * the offlining threshold.
+		 */
+		ret = 1;
+		goto unlock;
+	}
+
+	ca->decay_count++;
+
+	/*
+	 * Do we need to call spring cleaning for the modules (e.g. CPU) with
+	 * a small number of elements?
+	 */
+	if (ca->decay_count >= (num_present_cpus() >> DECAY_BITS))
+		do_spring_cleaning(ca);
+
+	WARN_ON_ONCE(sanity_check(ca));
+
+unlock:
+	spin_unlock_irqrestore(&ca->spin_lock, flags);
+
+	return ret;
+}
+
+/* debugfs "cpu_stats": dump the CPU CE array and the collector statistics. */
+static int cec_cpu_stats_show(struct seq_file *seq, void *v)
+{
+	struct ce_array *ca = &cpu_ce_arr;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&ca->spin_lock, flags);
+	seq_puts(seq, "CEC CPU Stats:\n");
+
+	seq_printf(seq, "{ n: %d\n", ca->n);
+	for (i = 0; i < ca->n; i++) {
+		int cpu = ELEM_NO(ca->array[i], ca->id_shift);
+
+		seq_printf(seq, "cpu=%d: %03llx\n", cpu, ca->array[i]);
+
+		seq_printf(seq, " %3d: [%d|%s|%03lld|%s]\n",
+			   i, cpu, bins[DECAY(ca->array[i])],
+			   COUNT(ca->array[i]),
+			   cpu_online(cpu) ? "online" :
+			   (ca->array[i] & ELEM_STATUS_BIT) ?
+			   "offlined-by-cec" : "offline");
+	}
+
+	seq_puts(seq, "}\n");
+
+	seq_printf(seq, "Stats:\nCEs: %llu\nofflined CPUs: %llu\n",
+		   ca->ces_entered, ca->elems_poisoned);
+
+	seq_printf(seq, "Flags: 0x%x\n", ca->flags);
+
+	seq_printf(seq, "Decay interval: %lld seconds\n", decay_interval);
+	seq_printf(seq, "Decays: %lld\n", ca->decays_done);
+
+	seq_printf(seq, "Action threshold: %lld\n", action_threshold);
+
+	spin_unlock_irqrestore(&ca->spin_lock, flags);
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(cec_cpu_stats);
+
 static int u64_get(void *data, u64 *val)
 {
 	*val = *(u64 *)data;
@@ -514,6 +684,7 @@ static int decay_interval_set(void *data, u64 val)
 	decay_interval = val;
 
 	cec_mod_work(&ce_arr.work, decay_interval);
+	cec_mod_work(&cpu_ce_arr.work, decay_interval/RAS_CEC_NUM_TIME_SLOTS);
 
 	return 0;
 }
@@ -532,8 +703,6 @@ static int action_threshold_set(void *data, u64 val)
 }
 DEFINE_DEBUGFS_ATTRIBUTE(action_threshold_ops, u64_get, action_threshold_set, "%lld\n");
 
-static const char * const bins[] = { "00", "01", "10", "11" };
-
 static int array_dump(struct seq_file *m, void *v)
 {
 	struct ce_array *ca = &ce_arr;
@@ -620,6 +789,14 @@ static int __init create_debugfs_nodes(void)
 	}
 #endif
 
+#if defined(CONFIG_ARM64)
+	array = debugfs_create_file("cpu_stats", 0400, d, NULL, &cec_cpu_stats_fops);
+	if (!array) {
+		pr_warn("Error creating cpu_stats debugfs node!\n");
+		goto err;
+	}
+#endif
+
 	return 0;
 
 err:
@@ -658,21 +835,26 @@ static struct notifier_block cec_nb = {
 
 static void __init cec_init(void)
 {
-	if (ce_arr.disabled)
+	if (ce_arr.disabled && cpu_ce_arr.disabled)
 		return;
 
 #if defined(CONFIG_X86_MCE)
 	ce_arr.array = (void *)get_zeroed_page(GFP_KERNEL);
 	if (!ce_arr.array) {
 		pr_err("Error allocating CE array page!\n");
-		return;
+		goto error;
 	}
 #endif
 
-	if (create_debugfs_nodes()) {
-		free_page((unsigned long)ce_arr.array);
-		return;
-	}
+#if defined(CONFIG_ARM64)
+	cpu_ce_arr.array = kcalloc(num_present_cpus(), sizeof(*(cpu_ce_arr.array)),
+				   GFP_KERNEL);
+	if (!cpu_ce_arr.array)
+		goto error;
+#endif
+
+	if (create_debugfs_nodes())
+		goto error;
 
 #if defined(CONFIG_X86_MCE)
 	ce_arr.id_shift = PAGE_SHIFT;
@@ -682,22 +864,49 @@ static void __init cec_init(void)
 	mce_register_decode_chain(&cec_nb);
 #endif
 
+#if defined(CONFIG_ARM64)
+	cpu_ce_arr.short_period = true;
+	cpu_ce_arr.id_shift = ELEM_ID_SHIFT;
+	spin_lock_init(&cpu_ce_arr.spin_lock);
+	INIT_DELAYED_WORK(&cpu_ce_arr.work, cec_work_fn);
+	schedule_delayed_work(&cpu_ce_arr.work, CEC_DECAY_DEFAULT_INTERVAL/RAS_CEC_NUM_TIME_SLOTS);
+#endif
+
 	pr_info("Correctable Errors collector initialized.\n");
+	return;
+error:
+#if defined(CONFIG_ARM64)
+	kfree(cpu_ce_arr.array);
+#endif
+	if (ce_arr.array)
+		free_page((unsigned long)ce_arr.array);
+
 }
 late_initcall(cec_init);
 
 int __init parse_cec_param(char *str)
 {
+	bool match = false;
+
 	if (!str)
 		return 0;
 
 	if (*str == '=')
 		str++;
 
-	if (!strcmp(str, "cec_disable"))
+	if (!strcmp(str, "cec_disable")) {
 		ce_arr.disabled = 1;
+		match = true;
+	}
+
+	if (!strcmp(str, "cec_cpu_disable")) {
+		cpu_ce_arr.disabled = 1;
+		match = true;
+	}
+
+	if (match)
+		return 1;
 	else
 		return 0;
 
-	return 1;
 }
diff --git a/include/linux/ras.h b/include/linux/ras.h
index 1f4048bf2674..43d91298f1e3 100644
--- a/include/linux/ras.h
+++ b/include/linux/ras.h
@@ -18,6 +18,15 @@ static inline int ras_add_daemon_trace(void) { return 0; }
 
 #ifdef CONFIG_RAS_CEC
 int __init parse_cec_param(char *str);
+/**
+ * cec_cpu_add_elem - add the count of CPU correctable errors to the
+ * CEC(correctable errors collector).
+ * @cpu: CPU index.
+ * @ce_count: CPU correctable errors count.
+ */
+int cec_cpu_add_elem(int cpu, u64 ce_count);
+#else
+static inline int cec_cpu_add_elem(int cpu, u64 ce_count) { return -ENODEV; }
 #endif
 
 #ifdef CONFIG_RAS

From patchwork Fri Oct  2 12:22:35 2020
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Shiju Jose <shiju.jose@huawei.com>
X-Patchwork-Id: 266966
Return-Path: <SRS0=bNam=DJ=vger.kernel.org=linux-acpi-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
 aws-us-west-2-korg-lkml-1.web.codeaurora.org
X-Spam-Level: 
X-Spam-Status: No, score=-12.7 required=3.0 tests=BAYES_00,
 HEADER_FROM_DIFFERENT_DOMAINS, INCLUDES_PATCH, MAILING_LIST_MULTI,
 SIGNED_OFF_BY, SPF_HELO_NONE, SPF_PASS, URIBL_BLOCKED,
 USER_AGENT_GIT autolearn=unavailable
 autolearn_force=no version=3.4.0
Received: from mail.kernel.org (mail.kernel.org [198.145.29.99])
 by smtp.lore.kernel.org (Postfix) with ESMTP id 496D1C4727D
 for <linux-acpi@archiver.kernel.org>;
 Fri,  2 Oct 2020 12:31:57 +0000 (UTC)
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
 by mail.kernel.org (Postfix) with ESMTP id 174BD20719
 for <linux-acpi@archiver.kernel.org>;
 Fri,  2 Oct 2020 12:31:57 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
 id S2387767AbgJBMbz (ORCPT <rfc822; linux-acpi@archiver.kernel.org>); 
 Fri, 2 Oct 2020 08:31:55 -0400
Received: from lhrrgout.huawei.com ([185.176.76.210]:2949 "EHLO huawei.com"
 rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP
 id S1726090AbgJBMbz (ORCPT <rfc822;linux-acpi@vger.kernel.org>);
 Fri, 2 Oct 2020 08:31:55 -0400
Received: from lhreml715-chm.china.huawei.com (unknown [172.18.7.106])
 by Forcepoint Email with ESMTP id 6DD1FBE0408B33BD3372;
 Fri,  2 Oct 2020 13:31:54 +0100 (IST)
Received: from DESKTOP-6T4S3DQ.china.huawei.com (10.47.84.119) by
 lhreml715-chm.china.huawei.com (10.201.108.66) with Microsoft SMTP
 Server
 (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id
 15.1.1913.5; Fri, 2 Oct 2020 13:31:53 +0100
From: Shiju Jose <shiju.jose@huawei.com>
To: <linux-edac@vger.kernel.org>, <linux-acpi@vger.kernel.org>,
 <linux-kernel@vger.kernel.org>, <bp@alien8.de>,
 <tony.luck@intel.com>, <rjw@rjwysocki.net>, <james.morse@arm.com>,
 <lenb@kernel.org>
CC: <linuxarm@huawei.com>, <shiju.jose@huawei.com>
Subject: [RFC PATCH 7/7] ACPI / APEI: Add reporting ARM64 CPU correctable
 errors to the CEC
Date: Fri, 2 Oct 2020 13:22:35 +0100
Message-ID: <20201002122235.1280-8-shiju.jose@huawei.com>
X-Mailer: git-send-email 2.26.0.windows.1
In-Reply-To: <20201002122235.1280-1-shiju.jose@huawei.com>
References: <20201002122235.1280-1-shiju.jose@huawei.com>
MIME-Version: 1.0
X-Originating-IP: [10.47.84.119]
X-ClientProxiedBy: lhreml720-chm.china.huawei.com (10.201.108.71) To
 lhreml715-chm.china.huawei.com (10.201.108.66)
X-CFilter-Loop: Reflected
Precedence: bulk
List-ID: <linux-acpi.vger.kernel.org>
X-Mailing-List: linux-acpi@vger.kernel.org

Report ARM64 CPU correctable errors to the RAS correctable errors
collector (CEC).

ARM processor error types are cache/TLB/bus errors.
Should any of the above error types be excluded from error
collection and CPU core isolation?

Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
---
 drivers/acpi/apei/ghes.c | 36 +++++++++++++++++++++++++++++++++---
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 81bf71b10d44..3cecb457d352 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -511,6 +511,38 @@ static void ghes_handle_aer(struct acpi_hest_generic_data *gdata)
 #endif
 }
 
+static void ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata)
+{
+	struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
+	struct cper_arm_err_info *err_info;
+	int sec_sev;
+	int cpu, i, ret;
+
+	log_arm_hw_error(err);
+
+	sec_sev = ghes_severity(gdata->error_severity);
+	if (sec_sev != GHES_SEV_CORRECTED)
+		return;
+
+#if defined(CONFIG_ARM64)
+	cpu = get_logical_index(err->mpidr);
+	if (cpu == -EINVAL)
+		return;
+
+	/* ARM processor error types are cache/tlb/bus errors.
+	 * Any of the above error types should not be consider for the
+	 * error collection and CPU core isolation?
+	 */
+	err_info = (struct cper_arm_err_info *)(err + 1);
+	for (i = 0; i < err->err_info_num; i++) {
+		ret = cec_cpu_add_elem(cpu, err_info->multiple_error + 1);
+		if (ret)
+			break;
+		err_info += 1;
+	}
+#endif
+}
+
 static bool ghes_do_proc(struct ghes *ghes,
 			 const struct acpi_hest_generic_status *estatus)
 {
@@ -543,9 +575,7 @@ static bool ghes_do_proc(struct ghes *ghes,
 			ghes_handle_aer(gdata);
 		}
 		else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
-			struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
-
-			log_arm_hw_error(err);
+			ghes_handle_arm_hw_error(gdata);
 		} else {
 			void *err = acpi_hest_get_payload(gdata);