diff mbox series

[v11,1/2] ACPI / APEI: Add a notifier chain for unknown (vendor) CPER records

Message ID 20200622120527.690-2-shiju.jose@huawei.com
State Superseded
Headers show
Series ACPI / APEI: Add support to notify the vendor specific HW errors | expand

Commit Message

Shiju Jose June 22, 2020, 12:05 p.m. UTC
CPER records describing a firmware-first error are identified by GUID.
The ghes driver currently logs, but ignores any unknown CPER records.
This prevents describing errors that can't be represented by a standard
entry, that would otherwise allow a driver to recover from an error.
The UEFI spec calls these 'Non-standard Section Body' (N.2.3 of
version 2.8).

Add a notifier chain for these non-standard/vendor-records. Callers
must identify their type of records by GUID.

Record data is copied to memory from the ghes_estatus_pool to allow
us to keep it until after the notifier has run.

Signed-off-by: Shiju Jose <shiju.jose@huawei.com>

[ Removed kfifo and ghes_gdata_pool. Expanded commit message ]
Signed-off-by: James Morse <james.morse@arm.com>

---
 drivers/acpi/apei/ghes.c | 63 ++++++++++++++++++++++++++++++++++++++++
 include/acpi/ghes.h      | 27 +++++++++++++++++
 2 files changed, 90 insertions(+)

-- 
2.17.1

Comments

Shiju Jose July 13, 2020, 8:35 a.m. UTC | #1
Hi Rafael, Hi James,

Could you help merge this patch? I have added and tested all of the suggestions from James.

Thanks,
Shiju

>-----Original Message-----

>From: linux-pci-owner@vger.kernel.org [mailto:linux-pci-

>owner@vger.kernel.org] On Behalf Of Shiju Jose

>Sent: 22 June 2020 13:05

>To: linux-acpi@vger.kernel.org; linux-pci@vger.kernel.org; linux-

>kernel@vger.kernel.org; rjw@rjwysocki.net; helgaas@kernel.org;

>bp@alien8.de; james.morse@arm.com; lenb@kernel.org;

>tony.luck@intel.com; dan.carpenter@oracle.com;

>zhangliguang@linux.alibaba.com; andriy.shevchenko@linux.intel.com;

>Wangkefeng (OS Kernel Lab) <wangkefeng.wang@huawei.com>;

>jroedel@suse.de

>Cc: Linuxarm <linuxarm@huawei.com>; yangyicong

><yangyicong@huawei.com>; Jonathan Cameron

><jonathan.cameron@huawei.com>; tanxiaofei <tanxiaofei@huawei.com>

>Subject: [PATCH v11 1/2] ACPI / APEI: Add a notifier chain for unknown

>(vendor) CPER records

>

>CPER records describing a firmware-first error are identified by GUID.

>The ghes driver currently logs, but ignores any unknown CPER records.

>This prevents describing errors that can't be represented by a standard entry,

>that would otherwise allow a driver to recover from an error.

>The UEFI spec calls these 'Non-standard Section Body' (N.2.3 of version 2.8).

>

>Add a notifier chain for these non-standard/vendor-records. Callers must

>identify their type of records by GUID.

>

>Record data is copied to memory from the ghes_estatus_pool to allow us to

>keep it until after the notifier has run.

>

>Signed-off-by: Shiju Jose <shiju.jose@huawei.com> [ Removed kfifo and

>ghes_gdata_pool. Expanded commit message ]

>Signed-off-by: James Morse <james.morse@arm.com>

>---

> drivers/acpi/apei/ghes.c | 63

>++++++++++++++++++++++++++++++++++++++++

> include/acpi/ghes.h      | 27 +++++++++++++++++

> 2 files changed, 90 insertions(+)

>

>diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index

>81bf71b10d44..99df00f64306 100644

>--- a/drivers/acpi/apei/ghes.c

>+++ b/drivers/acpi/apei/ghes.c

>@@ -79,6 +79,12 @@

> 	((struct acpi_hest_generic_status *)				\

> 	 ((struct ghes_estatus_node *)(estatus_node) + 1))

>

>+#define GHES_VENDOR_ENTRY_LEN(gdata_len)                               \

>+	(sizeof(struct ghes_vendor_record_entry) + (gdata_len))

>+#define GHES_GDATA_FROM_VENDOR_ENTRY(vendor_entry)                     \

>+	((struct acpi_hest_generic_data *)                              \

>+	((struct ghes_vendor_record_entry *)(vendor_entry) + 1))

>+

> /*

>  *  NMI-like notifications vary by architecture, before the compiler can prune

>  *  unused static functions it needs a value for these enums.

>@@ -123,6 +129,12 @@ static DEFINE_MUTEX(ghes_list_mutex);

>  */

> static DEFINE_SPINLOCK(ghes_notify_lock_irq);

>

>+struct ghes_vendor_record_entry {

>+	struct work_struct work;

>+	int error_severity;

>+	char vendor_record[];

>+};

>+

> static struct gen_pool *ghes_estatus_pool;  static unsigned long

>ghes_estatus_pool_size_request;

>

>@@ -511,6 +523,56 @@ static void ghes_handle_aer(struct

>acpi_hest_generic_data *gdata)  #endif  }

>

>+static BLOCKING_NOTIFIER_HEAD(vendor_record_notify_list);

>+

>+int ghes_register_vendor_record_notifier(struct notifier_block *nb) {

>+	return blocking_notifier_chain_register(&vendor_record_notify_list,

>+nb); } EXPORT_SYMBOL_GPL(ghes_register_vendor_record_notifier);

>+

>+void ghes_unregister_vendor_record_notifier(struct notifier_block *nb)

>+{

>+	blocking_notifier_chain_unregister(&vendor_record_notify_list, nb);

>}

>+EXPORT_SYMBOL_GPL(ghes_unregister_vendor_record_notifier);

>+

>+static void ghes_vendor_record_work_func(struct work_struct *work) {

>+	struct ghes_vendor_record_entry *entry;

>+	struct acpi_hest_generic_data *gdata;

>+	u32 len;

>+

>+	entry = container_of(work, struct ghes_vendor_record_entry, work);

>+	gdata = GHES_GDATA_FROM_VENDOR_ENTRY(entry);

>+

>+	blocking_notifier_call_chain(&vendor_record_notify_list,

>+				     entry->error_severity, gdata);

>+

>+	len = GHES_VENDOR_ENTRY_LEN(acpi_hest_get_record_size(gdata));

>+	gen_pool_free(ghes_estatus_pool, (unsigned long)entry, len); }

>+

>+static void ghes_defer_non_standard_event(struct acpi_hest_generic_data

>*gdata,

>+					  int sev)

>+{

>+	struct acpi_hest_generic_data *copied_gdata;

>+	struct ghes_vendor_record_entry *entry;

>+	u32 len;

>+

>+	len = GHES_VENDOR_ENTRY_LEN(acpi_hest_get_record_size(gdata));

>+	entry = (void *)gen_pool_alloc(ghes_estatus_pool, len);

>+	if (!entry)

>+		return;

>+

>+	copied_gdata = GHES_GDATA_FROM_VENDOR_ENTRY(entry);

>+	memcpy(copied_gdata, gdata, acpi_hest_get_record_size(gdata));

>+	entry->error_severity = sev;

>+

>+	INIT_WORK(&entry->work, ghes_vendor_record_work_func);

>+	schedule_work(&entry->work);

>+}

>+

> static bool ghes_do_proc(struct ghes *ghes,

> 			 const struct acpi_hest_generic_status *estatus)  {

>@@ -549,6 +611,7 @@ static bool ghes_do_proc(struct ghes *ghes,

> 		} else {

> 			void *err = acpi_hest_get_payload(gdata);

>

>+			ghes_defer_non_standard_event(gdata, sev);

> 			log_non_standard_event(sec_type, fru_id, fru_text,

> 					       sec_sev, err,

> 					       gdata->error_data_length);

>diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h index

>517a5231cc1b..ae0e8847fdd5 100644

>--- a/include/acpi/ghes.h

>+++ b/include/acpi/ghes.h

>@@ -53,6 +53,33 @@ enum {

> 	GHES_SEV_PANIC = 0x3,

> };

>

>+#ifdef CONFIG_ACPI_APEI_GHES

>+/**

>+ * ghes_register_vendor_record_notifier - register a notifier for

>+vendor

>+ * records that the kernel would otherwise ignore.

>+ * @nb: pointer to the notifier_block structure of the event handler.

>+ *

>+ * return 0 : SUCCESS, non-zero : FAIL

>+ */

>+int ghes_register_vendor_record_notifier(struct notifier_block *nb);

>+

>+/**

>+ * ghes_unregister_vendor_record_notifier - unregister the previously

>+ * registered vendor record notifier.

>+ * @nb: pointer to the notifier_block structure of the vendor record

>handler.

>+ */

>+void ghes_unregister_vendor_record_notifier(struct notifier_block *nb);

>+#else static inline int ghes_register_vendor_record_notifier(struct

>+notifier_block *nb) {

>+	return -ENODEV;

>+}

>+

>+static inline void ghes_unregister_vendor_record_notifier(struct

>+notifier_block *nb) { } #endif

>+

> int ghes_estatus_pool_init(int num_ghes);

>

> /* From drivers/edac/ghes_edac.c */

>--

>2.17.1

>
Rafael J. Wysocki July 13, 2020, 11:17 a.m. UTC | #2
On Mon, Jul 13, 2020 at 10:35 AM Shiju Jose <shiju.jose@huawei.com> wrote:
>

> Hi Rafael, Hi James,

>

> Can you help to merge this patch because I added and tested all the suggestions from James.


I could apply the [1/2] in principle, but I need an ACK for the [2/2]
from the PCI side.

That said, it looks like [1/2] is James' patch that you are
sending with some changes made by you.

In that case the ordering of the S-o-b tags under it should be
different (the S-o-b from James, the what-you-have-changed line and the
S-o-b from you) and also the From: tag should point to James.

Thanks!
Shiju Jose July 13, 2020, 1:33 p.m. UTC | #3
Hi Rafael,

>-----Original Message-----

>From: Rafael J. Wysocki [mailto:rafael@kernel.org]

>Sent: 13 July 2020 12:18

>To: Shiju Jose <shiju.jose@huawei.com>

>Cc: linux-acpi@vger.kernel.org; linux-pci@vger.kernel.org; linux-

>kernel@vger.kernel.org; rjw@rjwysocki.net; helgaas@kernel.org;

>bp@alien8.de; james.morse@arm.com; lenb@kernel.org;

>tony.luck@intel.com; dan.carpenter@oracle.com;

>zhangliguang@linux.alibaba.com; andriy.shevchenko@linux.intel.com;

>Wangkefeng (OS Kernel Lab) <wangkefeng.wang@huawei.com>;

>jroedel@suse.de; Linuxarm <linuxarm@huawei.com>; yangyicong

><yangyicong@huawei.com>; Jonathan Cameron

><jonathan.cameron@huawei.com>; tanxiaofei <tanxiaofei@huawei.com>

>Subject: Re: [PATCH v11 1/2] ACPI / APEI: Add a notifier chain for unknown

>(vendor) CPER records

>

>On Mon, Jul 13, 2020 at 10:35 AM Shiju Jose <shiju.jose@huawei.com>

>wrote:

>>

>> Hi Rafael, Hi James,

>>

>> Can you help to merge this patch because I added and tested all the

>suggestions from James.

>

>I could apply the [1/2] in principle, but I need an ACK for the [2/2] from the

>PCI side.

>

>That said, it looks like the [1/2] is a James' patch that you are sending with

>some changes made by you.

James made the following changes on top of my original patch (v10):
[ Removed kfifo and ghes_gdata_pool. Expanded commit message ]
I was unsure how the S-o-b tag should be added for James's changes in the posted v11 patch.

>

>In that case the ordering of the S-o-b tags under it should be different (the S-

>o-b from James, the what-you-have-change line and the S-o-b from you) and

>also the From: tag should point to James.

>

>Thanks!


Thanks,
Shiju
Rafael J. Wysocki July 13, 2020, 1:38 p.m. UTC | #4
On Mon, Jul 13, 2020 at 3:33 PM Shiju Jose <shiju.jose@huawei.com> wrote:
>

> Hi Rafael,

>

> >-----Original Message-----

> >From: Rafael J. Wysocki [mailto:rafael@kernel.org]

> >Sent: 13 July 2020 12:18

> >To: Shiju Jose <shiju.jose@huawei.com>

> >Cc: linux-acpi@vger.kernel.org; linux-pci@vger.kernel.org; linux-

> >kernel@vger.kernel.org; rjw@rjwysocki.net; helgaas@kernel.org;

> >bp@alien8.de; james.morse@arm.com; lenb@kernel.org;

> >tony.luck@intel.com; dan.carpenter@oracle.com;

> >zhangliguang@linux.alibaba.com; andriy.shevchenko@linux.intel.com;

> >Wangkefeng (OS Kernel Lab) <wangkefeng.wang@huawei.com>;

> >jroedel@suse.de; Linuxarm <linuxarm@huawei.com>; yangyicong

> ><yangyicong@huawei.com>; Jonathan Cameron

> ><jonathan.cameron@huawei.com>; tanxiaofei <tanxiaofei@huawei.com>

> >Subject: Re: [PATCH v11 1/2] ACPI / APEI: Add a notifier chain for unknown

> >(vendor) CPER records

> >

> >On Mon, Jul 13, 2020 at 10:35 AM Shiju Jose <shiju.jose@huawei.com>

> >wrote:

> >>

> >> Hi Rafael, Hi James,

> >>

> >> Can you help to merge this patch because I added and tested all the

> >suggestions from James.

> >

> >I could apply the [1/2] in principle, but I need an ACK for the [2/2] from the

> >PCI side.

> >

> >That said, it looks like the [1/2] is a James' patch that you are sending with

> >some changes made by you.

> James added following changes on top of the original patch(V10) by me,

> [ Removed kfifo and ghes_gdata_pool. Expanded commit message ]

> I had confusion how the S-o-b tag to be added for James's changes in the V11 patch posted.


So James should have sent the patch with his S-o-b under it.

You cannot add S-o-b for somebody else to any patches.  You can only
add your S-o-b to somebody else's patch if you have made any changes
on top of the original.

In case you want to make a record of somebody else's contribution to
your patch, you can use the Co-developed-by tag.

Thanks!
Shiju Jose July 13, 2020, 1:50 p.m. UTC | #5
Hi Rafael,

>-----Original Message-----

>From: linux-pci-owner@vger.kernel.org [mailto:linux-pci-

>owner@vger.kernel.org] On Behalf Of Rafael J. Wysocki

>Sent: 13 July 2020 14:38

>To: Shiju Jose <shiju.jose@huawei.com>

>Cc: Rafael J. Wysocki <rafael@kernel.org>; linux-acpi@vger.kernel.org; linux-

>pci@vger.kernel.org; linux-kernel@vger.kernel.org; rjw@rjwysocki.net;

>helgaas@kernel.org; bp@alien8.de; james.morse@arm.com;

>lenb@kernel.org; tony.luck@intel.com; dan.carpenter@oracle.com;

>zhangliguang@linux.alibaba.com; andriy.shevchenko@linux.intel.com;

>Wangkefeng (OS Kernel Lab) <wangkefeng.wang@huawei.com>;

>jroedel@suse.de; Linuxarm <linuxarm@huawei.com>; yangyicong

><yangyicong@huawei.com>; Jonathan Cameron

><jonathan.cameron@huawei.com>; tanxiaofei <tanxiaofei@huawei.com>

>Subject: Re: [PATCH v11 1/2] ACPI / APEI: Add a notifier chain for unknown

>(vendor) CPER records

>

>On Mon, Jul 13, 2020 at 3:33 PM Shiju Jose <shiju.jose@huawei.com> wrote:

>>

>> Hi Rafael,

>>

>> >-----Original Message-----

>> >From: Rafael J. Wysocki [mailto:rafael@kernel.org]

>> >Sent: 13 July 2020 12:18

>> >To: Shiju Jose <shiju.jose@huawei.com>

>> >Cc: linux-acpi@vger.kernel.org; linux-pci@vger.kernel.org; linux-

>> >kernel@vger.kernel.org; rjw@rjwysocki.net; helgaas@kernel.org;

>> >bp@alien8.de; james.morse@arm.com; lenb@kernel.org;

>> >tony.luck@intel.com; dan.carpenter@oracle.com;

>> >zhangliguang@linux.alibaba.com; andriy.shevchenko@linux.intel.com;

>> >Wangkefeng (OS Kernel Lab) <wangkefeng.wang@huawei.com>;

>> >jroedel@suse.de; Linuxarm <linuxarm@huawei.com>; yangyicong

>> ><yangyicong@huawei.com>; Jonathan Cameron

>> ><jonathan.cameron@huawei.com>; tanxiaofei <tanxiaofei@huawei.com>

>> >Subject: Re: [PATCH v11 1/2] ACPI / APEI: Add a notifier chain for

>> >unknown

>> >(vendor) CPER records

>> >

>> >On Mon, Jul 13, 2020 at 10:35 AM Shiju Jose <shiju.jose@huawei.com>

>> >wrote:

>> >>

>> >> Hi Rafael, Hi James,

>> >>

>> >> Can you help to merge this patch because I added and tested all the

>> >suggestions from James.

>> >

>> >I could apply the [1/2] in principle, but I need an ACK for the [2/2]

>> >from the PCI side.

>> >

>> >That said, it looks like the [1/2] is a James' patch that you are

>> >sending with some changes made by you.

>> James added following changes on top of the original patch(V10) by me,

>> [ Removed kfifo and ghes_gdata_pool. Expanded commit message ] I had

>> confusion how the S-o-b tag to be added for James's changes in the V11

>patch posted.

>

>So James should have sent the patch with his S-o-b under it.

>

>You cannot add S-o-b for somebody else to any patches.  You can only add

>your S-o-b to somebody else's patch if you have made any changes on top of

>the original.

>

>In case you want to make a record of somebody else's contribution to your

>patch, you can use the Co-developed-by tag.


OK. I will resend the patch with the Co-developed-by tag.

>

>Thanks!


Thanks,
Shiju
diff mbox series

Patch

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 81bf71b10d44..99df00f64306 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -79,6 +79,12 @@ 
 	((struct acpi_hest_generic_status *)				\
 	 ((struct ghes_estatus_node *)(estatus_node) + 1))
 
+#define GHES_VENDOR_ENTRY_LEN(gdata_len)                               \
+	(sizeof(struct ghes_vendor_record_entry) + (gdata_len))
+#define GHES_GDATA_FROM_VENDOR_ENTRY(vendor_entry)                     \
+	((struct acpi_hest_generic_data *)                              \
+	((struct ghes_vendor_record_entry *)(vendor_entry) + 1))
+
 /*
  *  NMI-like notifications vary by architecture, before the compiler can prune
  *  unused static functions it needs a value for these enums.
@@ -123,6 +129,12 @@  static DEFINE_MUTEX(ghes_list_mutex);
  */
 static DEFINE_SPINLOCK(ghes_notify_lock_irq);
 
+struct ghes_vendor_record_entry {
+	struct work_struct work;
+	int error_severity;
+	char vendor_record[];
+};
+
 static struct gen_pool *ghes_estatus_pool;
 static unsigned long ghes_estatus_pool_size_request;
 
@@ -511,6 +523,56 @@  static void ghes_handle_aer(struct acpi_hest_generic_data *gdata)
 #endif
 }
 
+static BLOCKING_NOTIFIER_HEAD(vendor_record_notify_list);
+
+int ghes_register_vendor_record_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&vendor_record_notify_list, nb);
+}
+EXPORT_SYMBOL_GPL(ghes_register_vendor_record_notifier);
+
+void ghes_unregister_vendor_record_notifier(struct notifier_block *nb)
+{
+	blocking_notifier_chain_unregister(&vendor_record_notify_list, nb);
+}
+EXPORT_SYMBOL_GPL(ghes_unregister_vendor_record_notifier);
+
+static void ghes_vendor_record_work_func(struct work_struct *work)
+{
+	struct ghes_vendor_record_entry *entry;
+	struct acpi_hest_generic_data *gdata;
+	u32 len;
+
+	entry = container_of(work, struct ghes_vendor_record_entry, work);
+	gdata = GHES_GDATA_FROM_VENDOR_ENTRY(entry);
+
+	blocking_notifier_call_chain(&vendor_record_notify_list,
+				     entry->error_severity, gdata);
+
+	len = GHES_VENDOR_ENTRY_LEN(acpi_hest_get_record_size(gdata));
+	gen_pool_free(ghes_estatus_pool, (unsigned long)entry, len);
+}
+
+static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata,
+					  int sev)
+{
+	struct acpi_hest_generic_data *copied_gdata;
+	struct ghes_vendor_record_entry *entry;
+	u32 len;
+
+	len = GHES_VENDOR_ENTRY_LEN(acpi_hest_get_record_size(gdata));
+	entry = (void *)gen_pool_alloc(ghes_estatus_pool, len);
+	if (!entry)
+		return;
+
+	copied_gdata = GHES_GDATA_FROM_VENDOR_ENTRY(entry);
+	memcpy(copied_gdata, gdata, acpi_hest_get_record_size(gdata));
+	entry->error_severity = sev;
+
+	INIT_WORK(&entry->work, ghes_vendor_record_work_func);
+	schedule_work(&entry->work);
+}
+
 static bool ghes_do_proc(struct ghes *ghes,
 			 const struct acpi_hest_generic_status *estatus)
 {
@@ -549,6 +611,7 @@  static bool ghes_do_proc(struct ghes *ghes,
 		} else {
 			void *err = acpi_hest_get_payload(gdata);
 
+			ghes_defer_non_standard_event(gdata, sev);
 			log_non_standard_event(sec_type, fru_id, fru_text,
 					       sec_sev, err,
 					       gdata->error_data_length);
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index 517a5231cc1b..ae0e8847fdd5 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -53,6 +53,33 @@  enum {
 	GHES_SEV_PANIC = 0x3,
 };
 
+#ifdef CONFIG_ACPI_APEI_GHES
+/**
+ * ghes_register_vendor_record_notifier - register a notifier for vendor
+ * records that the kernel would otherwise ignore.
+ * @nb: pointer to the notifier_block structure of the event handler.
+ *
+ * return 0 : SUCCESS, non-zero : FAIL
+ */
+int ghes_register_vendor_record_notifier(struct notifier_block *nb);
+
+/**
+ * ghes_unregister_vendor_record_notifier - unregister the previously
+ * registered vendor record notifier.
+ * @nb: pointer to the notifier_block structure of the vendor record handler.
+ */
+void ghes_unregister_vendor_record_notifier(struct notifier_block *nb);
+#else
+static inline int ghes_register_vendor_record_notifier(struct notifier_block *nb)
+{
+	return -ENODEV;
+}
+
+static inline void ghes_unregister_vendor_record_notifier(struct notifier_block *nb)
+{
+}
+#endif
+
 int ghes_estatus_pool_init(int num_ghes);
 
 /* From drivers/edac/ghes_edac.c */