diff mbox series

[v5,9/9] cxl/pci: Register for and process CPER events

Message ID 20231220-cxl-cper-v5-9-1bb8a4ca2c7a@intel.com
State Accepted
Commit dc97f6344f205b0dfa144e1b3e16d6dc05383d57
Headers show
Series efi/cxl-cper: Report CPER CXL component events through trace events | expand

Commit Message

Ira Weiny Dec. 21, 2023, 12:17 a.m. UTC
If the firmware has configured CXL event support to be firmware first
the OS can process those events through CPER records.  The CXL layer has
unique DPA to HPA knowledge and standard event trace parsing in place.

CPER records contain Bus, Device, Function information which can be used
to identify the PCI device which is sending the event.

Change the PCI driver registration to include registration of a CXL
CPER callback to process events through the trace subsystem.

Use the new scope-based resource management to simplify the handling of
the PCI device object.

NOTE this patch depends on Dan's addition of a device guard[1].

[1] https://lore.kernel.org/all/170250854466.1522182.17555361077409628655.stgit@dwillia2-xfh.jf.intel.com/

---
Changes for v5:
[Smita/djbw: trace a generic UUID if the type is unknown]
[Jonathan: clean up pci and device state error handling]
[iweiny: consolidate the trace function]
---
 drivers/cxl/core/mbox.c   | 49 ++++++++++++++++++++++++++++-----------
 drivers/cxl/cxlmem.h      |  4 ++++
 drivers/cxl/pci.c         | 58 ++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/cxl-event.h |  1 +
 4 files changed, 98 insertions(+), 14 deletions(-)

Comments

Smita Koralahalli Jan. 2, 2024, 3:14 p.m. UTC | #1
Hi Ira,

I tested these patches. It works as expected.

Tested-by: Smita-Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
Reviewed-by: Smita-Koralahalli <Smita.KoralahalliChannabasappa@amd.com>

Since the trace support for FW-First protocol errors is missing, I
wrote a patch for it. I reused the existing registered callback,
cxl_cper_callback, making some changes to it. Please take a look and let
me know what you think. I'm not sure if it's appropriate to reuse the
existing callback or define a new one.

https://lore.kernel.org/linux-cxl/20240102150933.161009-1-Smita.KoralahalliChannabasappa@amd.com/T/#t

Thanks,
Smita

On 12/20/2023 4:17 PM, Ira Weiny wrote:
> If the firmware has configured CXL event support to be firmware first
> the OS can process those events through CPER records.  The CXL layer has
> unique DPA to HPA knowledge and standard event trace parsing in place.
> 
> CPER records contain Bus, Device, Function information which can be used
> to identify the PCI device which is sending the event.
> 
> Change the PCI driver registration to include registration of a CXL
> CPER callback to process events through the trace subsystem.
> 
> Use new scoped based management to simplify the handling of the PCI
> device object.
> 
> NOTE this patch depends on Dan's addition of a device guard[1].
> 
> [1] https://lore.kernel.org/all/170250854466.1522182.17555361077409628655.stgit@dwillia2-xfh.jf.intel.com/
> 
> ---
> Changes for v5:
> [Smita/djbw: trace a generic UUID if the type is unknown]
> [Jonathan: clean up pci and device state error handling]
> [iweiny: consolidate the trace function]
> ---
>   drivers/cxl/core/mbox.c   | 49 ++++++++++++++++++++++++++++-----------
>   drivers/cxl/cxlmem.h      |  4 ++++
>   drivers/cxl/pci.c         | 58 ++++++++++++++++++++++++++++++++++++++++++++++-
>   include/linux/cxl-event.h |  1 +
>   4 files changed, 98 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 06957696247b..b801faaccd45 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -836,21 +836,44 @@ int cxl_enumerate_cmds(struct cxl_memdev_state *mds)
>   }
>   EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL);
>   
> -static void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
> -				   enum cxl_event_log_type type,
> -				   struct cxl_event_record_raw *record)
> +void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
> +			    enum cxl_event_log_type type,
> +			    enum cxl_event_type event_type,
> +			    const uuid_t *uuid, union cxl_event *evt)
>   {
> -	union cxl_event *evt = &record->event;
> -	uuid_t *id = &record->id;
> -
> -	if (uuid_equal(id, &CXL_EVENT_GEN_MEDIA_UUID))
> +	switch (event_type) {
> +	case CXL_CPER_EVENT_GEN_MEDIA:
>   		trace_cxl_general_media(cxlmd, type, &evt->gen_media);
> -	else if (uuid_equal(id, &CXL_EVENT_DRAM_UUID))
> +		break;
> +	case CXL_CPER_EVENT_DRAM:
>   		trace_cxl_dram(cxlmd, type, &evt->dram);
> -	else if (uuid_equal(id, &CXL_EVENT_MEM_MODULE_UUID))
> +		break;
> +	case CXL_CPER_EVENT_MEM_MODULE:
>   		trace_cxl_memory_module(cxlmd, type, &evt->mem_module);
> -	else
> -		trace_cxl_generic_event(cxlmd, type, id, &evt->generic);
> +		break;
> +	case CXL_CPER_EVENT_GENERIC:
> +	default:
> +		trace_cxl_generic_event(cxlmd, type, uuid, &evt->generic);
> +		break;
> +	}
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_event_trace_record, CXL);
> +
> +static void __cxl_event_trace_record(const struct cxl_memdev *cxlmd,
> +				     enum cxl_event_log_type type,
> +				     struct cxl_event_record_raw *record)
> +{
> +	enum cxl_event_type ev_type = CXL_CPER_EVENT_GENERIC;
> +	const uuid_t *uuid = &record->id;
> +
> +	if (uuid_equal(uuid, &CXL_EVENT_GEN_MEDIA_UUID))
> +		ev_type = CXL_CPER_EVENT_GEN_MEDIA;
> +	else if (uuid_equal(uuid, &CXL_EVENT_DRAM_UUID))
> +		ev_type = CXL_CPER_EVENT_DRAM;
> +	else if (uuid_equal(uuid, &CXL_EVENT_MEM_MODULE_UUID))
> +		ev_type = CXL_CPER_EVENT_MEM_MODULE;
> +
> +	cxl_event_trace_record(cxlmd, type, ev_type, uuid, &record->event);
>   }
>   
>   static int cxl_clear_event_record(struct cxl_memdev_state *mds,
> @@ -961,8 +984,8 @@ static void cxl_mem_get_records_log(struct cxl_memdev_state *mds,
>   			break;
>   
>   		for (i = 0; i < nr_rec; i++)
> -			cxl_event_trace_record(cxlmd, type,
> -					       &payload->records[i]);
> +			__cxl_event_trace_record(cxlmd, type,
> +						 &payload->records[i]);
>   
>   		if (payload->flags & CXL_GET_EVENT_FLAG_OVERFLOW)
>   			trace_cxl_overflow(cxlmd, type, payload);
> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index e5d770e26e02..80076c235073 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -802,6 +802,10 @@ void set_exclusive_cxl_commands(struct cxl_memdev_state *mds,
>   void clear_exclusive_cxl_commands(struct cxl_memdev_state *mds,
>   				  unsigned long *cmds);
>   void cxl_mem_get_event_records(struct cxl_memdev_state *mds, u32 status);
> +void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
> +			    enum cxl_event_log_type type,
> +			    enum cxl_event_type event_type,
> +			    const uuid_t *uuid, union cxl_event *evt);
>   int cxl_set_timestamp(struct cxl_memdev_state *mds);
>   int cxl_poison_state_init(struct cxl_memdev_state *mds);
>   int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 0155fb66b580..b14237f824cf 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -1,5 +1,6 @@
>   // SPDX-License-Identifier: GPL-2.0-only
>   /* Copyright(c) 2020 Intel Corporation. All rights reserved. */
> +#include <asm-generic/unaligned.h>
>   #include <linux/io-64-nonatomic-lo-hi.h>
>   #include <linux/moduleparam.h>
>   #include <linux/module.h>
> @@ -969,6 +970,61 @@ static struct pci_driver cxl_pci_driver = {
>   	},
>   };
>   
> +#define CXL_EVENT_HDR_FLAGS_REC_SEVERITY GENMASK(1, 0)
> +static void cxl_cper_event_call(enum cxl_event_type ev_type,
> +				struct cxl_cper_event_rec *rec)
> +{
> +	struct cper_cxl_event_devid *device_id = &rec->hdr.device_id;
> +	struct pci_dev *pdev __free(pci_dev_put) = NULL;
> +	enum cxl_event_log_type log_type;
> +	struct cxl_dev_state *cxlds;
> +	unsigned int devfn;
> +	u32 hdr_flags;
> +
> +	devfn = PCI_DEVFN(device_id->device_num, device_id->func_num);
> +	pdev = pci_get_domain_bus_and_slot(device_id->segment_num,
> +					   device_id->bus_num, devfn);
> +	if (!pdev)
> +		return;
> +
> +	guard(device)(&pdev->dev);
> +	if (pdev->driver != &cxl_pci_driver)
> +		return;
> +
> +	cxlds = pci_get_drvdata(pdev);
> +	if (!cxlds)
> +		return;
> +
> +	/* Fabricate a log type */
> +	hdr_flags = get_unaligned_le24(rec->event.generic.hdr.flags);
> +	log_type = FIELD_GET(CXL_EVENT_HDR_FLAGS_REC_SEVERITY, hdr_flags);
> +
> +	cxl_event_trace_record(cxlds->cxlmd, log_type, ev_type,
> +			       &uuid_null, &rec->event);
> +}
> +
> +static int __init cxl_pci_driver_init(void)
> +{
> +	int rc;
> +
> +	rc = pci_register_driver(&cxl_pci_driver);
> +	if (rc)
> +		return rc;
> +
> +	rc = cxl_cper_register_callback(cxl_cper_event_call);
> +	if (rc)
> +		pci_unregister_driver(&cxl_pci_driver);
> +
> +	return rc;
> +}
> +
> +static void __exit cxl_pci_driver_exit(void)
> +{
> +	cxl_cper_unregister_callback(cxl_cper_event_call);
> +	pci_unregister_driver(&cxl_pci_driver);
> +}
> +
> +module_init(cxl_pci_driver_init);
> +module_exit(cxl_pci_driver_exit);
>   MODULE_LICENSE("GPL v2");
> -module_pci_driver(cxl_pci_driver);
>   MODULE_IMPORT_NS(CXL);
> diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
> index 71e3646f7569..17eadee819b6 100644
> --- a/include/linux/cxl-event.h
> +++ b/include/linux/cxl-event.h
> @@ -109,6 +109,7 @@ struct cxl_event_record_raw {
>   } __packed;
>   
>   enum cxl_event_type {
> +	CXL_CPER_EVENT_GENERIC,
>   	CXL_CPER_EVENT_GEN_MEDIA,
>   	CXL_CPER_EVENT_DRAM,
>   	CXL_CPER_EVENT_MEM_MODULE,
>
Ira Weiny Jan. 2, 2024, 8:29 p.m. UTC | #2
Smita Koralahalli wrote:
> Hi Ira,
> 
> I tested these patches. It works as expected.
> 
> Tested-by: Smita-Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> Reviewed-by: Smita-Koralahalli <Smita.KoralahalliChannabasappa@amd.com>

Thank you!

> 
> Since, the trace support for FW-First Protocol errors are missing I 
> wrote a patch for it. I reused the existing registered callback 
> cxl_cper_callback making some changes to it. Please take a look and let 
> me know what you think. I'm not sure if its appropriate to reuse the 
> existing callback or define a new one..
> 
> https://lore.kernel.org/linux-cxl/20240102150933.161009-1-Smita.KoralahalliChannabasappa@amd.com/T/#t

Awesome!  Yea I just went through it.

Thank you again for all the testing!
Ira

[snip]
Dan Williams Jan. 3, 2024, 10:08 p.m. UTC | #3
Ira Weiny wrote:
> If the firmware has configured CXL event support to be firmware first
> the OS can process those events through CPER records.  The CXL layer has
> unique DPA to HPA knowledge and standard event trace parsing in place.
> 
> CPER records contain Bus, Device, Function information which can be used
> to identify the PCI device which is sending the event.
> 
> Change the PCI driver registration to include registration of a CXL
> CPER callback to process events through the trace subsystem.
> 
> Use new scoped based management to simplify the handling of the PCI
> device object.
> 
> NOTE this patch depends on Dan's addition of a device guard[1].

Now that you added guard(pci_dev) earlier in the series you can just use
that here rather than guard(device).

[..]
> +#define CXL_EVENT_HDR_FLAGS_REC_SEVERITY GENMASK(1, 0)
> +static void cxl_cper_event_call(enum cxl_event_type ev_type,
> +				struct cxl_cper_event_rec *rec)
> +{
> +	struct cper_cxl_event_devid *device_id = &rec->hdr.device_id;
> +	struct pci_dev *pdev __free(pci_dev_put) = NULL;
> +	enum cxl_event_log_type log_type;
> +	struct cxl_dev_state *cxlds;
> +	unsigned int devfn;
> +	u32 hdr_flags;
> +
> +	devfn = PCI_DEVFN(device_id->device_num, device_id->func_num);
> +	pdev = pci_get_domain_bus_and_slot(device_id->segment_num,
> +					   device_id->bus_num, devfn);
> +	if (!pdev)
> +		return;
> +
> +	guard(device)(&pdev->dev);

Per above:
	guard(pci_dev)(pdev);

> +	if (pdev->driver != &cxl_pci_driver)
> +		return;
> +
> +	cxlds = pci_get_drvdata(pdev);
> +	if (!cxlds)
> +		return;
> +
> +	/* Fabricate a log type */
> +	hdr_flags = get_unaligned_le24(rec->event.generic.hdr.flags);
> +	log_type = FIELD_GET(CXL_EVENT_HDR_FLAGS_REC_SEVERITY, hdr_flags);
> +
> +	cxl_event_trace_record(cxlds->cxlmd, log_type, ev_type,
> +			       &uuid_null, &rec->event);
> +}
> +
> +static int __init cxl_pci_driver_init(void)
> +{
> +	int rc;
> +
> +	rc = pci_register_driver(&cxl_pci_driver);
> +	if (rc)
> +		return rc;
> +
> +	rc = cxl_cper_register_callback(cxl_cper_event_call);
> +	if (rc)
> +		pci_unregister_driver(&cxl_pci_driver);

I think this order should be flipped. That way any errors that might
arrive due to activity caused by probing have a chance to be serviced.
Any that fire while initial probing is happening will pause in
cxl_cper_event_call() and wait for probing to complete. Of course if
probing fails, all is lost, but I think there is some incremental
benefit to trying to catch those early records for things that are not
probing fatal.
Ira Weiny Jan. 4, 2024, 6:31 p.m. UTC | #4
Ira Weiny wrote:
> If the firmware has configured CXL event support to be firmware first
> the OS can process those events through CPER records.  The CXL layer has
> unique DPA to HPA knowledge and standard event trace parsing in place.
> 
> CPER records contain Bus, Device, Function information which can be used
> to identify the PCI device which is sending the event.
> 
> Change the PCI driver registration to include registration of a CXL
> CPER callback to process events through the trace subsystem.
> 
> Use new scoped based management to simplify the handling of the PCI
> device object.
> 
> NOTE this patch depends on Dan's addition of a device guard[1].
> 
> [1] https://lore.kernel.org/all/170250854466.1522182.17555361077409628655.stgit@dwillia2-xfh.jf.intel.com/

Somehow this patch lost my Signed-off-by line from v4.

Signed-off-by: Ira Weiny <ira.weiny@intel.com>

> 
> ---
> Changes for v5:
> [Smita/djbw: trace a generic UUID if the type is unknown]
> [Jonathan: clean up pci and device state error handling]
> [iweiny: consolidate the trace function]
> ---
>  drivers/cxl/core/mbox.c   | 49 ++++++++++++++++++++++++++++-----------
>  drivers/cxl/cxlmem.h      |  4 ++++
>  drivers/cxl/pci.c         | 58 ++++++++++++++++++++++++++++++++++++++++++++++-
>  include/linux/cxl-event.h |  1 +
>  4 files changed, 98 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 06957696247b..b801faaccd45 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -836,21 +836,44 @@ int cxl_enumerate_cmds(struct cxl_memdev_state *mds)
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL);
>  
> -static void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
> -				   enum cxl_event_log_type type,
> -				   struct cxl_event_record_raw *record)
> +void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
> +			    enum cxl_event_log_type type,
> +			    enum cxl_event_type event_type,
> +			    const uuid_t *uuid, union cxl_event *evt)
>  {
> -	union cxl_event *evt = &record->event;
> -	uuid_t *id = &record->id;
> -
> -	if (uuid_equal(id, &CXL_EVENT_GEN_MEDIA_UUID))
> +	switch (event_type) {
> +	case CXL_CPER_EVENT_GEN_MEDIA:
>  		trace_cxl_general_media(cxlmd, type, &evt->gen_media);
> -	else if (uuid_equal(id, &CXL_EVENT_DRAM_UUID))
> +		break;
> +	case CXL_CPER_EVENT_DRAM:
>  		trace_cxl_dram(cxlmd, type, &evt->dram);
> -	else if (uuid_equal(id, &CXL_EVENT_MEM_MODULE_UUID))
> +		break;
> +	case CXL_CPER_EVENT_MEM_MODULE:
>  		trace_cxl_memory_module(cxlmd, type, &evt->mem_module);
> -	else
> -		trace_cxl_generic_event(cxlmd, type, id, &evt->generic);
> +		break;
> +	case CXL_CPER_EVENT_GENERIC:
> +	default:
> +		trace_cxl_generic_event(cxlmd, type, uuid, &evt->generic);
> +		break;
> +	}
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_event_trace_record, CXL);
> +
> +static void __cxl_event_trace_record(const struct cxl_memdev *cxlmd,
> +				     enum cxl_event_log_type type,
> +				     struct cxl_event_record_raw *record)
> +{
> +	enum cxl_event_type ev_type = CXL_CPER_EVENT_GENERIC;
> +	const uuid_t *uuid = &record->id;
> +
> +	if (uuid_equal(uuid, &CXL_EVENT_GEN_MEDIA_UUID))
> +		ev_type = CXL_CPER_EVENT_GEN_MEDIA;
> +	else if (uuid_equal(uuid, &CXL_EVENT_DRAM_UUID))
> +		ev_type = CXL_CPER_EVENT_DRAM;
> +	else if (uuid_equal(uuid, &CXL_EVENT_MEM_MODULE_UUID))
> +		ev_type = CXL_CPER_EVENT_MEM_MODULE;
> +
> +	cxl_event_trace_record(cxlmd, type, ev_type, uuid, &record->event);
>  }
>  
>  static int cxl_clear_event_record(struct cxl_memdev_state *mds,
> @@ -961,8 +984,8 @@ static void cxl_mem_get_records_log(struct cxl_memdev_state *mds,
>  			break;
>  
>  		for (i = 0; i < nr_rec; i++)
> -			cxl_event_trace_record(cxlmd, type,
> -					       &payload->records[i]);
> +			__cxl_event_trace_record(cxlmd, type,
> +						 &payload->records[i]);
>  
>  		if (payload->flags & CXL_GET_EVENT_FLAG_OVERFLOW)
>  			trace_cxl_overflow(cxlmd, type, payload);
> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index e5d770e26e02..80076c235073 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -802,6 +802,10 @@ void set_exclusive_cxl_commands(struct cxl_memdev_state *mds,
>  void clear_exclusive_cxl_commands(struct cxl_memdev_state *mds,
>  				  unsigned long *cmds);
>  void cxl_mem_get_event_records(struct cxl_memdev_state *mds, u32 status);
> +void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
> +			    enum cxl_event_log_type type,
> +			    enum cxl_event_type event_type,
> +			    const uuid_t *uuid, union cxl_event *evt);
>  int cxl_set_timestamp(struct cxl_memdev_state *mds);
>  int cxl_poison_state_init(struct cxl_memdev_state *mds);
>  int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 0155fb66b580..b14237f824cf 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -1,5 +1,6 @@
>  // SPDX-License-Identifier: GPL-2.0-only
>  /* Copyright(c) 2020 Intel Corporation. All rights reserved. */
> +#include <asm-generic/unaligned.h>
>  #include <linux/io-64-nonatomic-lo-hi.h>
>  #include <linux/moduleparam.h>
>  #include <linux/module.h>
> @@ -969,6 +970,61 @@ static struct pci_driver cxl_pci_driver = {
>  	},
>  };
>  
> +#define CXL_EVENT_HDR_FLAGS_REC_SEVERITY GENMASK(1, 0)
> +static void cxl_cper_event_call(enum cxl_event_type ev_type,
> +				struct cxl_cper_event_rec *rec)
> +{
> +	struct cper_cxl_event_devid *device_id = &rec->hdr.device_id;
> +	struct pci_dev *pdev __free(pci_dev_put) = NULL;
> +	enum cxl_event_log_type log_type;
> +	struct cxl_dev_state *cxlds;
> +	unsigned int devfn;
> +	u32 hdr_flags;
> +
> +	devfn = PCI_DEVFN(device_id->device_num, device_id->func_num);
> +	pdev = pci_get_domain_bus_and_slot(device_id->segment_num,
> +					   device_id->bus_num, devfn);
> +	if (!pdev)
> +		return;
> +
> +	guard(device)(&pdev->dev);
> +	if (pdev->driver != &cxl_pci_driver)
> +		return;
> +
> +	cxlds = pci_get_drvdata(pdev);
> +	if (!cxlds)
> +		return;
> +
> +	/* Fabricate a log type */
> +	hdr_flags = get_unaligned_le24(rec->event.generic.hdr.flags);
> +	log_type = FIELD_GET(CXL_EVENT_HDR_FLAGS_REC_SEVERITY, hdr_flags);
> +
> +	cxl_event_trace_record(cxlds->cxlmd, log_type, ev_type,
> +			       &uuid_null, &rec->event);
> +}
> +
> +static int __init cxl_pci_driver_init(void)
> +{
> +	int rc;
> +
> +	rc = pci_register_driver(&cxl_pci_driver);
> +	if (rc)
> +		return rc;
> +
> +	rc = cxl_cper_register_callback(cxl_cper_event_call);
> +	if (rc)
> +		pci_unregister_driver(&cxl_pci_driver);
> +
> +	return rc;
> +}
> +
> +static void __exit cxl_pci_driver_exit(void)
> +{
> +	cxl_cper_unregister_callback(cxl_cper_event_call);
> +	pci_unregister_driver(&cxl_pci_driver);
> +}
> +
> +module_init(cxl_pci_driver_init);
> +module_exit(cxl_pci_driver_exit);
>  MODULE_LICENSE("GPL v2");
> -module_pci_driver(cxl_pci_driver);
>  MODULE_IMPORT_NS(CXL);
> diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
> index 71e3646f7569..17eadee819b6 100644
> --- a/include/linux/cxl-event.h
> +++ b/include/linux/cxl-event.h
> @@ -109,6 +109,7 @@ struct cxl_event_record_raw {
>  } __packed;
>  
>  enum cxl_event_type {
> +	CXL_CPER_EVENT_GENERIC,
>  	CXL_CPER_EVENT_GEN_MEDIA,
>  	CXL_CPER_EVENT_DRAM,
>  	CXL_CPER_EVENT_MEM_MODULE,
> 
> -- 
> 2.43.0
>
Jonathan Cameron Jan. 8, 2024, 1:50 p.m. UTC | #5
On Wed, 20 Dec 2023 16:17:36 -0800
Ira Weiny <ira.weiny@intel.com> wrote:

> If the firmware has configured CXL event support to be firmware first
> the OS can process those events through CPER records.  The CXL layer has
> unique DPA to HPA knowledge and standard event trace parsing in place.
> 
> CPER records contain Bus, Device, Function information which can be used
> to identify the PCI device which is sending the event.
> 
> Change the PCI driver registration to include registration of a CXL
> CPER callback to process events through the trace subsystem.
> 
> Use new scoped based management to simplify the handling of the PCI
> device object.
> 
> NOTE this patch depends on Dan's addition of a device guard[1].
> 
> [1] https://lore.kernel.org/all/170250854466.1522182.17555361077409628655.stgit@dwillia2-xfh.jf.intel.com/
> 
One trivial comment inline.
The guard change Dan suggests makes sense.  Otherwise I'm fine with this.
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>

I'll bolt in the other stuff I need to test it from QEMU this week.
I did the protocol errors first, but these are easy to add now that I
have that working.

Jonathan
> ---
> Changes for v5:
> [Smita/djbw: trace a generic UUID if the type is unknown]
> [Jonathan: clean up pci and device state error handling]
> [iweiny: consolidate the trace function]
> ---
>  drivers/cxl/core/mbox.c   | 49 ++++++++++++++++++++++++++++-----------
>  drivers/cxl/cxlmem.h      |  4 ++++
>  drivers/cxl/pci.c         | 58 ++++++++++++++++++++++++++++++++++++++++++++++-
>  include/linux/cxl-event.h |  1 +
>  4 files changed, 98 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 06957696247b..b801faaccd45 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -836,21 +836,44 @@ int cxl_enumerate_cmds(struct cxl_memdev_state *mds)
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL);
>  
> -static void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
> -				   enum cxl_event_log_type type,
> -				   struct cxl_event_record_raw *record)
> +void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
> +			    enum cxl_event_log_type type,
> +			    enum cxl_event_type event_type,
> +			    const uuid_t *uuid, union cxl_event *evt)
>  {
> -	union cxl_event *evt = &record->event;
> -	uuid_t *id = &record->id;
> -
> -	if (uuid_equal(id, &CXL_EVENT_GEN_MEDIA_UUID))
> +	switch (event_type) {
> +	case CXL_CPER_EVENT_GEN_MEDIA:
>  		trace_cxl_general_media(cxlmd, type, &evt->gen_media);
> -	else if (uuid_equal(id, &CXL_EVENT_DRAM_UUID))
> +		break;

Might as well return directly and save a reviewer having to check whether
anything else happens after the switch.

> +	case CXL_CPER_EVENT_DRAM:
>  		trace_cxl_dram(cxlmd, type, &evt->dram);
> -	else if (uuid_equal(id, &CXL_EVENT_MEM_MODULE_UUID))
> +		break;
> +	case CXL_CPER_EVENT_MEM_MODULE:
>  		trace_cxl_memory_module(cxlmd, type, &evt->mem_module);
> -	else
> -		trace_cxl_generic_event(cxlmd, type, id, &evt->generic);
> +		break;
> +	case CXL_CPER_EVENT_GENERIC:
> +	default:
> +		trace_cxl_generic_event(cxlmd, type, uuid, &evt->generic);
> +		break;
> +	}
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_event_trace_record, CXL);
Dan Williams Jan. 9, 2024, 11:59 p.m. UTC | #6
Jonathan Cameron wrote:
> On Wed, 20 Dec 2023 16:17:36 -0800
> Ira Weiny <ira.weiny@intel.com> wrote:
> 
> > If the firmware has configured CXL event support to be firmware first
> > the OS can process those events through CPER records.  The CXL layer has
> > unique DPA to HPA knowledge and standard event trace parsing in place.
> > 
> > CPER records contain Bus, Device, Function information which can be used
> > to identify the PCI device which is sending the event.
> > 
> > Change the PCI driver registration to include registration of a CXL
> > CPER callback to process events through the trace subsystem.
> > 
> > Use new scoped based management to simplify the handling of the PCI
> > device object.
> > 
> > NOTE this patch depends on Dan's addition of a device guard[1].
> > 
> > [1] https://lore.kernel.org/all/170250854466.1522182.17555361077409628655.stgit@dwillia2-xfh.jf.intel.com/
> > 
> One trivial comment inline.
> The guard change Dan suggests makes sense.  Otherwise I'm fine with this.
> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> 
> I'll bolt in the other stuff I need to test it from QEMU this week.
> Did the protocol error first, but these are easy to add now I have
> that working,
> 
> Jonathan
> > ---
> > Changes for v5:
> > [Smita/djbw: trace a generic UUID if the type is unknown]
> > [Jonathan: clean up pci and device state error handling]
> > [iweiny: consolidate the trace function]
> > ---
> >  drivers/cxl/core/mbox.c   | 49 ++++++++++++++++++++++++++++-----------
> >  drivers/cxl/cxlmem.h      |  4 ++++
> >  drivers/cxl/pci.c         | 58 ++++++++++++++++++++++++++++++++++++++++++++++-
> >  include/linux/cxl-event.h |  1 +
> >  4 files changed, 98 insertions(+), 14 deletions(-)
> > 
> > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> > index 06957696247b..b801faaccd45 100644
> > --- a/drivers/cxl/core/mbox.c
> > +++ b/drivers/cxl/core/mbox.c
> > @@ -836,21 +836,44 @@ int cxl_enumerate_cmds(struct cxl_memdev_state *mds)
> >  }
> >  EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL);
> >  
> > -static void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
> > -				   enum cxl_event_log_type type,
> > -				   struct cxl_event_record_raw *record)
> > +void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
> > +			    enum cxl_event_log_type type,
> > +			    enum cxl_event_type event_type,
> > +			    const uuid_t *uuid, union cxl_event *evt)
> >  {
> > -	union cxl_event *evt = &record->event;
> > -	uuid_t *id = &record->id;
> > -
> > -	if (uuid_equal(id, &CXL_EVENT_GEN_MEDIA_UUID))
> > +	switch (event_type) {
> > +	case CXL_CPER_EVENT_GEN_MEDIA:
> >  		trace_cxl_general_media(cxlmd, type, &evt->gen_media);
> > -	else if (uuid_equal(id, &CXL_EVENT_DRAM_UUID))
> > +		break;
> 
> Might as well return directly and save a reviewer having to check if anything else happens
> after the switch

Might as well keep it as an "if () else" tree as that's equally clear
and more compact.

That immediately then opens the concern of why the upper-level
__cxl_event_trace_record() is calling a lower-level function without the
prefix. That can be swapped later to meet common expectations, but it
feels like gymnastics to parse all the uuids *and* still pass the uuid
to the cxl_event_trace_record() helper. Yes, I see how it happens, just
not totally comfortable with the result, but not enough to hold up the
series.
diff mbox series

Patch

diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 06957696247b..b801faaccd45 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -836,21 +836,44 @@  int cxl_enumerate_cmds(struct cxl_memdev_state *mds)
 }
 EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL);
 
-static void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
-				   enum cxl_event_log_type type,
-				   struct cxl_event_record_raw *record)
+void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
+			    enum cxl_event_log_type type,
+			    enum cxl_event_type event_type,
+			    const uuid_t *uuid, union cxl_event *evt)
 {
-	union cxl_event *evt = &record->event;
-	uuid_t *id = &record->id;
-
-	if (uuid_equal(id, &CXL_EVENT_GEN_MEDIA_UUID))
+	switch (event_type) {
+	case CXL_CPER_EVENT_GEN_MEDIA:
 		trace_cxl_general_media(cxlmd, type, &evt->gen_media);
-	else if (uuid_equal(id, &CXL_EVENT_DRAM_UUID))
+		break;
+	case CXL_CPER_EVENT_DRAM:
 		trace_cxl_dram(cxlmd, type, &evt->dram);
-	else if (uuid_equal(id, &CXL_EVENT_MEM_MODULE_UUID))
+		break;
+	case CXL_CPER_EVENT_MEM_MODULE:
 		trace_cxl_memory_module(cxlmd, type, &evt->mem_module);
-	else
-		trace_cxl_generic_event(cxlmd, type, id, &evt->generic);
+		break;
+	case CXL_CPER_EVENT_GENERIC:
+	default:
+		trace_cxl_generic_event(cxlmd, type, uuid, &evt->generic);
+		break;
+	}
+}
+EXPORT_SYMBOL_NS_GPL(cxl_event_trace_record, CXL);
+
+static void __cxl_event_trace_record(const struct cxl_memdev *cxlmd,
+				     enum cxl_event_log_type type,
+				     struct cxl_event_record_raw *record)
+{
+	enum cxl_event_type ev_type = CXL_CPER_EVENT_GENERIC;
+	const uuid_t *uuid = &record->id;
+
+	if (uuid_equal(uuid, &CXL_EVENT_GEN_MEDIA_UUID))
+		ev_type = CXL_CPER_EVENT_GEN_MEDIA;
+	else if (uuid_equal(uuid, &CXL_EVENT_DRAM_UUID))
+		ev_type = CXL_CPER_EVENT_DRAM;
+	else if (uuid_equal(uuid, &CXL_EVENT_MEM_MODULE_UUID))
+		ev_type = CXL_CPER_EVENT_MEM_MODULE;
+
+	cxl_event_trace_record(cxlmd, type, ev_type, uuid, &record->event);
 }
 
 static int cxl_clear_event_record(struct cxl_memdev_state *mds,
@@ -961,8 +984,8 @@  static void cxl_mem_get_records_log(struct cxl_memdev_state *mds,
 			break;
 
 		for (i = 0; i < nr_rec; i++)
-			cxl_event_trace_record(cxlmd, type,
-					       &payload->records[i]);
+			__cxl_event_trace_record(cxlmd, type,
+						 &payload->records[i]);
 
 		if (payload->flags & CXL_GET_EVENT_FLAG_OVERFLOW)
 			trace_cxl_overflow(cxlmd, type, payload);
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index e5d770e26e02..80076c235073 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -802,6 +802,10 @@  void set_exclusive_cxl_commands(struct cxl_memdev_state *mds,
 void clear_exclusive_cxl_commands(struct cxl_memdev_state *mds,
 				  unsigned long *cmds);
 void cxl_mem_get_event_records(struct cxl_memdev_state *mds, u32 status);
+void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
+			    enum cxl_event_log_type type,
+			    enum cxl_event_type event_type,
+			    const uuid_t *uuid, union cxl_event *evt);
 int cxl_set_timestamp(struct cxl_memdev_state *mds);
 int cxl_poison_state_init(struct cxl_memdev_state *mds);
 int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 0155fb66b580..b14237f824cf 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -1,5 +1,6 @@ 
 // SPDX-License-Identifier: GPL-2.0-only
 /* Copyright(c) 2020 Intel Corporation. All rights reserved. */
+#include <asm-generic/unaligned.h>
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/moduleparam.h>
 #include <linux/module.h>
@@ -969,6 +970,61 @@  static struct pci_driver cxl_pci_driver = {
 	},
 };
 
+#define CXL_EVENT_HDR_FLAGS_REC_SEVERITY GENMASK(1, 0)
+static void cxl_cper_event_call(enum cxl_event_type ev_type,
+				struct cxl_cper_event_rec *rec)
+{
+	struct cper_cxl_event_devid *device_id = &rec->hdr.device_id;
+	struct pci_dev *pdev __free(pci_dev_put) = NULL;
+	enum cxl_event_log_type log_type;
+	struct cxl_dev_state *cxlds;
+	unsigned int devfn;
+	u32 hdr_flags;
+
+	devfn = PCI_DEVFN(device_id->device_num, device_id->func_num);
+	pdev = pci_get_domain_bus_and_slot(device_id->segment_num,
+					   device_id->bus_num, devfn);
+	if (!pdev)
+		return;
+
+	guard(device)(&pdev->dev);
+	if (pdev->driver != &cxl_pci_driver)
+		return;
+
+	cxlds = pci_get_drvdata(pdev);
+	if (!cxlds)
+		return;
+
+	/* Fabricate a log type */
+	hdr_flags = get_unaligned_le24(rec->event.generic.hdr.flags);
+	log_type = FIELD_GET(CXL_EVENT_HDR_FLAGS_REC_SEVERITY, hdr_flags);
+
+	cxl_event_trace_record(cxlds->cxlmd, log_type, ev_type,
+			       &uuid_null, &rec->event);
+}
+
+static int __init cxl_pci_driver_init(void)
+{
+	int rc;
+
+	rc = pci_register_driver(&cxl_pci_driver);
+	if (rc)
+		return rc;
+
+	rc = cxl_cper_register_callback(cxl_cper_event_call);
+	if (rc)
+		pci_unregister_driver(&cxl_pci_driver);
+
+	return rc;
+}
+
+static void __exit cxl_pci_driver_exit(void)
+{
+	cxl_cper_unregister_callback(cxl_cper_event_call);
+	pci_unregister_driver(&cxl_pci_driver);
+}
+
+module_init(cxl_pci_driver_init);
+module_exit(cxl_pci_driver_exit);
 MODULE_LICENSE("GPL v2");
-module_pci_driver(cxl_pci_driver);
 MODULE_IMPORT_NS(CXL);
diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index 71e3646f7569..17eadee819b6 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -109,6 +109,7 @@  struct cxl_event_record_raw {
 } __packed;
 
 enum cxl_event_type {
+	CXL_CPER_EVENT_GENERIC,
 	CXL_CPER_EVENT_GEN_MEDIA,
 	CXL_CPER_EVENT_DRAM,
 	CXL_CPER_EVENT_MEM_MODULE,