diff mbox series

[v13,2/4] EINJ: Add CXL error type support

Message ID 20240220221146.399209-3-Benjamin.Cheatham@amd.com
State New
Headers show
Series cxl, EINJ: Update EINJ for CXL error types | expand

Commit Message

Ben Cheatham Feb. 20, 2024, 10:11 p.m. UTC
Remove CXL protocol error types from the EINJ module and move them to
a new einj_cxl module. The einj_cxl module implements the necessary
handling for CXL protocol error injection and exposes an API for the
CXL core to use said functionality. Because the CXL error types
require special handling, only allow them to be injected through the
einj_cxl module and return an error when attempting to inject through
"regular" EINJ.

Signed-off-by: Ben Cheatham <Benjamin.Cheatham@amd.com>
---
 MAINTAINERS                       |   1 +
 drivers/acpi/apei/Kconfig         |  12 +++
 drivers/acpi/apei/Makefile        |   1 +
 drivers/acpi/apei/apei-internal.h |  17 +++++
 drivers/acpi/apei/einj-cxl.c      | 121 ++++++++++++++++++++++++++++++
 drivers/acpi/apei/einj.c          |  81 ++++++++++++++------
 include/linux/einj-cxl.h          |  40 ++++++++++
 7 files changed, 249 insertions(+), 24 deletions(-)
 create mode 100644 drivers/acpi/apei/einj-cxl.c
 create mode 100644 include/linux/einj-cxl.h

Comments

Dan Williams Feb. 21, 2024, 5:43 p.m. UTC | #1
Ben Cheatham wrote:
> Remove CXL protocol error types from the EINJ module and move them to
> a new einj_cxl module. The einj_cxl module implements the necessary
> handling for CXL protocol error injection and exposes an API for the
> CXL core to use said functionality. Because the CXL error types
> require special handling, only allow them to be injected through the
> einj_cxl module and return an error when attempting to inject through
> "regular" EINJ.

So Robustness Principle says be conservative in what you send and
liberal in what you accept. So cleaning up the reporting of CXL
capabilities over to the new interface is consistent with that
principle, but not removing the ability to inject via the legacy
interface. Especially since that has been the status quo for a few
kernel cycles is there a good reason to actively prevent usage of that
path?

> Signed-off-by: Ben Cheatham <Benjamin.Cheatham@amd.com>
> ---
>  MAINTAINERS                       |   1 +
>  drivers/acpi/apei/Kconfig         |  12 +++
>  drivers/acpi/apei/Makefile        |   1 +
>  drivers/acpi/apei/apei-internal.h |  17 +++++
>  drivers/acpi/apei/einj-cxl.c      | 121 ++++++++++++++++++++++++++++++
>  drivers/acpi/apei/einj.c          |  81 ++++++++++++++------
>  include/linux/einj-cxl.h          |  40 ++++++++++
>  7 files changed, 249 insertions(+), 24 deletions(-)
>  create mode 100644 drivers/acpi/apei/einj-cxl.c
>  create mode 100644 include/linux/einj-cxl.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 73d898383e51..51f9a0da57d7 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -5289,6 +5289,7 @@ M:	Dan Williams <dan.j.williams@intel.com>
>  L:	linux-cxl@vger.kernel.org
>  S:	Maintained
>  F:	drivers/cxl/
> +F:	include/linux/cxl-einj.h
>  F:	include/linux/cxl-event.h
>  F:	include/uapi/linux/cxl_mem.h
>  F:	tools/testing/cxl/
> diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig
> index 6b18f8bc7be3..040a9b2de235 100644
> --- a/drivers/acpi/apei/Kconfig
> +++ b/drivers/acpi/apei/Kconfig
> @@ -60,6 +60,18 @@ config ACPI_APEI_EINJ
>  	  mainly used for debugging and testing the other parts of
>  	  APEI and some other RAS features.
>  
> +config ACPI_APEI_EINJ_CXL
> +	tristate "CXL Error INJection Support"

This should still be a boolean because it is add-on functionality to the
cxl_core.ko module which has its own tristate configuration.

> +	default ACPI_APEI_EINJ
> +	depends on ACPI_APEI_EINJ

The dependency still needs to be:

    depends on ACPI_APEI_EINJ && CXL_BUS >= ACPI_APEI_EINJ

...because CXL_BUS can not tolerate being built-in when ACPI_APEI_EINJ
is not.

> +	help
> +	  Support for CXL protocol Error INJection through debugfs/cxl.
> +	  Availability and which errors are supported is dependent on
> +	  the host platform. Look to ACPI v6.5 section 18.6.4 and kernel
> +	  EINJ documentation for more information.
> +
> +	  If unsure say 'n'
> +
>  config ACPI_APEI_ERST_DEBUG
>  	tristate "APEI Error Record Serialization Table (ERST) Debug Support"
>  	depends on ACPI_APEI
> diff --git a/drivers/acpi/apei/Makefile b/drivers/acpi/apei/Makefile
> index 4dfac2128737..c18e96d342b2 100644
> --- a/drivers/acpi/apei/Makefile
> +++ b/drivers/acpi/apei/Makefile
> @@ -2,6 +2,7 @@
>  obj-$(CONFIG_ACPI_APEI)		+= apei.o
>  obj-$(CONFIG_ACPI_APEI_GHES)	+= ghes.o
>  obj-$(CONFIG_ACPI_APEI_EINJ)	+= einj.o
> +obj-$(CONFIG_ACPI_APEI_EINJ_CXL) += einj-cxl.o

No new module needed. It only needs another compilation unit optionally
added to einj.ko. Something like this:

diff --git a/drivers/acpi/apei/Makefile b/drivers/acpi/apei/Makefile
index 4dfac2128737..2c474e6477e1 100644
--- a/drivers/acpi/apei/Makefile
+++ b/drivers/acpi/apei/Makefile
@@ -2,6 +2,8 @@
 obj-$(CONFIG_ACPI_APEI)                += apei.o
 obj-$(CONFIG_ACPI_APEI_GHES)   += ghes.o
 obj-$(CONFIG_ACPI_APEI_EINJ)   += einj.o
+einj-y                         := einj-core.o
+einj-$(CONFIG_ACPI_APEI_EINJ_CXL) += einj-cxl.o
 obj-$(CONFIG_ACPI_APEI_ERST_DEBUG) += erst-dbg.o
 
 apei-y := apei-base.o hest.o erst.o bert.o
diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj-core.c
similarity index 100%
rename from drivers/acpi/apei/einj.c
rename to drivers/acpi/apei/einj-core.c

>  obj-$(CONFIG_ACPI_APEI_ERST_DEBUG) += erst-dbg.o
>  
>  apei-y := apei-base.o hest.o erst.o bert.o
> diff --git a/drivers/acpi/apei/apei-internal.h b/drivers/acpi/apei/apei-internal.h
> index 67c2c3b959e1..336408f4f293 100644
> --- a/drivers/acpi/apei/apei-internal.h
> +++ b/drivers/acpi/apei/apei-internal.h
> @@ -130,4 +130,21 @@ static inline u32 cper_estatus_len(struct acpi_hest_generic_status *estatus)
>  }
>  
>  int apei_osc_setup(void);
> +
> +int einj_get_available_error_type(u32 *type);
> +int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2, u64 param3,
> +		      u64 param4);
> +bool einj_is_initialized(void);
> +bool einj_is_cxl_error_type(u64 type);
> +int einj_validate_error_type(u64 type);
> +
> +#ifndef ACPI_EINJ_CXL_CACHE_CORRECTABLE
> +#define ACPI_EINJ_CXL_CACHE_CORRECTABLE     BIT(12)
> +#define ACPI_EINJ_CXL_CACHE_UNCORRECTABLE   BIT(13)
> +#define ACPI_EINJ_CXL_CACHE_FATAL           BIT(14)
> +#define ACPI_EINJ_CXL_MEM_CORRECTABLE       BIT(15)
> +#define ACPI_EINJ_CXL_MEM_UNCORRECTABLE     BIT(16)
> +#define ACPI_EINJ_CXL_MEM_FATAL             BIT(17)
> +#endif
> +
>  #endif
> diff --git a/drivers/acpi/apei/einj-cxl.c b/drivers/acpi/apei/einj-cxl.c
> new file mode 100644
> index 000000000000..607d4f6adb98
> --- /dev/null
> +++ b/drivers/acpi/apei/einj-cxl.c
> @@ -0,0 +1,121 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * CXL Error INJection support. Used by CXL core to inject
> + * protocol errors into CXL ports.
> + *
> + * Copyright (C) 2023 Advanced Micro Devices, Inc.
> + *
> + * Author: Ben Cheatham <benjamin.cheatham@amd.com>
> + */
> +#include <linux/einj-cxl.h>
> +#include <linux/debugfs.h>
> +
> +#include "apei-internal.h"
> +
> +static struct { u32 mask; const char *str; } const einj_cxl_error_type_string[] = {
> +	{ BIT(12), "CXL.cache Protocol Correctable" },
> +	{ BIT(13), "CXL.cache Protocol Uncorrectable non-fatal" },
> +	{ BIT(14), "CXL.cache Protocol Uncorrectable fatal" },
> +	{ BIT(15), "CXL.mem Protocol Correctable" },
> +	{ BIT(16), "CXL.mem Protocol Uncorrectable non-fatal" },
> +	{ BIT(17), "CXL.mem Protocol Uncorrectable fatal" },
> +};
> +
> +int einj_cxl_available_error_type_show(struct seq_file *m, void *v)
> +{
> +	int cxl_err, rc;
> +	u32 available_error_type = 0;
> +
> +	if (!einj_is_initialized())
> +		return -ENXIO;
> +
> +	rc = einj_get_available_error_type(&available_error_type);
> +	if (rc)
> +		return rc;
> +
> +	for (int pos = 0; pos < ARRAY_SIZE(einj_cxl_error_type_string); pos++) {
> +		cxl_err = ACPI_EINJ_CXL_CACHE_CORRECTABLE << pos;
> +
> +		if (available_error_type & cxl_err)
> +			seq_printf(m, "0x%08x\t%s\n",
> +				   einj_cxl_error_type_string[pos].mask,
> +				   einj_cxl_error_type_string[pos].str);
> +	}
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_NS_GPL(einj_cxl_available_error_type_show, CXL);
> +
> +static int cxl_dport_get_sbdf(struct pci_dev *dport_dev, u64 *sbdf)
> +{
> +	struct pci_bus *pbus;
> +	struct pci_host_bridge *bridge;
> +	u64 seg = 0, bus;
> +
> +	pbus = dport_dev->bus;
> +	bridge = pci_find_host_bridge(pbus);
> +
> +	if (!bridge)
> +		return -ENODEV;
> +
> +	if (bridge->domain_nr != PCI_DOMAIN_NR_NOT_SET)
> +		seg = bridge->domain_nr;
> +
> +	bus = pbus->number;
> +	*sbdf = (seg << 24) | (bus << 16) | dport_dev->devfn;
> +
> +	return 0;
> +}
> +
> +int einj_cxl_inject_rch_error(u64 rcrb, u64 type)
> +{
> +	int rc;
> +
> +	if (!einj_is_initialized())
> +		return -ENXIO;
> +
> +	/* Only CXL error types can be specified */
> +	if (!einj_is_cxl_error_type(type))
> +		return -EINVAL;
> +
> +	rc = einj_validate_error_type(type);
> +	if (rc)
> +		return rc;
> +
> +	return einj_error_inject(type, 0x2, rcrb, GENMASK_ULL(63, 12), 0, 0);
> +}
> +EXPORT_SYMBOL_NS_GPL(einj_cxl_inject_rch_error, CXL);
> +
> +int einj_cxl_inject_error(struct pci_dev *dport, u64 type)
> +{
> +	u64 param4 = 0;
> +	int rc;
> +
> +	if (!einj_is_initialized())
> +		return -ENXIO;
> +
> +	/* Only CXL error types can be specified */
> +	if (!einj_is_cxl_error_type(type))
> +		return -EINVAL;
> +
> +	rc = einj_validate_error_type(type);
> +	if (rc)
> +		return rc;
> +
> +	rc = cxl_dport_get_sbdf(dport, &param4);
> +	if (rc)
> +		return rc;
> +
> +	return einj_error_inject(type, 0x4, 0, 0, 0, param4);
> +}
> +EXPORT_SYMBOL_NS_GPL(einj_cxl_inject_error, CXL);
> +
> +bool einj_cxl_is_initialized(void)
> +{
> +	return einj_is_initialized();
> +}
> +EXPORT_SYMBOL_NS_GPL(einj_cxl_is_initialized, CXL);
> +
> +MODULE_AUTHOR("Ben Cheatham");
> +MODULE_DESCRIPTION("CXL Error INJection support");
> +MODULE_LICENSE("GPL");

These go away when cxl-einj.ko is no longer its own module.

> diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c
> index 6ea323b9d8ef..e76e64df97a7 100644
> --- a/drivers/acpi/apei/einj.c
> +++ b/drivers/acpi/apei/einj.c
> @@ -37,6 +37,12 @@
>  #define MEM_ERROR_MASK		(ACPI_EINJ_MEMORY_CORRECTABLE | \
>  				ACPI_EINJ_MEMORY_UNCORRECTABLE | \
>  				ACPI_EINJ_MEMORY_FATAL)
> +#define CXL_ERROR_MASK		(ACPI_EINJ_CXL_CACHE_CORRECTABLE | \
> +				ACPI_EINJ_CXL_CACHE_UNCORRECTABLE | \
> +				ACPI_EINJ_CXL_CACHE_FATAL | \
> +				ACPI_EINJ_CXL_MEM_CORRECTABLE | \
> +				ACPI_EINJ_CXL_MEM_UNCORRECTABLE | \
> +				ACPI_EINJ_CXL_MEM_FATAL)
>  
>  /*
>   * ACPI version 5 provides a SET_ERROR_TYPE_WITH_ADDRESS action.
> @@ -166,7 +172,7 @@ static int __einj_get_available_error_type(u32 *type)
>  }
>  
>  /* Get error injection capabilities of the platform */
> -static int einj_get_available_error_type(u32 *type)
> +int einj_get_available_error_type(u32 *type)
>  {
>  	int rc;
>  
> @@ -176,6 +182,7 @@ static int einj_get_available_error_type(u32 *type)
>  
>  	return rc;
>  }
> +EXPORT_SYMBOL_GPL(einj_get_available_error_type);

There should not be any need for new exports from the legacy einj.c.

>  
>  static int einj_timedout(u64 *t)
>  {
> @@ -536,8 +543,8 @@ static int __einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
>  }
>  
>  /* Inject the specified hardware error */
> -static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
> -			     u64 param3, u64 param4)
> +int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2, u64 param3,
> +		      u64 param4)
>  {
>  	int rc;
>  	u64 base_addr, size;
> @@ -560,8 +567,11 @@ static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
>  	if (type & ACPI5_VENDOR_BIT) {
>  		if (vendor_flags != SETWA_FLAGS_MEM)
>  			goto inject;
> -	} else if (!(type & MEM_ERROR_MASK) && !(flags & SETWA_FLAGS_MEM))
> +	} else if (!(type & MEM_ERROR_MASK) && !(flags & SETWA_FLAGS_MEM)) {
>  		goto inject;
> +	} else if ((type & CXL_ERROR_MASK) && (flags & SETWA_FLAGS_MEM)) {
> +		goto inject;
> +	}
>  
>  	/*
>  	 * Disallow crazy address masks that give BIOS leeway to pick
> @@ -592,6 +602,7 @@ static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
>  
>  	return rc;
>  }
> +EXPORT_SYMBOL_GPL(einj_error_inject);
>  
>  static u32 error_type;
>  static u32 error_flags;
> @@ -613,12 +624,6 @@ static struct { u32 mask; const char *str; } const einj_error_type_string[] = {
>  	{ BIT(9), "Platform Correctable" },
>  	{ BIT(10), "Platform Uncorrectable non-fatal" },
>  	{ BIT(11), "Platform Uncorrectable fatal"},
> -	{ BIT(12), "CXL.cache Protocol Correctable" },
> -	{ BIT(13), "CXL.cache Protocol Uncorrectable non-fatal" },
> -	{ BIT(14), "CXL.cache Protocol Uncorrectable fatal" },
> -	{ BIT(15), "CXL.mem Protocol Correctable" },
> -	{ BIT(16), "CXL.mem Protocol Uncorrectable non-fatal" },
> -	{ BIT(17), "CXL.mem Protocol Uncorrectable fatal" },
>  	{ BIT(31), "Vendor Defined Error Types" },
>  };
>  
> @@ -640,29 +645,21 @@ static int available_error_type_show(struct seq_file *m, void *v)
>  
>  DEFINE_SHOW_ATTRIBUTE(available_error_type);
>  
> -static int error_type_get(void *data, u64 *val)
> -{
> -	*val = error_type;
> -
> -	return 0;
> -}
> -
> -static int error_type_set(void *data, u64 val)
> +int einj_validate_error_type(u64 type)
>  {
> +	u32 tval, vendor, available_error_type = 0;
>  	int rc;
> -	u32 available_error_type = 0;
> -	u32 tval, vendor;
>  
>  	/* Only low 32 bits for error type are valid */
> -	if (val & GENMASK_ULL(63, 32))
> +	if (type & GENMASK_ULL(63, 32))
>  		return -EINVAL;
>  
>  	/*
>  	 * Vendor defined types have 0x80000000 bit set, and
>  	 * are not enumerated by ACPI_EINJ_GET_ERROR_TYPE
>  	 */
> -	vendor = val & ACPI5_VENDOR_BIT;
> -	tval = val & 0x7fffffff;
> +	vendor = type & ACPI5_VENDOR_BIT;
> +	tval = type & GENMASK(30, 0);
>  
>  	/* Only one error type can be specified */
>  	if (tval & (tval - 1))
> @@ -671,9 +668,39 @@ static int error_type_set(void *data, u64 val)
>  		rc = einj_get_available_error_type(&available_error_type);
>  		if (rc)
>  			return rc;
> -		if (!(val & available_error_type))
> +		if (!(type & available_error_type))
>  			return -EINVAL;
>  	}
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(einj_validate_error_type);
> +
> +bool einj_is_cxl_error_type(u64 type)
> +{
> +	return (type & CXL_ERROR_MASK) && (!(type & ACPI5_VENDOR_BIT));
> +}
> +EXPORT_SYMBOL_GPL(einj_is_cxl_error_type);
> +
> +static int error_type_get(void *data, u64 *val)
> +{
> +	*val = error_type;
> +
> +	return 0;
> +}
> +
> +static int error_type_set(void *data, u64 val)
> +{
> +	int rc;
> +
> +	/* CXL error types have to be injected from cxl debugfs */
> +	if (einj_is_cxl_error_type(val))
> +		return -EINVAL;
> +
> +	rc = einj_validate_error_type(val);
> +	if (rc)
> +		return rc;
> +
>  	error_type = val;
>  
>  	return 0;
> @@ -709,6 +736,12 @@ static int einj_check_table(struct acpi_table_einj *einj_tab)
>  	return 0;
>  }
>  
> +bool einj_is_initialized(void)
> +{
> +	return einj_initialized;
> +}
> +EXPORT_SYMBOL_GPL(einj_is_initialized);

The variable can be referenced directly as a global symbol.
Ben Cheatham Feb. 21, 2024, 8:27 p.m. UTC | #2
On 2/21/24 11:43 AM, Dan Williams wrote:
> Ben Cheatham wrote:
>> Remove CXL protocol error types from the EINJ module and move them to
>> a new einj_cxl module. The einj_cxl module implements the necessary
>> handling for CXL protocol error injection and exposes an API for the
>> CXL core to use said functionality. Because the CXL error types
>> require special handling, only allow them to be injected through the
>> einj_cxl module and return an error when attempting to inject through
>> "regular" EINJ.
> 
> So Robustness Principle says be conservative in what you send and
> liberal in what you accept. So cleaning up the reporting of CXL
> capabilities over to the new interface is consistent with that
> principle, but not removing the ability to inject via the legacy
> interface. Especially since that has been the status quo for a few
> kernel cycles is there a good reason to actively prevent usage of that
> path?
> 

For CXL 2.0+ ports it's fine since EINJ only expects an SBDF which is
pretty readily accessible by the user. CXL 1.1/1.0 ports however, it's a bit
of a headache. It would require the user to find the address of the RCRB
for the port and supply that to the EINJ module. I originally had this option
anyway, but I think it got shot down for being too obtuse to use (I think by
you, but it's been a while xD). If you think it's still worthwhile I can
remove the restriction for both types of ports or just the 2.0+ ports.

For CXL 1.0/1.1 ports there's also the security issue of being able to inject
to any address since the way it works is by skipping the memory address
checks, but since this is a debug module I don't think it's that big
of a deal.

>> Signed-off-by: Ben Cheatham <Benjamin.Cheatham@amd.com>
>> ---
>>  MAINTAINERS                       |   1 +
>>  drivers/acpi/apei/Kconfig         |  12 +++
>>  drivers/acpi/apei/Makefile        |   1 +
>>  drivers/acpi/apei/apei-internal.h |  17 +++++
>>  drivers/acpi/apei/einj-cxl.c      | 121 ++++++++++++++++++++++++++++++
>>  drivers/acpi/apei/einj.c          |  81 ++++++++++++++------
>>  include/linux/einj-cxl.h          |  40 ++++++++++
>>  7 files changed, 249 insertions(+), 24 deletions(-)
>>  create mode 100644 drivers/acpi/apei/einj-cxl.c
>>  create mode 100644 include/linux/einj-cxl.h
>>
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index 73d898383e51..51f9a0da57d7 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -5289,6 +5289,7 @@ M:	Dan Williams <dan.j.williams@intel.com>
>>  L:	linux-cxl@vger.kernel.org
>>  S:	Maintained
>>  F:	drivers/cxl/
>> +F:	include/linux/cxl-einj.h
>>  F:	include/linux/cxl-event.h
>>  F:	include/uapi/linux/cxl_mem.h
>>  F:	tools/testing/cxl/
>> diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig
>> index 6b18f8bc7be3..040a9b2de235 100644
>> --- a/drivers/acpi/apei/Kconfig
>> +++ b/drivers/acpi/apei/Kconfig
>> @@ -60,6 +60,18 @@ config ACPI_APEI_EINJ
>>  	  mainly used for debugging and testing the other parts of
>>  	  APEI and some other RAS features.
>>  
>> +config ACPI_APEI_EINJ_CXL
>> +	tristate "CXL Error INJection Support"
> 
> This should still be a boolean because it is add-on functionality to the
> cxl_core.ko module which has its own tristate configuration.
> 

I tried this but was running into issues, more about this in the Makefile
portion of the patch.

>> +	default ACPI_APEI_EINJ
>> +	depends on ACPI_APEI_EINJ
> 
> The dependency still needs to be:
> 
>     depends on ACPI_APEI_EINJ && CXL_BUS >= ACPI_APEI_EINJ
> 
> ...because CXL_BUS can not tolerate being built-in when ACPI_APEI_EINJ
> is not.
> 

Will do.

>> +	help
>> +	  Support for CXL protocol Error INJection through debugfs/cxl.
>> +	  Availability and which errors are supported is dependent on
>> +	  the host platform. Look to ACPI v6.5 section 18.6.4 and kernel
>> +	  EINJ documentation for more information.
>> +
>> +	  If unsure say 'n'
>> +
>>  config ACPI_APEI_ERST_DEBUG
>>  	tristate "APEI Error Record Serialization Table (ERST) Debug Support"
>>  	depends on ACPI_APEI
>> diff --git a/drivers/acpi/apei/Makefile b/drivers/acpi/apei/Makefile
>> index 4dfac2128737..c18e96d342b2 100644
>> --- a/drivers/acpi/apei/Makefile
>> +++ b/drivers/acpi/apei/Makefile
>> @@ -2,6 +2,7 @@
>>  obj-$(CONFIG_ACPI_APEI)		+= apei.o
>>  obj-$(CONFIG_ACPI_APEI_GHES)	+= ghes.o
>>  obj-$(CONFIG_ACPI_APEI_EINJ)	+= einj.o
>> +obj-$(CONFIG_ACPI_APEI_EINJ_CXL) += einj-cxl.o
> 
> No new module needed. It only needs another compilation unit optionally
> added to einj.ko. Something like this:
> 
> diff --git a/drivers/acpi/apei/Makefile b/drivers/acpi/apei/Makefile
> index 4dfac2128737..2c474e6477e1 100644
> --- a/drivers/acpi/apei/Makefile
> +++ b/drivers/acpi/apei/Makefile
> @@ -2,6 +2,8 @@
>  obj-$(CONFIG_ACPI_APEI)                += apei.o
>  obj-$(CONFIG_ACPI_APEI_GHES)   += ghes.o
>  obj-$(CONFIG_ACPI_APEI_EINJ)   += einj.o
> +einj-y                         := einj-core.o
> +einj-$(CONFIG_ACPI_APEI_EINJ_CXL) += einj-cxl.o
>  obj-$(CONFIG_ACPI_APEI_ERST_DEBUG) += erst-dbg.o
>  
>  apei-y := apei-base.o hest.o erst.o bert.o
> diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj-core.c
> similarity index 100%
> rename from drivers/acpi/apei/einj.c
> rename to drivers/acpi/apei/einj-core.c
> 

And this is what was causing my issues. I couldn't CONFIG_ACPI_APEI_EINJ_CXL work
as a boolean because I didn't put it in the right compilation unit. This should
also allow me to remove the dependency in the next patch and the exports below.

>>  obj-$(CONFIG_ACPI_APEI_ERST_DEBUG) += erst-dbg.o
>>  
>>  apei-y := apei-base.o hest.o erst.o bert.o
>> diff --git a/drivers/acpi/apei/apei-internal.h b/drivers/acpi/apei/apei-internal.h
>> index 67c2c3b959e1..336408f4f293 100644
>> --- a/drivers/acpi/apei/apei-internal.h
>> +++ b/drivers/acpi/apei/apei-internal.h
>> @@ -130,4 +130,21 @@ static inline u32 cper_estatus_len(struct acpi_hest_generic_status *estatus)
>>  }
>>  
>>  int apei_osc_setup(void);
>> +
>> +int einj_get_available_error_type(u32 *type);
>> +int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2, u64 param3,
>> +		      u64 param4);
>> +bool einj_is_initialized(void);
>> +bool einj_is_cxl_error_type(u64 type);
>> +int einj_validate_error_type(u64 type);
>> +
>> +#ifndef ACPI_EINJ_CXL_CACHE_CORRECTABLE
>> +#define ACPI_EINJ_CXL_CACHE_CORRECTABLE     BIT(12)
>> +#define ACPI_EINJ_CXL_CACHE_UNCORRECTABLE   BIT(13)
>> +#define ACPI_EINJ_CXL_CACHE_FATAL           BIT(14)
>> +#define ACPI_EINJ_CXL_MEM_CORRECTABLE       BIT(15)
>> +#define ACPI_EINJ_CXL_MEM_UNCORRECTABLE     BIT(16)
>> +#define ACPI_EINJ_CXL_MEM_FATAL             BIT(17)
>> +#endif
>> +
>>  #endif
>> diff --git a/drivers/acpi/apei/einj-cxl.c b/drivers/acpi/apei/einj-cxl.c
>> new file mode 100644
>> index 000000000000..607d4f6adb98
>> --- /dev/null
>> +++ b/drivers/acpi/apei/einj-cxl.c
>> @@ -0,0 +1,121 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +/*
>> + * CXL Error INJection support. Used by CXL core to inject
>> + * protocol errors into CXL ports.
>> + *
>> + * Copyright (C) 2023 Advanced Micro Devices, Inc.
>> + *
>> + * Author: Ben Cheatham <benjamin.cheatham@amd.com>
>> + */
>> +#include <linux/einj-cxl.h>
>> +#include <linux/debugfs.h>
>> +
>> +#include "apei-internal.h"
>> +
>> +static struct { u32 mask; const char *str; } const einj_cxl_error_type_string[] = {
>> +	{ BIT(12), "CXL.cache Protocol Correctable" },
>> +	{ BIT(13), "CXL.cache Protocol Uncorrectable non-fatal" },
>> +	{ BIT(14), "CXL.cache Protocol Uncorrectable fatal" },
>> +	{ BIT(15), "CXL.mem Protocol Correctable" },
>> +	{ BIT(16), "CXL.mem Protocol Uncorrectable non-fatal" },
>> +	{ BIT(17), "CXL.mem Protocol Uncorrectable fatal" },
>> +};
>> +
>> +int einj_cxl_available_error_type_show(struct seq_file *m, void *v)
>> +{
>> +	int cxl_err, rc;
>> +	u32 available_error_type = 0;
>> +
>> +	if (!einj_is_initialized())
>> +		return -ENXIO;
>> +
>> +	rc = einj_get_available_error_type(&available_error_type);
>> +	if (rc)
>> +		return rc;
>> +
>> +	for (int pos = 0; pos < ARRAY_SIZE(einj_cxl_error_type_string); pos++) {
>> +		cxl_err = ACPI_EINJ_CXL_CACHE_CORRECTABLE << pos;
>> +
>> +		if (available_error_type & cxl_err)
>> +			seq_printf(m, "0x%08x\t%s\n",
>> +				   einj_cxl_error_type_string[pos].mask,
>> +				   einj_cxl_error_type_string[pos].str);
>> +	}
>> +
>> +	return 0;
>> +}
>> +EXPORT_SYMBOL_NS_GPL(einj_cxl_available_error_type_show, CXL);
>> +
>> +static int cxl_dport_get_sbdf(struct pci_dev *dport_dev, u64 *sbdf)
>> +{
>> +	struct pci_bus *pbus;
>> +	struct pci_host_bridge *bridge;
>> +	u64 seg = 0, bus;
>> +
>> +	pbus = dport_dev->bus;
>> +	bridge = pci_find_host_bridge(pbus);
>> +
>> +	if (!bridge)
>> +		return -ENODEV;
>> +
>> +	if (bridge->domain_nr != PCI_DOMAIN_NR_NOT_SET)
>> +		seg = bridge->domain_nr;
>> +
>> +	bus = pbus->number;
>> +	*sbdf = (seg << 24) | (bus << 16) | dport_dev->devfn;
>> +
>> +	return 0;
>> +}
>> +
>> +int einj_cxl_inject_rch_error(u64 rcrb, u64 type)
>> +{
>> +	int rc;
>> +
>> +	if (!einj_is_initialized())
>> +		return -ENXIO;
>> +
>> +	/* Only CXL error types can be specified */
>> +	if (!einj_is_cxl_error_type(type))
>> +		return -EINVAL;
>> +
>> +	rc = einj_validate_error_type(type);
>> +	if (rc)
>> +		return rc;
>> +
>> +	return einj_error_inject(type, 0x2, rcrb, GENMASK_ULL(63, 12), 0, 0);
>> +}
>> +EXPORT_SYMBOL_NS_GPL(einj_cxl_inject_rch_error, CXL);
>> +
>> +int einj_cxl_inject_error(struct pci_dev *dport, u64 type)
>> +{
>> +	u64 param4 = 0;
>> +	int rc;
>> +
>> +	if (!einj_is_initialized())
>> +		return -ENXIO;
>> +
>> +	/* Only CXL error types can be specified */
>> +	if (!einj_is_cxl_error_type(type))
>> +		return -EINVAL;
>> +
>> +	rc = einj_validate_error_type(type);
>> +	if (rc)
>> +		return rc;
>> +
>> +	rc = cxl_dport_get_sbdf(dport, &param4);
>> +	if (rc)
>> +		return rc;
>> +
>> +	return einj_error_inject(type, 0x4, 0, 0, 0, param4);
>> +}
>> +EXPORT_SYMBOL_NS_GPL(einj_cxl_inject_error, CXL);
>> +
>> +bool einj_cxl_is_initialized(void)
>> +{
>> +	return einj_is_initialized();
>> +}
>> +EXPORT_SYMBOL_NS_GPL(einj_cxl_is_initialized, CXL);
>> +
>> +MODULE_AUTHOR("Ben Cheatham");
>> +MODULE_DESCRIPTION("CXL Error INJection support");
>> +MODULE_LICENSE("GPL");
> 
> These go away when cxl-einj.ko is no longer its own module.
> 
>> diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c
>> index 6ea323b9d8ef..e76e64df97a7 100644
>> --- a/drivers/acpi/apei/einj.c
>> +++ b/drivers/acpi/apei/einj.c
>> @@ -37,6 +37,12 @@
>>  #define MEM_ERROR_MASK		(ACPI_EINJ_MEMORY_CORRECTABLE | \
>>  				ACPI_EINJ_MEMORY_UNCORRECTABLE | \
>>  				ACPI_EINJ_MEMORY_FATAL)
>> +#define CXL_ERROR_MASK		(ACPI_EINJ_CXL_CACHE_CORRECTABLE | \
>> +				ACPI_EINJ_CXL_CACHE_UNCORRECTABLE | \
>> +				ACPI_EINJ_CXL_CACHE_FATAL | \
>> +				ACPI_EINJ_CXL_MEM_CORRECTABLE | \
>> +				ACPI_EINJ_CXL_MEM_UNCORRECTABLE | \
>> +				ACPI_EINJ_CXL_MEM_FATAL)
>>  
>>  /*
>>   * ACPI version 5 provides a SET_ERROR_TYPE_WITH_ADDRESS action.
>> @@ -166,7 +172,7 @@ static int __einj_get_available_error_type(u32 *type)
>>  }
>>  
>>  /* Get error injection capabilities of the platform */
>> -static int einj_get_available_error_type(u32 *type)
>> +int einj_get_available_error_type(u32 *type)
>>  {
>>  	int rc;
>>  
>> @@ -176,6 +182,7 @@ static int einj_get_available_error_type(u32 *type)
>>  
>>  	return rc;
>>  }
>> +EXPORT_SYMBOL_GPL(einj_get_available_error_type);
> 
> There should not be any need for new exports from the legacy einj.c.
> 
>>  
>>  static int einj_timedout(u64 *t)
>>  {
>> @@ -536,8 +543,8 @@ static int __einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
>>  }
>>  
>>  /* Inject the specified hardware error */
>> -static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
>> -			     u64 param3, u64 param4)
>> +int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2, u64 param3,
>> +		      u64 param4)
>>  {
>>  	int rc;
>>  	u64 base_addr, size;
>> @@ -560,8 +567,11 @@ static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
>>  	if (type & ACPI5_VENDOR_BIT) {
>>  		if (vendor_flags != SETWA_FLAGS_MEM)
>>  			goto inject;
>> -	} else if (!(type & MEM_ERROR_MASK) && !(flags & SETWA_FLAGS_MEM))
>> +	} else if (!(type & MEM_ERROR_MASK) && !(flags & SETWA_FLAGS_MEM)) {
>>  		goto inject;
>> +	} else if ((type & CXL_ERROR_MASK) && (flags & SETWA_FLAGS_MEM)) {
>> +		goto inject;
>> +	}
>>  
>>  	/*
>>  	 * Disallow crazy address masks that give BIOS leeway to pick
>> @@ -592,6 +602,7 @@ static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
>>  
>>  	return rc;
>>  }
>> +EXPORT_SYMBOL_GPL(einj_error_inject);
>>  
>>  static u32 error_type;
>>  static u32 error_flags;
>> @@ -613,12 +624,6 @@ static struct { u32 mask; const char *str; } const einj_error_type_string[] = {
>>  	{ BIT(9), "Platform Correctable" },
>>  	{ BIT(10), "Platform Uncorrectable non-fatal" },
>>  	{ BIT(11), "Platform Uncorrectable fatal"},
>> -	{ BIT(12), "CXL.cache Protocol Correctable" },
>> -	{ BIT(13), "CXL.cache Protocol Uncorrectable non-fatal" },
>> -	{ BIT(14), "CXL.cache Protocol Uncorrectable fatal" },
>> -	{ BIT(15), "CXL.mem Protocol Correctable" },
>> -	{ BIT(16), "CXL.mem Protocol Uncorrectable non-fatal" },
>> -	{ BIT(17), "CXL.mem Protocol Uncorrectable fatal" },
>>  	{ BIT(31), "Vendor Defined Error Types" },
>>  };
>>  
>> @@ -640,29 +645,21 @@ static int available_error_type_show(struct seq_file *m, void *v)
>>  
>>  DEFINE_SHOW_ATTRIBUTE(available_error_type);
>>  
>> -static int error_type_get(void *data, u64 *val)
>> -{
>> -	*val = error_type;
>> -
>> -	return 0;
>> -}
>> -
>> -static int error_type_set(void *data, u64 val)
>> +int einj_validate_error_type(u64 type)
>>  {
>> +	u32 tval, vendor, available_error_type = 0;
>>  	int rc;
>> -	u32 available_error_type = 0;
>> -	u32 tval, vendor;
>>  
>>  	/* Only low 32 bits for error type are valid */
>> -	if (val & GENMASK_ULL(63, 32))
>> +	if (type & GENMASK_ULL(63, 32))
>>  		return -EINVAL;
>>  
>>  	/*
>>  	 * Vendor defined types have 0x80000000 bit set, and
>>  	 * are not enumerated by ACPI_EINJ_GET_ERROR_TYPE
>>  	 */
>> -	vendor = val & ACPI5_VENDOR_BIT;
>> -	tval = val & 0x7fffffff;
>> +	vendor = type & ACPI5_VENDOR_BIT;
>> +	tval = type & GENMASK(30, 0);
>>  
>>  	/* Only one error type can be specified */
>>  	if (tval & (tval - 1))
>> @@ -671,9 +668,39 @@ static int error_type_set(void *data, u64 val)
>>  		rc = einj_get_available_error_type(&available_error_type);
>>  		if (rc)
>>  			return rc;
>> -		if (!(val & available_error_type))
>> +		if (!(type & available_error_type))
>>  			return -EINVAL;
>>  	}
>> +
>> +	return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(einj_validate_error_type);
>> +
>> +bool einj_is_cxl_error_type(u64 type)
>> +{
>> +	return (type & CXL_ERROR_MASK) && (!(type & ACPI5_VENDOR_BIT));
>> +}
>> +EXPORT_SYMBOL_GPL(einj_is_cxl_error_type);
>> +
>> +static int error_type_get(void *data, u64 *val)
>> +{
>> +	*val = error_type;
>> +
>> +	return 0;
>> +}
>> +
>> +static int error_type_set(void *data, u64 val)
>> +{
>> +	int rc;
>> +
>> +	/* CXL error types have to be injected from cxl debugfs */
>> +	if (einj_is_cxl_error_type(val))
>> +		return -EINVAL;
>> +
>> +	rc = einj_validate_error_type(val);
>> +	if (rc)
>> +		return rc;
>> +
>>  	error_type = val;
>>  
>>  	return 0;
>> @@ -709,6 +736,12 @@ static int einj_check_table(struct acpi_table_einj *einj_tab)
>>  	return 0;
>>  }
>>  
>> +bool einj_is_initialized(void)
>> +{
>> +	return einj_initialized;
>> +}
>> +EXPORT_SYMBOL_GPL(einj_is_initialized);
> 
> The variable can be referenced directly as a global symbol.
Ben Cheatham Feb. 21, 2024, 8:34 p.m. UTC | #3
On 2/21/24 2:27 PM, Ben Cheatham wrote:
> 
> 
> On 2/21/24 11:43 AM, Dan Williams wrote:
>> Ben Cheatham wrote:
>>> Remove CXL protocol error types from the EINJ module and move them to
>>> a new einj_cxl module. The einj_cxl module implements the necessary
>>> handling for CXL protocol error injection and exposes an API for the
>>> CXL core to use said functionality. Because the CXL error types
>>> require special handling, only allow them to be injected through the
>>> einj_cxl module and return an error when attempting to inject through
>>> "regular" EINJ.
>>
>> So Robustness Principle says be conservative in what you send and
>> liberal in what you accept. So cleaning up the reporting of CXL
>> capabilities over to the new interface is consistent with that
>> principle, but not removing the ability to inject via the legacy
>> interface. Especially since that has been the status quo for a few
>> kernel cycles is there a good reason to actively prevent usage of that
>> path?
>>
> 
> For CXL 2.0+ ports it's fine since EINJ only expects an SBDF which is
> pretty readily accessible by the user. CXL 1.1/1.0 ports however, it's a bit
> of a headache. It would require the user to find the address of the RCRB
> for the port and supply that to the EINJ module. I originally had this option
> anyway, but I think it got shot down for being too obtuse to use (I think by
> you, but it's been a while xD). If you think it's still worthwhile I can
> remove the restriction for both types of ports or just the 2.0+ ports.
> 
> For CXL 1.0/1.1 ports there's also the security issue of being able to inject
> to any address since the way it works is by skipping the memory address
> checks, but since this is a debug module I don't think it's that big
> of a deal.
> 
>>> Signed-off-by: Ben Cheatham <Benjamin.Cheatham@amd.com>
>>> ---
>>>  MAINTAINERS                       |   1 +
>>>  drivers/acpi/apei/Kconfig         |  12 +++
>>>  drivers/acpi/apei/Makefile        |   1 +
>>>  drivers/acpi/apei/apei-internal.h |  17 +++++
>>>  drivers/acpi/apei/einj-cxl.c      | 121 ++++++++++++++++++++++++++++++
>>>  drivers/acpi/apei/einj.c          |  81 ++++++++++++++------
>>>  include/linux/einj-cxl.h          |  40 ++++++++++
>>>  7 files changed, 249 insertions(+), 24 deletions(-)
>>>  create mode 100644 drivers/acpi/apei/einj-cxl.c
>>>  create mode 100644 include/linux/einj-cxl.h
>>>
>>> diff --git a/MAINTAINERS b/MAINTAINERS
>>> index 73d898383e51..51f9a0da57d7 100644
>>> --- a/MAINTAINERS
>>> +++ b/MAINTAINERS
>>> @@ -5289,6 +5289,7 @@ M:	Dan Williams <dan.j.williams@intel.com>
>>>  L:	linux-cxl@vger.kernel.org
>>>  S:	Maintained
>>>  F:	drivers/cxl/
>>> +F:	include/linux/cxl-einj.h
>>>  F:	include/linux/cxl-event.h
>>>  F:	include/uapi/linux/cxl_mem.h
>>>  F:	tools/testing/cxl/
>>> diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig
>>> index 6b18f8bc7be3..040a9b2de235 100644
>>> --- a/drivers/acpi/apei/Kconfig
>>> +++ b/drivers/acpi/apei/Kconfig
>>> @@ -60,6 +60,18 @@ config ACPI_APEI_EINJ
>>>  	  mainly used for debugging and testing the other parts of
>>>  	  APEI and some other RAS features.
>>>  
>>> +config ACPI_APEI_EINJ_CXL
>>> +	tristate "CXL Error INJection Support"
>>
>> This should still be a boolean because it is add-on functionality to the
>> cxl_core.ko module which has its own tristate configuration.
>>
> 
> I tried this but was running into issues, more about this in the Makefile
> portion of the patch.
> 
>>> +	default ACPI_APEI_EINJ
>>> +	depends on ACPI_APEI_EINJ
>>
>> The dependency still needs to be:
>>
>>     depends on ACPI_APEI_EINJ && CXL_BUS >= ACPI_APEI_EINJ
>>
>> ...because CXL_BUS can not tolerate being built-in when ACPI_APEI_EINJ
>> is not.
>>
> 
> Will do.
> 

Sorry, little clarifying question. Shouldn't this be CXL_BUS <= ACPI_APEI_EINJ instead?
The other way would allow CXL_BUS to be built-in while ACPI_APEI_EINJ is a module, right?

Thanks,
Ben

>>> +	help
>>> +	  Support for CXL protocol Error INJection through debugfs/cxl.
>>> +	  Availability and which errors are supported is dependent on
>>> +	  the host platform. Look to ACPI v6.5 section 18.6.4 and kernel
>>> +	  EINJ documentation for more information.
>>> +
>>> +	  If unsure say 'n'
>>> +
>>>  config ACPI_APEI_ERST_DEBUG
>>>  	tristate "APEI Error Record Serialization Table (ERST) Debug Support"
>>>  	depends on ACPI_APEI
>>> diff --git a/drivers/acpi/apei/Makefile b/drivers/acpi/apei/Makefile
>>> index 4dfac2128737..c18e96d342b2 100644
>>> --- a/drivers/acpi/apei/Makefile
>>> +++ b/drivers/acpi/apei/Makefile
>>> @@ -2,6 +2,7 @@
>>>  obj-$(CONFIG_ACPI_APEI)		+= apei.o
>>>  obj-$(CONFIG_ACPI_APEI_GHES)	+= ghes.o
>>>  obj-$(CONFIG_ACPI_APEI_EINJ)	+= einj.o
>>> +obj-$(CONFIG_ACPI_APEI_EINJ_CXL) += einj-cxl.o
>>
>> No new module needed. It only needs another compilation unit optionally
>> added to einj.ko. Something like this:
>>
>> diff --git a/drivers/acpi/apei/Makefile b/drivers/acpi/apei/Makefile
>> index 4dfac2128737..2c474e6477e1 100644
>> --- a/drivers/acpi/apei/Makefile
>> +++ b/drivers/acpi/apei/Makefile
>> @@ -2,6 +2,8 @@
>>  obj-$(CONFIG_ACPI_APEI)                += apei.o
>>  obj-$(CONFIG_ACPI_APEI_GHES)   += ghes.o
>>  obj-$(CONFIG_ACPI_APEI_EINJ)   += einj.o
>> +einj-y                         := einj-core.o
>> +einj-$(CONFIG_ACPI_APEI_EINJ_CXL) += einj-cxl.o
>>  obj-$(CONFIG_ACPI_APEI_ERST_DEBUG) += erst-dbg.o
>>  
>>  apei-y := apei-base.o hest.o erst.o bert.o
>> diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj-core.c
>> similarity index 100%
>> rename from drivers/acpi/apei/einj.c
>> rename to drivers/acpi/apei/einj-core.c
>>
> 
> And this is what was causing my issues. I couldn't CONFIG_ACPI_APEI_EINJ_CXL work
> as a boolean because I didn't put it in the right compilation unit. This should
> also allow me to remove the dependency in the next patch and the exports below.
> 
>>>  obj-$(CONFIG_ACPI_APEI_ERST_DEBUG) += erst-dbg.o
>>>  
>>>  apei-y := apei-base.o hest.o erst.o bert.o
>>> diff --git a/drivers/acpi/apei/apei-internal.h b/drivers/acpi/apei/apei-internal.h
>>> index 67c2c3b959e1..336408f4f293 100644
>>> --- a/drivers/acpi/apei/apei-internal.h
>>> +++ b/drivers/acpi/apei/apei-internal.h
>>> @@ -130,4 +130,21 @@ static inline u32 cper_estatus_len(struct acpi_hest_generic_status *estatus)
>>>  }
>>>  
>>>  int apei_osc_setup(void);
>>> +
>>> +int einj_get_available_error_type(u32 *type);
>>> +int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2, u64 param3,
>>> +		      u64 param4);
>>> +bool einj_is_initialized(void);
>>> +bool einj_is_cxl_error_type(u64 type);
>>> +int einj_validate_error_type(u64 type);
>>> +
>>> +#ifndef ACPI_EINJ_CXL_CACHE_CORRECTABLE
>>> +#define ACPI_EINJ_CXL_CACHE_CORRECTABLE     BIT(12)
>>> +#define ACPI_EINJ_CXL_CACHE_UNCORRECTABLE   BIT(13)
>>> +#define ACPI_EINJ_CXL_CACHE_FATAL           BIT(14)
>>> +#define ACPI_EINJ_CXL_MEM_CORRECTABLE       BIT(15)
>>> +#define ACPI_EINJ_CXL_MEM_UNCORRECTABLE     BIT(16)
>>> +#define ACPI_EINJ_CXL_MEM_FATAL             BIT(17)
>>> +#endif
>>> +
>>>  #endif
>>> diff --git a/drivers/acpi/apei/einj-cxl.c b/drivers/acpi/apei/einj-cxl.c
>>> new file mode 100644
>>> index 000000000000..607d4f6adb98
>>> --- /dev/null
>>> +++ b/drivers/acpi/apei/einj-cxl.c
>>> @@ -0,0 +1,121 @@
>>> +// SPDX-License-Identifier: GPL-2.0-only
>>> +/*
>>> + * CXL Error INJection support. Used by CXL core to inject
>>> + * protocol errors into CXL ports.
>>> + *
>>> + * Copyright (C) 2023 Advanced Micro Devices, Inc.
>>> + *
>>> + * Author: Ben Cheatham <benjamin.cheatham@amd.com>
>>> + */
>>> +#include <linux/einj-cxl.h>
>>> +#include <linux/debugfs.h>
>>> +
>>> +#include "apei-internal.h"
>>> +
>>> +static struct { u32 mask; const char *str; } const einj_cxl_error_type_string[] = {
>>> +	{ BIT(12), "CXL.cache Protocol Correctable" },
>>> +	{ BIT(13), "CXL.cache Protocol Uncorrectable non-fatal" },
>>> +	{ BIT(14), "CXL.cache Protocol Uncorrectable fatal" },
>>> +	{ BIT(15), "CXL.mem Protocol Correctable" },
>>> +	{ BIT(16), "CXL.mem Protocol Uncorrectable non-fatal" },
>>> +	{ BIT(17), "CXL.mem Protocol Uncorrectable fatal" },
>>> +};
>>> +
>>> +int einj_cxl_available_error_type_show(struct seq_file *m, void *v)
>>> +{
>>> +	int cxl_err, rc;
>>> +	u32 available_error_type = 0;
>>> +
>>> +	if (!einj_is_initialized())
>>> +		return -ENXIO;
>>> +
>>> +	rc = einj_get_available_error_type(&available_error_type);
>>> +	if (rc)
>>> +		return rc;
>>> +
>>> +	for (int pos = 0; pos < ARRAY_SIZE(einj_cxl_error_type_string); pos++) {
>>> +		cxl_err = ACPI_EINJ_CXL_CACHE_CORRECTABLE << pos;
>>> +
>>> +		if (available_error_type & cxl_err)
>>> +			seq_printf(m, "0x%08x\t%s\n",
>>> +				   einj_cxl_error_type_string[pos].mask,
>>> +				   einj_cxl_error_type_string[pos].str);
>>> +	}
>>> +
>>> +	return 0;
>>> +}
>>> +EXPORT_SYMBOL_NS_GPL(einj_cxl_available_error_type_show, CXL);
>>> +
>>> +static int cxl_dport_get_sbdf(struct pci_dev *dport_dev, u64 *sbdf)
>>> +{
>>> +	struct pci_bus *pbus;
>>> +	struct pci_host_bridge *bridge;
>>> +	u64 seg = 0, bus;
>>> +
>>> +	pbus = dport_dev->bus;
>>> +	bridge = pci_find_host_bridge(pbus);
>>> +
>>> +	if (!bridge)
>>> +		return -ENODEV;
>>> +
>>> +	if (bridge->domain_nr != PCI_DOMAIN_NR_NOT_SET)
>>> +		seg = bridge->domain_nr;
>>> +
>>> +	bus = pbus->number;
>>> +	*sbdf = (seg << 24) | (bus << 16) | dport_dev->devfn;
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +int einj_cxl_inject_rch_error(u64 rcrb, u64 type)
>>> +{
>>> +	int rc;
>>> +
>>> +	if (!einj_is_initialized())
>>> +		return -ENXIO;
>>> +
>>> +	/* Only CXL error types can be specified */
>>> +	if (!einj_is_cxl_error_type(type))
>>> +		return -EINVAL;
>>> +
>>> +	rc = einj_validate_error_type(type);
>>> +	if (rc)
>>> +		return rc;
>>> +
>>> +	return einj_error_inject(type, 0x2, rcrb, GENMASK_ULL(63, 12), 0, 0);
>>> +}
>>> +EXPORT_SYMBOL_NS_GPL(einj_cxl_inject_rch_error, CXL);
>>> +
>>> +int einj_cxl_inject_error(struct pci_dev *dport, u64 type)
>>> +{
>>> +	u64 param4 = 0;
>>> +	int rc;
>>> +
>>> +	if (!einj_is_initialized())
>>> +		return -ENXIO;
>>> +
>>> +	/* Only CXL error types can be specified */
>>> +	if (!einj_is_cxl_error_type(type))
>>> +		return -EINVAL;
>>> +
>>> +	rc = einj_validate_error_type(type);
>>> +	if (rc)
>>> +		return rc;
>>> +
>>> +	rc = cxl_dport_get_sbdf(dport, &param4);
>>> +	if (rc)
>>> +		return rc;
>>> +
>>> +	return einj_error_inject(type, 0x4, 0, 0, 0, param4);
>>> +}
>>> +EXPORT_SYMBOL_NS_GPL(einj_cxl_inject_error, CXL);
>>> +
>>> +bool einj_cxl_is_initialized(void)
>>> +{
>>> +	return einj_is_initialized();
>>> +}
>>> +EXPORT_SYMBOL_NS_GPL(einj_cxl_is_initialized, CXL);
>>> +
>>> +MODULE_AUTHOR("Ben Cheatham");
>>> +MODULE_DESCRIPTION("CXL Error INJection support");
>>> +MODULE_LICENSE("GPL");
>>
>> These go away when cxl-einj.ko is no longer its own module.
>>
>>> diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c
>>> index 6ea323b9d8ef..e76e64df97a7 100644
>>> --- a/drivers/acpi/apei/einj.c
>>> +++ b/drivers/acpi/apei/einj.c
>>> @@ -37,6 +37,12 @@
>>>  #define MEM_ERROR_MASK		(ACPI_EINJ_MEMORY_CORRECTABLE | \
>>>  				ACPI_EINJ_MEMORY_UNCORRECTABLE | \
>>>  				ACPI_EINJ_MEMORY_FATAL)
>>> +#define CXL_ERROR_MASK		(ACPI_EINJ_CXL_CACHE_CORRECTABLE | \
>>> +				ACPI_EINJ_CXL_CACHE_UNCORRECTABLE | \
>>> +				ACPI_EINJ_CXL_CACHE_FATAL | \
>>> +				ACPI_EINJ_CXL_MEM_CORRECTABLE | \
>>> +				ACPI_EINJ_CXL_MEM_UNCORRECTABLE | \
>>> +				ACPI_EINJ_CXL_MEM_FATAL)
>>>  
>>>  /*
>>>   * ACPI version 5 provides a SET_ERROR_TYPE_WITH_ADDRESS action.
>>> @@ -166,7 +172,7 @@ static int __einj_get_available_error_type(u32 *type)
>>>  }
>>>  
>>>  /* Get error injection capabilities of the platform */
>>> -static int einj_get_available_error_type(u32 *type)
>>> +int einj_get_available_error_type(u32 *type)
>>>  {
>>>  	int rc;
>>>  
>>> @@ -176,6 +182,7 @@ static int einj_get_available_error_type(u32 *type)
>>>  
>>>  	return rc;
>>>  }
>>> +EXPORT_SYMBOL_GPL(einj_get_available_error_type);
>>
>> There should not be any need for new exports from the legacy einj.c.
>>
>>>  
>>>  static int einj_timedout(u64 *t)
>>>  {
>>> @@ -536,8 +543,8 @@ static int __einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
>>>  }
>>>  
>>>  /* Inject the specified hardware error */
>>> -static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
>>> -			     u64 param3, u64 param4)
>>> +int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2, u64 param3,
>>> +		      u64 param4)
>>>  {
>>>  	int rc;
>>>  	u64 base_addr, size;
>>> @@ -560,8 +567,11 @@ static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
>>>  	if (type & ACPI5_VENDOR_BIT) {
>>>  		if (vendor_flags != SETWA_FLAGS_MEM)
>>>  			goto inject;
>>> -	} else if (!(type & MEM_ERROR_MASK) && !(flags & SETWA_FLAGS_MEM))
>>> +	} else if (!(type & MEM_ERROR_MASK) && !(flags & SETWA_FLAGS_MEM)) {
>>>  		goto inject;
>>> +	} else if ((type & CXL_ERROR_MASK) && (flags & SETWA_FLAGS_MEM)) {
>>> +		goto inject;
>>> +	}
>>>  
>>>  	/*
>>>  	 * Disallow crazy address masks that give BIOS leeway to pick
>>> @@ -592,6 +602,7 @@ static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
>>>  
>>>  	return rc;
>>>  }
>>> +EXPORT_SYMBOL_GPL(einj_error_inject);
>>>  
>>>  static u32 error_type;
>>>  static u32 error_flags;
>>> @@ -613,12 +624,6 @@ static struct { u32 mask; const char *str; } const einj_error_type_string[] = {
>>>  	{ BIT(9), "Platform Correctable" },
>>>  	{ BIT(10), "Platform Uncorrectable non-fatal" },
>>>  	{ BIT(11), "Platform Uncorrectable fatal"},
>>> -	{ BIT(12), "CXL.cache Protocol Correctable" },
>>> -	{ BIT(13), "CXL.cache Protocol Uncorrectable non-fatal" },
>>> -	{ BIT(14), "CXL.cache Protocol Uncorrectable fatal" },
>>> -	{ BIT(15), "CXL.mem Protocol Correctable" },
>>> -	{ BIT(16), "CXL.mem Protocol Uncorrectable non-fatal" },
>>> -	{ BIT(17), "CXL.mem Protocol Uncorrectable fatal" },
>>>  	{ BIT(31), "Vendor Defined Error Types" },
>>>  };
>>>  
>>> @@ -640,29 +645,21 @@ static int available_error_type_show(struct seq_file *m, void *v)
>>>  
>>>  DEFINE_SHOW_ATTRIBUTE(available_error_type);
>>>  
>>> -static int error_type_get(void *data, u64 *val)
>>> -{
>>> -	*val = error_type;
>>> -
>>> -	return 0;
>>> -}
>>> -
>>> -static int error_type_set(void *data, u64 val)
>>> +int einj_validate_error_type(u64 type)
>>>  {
>>> +	u32 tval, vendor, available_error_type = 0;
>>>  	int rc;
>>> -	u32 available_error_type = 0;
>>> -	u32 tval, vendor;
>>>  
>>>  	/* Only low 32 bits for error type are valid */
>>> -	if (val & GENMASK_ULL(63, 32))
>>> +	if (type & GENMASK_ULL(63, 32))
>>>  		return -EINVAL;
>>>  
>>>  	/*
>>>  	 * Vendor defined types have 0x80000000 bit set, and
>>>  	 * are not enumerated by ACPI_EINJ_GET_ERROR_TYPE
>>>  	 */
>>> -	vendor = val & ACPI5_VENDOR_BIT;
>>> -	tval = val & 0x7fffffff;
>>> +	vendor = type & ACPI5_VENDOR_BIT;
>>> +	tval = type & GENMASK(30, 0);
>>>  
>>>  	/* Only one error type can be specified */
>>>  	if (tval & (tval - 1))
>>> @@ -671,9 +668,39 @@ static int error_type_set(void *data, u64 val)
>>>  		rc = einj_get_available_error_type(&available_error_type);
>>>  		if (rc)
>>>  			return rc;
>>> -		if (!(val & available_error_type))
>>> +		if (!(type & available_error_type))
>>>  			return -EINVAL;
>>>  	}
>>> +
>>> +	return 0;
>>> +}
>>> +EXPORT_SYMBOL_GPL(einj_validate_error_type);
>>> +
>>> +bool einj_is_cxl_error_type(u64 type)
>>> +{
>>> +	return (type & CXL_ERROR_MASK) && (!(type & ACPI5_VENDOR_BIT));
>>> +}
>>> +EXPORT_SYMBOL_GPL(einj_is_cxl_error_type);
>>> +
>>> +static int error_type_get(void *data, u64 *val)
>>> +{
>>> +	*val = error_type;
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +static int error_type_set(void *data, u64 val)
>>> +{
>>> +	int rc;
>>> +
>>> +	/* CXL error types have to be injected from cxl debugfs */
>>> +	if (einj_is_cxl_error_type(val))
>>> +		return -EINVAL;
>>> +
>>> +	rc = einj_validate_error_type(val);
>>> +	if (rc)
>>> +		return rc;
>>> +
>>>  	error_type = val;
>>>  
>>>  	return 0;
>>> @@ -709,6 +736,12 @@ static int einj_check_table(struct acpi_table_einj *einj_tab)
>>>  	return 0;
>>>  }
>>>  
>>> +bool einj_is_initialized(void)
>>> +{
>>> +	return einj_initialized;
>>> +}
>>> +EXPORT_SYMBOL_GPL(einj_is_initialized);
>>
>> The variable can be referenced directly as a global symbol.
>
Dan Williams Feb. 21, 2024, 8:41 p.m. UTC | #4
Ben Cheatham wrote:
> 
> 
> On 2/21/24 11:43 AM, Dan Williams wrote:
> > Ben Cheatham wrote:
> >> Remove CXL protocol error types from the EINJ module and move them to
> >> a new einj_cxl module. The einj_cxl module implements the necessary
> >> handling for CXL protocol error injection and exposes an API for the
> >> CXL core to use said functionality. Because the CXL error types
> >> require special handling, only allow them to be injected through the
> >> einj_cxl module and return an error when attempting to inject through
> >> "regular" EINJ.
> > 
> > So Robustness Principle says be conservative in what you send and
> > liberal in what you accept. So cleaning up the reporting of CXL
> > capabilities over to the new interface is consistent with that
> > principle, but not removing the ability to inject via the legacy
> > interface. Especially since that has been the status quo for a few
> > kernel cycles is there a good reason to actively prevent usage of that
> > path?
> > 
> 
> For CXL 2.0+ ports it's fine since EINJ only expects an SBDF which is
> pretty readily accessible by the user. CXL 1.1/1.0 ports however, it's a bit
> of a headache. It would require the user to find the address of the RCRB
> for the port and supply that to the EINJ module. I originally had this option
> anyway, but I think it got shot down for being too obtuse to use (I think by
> you, but it's been a while xD). If you think it's still worthwhile I can
> remove the restriction for both types of ports or just the 2.0+ ports.

So, to be clear, yes I NAKd that being the primary interface, and I am
not changing my mind on it being too obtuse to inflict on end users.
However, just because it is too obtuse to be the primary interface does
not mean that if someone runs that gauntlet, or has expert knowledge of
where RCRB is located, that they be tripped up at the last moment from
specifying that parameter via the legacy path.

So consider it an undocumented feature, and I think it only ends up
being a one line change to let that parameter through, if it can be done
safely.

That said, if there is any concern about input validation and safety
then keep the policy as you have it.

> For CXL 1.0/1.1 ports there's also the security issue of being able to inject
> to any address since the way it works is by skipping the memory address
> checks, but since this is a debug module I don't think it's that big
> of a deal.

Say more here, this seems like just the safety issue I just mentioned
that would justify forcing folks through the CXL interface. Depending on
how severe this is it might also require backporting the removal of CXL
injection from older kernels.

The main concern would be whether einj needs to disabled when kernel
lockdown is enabled.
Ben Cheatham Feb. 21, 2024, 9 p.m. UTC | #5
On 2/21/24 2:41 PM, Dan Williams wrote:
> Ben Cheatham wrote:
>>
>>
>> On 2/21/24 11:43 AM, Dan Williams wrote:
>>> Ben Cheatham wrote:
>>>> Remove CXL protocol error types from the EINJ module and move them to
>>>> a new einj_cxl module. The einj_cxl module implements the necessary
>>>> handling for CXL protocol error injection and exposes an API for the
>>>> CXL core to use said functionality. Because the CXL error types
>>>> require special handling, only allow them to be injected through the
>>>> einj_cxl module and return an error when attempting to inject through
>>>> "regular" EINJ.
>>>
>>> So Robustness Principle says be conservative in what you send and
>>> liberal in what you accept. So cleaning up the reporting of CXL
>>> capabilities over to the new interface is consistent with that
>>> principle, but not removing the ability to inject via the legacy
>>> interface. Especially since that has been the status quo for a few
>>> kernel cycles is there a good reason to actively prevent usage of that
>>> path?
>>>
>>
>> For CXL 2.0+ ports it's fine since EINJ only expects an SBDF which is
>> pretty readily accessible by the user. CXL 1.1/1.0 ports however, it's a bit
>> of a headache. It would require the user to find the address of the RCRB
>> for the port and supply that to the EINJ module. I originally had this option
>> anyway, but I think it got shot down for being too obtuse to use (I think by
>> you, but it's been a while xD). If you think it's still worthwhile I can
>> remove the restriction for both types of ports or just the 2.0+ ports.
> 
> So, to be clear, yes I NAKd that being the primary interface, and I am
> not changing my mind on it being too obtuse to inflict on end users.
> However, just because it is too obtuse to be the primary interface does
> not mean that if someone runs that gauntlet, or has expert knowledge of
> where RCRB is located, that they be tripped up at the last moment from
> specifying that parameter via the legacy path.
> 
> So consider it an undocumented feature, and I think it only ends up
> being a one line change to let that parameter through, if it can be done
> safely.
> 
> That said, if there is any concern about input validation and safety
> then keep the policy as you have it.
> 
>> For CXL 1.0/1.1 ports there's also the security issue of being able to inject
>> to any address since the way it works is by skipping the memory address
>> checks, but since this is a debug module I don't think it's that big
>> of a deal.
> 
> Say more here, this seems like just the safety issue I just mentioned
> that would justify forcing folks through the CXL interface. Depending on
> how severe this is it might also require backporting the removal of CXL
> injection from older kernels.
> 
> The main concern would be whether einj needs to disabled when kernel
> lockdown is enabled.

So the way the EINJ module currently works (at least as I understand it)
is that any address supplied for memory errors is checked to make sure it's
a "normal" memory address. Looking at the comment above the memory checks:

	/*
	 * Disallow crazy address masks that give BIOS leeway to pick
	 * injection address almost anywhere. Insist on page or
	 * better granularity and that target address is normal RAM or
	 * NVDIMM.
	 */

it seems that's the case. What this means is that we can't supply the
RCRB of a CXL 1.0/1.1 port because it's an MMIO address and we have to disable
the checks to inject a CXL 1.0/1.1 error. So, there won't have to be anything
done in terms of backporting since CXL error injections will get caught by this
filter in all kernels that don't have these patches. For kernels that do have
these patches though, there won't be a check on the address you supply as long
as you specify a CXL error type.

So, it seems like a bad idea to provide legacy access for CXL 1.0/1.1 ports
due to this issue. CXL 2.0+ ports don't suffer from this issue though since
they are specified by an SBDF, not a MMIO address. And looking at the code,
it looks like EINJ error types that use an SBDF already get through the
filter. So it doesn't look like there's actually anything to be done for
CXL 2.0+ ports and the new interface is just a convenience feature for 2.0+
ports.

Thanks,
Ben
Dan Williams Feb. 21, 2024, 10:05 p.m. UTC | #6
Ben Cheatham wrote:
[..]
> So, it seems like a bad idea to provide legacy access for CXL 1.0/1.1 ports
> due to this issue. CXL 2.0+ ports don't suffer from this issue though since
> they are specified by an SBDF, not a MMIO address. And looking at the code,
> it looks like EINJ error types that use an SBDF already get through the
> filter. So it doesn't look like there's actually anything to be done for
> CXL 2.0+ ports and the new interface is just a convenience feature for 2.0+
> ports.

Ok, I am on board. Continue the status quo of blocking CXL 1.1 error
injection, and let CXL 2.0 error injection be accomplished through
either interface.
kernel test robot Feb. 22, 2024, 7:49 a.m. UTC | #7
Hi Ben,

kernel test robot noticed the following build errors:

[auto build test ERROR on rafael-pm/linux-next]
[also build test ERROR on linus/master v6.8-rc5 next-20240221]
[cannot apply to rafael-pm/acpi-bus rafael-pm/devprop]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Ben-Cheatham/EINJ-Migrate-to-a-platform-driver/20240221-061359
base:   https://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git linux-next
patch link:    https://lore.kernel.org/r/20240220221146.399209-3-Benjamin.Cheatham%40amd.com
patch subject: [PATCH v13 2/4] EINJ: Add CXL error type support
config: i386-buildonly-randconfig-004-20240221 (https://download.01.org/0day-ci/archive/20240222/202402221559.7ZWbaUpL-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240222/202402221559.7ZWbaUpL-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202402221559.7ZWbaUpL-lkp@intel.com/

All errors (new ones prefixed by >>):

   ld: drivers/acpi/apei/einj-cxl.o: in function `einj_cxl_inject_error':
>> einj-cxl.c:(.text+0x166): undefined reference to `pci_find_host_bridge'
Davidlohr Bueso Feb. 23, 2024, 1:13 a.m. UTC | #8
On Wed, 21 Feb 2024, Ben Cheatham wrote:

>So the way the EINJ module currently works (at least as I understand it)
>is that any address supplied for memory errors is checked to make sure it's
>a "normal" memory address. Looking at the comment above the memory checks:
>
>	/*
>	 * Disallow crazy address masks that give BIOS leeway to pick
>	 * injection address almost anywhere. Insist on page or
>	 * better granularity and that target address is normal RAM or
>	 * NVDIMM.
>	 */
>
>it seems that's the case. What this means is that we can't supply the
>RCRB of a CXL 1.0/1.1 port because it's an MMIO address and we have to disable
>the checks to inject a CXL 1.0/1.1 error.

Maybe worth a comment here as to why the error checking is skipped for cxl?

+	} else if ((type & CXL_ERROR_MASK) && (flags & SETWA_FLAGS_MEM)) {
+		goto inject;

Thanks,
Davidlohr
Ben Cheatham Feb. 23, 2024, 3:33 p.m. UTC | #9
On 2/22/24 7:13 PM, Davidlohr Bueso wrote:
> On Wed, 21 Feb 2024, Ben Cheatham wrote:
> 
>> So the way the EINJ module currently works (at least as I understand it)
>> is that any address supplied for memory errors is checked to make sure it's
>> a "normal" memory address. Looking at the comment above the memory checks:
>>
>>     /*
>>      * Disallow crazy address masks that give BIOS leeway to pick
>>      * injection address almost anywhere. Insist on page or
>>      * better granularity and that target address is normal RAM or
>>      * NVDIMM.
>>      */
>>
>> it seems that's the case. What this means is that we can't supply the
>> RCRB of a CXL 1.0/1.1 port because it's an MMIO address and we have to disable
>> the checks to inject a CXL 1.0/1.1 error.
> 
> Maybe worth a comment here as to why the error checking is skipped for cxl?
> 
> +    } else if ((type & CXL_ERROR_MASK) && (flags & SETWA_FLAGS_MEM)) {
> +        goto inject;
> 

I think that's a good idea, I'll go ahead and add one in.

Thanks,
Ben

> Thanks,
> Davidlohr
diff mbox series

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index 73d898383e51..51f9a0da57d7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5289,6 +5289,7 @@  M:	Dan Williams <dan.j.williams@intel.com>
 L:	linux-cxl@vger.kernel.org
 S:	Maintained
 F:	drivers/cxl/
+F:	include/linux/cxl-einj.h
 F:	include/linux/cxl-event.h
 F:	include/uapi/linux/cxl_mem.h
 F:	tools/testing/cxl/
diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig
index 6b18f8bc7be3..040a9b2de235 100644
--- a/drivers/acpi/apei/Kconfig
+++ b/drivers/acpi/apei/Kconfig
@@ -60,6 +60,18 @@  config ACPI_APEI_EINJ
 	  mainly used for debugging and testing the other parts of
 	  APEI and some other RAS features.
 
+config ACPI_APEI_EINJ_CXL
+	tristate "CXL Error INJection Support"
+	default ACPI_APEI_EINJ
+	depends on ACPI_APEI_EINJ
+	help
+	  Support for CXL protocol Error INJection through debugfs/cxl.
+	  Availability and which errors are supported is dependent on
+	  the host platform. Look to ACPI v6.5 section 18.6.4 and kernel
+	  EINJ documentation for more information.
+
+	  If unsure say 'n'
+
 config ACPI_APEI_ERST_DEBUG
 	tristate "APEI Error Record Serialization Table (ERST) Debug Support"
 	depends on ACPI_APEI
diff --git a/drivers/acpi/apei/Makefile b/drivers/acpi/apei/Makefile
index 4dfac2128737..c18e96d342b2 100644
--- a/drivers/acpi/apei/Makefile
+++ b/drivers/acpi/apei/Makefile
@@ -2,6 +2,7 @@ 
 obj-$(CONFIG_ACPI_APEI)		+= apei.o
 obj-$(CONFIG_ACPI_APEI_GHES)	+= ghes.o
 obj-$(CONFIG_ACPI_APEI_EINJ)	+= einj.o
+obj-$(CONFIG_ACPI_APEI_EINJ_CXL) += einj-cxl.o
 obj-$(CONFIG_ACPI_APEI_ERST_DEBUG) += erst-dbg.o
 
 apei-y := apei-base.o hest.o erst.o bert.o
diff --git a/drivers/acpi/apei/apei-internal.h b/drivers/acpi/apei/apei-internal.h
index 67c2c3b959e1..336408f4f293 100644
--- a/drivers/acpi/apei/apei-internal.h
+++ b/drivers/acpi/apei/apei-internal.h
@@ -130,4 +130,21 @@  static inline u32 cper_estatus_len(struct acpi_hest_generic_status *estatus)
 }
 
 int apei_osc_setup(void);
+
+int einj_get_available_error_type(u32 *type);
+int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2, u64 param3,
+		      u64 param4);
+bool einj_is_initialized(void);
+bool einj_is_cxl_error_type(u64 type);
+int einj_validate_error_type(u64 type);
+
+#ifndef ACPI_EINJ_CXL_CACHE_CORRECTABLE
+#define ACPI_EINJ_CXL_CACHE_CORRECTABLE     BIT(12)
+#define ACPI_EINJ_CXL_CACHE_UNCORRECTABLE   BIT(13)
+#define ACPI_EINJ_CXL_CACHE_FATAL           BIT(14)
+#define ACPI_EINJ_CXL_MEM_CORRECTABLE       BIT(15)
+#define ACPI_EINJ_CXL_MEM_UNCORRECTABLE     BIT(16)
+#define ACPI_EINJ_CXL_MEM_FATAL             BIT(17)
+#endif
+
 #endif
diff --git a/drivers/acpi/apei/einj-cxl.c b/drivers/acpi/apei/einj-cxl.c
new file mode 100644
index 000000000000..607d4f6adb98
--- /dev/null
+++ b/drivers/acpi/apei/einj-cxl.c
@@ -0,0 +1,121 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * CXL Error INJection support. Used by CXL core to inject
+ * protocol errors into CXL ports.
+ *
+ * Copyright (C) 2023 Advanced Micro Devices, Inc.
+ *
+ * Author: Ben Cheatham <benjamin.cheatham@amd.com>
+ */
+#include <linux/einj-cxl.h>
+#include <linux/debugfs.h>
+
+#include "apei-internal.h"
+
+static struct { u32 mask; const char *str; } const einj_cxl_error_type_string[] = {
+	{ BIT(12), "CXL.cache Protocol Correctable" },
+	{ BIT(13), "CXL.cache Protocol Uncorrectable non-fatal" },
+	{ BIT(14), "CXL.cache Protocol Uncorrectable fatal" },
+	{ BIT(15), "CXL.mem Protocol Correctable" },
+	{ BIT(16), "CXL.mem Protocol Uncorrectable non-fatal" },
+	{ BIT(17), "CXL.mem Protocol Uncorrectable fatal" },
+};
+
+int einj_cxl_available_error_type_show(struct seq_file *m, void *v)
+{
+	int cxl_err, rc;
+	u32 available_error_type = 0;
+
+	if (!einj_is_initialized())
+		return -ENXIO;
+
+	rc = einj_get_available_error_type(&available_error_type);
+	if (rc)
+		return rc;
+
+	for (int pos = 0; pos < ARRAY_SIZE(einj_cxl_error_type_string); pos++) {
+		cxl_err = ACPI_EINJ_CXL_CACHE_CORRECTABLE << pos;
+
+		if (available_error_type & cxl_err)
+			seq_printf(m, "0x%08x\t%s\n",
+				   einj_cxl_error_type_string[pos].mask,
+				   einj_cxl_error_type_string[pos].str);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(einj_cxl_available_error_type_show, CXL);
+
+static int cxl_dport_get_sbdf(struct pci_dev *dport_dev, u64 *sbdf)
+{
+	struct pci_bus *pbus;
+	struct pci_host_bridge *bridge;
+	u64 seg = 0, bus;
+
+	pbus = dport_dev->bus;
+	bridge = pci_find_host_bridge(pbus);
+
+	if (!bridge)
+		return -ENODEV;
+
+	if (bridge->domain_nr != PCI_DOMAIN_NR_NOT_SET)
+		seg = bridge->domain_nr;
+
+	bus = pbus->number;
+	*sbdf = (seg << 24) | (bus << 16) | dport_dev->devfn;
+
+	return 0;
+}
+
+int einj_cxl_inject_rch_error(u64 rcrb, u64 type)
+{
+	int rc;
+
+	if (!einj_is_initialized())
+		return -ENXIO;
+
+	/* Only CXL error types can be specified */
+	if (!einj_is_cxl_error_type(type))
+		return -EINVAL;
+
+	rc = einj_validate_error_type(type);
+	if (rc)
+		return rc;
+
+	return einj_error_inject(type, 0x2, rcrb, GENMASK_ULL(63, 12), 0, 0);
+}
+EXPORT_SYMBOL_NS_GPL(einj_cxl_inject_rch_error, CXL);
+
+int einj_cxl_inject_error(struct pci_dev *dport, u64 type)
+{
+	u64 param4 = 0;
+	int rc;
+
+	if (!einj_is_initialized())
+		return -ENXIO;
+
+	/* Only CXL error types can be specified */
+	if (!einj_is_cxl_error_type(type))
+		return -EINVAL;
+
+	rc = einj_validate_error_type(type);
+	if (rc)
+		return rc;
+
+	rc = cxl_dport_get_sbdf(dport, &param4);
+	if (rc)
+		return rc;
+
+	return einj_error_inject(type, 0x4, 0, 0, 0, param4);
+}
+EXPORT_SYMBOL_NS_GPL(einj_cxl_inject_error, CXL);
+
+bool einj_cxl_is_initialized(void)
+{
+	return einj_is_initialized();
+}
+EXPORT_SYMBOL_NS_GPL(einj_cxl_is_initialized, CXL);
+
+MODULE_AUTHOR("Ben Cheatham");
+MODULE_DESCRIPTION("CXL Error INJection support");
+MODULE_LICENSE("GPL");
diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c
index 6ea323b9d8ef..e76e64df97a7 100644
--- a/drivers/acpi/apei/einj.c
+++ b/drivers/acpi/apei/einj.c
@@ -37,6 +37,12 @@ 
 #define MEM_ERROR_MASK		(ACPI_EINJ_MEMORY_CORRECTABLE | \
 				ACPI_EINJ_MEMORY_UNCORRECTABLE | \
 				ACPI_EINJ_MEMORY_FATAL)
+#define CXL_ERROR_MASK		(ACPI_EINJ_CXL_CACHE_CORRECTABLE | \
+				ACPI_EINJ_CXL_CACHE_UNCORRECTABLE | \
+				ACPI_EINJ_CXL_CACHE_FATAL | \
+				ACPI_EINJ_CXL_MEM_CORRECTABLE | \
+				ACPI_EINJ_CXL_MEM_UNCORRECTABLE | \
+				ACPI_EINJ_CXL_MEM_FATAL)
 
 /*
  * ACPI version 5 provides a SET_ERROR_TYPE_WITH_ADDRESS action.
@@ -166,7 +172,7 @@  static int __einj_get_available_error_type(u32 *type)
 }
 
 /* Get error injection capabilities of the platform */
-static int einj_get_available_error_type(u32 *type)
+int einj_get_available_error_type(u32 *type)
 {
 	int rc;
 
@@ -176,6 +182,7 @@  static int einj_get_available_error_type(u32 *type)
 
 	return rc;
 }
+EXPORT_SYMBOL_GPL(einj_get_available_error_type);
 
 static int einj_timedout(u64 *t)
 {
@@ -536,8 +543,8 @@  static int __einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
 }
 
 /* Inject the specified hardware error */
-static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
-			     u64 param3, u64 param4)
+int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2, u64 param3,
+		      u64 param4)
 {
 	int rc;
 	u64 base_addr, size;
@@ -560,8 +567,11 @@  static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
 	if (type & ACPI5_VENDOR_BIT) {
 		if (vendor_flags != SETWA_FLAGS_MEM)
 			goto inject;
-	} else if (!(type & MEM_ERROR_MASK) && !(flags & SETWA_FLAGS_MEM))
+	} else if (!(type & MEM_ERROR_MASK) && !(flags & SETWA_FLAGS_MEM)) {
 		goto inject;
+	} else if ((type & CXL_ERROR_MASK) && (flags & SETWA_FLAGS_MEM)) {
+		goto inject;
+	}
 
 	/*
 	 * Disallow crazy address masks that give BIOS leeway to pick
@@ -592,6 +602,7 @@  static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
 
 	return rc;
 }
+EXPORT_SYMBOL_GPL(einj_error_inject);
 
 static u32 error_type;
 static u32 error_flags;
@@ -613,12 +624,6 @@  static struct { u32 mask; const char *str; } const einj_error_type_string[] = {
 	{ BIT(9), "Platform Correctable" },
 	{ BIT(10), "Platform Uncorrectable non-fatal" },
 	{ BIT(11), "Platform Uncorrectable fatal"},
-	{ BIT(12), "CXL.cache Protocol Correctable" },
-	{ BIT(13), "CXL.cache Protocol Uncorrectable non-fatal" },
-	{ BIT(14), "CXL.cache Protocol Uncorrectable fatal" },
-	{ BIT(15), "CXL.mem Protocol Correctable" },
-	{ BIT(16), "CXL.mem Protocol Uncorrectable non-fatal" },
-	{ BIT(17), "CXL.mem Protocol Uncorrectable fatal" },
 	{ BIT(31), "Vendor Defined Error Types" },
 };
 
@@ -640,29 +645,21 @@  static int available_error_type_show(struct seq_file *m, void *v)
 
 DEFINE_SHOW_ATTRIBUTE(available_error_type);
 
-static int error_type_get(void *data, u64 *val)
-{
-	*val = error_type;
-
-	return 0;
-}
-
-static int error_type_set(void *data, u64 val)
+int einj_validate_error_type(u64 type)
 {
+	u32 tval, vendor, available_error_type = 0;
 	int rc;
-	u32 available_error_type = 0;
-	u32 tval, vendor;
 
 	/* Only low 32 bits for error type are valid */
-	if (val & GENMASK_ULL(63, 32))
+	if (type & GENMASK_ULL(63, 32))
 		return -EINVAL;
 
 	/*
 	 * Vendor defined types have 0x80000000 bit set, and
 	 * are not enumerated by ACPI_EINJ_GET_ERROR_TYPE
 	 */
-	vendor = val & ACPI5_VENDOR_BIT;
-	tval = val & 0x7fffffff;
+	vendor = type & ACPI5_VENDOR_BIT;
+	tval = type & GENMASK(30, 0);
 
 	/* Only one error type can be specified */
 	if (tval & (tval - 1))
@@ -671,9 +668,39 @@  static int error_type_set(void *data, u64 val)
 		rc = einj_get_available_error_type(&available_error_type);
 		if (rc)
 			return rc;
-		if (!(val & available_error_type))
+		if (!(type & available_error_type))
 			return -EINVAL;
 	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(einj_validate_error_type);
+
+bool einj_is_cxl_error_type(u64 type)
+{
+	return (type & CXL_ERROR_MASK) && (!(type & ACPI5_VENDOR_BIT));
+}
+EXPORT_SYMBOL_GPL(einj_is_cxl_error_type);
+
+static int error_type_get(void *data, u64 *val)
+{
+	*val = error_type;
+
+	return 0;
+}
+
+static int error_type_set(void *data, u64 val)
+{
+	int rc;
+
+	/* CXL error types have to be injected from cxl debugfs */
+	if (einj_is_cxl_error_type(val))
+		return -EINVAL;
+
+	rc = einj_validate_error_type(val);
+	if (rc)
+		return rc;
+
 	error_type = val;
 
 	return 0;
@@ -709,6 +736,12 @@  static int einj_check_table(struct acpi_table_einj *einj_tab)
 	return 0;
 }
 
+bool einj_is_initialized(void)
+{
+	return einj_initialized;
+}
+EXPORT_SYMBOL_GPL(einj_is_initialized);
+
 static int __init einj_probe(struct platform_device *pdev)
 {
 	int rc;
diff --git a/include/linux/einj-cxl.h b/include/linux/einj-cxl.h
new file mode 100644
index 000000000000..4a1f4600539a
--- /dev/null
+++ b/include/linux/einj-cxl.h
@@ -0,0 +1,40 @@ 
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * CXL protocol Error INJection support.
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Author: Ben Cheatham <benjamin.cheatham@amd.com>
+ */
+#ifndef EINJ_CXL_H
+#define EINJ_CXL_H
+
+#include <linux/pci.h>
+
+#if IS_ENABLED(CONFIG_ACPI_APEI_EINJ_CXL)
+int einj_cxl_available_error_type_show(struct seq_file *m, void *v);
+int einj_cxl_inject_error(struct pci_dev *dport_dev, u64 type);
+int einj_cxl_inject_rch_error(u64 rcrb, u64 type);
+bool einj_cxl_is_initialized(void);
+#else /* !IS_ENABLED(CONFIG_ACPI_APEI_EINJ_CXL) */
+static inline int einj_cxl_available_error_type_show(struct seq_file *m,
+						     void *v)
+{
+	return -ENXIO;
+}
+
+static inline int einj_cxl_inject_error(struct pci_dev *dport_dev, u64 type)
+{
+	return -ENXIO;
+}
+
+static inline int einj_cxl_inject_rch_error(u64 rcrb, u64 type)
+{
+	return -ENXIO;
+}
+
+static inline bool einj_cxl_is_initialized(void) { return false; }
+#endif /* CONFIG_ACPI_APEI_EINJ_CXL */
+
+#endif /* EINJ_CXL_H */