@@ -104,3 +104,49 @@ sysfs
Sysfs files are documented in
`Documentation/ABI/testing/sysfs-edac-memory-repair`.
+
+Examples
+--------
+
+The memory repair usage takes the form shown in this example:
+
+1. CXL memory device Soft Post Package Repair (Soft PPR)
+
+1.1. Read device supported capabilities for the Soft PPR.
+
+# cat /sys/bus/edac/devices/cxl_mem0/mem_repair0/persist_mode
+
+0
+
+# cat /sys/bus/edac/devices/cxl_mem0/mem_repair0/repair_type
+
+ppr
+
+# cat /sys/bus/edac/devices/cxl_mem0/mem_repair0/repair_safe_when_in_use
+
+1
+
+# cat /sys/bus/edac/devices/cxl_mem0/mem_repair0/min_dpa
+
+0x0
+
+# cat /sys/bus/edac/devices/cxl_mem0/mem_repair0/max_dpa
+
+0xfffffff
+
+ Soft PPR that is safe to use with ongoing accesses to the memory
+
+ and applies to 4GiB of DPA space.
+
+1.2. Set attributes for a soft PPR for a DPA=0x300000
+
+# echo 0x8a2d > /sys/bus/edac/devices/cxl_mem0/mem_repair0/nibble_mask
+
+# echo 0x300000 > /sys/bus/edac/devices/cxl_mem0/mem_repair0/dpa
+
+1.3. Start soft PPR operation for the DPA=0x300000
+
+Note: Repair command returns error if unsupported/resources are not
+available for the repair operation.
+
+# echo 1 > /sys/bus/edac/devices/cxl_mem0/mem_repair0/repair
@@ -166,6 +166,7 @@ config CXL_RAS_FEATURES
depends on CXL_MEM
depends on EDAC_SCRUB
depends on EDAC_ECS
+ depends on EDAC_MEM_REPAIR
help
The CXL memory RAS feature control is optional and allows host to
control the RAS features configurations of CXL Type 3 devices.
@@ -14,12 +14,13 @@
#include <linux/cleanup.h>
#include <linux/edac.h>
#include <linux/limits.h>
+#include <linux/unaligned.h>
#include <cxl/features.h>
#include <cxl.h>
#include <cxlmem.h>
#include "core.h"
-#define CXL_DEV_NUM_RAS_FEATURES 2
+#define CXL_DEV_NUM_RAS_FEATURES 3
#define CXL_DEV_HOUR_IN_SECS 3600
#define CXL_DEV_NAME_LEN 128
@@ -772,12 +773,344 @@ static int cxl_memdev_ecs_init(struct cxl_memdev *cxlmd,
return 0;
}
+/*
+ * CXL memory soft PPR & hard PPR control definitions
+ */
+struct cxl_ppr_context {
+ uuid_t repair_uuid;
+ u8 instance;
+ u16 get_feat_size;
+ u16 set_feat_size;
+ u8 get_version;
+ u8 set_version;
+ u16 effects;
+ struct cxl_memdev *cxlmd;
+ enum edac_mem_repair_type repair_type;
+ bool persist_mode;
+ u64 dpa;
+ u32 nibble_mask;
+};
+
+/**
+ * struct cxl_memdev_ppr_params - CXL memory PPR parameter data structure.
+ * @dpa: device physical address.
+ * @op_class: PPR operation class.
+ * @op_subclass: PPR operation subclass.
+ * @media_accessible: memory media is accessible or not during PPR operation.
+ * @data_retained: data is retained or not during PPR operation.
+ */
+struct cxl_memdev_ppr_params {
+ u64 dpa;
+ u8 op_class;
+ u8 op_subclass;
+ bool media_accessible;
+ bool data_retained;
+};
+
+/*
+ * See CXL rev 3.1 @8.2.9.7.2.1 Table 8-113 sPPR Feature Readable Attributes
+ *
+ * See CXL rev 3.1 @8.2.9.7.2.2 Table 8-116 hPPR Feature Readable Attributes
+ */
+#define CXL_MEMDEV_PPR_QUERY_RESOURCE_FLAG BIT(0)
+
+#define CXL_MEMDEV_PPR_DEVICE_INITIATED_MASK BIT(0)
+#define CXL_MEMDEV_PPR_FLAG_DPA_SUPPORT_MASK BIT(0)
+#define CXL_MEMDEV_PPR_FLAG_NIBBLE_SUPPORT_MASK BIT(1)
+#define CXL_MEMDEV_PPR_FLAG_MEM_SPARING_EV_REC_SUPPORT_MASK BIT(2)
+
+#define CXL_MEMDEV_PPR_RESTRICTION_FLAG_MEDIA_ACCESSIBLE_MASK BIT(0)
+#define CXL_MEMDEV_PPR_RESTRICTION_FLAG_DATA_RETAINED_MASK BIT(2)
+
+#define CXL_MEMDEV_PPR_SPARING_EV_REC_EN_MASK BIT(0)
+
+struct cxl_memdev_repair_rd_attrs_hdr {
+ u8 max_op_latency;
+ __le16 op_cap;
+ __le16 op_mode;
+ u8 op_class;
+ u8 op_subclass;
+ u8 rsvd[9];
+} __packed;
+
+struct cxl_memdev_ppr_rd_attrs {
+ struct cxl_memdev_repair_rd_attrs_hdr hdr;
+ u8 ppr_flags;
+ __le16 restriction_flags;
+ u8 ppr_op_mode;
+} __packed;
+
+/*
+ * See CXL rev 3.1 @8.2.9.7.1.2 Table 8-103 sPPR Maintenance Input Payload
+ *
+ * See CXL rev 3.1 @8.2.9.7.1.3 Table 8-104 hPPR Maintenance Input Payload
+ */
+struct cxl_memdev_ppr_maintenance_attrs {
+ u8 flags;
+ __le64 dpa;
+ u8 nibble_mask[3];
+} __packed;
+
+static int cxl_mem_ppr_get_attrs(struct cxl_ppr_context *cxl_ppr_ctx,
+ struct cxl_memdev_ppr_params *params)
+{
+ size_t rd_data_size = sizeof(struct cxl_memdev_ppr_rd_attrs);
+ struct cxl_memdev *cxlmd = cxl_ppr_ctx->cxlmd;
+ struct cxl_mailbox *cxl_mbox = &cxlmd->cxlds->cxl_mbox;
+ u16 restriction_flags;
+ size_t data_size;
+ u16 return_code;
+
+ struct cxl_memdev_ppr_rd_attrs *rd_attrs __free(kfree) =
+ kmalloc(rd_data_size, GFP_KERNEL);
+ if (!rd_attrs)
+ return -ENOMEM;
+
+ data_size = cxl_get_feature(cxl_mbox, &cxl_ppr_ctx->repair_uuid,
+ CXL_GET_FEAT_SEL_CURRENT_VALUE,
+ rd_attrs, rd_data_size, 0, &return_code);
+ if (!data_size)
+ return -EIO;
+
+ params->op_class = rd_attrs->hdr.op_class;
+ params->op_subclass = rd_attrs->hdr.op_subclass;
+ restriction_flags = le16_to_cpu(rd_attrs->restriction_flags);
+ params->media_accessible = FIELD_GET(CXL_MEMDEV_PPR_RESTRICTION_FLAG_MEDIA_ACCESSIBLE_MASK,
+ restriction_flags) ^ 1;
+ params->data_retained = FIELD_GET(CXL_MEMDEV_PPR_RESTRICTION_FLAG_DATA_RETAINED_MASK,
+ restriction_flags) ^ 1;
+
+ return 0;
+}
+
+static int cxl_mem_do_ppr_op(struct device *dev,
+ struct cxl_ppr_context *cxl_ppr_ctx,
+ struct cxl_memdev_ppr_params *rd_params)
+{
+ struct cxl_memdev_ppr_maintenance_attrs maintenance_attrs;
+ struct cxl_memdev *cxlmd = cxl_ppr_ctx->cxlmd;
+ struct cxl_mem_repair_attrbs attrbs = { 0 };
+
+ if (!rd_params->media_accessible || !rd_params->data_retained) {
+ /* Memory to repair must be offline */
+ if (cxl_are_decoders_committed(cxlmd))
+ return -EBUSY;
+ /* offline, so good for repair */
+ } else {
+ /* If offline all good, otherwise check for match with record */
+ if (cxl_are_decoders_committed(cxlmd)) {
+ /* Check memory to repair is from the current boot */
+ attrbs.repair_type = CXL_PPR;
+ attrbs.dpa = cxl_ppr_ctx->dpa;
+ attrbs.nibble_mask = cxl_ppr_ctx->nibble_mask;
+ if (!cxl_find_rec_dram(cxlmd, &attrbs) &&
+ !cxl_find_rec_gen_media(cxlmd, &attrbs))
+ return -EINVAL;
+ /* Record matched, so even though online good for repair */
+ }
+ }
+
+ memset(&maintenance_attrs, 0, sizeof(maintenance_attrs));
+ maintenance_attrs.flags = 0;
+ maintenance_attrs.dpa = cpu_to_le64(cxl_ppr_ctx->dpa);
+ put_unaligned_le24(cxl_ppr_ctx->nibble_mask, maintenance_attrs.nibble_mask);
+
+ return cxl_do_maintenance(&cxlmd->cxlds->cxl_mbox, rd_params->op_class,
+ rd_params->op_subclass, &maintenance_attrs,
+ sizeof(maintenance_attrs));
+}
+
+static int cxl_mem_ppr_set_attrs(struct device *dev,
+ struct cxl_ppr_context *cxl_ppr_ctx)
+{
+ struct cxl_memdev_ppr_params rd_params;
+ int ret;
+
+ ret = cxl_mem_ppr_get_attrs(cxl_ppr_ctx, &rd_params);
+ if (ret)
+ return ret;
+
+ ret = cxl_hold_region_and_dpa();
+ if (ret)
+ return ret;
+
+ ret = cxl_mem_do_ppr_op(dev, cxl_ppr_ctx, &rd_params);
+ cxl_release_region_and_dpa();
+
+ return ret;
+}
+
+static int cxl_ppr_get_repair_type(struct device *dev, void *drv_data,
+ const char **repair_type)
+{
+ *repair_type = edac_repair_type[EDAC_PPR];
+
+ return 0;
+}
+
+static int cxl_ppr_get_persist_mode(struct device *dev, void *drv_data,
+ bool *persist_mode)
+{
+ struct cxl_ppr_context *cxl_ppr_ctx = drv_data;
+
+ *persist_mode = cxl_ppr_ctx->persist_mode;
+
+ return 0;
+}
+
+static int cxl_get_ppr_safe_when_in_use(struct device *dev, void *drv_data,
+ bool *safe)
+{
+ struct cxl_ppr_context *cxl_ppr_ctx = drv_data;
+ struct cxl_memdev_ppr_params params;
+ int ret;
+
+ ret = cxl_mem_ppr_get_attrs(cxl_ppr_ctx, ¶ms);
+ if (ret)
+ return ret;
+
+ *safe = params.media_accessible & params.data_retained;
+
+ return 0;
+}
+
+static int cxl_ppr_get_min_dpa(struct device *dev, void *drv_data,
+ u64 *min_dpa)
+{
+ struct cxl_ppr_context *cxl_ppr_ctx = drv_data;
+ struct cxl_memdev *cxlmd = cxl_ppr_ctx->cxlmd;
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+
+ *min_dpa = cxlds->dpa_res.start;
+
+ return 0;
+}
+
+static int cxl_ppr_get_max_dpa(struct device *dev, void *drv_data,
+ u64 *max_dpa)
+{
+ struct cxl_ppr_context *cxl_ppr_ctx = drv_data;
+ struct cxl_memdev *cxlmd = cxl_ppr_ctx->cxlmd;
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+
+ *max_dpa = cxlds->dpa_res.end;
+
+ return 0;
+}
+
+static int cxl_ppr_get_dpa(struct device *dev, void *drv_data,
+ u64 *dpa)
+{
+ struct cxl_ppr_context *cxl_ppr_ctx = drv_data;
+
+ *dpa = cxl_ppr_ctx->dpa;
+
+ return 0;
+}
+
+static int cxl_ppr_set_dpa(struct device *dev, void *drv_data, u64 dpa)
+{
+ struct cxl_ppr_context *cxl_ppr_ctx = drv_data;
+ struct cxl_memdev *cxlmd = cxl_ppr_ctx->cxlmd;
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+
+ if (dpa < cxlds->dpa_res.start || dpa > cxlds->dpa_res.end)
+ return -EINVAL;
+
+ cxl_ppr_ctx->dpa = dpa;
+
+ return 0;
+}
+
+static int cxl_ppr_get_nibble_mask(struct device *dev, void *drv_data,
+ u32 *nibble_mask)
+{
+ struct cxl_ppr_context *cxl_ppr_ctx = drv_data;
+
+ *nibble_mask = cxl_ppr_ctx->nibble_mask;
+
+ return 0;
+}
+
+static int cxl_ppr_set_nibble_mask(struct device *dev, void *drv_data, u32 nibble_mask)
+{
+ struct cxl_ppr_context *cxl_ppr_ctx = drv_data;
+
+ cxl_ppr_ctx->nibble_mask = nibble_mask;
+
+ return 0;
+}
+
+static int cxl_do_ppr(struct device *dev, void *drv_data, u32 val)
+{
+ struct cxl_ppr_context *cxl_ppr_ctx = drv_data;
+
+ if (!cxl_ppr_ctx->dpa || val != EDAC_DO_MEM_REPAIR)
+ return -EINVAL;
+
+ return cxl_mem_ppr_set_attrs(dev, cxl_ppr_ctx);
+}
+
+static const struct edac_mem_repair_ops cxl_sppr_ops = {
+ .get_repair_type = cxl_ppr_get_repair_type,
+ .get_persist_mode = cxl_ppr_get_persist_mode,
+ .get_repair_safe_when_in_use = cxl_get_ppr_safe_when_in_use,
+ .get_min_dpa = cxl_ppr_get_min_dpa,
+ .get_max_dpa = cxl_ppr_get_max_dpa,
+ .get_dpa = cxl_ppr_get_dpa,
+ .set_dpa = cxl_ppr_set_dpa,
+ .get_nibble_mask = cxl_ppr_get_nibble_mask,
+ .set_nibble_mask = cxl_ppr_set_nibble_mask,
+ .do_repair = cxl_do_ppr,
+};
+
+static int cxl_memdev_soft_ppr_init(struct cxl_memdev *cxlmd,
+ struct edac_dev_feature *ras_feature,
+ u8 repair_inst)
+{
+ struct cxl_ppr_context *cxl_sppr_ctx;
+ struct cxl_feat_entry *feat_entry;
+
+ feat_entry = cxl_get_feature_entry(cxlmd, &CXL_FEAT_SPPR_UUID);
+ if (IS_ERR(feat_entry))
+ return -EOPNOTSUPP;
+
+ if (!(le32_to_cpu(feat_entry->flags) & CXL_FEATURE_F_CHANGEABLE))
+ return -EOPNOTSUPP;
+
+ cxl_sppr_ctx = devm_kzalloc(&cxlmd->dev, sizeof(*cxl_sppr_ctx),
+ GFP_KERNEL);
+ if (!cxl_sppr_ctx)
+ return -ENOMEM;
+
+ *cxl_sppr_ctx = (struct cxl_ppr_context) {
+ .get_feat_size = le16_to_cpu(feat_entry->get_feat_size),
+ .set_feat_size = le16_to_cpu(feat_entry->set_feat_size),
+ .get_version = feat_entry->get_feat_ver,
+ .set_version = feat_entry->set_feat_ver,
+ .effects = le16_to_cpu(feat_entry->effects),
+ .cxlmd = cxlmd,
+ .repair_type = EDAC_PPR,
+ .persist_mode = 0,
+ .instance = repair_inst,
+ };
+ uuid_copy(&cxl_sppr_ctx->repair_uuid, &CXL_FEAT_SPPR_UUID);
+
+ ras_feature->ft_type = RAS_FEAT_MEM_REPAIR;
+ ras_feature->instance = cxl_sppr_ctx->instance;
+ ras_feature->mem_repair_ops = &cxl_sppr_ops;
+ ras_feature->ctx = cxl_sppr_ctx;
+
+ return 0;
+}
+
int devm_cxl_memdev_edac_register(struct cxl_memdev *cxlmd)
{
struct edac_dev_feature ras_features[CXL_DEV_NUM_RAS_FEATURES];
char cxl_dev_name[CXL_DEV_NAME_LEN];
int num_ras_features = 0;
u8 scrub_inst = 0;
+ u8 repair_inst = 0;
int rc;
rc = cxl_memdev_scrub_init(cxlmd, &ras_features[num_ras_features],
@@ -795,6 +1128,16 @@ int devm_cxl_memdev_edac_register(struct cxl_memdev *cxlmd)
if (rc != -EOPNOTSUPP)
num_ras_features++;
+ rc = cxl_memdev_soft_ppr_init(cxlmd, &ras_features[num_ras_features],
+ repair_inst);
+ if (rc < 0 && rc != -EOPNOTSUPP)
+ return rc;
+
+ if (rc != -EOPNOTSUPP) {
+ repair_inst++;
+ num_ras_features++;
+ }
+
snprintf(cxl_dev_name, sizeof(cxl_dev_name), "%s_%s",
"cxl", dev_name(&cxlmd->dev));
@@ -745,9 +745,14 @@ static inline int edac_ecs_get_desc(struct device *ecs_dev,
#endif /* CONFIG_EDAC_ECS */
enum edac_mem_repair_type {
+ EDAC_PPR,
EDAC_REPAIR_MAX
};
+static const char * const edac_repair_type[] = {
+ [EDAC_PPR] = "ppr",
+};
+
enum edac_mem_repair_cmd {
EDAC_DO_MEM_REPAIR = 1,
};