@@ -60,6 +60,32 @@ Description: (RO) Read returns the device health status.
The driver does not monitor for Heartbeat. It is left for a user
to poll the status periodically.
+What: /sys/kernel/debug/qat_<device>_<BDF>/heartbeat/inject_error
+Date: January 2024
+KernelVersion: 6.7
+Contact: qat-linux@intel.com
+Description:	(WO) Write to inject an error that simulates a heartbeat
+		failure. This is to be used for testing purposes.
+
+ After writing this file, the driver stops arbitration on a
+ random engine and disables the fetching of heartbeat counters.
+ If a workload is running on the device, a job submitted to the
+ accelerator might not get a response and a read of the
+ `heartbeat/status` attribute might report -1, i.e. device
+ unresponsive.
+		The error is unrecoverable; the device must be restarted to
+		restore its functionality.
+
+ This attribute is available only when the kernel is built with
+ CONFIG_CRYPTO_DEV_QAT_ERROR_INJECTION=y.
+
+ A write of 1 enables error injection.
+
+ The following example shows how to enable error injection::
+
+ # cd /sys/kernel/debug/qat_<device>_<BDF>
+ # echo 1 > heartbeat/inject_error
+
What: /sys/kernel/debug/qat_<device>_<BDF>/pm_status
Date: January 2024
KernelVersion: 6.7
@@ -95,3 +95,18 @@ config CRYPTO_DEV_QAT_C62XVF
To compile this as a module, choose M here: the module
will be called qat_c62xvf.
+
+config CRYPTO_DEV_QAT_ERROR_INJECTION
+ bool "Support for Intel(R) QAT Devices Heartbeat Error Injection"
+ default n
+ depends on CRYPTO_DEV_QAT
+ depends on DEBUG_FS
+ help
+	  Enables a mechanism that allows injecting a heartbeat error on
+	  Intel(R) QuickAssist devices for testing purposes.
+
+ This is intended for developer use only.
+ If unsure, say N.
+
+	  This functionality is available via a debugfs entry of the
+	  Intel(R) QuickAssist device.
@@ -49,3 +49,7 @@ intel_qat-$(CONFIG_PCI_IOV) += adf_sriov.o adf_vf_isr.o adf_pfvf_utils.o \
adf_pfvf_pf_msg.o adf_pfvf_pf_proto.o \
adf_pfvf_vf_msg.o adf_pfvf_vf_proto.o \
adf_gen2_pfvf.o adf_gen4_pfvf.o
+
+intel_qat-$(CONFIG_CRYPTO_DEV_QAT_ERROR_INJECTION) += adf_heartbeat_inject.o
+
+ccflags-$(CONFIG_CRYPTO_DEV_QAT_ERROR_INJECTION) += -DQAT_HB_ERROR_INJECTION
@@ -90,6 +90,9 @@ void adf_exit_aer(void);
int adf_init_arb(struct adf_accel_dev *accel_dev);
void adf_exit_arb(struct adf_accel_dev *accel_dev);
void adf_update_ring_arb(struct adf_etr_ring_data *ring);
+#ifdef QAT_HB_ERROR_INJECTION
+/*
+ * Heartbeat error injection only: clear the arbiter mapping of thread @thr
+ * on engine @ae. Returns 0 on success or a negative error code.
+ */
+int adf_disable_arb_thd(struct adf_accel_dev *accel_dev, u32 ae, u32 thr);
+#endif
int adf_dev_get(struct adf_accel_dev *accel_dev);
void adf_dev_put(struct adf_accel_dev *accel_dev);
@@ -23,12 +23,6 @@
#define ADF_HB_EMPTY_SIG 0xA5A5A5A5
-/* Heartbeat counter pair */
-struct hb_cnt_pair {
- __u16 resp_heartbeat_cnt;
- __u16 req_heartbeat_cnt;
-};
-
static int adf_hb_check_polling_freq(struct adf_accel_dev *accel_dev)
{
u64 curr_time = adf_clock_get_current_time();
@@ -19,6 +19,12 @@ enum adf_device_heartbeat_status {
HB_DEV_UNSUPPORTED,
};
+/*
+ * Heartbeat counter pair.
+ *
+ * The error injection code overlays an array of these on the heartbeat DMA
+ * buffer, indexed by ae * num_hb_ctrs + thread, so the field order and
+ * widths must stay as they are.
+ */
+struct hb_cnt_pair {
+	__u16 resp_heartbeat_cnt;	/* heartbeat response counter */
+	__u16 req_heartbeat_cnt;	/* heartbeat request counter */
+};
+
struct adf_heartbeat {
unsigned int hb_sent_counter;
unsigned int hb_failed_counter;
@@ -35,6 +41,9 @@ struct adf_heartbeat {
struct dentry *cfg;
struct dentry *sent;
struct dentry *failed;
+#ifdef QAT_HB_ERROR_INJECTION
+ struct dentry *inject_error;
+#endif
} dbgfs;
};
@@ -50,6 +59,9 @@ int adf_heartbeat_save_cfg_param(struct adf_accel_dev *accel_dev,
void adf_heartbeat_status(struct adf_accel_dev *accel_dev,
enum adf_device_heartbeat_status *hb_status);
void adf_heartbeat_check_ctrs(struct adf_accel_dev *accel_dev);
+#ifdef QAT_HB_ERROR_INJECTION
+/*
+ * Simulate a heartbeat failure on a randomly selected engine and thread.
+ * Returns 0 on success or a negative error code.
+ */
+int adf_heartbeat_inject_error(struct adf_accel_dev *accel_dev);
+#endif
#else
static inline int adf_heartbeat_init(struct adf_accel_dev *accel_dev)
@@ -155,6 +155,45 @@ static const struct file_operations adf_hb_cfg_fops = {
.write = adf_hb_cfg_write,
};
+#ifdef QAT_HB_ERROR_INJECTION
+/*
+ * debugfs write handler for heartbeat/inject_error.
+ *
+ * Accepts exactly two bytes written at offset 0, the first of which must be
+ * '1' (e.g. the "1\n" produced by `echo 1`), and then triggers heartbeat
+ * error injection on the device stored in file->private_data.
+ *
+ * Returns the number of bytes consumed on success, -EINVAL for malformed
+ * input, -EFAULT if the user buffer could not be copied in full, or the
+ * error reported by adf_heartbeat_inject_error().
+ */
+static ssize_t adf_hb_error_inject_write(struct file *file,
+					 const char __user *user_buf,
+					 size_t count, loff_t *ppos)
+{
+	struct adf_accel_dev *accel_dev = file->private_data;
+	ssize_t written_chars;
+	char buf[3] = { 0 };
+	int ret;
+
+	/*
+	 * Require the whole two-byte command in a single write starting at
+	 * offset 0 so that buf[0] is always the first user-supplied byte.
+	 * The last byte of buf is left as string termination.
+	 */
+	if (*ppos != 0 || count != 2)
+		return -EINVAL;
+
+	written_chars = simple_write_to_buffer(buf, sizeof(buf) - 1,
+					       ppos, user_buf, count);
+	/* Propagate copy failures instead of parsing an unfilled buffer */
+	if (written_chars < 0)
+		return written_chars;
+	if ((size_t)written_chars != count)
+		return -EFAULT;
+
+	if (buf[0] != '1')
+		return -EINVAL;
+
+	ret = adf_heartbeat_inject_error(accel_dev);
+	if (ret) {
+		dev_err(&GET_DEV(accel_dev),
+			"Heartbeat error injection failed with status %d\n",
+			ret);
+		return ret;
+	}
+
+	dev_info(&GET_DEV(accel_dev), "Heartbeat error injection enabled\n");
+
+	return written_chars;
+}
+
+/* File operations backing the write-only heartbeat/inject_error entry */
+static const struct file_operations adf_hb_error_inject_fops = {
+	.write = adf_hb_error_inject_write,
+	.open = simple_open,
+	.owner = THIS_MODULE,
+};
+#endif
+
void adf_heartbeat_dbgfs_add(struct adf_accel_dev *accel_dev)
{
struct adf_heartbeat *hb = accel_dev->heartbeat;
@@ -171,6 +210,11 @@ void adf_heartbeat_dbgfs_add(struct adf_accel_dev *accel_dev)
&hb->hb_failed_counter, &adf_hb_stats_fops);
hb->dbgfs.cfg = debugfs_create_file("config", 0600, hb->dbgfs.base_dir,
accel_dev, &adf_hb_cfg_fops);
+#ifdef QAT_HB_ERROR_INJECTION
+ hb->dbgfs.inject_error = debugfs_create_file("inject_error", 0200,
+ hb->dbgfs.base_dir, accel_dev,
+ &adf_hb_error_inject_fops);
+#endif
}
EXPORT_SYMBOL_GPL(adf_heartbeat_dbgfs_add);
@@ -189,6 +233,10 @@ void adf_heartbeat_dbgfs_rm(struct adf_accel_dev *accel_dev)
hb->dbgfs.failed = NULL;
debugfs_remove(hb->dbgfs.cfg);
hb->dbgfs.cfg = NULL;
+#ifdef QAT_HB_ERROR_INJECTION
+ debugfs_remove(hb->dbgfs.inject_error);
+ hb->dbgfs.inject_error = NULL;
+#endif
debugfs_remove(hb->dbgfs.base_dir);
hb->dbgfs.base_dir = NULL;
}
new file mode 100644
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2023 Intel Corporation */
+#include <linux/random.h>
+
+#include "adf_admin.h"
+#include "adf_common_drv.h"
+#include "adf_heartbeat.h"
+
+#define MAX_HB_TICKS 0xFFFFFFFF
+
+/*
+ * Clear the cached heartbeat timer value, stop the device timer when the
+ * hardware provides a stop_timer hook, and send an admin message that sets
+ * the heartbeat timer to MAX_HB_TICKS.
+ *
+ * Returns the result of adf_send_admin_hb_timer().
+ */
+static int adf_hb_set_timer_to_max(struct adf_accel_dev *accel_dev)
+{
+	struct adf_hw_device_data *hw = accel_dev->hw_device;
+	struct adf_heartbeat *hb = accel_dev->heartbeat;
+
+	hb->hb_timer = 0;
+
+	/* The stop_timer hook is optional */
+	if (hw->stop_timer)
+		hw->stop_timer(accel_dev);
+
+	return adf_send_admin_hb_timer(accel_dev, MAX_HB_TICKS);
+}
+
+/*
+ * Doctor the heartbeat counters of thread @thr on engine @ae in the
+ * heartbeat DMA buffer so that the pair looks stalled.
+ *
+ * The buffer is treated as two consecutive arrays of counter pairs: the
+ * "live" copy first, followed by the "last" copy one full set
+ * (num_aes * num_hb_ctrs entries) later.
+ */
+static void adf_set_hb_counters_fail(struct adf_accel_dev *accel_dev, u32 ae,
+				     u32 thr)
+{
+	struct adf_hw_device_data *hw_device = accel_dev->hw_device;
+	const size_t num_aes = hw_device->get_num_aes(hw_device);
+	const size_t ctrs_per_ae = hw_device->num_hb_ctrs;
+	const size_t idx = ae * ctrs_per_ae + thr;
+	struct hb_cnt_pair *live = accel_dev->heartbeat->dma.virt_addr;
+	struct hb_cnt_pair *last = live + num_aes * ctrs_per_ae;
+	u16 live_rsp = live[idx].resp_heartbeat_cnt;
+
+	/*
+	 * Make live.req != live.rsp while live.rsp == last.rsp, which is the
+	 * condition the heartbeat error detection looks for.
+	 */
+	live[idx].req_heartbeat_cnt++;
+	last[idx].resp_heartbeat_cnt = live_rsp;
+}
+
+/*
+ * Inject a heartbeat failure on a randomly chosen engine and thread.
+ *
+ * The sequence is: raise the heartbeat timer to its maximum, disable
+ * arbitration for the chosen engine/thread, then rewrite the heartbeat
+ * counters in memory so that the next check sees a stalled thread.
+ *
+ * Returns 0 on success, -EINVAL when the device reports no engines or no
+ * heartbeat counters, or the error returned by one of the steps above.
+ * A failure in a later step does not undo the earlier ones.
+ */
+int adf_heartbeat_inject_error(struct adf_accel_dev *accel_dev)
+{
+	struct adf_hw_device_data *hw_device = accel_dev->hw_device;
+	const size_t max_aes = hw_device->get_num_aes(hw_device);
+	const size_t hb_ctrs = hw_device->num_hb_ctrs;
+	u32 rand, rand_ae, rand_thr;
+	unsigned long ae_mask;
+	int ret;
+
+	/* Both values are used as modulo divisors below */
+	if (!max_aes || !hb_ctrs)
+		return -EINVAL;
+
+	ae_mask = hw_device->ae_mask;
+
+	/*
+	 * NOTE(review): this loop only terminates if ae_mask has at least one
+	 * bit set below max_aes - confirm against the get_num_aes()
+	 * implementations.
+	 */
+	do {
+		/* Ensure we have a valid ae */
+		get_random_bytes(&rand, sizeof(rand));
+		rand_ae = rand % max_aes;
+	} while (!test_bit(rand_ae, &ae_mask));
+
+	get_random_bytes(&rand, sizeof(rand));
+	rand_thr = rand % hb_ctrs;
+
+	/* Increase the heartbeat timer to prevent FW updating HB counters */
+	ret = adf_hb_set_timer_to_max(accel_dev);
+	if (ret)
+		return ret;
+
+	/* Configure worker threads to stop processing any packet */
+	ret = adf_disable_arb_thd(accel_dev, rand_ae, rand_thr);
+	if (ret)
+		return ret;
+
+	/* Change HB counters memory to simulate a hang */
+	adf_set_hb_counters_fail(accel_dev, rand_ae, rand_thr);
+
+	return 0;
+}
@@ -103,3 +103,37 @@ void adf_exit_arb(struct adf_accel_dev *accel_dev)
csr_ops->write_csr_ring_srv_arb_en(csr, i, 0);
}
EXPORT_SYMBOL_GPL(adf_exit_arb);
+
+#ifdef QAT_HB_ERROR_INJECTION
+/* Function wrapper around the WRITE_CSR_ARB_WT2SAM() register-write macro */
+static void adf_write_arb_wt2sam(void __iomem *base, u32 arb_off,
+				 u32 wt2sam_off, size_t idx, u32 val)
+{
+	WRITE_CSR_ARB_WT2SAM(base, arb_off, wt2sam_off, idx, val);
+}
+
+/*
+ * Disable arbiter scheduling for one thread of an acceleration engine.
+ *
+ * Takes the arbiter mapping word of engine @ae, clears the 4-bit field that
+ * belongs to thread @thr and writes the result to the WT2SAM CSR of that
+ * engine through bank 0. The strand thread ids are redirected to the admin
+ * thread id before the field is selected.
+ *
+ * Returns 0 on success, -EINVAL if @thr has no field in the 32-bit mapping
+ * word, or -EFAULT if the device provides no arbiter mapping.
+ */
+int adf_disable_arb_thd(struct adf_accel_dev *accel_dev, u32 ae, u32 thr)
+{
+	void __iomem *csr = accel_dev->transport->banks[0].csr_addr;
+	struct adf_hw_device_data *hw_data = accel_dev->hw_device;
+	const u32 *thd_2_arb_cfg;
+	struct arb_info info;
+	u32 ae_thr_map;
+
+	if (thr == ADF_AE_STRAND0_THREAD || thr == ADF_AE_STRAND1_THREAD)
+		thr = ADF_AE_ADMIN_THREAD;
+
+	/*
+	 * One nibble per thread, two nibbles per byte of the mapping word.
+	 * A larger thread id would shift past the width of the word.
+	 */
+	if (thr >= sizeof(ae_thr_map) * 2)
+		return -EINVAL;
+
+	hw_data->get_arb_info(&info);
+	thd_2_arb_cfg = hw_data->get_arb_mapping(accel_dev);
+	if (!thd_2_arb_cfg)
+		return -EFAULT;
+
+	/*
+	 * Disable scheduling for this particular AE and thread. The mask is
+	 * unsigned so that selecting the top nibble does not overflow a
+	 * signed int.
+	 */
+	ae_thr_map = thd_2_arb_cfg[ae];
+	ae_thr_map &= ~(0x0FU << (thr * 4));
+
+	adf_write_arb_wt2sam(csr, info.arb_offset, info.wt2sam_offset, ae,
+			     ae_thr_map);
+	return 0;
+}
+#endif