diff mbox series

[7/8] qedi: Add firmware error recovery invocation support.

Message ID 20200908052412.21917-8-mrangankar@marvell.com
State New
Headers show
Series qedi: Misc bug fixes and enhancements | expand

Commit Message

Manish Rangankar Sept. 8, 2020, 5:24 a.m. UTC
Add support to initiate MFW process recovery for all
the devices if storage function receive the event first.

Signed-off-by: Manish Rangankar <mrangankar@marvell.com>
---
 drivers/scsi/qedi/qedi.h       |  4 +++
 drivers/scsi/qedi/qedi_fw.c    |  7 ++--
 drivers/scsi/qedi/qedi_iscsi.c |  3 +-
 drivers/scsi/qedi/qedi_main.c  | 61 +++++++++++++++++++++++++++++++++-
 4 files changed, 71 insertions(+), 4 deletions(-)
diff mbox series

Patch

diff --git a/drivers/scsi/qedi/qedi.h b/drivers/scsi/qedi/qedi.h
index 9c19ec9dc682..7e59d50f2fab 100644
--- a/drivers/scsi/qedi/qedi.h
+++ b/drivers/scsi/qedi/qedi.h
@@ -274,6 +274,10 @@  struct qedi_ctx {
 	spinlock_t ll2_lock;	/* Light L2 lock */
 	spinlock_t hba_lock;	/* per port lock */
 	struct task_struct *ll2_recv_thread;
+	unsigned long qedi_err_flags;
+#define QEDI_ERR_ATTN_CLR_EN	0
+#define QEDI_ERR_IS_RECOVERABLE	2
+#define QEDI_ERR_OVERRIDE_EN	31
 	unsigned long flags;
 #define UIO_DEV_OPENED		1
 #define QEDI_IOTHREAD_WAKE	2
diff --git a/drivers/scsi/qedi/qedi_fw.c b/drivers/scsi/qedi/qedi_fw.c
index f158fde0a43c..440ddd2309f1 100644
--- a/drivers/scsi/qedi/qedi_fw.c
+++ b/drivers/scsi/qedi/qedi_fw.c
@@ -1267,7 +1267,8 @@  int qedi_cleanup_all_io(struct qedi_ctx *qedi, struct qedi_conn *qedi_conn,
 	rval  = wait_event_interruptible_timeout(qedi_conn->wait_queue,
 						 ((qedi_conn->cmd_cleanup_req ==
 						 qedi_conn->cmd_cleanup_cmpl) ||
-						 qedi_conn->ep),
+						 test_bit(QEDI_IN_RECOVERY,
+							  &qedi->flags)),
 						 5 * HZ);
 	if (rval) {
 		QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_SCSI_TM,
@@ -1292,7 +1293,9 @@  int qedi_cleanup_all_io(struct qedi_ctx *qedi, struct qedi_conn *qedi_conn,
 	/* Enable IOs for all other sessions except current.*/
 	if (!wait_event_interruptible_timeout(qedi_conn->wait_queue,
 					      (qedi_conn->cmd_cleanup_req ==
-					       qedi_conn->cmd_cleanup_cmpl),
+					       qedi_conn->cmd_cleanup_cmpl) ||
+					       test_bit(QEDI_IN_RECOVERY,
+							&qedi->flags),
 					      5 * HZ)) {
 		iscsi_host_for_each_session(qedi->shost,
 					    qedi_mark_device_available);
diff --git a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c
index ae86a40ca040..08c05403cd72 100644
--- a/drivers/scsi/qedi/qedi_iscsi.c
+++ b/drivers/scsi/qedi/qedi_iscsi.c
@@ -1072,7 +1072,8 @@  static void qedi_ep_disconnect(struct iscsi_endpoint *ep)
 
 	qedi_ep->state = EP_STATE_DISCONN_START;
 
-	if (test_bit(QEDI_IN_SHUTDOWN, &qedi->flags))
+	if (test_bit(QEDI_IN_SHUTDOWN, &qedi->flags) ||
+	    test_bit(QEDI_IN_RECOVERY, &qedi->flags))
 		goto ep_release_conn;
 
 	ret = qedi_ops->destroy_conn(qedi->cdev, qedi_ep->handle, abrt_conn);
diff --git a/drivers/scsi/qedi/qedi_main.c b/drivers/scsi/qedi/qedi_main.c
index 4f43e2a24b50..28b5d6670580 100644
--- a/drivers/scsi/qedi/qedi_main.c
+++ b/drivers/scsi/qedi/qedi_main.c
@@ -50,6 +50,10 @@  module_param(qedi_ll2_buf_size, uint, 0644);
 MODULE_PARM_DESC(qedi_ll2_buf_size,
 		 "parameter to set ping packet size, default - 0x400, Jumbo packets - 0x2400.");
 
+static uint qedi_flags_override;
+module_param(qedi_flags_override, uint, 0644);
+MODULE_PARM_DESC(qedi_flags_override, "Disable/Enable MFW error flags bits action.");
+
 const struct qed_iscsi_ops *qedi_ops;
 static struct scsi_transport_template *qedi_scsi_transport;
 static struct pci_driver qedi_pci_driver;
@@ -1113,6 +1117,39 @@  static void qedi_get_protocol_tlv_data(void *dev, void *data)
 	return;
 }
 
+void qedi_schedule_hw_err_handler(void *dev,
+				  enum qed_hw_err_type err_type)
+{
+	struct qedi_ctx *qedi = (struct qedi_ctx *)dev;
+	unsigned long override_flags = qedi_flags_override;
+
+	if (override_flags && test_bit(QEDI_ERR_OVERRIDE_EN, &override_flags))
+		qedi->qedi_err_flags = qedi_flags_override;
+
+	QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_INFO,
+		  "HW error handler scheduled, err=%d err_flags=0x%x\n",
+		  err_type, qedi->qedi_err_flags);
+
+	switch (err_type) {
+	case QED_HW_ERR_MFW_RESP_FAIL:
+	case QED_HW_ERR_HW_ATTN:
+	case QED_HW_ERR_DMAE_FAIL:
+	case QED_HW_ERR_RAMROD_FAIL:
+	case QED_HW_ERR_FW_ASSERT:
+		/* Prevent HW attentions from being reasserted */
+		if (test_bit(QEDI_ERR_ATTN_CLR_EN, &qedi->qedi_err_flags))
+			qedi_ops->common->attn_clr_enable(qedi->cdev, true);
+
+		if (err_type == QED_HW_ERR_RAMROD_FAIL &&
+		    test_bit(QEDI_ERR_IS_RECOVERABLE, &qedi->qedi_err_flags))
+			qedi_ops->common->recovery_process(qedi->cdev);
+
+		break;
+	default:
+		break;
+	}
+}
+
 static void qedi_schedule_recovery_handler(void *dev)
 {
 	struct qedi_ctx *qedi = dev;
@@ -1155,6 +1192,7 @@  static struct qed_iscsi_cb_ops qedi_cb_ops = {
 	{
 		.link_update =		qedi_link_update,
 		.schedule_recovery_handler = qedi_schedule_recovery_handler,
+		.schedule_hw_err_handler = qedi_schedule_hw_err_handler,
 		.get_protocol_tlv_data = qedi_get_protocol_tlv_data,
 		.get_generic_tlv_data = qedi_get_generic_tlv_data,
 	}
@@ -2355,6 +2393,7 @@  static void __qedi_remove(struct pci_dev *pdev, int mode)
 {
 	struct qedi_ctx *qedi = pci_get_drvdata(pdev);
 	int rval;
+	u16 retry = 10;
 
 	if (mode == QEDI_MODE_SHUTDOWN)
 		iscsi_host_for_each_session(qedi->shost,
@@ -2383,7 +2422,13 @@  static void __qedi_remove(struct pci_dev *pdev, int mode)
 	qedi_sync_free_irqs(qedi);
 
 	if (!test_bit(QEDI_IN_OFFLINE, &qedi->flags)) {
-		qedi_ops->stop(qedi->cdev);
+		while (retry--) {
+			rval = qedi_ops->stop(qedi->cdev);
+			if (rval < 0)
+				msleep(1000);
+			else
+				break;
+		}
 		qedi_ops->ll2->stop(qedi->cdev);
 	}
 
@@ -2442,6 +2487,7 @@  static int __qedi_probe(struct pci_dev *pdev, int mode)
 	struct qed_probe_params qed_params;
 	void *task_start, *task_end;
 	int rc;
+	u16 retry = 10;
 
 	if (mode != QEDI_MODE_RECOVERY) {
 		qedi = qedi_host_alloc(pdev);
@@ -2453,6 +2499,10 @@  static int __qedi_probe(struct pci_dev *pdev, int mode)
 		qedi = pci_get_drvdata(pdev);
 	}
 
+retry_probe:
+	if (mode == QEDI_MODE_RECOVERY)
+		msleep(2000);
+
 	memset(&qed_params, 0, sizeof(qed_params));
 	qed_params.protocol = QED_PROTOCOL_ISCSI;
 	qed_params.dp_module = qedi_qed_debug;
@@ -2460,11 +2510,20 @@  static int __qedi_probe(struct pci_dev *pdev, int mode)
 	qed_params.is_vf = is_vf;
 	qedi->cdev = qedi_ops->common->probe(pdev, &qed_params);
 	if (!qedi->cdev) {
+		if (mode == QEDI_MODE_RECOVERY && retry) {
+			QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_INFO,
+				  "Retry %d initialize hardware\n", retry);
+			retry--;
+			goto retry_probe;
+		}
+
 		rc = -ENODEV;
 		QEDI_ERR(&qedi->dbg_ctx, "Cannot initialize hardware\n");
 		goto free_host;
 	}
 
+	set_bit(QEDI_ERR_ATTN_CLR_EN, &qedi->qedi_err_flags);
+	set_bit(QEDI_ERR_IS_RECOVERABLE, &qedi->qedi_err_flags);
 	atomic_set(&qedi->link_state, QEDI_LINK_DOWN);
 
 	rc = qedi_ops->fill_dev_info(qedi->cdev, &qedi->dev_info);