mbox series

[00/10] qla2xxx driver bug fixes

Message ID 20210908072846.10011-1-njavali@marvell.com
Headers show
Series qla2xxx driver bug fixes | expand

Message

Nilesh Javali Sept. 8, 2021, 7:28 a.m. UTC
Martin,

Please apply the qla2xxx driver bug fixes to the scsi tree at your
earliest convenience.

Thanks,
Nilesh

Arun Easi (2):
  qla2xxx: Fix crash in NVME abort path
  qla2xxx: Fix kernel crash when accessing port_speed sysfs file

Bikash Hazarika (1):
  qla2xxx: Add support for mailbox passthru

Manish Rangankar (1):
  qla2xxx: Move heart beat handling from dpc thread to workqueue

Nilesh Javali (1):
  qla2xxx: Update version to 10.02.07.100-k

Quinn Tran (2):
  qla2xxx: edif: Use link event to wake up app
  qla2xxx: Fix use after free in eh_abort path

Saurav Kashyap (2):
  qla2xxx: Display 16G only as supported speeds for 3830c card
  qla2xxx: Check for firmware capability before creating QPair

Shreyas Deodhar (1):
  qla2xxx: Call process_response_queue() in Tx path

 drivers/scsi/qla2xxx/qla_attr.c    | 24 ++++++++-
 drivers/scsi/qla2xxx/qla_bsg.c     | 48 +++++++++++++++++
 drivers/scsi/qla2xxx/qla_bsg.h     |  7 +++
 drivers/scsi/qla2xxx/qla_def.h     |  4 +-
 drivers/scsi/qla2xxx/qla_gbl.h     |  4 ++
 drivers/scsi/qla2xxx/qla_gs.c      |  3 +-
 drivers/scsi/qla2xxx/qla_init.c    | 17 +++---
 drivers/scsi/qla2xxx/qla_mbx.c     | 33 ++++++++++++
 drivers/scsi/qla2xxx/qla_nvme.c    | 20 ++++++-
 drivers/scsi/qla2xxx/qla_os.c      | 86 +++++++++++++++---------------
 drivers/scsi/qla2xxx/qla_version.h |  6 +--
 11 files changed, 191 insertions(+), 61 deletions(-)


base-commit: 9b5ac8ab4e8bf5636d1d425aee68ddf45af12057

Comments

Nilesh Javali Sept. 8, 2021, 9:12 a.m. UTC | #1
Chaitanya,

> -----Original Message-----

> From: Chaitanya Kulkarni <chaitanyak@nvidia.com>

> Sent: Wednesday, September 8, 2021 1:28 PM

> To: Nilesh Javali <njavali@marvell.com>; martin.petersen@oracle.com; linux-

> nvme@lists.infradead.org

> Cc: linux-scsi@vger.kernel.org; GR-QLogic-Storage-Upstream <GR-QLogic-

> Storage-Upstream@marvell.com>; djeffery@redhat.com;

> loberman@redhat.com

> Subject: [EXT] Re: [PATCH 00/10] qla2xxx driver bug fixes

> 

> External Email

> 

> ----------------------------------------------------------------------

> On 9/8/21 12:28 AM, Nilesh Javali wrote:> Martin,

>  >

>  > Please apply the qla2xxx driver bug fixes to the scsi tree at your

>  > earliest convenience.

>  >

>  > Thanks,

>  > Nilesh

>  >

> 

> why linux-nvme ?


Please ignore. This is meant for Linux-scsi only.
I apologize for mistakenly sending it to Linux-nvme.

--
Nilesh
Himanshu Madhani Sept. 8, 2021, 1:58 p.m. UTC | #2
> On Sep 8, 2021, at 2:28 AM, Nilesh Javali <njavali@marvell.com> wrote:

> 

> From: Arun Easi <aeasi@marvell.com>

> 

> System crash was seen when I/O was run against a NVME target and when I/O

> aborts were occurring.

> 

> Crash stack is:

> 

>    -- relevant crash stack --

>    BUG: kernel NULL pointer dereference, address: 0000000000000010

>    :

>    #6 [ffffae1f8666bdd0] page_fault at ffffffffa740122e

>       [exception RIP: qla_nvme_abort_work+339]

>       RIP: ffffffffc0f592e3  RSP: ffffae1f8666be80  RFLAGS: 00010297

>       RAX: 0000000000000000  RBX: ffff9b581fc8af80  RCX: ffffffffc0f83bd0

>       RDX: 0000000000000001  RSI: ffff9b5839c6c7c8  RDI: 0000000008000000

>       RBP: ffff9b6832f85000   R8: ffffffffc0f68160   R9: ffffffffc0f70652

>       R10: ffffae1f862ffdc8  R11: 0000000000000300  R12: 000000000000010d

>       R13: 0000000000000000  R14: ffff9b5839cea000  R15: 0ffff9b583fab170

>       ORIG_RAX: ffffffffffffffff   CS: 0010  SS: 0018

>    #7 [ffffae1f8666be98] process_one_work at ffffffffa6aba184

>    #8 [ffffae1f8666bed8] worker_thread at ffffffffa6aba39d

>    #9 [ffffae1f8666bf10] kthread at ffffffffa6ac06ed

> 

> The crash was due to a stale SRB structure access after it was aborted.

> Fixed the issue by removing stale access.

> 


Add following 

Fixes: 2cabf10dbbe38 (“scsi: qla2xxx: Fix hang on NVMe command timeouts ”)
Cc: stable@vger.kernel.org

> Signed-off-by: Arun Easi <aeasi@marvell.com>

> Signed-off-by: Nilesh Javali <njavali@marvell.com>

> ---

> drivers/scsi/qla2xxx/qla_nvme.c | 14 ++++++++++++--

> 1 file changed, 12 insertions(+), 2 deletions(-)

> 

> diff --git a/drivers/scsi/qla2xxx/qla_nvme.c b/drivers/scsi/qla2xxx/qla_nvme.c

> index 1c5da2dbd6f9..877b2b625020 100644

> --- a/drivers/scsi/qla2xxx/qla_nvme.c

> +++ b/drivers/scsi/qla2xxx/qla_nvme.c

> @@ -228,6 +228,8 @@ static void qla_nvme_abort_work(struct work_struct *work)

> 	fc_port_t *fcport = sp->fcport;

> 	struct qla_hw_data *ha = fcport->vha->hw;

> 	int rval, abts_done_called = 1;

> +	bool io_wait_for_abort_done;

> +	uint32_t handle;

> 

> 	ql_dbg(ql_dbg_io, fcport->vha, 0xffff,

> 	       "%s called for sp=%p, hndl=%x on fcport=%p desc=%p deleted=%d\n",

> @@ -244,12 +246,20 @@ static void qla_nvme_abort_work(struct work_struct *work)

> 		goto out;

> 	}

> 

> +	/*

> +	 * sp may not be valid after abort_command if return code is either

> +	 * SUCCESS or ERR_FROM_FW codes, so cache the value here.

> +	 */

> +	io_wait_for_abort_done = ql2xabts_wait_nvme &&

> +					QLA_ABTS_WAIT_ENABLED(sp);

> +	handle = sp->handle;

> +

> 	rval = ha->isp_ops->abort_command(sp);

> 

> 	ql_dbg(ql_dbg_io, fcport->vha, 0x212b,

> 	    "%s: %s command for sp=%p, handle=%x on fcport=%p rval=%x\n",

> 	    __func__, (rval != QLA_SUCCESS) ? "Failed to abort" : "Aborted",

> -	    sp, sp->handle, fcport, rval);

> +	    sp, handle, fcport, rval);

> 

> 	/*

> 	 * If async tmf is enabled, the abort callback is called only on

> @@ -264,7 +274,7 @@ static void qla_nvme_abort_work(struct work_struct *work)

> 	 * are waited until ABTS complete. This kref is decreased

> 	 * at qla24xx_abort_sp_done function.

> 	 */

> -	if (abts_done_called && ql2xabts_wait_nvme && QLA_ABTS_WAIT_ENABLED(sp))

> +	if (abts_done_called && io_wait_for_abort_done)

> 		return;

> out:

> 	/* kref_get was done before work was schedule. */

> -- 

> 2.19.0.rc0

> 


Otherwise

Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>


--
Himanshu Madhani	 Oracle Linux Engineering
Himanshu Madhani Sept. 8, 2021, 2:14 p.m. UTC | #3
> On Sep 8, 2021, at 2:28 AM, Nilesh Javali <njavali@marvell.com> wrote:

> 

> From: Manish Rangankar <mrangankar@marvell.com>

> 

> DPC thread gets restricted due to a no-op mailbox, which is a blocking call

> and has a high execution frequency. To free up the DPC thread we move no-op

> handling to the workqueue. Also, modified qla_do_hb to send no-op MBC if

> we don’t have any active interrupts, but there are still IOs outstanding

> with firmware.

> 

> Fixes: d94d8158e184 ("scsi: qla2xxx: Add heartbeat check")


Cc: stable@vger.kernel.org

> Signed-off-by: Manish Rangankar <mrangankar@marvell.com>

> Signed-off-by: Nilesh Javali <njavali@marvell.com>

> ---

> drivers/scsi/qla2xxx/qla_def.h  |  4 +-

> drivers/scsi/qla2xxx/qla_init.c |  2 +

> drivers/scsi/qla2xxx/qla_os.c   | 74 +++++++++++++++------------------

> 3 files changed, 38 insertions(+), 42 deletions(-)

> 

> diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h

> index be2eb75ee1a3..d6e131bf1824 100644

> --- a/drivers/scsi/qla2xxx/qla_def.h

> +++ b/drivers/scsi/qla2xxx/qla_def.h

> @@ -3750,6 +3750,7 @@ struct qla_qpair {

> 	struct qla_fw_resources fwres ____cacheline_aligned;

> 	u32	cmd_cnt;

> 	u32	cmd_completion_cnt;

> +	u32	prev_completion_cnt;

> };

> 

> /* Place holder for FW buffer parameters */

> @@ -4607,6 +4608,7 @@ struct qla_hw_data {

> 	struct qla_chip_state_84xx *cs84xx;

> 	struct isp_operations *isp_ops;

> 	struct workqueue_struct *wq;

> +	struct work_struct hb_work;


Use heartbeat_work (which is not awfully long and reads better.

> 	struct qlfc_fw fw_buf;

> 

> 	/* FCP_CMND priority support */

> @@ -4708,7 +4710,6 @@ struct qla_hw_data {

> 

> 	struct qla_hw_data_stat stat;

> 	pci_error_state_t pci_error_state;

> -	u64 prev_cmd_cnt;

> 	struct dma_pool *purex_dma_pool;

> 	struct btree_head32 host_map;

> 

> @@ -4854,7 +4855,6 @@ typedef struct scsi_qla_host {

> #define SET_ZIO_THRESHOLD_NEEDED 32

> #define ISP_ABORT_TO_ROM	33

> #define VPORT_DELETE		34

> -#define HEARTBEAT_CHK		38

> 

> #define PROCESS_PUREX_IOCB	63

> 

> diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c

> index c6b3d0e7489e..a9a4243cb15a 100644

> --- a/drivers/scsi/qla2xxx/qla_init.c

> +++ b/drivers/scsi/qla2xxx/qla_init.c

> @@ -7025,12 +7025,14 @@ qla2x00_abort_isp_cleanup(scsi_qla_host_t *vha)

> 	ha->chip_reset++;

> 	ha->base_qpair->chip_reset = ha->chip_reset;

> 	ha->base_qpair->cmd_cnt = ha->base_qpair->cmd_completion_cnt = 0;

> +	ha->base_qpair->prev_completion_cnt = 0;

> 	for (i = 0; i < ha->max_qpairs; i++) {

> 		if (ha->queue_pair_map[i]) {

> 			ha->queue_pair_map[i]->chip_reset =

> 				ha->base_qpair->chip_reset;

> 			ha->queue_pair_map[i]->cmd_cnt =

> 			    ha->queue_pair_map[i]->cmd_completion_cnt = 0;

> +			ha->base_qpair->prev_completion_cnt = 0;

> 		}

> 	}

> 

> diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c

> index a1e861ecfc01..0454f79a8047 100644

> --- a/drivers/scsi/qla2xxx/qla_os.c

> +++ b/drivers/scsi/qla2xxx/qla_os.c

> @@ -2794,6 +2794,16 @@ qla2xxx_scan_finished(struct Scsi_Host *shost, unsigned long time)

> 	return atomic_read(&vha->loop_state) == LOOP_READY;

> }

> 

> +static void qla_hb_work_fn(struct work_struct *work)


              ^^^^^^^^^^^^^^
Use qla_heartbeat_work_fn for better readability 

> +{

> +	struct qla_hw_data *ha = container_of(work,

> +		struct qla_hw_data, hb_work);

> +	struct scsi_qla_host *base_vha = pci_get_drvdata(ha->pdev);

> +

> +	if (!ha->flags.mbox_busy && base_vha->flags.init_done)

> +		qla_no_op_mb(base_vha);

> +}

> +

> static void qla2x00_iocb_work_fn(struct work_struct *work)

> {

> 	struct scsi_qla_host *vha = container_of(work,

> @@ -3232,6 +3242,7 @@ qla2x00_probe_one(struct pci_dev *pdev, const struct pci_device_id *id)

> 	    host->transportt, sht->vendor_id);

> 

> 	INIT_WORK(&base_vha->iocb_work, qla2x00_iocb_work_fn);

> +	INIT_WORK(&ha->hb_work, qla_hb_work_fn);

> 

> 	/* Set up the irqs */

> 	ret = qla2x00_request_irqs(ha, rsp);

> @@ -7118,17 +7129,6 @@ qla2x00_do_dpc(void *data)

> 			qla2x00_lip_reset(base_vha);

> 		}

> 

> -		if (test_bit(HEARTBEAT_CHK, &base_vha->dpc_flags)) {

> -			/*

> -			 * if there is a mb in progress then that's

> -			 * enough of a check to see if fw is still ticking.

> -			 */

> -			if (!ha->flags.mbox_busy && base_vha->flags.init_done)

> -				qla_no_op_mb(base_vha);

> -

> -			clear_bit(HEARTBEAT_CHK, &base_vha->dpc_flags);

> -		}

> -

> 		ha->dpc_active = 0;

> end_loop:

> 		set_current_state(TASK_INTERRUPTIBLE);

> @@ -7187,57 +7187,51 @@ qla2x00_rst_aen(scsi_qla_host_t *vha)

> 

> static bool qla_do_heartbeat(struct scsi_qla_host *vha)

> {

> -	u64 cmd_cnt, prev_cmd_cnt;

> -	bool do_hb = false;

> 	struct qla_hw_data *ha = vha->hw;

> -	int i;

> +	u32 cmpl_cnt;

> +	u16 i;

> +	bool do_hb = false;

> 


use do_heartbeat which is self-explanatory and reads better

> -	/* if cmds are still pending down in fw, then do hb */

> -	if (ha->base_qpair->cmd_cnt != ha->base_qpair->cmd_completion_cnt) {

> +	/*

> +	 * Allow do_hb only if we don’t have any active interrupts,

> +	 * but there are still IOs outstanding with firmware.

> +	 */

> +	cmpl_cnt = ha->base_qpair->cmd_completion_cnt;

> +	if (cmpl_cnt == ha->base_qpair->prev_completion_cnt &&

> +	    cmpl_cnt != ha->base_qpair->cmd_cnt) {

> 		do_hb = true;

> 		goto skip;

> 	}

> +	ha->base_qpair->prev_completion_cnt = cmpl_cnt;

> 

> 	for (i = 0; i < ha->max_qpairs; i++) {

> -		if (ha->queue_pair_map[i] &&

> -		    ha->queue_pair_map[i]->cmd_cnt !=

> -		    ha->queue_pair_map[i]->cmd_completion_cnt) {

> -			do_hb = true;

> -			break;

> +		if (ha->queue_pair_map[i]) {

> +			cmpl_cnt = ha->queue_pair_map[i]->cmd_completion_cnt;

> +			if (cmpl_cnt == ha->queue_pair_map[i]->prev_completion_cnt &&

> +			    cmpl_cnt != ha->queue_pair_map[i]->cmd_cnt) {

> +				do_hb = true;

> +				break;

> +			}

> +			ha->queue_pair_map[i]->prev_completion_cnt = cmpl_cnt;

> 		}

> 	}

> 

> skip:

> -	prev_cmd_cnt = ha->prev_cmd_cnt;

> -	cmd_cnt = ha->base_qpair->cmd_cnt;

> -	for (i = 0; i < ha->max_qpairs; i++) {

> -		if (ha->queue_pair_map[i])

> -			cmd_cnt += ha->queue_pair_map[i]->cmd_cnt;

> -	}

> -	ha->prev_cmd_cnt = cmd_cnt;

> -

> -	if (!do_hb && ((cmd_cnt - prev_cmd_cnt) > 50))

> -		/*

> -		 * IOs are completing before periodic hb check.

> -		 * IOs seems to be running, do hb for sanity check.

> -		 */

> -		do_hb = true;

> -

> 	return do_hb;

> }

> 

> static void qla_heart_beat(struct scsi_qla_host *vha)

> {

> +	struct qla_hw_data *ha = vha->hw;

> +

> 	if (vha->vp_idx)

> 		return;

> 

> 	if (vha->hw->flags.eeh_busy || qla2x00_chip_is_down(vha))

> 		return;

> 

> -	if (qla_do_heartbeat(vha)) {

> -		set_bit(HEARTBEAT_CHK, &vha->dpc_flags);

> -		qla2xxx_wake_dpc(vha);

> -	}

> +	if (qla_do_heartbeat(vha))

> +		queue_work(ha->wq, &ha->hb_work);

> }

> 

> /**************************************************************************

> -- 

> 2.19.0.rc0

> 


--
Himanshu Madhani	 Oracle Linux Engineering