[v3] scsi_debug: add new defer_type for mq_poll

Message ID 20210209225624.108341-1-dgilbert@interlog.com
State New

Commit Message

Douglas Gilbert Feb. 9, 2021, 10:56 p.m. UTC
Add a new sdeb_defer_type enumeration: SDEB_DEFER_POLL for requests
that have REQ_HIPRI set in the cmd_flags field. It is expected that
these requests will be polled via the mq_poll entry point, which
is driven by calls to blk_poll() in the block layer. Therefore
timer events are not 'wired up' in the normal fashion.

There are still cases with short delays (e.g. < 10 microseconds)
where, by the time the command response processing occurs, the delay
is already exceeded, in which case the code calls scsi_done()
directly. In such cases there is no window for mq_poll() to be
called.
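
For illustration, the inline-completion case amounts to something like
the following sketch (illustrative only, not the exact patch code;
names are taken from the driver):

        /* Sketch: if the requested delay has already elapsed by the
         * time the response is built, complete the command here;
         * nothing is left for mq_poll() to reap. */
        if (ktime_compare(ns_to_ktime(ns_from_boot), sd_dp->cmpl_ts) >= 0)
                cmnd->scsi_done(cmnd);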

Add an 'mq_polls' counter that increments on each scsi_done() call
made via the mq_poll entry point. It can be used to show (with 'cat
/proc/scsi/scsi_debug/<host_id>') that blk_poll() is causing
completions rather than some other mechanism.


This patch is against the 5.12/scsi-staging branch which includes
   a98d6bdf181eb71bf4686666eaf47c642a061642
   scsi_debug : iouring iopoll support
which it alters. So this patch is a fix of that patch.

Changes since version 2 [sent 20210206 to linux-scsi list]
  - the sdebug_blk_mq_poll() callback didn't cope with the
    uncommon case where sqcp->sd_dp is NULL. Fix.

Changes since version 1 [sent 20210201 to linux-scsi list]
  - harden SDEB_DEFER_POLL which broke under testing
  - add mq_polls counter for debug output

Signed-off-by: Douglas Gilbert <dgilbert@interlog.com>
---
 drivers/scsi/scsi_debug.c | 141 +++++++++++++++++++++++++-------------
 1 file changed, 94 insertions(+), 47 deletions(-)

Comments

Kashyap Desai Feb. 10, 2021, 3:08 a.m. UTC | #1
On Wed, Feb 10, 2021 at 4:26 AM Douglas Gilbert <dgilbert@interlog.com> wrote:
>
> Add a new sdeb_defer_type enumeration: SDEB_DEFER_POLL for requests
> that have REQ_HIPRI set in the cmd_flags field. It is expected that
> these requests will be polled via the mq_poll entry point, which
> is driven by calls to blk_poll() in the block layer. Therefore
> timer events are not 'wired up' in the normal fashion.
[...]
> Changes since version 2 [sent 20210206 to linux-scsi list]
>   - the sdebug_blk_mq_poll() callback didn't cope with the
>     uncommon case where sqcp->sd_dp is NULL. Fix.


Hi Doug, I tried this patch on top of the below io_uring patch series -
"[v3,1/4] add io_uring with IOPOLL support in scsi layer"

After applying the patch, I am seeing an IO hang - io_wqe_worker
goes into a tight loop.

18210 root      20   0 1316348   3552   2376 R  99.1   0.0   0:24.09 fio
18303 root      20   0       0      0      0 D  78.8   0.0   0:01.75 io_wqe_worker-0
18219 root      20   0       0      0      0 R  71.7   0.0   0:06.59 io_wqe_worker-0


I used the below command -
insmod drivers/scsi/scsi_debug.ko dev_size_mb=1024 sector_size=512
add_host=24 per_host_store=1 ndelay=10000 host_max_queue=32
submit_queues=76 num_parts=1 poll_queues=8

and here is my fio script -
[global]
ioengine=io_uring
hipri=1
direct=1
runtime=30s
rw=randread
norandommap
bs=4k
iodepth=64

[seqprecon]
filename=/dev/sdd
[seqprecon]
filename=/dev/sde
[seqprecon]
filename=/dev/sdf


I will ask Martin to pick all the patches from "[v3,1/4] add io_uring
with IOPOLL support in the scsi layer" except scsi_debug. We can work
on scsi_debug and send a standalone patch.

Kashyap




Douglas Gilbert Feb. 10, 2021, 5:31 p.m. UTC | #2
On 2021-02-09 10:08 p.m., Kashyap Desai wrote:
> Hi Doug, I tried this patch on top of the below io_uring patch series -
> "[v3,1/4] add io_uring with IOPOLL support in scsi layer"
>
> After applying the patch, I am seeing an IO hang - io_wqe_worker
> goes into a tight loop.
>
> I used the below command -
> insmod drivers/scsi/scsi_debug.ko dev_size_mb=1024 sector_size=512
> add_host=24 per_host_store=1 ndelay=10000 host_max_queue=32
> submit_queues=76 num_parts=1 poll_queues=8
>
> and here is my fio script -
> [global]
> ioengine=io_uring
> hipri=1
> direct=1
> runtime=30s
> rw=randread
> norandommap
> bs=4k
> iodepth=64
>
> [seqprecon]
> filename=/dev/sdd
> [seqprecon]
> filename=/dev/sde
> [seqprecon]
> filename=/dev/sdf
>
> I will ask Martin to pick all the patches from "[v3,1/4] add io_uring
> with IOPOLL support in the scsi layer" except scsi_debug. We can work
> on scsi_debug and send a standalone patch.


Hi Kashyap,
Yes, you are correct, that script certainly fails when sdd, sde
and sdf are scsi_debug devices.


On closer examination, IMO there is a fundamental misunderstanding about
iopoll on one of our parts. I am assuming that all these things are
closely coupled:
    1) a request with REQ_HIPRI set in its cmd_flags
    2) the associated LLD receiving mq_poll callbacks
    3) _something_ in the upper layers calls blk_poll()

And it is point 3) that is missing in the case of that fio script shown
above. To be able to use sd devices, I believe the sd driver needs to
be altered to issue blk_poll() calls when it sees that it is waiting
for completion of requests/commands that have REQ_HIPRI set. And that
is not happening when I run the above fio script.

My scsi_debug patch counts the number of times mq_poll returns a
value > 0 (i.e. it completed a request), and this is what I see:


# cat /proc/scsi/scsi_debug/3
scsi_debug adapter driver, version 0190 [20200710]
num_tgts=1, shared (ram) size=1024 MB, opts=0x0, every_nth=0
delay=-9999, ndelay=10000, max_luns=1, sector_size=4096 bytes
cylinders=128, heads=64, sectors=32, command aborts=98
RESETs: device=5, target=5, bus=5, host=2
dix_reads=0, dix_writes=0, dif_errors=0
usec_in_jiffy=4000, statistics=0
cmnd_count=0, completions=0, miss_cpus=0, a_tsf=0, mq_polls=0
submit_queues=4
   queue 0:
   queue 1:
   queue 2:
   queue 3:
this host_no=3
....

The fact that 'mq_polls=0' tells me that blk_poll() was never called.

Assuming the above fio script works when sdd, sde and sdf are MR
devices suggests to me your LLD treats mq_poll callbacks as optional.
The way I have coded the sg and scsi_debug drivers is that when REQ_HIPRI
is given, then blk_poll() calls are _essential_ and said request will
not complete without them.

This code is from drivers/nvme/host/core.c:

static void nvme_execute_rq_polled(struct request_queue *q,
                 struct gendisk *bd_disk, struct request *rq, int at_head)
{
         DECLARE_COMPLETION_ONSTACK(wait);

         WARN_ON_ONCE(!test_bit(QUEUE_FLAG_POLL, &q->queue_flags));

         rq->cmd_flags |= REQ_HIPRI;
         rq->end_io_data = &wait;
         blk_execute_rq_nowait(q, bd_disk, rq, at_head, nvme_end_sync_rq);

         while (!completion_done(&wait)) {
                 blk_poll(q, request_to_qc_t(rq->mq_hctx, rq), true);
                 cond_resched();
         }
}


For requests with REQ_HIPRI set, the sg driver (with the blk_poll patch)
does an async version of the synchronous procedure shown in that nvme
function.
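
A rough sketch of that async idea (the helper name below is
hypothetical, not the actual sg driver code): poll once per
invocation, without spinning, and let the caller retry until the
command completes:

        /* hypothetical helper: reap any ready REQ_HIPRI completions
         * for this request's hw queue; do not spin */
        static bool sg_poll_once(struct request_queue *q, struct request *rq)
        {
                return blk_poll(q, request_to_qc_t(rq->mq_hctx, rq), false) > 0;
        }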

Doug Gilbert


Douglas Gilbert Feb. 11, 2021, 5:56 a.m. UTC | #3
On 2021-02-09 10:08 p.m., Kashyap Desai wrote:
> Hi Doug, I tried this patch on top of the below io_uring patch series -
> "[v3,1/4] add io_uring with IOPOLL support in scsi layer"
>
> After applying the patch, I am seeing an IO hang - io_wqe_worker
> goes into a tight loop.
>
> I used the below command -
> insmod drivers/scsi/scsi_debug.ko dev_size_mb=1024 sector_size=512
> add_host=24 per_host_store=1 ndelay=10000 host_max_queue=32
> submit_queues=76 num_parts=1 poll_queues=8
>
> and here is my fio script -
> [global]
> ioengine=io_uring
> hipri=1
> direct=1
> runtime=30s
> rw=randread
> norandommap
> bs=4k
> iodepth=64


Kashyap,
There is another issue here, namely iodepth > host_max_queue (64 > 32)
and in my setup that is not handled well. So there is a problem with
scsi_debug *** or the mid-level in handling this case.

If you have modified the sd driver to call blk_poll() then perhaps
you could try the above test again with a reduced iodepth.

Doug Gilbert

*** the scsi_debug driver should either yield a TASK SET FULL status or
     return SCSI_MLQUEUE_HOST_BUSY from queuecommand() when it has run
     out of slots.
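
For illustration, that pushback could look something like the
following (hypothetical sketch using names from the driver, not
actual scsi_debug code):

        /* in queuecommand(): if every slot in the chosen submit queue
         * is already in use, ask the mid-level to retry later */
        k = find_first_zero_bit(sqp->in_use_bm, sdebug_max_queue);
        if (unlikely(k >= sdebug_max_queue))
                return SCSI_MLQUEUE_HOST_BUSY;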

Kashyap Desai Feb. 11, 2021, 12:13 p.m. UTC | #4
> Kashyap,
> There is another issue here, namely iodepth > host_max_queue (64 > 32) and
> in my setup that is not handled well. So there is a problem with
> scsi_debug *** or the mid-level in handling this case.
>
> If you have modified the sd driver to call blk_poll() then perhaps you
> could try the above test again with a reduced iodepth.


Doug -

I debugged the issue and found the fix. We need to remove the code below
to handle the new defer type for mq_poll. Earlier I did not use REQ_HIPRI
to handle mq_poll, but now that you are managing it through the REQ_HIPRI
check, the patch below is safe and required. Without it, requests landing
on the poll queues return from schedule_resp() before their deferred
completion state is set up, so sdebug_blk_mq_poll() never completes
them - hence the hang.


diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c
index b9ec3a47fbf1..c50de49a2c2f 100644
--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -5437,13 +5437,6 @@ static int schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip,
        sd_dp = sqcp->sd_dp;
        spin_unlock_irqrestore(&sqp->qc_lock, iflags);

-       /* Do not complete IO from default completion path.
-        * Let it to be on queue.
-        * Completion should happen from mq_poll interface.
-        */
-       if ((sqp - sdebug_q_arr) >= (submit_queues - poll_queues))
-               return 0;
-
        if (!sd_dp) {
                sd_dp = kzalloc(sizeof(*sd_dp), GFP_ATOMIC);
                if (!sd_dp) {


On top of your v3 patch + above fix, I am able to run IO and there are no
issues. There is no issue with higher QD either, so we are good to go.

Tested-by: Kashyap Desai <kashyap.desai@broadcom.com>


So how shall we progress now? Shall I ask Martin to pick all the patches
from "[PATCH v3 0/4] io_uring iopoll in scsi layer", since this patch has
a dependency on my series?

Kashyap

Douglas Gilbert Feb. 11, 2021, 4:11 p.m. UTC | #5
On 2021-02-11 7:13 a.m., Kashyap Desai wrote:
> Doug -
>
> I debugged the issue and found the fix. We need to remove the code below
> to handle the new defer type for mq_poll. Earlier I did not use REQ_HIPRI
> to handle mq_poll, but now that you are managing it through the REQ_HIPRI
> check, the patch below is safe and required.
[...]
> On top of your v3 patch + above fix, I am able to run IO and there are no
> issues. There is no issue with higher QD either, so we are good to go.
>
> Tested-by: Kashyap Desai <kashyap.desai@broadcom.com>
>
> So how shall we progress now? Shall I ask Martin to pick all the patches
> from "[PATCH v3 0/4] io_uring iopoll in scsi layer", since this patch has
> a dependency on my series?


Kashyap,
Good catch!

You could roll all the scsi_debug stuff into one patch and put both
our sign-offs on it, or present it as two patches:
   - first one: your original plus this fix
   - second one: mine with your tested-by on it

Since this is new functionality for the scsi sub-system, it is unlikely
that all the corner cases have been visited. However, with what we have
in place for the next cycle, others have something reasonably solid to
build on.

Doug Gilbert

Kashyap Desai Feb. 12, 2021, 11:11 a.m. UTC | #6
> > So how shall we progress now? Shall I ask Martin to pick all the
> > patches from "[PATCH v3 0/4] io_uring iopoll in scsi layer", since
> > this patch has a dependency on my series?
>
> Kashyap,
> Good catch!
>
> You could roll all the scsi_debug stuff into one patch and put both our
> sign-offs on it, or present it as two patches:
>    - first one: your original plus this fix
>    - second one: mine with your tested-by on it


Doug - I will include your patch "scsi_debug: add new defer_type for
mq_poll" (including the recent fix) as a separate patch in my series and
provide a Tested-by tag for it.

> Since this is new functionality for the scsi sub-system, it is unlikely
> that all the corner cases have been visited. However, with what we have
> in place for the next cycle, others have something reasonably solid to
> build on.


Agree.

Patch

diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c
index 746eec521f79..b9ec3a47fbf1 100644
--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -322,17 +322,19 @@  struct sdeb_store_info {
 	container_of(d, struct sdebug_host_info, dev)
 
 enum sdeb_defer_type {SDEB_DEFER_NONE = 0, SDEB_DEFER_HRT = 1,
-		      SDEB_DEFER_WQ = 2};
+		      SDEB_DEFER_WQ = 2, SDEB_DEFER_POLL = 3};
 
 struct sdebug_defer {
 	struct hrtimer hrt;
 	struct execute_work ew;
+	ktime_t cmpl_ts;/* time since boot to complete this cmd */
 	int sqa_idx;	/* index of sdebug_queue array */
 	int qc_idx;	/* index of sdebug_queued_cmd array within sqa_idx */
 	int hc_idx;	/* hostwide tag index */
 	int issuing_cpu;
 	bool init_hrt;
 	bool init_wq;
+	bool init_poll;
 	bool aborted;	/* true when blk_abort_request() already called */
 	enum sdeb_defer_type defer_t;
 };
@@ -357,6 +359,7 @@  static atomic_t sdebug_completions;  /* count of deferred completions */
 static atomic_t sdebug_miss_cpus;    /* submission + completion cpus differ */
 static atomic_t sdebug_a_tsf;	     /* 'almost task set full' counter */
 static atomic_t sdeb_inject_pending;
+static atomic_t sdeb_mq_poll_count;  /* bumped when mq_poll returns > 0 */
 
 struct opcode_info_t {
 	u8 num_attached;	/* 0 if this is it (i.e. a leaf); use 0xff */
@@ -4730,7 +4733,6 @@  static void sdebug_q_cmd_complete(struct sdebug_defer *sd_dp)
 	struct scsi_cmnd *scp;
 	struct sdebug_dev_info *devip;
 
-	sd_dp->defer_t = SDEB_DEFER_NONE;
 	if (unlikely(aborted))
 		sd_dp->aborted = false;
 	qc_idx = sd_dp->qc_idx;
@@ -4745,6 +4747,7 @@  static void sdebug_q_cmd_complete(struct sdebug_defer *sd_dp)
 		return;
 	}
 	spin_lock_irqsave(&sqp->qc_lock, iflags);
+	sd_dp->defer_t = SDEB_DEFER_NONE;
 	sqcp = &sqp->qc_arr[qc_idx];
 	scp = sqcp->a_cmnd;
 	if (unlikely(scp == NULL)) {
@@ -5517,40 +5520,66 @@  static int schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip,
 				kt -= d;
 			}
 		}
-		if (!sd_dp->init_hrt) {
-			sd_dp->init_hrt = true;
-			sqcp->sd_dp = sd_dp;
-			hrtimer_init(&sd_dp->hrt, CLOCK_MONOTONIC,
-				     HRTIMER_MODE_REL_PINNED);
-			sd_dp->hrt.function = sdebug_q_cmd_hrt_complete;
-			sd_dp->sqa_idx = sqp - sdebug_q_arr;
-			sd_dp->qc_idx = k;
+		sd_dp->cmpl_ts = ktime_add(ns_to_ktime(ns_from_boot), kt);
+		if (cmnd->request->cmd_flags & REQ_HIPRI) {
+			spin_lock_irqsave(&sqp->qc_lock, iflags);
+			if (!sd_dp->init_poll) {
+				sd_dp->init_poll = true;
+				sqcp->sd_dp = sd_dp;
+				sd_dp->sqa_idx = sqp - sdebug_q_arr;
+				sd_dp->qc_idx = k;
+			}
+			sd_dp->defer_t = SDEB_DEFER_POLL;
+			spin_unlock_irqrestore(&sqp->qc_lock, iflags);
+		} else {
+			if (!sd_dp->init_hrt) {
+				sd_dp->init_hrt = true;
+				sqcp->sd_dp = sd_dp;
+				hrtimer_init(&sd_dp->hrt, CLOCK_MONOTONIC,
+					     HRTIMER_MODE_REL_PINNED);
+				sd_dp->hrt.function = sdebug_q_cmd_hrt_complete;
+				sd_dp->sqa_idx = sqp - sdebug_q_arr;
+				sd_dp->qc_idx = k;
+			}
+			sd_dp->defer_t = SDEB_DEFER_HRT;
+			/* schedule the invocation of scsi_done() for a later time */
+			hrtimer_start(&sd_dp->hrt, kt, HRTIMER_MODE_REL_PINNED);
 		}
 		if (sdebug_statistics)
 			sd_dp->issuing_cpu = raw_smp_processor_id();
-		sd_dp->defer_t = SDEB_DEFER_HRT;
-		/* schedule the invocation of scsi_done() for a later time */
-		hrtimer_start(&sd_dp->hrt, kt, HRTIMER_MODE_REL_PINNED);
 	} else {	/* jdelay < 0, use work queue */
-		if (!sd_dp->init_wq) {
-			sd_dp->init_wq = true;
-			sqcp->sd_dp = sd_dp;
-			sd_dp->sqa_idx = sqp - sdebug_q_arr;
-			sd_dp->qc_idx = k;
-			INIT_WORK(&sd_dp->ew.work, sdebug_q_cmd_wq_complete);
-		}
-		if (sdebug_statistics)
-			sd_dp->issuing_cpu = raw_smp_processor_id();
-		sd_dp->defer_t = SDEB_DEFER_WQ;
 		if (unlikely((sdebug_opts & SDEBUG_OPT_CMD_ABORT) &&
 			     atomic_read(&sdeb_inject_pending)))
 			sd_dp->aborted = true;
-		schedule_work(&sd_dp->ew.work);
-		if (unlikely((sdebug_opts & SDEBUG_OPT_CMD_ABORT) &&
-			     atomic_read(&sdeb_inject_pending))) {
+		sd_dp->cmpl_ts = ns_to_ktime(ns_from_boot);
+		if (cmnd->request->cmd_flags & REQ_HIPRI) {
+			spin_lock_irqsave(&sqp->qc_lock, iflags);
+			if (!sd_dp->init_poll) {
+				sd_dp->init_poll = true;
+				sqcp->sd_dp = sd_dp;
+				sd_dp->sqa_idx = sqp - sdebug_q_arr;
+				sd_dp->qc_idx = k;
+			}
+			sd_dp->defer_t = SDEB_DEFER_POLL;
+			spin_unlock_irqrestore(&sqp->qc_lock, iflags);
+		} else {
+			if (!sd_dp->init_wq) {
+				sd_dp->init_wq = true;
+				sqcp->sd_dp = sd_dp;
+				sd_dp->sqa_idx = sqp - sdebug_q_arr;
+				sd_dp->qc_idx = k;
+				INIT_WORK(&sd_dp->ew.work, sdebug_q_cmd_wq_complete);
+			}
+			sd_dp->defer_t = SDEB_DEFER_WQ;
+			schedule_work(&sd_dp->ew.work);
+		}
+		if (sdebug_statistics)
+			sd_dp->issuing_cpu = raw_smp_processor_id();
+		if (unlikely(sd_dp->aborted)) {
 			sdev_printk(KERN_INFO, sdp, "abort request tag %d\n", cmnd->request->tag);
 			blk_abort_request(cmnd->request);
 			atomic_set(&sdeb_inject_pending, 0);
+			sd_dp->aborted = false;
 		}
 	}
 	if (unlikely((SDEBUG_OPT_Q_NOISE & sdebug_opts) && scsi_result == device_qfull_result))
@@ -5779,11 +5808,12 @@  static int scsi_debug_show_info(struct seq_file *m, struct Scsi_Host *host)
 		   dix_reads, dix_writes, dif_errors);
 	seq_printf(m, "usec_in_jiffy=%lu, statistics=%d\n", TICK_NSEC / 1000,
 		   sdebug_statistics);
-	seq_printf(m, "cmnd_count=%d, completions=%d, %s=%d, a_tsf=%d\n",
+	seq_printf(m, "cmnd_count=%d, completions=%d, %s=%d, a_tsf=%d, mq_polls=%d\n",
 		   atomic_read(&sdebug_cmnd_count),
 		   atomic_read(&sdebug_completions),
 		   "miss_cpus", atomic_read(&sdebug_miss_cpus),
-		   atomic_read(&sdebug_a_tsf));
+		   atomic_read(&sdebug_a_tsf),
+		   atomic_read(&sdeb_mq_poll_count));
 
 	seq_printf(m, "submit_queues=%d\n", submit_queues);
 	for (j = 0, sqp = sdebug_q_arr; j < submit_queues; ++j, ++sqp) {
@@ -7246,52 +7276,68 @@  static int sdebug_map_queues(struct Scsi_Host *shost)
 
 static int sdebug_blk_mq_poll(struct Scsi_Host *shost, unsigned int queue_num)
 {
-	int qc_idx;
-	int retiring = 0;
+	bool first;
+	bool retiring = false;
+	int num_entries = 0;
+	unsigned int qc_idx = 0;
 	unsigned long iflags;
+	ktime_t kt_from_boot = ktime_get_boottime();
 	struct sdebug_queue *sqp;
 	struct sdebug_queued_cmd *sqcp;
 	struct scsi_cmnd *scp;
 	struct sdebug_dev_info *devip;
-	int num_entries = 0;
+	struct sdebug_defer *sd_dp;
 
 	sqp = sdebug_q_arr + queue_num;
+	spin_lock_irqsave(&sqp->qc_lock, iflags);
 
-	do {
-		spin_lock_irqsave(&sqp->qc_lock, iflags);
-		qc_idx = find_first_bit(sqp->in_use_bm, sdebug_max_queue);
-		if (unlikely((qc_idx < 0) || (qc_idx >= sdebug_max_queue)))
-			goto out;
+	for (first = true; first || qc_idx + 1 < sdebug_max_queue; )   {
+		if (first) {
+			qc_idx = find_first_bit(sqp->in_use_bm, sdebug_max_queue);
+			first = false;
+		} else {
+			qc_idx = find_next_bit(sqp->in_use_bm, sdebug_max_queue, qc_idx + 1);
+		}
+		if (unlikely(qc_idx >= sdebug_max_queue))
+			break;
 
 		sqcp = &sqp->qc_arr[qc_idx];
+		sd_dp = sqcp->sd_dp;
+		if (unlikely(!sd_dp))
+			continue;
 		scp = sqcp->a_cmnd;
 		if (unlikely(scp == NULL)) {
-			pr_err("scp is NULL, queue_num=%d, qc_idx=%d from %s\n",
+			pr_err("scp is NULL, queue_num=%d, qc_idx=%u from %s\n",
 			       queue_num, qc_idx, __func__);
-			goto out;
+			break;
 		}
+		if (sd_dp->defer_t == SDEB_DEFER_POLL) {
+			if (kt_from_boot < sd_dp->cmpl_ts)
+				continue;
+
+		} else		/* ignoring non REQ_HIPRI requests */
+			continue;
 		devip = (struct sdebug_dev_info *)scp->device->hostdata;
 		if (likely(devip))
 			atomic_dec(&devip->num_in_q);
 		else
 			pr_err("devip=NULL from %s\n", __func__);
 		if (unlikely(atomic_read(&retired_max_queue) > 0))
-			retiring = 1;
+			retiring = true;
 
 		sqcp->a_cmnd = NULL;
 		if (unlikely(!test_and_clear_bit(qc_idx, sqp->in_use_bm))) {
-			pr_err("Unexpected completion sqp %p queue_num=%d qc_idx=%d from %s\n",
+			pr_err("Unexpected completion sqp %p queue_num=%d qc_idx=%u from %s\n",
 				sqp, queue_num, qc_idx, __func__);
-			goto out;
+			break;
 		}
-
 		if (unlikely(retiring)) {	/* user has reduced max_queue */
 			int k, retval;
 
 			retval = atomic_read(&retired_max_queue);
 			if (qc_idx >= retval) {
 				pr_err("index %d too large\n", retval);
-				goto out;
+				break;
 			}
 			k = find_last_bit(sqp->in_use_bm, retval);
 			if ((k < sdebug_max_queue) || (k == retval))
@@ -7299,17 +7345,18 @@  static int sdebug_blk_mq_poll(struct Scsi_Host *shost, unsigned int queue_num)
 			else
 				atomic_set(&retired_max_queue, k + 1);
 		}
+		sd_dp->defer_t = SDEB_DEFER_NONE;
 		spin_unlock_irqrestore(&sqp->qc_lock, iflags);
 		scp->scsi_done(scp); /* callback to mid level */
+		spin_lock_irqsave(&sqp->qc_lock, iflags);
 		num_entries++;
-	} while (1);
-
-out:
+	}
 	spin_unlock_irqrestore(&sqp->qc_lock, iflags);
+	if (num_entries > 0)
+		atomic_add(num_entries, &sdeb_mq_poll_count);
 	return num_entries;
 }
 
-
 static int scsi_debug_queuecommand(struct Scsi_Host *shost,
 				   struct scsi_cmnd *scp)
 {